In [1]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import warnings
from sklearn import metrics
warnings.filterwarnings('ignore')

# データ読み込み

In [2]:
df = pd.read_csv("train_data")

In [3]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.001579e+19,0,14102100,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,...,1,0,19772,320,50,2227,0,687,100075,48
2,1.002948e+18,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,...,1,0,20596,320,50,2161,0,35,-1,157
3,1.004511e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,...,1,0,19743,320,50,2264,3,427,100000,61
4,1.00599e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15702,320,50,1722,0,35,-1,79


In [4]:
df.describe()

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0,404290.0
mean,9.227571e+18,0.16881,14102560.0,1004.968965,0.287484,1.015546,0.332039,18848.537033,318.883217,60.079591,2113.233204,1.429744,226.456237,53158.651233,83.376487
std,5.32236e+18,0.374584,296.6841,1.096874,0.506203,0.527517,0.855497,4943.484325,21.406455,47.261532,607.66869,1.325706,349.919353,49960.437553,70.192991
min,42654270000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,1.0
25%,4.616612e+18,0.0,14102300.0,1005.0,0.0,1.0,0.0,16920.0,320.0,50.0,1863.0,0.0,35.0,-1.0,23.0
50%,9.223151e+18,0.0,14102600.0,1005.0,0.0,1.0,0.0,20346.0,320.0,50.0,2323.0,2.0,39.0,100043.0,61.0
75%,1.383943e+19,0.0,14102810.0,1005.0,1.0,1.0,0.0,21894.0,320.0,50.0,2526.0,3.0,171.0,100084.0,101.0
max,1.844674e+19,1.0,14103020.0,1012.0,7.0,5.0,5.0,24043.0,1024.0,1024.0,2757.0,3.0,1839.0,100248.0,255.0


In [5]:
df.isnull().sum()

id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

# 前処理

In [6]:
feature_names = [
    "hour",
    "banner_pos",
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_ip",
    "device_model",
    "device_type",
]

target_name = "click"

In [7]:
X = df[feature_names]
y= df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, shuffle=False)

In [8]:
def preprocess(df: pd.DataFrame):
    df['datetime'] = pd.to_datetime(df['hour'], format='%y%m%d%H')
    df['hour'] = df['datetime'].dt.hour
    df['weekday'] = df['datetime'].dt.weekday
    feature_hasher = FeatureHasher(n_features=2**18, input_type="string")
    hashed_features = feature_hasher.fit_transform(df.astype(str).values)
    return hashed_features

In [9]:
X_train_preprocessed = preprocess(X_train)
X_valid_preprocessed = preprocess(X_valid)
X_test_preprocessed = preprocess(X_test)

# ハイパラチューニング

In [12]:
def grid_search(X_train, y_train, X_valid, y_valid) -> float:
    best_score = 1e10
    best_alpha = 0.01
    for alpha in [1e-5, 1e-4, 1e-3, 1e2, 1e-1]:
        model = SGDClassifier(loss="log_loss", penalty="l2", random_state=42, alpha=alpha)
        model.fit(X_train, y_train)
        train_proba = model.predict_proba(X_train_preprocessed)[:, 1]
        valid_proba = model.predict_proba(X_valid_preprocessed)[:, 1]
        train_score = metrics.log_loss(y_train, train_proba)
        valid_score = metrics.log_loss(y_valid, valid_proba)
        print(f"Grid Search| alpha: {alpha}, train log loss: {train_score}, valid log loss: {valid_score}")
        if best_score > valid_score:
            best_score = valid_score
            best_alpha = alpha
    return best_alpha

# 学習・評価

In [13]:
best_alpha = grid_search(X_train_preprocessed, y_train, X_valid_preprocessed, y_valid)

best_model = SGDClassifier(
    loss="log_loss", penalty="l2", random_state=42, alpha=best_alpha
)
best_model.fit(X_train_preprocessed, y_train)

# 予測確率を取得
y_pred_proba = best_model.predict_proba(X_test_preprocessed)[:, 1]
# 予測クラスを取得
y_pred = best_model.predict(X_test_preprocessed)

# logloss
logloss = metrics.log_loss(y_test, y_pred_proba)
# AUC
auc = metrics.roc_auc_score(y_test, y_pred_proba)
# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)

print("test logloss: {}".format(logloss))
print("AUC: {}".format(auc))
print("Accuracy: {}".format(accuracy))

Grid Search| alpha: 1e-05, train log loss: 0.3763317840156504, valid log loss: 0.3974486898518637
Grid Search| alpha: 0.0001, train log loss: 0.40359257628927386, valid log loss: 0.3933346694988594
Grid Search| alpha: 0.001, train log loss: 0.41710329797392487, valid log loss: 0.4020087564820292
Grid Search| alpha: 100.0, train log loss: 0.6862219721328485, valid log loss: 0.6864819640382058
Grid Search| alpha: 0.1, train log loss: 0.4522462546021422, valid log loss: 0.43974074906133426
test logloss: 0.40316918447397815
AUC: 0.721722629994545
Accuracy: 0.8375670929283435
