In [None]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import warnings
from sklearn import metrics
warnings.filterwarnings('ignore')

# データ読み込み

In [None]:
# Load the training table; the relative path "train_data" is resolved
# against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="train_data")

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Summary statistics for a quick look at the value ranges.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

# 前処理

In [None]:
# Column holding the label to predict.
target_name = "click"

# Input columns handed to the model, listed in the order they are selected.
feature_names = [
    # impression-level columns
    "hour", "banner_pos",
    # site_* columns
    "site_id", "site_domain", "site_category",
    # app_* columns
    "app_id", "app_domain", "app_category",
    # device_* columns
    "device_id", "device_ip", "device_model", "device_type",
]

In [None]:
# Separate the model inputs from the label.
X, y = df[feature_names], df[target_name]

# shuffle=False keeps the original row order, so the test set is the trailing
# 10% of the rows rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.1
)

# Carve a validation set (again the trailing 10%) out of the remaining rows;
# X_train / y_train are rebound to the smaller training portion.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, shuffle=False, test_size=0.1
)

In [None]:
def preprocess(df: pd.DataFrame):
    """Turn raw feature columns into a hashed feature matrix.

    The ``hour`` column is parsed with the ``%y%m%d%H`` format and replaced by
    the hour of day, and a ``weekday`` column is derived from the same
    timestamp. Every value is then converted to a string, tagged with its
    column name, and passed through a ``FeatureHasher``.

    The input frame is never modified, so the function can safely be called
    more than once on the same data and on slices of a larger frame.

    Args:
        df: Frame with an ``hour`` column encoded as ``%y%m%d%H``; every
            column of the frame is hashed as a categorical token.

    Returns:
        The matrix produced by ``FeatureHasher(n_features=2**18,
        input_type="string")``, with one row per row of ``df``.
    """
    timestamps = pd.to_datetime(df["hour"], format="%y%m%d%H")

    # assign() builds a new frame instead of writing into the caller's data.
    # The parsed timestamp itself is kept out of the frame: only the derived
    # hour-of-day and weekday are meant to be features.
    features = df.assign(
        hour=timestamps.dt.hour,
        weekday=timestamps.dt.weekday,
    )

    # Tag every value with its column name ("hour=1", "weekday=1", ...).
    # Without the tag, equal values coming from different columns would be
    # hashed to the same bucket and become indistinguishable.
    tokens = features.astype(str).apply(lambda column: f"{column.name}=" + column)

    feature_hasher = FeatureHasher(n_features=2**18, input_type="string")
    # FeatureHasher is stateless, so there is nothing to fit.
    return feature_hasher.transform(tokens.values)

In [None]:
# Apply the same preprocessing to each split, in train / valid / test order.
X_train_preprocessed, X_valid_preprocessed, X_test_preprocessed = (
    preprocess(split) for split in (X_train, X_valid, X_test)
)

# ハイパーパラメータチューニング

In [None]:
def grid_search(
    X_train,
    y_train,
    X_valid,
    y_valid,
    alphas=(1e-5, 1e-4, 1e-3, 1e-2, 1e-1),
) -> float:
    """Pick the L2 regularisation strength with the lowest validation log loss.

    For each candidate ``alpha`` a logistic-loss ``SGDClassifier`` is fitted on
    the training data and scored on the validation data passed in as
    arguments (no notebook-level variables are read).

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_valid: Validation feature matrix used for scoring.
        y_valid: Validation labels.
        alphas: Candidate values for ``SGDClassifier``'s ``alpha``.

    Returns:
        The candidate with the lowest validation log loss. Falls back to 0.01
        when no candidate produces a comparable score (e.g. ``alphas`` is
        empty or every score is NaN).
    """
    best_score = float("inf")
    best_alpha = 0.01
    for alpha in alphas:
        model = SGDClassifier(loss="log_loss", penalty="l2", random_state=42, alpha=alpha)
        model.fit(X_train, y_train)
        # Score on the validation set supplied by the caller.
        valid_proba = model.predict_proba(X_valid)[:, 1]
        valid_score = metrics.log_loss(y_valid, valid_proba)
        print(f"Grid Search| alpha: {alpha}, score: {valid_score}")

        # Lower log loss is better.
        if valid_score < best_score:
            best_score = valid_score
            best_alpha = alpha
    return best_alpha

# 学習・評価

In [None]:
# Choose alpha on the validation split.
best_alpha = grid_search(X_train_preprocessed, y_train, X_valid_preprocessed, y_valid)

# Refit on the training split with the selected alpha (fit() returns the estimator).
best_model = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    alpha=best_alpha,
    random_state=42,
).fit(X_train_preprocessed, y_train)

# Probability of the positive class, used by the probability-based metrics.
y_pred_proba = best_model.predict_proba(X_test_preprocessed)[:, 1]
logloss = metrics.log_loss(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Predicted class labels, used for accuracy.
y_pred = best_model.predict(X_test_preprocessed)
accuracy = metrics.accuracy_score(y_test, y_pred)

print("test logloss: {}".format(logloss))
print("AUC: {}".format(auc))
print("Accuracy: {}".format(accuracy))