In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [2]:
# Root directory that the prediction CSV paths below are built from.
base_dir = "data/"

In [3]:
# kurupical's models: one prediction CSV per model
# (the `target` / `predict` columns of these frames are used further down).
df1, df2, df3, df4 = (
    pd.read_csv(f"{base_dir}/{relative_path}")
    for relative_path in (
        "model275/epoch11.csv",
        "model276/epoch10.csv",
        "model277/epoch11.csv",
        "model277_2/epoch11.csv",
    )
)

In [4]:
# takoi-san's models.
# Frame loaded from the feather file; every prediction CSV below is inner-joined against it.
df_val_row = pd.read_feather("../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather")


def extract_val_row(path):
    """Read the CSV at ``path`` and keep only the rows that also appear in ``df_val_row``.

    No join key is passed, so pandas joins on whatever columns the two frames
    have in common (presumably just ``row_id`` -- TODO confirm).
    """
    predictions = pd.read_csv(path)
    return predictions.merge(df_val_row, how="inner")


# Each file sits inside a directory that carries the same name as the file.
df101, df102 = (
    extract_val_row(f"{base_dir}/{file_name}/{file_name}")
    for file_name in ("ex67_68_70_74_75_predict.csv", "lgb014.csv")
)

In [5]:
# Show the column names of both joined prediction frames.
(df101.columns, df102.columns)

(Index(['row_id', 'ex68_transformer_75m', 'ex70_transformer_75m',
        'ex67_lstm_75m', 'ex74_transformer_conv1d_75m',
        'ex75_transformer_linear_75m'],
       dtype='object'),
 Index(['row_id', 'ex014_lgb'], dtype='object'))

In [6]:
# Number of rows left after the inner join.
len(df101.index)

2500000

In [7]:
# merge
# Build the stacking table: the label plus one column per base-model prediction.
# Columns are attached by index, which assumes every source frame lists the
# same rows in the same order -- TODO confirm.
df = pd.DataFrame()
df["target"] = df1["target"]

# kurupical's models all expose their score in a `predict` column.
for name, frame in (
    ("model275", df1),
    ("model276", df2),
    ("model277", df3),
    ("model277_2", df4),
):
    df[name] = frame["predict"]

# takoi-san's sequence models share one frame; shorten the column names.
for name, source_col in (
    ("ex68", "ex68_transformer_75m"),
    ("ex70", "ex70_transformer_75m"),
    ("ex67", "ex67_lstm_75m"),
    ("ex74", "ex74_transformer_conv1d_75m"),
    ("ex75", "ex75_transformer_linear_75m"),
):
    df[name] = df101[source_col]

df["lgb014"] = df102["ex014_lgb"]

In [8]:
# The first 2,000,000 rows fit the stacker; everything after that is held out.
n_train_rows = 2_000_000
df_train, df_val = df.iloc[:n_train_rows], df.iloc[n_train_rows:]

In [9]:
# AUC of every column on the held-out rows. The `target` column is part of the
# frame, so it is scored against itself as well.
for col, col_values in df_val.items():
    score = roc_auc_score(df_val["target"], col_values)
    print(f"model={col} auc={round(score, 4)}")

model=target auc=1.0
model=model275 auc=0.8072
model=model276 auc=0.8035
model=model277 auc=0.8061
model=model277_2 auc=0.8075
model=ex68 auc=0.8082
model=ex70 auc=0.8083
model=ex67 auc=0.8067
model=ex74 auc=0.8081
model=ex75 auc=0.8076
model=lgb014 auc=0.8011


In [10]:
# LightGBM settings for the stacking model.
# `n_estimators` and `early_stopping_rounds` are LightGBM aliases for the number
# of boosting rounds and the early-stopping patience.
# NOTE(review): per the LightGBM parameter docs, `bagging_fraction` only takes
# effect when `bagging_freq` is non-zero; it is not set here -- confirm intent.
params = dict(
    objective="binary",
    max_depth=-1,
    num_leaves=32,
    learning_rate=0.01,
    bagging_fraction=0.7,
    bagging_seed=0,
    random_state=0,
    metric="auc",
    verbosity=-1,
    n_estimators=10000,
    early_stopping_rounds=50,
)

In [11]:
# lgbm
# Fit the LightGBM stacker on the base-model predictions; the held-out rows are
# passed as a validation set so the early stopping configured in `params` can use them.
train_data = lgb.Dataset(df_train.drop("target", axis=1), label=df_train["target"])
valid_data = lgb.Dataset(df_val.drop("target", axis=1), label=df_val["target"])

# `verbose_eval` was deprecated in LightGBM 3.3 and removed in 4.0, where passing
# it raises a TypeError. Use the `log_evaluation` callback when the installed
# version provides it and fall back to the legacy argument otherwise, so the
# metric is still reported every 100 rounds on either version.
if hasattr(lgb, "log_evaluation"):
    log_kwargs = {"callbacks": [lgb.log_evaluation(period=100)]}
else:
    log_kwargs = {"verbose_eval": 100}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    **log_kwargs,
)



Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.81486	valid_1's auc: 0.811547
[200]	training's auc: 0.815084	valid_1's auc: 0.811739
[300]	training's auc: 0.815196	valid_1's auc: 0.811851
[400]	training's auc: 0.81528	valid_1's auc: 0.811911
[500]	training's auc: 0.81536	valid_1's auc: 0.811946
[600]	training's auc: 0.815439	valid_1's auc: 0.811965
[700]	training's auc: 0.815513	valid_1's auc: 0.811976
[800]	training's auc: 0.815572	valid_1's auc: 0.81198
[900]	training's auc: 0.815628	valid_1's auc: 0.811983
[1000]	training's auc: 0.815682	valid_1's auc: 0.811985
Early stopping, best iteration is:
[1025]	training's auc: 0.815694	valid_1's auc: 0.811986


In [12]:
# Stacker predictions for the held-out rows (label column removed first).
lgbm = model.predict(df_val.drop(columns="target"))

In [14]:
from datetime import datetime as dt
import os
import pickle

# Each run is written to its own directory named after the current timestamp.
now = dt.now().strftime("%Y%m%d%H%M%S")
model_dir = f"model/{now}"
os.makedirs(model_dir)

# Persist the trained booster with pickle.
with open(f"{model_dir}/lgbm_stacking008.pickle", "wb") as f:
    pickle.dump(model, f)

# Below: a collection of modeling experiments that were discarded

0.810462038379595

In [94]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [93]:
from sklearn.ensemble import StackingClassifier

In [99]:
# Base learners for the scikit-learn stack, all with default hyper-parameters.
base_estimators = [
    ("lgb", lgb.LGBMClassifier()),
    ("et", ExtraTreesClassifier()),
    ("rf", RandomForestClassifier()),
    ("lr", LogisticRegression()),
]

# A logistic regression combines the base learners' outputs.
clf = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())

In [107]:
# Fit the stack on plain NumPy arrays: features first, then the label.
clf.fit(
    df_train.drop(columns="target").to_numpy(),
    df_train["target"].to_numpy(),
)

KeyboardInterrupt: 

In [None]:
# Column 1 of `predict_proba` is kept as the score for the held-out rows.
pred = clf.predict_proba(df_val.drop(columns="target").to_numpy())[:, 1]

In [78]:
# logistic reg
from sklearn.linear_model import LogisticRegression

# Default logistic regression fitted on the base-model predictions.
clf_logistic = LogisticRegression().fit(
    df_train.drop(columns="target").to_numpy(),
    df_train["target"].to_numpy(),
)

# Column 1 of `predict_proba` is kept as the score for the held-out rows.
logistic = clf_logistic.predict_proba(df_val.drop(columns="target").to_numpy())[:, 1]

In [79]:
# Hold-out AUC of the logistic-regression stacker.
roc_auc_score(df_val["target"].to_numpy(), logistic)

0.8101269984455765

In [73]:
# mlp
from sklearn.neural_network import MLPClassifier

# Seeded (matching the `random_state` used for LightGBM) so that re-running the
# cell trains the same network instead of a different one each time.
clf_mlp = MLPClassifier(random_state=0)
clf_mlp.fit(df_train.drop("target", axis=1).values, df_train["target"].values)

# Keep the positive-class probability. `predict` returns hard class labels,
# which throw away the ranking information that AUC and score blending rely on.
mlp = clf_mlp.predict_proba(df_val.drop("target", axis=1).values)[:, 1]

In [81]:
# Score the held-out rows with the MLP (column 1 of `predict_proba`) and report its AUC.
mlp = clf_mlp.predict_proba(df_val.drop(columns="target").to_numpy())[:, 1]
roc_auc_score(df_val["target"].to_numpy(), mlp)

0.8103880130787595

In [86]:
# AUC of a weighted sum of the three stackers; the LightGBM score is weighted 10x.
roc_auc_score(df_val["target"].to_numpy(), 10 * lgbm + logistic + mlp)

0.8104891955658622

In [54]:
import torch
import torch.nn.functional
import torch.utils.data


class Net(torch.nn.Module):
    """Three-layer fully connected network: input_dim -> 128 -> 64 -> 1.

    ReLU follows the first two layers; the last layer's output is returned
    without any activation.
    """

    def __init__(self, input_dim):
        """Create the three linear layers; ``input_dim`` is the width of the first one's input."""
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, 1)

    def forward(self, x):
        """Run ``x`` through the two ReLU-activated hidden layers and the output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

In [69]:
num_epochs = 100

# Use the GPU when one is available; otherwise run the same code on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Features and labels as float tensors; labels are reshaped to a column so they
# match the network's (n, 1) output.
X_train = torch.from_numpy(df_train.drop("target", axis=1).values).float().to(device)
y_train = torch.from_numpy(df_train["target"].values.reshape(-1, 1)).float().to(device)

X_val = torch.from_numpy(df_val.drop("target", axis=1).values).float().to(device)
y_val = df_val["target"].values.reshape(-1, 1)

# Size the input layer from the data: a hard-coded width breaks with a shape
# mismatch as soon as the number of stacked feature columns changes.
clf_nn = Net(input_dim=X_train.shape[1]).to(device)
clf_nn.train()

# set training parameters
optimizer = torch.optim.Adam(clf_nn.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()  # the network outputs raw logits

# start to train (full-batch: each epoch is one optimizer step over all training rows)
epoch_loss = []
for epoch in range(num_epochs):
    # forward
    outputs = clf_nn(X_train)

    # calculate loss
    loss = criterion(outputs, y_train)

    # update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # save loss of this epoch
    epoch_loss.append(loss.item())

    # Validation AUC after this step; no_grad avoids building an autograd graph
    # for the evaluation pass, and the scores are flattened to 1-D.
    with torch.no_grad():
        val_scores = clf_nn(X_val).cpu().numpy().ravel()
    print(roc_auc_score(y_val, val_scores))

0.7909581340391222
0.7935409500374382
0.7956666729694192
0.7974297737518439
0.7988992712446684
0.8001329722162667
0.8011701916274742
0.8020340682563649
0.8027583226875239
0.8033708392113896
0.8038887175406277
0.804325117394177
0.8046982276649539
0.8050129981342328
0.8052841835169412
0.8055141863101432
0.8057102073651655
0.8058813882524619
0.8060306927844096
0.8061612273097309
0.8062760516807792
0.806376720136718
0.806466009344421
0.8065450104319479
0.8066167818319152
0.8066805997470996
0.8067388392407545
0.8067913341344606
0.8068394449512679
0.806883785839731
0.8069241703642148
0.8069614405929146
0.8069959036477212
0.8070278322825143
0.8070579214669422
0.8070863846635745
0.8071132274301601
0.807138871653504
0.807162856083742
0.8071856804767614
0.8072081889957349
0.8072291030381362
0.8072491589450804
0.8072681256371467
0.8072863242171664
0.8073029564383213
0.8073194043274748
0.8073341931333909
0.8073485380725576
0.8073623072479955
0.8073752850502173
0.8073876057914849
0.8073991259139354