In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [12]:
base_dir = "data/"

In [13]:
# kurupical
df1 = pd.read_csv(f"{base_dir}/model225/transformers1.csv")
df2 = pd.read_csv(f"{base_dir}/model226/transformers1.csv")
df3 = pd.read_csv(f"{base_dir}/model228/transformers1.csv")

In [15]:
# takoi-san
df_val_row = pd.read_feather("../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather")

def extract_val_row(path):
    df = pd.read_feather(path)
    return pd.merge(df, df_val_row, how="inner")
# takoi_san
df4 = extract_val_row(f"{base_dir}/takoi_ex55_56_57/train_ex55_ex56_ex57.feather")

In [20]:
df4

Unnamed: 0,row_id,answered_correctly,ex55,ex56,ex57
0,10753,1,1.098598,1.017184,1.354200
1,10754,1,1.579756,1.821863,1.717242
2,10755,0,-0.476880,-0.236096,-0.361019
3,10756,1,2.296592,2.599542,2.009301
4,10757,0,-0.137751,-0.063009,-0.064646
...,...,...,...,...,...
2499995,101230020,1,1.958967,1.710324,1.325099
2499996,101230021,1,1.342866,1.091345,1.129338
2499997,101230022,1,2.421757,2.378265,2.595206
2499998,101230023,0,-0.691799,-0.754669,-0.884679


In [70]:
# merge
df = pd.DataFrame()
df["target"] = df1["target"]
df["model225"] = df1["predict"]
df["model226"] = df2["predict"]
df["model228"] = df3["predict"]
df["ex55"] = sigmoid(df4["ex55"])
df["ex56"] = sigmoid(df4["ex56"])
df["ex57"] = sigmoid(df4["ex57"])

In [87]:
df_train = df.iloc[:2_000_000]
df_val = df.iloc[2_000_000:]

In [89]:
for col in ["model225", "model226", "model228", "ex55", "ex56", "ex57"]:
    score = roc_auc_score(df_val["target"], df_val[col])
    print(f"model={col} auc={round(score, 4)}")

model=model225 auc=0.8073
model=model226 auc=0.8083
model=model228 auc=0.8078
model=ex55 auc=0.8054
model=ex56 auc=0.8055
model=ex57 auc=0.8055


In [90]:
params = {
    'objective': 'binary',
    'max_depth': 6,
    'learning_rate': 0.01,
    'bagging_fraction': 0.7,
    'bagging_seed': 0,
    'random_state': 0,
    'metric': 'auc',
    'verbosity': -1,
    "n_estimators": 10000,
    "early_stopping_rounds": 50
}

In [91]:
# lgbm
train_data = lgb.Dataset(df_train.drop("target", axis=1), label=df_train["target"])
valid_data = lgb.Dataset(df_val.drop("target", axis=1), label=df_val["target"])

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    verbose_eval=100
)



Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.814974	valid_1's auc: 0.81101
[200]	training's auc: 0.815183	valid_1's auc: 0.811193
[300]	training's auc: 0.81529	valid_1's auc: 0.81126
[400]	training's auc: 0.81538	valid_1's auc: 0.81128
[500]	training's auc: 0.815459	valid_1's auc: 0.811288
Early stopping, best iteration is:
[545]	training's auc: 0.815496	valid_1's auc: 0.81129


In [47]:
lgbm = model.predict(df_val.drop("target", axis=1))

# 以下、没になったモデリング集

In [92]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

ModuleNotFoundError: No module named 'rgf'

In [93]:
from sklearn.ensemble import StackingClassifier

In [None]:
clf = StackingClassifier(
    estimators= [
        ('lgb', lgbm.LGBMClassifier(**lgbm_params)),
        ('et', ExtraTreesClassifier(**et_params)),
        ('rf', RandomForestClassifier(**rf_params)),
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier())
    ],
    final_estimator=final_estimator
)

In [78]:
# logistic reg
from sklearn.linear_model import LogisticRegression

clf_logistic = LogisticRegression()
clf_logistic.fit(df_train.drop("target", axis=1).values, df_train["target"].values)
logistic = clf_logistic.predict_proba(df_val.drop("target", axis=1).values)[:, 1]

In [79]:
roc_auc_score(df_val["target"].values, logistic)

0.8101269984455765

In [73]:
# mlp
from sklearn.neural_network import MLPClassifier

clf_mlp = MLPClassifier()
clf_mlp.fit(df_train.drop("target", axis=1).values, df_train["target"].values)
mlp = clf_mlp.predict(df_val.drop("target", axis=1).values)

In [81]:
mlp = clf_mlp.predict_proba(df_val.drop("target", axis=1).values)[:, 1]
roc_auc_score(df_val["target"].values, mlp)

0.8103880130787595

In [86]:
roc_auc_score(df_val["target"].values, lgbm*10 + logistic + mlp)

0.8104891955658622

In [54]:
import torch
import torch.nn.functional
import torch.utils.data


class Net(torch.nn.Module):
    def __init__(self, input_dim):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, 1)
    def forward(self, x):
        x = torch.nn.functional.relu(self.fc1(x))
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [69]:
num_epochs = 100
X_train = torch.from_numpy(df_train.drop("target", axis=1).values).float().to("cuda")
y_train = torch.from_numpy(df_train["target"].values.reshape(-1, 1)).float().to("cuda")

X_val = torch.from_numpy(df_val.drop("target", axis=1).values).float().to("cuda")
y_val = df_val["target"].values.reshape(-1, 1)

clf_nn = Net(input_dim=6).to("cuda")
clf_nn.train()

# set training parameters
optimizer = torch.optim.Adam(clf_nn.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

# start to train
epoch_loss = []
for epoch in range(num_epochs):
    # forward
    outputs = clf_nn(X_train)
    
    # calculate loss
    loss = criterion(outputs, y_train)
    
    # update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(roc_auc_score(y_val, clf_nn(X_val).data.cpu().numpy().tolist()))
    # save loss of this epoch

0.7909581340391222
0.7935409500374382
0.7956666729694192
0.7974297737518439
0.7988992712446684
0.8001329722162667
0.8011701916274742
0.8020340682563649
0.8027583226875239
0.8033708392113896
0.8038887175406277
0.804325117394177
0.8046982276649539
0.8050129981342328
0.8052841835169412
0.8055141863101432
0.8057102073651655
0.8058813882524619
0.8060306927844096
0.8061612273097309
0.8062760516807792
0.806376720136718
0.806466009344421
0.8065450104319479
0.8066167818319152
0.8066805997470996
0.8067388392407545
0.8067913341344606
0.8068394449512679
0.806883785839731
0.8069241703642148
0.8069614405929146
0.8069959036477212
0.8070278322825143
0.8070579214669422
0.8070863846635745
0.8071132274301601
0.807138871653504
0.807162856083742
0.8071856804767614
0.8072081889957349
0.8072291030381362
0.8072491589450804
0.8072681256371467
0.8072863242171664
0.8073029564383213
0.8073194043274748
0.8073341931333909
0.8073485380725576
0.8073623072479955
0.8073752850502173
0.8073876057914849
0.8073991259139354