In [1]:
!pip install pytorch-tabnet



In [2]:
# Mount Google Drive at /content/drive; the data files read below live under
# that mount point.
from google import colab
colab.drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings(action='ignore')

In [4]:
# Folder on the mounted Drive that holds the JobCare CSV files.
job_paths = '/content/drive/MyDrive/JobCare_data/'

# Read the labelled and unlabelled splits from that folder.
train_df, test_df = (
    pd.read_csv(f'{job_paths}{csv_name}') for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Display the raw training frame; the last column shown is the `target` label.
train_df

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target
0,0,True,True,True,False,False,False,1,4,3,...,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1
1,1,False,False,False,True,True,False,1,3,4,...,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0
2,2,False,False,False,True,False,False,2,0,3,...,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0
3,3,False,False,False,True,False,False,2,0,2,...,2,1608,275,5,3,74,827967,572323,2020-01-13 18:09:34,0
4,4,True,True,True,False,False,False,1,3,4,...,2,1608,275,1,4,74,831614,573899,2020-03-09 20:39:22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501946,501946,False,False,False,True,False,False,1,1,2,...,2,354,147,1,5,65,503156,285850,2020-03-13 12:55:52,1
501947,501947,True,True,False,True,False,False,1,6,2,...,2,163,120,1,4,142,676255,456996,2020-01-20 11:51:51,1
501948,501948,True,True,True,True,False,False,1,7,4,...,2,438,147,2,7,65,484528,293258,2020-08-05 17:27:24,1
501949,501949,True,False,False,True,False,False,1,1,2,...,2,660,147,3,4,259,456330,273797,2020-06-15 09:23:21,1


In [6]:
# Display the raw test frame; its displayed header shows no `target` column.
test_df

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,...,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt
0,0,True,False,False,True,True,True,1,1,2,...,1,2,1147,839,1,5,263,393790,236865,2020-12-01 02:24:18
1,1,False,False,False,True,False,False,2,0,2,...,1,2,1611,278,1,4,263,394058,236572,2020-12-17 05:42:53
2,2,True,False,False,True,True,True,2,3,2,...,1,2,1817,490,3,4,177,1002061,704612,2020-12-10 23:33:41
3,3,True,False,False,True,True,True,1,2,2,...,1,2,101,150,5,3,177,1000813,704652,2020-12-03 19:44:55
4,4,True,False,False,True,False,False,1,6,4,...,1,1,985,1097,1,4,177,111146,704413,2020-12-11 21:24:34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46399,46399,True,True,True,False,False,False,2,0,4,...,1,2,759,147,1,5,91,425040,726084,2020-12-17 01:06:26
46400,46400,True,False,False,True,False,False,2,0,4,...,1,2,759,147,1,4,91,290061,156948,2020-12-29 21:57:25
46401,46401,True,True,False,True,True,True,2,0,3,...,1,2,759,147,1,4,288,307951,175069,2020-12-10 19:09:09
46402,46402,True,True,False,True,True,True,1,3,3,...,3,2,759,147,1,5,288,308354,174849,2020-12-07 14:04:34


In [7]:
# Time-based hold-out: rows whose content was opened before September go to
# `train`, rows from September onwards go to `val`.
# The month is parsed once with a vectorised conversion instead of calling
# pd.Timestamp row by row separately for each of the two masks.
# NOTE(review): only the month is compared, not the year - confirm the data
# covers a single calendar year.
open_month = pd.to_datetime(train_df['contents_open_dt']).dt.month
train = train_df[open_month < 9].copy()
val = train_df[open_month >= 9].copy()

# Work on a copy so the raw test frame stays available unchanged.
test = test_df.copy()

In [8]:
# Columns excluded from the feature set (identifiers, the open timestamp and
# two person_prefer columns).
drop_cols = {
    'contents_open_dt', 'contents_rn', 'id', 'person_rn',
    'person_prefer_f', 'person_prefer_g',
}

# Take one alphabetical feature order from the test frame and apply it to all
# three frames. Selecting the kept columns (rather than dropping in place)
# makes this cell safe to re-run: a second in-place drop would raise KeyError.
columns = sorted(col for col in test.columns if col not in drop_cols)

# `* 1` converts the boolean *_match_yn flags to 0/1 integers and leaves the
# already-numeric columns unchanged.
train = train[columns + ['target']] * 1
val = val[columns + ['target']] * 1
test = test[columns] * 1

In [9]:
# Column positions and cardinalities of the categorical features, later passed
# to TabNetClassifier as cat_idxs / cat_dims. `target` is the last column of
# `train`, so an index from enumerate(train.columns) is also the column index
# in the feature matrix built from train without `target`.
cat_idxs = []
cat_dims = []

for idx, col in enumerate(train.columns):
    # Columns with 'match' in their name stay as plain 0/1 inputs and `target`
    # is the label; every other column is encoded as a categorical.
    if 'match' in col or col == 'target':
        continue

    # NOTE(review): the encoder is fitted on train_df, i.e. on the training and
    # validation rows together, so only test rows can hit the unknown code.
    le = LabelEncoder()
    le.fit(train_df[col].values)
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))

    # One extra code, just past the known classes, for values never seen in fit.
    unknown_code = len(le_dict)

    # Vectorised lookup: Series.map yields NaN for values missing from the
    # dict, which are then replaced by the unknown code.
    for frame in (train, val, test):
        frame[col] = frame[col].map(le_dict).fillna(unknown_code).astype(int)

    cat_idxs.append(idx)
    cat_dims.append(unknown_code + 1)  # known classes + the unknown bucket

In [10]:
# Split each labelled frame into a feature matrix and a label vector; the test
# frame has features only.
X_train, y_train = train.drop(columns='target').values, train['target'].values
X_val, y_val = val.drop(columns='target').values, val['target'].values
X_test = test.values

In [11]:
# TabNet classifier configured with the categorical layout computed above.
clf = TabNetClassifier(
    # Categorical columns: positions, cardinalities and embedding size.
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    # AdamW optimiser with a StepLR schedule (step_size=100, gamma=0.9).
    optimizer_fn=torch.optim.AdamW,
    optimizer_params={"lr": 1e-4},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=100, gamma=0.9),
    # Mask activation; "sparsemax" is the alternative to "entmax".
    mask_type='entmax',
)

Device used : cuda


In [12]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score
class F1_Score(Metric):
    """F1 evaluation metric exposed under the name "f1".

    Predictions are taken from column 1 of the score matrix and counted as
    positive when they exceed 0.5. ``_maximize`` marks higher values as better.
    """

    def __init__(self):
        self._maximize = True
        self._name = "f1"

    def __call__(self, y_true, y_score):
        """Return the F1 score of the 0.5-thresholded column-1 scores."""
        positive_scores = y_score[:, 1]
        hard_labels = (positive_scores > 0.5) * 1
        return f1_score(y_true, hard_labels)

In [13]:
# Upper bound on the number of training epochs.
max_epochs = 30

# Train while evaluating on both the training and the validation split.
# 'f1' is presumably resolved by name to the F1_Score metric defined above.
# NOTE(review): the pytorch-tabnet docs describe the last listed metric as the
# one used for early stopping - confirm that 'f1' (thresholded at 0.5), rather
# than 'auc', is the intended model-selection criterion.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['auc', 'f1'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=1,
    drop_last=False,
)

epoch 0  | loss: 1.01476 | train_auc: 0.49717 | train_f1: 0.61296 | valid_auc: 0.49333 | valid_f1: 0.6081  |  0:00:47s
epoch 1  | loss: 0.83215 | train_auc: 0.49996 | train_f1: 0.5336  | valid_auc: 0.49612 | valid_f1: 0.53027 |  0:01:36s
epoch 2  | loss: 0.76936 | train_auc: 0.50342 | train_f1: 0.48113 | valid_auc: 0.49697 | valid_f1: 0.4755  |  0:02:24s
epoch 3  | loss: 0.74349 | train_auc: 0.50478 | train_f1: 0.45784 | valid_auc: 0.49923 | valid_f1: 0.45462 |  0:03:13s
epoch 4  | loss: 0.72869 | train_auc: 0.50606 | train_f1: 0.4488  | valid_auc: 0.49999 | valid_f1: 0.44419 |  0:04:01s
epoch 5  | loss: 0.7191  | train_auc: 0.5082  | train_f1: 0.43335 | valid_auc: 0.50171 | valid_f1: 0.42798 |  0:04:50s
epoch 6  | loss: 0.71394 | train_auc: 0.50843 | train_f1: 0.42795 | valid_auc: 0.50151 | valid_f1: 0.42173 |  0:05:38s
epoch 7  | loss: 0.70928 | train_auc: 0.50963 | train_f1: 0.40996 | valid_auc: 0.5026  | valid_f1: 0.40515 |  0:06:26s
epoch 8  | loss: 0.70656 | train_auc: 0.511   | 

In [14]:
# Per-class scores for the test rows; column 1 is thresholded in the next cell.
preds = clf.predict_proba(X_test)

In [15]:
# Turn the column-1 score into a hard 0/1 label using a 0.5 cut-off.
# NOTE: `preds` is rebound from the 2-D score matrix to a 1-D label vector, so
# re-running this cell requires re-running the prediction cell first.
above_cutoff = preds[:, 1] > 0.5
preds = above_cutoff * 1
preds

array([1, 1, 1, ..., 1, 1, 1])

In [16]:
# Load the submission template from the same data folder as the train/test
# CSVs (job_paths) instead of repeating the absolute path, then fill in the
# thresholded predictions. The assignment is positional, so `preds` must have
# one entry per template row.
submission = pd.read_csv(job_paths + 'sample_submission.csv')
submission['target'] = preds

In [17]:
# Display the filled-in submission frame (id and predicted target).
submission

Unnamed: 0,id,target
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
46399,46399,1
46400,46400,0
46401,46401,1
46402,46402,1


In [18]:
# Write the submission next to the input data, reusing job_paths rather than a
# second hard-coded copy of the folder path; the index is not written.
submission.to_csv(job_paths + 'baseline_4.csv', index=False)