In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
os.listdir("../data")

['sample_submission.csv', 'test.csv', 'train.csv']

In [3]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [4]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

 #TabNetRegressor()
#clf.fit(
#  X_train, Y_train,
#  eval_set=[(X_valid, y_valid)]
#)
#preds = clf.predict(X_test)

In [5]:
xcolumns = list(test.columns)
ycolumns = "NSP"

In [6]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

class Gini(Metric):
    def __init__(self):
        self._name = "gini"
        self._maximize = True

    def __call__(self, y_true, y_score):
        #print(y_true, np.argmax(y_score, 1))
        n_size = np.max(y_true)+1
        y_true_ = np.eye(n_size)[y_true]
        auc = roc_auc_score(y_true_, y_score, average='macro', multi_class="ovr")
        return max(2*auc - 1, 0.)

class F1(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True
        
    def __call__(self, y_true, y_score):
        return f1_score(y_true, np.argmax(y_score, 1), average='weighted')

In [7]:

X = train[xcolumns].values
y = train[ycolumns].values
X_test = test[xcolumns].values

In [8]:
train.describe()

Unnamed: 0,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
count,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,...,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0
mean,133.219412,133.219412,2.684118,6.851176,3.597059,46.857647,1.334118,9.872353,8.207353,1.527647,...,93.644118,163.952941,4.105294,0.32,137.171176,134.485882,137.935882,18.837647,0.313529,1.302353
std,9.845934,9.845934,3.452645,34.902431,2.788713,17.164055,0.878399,18.577602,5.698527,2.430154,...,29.728492,17.910908,2.98043,0.706449,16.542794,15.694191,14.535621,29.648333,0.609851,0.61595
min,106.0,106.0,0.0,0.0,0.0,12.0,0.2,0.0,0.0,0.0,...,50.0,123.0,0.0,0.0,60.0,73.0,77.0,0.0,-1.0,1.0
25%,126.0,126.0,0.0,0.0,1.0,32.0,0.7,0.0,4.5,0.0,...,67.0,152.0,2.0,0.0,129.0,125.0,128.0,2.0,0.0,1.0
50%,133.0,133.0,1.0,0.0,3.0,48.0,1.2,0.0,7.4,0.0,...,93.5,162.0,3.0,0.0,139.0,136.0,139.0,7.0,0.0,1.0
75%,140.0,140.0,4.0,2.0,5.0,61.0,1.7,10.25,10.8,2.25,...,120.0,174.0,6.0,0.0,148.0,145.0,148.0,24.0,1.0,1.0
max,160.0,160.0,19.0,564.0,23.0,86.0,7.0,91.0,50.7,14.0,...,158.0,238.0,18.0,10.0,187.0,182.0,186.0,269.0,1.0,3.0


In [9]:
from sklearn.preprocessing import QuantileTransformer
rng = np.random.RandomState(0)

transformer=QuantileTransformer(n_quantiles=400, output_distribution='normal')
X_t = transformer.fit_transform(X)
X_test_t = transformer.transform(X_test)

In [10]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=7, random_state=13, shuffle=True)

preds = []
cv_preds = []
for train_index, test_index in skf.split(X_t, y):
    xtrain = X_t[train_index]
    ytrain = y[train_index]
    xval = X_t[test_index]
    yval = y[test_index]
    clf = TabNetClassifier(seed=13) 
    clf.fit(
        xtrain, ytrain, eval_set=[(xval, yval)], weights=1,
        eval_metric=[F1, Gini]
    )
    cv_preds.append(clf.predict_proba(X_t))
    preds.append(clf.predict_proba(X_test_t))

Device used : cpu
epoch 0  | loss: 1.31976 | val_0_f1: 0.33406 | val_0_gini: 0.0     |  0:00:00s
epoch 1  | loss: 1.03838 | val_0_f1: 0.30846 | val_0_gini: 0.13354 |  0:00:00s
epoch 2  | loss: 0.90338 | val_0_f1: 0.38981 | val_0_gini: 0.2627  |  0:00:00s
epoch 3  | loss: 0.86537 | val_0_f1: 0.53657 | val_0_gini: 0.33117 |  0:00:00s
epoch 4  | loss: 0.82035 | val_0_f1: 0.58602 | val_0_gini: 0.24709 |  0:00:01s
epoch 5  | loss: 0.73239 | val_0_f1: 0.66691 | val_0_gini: 0.46408 |  0:00:01s
epoch 6  | loss: 0.68299 | val_0_f1: 0.6621  | val_0_gini: 0.54042 |  0:00:01s
epoch 7  | loss: 0.68114 | val_0_f1: 0.672   | val_0_gini: 0.61836 |  0:00:01s
epoch 8  | loss: 0.62297 | val_0_f1: 0.64301 | val_0_gini: 0.63453 |  0:00:02s
epoch 9  | loss: 0.59369 | val_0_f1: 0.65554 | val_0_gini: 0.69495 |  0:00:02s
epoch 10 | loss: 0.59073 | val_0_f1: 0.64496 | val_0_gini: 0.72158 |  0:00:02s
epoch 11 | loss: 0.5665  | val_0_f1: 0.67115 | val_0_gini: 0.77164 |  0:00:03s
epoch 12 | loss: 0.5153  | val_0_f

epoch 40 | loss: 0.28235 | val_0_f1: 0.85352 | val_0_gini: 0.87656 |  0:00:10s
epoch 41 | loss: 0.25822 | val_0_f1: 0.85278 | val_0_gini: 0.89008 |  0:00:11s
epoch 42 | loss: 0.24811 | val_0_f1: 0.82363 | val_0_gini: 0.8884  |  0:00:11s
epoch 43 | loss: 0.26516 | val_0_f1: 0.83066 | val_0_gini: 0.90018 |  0:00:11s
epoch 44 | loss: 0.27142 | val_0_f1: 0.84284 | val_0_gini: 0.89937 |  0:00:11s
epoch 45 | loss: 0.27783 | val_0_f1: 0.82824 | val_0_gini: 0.88459 |  0:00:12s
epoch 46 | loss: 0.24563 | val_0_f1: 0.80611 | val_0_gini: 0.86412 |  0:00:12s
epoch 47 | loss: 0.26662 | val_0_f1: 0.82712 | val_0_gini: 0.85996 |  0:00:12s
epoch 48 | loss: 0.26367 | val_0_f1: 0.82274 | val_0_gini: 0.83856 |  0:00:12s
epoch 49 | loss: 0.28358 | val_0_f1: 0.81676 | val_0_gini: 0.81582 |  0:00:13s
epoch 50 | loss: 0.28519 | val_0_f1: 0.81911 | val_0_gini: 0.83348 |  0:00:13s
epoch 51 | loss: 0.26953 | val_0_f1: 0.80763 | val_0_gini: 0.7884  |  0:00:13s
epoch 52 | loss: 0.24897 | val_0_f1: 0.81594 | val_0

epoch 23 | loss: 0.384   | val_0_f1: 0.75332 | val_0_gini: 0.65359 |  0:00:05s
epoch 24 | loss: 0.35723 | val_0_f1: 0.73343 | val_0_gini: 0.62995 |  0:00:06s
epoch 25 | loss: 0.38265 | val_0_f1: 0.72484 | val_0_gini: 0.64066 |  0:00:06s
epoch 26 | loss: 0.38729 | val_0_f1: 0.7566  | val_0_gini: 0.64762 |  0:00:06s
epoch 27 | loss: 0.33492 | val_0_f1: 0.759   | val_0_gini: 0.67635 |  0:00:06s
epoch 28 | loss: 0.35747 | val_0_f1: 0.77481 | val_0_gini: 0.69284 |  0:00:07s
epoch 29 | loss: 0.30605 | val_0_f1: 0.76349 | val_0_gini: 0.69742 |  0:00:07s
epoch 30 | loss: 0.27803 | val_0_f1: 0.78045 | val_0_gini: 0.70522 |  0:00:07s
epoch 31 | loss: 0.32501 | val_0_f1: 0.77079 | val_0_gini: 0.70594 |  0:00:07s
epoch 32 | loss: 0.3201  | val_0_f1: 0.75746 | val_0_gini: 0.69886 |  0:00:08s
epoch 33 | loss: 0.25266 | val_0_f1: 0.77785 | val_0_gini: 0.70142 |  0:00:08s
epoch 34 | loss: 0.27639 | val_0_f1: 0.77304 | val_0_gini: 0.72664 |  0:00:08s
epoch 35 | loss: 0.26296 | val_0_f1: 0.77949 | val_0

In [11]:
#!pip install --upgrade pytorch_tabnet
np.mean(np.stack(cv_preds).mean(0).argmax(1)+1==y)

0.8423529411764706

In [12]:
sample_submission['NSP'] = np.stack(preds).mean(0).argmax(1)+1

In [13]:
sample_submission.to_csv("prediction_tabnet_1.csv", index=None)