# Library import

### pytorch로 진행

In [None]:
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

import pandas as pd
import numpy as np
np.random.seed(0)

from tqdm.notebook import tqdm

import os

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the preprocessed dataset from disk.
df = pd.read_csv('preprocessing.csv')
# Discard the 'Unnamed: 0' column in place
# (presumably a row index saved with the CSV - confirm against the writer).
df.drop('Unnamed: 0', axis=1, inplace=True)
# Preview the first two rows.
df.head(2)

In [None]:
# Separate the label column from the model inputs. Both results are
# independent copies, so later edits to `train` never touch `df`.
train_targets = df.loc[:, ['credit']].copy()  # kept 2-D: one column per task
train = df.drop(columns=['credit']).copy()

In [None]:
# Randomly tag every row as train / valid / test with probabilities 0.8 / 0.1 / 0.1.
np.random.seed(42)  # fixed seed so the assignment is reproducible
if "Set" not in train.columns:
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        size=(train.shape[0],),
        p=[.8, .1, .1],
    )

# Index labels of the rows belonging to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index
test_indices = train.loc[train["Set"] == "test"].index

In [None]:
# Encode the feature frame column by column:
#   * object-typed columns and columns with fewer than 200 distinct values are
#     treated as categorical and label-encoded to integer codes;
#   * every other column is treated as numeric and its missing values are
#     replaced by that column's mean computed on the training rows only.
# Outputs: `categorical_columns` (names of the encoded columns) and
# `categorical_dims` (column name -> number of distinct encoded classes).
nunique = train.nunique()
types = train.dtypes

categorical_columns = []
categorical_dims = {}
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 200:
        print(col, nunique[col])
        l_enc = LabelEncoder()
        # Missing values become their own category before encoding.
        # NOTE(review): on a numeric column that contains NaN this mixes
        # strings with numbers, which LabelEncoder may reject - confirm the
        # data has no NaN in low-cardinality numeric columns.
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Fill only THIS column. The previous `train.fillna(..., inplace=True)`
        # wrote this column's mean into the NaNs of every column in the frame,
        # including categorical columns that had not been encoded yet.
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

In [None]:
# Columns that must not be fed to the model: the split tag and the label.
unused_feat = ['Set', 'credit']

# Model input columns, in their original frame order.
features = [name for name in train.columns if name not in unused_feat]

# Positions (within `features`) of the columns that were label-encoded.
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]

# Number of distinct classes of each categorical feature, aligned with `cat_idxs`.
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]

In [None]:
# Convert the frames to NumPy arrays once, then slice out each split.
# The index labels are used as row positions here, which relies on the
# frames still carrying their default 0..n-1 integer index.
_feature_values = train[features].values
_target_values = train_targets.values

X_train, y_train = _feature_values[train_indices], _target_values[train_indices]
X_valid, y_valid = _feature_values[valid_indices], _target_values[valid_indices]
X_test, y_test = _feature_values[test_indices], _target_values[test_indices]

In [None]:
# TabNet multi-task classifier, stored in `clf`.
# Fix: the comma after `mask_type='entmax'` used to sit inside the trailing
# comment, which made the whole call a SyntaxError.
clf = TabNetMultiTaskClassifier(
    n_steps=1,
    cat_idxs=cat_idxs,  # positions of the categorical features
    cat_dims=cat_dims,  # number of classes of each categorical feature
    cat_emb_dim=1,  # embedding dimension
    optimizer_fn=torch.optim.Adam,  # Adam optimizer
    optimizer_params=dict(lr=2e-2),  # learning rate
    # StepLR: multiply the learning rate by `gamma` every `step_size` steps.
    scheduler_params={"step_size": 50, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,  # learning-rate scheduler
    mask_type='entmax',  # entmax generalizes softmax / sparsemax
    lambda_sparse=0,
)

In [None]:
# Train the classifier. Both the training and the validation split are passed
# as evaluation sets under the names 'train' and 'valid', with accuracy and
# log-loss requested as metrics (read back from `clf.history` afterwards).
max_epochs = 100
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy', 'logloss'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

In [None]:
# Plot the per-epoch training curves recorded in `clf.history`.

# --- Accuracy: training vs. validation ---
acc = clf.history['train_accuracy']
val_acc = clf.history['valid_accuracy']
x_len = np.arange(len(acc))  # one x position per recorded epoch

plt.plot(x_len, acc, marker='.', color='blue', label="Train-set Acc.")
plt.plot(x_len, val_acc, marker='.', color='red', label="Validation-set Acc.")
plt.xlabel('epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.legend(loc='upper right')
plt.show()

# --- Log-loss: training vs. validation ---
loss = clf.history['train_logloss']
val_loss = clf.history['valid_logloss']
x_len = np.arange(len(acc))  # x axis still sized from the accuracy history

plt.plot(x_len, loss, marker='.', color='blue', label="Train-set loss.")
plt.plot(x_len, val_loss, marker='.', color='red', label="Validation-set loss.")
plt.xlabel('epoch')
plt.ylabel('Cross-entropy')
plt.grid()
plt.legend(loc='upper right')
plt.show()

In [None]:
# Predict class probabilities for the validation split, then the test split.
preds_valid, preds = map(clf.predict_proba, (X_valid, X_test))
# Inspect the feature importances of the fitted model (shown as the cell output).
clf.feature_importances_