In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm, skew

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the competition training table from the Kaggle input directory.
train=pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-mar-2021/train.csv"
)

# Optional while prototyping: work on a 10% random subsample instead.
#train=train.sample(frac=0.1)
#train.reset_index(drop=True,inplace=True)

train.shape

In [3]:
from sklearn.model_selection import KFold,StratifiedKFold

# Carve a stratified hold-out set out of the labelled data: the rows landing in
# fold 4 of a 5-way stratified split become `test`, the rest stay in `train`.
kf=StratifiedKFold(n_splits=5,shuffle=True,random_state=5)

# split() yields *positional* row indices and derives the partition from the
# target and random_state only, so the frame is passed as-is (no shuffled copy)
# and the fold ids are written positionally into a plain array.
fold_ids=np.full(len(train),-1)
for i,(_,valid_idx) in enumerate(kf.split(train,train.target)):
    fold_ids[valid_idx]=i
train["fold"]=fold_ids

# The helper column is only needed to build the two subsets; drop it again and
# give both frames a fresh 0..n-1 index.
holdout_mask=train.fold==4
test=train[holdout_mask].drop(["fold"],axis=1).reset_index(drop=True)
train=train[~holdout_mask].drop(["fold"],axis=1).reset_index(drop=True)

In [4]:
# Tag every remaining training row with its cross-validation fold (0..4).
# split() yields positional indices driven by the target and random_state, so
# the frame is passed unshuffled and the ids are written positionally.
kf=StratifiedKFold(n_splits=5,shuffle=True,random_state=5)
fold_ids=np.full(len(train),-1)
for i,(_,valid_idx) in enumerate(kf.split(train,train.target)):
    fold_ids[valid_idx]=i
train["fold"]=fold_ids

In [5]:
# Keep the row identifiers aside and remove them from the modelling frames.
train_ID=train.pop("id")
test_ID=test.pop("id")

# data processing

In [6]:
# Number of training rows (the boundary used later to split the combined frame).
len(train)

In [7]:
# Stack train on top of the hold-out rows so both are encoded together.
# The original row labels are kept (no ignore_index).
all_data=pd.concat((train,test))

In [8]:
# Display the combined train + hold-out frame as a visual sanity check.
all_data

In [9]:
# Feature groups are identified purely by substring of the column name.
sel_cat=train.filter(like="cat").columns.tolist()
sel_cont=train.filter(like="cont").columns.tolist()

In [10]:
from sklearn import preprocessing

# Integer-encode each categorical column. The single encoder instance is
# re-fitted per column on the combined frame, so train and hold-out rows
# share one mapping; afterwards `le` holds the fit of the last column only.
le=preprocessing.LabelEncoder()
for cat_name in sel_cat:
    encoded=le.fit_transform(all_data[cat_name])
    all_data[cat_name]=encoded

In [11]:
# Undo the concat by position: the first len(train) rows are the training
# rows, everything after them is the hold-out set.
train,test=all_data.iloc[:len(train)],all_data.iloc[len(train):]

In [12]:
# Label column (as a one-element list) and the full feature list:
# categorical names first, then continuous ones.
sel_tar=["target"]
sel_col=np.concatenate((sel_cat,sel_cont))

In [13]:
# Standardised variant of the training frame: `train2` is a copy of `train`
# whose continuous columns are replaced by their StandardScaler output.
from sklearn.preprocessing import StandardScaler

ss=StandardScaler()
train2=train.copy()
d1=ss.fit_transform(train[sel_cont].to_numpy())
train2[sel_cont]=d1

In [14]:
#f,ax=plt.subplots(11,2,figsize=(14,24))
#for i in range(11):
#    sns.distplot(train[sel_cont[i]],ax=ax[i][0])
#    sns.distplot(train2[sel_cont[i]],ax=ax[i][1])    

# feature engineering

In [15]:
# Pairwise correlation of every column of the training frame, drawn as an
# annotated heatmap on a fresh 16x16 figure.
corr=train.corr()
plt.subplots(figsize=(16,16))
sns.heatmap(data=corr,cmap="coolwarm",annot=True,fmt='.1f')

In [16]:
# Fraction of missing values per column of the combined frame.
all_data.isnull().sum().div(all_data.shape[0])

In [17]:
# Plain feature matrices (raw and standardised) plus the label series.
y=train["target"]
X=train[sel_col].to_numpy()
X2=train2[sel_col].to_numpy()

# Model

In [18]:
!pip install pytorch_tabnet

In [19]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

import xgboost as xgb
import lightgbm as lgbm
import catboost as catb
from pytorch_tabnet.tab_model import TabNetRegressor,TabNetClassifier
import torch.optim as optim

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold,train_test_split
from sklearn.metrics import mean_squared_error,roc_auc_score
import datetime

In [20]:
# Label column name (a plain string from here on) and the feature list fed to
# every model: categorical names first, then continuous ones.
sel_tar="target"
sel_col=np.concatenate((sel_cat,sel_cont))

# Accumulators filled by the per-algorithm training / evaluation cells.
cv_algorithms=[]
cv_results_all=[]
test_results_all=[]

# Switches selecting which model families are run.
TABNET=True
XGB=True
LGBM=True
CATB=True
NN=True
CV=False

In [21]:
# Display the selected feature names and the label column name.
sel_col,sel_tar

# TabNet

In [22]:
# Constructor arguments shared by every TabNet fold model.
tabnet_params = {
    # network size
    "n_d": 32,
    "n_a": 32,
    "n_steps": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "mask_type": "sparsemax",
    # optimisation
    "optimizer_fn": optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    # NOTE(review): mode="max" presumably because the monitored quantity is
    # the validation AUC used at fit time; confirm against pytorch_tabnet.
    "scheduler_fn": optim.lr_scheduler.ReduceLROnPlateau,
    "scheduler_params": {
        "mode": "max", "patience": 5, "min_lr": 1e-5, "factor": 0.9},
    # run-time
    "seed": 42,
    "verbose": 50,
    "device_name": "cuda",
}

In [23]:
# Train one TabNet classifier per CV fold and record its validation AUC.
if TABNET:
    TABNET_model=[]
    cv_result=[]
    for fold in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))

        # Rows tagged with `fold` validate, all other rows train.
        train_df=train[train.fold!=fold].reset_index(drop=True)
        valid_df=train[train.fold==fold].reset_index(drop=True)

        x_train=train_df[sel_col].to_numpy()
        x_valid=valid_df[sel_col].to_numpy()
        y_train=train_df[sel_tar].to_numpy()
        y_valid=valid_df[sel_tar].to_numpy()

        TABNETR=TabNetClassifier(**tabnet_params)
        TABNETR.fit(X_train=x_train, y_train=y_train,
                      eval_set=[(x_valid, y_valid)],
                      patience=30,max_epochs = 200,
                      batch_size=8192,
                      eval_metric=["auc"],
                   )
        # AUC is a ranking metric: score the last probability column (the
        # positive class in a binary problem), not hard labels, and pass the
        # arguments in roc_auc_score's (y_true, y_score) order.
        ret = TABNETR.predict_proba(x_valid)[:,-1]
        cv_result.append(roc_auc_score(y_valid,ret))
        TABNET_model.append(TABNETR)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
    cv_results_all.append(cv_result)
    cv_algorithms.append("TABNET_Optuna")

In [24]:
# Display the hold-out feature matrix that the fold models are scored on.
test[sel_col].to_numpy()

In [26]:
# Score the hold-out set with the average of the five TabNet fold models.
if TABNET:
    test_features=test[sel_col].to_numpy()
    p_all=[]
    for model_idx in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))
        fold_proba=TABNET_model[model_idx].predict_proba(test_features)
        # keep only the last probability column
        p_all.append(fold_proba[:,-1])
    print(datetime.datetime.now().strftime("%H:%M:%S"))

    preds=np.mean(p_all,axis=0)
    ret=roc_auc_score(test.target,preds)
    print("tabnet",ret)
    test_results_all.append(ret)

# XGB

In [27]:
# google search
# XGBoost constructor arguments (per the original note, taken from a web search).
# NOTE(review): both `seed` and `random_state` are supplied; confirm which one
# the installed xgboost version honours.
Best_trial=dict(
    gpu_id=0,
    tree_method="gpu_hist",
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=4,
    min_child_weight=0,
    gamma=0.6,
    subsample=0.7,
    colsample_bytree=0.7,
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
    random_state=42,
)

In [28]:
# Train one XGBoost classifier per CV fold and record its validation AUC.
if XGB:
    XGB_model=[]
    cv_result=[]
    for fold in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))

        # Rows tagged with `fold` validate, all other rows train.
        train_df=train[train.fold!=fold].reset_index(drop=True)
        valid_df=train[train.fold==fold].reset_index(drop=True)

        x_train=train_df[sel_col].to_numpy()
        x_valid=valid_df[sel_col].to_numpy()
        y_train=train_df[sel_tar].to_numpy()
        y_valid=valid_df[sel_tar].to_numpy()

        XGBR=xgb.XGBClassifier(**Best_trial)
        XGBR.fit(X=x_train, y=y_train,
                      eval_set=[(x_valid, y_valid)],
                       early_stopping_rounds=1000,verbose=1000,
                      eval_metric=['auc'])
        # AUC is a ranking metric: score the last probability column (the
        # positive class in a binary problem), not hard labels, and pass the
        # arguments in roc_auc_score's (y_true, y_score) order.
        ret = XGBR.predict_proba(x_valid)[:,-1]
        cv_result.append(roc_auc_score(y_valid,ret))
        XGB_model.append(XGBR)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
    cv_results_all.append(cv_result)
    cv_algorithms.append("XGB_Optuna")

In [29]:
# Score the hold-out set with the average of the five XGBoost fold models.
if XGB:
    test_features=test[sel_col].to_numpy()
    # last probability column of each fold model
    p_all=[
        XGB_model[model_idx].predict_proba(test_features)[:,-1]
        for model_idx in range(5)
    ]

    preds=np.mean(p_all,axis=0)
    ret=roc_auc_score(test.target,preds)
    print("XGB_Optuna",ret)
    test_results_all.append(ret)

# LGBM

In [30]:
# LightGBM constructor arguments; only the device and tree budget are set,
# everything else is left at the library defaults.
Best_trial2=dict(
    device="gpu",
    n_estimators=10000,
)
# Candidate overrides kept for reference (currently disabled):
#     'num_leaves': 491,
#     'min_child_weight': 0.03,
#     'feature_fraction': 0.3,
#     'bagging_fraction': 0.4,
#     'min_data_in_leaf': 106,
#     'objective': 'binary',
#     'max_depth': -1,
#     'learning_rate': 0.01,
#     "boosting_type": "gbdt",
#     "bagging_seed": 11,
#     "metric": 'binary_logloss',
#     "verbosity": 0,
#     'reg_alpha': 0.4,
#     'reg_lambda': 0.6,
#     'random_state': 47

In [31]:
# Train one LightGBM classifier per CV fold and record its validation AUC.
if LGBM:
    LGBM_model=[]
    cv_result=[]
    for fold in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))

        # Rows tagged with `fold` validate, all other rows train.
        train_df=train[train.fold!=fold].reset_index(drop=True)
        valid_df=train[train.fold==fold].reset_index(drop=True)

        x_train=train_df[sel_col].to_numpy()
        x_valid=valid_df[sel_col].to_numpy()
        y_train=train_df[sel_tar].to_numpy()
        y_valid=valid_df[sel_tar].to_numpy()

        LGBMR=lgbm.LGBMClassifier(**Best_trial2)
        LGBMR.fit(X=x_train, y=y_train,
                 eval_set=[(x_valid, y_valid)],
                 early_stopping_rounds=1000,verbose=500,
                 eval_metric=['auc'])
        # AUC is a ranking metric: score the last probability column (the
        # positive class in a binary problem), not hard labels, and pass the
        # arguments in roc_auc_score's (y_true, y_score) order.
        ret = LGBMR.predict_proba(x_valid)[:,-1]
        cv_result.append(roc_auc_score(y_valid,ret))
        LGBM_model.append(LGBMR)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
    cv_results_all.append(cv_result)
    cv_algorithms.append("LGBM_Optuna")

In [32]:
# Score the hold-out set with the average of the five LightGBM fold models.
if LGBM:
    test_features=test[sel_col].to_numpy()
    # last probability column of each fold model
    p_all=[
        LGBM_model[model_idx].predict_proba(test_features)[:,-1]
        for model_idx in range(5)
    ]

    preds=np.mean(p_all,axis=0)
    ret=roc_auc_score(test.target,preds)
    print("LGBM_Optuna",ret)
    test_results_all.append(ret)

# CatBoost

In [33]:
# CatBoost constructor arguments; only the iteration budget and the
# evaluation metric are set, everything else is left at the library defaults.
Best_trial3=dict(
    iterations=10000,
    eval_metric="AUC",
)
# Candidate overrides kept for reference (currently disabled):
#     'task_type':"gpu",
#     'eval_metric':['rmse'],
#     'num_leaves': 491,
#     'min_child_weight': 0.03,
#     'feature_fraction': 0.3,
#     'bagging_fraction': 0.4,
#     'min_data_in_leaf': 106,
#     'objective': 'binary',
#     'max_depth': -1,
#     'learning_rate': 0.01,
#     "boosting_type": "gbdt",
#     "bagging_seed": 11,
#     "metric": 'binary_logloss',
#     "verbosity": 0,
#     'reg_alpha': 0.4,
#     'reg_lambda': 0.6,
#     'random_state': 47

In [34]:
# Train one CatBoost classifier per CV fold and record its validation AUC.
if CATB:
    CATB_model=[]
    cv_result=[]
    for fold in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))

        # Rows tagged with `fold` validate, all other rows train.
        train_df=train[train.fold!=fold].reset_index(drop=True)
        valid_df=train[train.fold==fold].reset_index(drop=True)

        x_train=train_df[sel_col].to_numpy()
        x_valid=valid_df[sel_col].to_numpy()
        y_train=train_df[sel_tar].to_numpy()
        y_valid=valid_df[sel_tar].to_numpy()

        CATBR=catb.CatBoostClassifier(**Best_trial3)
        CATBR.fit(X=x_train, y=y_train,
                 eval_set=[(x_valid, y_valid)],
                 early_stopping_rounds=1000,verbose=1000)
        # AUC is a ranking metric: score the last probability column (the
        # positive class in a binary problem), not hard labels, and pass the
        # arguments in roc_auc_score's (y_true, y_score) order.
        ret = CATBR.predict_proba(x_valid)[:,-1]
        cv_result.append(roc_auc_score(y_valid,ret))
        CATB_model.append(CATBR)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
    cv_results_all.append(cv_result)
    cv_algorithms.append("CATB_Optuna")

In [35]:
# Score the hold-out set with the average of the five CatBoost fold models.
if CATB:
    test_features=test[sel_col].to_numpy()
    # last probability column of each fold model
    p_all=[
        CATB_model[model_idx].predict_proba(test_features)[:,-1]
        for model_idx in range(5)
    ]

    preds=np.mean(p_all,axis=0)
    ret=roc_auc_score(test.target,preds)
    print("CATB_Optuna",ret)
    test_results_all.append(ret)

# Gradient Boosting

In [36]:
# Train one scikit-learn GradientBoostingClassifier (default settings) per CV
# fold and record its validation AUC.
GB=True
if GB:
    GB_model=[]
    cv_result=[]
    for fold in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))

        # Rows tagged with `fold` validate, all other rows train.
        train_df=train[train.fold!=fold].reset_index(drop=True)
        valid_df=train[train.fold==fold].reset_index(drop=True)

        x_train=train_df[sel_col].to_numpy()
        x_valid=valid_df[sel_col].to_numpy()
        y_train=train_df[sel_tar].to_numpy()
        y_valid=valid_df[sel_tar].to_numpy()

        GBR=GradientBoostingClassifier()
        GBR.fit(X=x_train, y=y_train)
        # AUC is a ranking metric: score the last probability column (the
        # positive class in a binary problem), not hard labels, and pass the
        # arguments in roc_auc_score's (y_true, y_score) order.
        ret = GBR.predict_proba(x_valid)[:,-1]
        cv_result.append(roc_auc_score(y_valid,ret))
        GB_model.append(GBR)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
    cv_results_all.append(cv_result)
    cv_algorithms.append("GB")

In [37]:
# Score the hold-out set with the average of the five GradientBoosting models.
if GB:
    test_features=test[sel_col].to_numpy()
    # last probability column of each fold model
    p_all=[
        GB_model[model_idx].predict_proba(test_features)[:,-1]
        for model_idx in range(5)
    ]

    preds=np.mean(p_all,axis=0)
    ret=roc_auc_score(test.target,preds)
    print("GB",ret)
    test_results_all.append(ret)

# NN

In [38]:
import torch
import torch.nn as nn

In [39]:
##Dataset
import torch
import torch.nn as nn

class TDataset:
    """Map-style dataset pairing a 2-D input array with a 2-D target array.

    Despite the parameter names, ``dataset`` holds the model inputs and
    ``features`` holds the targets; both are indexed row-wise.
    """

    def __init__(self,dataset,features):
        self.dataset,self.feature=dataset,features

    def __len__(self):
        # one sample per row of the input array
        n_rows=self.dataset.shape[0]
        return n_rows

    def __getitem__(self,item):
        """Return row ``item`` as float tensors under the keys "x" and "y"."""
        inputs=torch.tensor(self.dataset[item,:],dtype=torch.float)
        target=torch.tensor(self.feature[item,:],dtype=torch.float)
        return {"x":inputs,"y":target}

In [40]:
##Dataset
import torch
import torch.nn as nn

# NOTE(review): this notebook declares TDataset in two consecutive cells with
# the same body; the later declaration is the one in effect.
class TDataset:
    """Row-indexed pairing of an input array and a target array.

    ``dataset`` carries the model inputs, ``features`` carries the targets;
    both are expected to support ``array[row, :]`` indexing.
    """

    def __init__(self,dataset,features):
        self.feature=features
        self.dataset=dataset

    def __len__(self):
        rows,*_=self.dataset.shape
        return rows

    def __getitem__(self,item):
        """Fetch one sample as a dict of float tensors keyed "x" and "y"."""
        sample=dict(
            x=torch.tensor(self.dataset[item,:],dtype=torch.float),
            y=torch.tensor(self.feature[item,:],dtype=torch.float),
        )
        return sample

In [41]:
## Model
class TModel(nn.Module):
    """Fully connected network built from repeated Linear/BatchNorm/Dropout/ReLU stages.

    Args:
        num_features: width of the input vector.
        num_targets: width of the output (raw, un-activated) vector.
        num_layers: number of hidden stages before the output layer.
        hidden_size: width of every hidden stage.
        dropout: dropout probability used in every hidden stage.
    """

    def __init__(self,num_features,num_targets,num_layers,hidden_size,dropout):
        super().__init__()
        stages=[]
        in_width=num_features
        for _ in range(num_layers):
            stages+=[
                nn.Linear(in_width,hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.Dropout(dropout),
                nn.ReLU(),
            ]
            # only the very first stage sees the raw feature width
            in_width=hidden_size
        # output head always maps from the hidden width
        stages.append(nn.Linear(hidden_size,num_targets))

        self.model=nn.Sequential(*stages)

    def forward(self,x):
        """Run ``x`` through the stacked stages and return the raw outputs."""
        return self.model(x)

In [42]:
class Engine:
    """Thin training / evaluation driver around a binary-logit model.

    Batches are dicts with an "x" tensor (inputs) and, for training and
    validation, a "y" tensor (targets).

    Args:
        model: the ``nn.Module`` to drive.
        optimizer: optimizer stepped by ``train``; unused by ``validate`` and
            ``predict``.
        device: device every batch is moved to before the forward pass.
    """

    def __init__(self,model,optimizer,device):
        self.model=model
        self.optimizer=optimizer
        self.device=device

    def loss_fn(self,targets,outputs):
        """Binary cross-entropy on raw logits (``outputs``) against ``targets``."""
        return nn.BCEWithLogitsLoss()(outputs,targets)

    def train(self,data_loader):
        """Run one optimisation pass over ``data_loader``; return the mean batch loss."""
        self.model.train()
        final_loss=0
        for data in data_loader:
            self.optimizer.zero_grad()
            inputs=data["x"].to(self.device)
            targets=data["y"].to(self.device)
            outputs=self.model(inputs)
            loss=self.loss_fn(targets,outputs)
            loss.backward()
            self.optimizer.step()
            final_loss += loss.item()
        return final_loss/len(data_loader)

    def validate(self,data_loader):
        """Return the mean batch loss over ``data_loader`` without updating weights."""
        self.model.eval()
        final_loss=0
        # No gradients are needed for evaluation; disabling autograd avoids
        # building a graph for every batch.
        with torch.no_grad():
            for data in data_loader:
                inputs=data["x"].to(self.device)
                targets=data["y"].to(self.device)
                outputs=self.model(inputs)
                loss=self.loss_fn(targets,outputs)
                final_loss += loss.item()
        return final_loss/len(data_loader)

    def predict(self,data_loader):
        """Return a list with one NumPy array of raw model outputs per batch."""
        self.model.eval()
        final_predictions = []
        # Inference only: run the forward passes with autograd disabled.
        with torch.no_grad():
            for data in data_loader:
                inputs=data["x"].to(self.device)
                predictions = self.model(inputs)
                predictions = predictions.cpu()
                final_predictions.append(predictions.detach().numpy())
        return final_predictions

In [43]:
def train_fold(fold,df,sel_cos,sel_tar,save_model=False):
    """Train the MLP on every fold except ``fold`` and validate on ``fold``.

    Args:
        fold: value of ``df.fold`` used as the validation split.
        df: frame containing the feature columns, the target column and a
            ``fold`` column.
        sel_cos: feature column names.
        sel_tar: target column name.
        save_model: accepted for interface compatibility; it is not consulted,
            the best checkpoint is always written to ``model_fold_{fold}.bin``.

    Returns:
        The lowest validation loss reached before early stopping.
    """

    # Dataset
    train_df=df[df.fold!=fold].reset_index(drop=True)
    valid_df=df[df.fold==fold].reset_index(drop=True)

    x_train=train_df[sel_cos].to_numpy()
    x_valid=valid_df[sel_cos].to_numpy()
    # targets are reshaped to a column so they match the (batch, 1) model output
    y_train=train_df[sel_tar].to_numpy().reshape(-1,1)
    y_valid=valid_df[sel_tar].to_numpy().reshape(-1,1)

    # DataLoader
    train_dataset=TDataset(x_train,y_train)
    train_loader=torch.utils.data.DataLoader(
        train_dataset,batch_size=1024,num_workers=8,shuffle=True
    )
    valid_dataset=TDataset(x_valid,y_valid)
    valid_loader=torch.utils.data.DataLoader(
        valid_dataset,batch_size=1024,num_workers=8,shuffle=False
    )
    
    # Model,Optimizer, scheduler, engine
    model=TModel(
        num_features=x_train.shape[1],
        num_targets=1,#y_train.shape[1],
        num_layers=5,
        hidden_size=2048,
        dropout=0.3        
    )
    
    device="cuda" if torch.cuda.is_available() else "cpu"
    print(device,f"inputs:{x_train.shape[1]}, targets:{1}")
    model.to(device)
    optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
    # The scheduler is stepped with the validation *loss*, which must go down,
    # so the plateau detector has to run in "min" mode.
    scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,patience=3,threshold=1e-5,mode="min",verbose=True
    )

    engine=Engine(model,optimizer,device)
    best_loss=np.inf
    early_stopping=10
    early_stopping_cnt=0
    EPOCH=300
    for epoch in range(EPOCH):
        train_loss=engine.train(train_loader)
        valid_loss=engine.validate(valid_loader)
        scheduler.step(valid_loss)

        if valid_loss<best_loss:
            # new best epoch: checkpoint the weights and reset the patience counter
            best_loss=valid_loss
            torch.save(model.state_dict(),f"model_fold_{fold}.bin")
            print(f"fold={fold}, epoch={epoch}, train_loss={train_loss:.6f}, valid_loss={valid_loss:.6f}")    
            early_stopping_cnt=0
        else:
            early_stopping_cnt+=1
        # stop once more than `early_stopping` consecutive epochs failed to improve
        if early_stopping_cnt>early_stopping:
            break

    print(f"fold={fold}, best val loss={best_loss}")
    return best_loss

In [44]:
def predict_fold(fold,df,sel_col,sel_tar):
    """Load the checkpoint saved for ``fold`` and predict every row of ``df``.

    Args:
        fold: fold number whose ``./model_fold_{fold}.bin`` checkpoint is loaded.
        df: frame containing the feature columns.
        sel_col: feature column names.
        sel_tar: unused; kept so the signature mirrors ``train_fold``.

    Returns:
        Array of shape (len(df), 1) with the raw model outputs (logits; no
        sigmoid is applied).
    """
    x_test=df[sel_col].to_numpy()
    # TDataset requires targets; zeros act as a placeholder and are never read
    # by Engine.predict.
    y_test=np.zeros((df.shape[0],1))
    test_dataset=TDataset(x_test,y_test)
    test_loader=torch.utils.data.DataLoader(
        test_dataset,batch_size=1024,num_workers=8,shuffle=False
    )

    # Architecture must match the one used when the checkpoint was written.
    model=TModel(
        num_features=x_test.shape[1],
        num_targets=1,#y_test.shape[1],
        num_layers=5,
        hidden_size=2048,
        dropout=0.3        
    )
    
    device="cuda" if torch.cuda.is_available() else "cpu"
    print(device,f"inputs:{x_test.shape[1]}, targets:{1}")

    model_save_path=f"./model_fold_{fold}.bin"
    # map_location lets a checkpoint written on a GPU be restored on a
    # CPU-only host as well.
    model.load_state_dict(torch.load(model_save_path,map_location=device))
    model=model.to(device)
    
    engine=Engine(model,None,device)
    preds=engine.predict(test_loader)
    # per-batch arrays -> one (n_rows, 1) array
    preds=np.vstack(preds)
    return preds

In [45]:
# Train the MLP on each of the five CV folds. Note that train_fold returns the
# best validation *loss*, so the values stored here are losses, not AUCs.
cv_results=[]
if NN:
    for fold_idx in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))
        fold_best=train_fold(fold_idx,train,sel_col,sel_tar,True)
        cv_results.append(fold_best)

    print(datetime.datetime.now().strftime("%H:%M:%S"))
    cv_algorithms.append("NN")
    cv_results_all.append(cv_results)

In [46]:
# Score the hold-out set with the average of the five MLP fold models.
if NN:
    p_all=[]
    for i in range(5):
        print(datetime.datetime.now().strftime("%H:%M:%S"))
        # list.append returns None, so its result is not bound to anything.
        p_all.append(predict_fold(i,test,sel_col,sel_tar))
    print(datetime.datetime.now().strftime("%H:%M:%S"))
    # Each fold yields an (n_rows, 1) array of raw logits; average across
    # folds and flatten to the 1-D score vector roc_auc_score expects.
    # AUC only depends on the ranking, so no sigmoid is required.
    preds=np.mean(p_all,axis=0).ravel()
    ret=roc_auc_score(test.target,preds)
    print("NN",ret)
    test_results_all.append(ret)

In [47]:
# One row per algorithm: hold-out score next to the mean of its per-fold CV
# values (row-wise mean over the folds).
df=pd.DataFrame(
    {
        "algorithms":cv_algorithms,
        "test_ret":test_results_all,
        "cv_ret":np.mean(cv_results_all,axis=1),
    }
)
df