In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the training CSV into a DataFrame.
train = pd.read_csv(filepath_or_buffer="../input/tsp-dataset/train_2106_0.05.csv")

In [3]:
# Set the row identifiers aside and remove the column from the training frame
# in one step (pop returns the column and deletes it).
train_id = train.pop("id")

In [4]:
# Replace the raw class labels with integer codes; `le` retains the fitted mapping.
le = LabelEncoder()
train["target"] = le.fit_transform(train["target"])

In [5]:
from sklearn.model_selection import StratifiedKFold

# Tag every row with the index (0-4) of the stratified fold in which it is held out.
train["fold"] = -1
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_col = train.columns.get_loc("fold")
# The splitter stratifies on y and needs X only for its length, so a placeholder is
# passed instead of an unseeded, shuffled copy of the whole frame (which was also
# misaligned with y). Rows are addressed by position because split() yields
# positional indices, not index labels.
for idx, (trn, vld) in enumerate(skf.split(X=np.zeros(len(train)), y=train["target"].values)):
    train.iloc[vld, fold_col] = idx

In [6]:
# Model inputs are the columns whose name contains "feature".
sel_features = list(filter(lambda name: "feature" in name, train.columns))

In [7]:
# Feature matrix and encoded target used by every model below.
X, y = train[sel_features], train["target"]

# EDA

In [8]:
# Class balance of the encoded target. The series is passed as `x=` explicitly:
# current seaborn releases take `data` as the first positional argument and
# require the plotted variable to be given by keyword.
sns.countplot(x=train.target)

In [9]:
# Column-wise skewness of the training frame, shown as the cell output.
skewValue = train.skew(axis="index")
skewValue

# Optuna

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,log_loss
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import optuna

# Master switch for the hyper-parameter searches: the Optuna study and plotting
# cells below only execute when this is True.
OPTUNA=False

In [11]:
def objective(trial, data=X, target=y):
    """Optuna objective for XGBoost: hold-out multi-class log loss.

    A fixed 80/20 split of (data, target) is made, an XGBClassifier is trained on
    the larger part with early stopping monitored on the smaller part, and the log
    loss of the predicted probabilities on that smaller part is returned.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: feature table; defaults to the notebook-level X (bound when this
            cell is executed).
        target: class labels; defaults to the notebook-level y (bound when this
            cell is executed).

    Returns:
        Log loss on the 20% hold-out split (lower is better).
    """
    x_fit, x_hold, y_fit, y_hold = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # The dict literal is evaluated top to bottom, so the trial.suggest_* calls
    # happen in the same order on every trial.
    search_space = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'use_label_encoder': False,
        'num_class': 9,
        # train on the GPU to speed up each trial
        'tree_method': 'gpu_hist',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical(
            'colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical(
            'learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]
        ),
        # upper bound only; early stopping below decides the effective number of trees
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
        # single-choice categorical: the seed is fixed but still recorded in the trial params
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    booster = xgb.XGBClassifier(**search_space)
    booster.fit(
        x_fit,
        y_fit,
        eval_set=[(x_hold, y_hold)],
        early_stopping_rounds=100,
        verbose=False,
    )
    hold_out_proba = booster.predict_proba(x_hold)
    return log_loss(y_hold, hold_out_proba)

In [12]:
#xgboost optuna
if OPTUNA:
    # The objective returns a log loss, so lower is better: the study must minimise.
    # With direction='maximize' Optuna would steer towards, and report as "best",
    # the worst-scoring hyper-parameters.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)
    print('Number of finished trials:', len(study.trials))
    print('Best_trial:', study.best_trial.params)

In [13]:
# Show the search history and the per-parameter slices of the study run above.
if OPTUNA:
    for make_figure in (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_slice,
    ):
        display(make_figure(study))

In [14]:
#lgbm optuna
def objective(trial,data=X,target=y):
    """Optuna objective for LightGBM: hold-out multi-class log loss.

    Note: this reuses the name `objective`, so once this cell has run it replaces
    the XGBoost objective defined earlier in the notebook.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: feature table; defaults to the notebook-level X (bound when this
            cell is executed).
        target: class labels; defaults to the notebook-level y (bound when this
            cell is executed).

    Returns:
        Log loss on the fixed 20% hold-out split (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)

    param = {
        'device':'gpu',
        'objective': 'multiclass',
        'metric': 'multi_logloss', 
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
        # LightGBM only applies `subsample` (bagging_fraction) when bagging is enabled
        # through a non-zero subsample_freq (bagging_freq); with the default of 0 the
        # sampled value above was silently ignored and the search dimension was a no-op.
        'subsample_freq': 1,
    }

    model = lgb.LGBMClassifier(**param)
    # Early stopping is monitored on the same hold-out split that is scored below.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)],early_stopping_rounds=100,verbose=False)
    preds=model.predict_proba(test_x)
    loss = log_loss(test_y,preds)
    return loss

In [15]:
# lightgbm optuna
if OPTUNA:
    # The objective returns a log loss, so lower is better: the study must minimise.
    # With direction='maximize' Optuna would steer towards, and report as "best",
    # the worst-scoring hyper-parameters.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=120)
    print('Number of finished trials:', len(study.trials))
    print('Best_trial2=', study.best_trial.params)

In [16]:
# Show the search history and the per-parameter slices of the study run above.
if OPTUNA:
    for make_figure in (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_slice,
    ):
        display(make_figure(study))

# Model

In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,VotingClassifier
import xgboost as xgb
import lightgbm as lgbm

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [18]:
# XGBoost classifier configured with hyper-parameters hard-coded from an Optuna search.
# NOTE(review): n_estimators is not set here, so the library default applies, whereas
# the search objective allowed up to 10000 trees with early stopping - confirm that
# this tree count is intended for such a small learning rate.
Best_trial = {
    "lambda": 0.05034796715262917,
    "alpha": 0.014356150099756269,
    "colsample_bytree": 0.4,
    "subsample": 0.5,
    "learning_rate": 0.018,
    "max_depth": 11,
    "random_state": 2020,
    "min_child_weight": 300,
}
XGBC = xgb.XGBClassifier(**Best_trial)

In [19]:
# optuna lgbm
# LightGBM model configured with hyper-parameters hard-coded from an Optuna search.
Best_trial2= {'max_depth': 15, 
              'learning_rate': 0.018, 
              'n_estimators': 1202, 
              'min_child_samples': 25, 
              'subsample': 0.9861890425507288}
# This is a classification task scored with neg_log_loss, which needs predict_proba;
# LGBMRegressor has no predict_proba and would treat the encoded class ids as a
# continuous target, so the classifier must be used.
LGBMC=lgb.LGBMClassifier(**Best_trial2)

In [20]:
random_state = 2
# Candidate models to compare. Order matters: the result table built later in the
# notebook labels the scores positionally.
classifiers = [
    RandomForestClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    xgb.XGBClassifier(),
    XGBC,
    lgbm.LGBMClassifier(),
    LGBMC,
]

In [None]:
import datetime

# 5-fold stratified cross-validation of every candidate. The scorer returns the
# negated log loss, so the sign is flipped to store plain log-loss values.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
for model in classifiers:
    # timestamp printed before each model so the per-model run time can be read off
    print(datetime.datetime.now())
    neg_scores = cross_val_score(model, X, y, scoring="neg_log_loss", cv=cv, n_jobs=4)
    cv_results.append(-neg_scores)

In [None]:
# Summarise each model's fold scores as mean and standard deviation.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]
cv_res = pd.DataFrame(
    {
        "CrossValMeans": cv_means,
        "CrossValerrors": cv_std,
        # labels are positional: they follow the order of the `classifiers` list
        "Algorithm": [
            "RandomForest",
            "GradientBoosting",
            "XGB",
            "XGB_Optuna",
            "LGBM",
            "LGBM_Optuna",
        ],
    }
)
print(cv_res)

In [None]:
# Horizontal bar chart of the mean cross-validated log loss per algorithm, with
# the fold standard deviation drawn as error bars.
f, ax = plt.subplots(1, 1, figsize=(12, 5))
# x/y are passed by keyword: current seaborn releases take `data` as the first
# positional argument and reject positional x/y.
sns.barplot(
    x="CrossValMeans",
    y="Algorithm",
    data=cv_res,
    palette="Set3",
    orient="h",
    xerr=cv_std,
    ax=ax,
)
ax.set_title("Cross validation scores")

In [None]:
# (stray keystrokes removed: the previous content of this cell was not valid Python
# and raised a SyntaxError on every run)