In [37]:
import numpy as np
import pandas as pd

# Load train and test data

In [38]:
# Read the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')

# Encode the string class labels as integers 0..3.
# Looking labels up through __getitem__ keeps the strict behaviour of the
# original lambda: an unexpected label raises KeyError instead of becoming NaN.
target_map = dict(zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'], range(4)))
train['target'] = train['target'].map(target_map.__getitem__)

# Keep the test ids aside for the submission file and remove them from the
# frame that is later passed to predict_proba.
id = test['id']
test = test.drop(columns=['id'])

train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,0,0,0,1,0,1,0,0,0,0,...,0,0,21,0,0,0,0,0,0,1
1,1,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,13,2,0,0
3,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,1,0,3
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


# Modeling

In [39]:
# Feature matrix: every column except the row id and the label.
X = train.drop(columns=['id', 'target'])
# Integer-encoded class label used as the training target.
y = train['target']
X.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,0,0,1,0,1,0,0,0,0,0,...,3,0,0,21,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,0,1,0,0,0,0,13,2,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [40]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
# Number of stratified folds shared by the tuning and the prediction loops.
n_splits = 5


def stratified_lgb(X, y, params):
    """Cross-validate a LightGBM classifier and return its mean log loss.

    The rows of ``X``/``y`` are split into ``n_splits`` shuffled, stratified
    folds (fixed ``random_state=42``). For every fold a fresh
    ``LGBMClassifier(**params)`` is fitted on the remaining rows, with the
    held-out fold passed as ``eval_set`` for early stopping (200 rounds), and
    the log loss of its predicted probabilities on that same held-out fold is
    recorded.

    Parameters
    ----------
    X : positional-indexable feature table (accessed via ``.iloc``).
    y : positional-indexable labels aligned with ``X`` (accessed via ``.iloc``).
    params : dict of keyword arguments forwarded to ``lgb.LGBMClassifier``.

    Returns
    -------
    The arithmetic mean of the per-fold log losses.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def fold_loss(fit_rows, held_out_rows):
        # Fit on one side of the split, then score on the other side.
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_held, y_held = X.iloc[held_out_rows], y.iloc[held_out_rows]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_fit,
            y_fit,
            eval_set=[(X_held, y_held)],
            verbose=False,
            early_stopping_rounds=200,
            eval_metric=['multi_logloss'],
        )
        return log_loss(y_held, model.predict_proba(X_held))

    per_fold = [
        fold_loss(fit_rows, held_out_rows)
        for fit_rows, held_out_rows in splitter.split(X, y)
    ]
    return np.mean(per_fold)

## LightGBM Classification

def objective(trial):
    """Optuna objective: sample LightGBM hyper-parameters and score them by CV.

    The fixed settings (multiclass objective, multi_logloss metric, 2000
    estimators) are combined with eight values sampled from ``trial`` and the
    resulting dict is evaluated with ``stratified_lgb`` on the notebook-level
    ``X`` and ``y``.

    ``suggest_float`` replaces the deprecated ``suggest_loguniform`` /
    ``suggest_uniform`` helpers; parameter names, bounds, log/linear scale and
    the order of the calls are unchanged, so the search space is the same.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` and ``suggest_int``
        (an ``optuna`` trial when called by ``study.optimize``).

    Returns
    -------
    The value returned by ``stratified_lgb`` (mean cross-validated log loss).
    """
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'n_estimators': 2000,
        # Regularisation strengths are searched on a log scale.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        # Column / row subsampling fractions are searched on a linear scale.
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-7, 1e-2, log=True),
    }

    return stratified_lgb(X, y, params)
    
# Seed the sampler so a Restart & Run All proposes the same trials again.
# TPESampler is what create_study uses by default; only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Trials run one at a time: with parallel trials the order in which results
# reach the sampler varies between runs, which defeats the seed.
study.optimize(objective, n_trials=100, n_jobs=1)
lgb_best = study.best_params
print(lgb_best)

[32m[I 2021-05-06 13:10:34,864][0m A new study created in memory with name: no-name-c21a6b9a-b712-4747-a19e-c9522f5b9846[0m
[32m[I 2021-05-06 13:10:52,503][0m Trial 0 finished with value: 1.117346377301225 and parameters: {'lambda_l1': 3.0435761424725743e-05, 'lambda_l2': 8.667415098179686e-08, 'num_leaves': 137, 'feature_fraction': 0.4039908983483346, 'bagging_fraction': 0.6381129811646222, 'bagging_freq': 5, 'min_child_samples': 15, 'learning_rate': 0.0061618822991274975}. Best is trial 0 with value: 1.117346377301225.[0m
[32m[I 2021-05-06 13:10:53,070][0m Trial 1 finished with value: 1.1185762430927795 and parameters: {'lambda_l1': 2.663572183191629e-05, 'lambda_l2': 7.561042197181775e-07, 'num_leaves': 162, 'feature_fraction': 0.9824554145691907, 'bagging_fraction': 0.7822713160084374, 'bagging_freq': 7, 'min_child_samples': 34, 'learning_rate': 1.8517549131250708e-06}. Best is trial 0 with value: 1.117346377301225.[0m


{'lambda_l1': 3.0435761424725743e-05, 'lambda_l2': 8.667415098179686e-08, 'num_leaves': 137, 'feature_fraction': 0.4039908983483346, 'bagging_fraction': 0.6381129811646222, 'bagging_freq': 5, 'min_child_samples': 15, 'learning_rate': 0.0061618822991274975}


# Predict

In [41]:
## Predict
# Reuse the tuned hyper-parameters and add the fixed settings that
# study.best_params does not carry; early stopping bounds the tree count.
lgb_best['n_estimators'] = 10000
lgb_best['objective'] = 'multiclass'
lgb_best['metric'] = 'multi_logloss'

kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

# Start the accumulator at zero inside this cell. Previously `y_pred` was never
# initialised, so a fresh kernel raised NameError and a re-run kept adding to
# stale values, corrupting the averaged probabilities.
y_pred = np.zeros((len(test), y.nunique()))

for tr_idx, te_idx in kf.split(X, y):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
    lgb_classifier = lgb.LGBMClassifier(**lgb_best)
    # NOTE(review): newer LightGBM releases appear to move `verbose` and
    # `early_stopping_rounds` from fit() to callbacks; confirm against the
    # installed version before upgrading.
    lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=500, eval_metric='multi_logloss')
    # Sum the per-fold class probabilities for the test rows.
    y_pred += lgb_classifier.predict_proba(test)

# Average over the fold models.
prob = y_pred / n_splits

output = pd.DataFrame({'id': id, 'Class_1': prob[:,0],'Class_2':prob[:,1] ,'Class_3':prob[:,2], 'Class_4':prob[:,3]})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
