In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [11]:
## Very very basic data ingest
# Loads the competition CSVs, one-hot encodes the categorical features,
# standardises the numerical ones and produces:
#   X_train / y_train : model inputs for the labelled rows
#   ds_predict / id   : feature rows to score and their ids for the submission

ds_tr = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
#print(ds_tr.info())
ds_pr = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
#print(ds_pr.info())
# Stack train and test so one-hot encoding yields identical columns for both.
# The rows are separated again below by whether `target` is null.
dataset = pd.concat([ds_tr, ds_pr], axis=0)
#print(dataset.info())

# Split the feature columns by dtype. `id` and `target` are never features, so
# they are excluded from both lists (otherwise a non-numeric id/target would be
# one-hot encoded and disappear from `dataset`).
NON_FEATURE_COLS = ('id', 'target')
cat_col = []
num_col = []
for col in ds_tr.columns:
    if col in NON_FEATURE_COLS:
        continue
    # pandas' own dtype predicate also handles extension dtypes (string,
    # category, nullable ints), which np.dtype(series) cannot interpret.
    if pd.api.types.is_numeric_dtype(ds_tr[col]):
        num_col.append(col)
    else:
        cat_col.append(col)

print(f'Categorical is {cat_col}')
print(f'Numerical is {num_col}')

# categorical Encoding *****************
from sklearn.preprocessing import LabelEncoder
dataset = pd.get_dummies(dataset, columns=cat_col, drop_first=True)
# Alternative that was tried instead of one-hot encoding:
#encoder = LabelEncoder()
#for cat in cat_col:
#    dataset[cat] = encoder.fit_transform(dataset[cat])
# ********************************

# Scaling ************************
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
if num_col:
    # Fit on the training rows only so that test-set statistics do not leak
    # into the preprocessing, then apply the same transform to every row.
    scaler.fit(ds_tr[num_col])
    dataset[num_col] = scaler.transform(dataset[num_col])

# Labelled rows are the training set; rows without a target are to be predicted.
ds_train = dataset.loc[dataset['target'].notnull(),:]
ds_predict = dataset.loc[dataset['target'].isnull(),:]

y_train = ds_train['target']
X_train = ds_train.drop(['id','target'], axis=1)

# NOTE: `id` shadows the Python builtin; the name is kept because the
# submission cell further down reads it.
id = ds_predict['id']
ds_predict = ds_predict.drop(['id','target'], axis=1)

Categorical is ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18']
Numerical is ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10']


In [12]:
## Visualize dataset
# This cell is intentionally disabled: the plotting code below is wrapped in a
# triple-quoted string, so executing the cell only evaluates (and echoes) the
# string and draws nothing. Remove the surrounding ''' to re-enable it.
#
# What the disabled code would do:
#   * categorical columns: one histogram per column on a 5x4 grid of axes
#   * numerical columns:   one Series.plot() per column on a 3x4 grid of axes
#     NOTE(review): Series.plot() draws a line plot by default, not the
#     scatter plot the inner comment mentions.
# In categorical, use histogram
'''
fig, axes = plt.subplots(5,4, figsize=(20,10))
for i, cat in enumerate(cat_col):
    row = int(i/4)
    col = int(i%4)
    axes[row, col].set_title(cat)
    plt.tight_layout()
    ds_tr[cat].hist(ax=axes[row, col])

# In numerical, use scatter
fig, axes = plt.subplots(3,4, figsize=(20,10))
for i, num in enumerate(num_col):
    row = int(i/4)
    col = int(i%4)
    axes[row, col].set_title(num)
    plt.tight_layout()
    ds_tr[num].plot(ax=axes[row, col])
'''

'\nfig, axes = plt.subplots(5,4, figsize=(20,10))\nfor i, cat in enumerate(cat_col):\n    row = int(i/4)\n    col = int(i%4)\n    axes[row, col].set_title(cat)\n    plt.tight_layout()\n    ds_tr[cat].hist(ax=axes[row, col])\n\n# In numerical, use scatter\nfig, axes = plt.subplots(3,4, figsize=(20,10))\nfor i, num in enumerate(num_col):\n    row = int(i/4)\n    col = int(i%4)\n    axes[row, col].set_title(num)\n    plt.tight_layout()\n    ds_tr[num].plot(ax=axes[row, col])\n'

# SMOTE 
To address the class imbalance, try [SMOTENC](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTENC.html), the SMOTE variant that supports categorical features.

In [13]:
#print(f'Orignal X shape = {X_train.shape}')
#sm = SMOTE()
#X_train, y_train = sm.fit_resample(X_train,y_train)
#print(f'SMOTED X shape = {X_train.shape}')

#from sklearn.model_selection import train_test_split
#X_tr, X_te, y_tr, y_te = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

# Lightgbm Classification

In [14]:
def stratified_lgb(X, y, params):
    """Return the mean ROC-AUC of a LightGBM classifier over 5 stratified folds.

    Parameters
    ----------
    X : feature table supporting positional row selection via ``.iloc``
        (e.g. a pandas DataFrame).
    y : binary labels aligned with ``X`` and supporting ``.iloc``
        (e.g. a pandas Series).
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.

    Returns
    -------
    float
        Mean of the per-fold AUC scores, each computed on the held-out fold.
    """
    # Fixed seed so every parameter set is evaluated on the same splits.
    kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)
    auc = []   # AUC for each fold
    for tr_idx, te_idx in kf.split(X, y):
        # Slice the arguments that were passed in, not the notebook-level
        # X_train / y_train, so the function works for any dataset.
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        lgb_classifier = lgb.LGBMClassifier(**params)
        # The held-out fold doubles as the early-stopping validation set.
        # NOTE(review): newer LightGBM releases dropped the `verbose` and
        # `early_stopping_rounds` fit() keywords in favour of
        # callbacks=[lgb.early_stopping(50, verbose=False)] - confirm against
        # the installed version before upgrading.
        lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=50)

        auc.append(roc_auc_score(y_te, lgb_classifier.predict_proba(X_te)[:, 1]))
    return np.mean(auc)


## LightGBM Classification

import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score

def objective(trial):
    """Optuna objective: cross-validated AUC of LightGBM for one sampled parameter set.

    Three settings are held constant (binary objective, log-loss metric,
    50 estimators); the remaining ones are drawn from ``trial``. The resulting
    dict is scored with ``stratified_lgb`` on the notebook-level
    ``X_train`` / ``y_train`` and that score is returned.
    """
    # Settings that are not part of the search.
    fixed = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 50,
    }
    # Search space; the sampling order is the order of the entries below.
    searched = {
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-6, 1e-2),
    }
    candidate = {**fixed, **searched}

    return stratified_lgb(X_train, y_train, candidate)
    
# Run the hyper-parameter search: 50 trials, executed in parallel (n_jobs=-1),
# keeping the parameter set with the highest value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=50, n_jobs=-1)

# Best sampled parameters; reused by the prediction cell further down.
lgb_best = study.best_params
print(lgb_best)

[32m[I 2021-03-26 08:27:33,622][0m A new study created in memory with name: no-name-b9b1dcc6-0f46-458b-8202-90c2bc1f47a1[0m
[32m[I 2021-03-26 08:29:42,504][0m Trial 0 finished with value: 0.8750773784239871 and parameters: {'lambda_l1': 1.089323772752983e-08, 'lambda_l2': 1.407670295018942, 'num_leaves': 239, 'feature_fraction': 0.9747929673715144, 'bagging_fraction': 0.795869710235032, 'bagging_freq': 1, 'min_child_samples': 33, 'learning_rate': 2.398459252841212e-06}. Best is trial 0 with value: 0.8750773784239871.[0m
[32m[I 2021-03-26 08:29:44,837][0m Trial 2 finished with value: 0.8796773765772352 and parameters: {'lambda_l1': 0.0024031598388958167, 'lambda_l2': 2.1068341065325404e-08, 'num_leaves': 156, 'feature_fraction': 0.6696974472182409, 'bagging_fraction': 0.5545883495157834, 'bagging_freq': 4, 'min_child_samples': 60, 'learning_rate': 0.0023455767210478207}. Best is trial 2 with value: 0.8796773765772352.[0m
[32m[I 2021-03-26 08:29:49,436][0m Trial 1 finished wit

[32m[I 2021-03-26 08:39:35,547][0m Trial 20 finished with value: 0.8823554513316738 and parameters: {'lambda_l1': 0.05432039225799929, 'lambda_l2': 0.00011723115277175058, 'num_leaves': 222, 'feature_fraction': 0.4240833268936386, 'bagging_fraction': 0.7290971875939899, 'bagging_freq': 2, 'min_child_samples': 100, 'learning_rate': 0.0004678539891869662}. Best is trial 19 with value: 0.8841063396728458.[0m
[32m[I 2021-03-26 08:39:41,173][0m Trial 21 finished with value: 0.8832477753706499 and parameters: {'lambda_l1': 6.436127101378765e-05, 'lambda_l2': 8.801938815986766e-05, 'num_leaves': 229, 'feature_fraction': 0.42262642010906704, 'bagging_fraction': 0.6998733598063356, 'bagging_freq': 2, 'min_child_samples': 43, 'learning_rate': 0.00046205096972218925}. Best is trial 19 with value: 0.8841063396728458.[0m
[32m[I 2021-03-26 08:40:36,691][0m Trial 22 finished with value: 0.8826714744889237 and parameters: {'lambda_l1': 9.581407804524492e-05, 'lambda_l2': 9.045269178321737e-05,

[32m[I 2021-03-26 08:50:30,143][0m Trial 40 finished with value: 0.882389657317766 and parameters: {'lambda_l1': 2.402821583193519, 'lambda_l2': 8.068509406692567e-06, 'num_leaves': 242, 'feature_fraction': 0.504383704797179, 'bagging_fraction': 0.7367554729019027, 'bagging_freq': 4, 'min_child_samples': 86, 'learning_rate': 0.002384564783787829}. Best is trial 37 with value: 0.8842380892055284.[0m
[32m[I 2021-03-26 08:50:45,264][0m Trial 41 finished with value: 0.8817213903276272 and parameters: {'lambda_l1': 0.44292458652424627, 'lambda_l2': 1.6225876977448063e-05, 'num_leaves': 244, 'feature_fraction': 0.6515188939236999, 'bagging_fraction': 0.7554943698077549, 'bagging_freq': 4, 'min_child_samples': 62, 'learning_rate': 0.0022065132724119092}. Best is trial 37 with value: 0.8842380892055284.[0m
[32m[I 2021-03-26 08:51:58,729][0m Trial 43 finished with value: 0.8783628355772206 and parameters: {'lambda_l1': 0.5031248331992156, 'lambda_l2': 4.297370820849554e-08, 'num_leaves'

{'lambda_l1': 0.015336635878538896, 'lambda_l2': 1.104433169409131e-08, 'num_leaves': 241, 'feature_fraction': 0.4473252757344324, 'bagging_fraction': 0.8336995710327374, 'bagging_freq': 3, 'min_child_samples': 87, 'learning_rate': 0.009934599427841553}


In [15]:
# 2021/03/25
# lgb_best = {'lambda_l1': 1.1669869035375718e-08, 'lambda_l2': 7.3221393789428e-07, 'num_leaves': 240, 'feature_fraction': 0.4510517074106803, 'bagging_fraction': 0.6612420891081325, 'bagging_freq': 1, 'min_child_samples': 28, 'learning_rate': 0.00992481068796537}

# 2021/3/26 // n_estimators = 50, LabelEncoding
# lgb_best = {'lambda_l1': 4.302940448383378e-06, 'lambda_l2': 1.077771617437953e-08, 'num_leaves': 246, 'feature_fraction': 0.42965919004382874, 'bagging_fraction': 0.9224775786038552, 'bagging_freq': 6, 'min_child_samples': 59, 'learning_rate': 0.007573090063045077}
# update best score 

# 2021/3/26 // n_estimators = 50, OneHotEncoding

In [16]:
# Disabled experiment: the code below is wrapped in a triple-quoted string, so
# running this cell executes none of it.
#
# When enabled it sweeps `n_estimators` over the `Estimators` list, scores each
# value with stratified_lgb(X_train, y_train, lgb_best), prints the AUC per
# value, and finally stores the best-scoring value in
# lgb_best['n_estimators']. Note that it mutates `lgb_best` in place on every
# iteration.
'''
Estimators=[80,100,120,150,180,200,210,220,250,280,300,350,400,500,800,1000,1200,1500,2000,2500, 3000, 4000, 5000]
optimized_est = 0
auc_status = 0
for i in Estimators:
    lgb_best['n_estimators']=i
    auc = stratified_lgb(X_train, y_train, lgb_best)
    if auc > auc_status:
        auc_status = auc
        optimized_est = i
    print("\n\n For Estimators = {} \t  AUC Score : {} \n\n".format(i,auc))

print(f'Best estimator = {optimized_est}, auc = {auc_status}')
lgb_best['n_estimators'] = optimized_est
'''



 For Estimators = 80 	  AUC Score : 0.8852364752889266 




KeyboardInterrupt: 

# CatBoost Classification
* category columnsをfit時に選択すると、時間むっちゃかかる
* SMOTENC + CatBoostでは、 "Best is trial 45 with value: 0.13424544780540504." の程度

In [None]:
## CatBoost Classification
# Disabled experiment: everything below is inside a triple-quoted string, so
# running this cell executes none of it.
#
# When enabled it runs an Optuna search (100 trials, default direction, i.e.
# minimising) over CatBoostClassifier parameters on a single 70/30
# train_test_split, with `objective` returning 1 - ROC-AUC on the held-out part.
#
# NOTE(review): train_test_split is called with `X` and `y`, which are not
# defined in the cells visible here (the notebook uses X_train / y_train) -
# fix before re-enabling.
# NOTE(review): the AUC is computed from model.predict(X_te); if that returns
# hard class labels rather than scores, predict_proba(X_te)[:, 1] should be
# used instead - confirm against the CatBoost docs.
'''
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

def objective(trial):
    params = {
        #'iterations' : trial.suggest_int('iterations', 50, 3000),
        'iterations' : 1000,
        'loss_function': 'CrossEntropy',
        'depth' : trial.suggest_int('depth', 4, 10),                                       
        'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-6, 0.1),               
        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), 
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50)
    }

    # 学習
    model = CatBoostClassifier(**params)
    
    model.fit(X=X_tr, y=y_tr, eval_set=[(X_te, y_te)], use_best_model=True, early_stopping_rounds=50, verbose=False)
    y_pred = model.predict(X_te)
    
    return 1 - roc_auc_score(y_te, y_pred)
    
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)

study = optuna.create_study()
study.optimize(objective, n_trials=100, n_jobs=-1)
cat_best = study.best_params
print(cat_best)
'''

In [None]:
## Predict
# Train one LightGBM model per stratified fold with the tuned parameters and
# average the positive-class probabilities of the five models on the test rows.

# Allow up to 2000 trees; early stopping on the held-out fold ends training sooner.
lgb_best['n_estimators'] = 2000
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=48)
n = 0        # counter carried over from the original cell; not used below
preds = 0    # running average; becomes an array after the first fold
for fold_train_idx, fold_valid_idx in kf.split(X_train, y_train):
    X_tr, y_tr = X_train.iloc[fold_train_idx], y_train.iloc[fold_train_idx]
    X_te, y_te = X_train.iloc[fold_valid_idx], y_train.iloc[fold_valid_idx]

    lgb_classifier = lgb.LGBMClassifier(**lgb_best)
    lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=50)

    # Each fold contributes an equal share to the final prediction.
    fold_proba = lgb_classifier.predict_proba(ds_predict)[:, 1]
    preds += fold_proba/kf.n_splits


# Disabled CatBoost alternative (string literal, never executed).
'''
predictor = CatBoostClassifier(**cat_best)
predictor.fit(X=X_tr, y=y_tr, eval_set=[(X_te, y_te)],cat_features=cat_col, 
               use_best_model=True, early_stopping_rounds=50, verbose=False)
    
y_pred = predictor(predict_ds)
'''


# Write the submission file: one row per test id with the averaged probability.
output = pd.DataFrame({'id': id, 'target': preds})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")