In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [2]:
## Very basic data ingest: load, encode, scale, then split back into train / predict
from sklearn.preprocessing import LabelEncoder, StandardScaler

ds_tr = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
ds_pr = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

# Stack train on top of test so both go through one shared encoding / scaling pass.
# Test rows have no 'target', which is what the split further down keys on.
dataset = pd.concat([ds_tr, ds_pr], axis=0)

# Object-typed columns are treated as categorical; everything else except the
# bookkeeping columns ('id', 'target') is treated as numerical.
cat_col = [name for name in ds_tr.columns if np.dtype(ds_tr[name]) == 'object']
num_col = [
    name for name in ds_tr.columns
    if name not in cat_col and name not in ['id', 'target']
]

print(f'Categorical is {cat_col}')
print(f'Numerical is {num_col}')

# Label encoding: the single encoder is re-fitted for every categorical column.
encoder = LabelEncoder()
for name in cat_col:
    dataset[name] = encoder.fit_transform(dataset[name])

# Standard scaling of the numerical columns (fitted on train + test together).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dataset[num_col])
dataset[num_col] = scaled_values

# Rows with a target are training data; rows without one are to be predicted.
has_target = dataset['target'].notnull()
ds_train = dataset.loc[has_target, :]
ds_predict = dataset.loc[~has_target, :]

X_train = ds_train.drop(columns=['id', 'target'])
y_train = ds_train['target']

# NOTE: `id` shadows the builtin; the name is kept because the prediction cell reads it.
id = ds_predict['id']
ds_predict = ds_predict.drop(columns=['id', 'target'])

Categorical is ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18']
Numerical is ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10']


In [3]:
## Visualize dataset (currently disabled)
# The plotting code below is wrapped in a triple-quoted string, so none of it is
# executed; the cell merely evaluates to that string. Remove the surrounding
# quotes to re-enable the plots.
# First loop: one histogram per categorical column on a 5x4 grid.
# Second loop: one line plot (Series.plot) per numerical column on a 3x4 grid.
'''
fig, axes = plt.subplots(5,4, figsize=(20,10))
for i, cat in enumerate(cat_col):
    row = int(i/4)
    col = int(i%4)
    axes[row, col].set_title(cat)
    plt.tight_layout()
    ds_tr[cat].hist(ax=axes[row, col])

# In numerical, use scatter
fig, axes = plt.subplots(3,4, figsize=(20,10))
for i, num in enumerate(num_col):
    row = int(i/4)
    col = int(i%4)
    axes[row, col].set_title(num)
    plt.tight_layout()
    ds_tr[num].plot(ax=axes[row, col])
'''

'\nfig, axes = plt.subplots(5,4, figsize=(20,10))\nfor i, cat in enumerate(cat_col):\n    row = int(i/4)\n    col = int(i%4)\n    axes[row, col].set_title(cat)\n    plt.tight_layout()\n    ds_tr[cat].hist(ax=axes[row, col])\n\n# In numerical, use scatter\nfig, axes = plt.subplots(3,4, figsize=(20,10))\nfor i, num in enumerate(num_col):\n    row = int(i/4)\n    col = int(i%4)\n    axes[row, col].set_title(num)\n    plt.tight_layout()\n    ds_tr[num].plot(ax=axes[row, col])\n'

# SMOTE
To address the class imbalance in the target, try [SMOTENC](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTENC.html), the SMOTE variant for datasets that mix categorical and continuous features.

In [4]:
# Disabled experiment: oversample the training set with SMOTE and report the
# shape before / after. Left commented out, so X_train / y_train are unchanged.
#print(f'Original X shape = {X_train.shape}')
#sm = SMOTE()
#X_train, y_train = sm.fit_resample(X_train,y_train)
#print(f'SMOTED X shape = {X_train.shape}')

# Disabled alternative to cross-validation: a single 80/20 hold-out split.
#from sklearn.model_selection import train_test_split
#X_tr, X_te, y_tr, y_te = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

# LightGBM Classification

In [5]:
def stratified_lgb(X, y, params):
    """Return the mean ROC AUC of a LightGBM classifier over 5 stratified folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Binary target aligned row-for-row with ``X``.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.

    Returns
    -------
    float
        Mean of the per-fold AUC scores, each computed on the fold that was
        held out (and also used as the early-stopping evaluation set).
    """
    kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)
    fold_aucs = []  # one AUC per fold
    for tr_idx, te_idx in kf.split(X, y):
        # Slice the arguments that were passed in, not the notebook-level
        # X_train / y_train, so the function works for any dataset.
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]

        lgb_classifier = lgb.LGBMClassifier(**params)
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit keywords are
        # version-dependent in LightGBM (newer releases expect callbacks) —
        # confirm against the installed version.
        lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=50)

        fold_aucs.append(roc_auc_score(y_te, lgb_classifier.predict_proba(X_te)[:, 1]))
    return np.mean(fold_aucs)


## LightGBM Classification

import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score

def objective(trial):
    """Optuna objective: cross-validated AUC of LightGBM for one sampled config.

    Samples regularisation, tree-shape, sub-sampling and learning-rate
    hyperparameters from ``trial`` (``n_estimators`` is fixed at 50 during the
    search) and returns the value of ``stratified_lgb`` on the notebook-level
    ``X_train`` / ``y_train``.

    ``suggest_float`` replaces the deprecated ``suggest_loguniform`` /
    ``suggest_uniform`` calls; ``log=True`` keeps the log-scaled ranges.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 50,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True),
    }

    return stratified_lgb(X_train, y_train, params)
    
# Hyperparameter search: 50 trials maximising the value returned by `objective`
# (mean cross-validated AUC); n_jobs=-1 lets Optuna run trials in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_jobs=-1,
    n_trials=50,
)

# Keep the winning configuration for the following cells.
lgb_best = study.best_params
print(lgb_best)

[32m[I 2021-03-26 07:39:54,744][0m A new study created in memory with name: no-name-5b1270e6-e7b9-4d62-8c7d-79e3eec58f2b[0m
[32m[I 2021-03-26 07:40:54,852][0m Trial 3 finished with value: 0.88104367434755 and parameters: {'lambda_l1': 0.17679212147643725, 'lambda_l2': 0.06011326497167807, 'num_leaves': 118, 'feature_fraction': 0.4745936990970153, 'bagging_fraction': 0.4020585792940372, 'bagging_freq': 6, 'min_child_samples': 93, 'learning_rate': 0.002200355986871634}. Best is trial 3 with value: 0.88104367434755.[0m
[32m[I 2021-03-26 07:41:04,723][0m Trial 2 finished with value: 0.8716613657646624 and parameters: {'lambda_l1': 0.00016767630546039302, 'lambda_l2': 0.054977729230618035, 'num_leaves': 99, 'feature_fraction': 0.9242526287338781, 'bagging_fraction': 0.7350678834162303, 'bagging_freq': 2, 'min_child_samples': 69, 'learning_rate': 9.21777917550574e-06}. Best is trial 3 with value: 0.88104367434755.[0m
[32m[I 2021-03-26 07:41:08,747][0m Trial 1 finished with value: 

[32m[I 2021-03-26 07:47:17,546][0m Trial 20 finished with value: 0.8819425652562412 and parameters: {'lambda_l1': 0.0014540397223706122, 'lambda_l2': 7.284091516389663e-07, 'num_leaves': 202, 'feature_fraction': 0.5779495377971744, 'bagging_fraction': 0.5625433046531221, 'bagging_freq': 7, 'min_child_samples': 12, 'learning_rate': 0.00012097287205047531}. Best is trial 13 with value: 0.8824073851333667.[0m
[32m[I 2021-03-26 07:47:28,540][0m Trial 21 finished with value: 0.8817686821352939 and parameters: {'lambda_l1': 0.0014392601804402569, 'lambda_l2': 2.3387328174926904e-06, 'num_leaves': 211, 'feature_fraction': 0.5884187091697821, 'bagging_fraction': 0.5524209366621688, 'bagging_freq': 5, 'min_child_samples': 15, 'learning_rate': 0.000160571688758693}. Best is trial 13 with value: 0.8824073851333667.[0m
[32m[I 2021-03-26 07:47:53,617][0m Trial 22 finished with value: 0.881850375322444 and parameters: {'lambda_l1': 0.0011338427599704632, 'lambda_l2': 2.0125571363422284e-06, 

[32m[I 2021-03-26 07:54:57,894][0m Trial 40 finished with value: 0.8840566273033597 and parameters: {'lambda_l1': 2.2655468467126363e-07, 'lambda_l2': 3.713466559754379e-05, 'num_leaves': 234, 'feature_fraction': 0.44535693689298406, 'bagging_fraction': 0.9852768378384139, 'bagging_freq': 4, 'min_child_samples': 68, 'learning_rate': 0.0032424758142770528}. Best is trial 32 with value: 0.8845157435741813.[0m
[32m[I 2021-03-26 07:55:11,133][0m Trial 41 finished with value: 0.8838540744209624 and parameters: {'lambda_l1': 1.6937691243280752e-07, 'lambda_l2': 1.0798626215152728e-05, 'num_leaves': 236, 'feature_fraction': 0.5045045322535331, 'bagging_fraction': 0.9904756719545027, 'bagging_freq': 4, 'min_child_samples': 73, 'learning_rate': 0.003468029616374865}. Best is trial 32 with value: 0.8845157435741813.[0m
[32m[I 2021-03-26 07:55:19,810][0m Trial 42 finished with value: 0.8839874323136077 and parameters: {'lambda_l1': 1.5666612318523767e-07, 'lambda_l2': 2.7931142865636927e-

{'lambda_l1': 4.302940448383378e-06, 'lambda_l2': 1.077771617437953e-08, 'num_leaves': 246, 'feature_fraction': 0.42965919004382874, 'bagging_fraction': 0.9224775786038552, 'bagging_freq': 6, 'min_child_samples': 59, 'learning_rate': 0.007573090063045077}


In [6]:
# Snapshots of previously found hyperparameters, kept commented out for reference.
# Un-commenting one of the assignments sets lgb_best directly instead of taking
# it from the search cell.

# 2021/03/25 — presumably the result of an earlier search run (not shown here); verify before reuse
# lgb_best = {'lambda_l1': 1.1669869035375718e-08, 'lambda_l2': 7.3221393789428e-07, 'num_leaves': 240, 'feature_fraction': 0.4510517074106803, 'bagging_fraction': 0.6612420891081325, 'bagging_freq': 1, 'min_child_samples': 28, 'learning_rate': 0.00992481068796537}

# 2021/3/26 // n_estimators = 50 — same values as the best_params printed by the search cell
# lgb_best = {'lambda_l1': 4.302940448383378e-06, 'lambda_l2': 1.077771617437953e-08, 'num_leaves': 246, 'feature_fraction': 0.42965919004382874, 'bagging_fraction': 0.9224775786038552, 'bagging_freq': 6, 'min_child_samples': 59, 'learning_rate': 0.007573090063045077}


In [7]:
# Sweep the number of boosting rounds with the tuned hyperparameters and keep
# the value that gives the highest cross-validated AUC.
Estimators = [80, 100, 120, 150, 180, 200, 210, 220, 250, 280, 300, 350, 400,
              500, 800, 1000, 1200, 1500, 2000, 2500, 3000, 4000, 5000]
optimized_est = 0   # best n_estimators seen so far
auc_status = 0      # AUC achieved by optimized_est
for n_est in Estimators:
    # Evaluate a copy: the shared lgb_best dict is only updated once the sweep
    # has finished, so interrupting this (slow) loop cannot leave it holding
    # whichever candidate happened to be running.
    candidate = {**lgb_best, 'n_estimators': n_est}
    auc = stratified_lgb(X_train, y_train, candidate)
    if auc > auc_status:
        auc_status = auc
        optimized_est = n_est
    print("\n\n For Estimators = {} \t  AUC Score : {} \n\n".format(n_est, auc))

print(f'Best estimator = {optimized_est}, auc = {auc_status}')
lgb_best['n_estimators'] = optimized_est



 For Estimators = 80 	  AUC Score : 0.8858311241691161 




 For Estimators = 100 	  AUC Score : 0.8861844593739638 






 For Estimators = 120 	  AUC Score : 0.8866119620301351 




 For Estimators = 150 	  AUC Score : 0.8871349033761028 






 For Estimators = 180 	  AUC Score : 0.8876797927695655 




 For Estimators = 200 	  AUC Score : 0.8879148212180686 




 For Estimators = 210 	  AUC Score : 0.888129413440694 






KeyboardInterrupt: 

# CatBoost Classification
* category columnsをfit時に選択すると、時間むっちゃかかる
* SMOTENC + CatBoostでは、 "Best is trial 45 with value: 0.13424544780540504." の程度

In [None]:
## CatBoost Classification (disabled)
# The whole experiment below is wrapped in a triple-quoted string, so nothing in
# it is executed.
# NOTE(review) before re-enabling:
#   - `X` and `y` passed to train_test_split are not defined in the visible
#     cells (the prepared data is named X_train / y_train) — confirm.
#   - the objective scores the output of `model.predict(...)` rather than
#     `predict_proba`, and returns 1 - AUC to a study created without an
#     explicit direction.
'''
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

def objective(trial):
    params = {
        #'iterations' : trial.suggest_int('iterations', 50, 3000),
        'iterations' : 1000,
        'loss_function': 'CrossEntropy',
        'depth' : trial.suggest_int('depth', 4, 10),                                       
        'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-6, 0.1),               
        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), 
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50)
    }

    # 学習
    model = CatBoostClassifier(**params)
    
    model.fit(X=X_tr, y=y_tr, eval_set=[(X_te, y_te)], use_best_model=True, early_stopping_rounds=50, verbose=False)
    y_pred = model.predict(X_te)
    
    return 1 - roc_auc_score(y_te, y_pred)
    
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)

study = optuna.create_study()
study.optimize(objective, n_trials=100, n_jobs=-1)
cat_best = study.best_params
print(cat_best)
'''

In [8]:
## Predict
# Train one LightGBM model per stratified fold and average the per-fold
# test-set probabilities into the submission file.

# Upper bound on boosting rounds; early stopping on the held-out fold may end
# training sooner.
lgb_best['n_estimators'] = 2000
kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)

preds = 0  # running average of P(target == 1) for every row of ds_predict
for tr_idx, te_idx in kf.split(X_train, y_train):
    X_tr, X_te = X_train.iloc[tr_idx], X_train.iloc[te_idx]
    y_tr, y_te = y_train.iloc[tr_idx], y_train.iloc[te_idx]

    lgb_classifier = lgb.LGBMClassifier(**lgb_best)
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit keywords are
    # version-dependent in LightGBM — confirm against the installed version.
    lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=50)

    # Each fold contributes an equal share of the final prediction.
    preds += lgb_classifier.predict_proba(ds_predict)[:, 1] / kf.n_splits

# `id` is the test-set id column saved by the data-ingest cell.
output = pd.DataFrame({'id': id, 'target': preds})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

SyntaxError: invalid syntax (<ipython-input-8-aa23e1b41e54>, line 2)