In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [2]:
## Very very basic data ingest

# Read the competition train / test CSVs and stack them row-wise so that the
# encoding and scaling below are applied to both in a single pass.
ds_tr = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
ds_pr = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
dataset = pd.concat([ds_tr, ds_pr], axis=0)

# Partition the training columns by dtype: object columns are categorical,
# everything else is numerical except the 'id' and 'target' columns.
cat_col, num_col = [], []
for col in ds_tr.columns:
    if np.dtype(ds_tr[col]) == 'object':
        cat_col.append(col)
    elif col not in ['id', 'target']:
        num_col.append(col)

print(f'Categorical is {cat_col}')
print(f'Numerical is {num_col}')

# Categorical encoding: one-hot with the first level of each column dropped.
# (An earlier variant label-encoded every categorical column with
# LabelEncoder.fit_transform instead; the import is kept for that variant.)
from sklearn.preprocessing import LabelEncoder
dataset = pd.get_dummies(dataset, columns=cat_col, drop_first=True)

# Scaling: standardize the numerical columns.
# NOTE(review): the scaler is fitted on the stacked train + test frame, so the
# test rows contribute to the scaling statistics - confirm this is intended.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dataset[num_col] = scaler.fit_transform(dataset[num_col])

# Split the stacked frame back apart: rows with a target are used for training,
# rows with a null target (presumably the test rows) are the ones to predict.
has_target = dataset['target'].notnull()
ds_train = dataset.loc[has_target, :]
ds_predict = dataset.loc[~has_target, :]

y_train = ds_train['target']
X_train = ds_train.drop(columns=['id', 'target'])

# Keep the ids of the prediction rows for the submission file before dropping them.
id = ds_predict['id']
ds_predict = ds_predict.drop(columns=['id', 'target'])

Categorical is ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18']
Numerical is ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10']


In [3]:
## Visualize dataset
# NOTE: the plotting code in this cell is disabled by wrapping it in a
# triple-quoted string, so the cell only evaluates that string and the notebook
# echoes it back as the cell output.
# If enabled, it would draw one histogram per categorical column on a 5x4 grid
# of axes and one `Series.plot` per numerical column on a 3x4 grid, both taken
# from the raw training frame `ds_tr`.
# NOTE(review): the inner "use scatter" comment does not match the code, which
# calls `.plot(...)` without a `kind` - confirm which chart was intended.
# In categorical, use histogram
'''
fig, axes = plt.subplots(5,4, figsize=(20,10))
for i, cat in enumerate(cat_col):
    row = int(i/4)
    col = int(i%4)
    axes[row, col].set_title(cat)
    plt.tight_layout()
    ds_tr[cat].hist(ax=axes[row, col])

# In numerical, use scatter
fig, axes = plt.subplots(3,4, figsize=(20,10))
for i, num in enumerate(num_col):
    row = int(i/4)
    col = int(i%4)
    axes[row, col].set_title(num)
    plt.tight_layout()
    ds_tr[num].plot(ax=axes[row, col])
'''

'\nfig, axes = plt.subplots(5,4, figsize=(20,10))\nfor i, cat in enumerate(cat_col):\n    row = int(i/4)\n    col = int(i%4)\n    axes[row, col].set_title(cat)\n    plt.tight_layout()\n    ds_tr[cat].hist(ax=axes[row, col])\n\n# In numerical, use scatter\nfig, axes = plt.subplots(3,4, figsize=(20,10))\nfor i, num in enumerate(num_col):\n    row = int(i/4)\n    col = int(i%4)\n    axes[row, col].set_title(num)\n    plt.tight_layout()\n    ds_tr[num].plot(ax=axes[row, col])\n'

# SMOTE 
To address class imbalance, try oversampling with [SMOTENC](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTENC.html), the SMOTE variant for data that mixes categorical and continuous features.

In [4]:
#print(f'Original X shape = {X_train.shape}')
#sm = SMOTE()
#X_train, y_train = sm.fit_resample(X_train,y_train)
#print(f'SMOTED X shape = {X_train.shape}')

#from sklearn.model_selection import train_test_split
#X_tr, X_te, y_tr, y_te = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

# Lightgbm Classification

In [5]:
def stratified_lgb(X, y, params):
    """Return the mean ROC-AUC of a LightGBM classifier over 5 stratified folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned row-for-row with ``X``; used both for stratification
        and as the training / evaluation label.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.

    Returns
    -------
    float
        Mean of the per-fold AUC values, each computed on that fold's
        held-out part from the predicted probability of class 1.
    """
    # Fixed seed so every parameter set is scored on the same folds.
    kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)
    auc = []  # AUC of each fold
    for tr_idx, te_idx in kf.split(X, y):
        # Index the arguments, not the notebook-level X_train / y_train, so the
        # function scores whatever data it is actually given.
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        lgb_classifier = lgb.LGBMClassifier(**params)
        # The held-out part doubles as the early-stopping set.
        # NOTE(review): the `verbose` / `early_stopping_rounds` fit keywords are
        # no longer accepted by LightGBM >= 4.0; switch to the callback API
        # when upgrading.
        lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=50)

        auc.append(roc_auc_score(y_te, lgb_classifier.predict_proba(X_te)[:, 1]))
    return np.mean(auc)


## LightGBM Classification

import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score

def objective(trial):
    """Optuna objective: sample LightGBM hyper-parameters and score them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        The value returned by ``stratified_lgb`` for the notebook-level
        ``X_train`` / ``y_train`` with the sampled parameters.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; names and ranges are unchanged, and `log=True`
    # keeps the log-scaled sampling of the regularisation terms and the
    # learning rate.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'n_estimators': 100,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True),
    }

    return stratified_lgb(X_train, y_train, params)
    
# Search for the hyper-parameters that maximize the value returned by `objective`.
study = optuna.create_study(direction='maximize')
# Run 50 trials; n_jobs=-1 lets Optuna run trials concurrently, which is why
# the log below reports trials finishing out of order.
study.optimize(objective, n_trials=50, n_jobs=-1)
# `best_params` holds only the values sampled through `trial.suggest_*`; the
# fixed entries of `params` ('objective', 'metric', 'n_estimators') are not
# part of it.
lgb_best = study.best_params
print(lgb_best)

[32m[I 2021-03-29 01:26:05,166][0m A new study created in memory with name: no-name-651845ab-46d2-4b89-8160-b4b15ecd2e9b[0m
[32m[I 2021-03-29 01:28:17,064][0m Trial 2 finished with value: 0.8797207241246635 and parameters: {'lambda_l1': 9.58944918590701e-08, 'lambda_l2': 3.3653481133533162e-06, 'num_leaves': 75, 'feature_fraction': 0.6496492936110181, 'bagging_fraction': 0.5357231152991182, 'bagging_freq': 6, 'min_child_samples': 75, 'learning_rate': 0.009938651779133974}. Best is trial 2 with value: 0.8797207241246635.[0m
[32m[I 2021-03-29 01:28:18,453][0m Trial 0 finished with value: 0.8779823447492283 and parameters: {'lambda_l1': 7.519446170888107e-05, 'lambda_l2': 0.007135577830511128, 'num_leaves': 75, 'feature_fraction': 0.45185068231301945, 'bagging_fraction': 0.5866756234925776, 'bagging_freq': 5, 'min_child_samples': 44, 'learning_rate': 5.191557968214286e-06}. Best is trial 2 with value: 0.8797207241246635.[0m
[32m[I 2021-03-29 01:28:34,555][0m Trial 3 finished wi

[32m[I 2021-03-29 01:42:37,118][0m Trial 19 finished with value: 0.8831640074863932 and parameters: {'lambda_l1': 3.868948179783203, 'lambda_l2': 1.4972254547227202e-08, 'num_leaves': 254, 'feature_fraction': 0.4032592985554663, 'bagging_fraction': 0.9972600294460043, 'bagging_freq': 2, 'min_child_samples': 80, 'learning_rate': 4.1838721275426567e-05}. Best is trial 16 with value: 0.8848299082314405.[0m
[32m[I 2021-03-29 01:44:00,149][0m Trial 22 finished with value: 0.8787446336641359 and parameters: {'lambda_l1': 7.99264540346057, 'lambda_l2': 3.482799475540851e-07, 'num_leaves': 226, 'feature_fraction': 0.5713812001681776, 'bagging_fraction': 0.4264103610172441, 'bagging_freq': 6, 'min_child_samples': 83, 'learning_rate': 0.0006986042141721647}. Best is trial 16 with value: 0.8848299082314405.[0m
[32m[I 2021-03-29 01:44:04,636][0m Trial 23 finished with value: 0.8782965324150576 and parameters: {'lambda_l1': 9.923796221769638, 'lambda_l2': 3.8383467101759544e-07, 'num_leaves

[32m[I 2021-03-29 01:57:22,404][0m Trial 40 finished with value: 0.8853090221461077 and parameters: {'lambda_l1': 0.0011251876960643242, 'lambda_l2': 8.690595033370834e-06, 'num_leaves': 208, 'feature_fraction': 0.4476416840191716, 'bagging_fraction': 0.5218788661531341, 'bagging_freq': 5, 'min_child_samples': 87, 'learning_rate': 0.009770872878458323}. Best is trial 40 with value: 0.8853090221461077.[0m
[32m[I 2021-03-29 01:58:54,966][0m Trial 41 finished with value: 0.8844765724190399 and parameters: {'lambda_l1': 0.0013189109034416344, 'lambda_l2': 5.115625114318912e-06, 'num_leaves': 208, 'feature_fraction': 0.5218093563296895, 'bagging_fraction': 0.5055973573589042, 'bagging_freq': 4, 'min_child_samples': 86, 'learning_rate': 0.00803355602475131}. Best is trial 40 with value: 0.8853090221461077.[0m
[32m[I 2021-03-29 01:58:57,602][0m Trial 43 finished with value: 0.8809523310600589 and parameters: {'lambda_l1': 0.0014001311833077017, 'lambda_l2': 7.73991240548902e-05, 'num_

{'lambda_l1': 0.0011251876960643242, 'lambda_l2': 8.690595033370834e-06, 'num_leaves': 208, 'feature_fraction': 0.4476416840191716, 'bagging_fraction': 0.5218788661531341, 'bagging_freq': 5, 'min_child_samples': 87, 'learning_rate': 0.009770872878458323}


In [6]:
# 2021/03/25
# lgb_best = {'lambda_l1': 1.1669869035375718e-08, 'lambda_l2': 7.3221393789428e-07, 'num_leaves': 240, 'feature_fraction': 0.4510517074106803, 'bagging_fraction': 0.6612420891081325, 'bagging_freq': 1, 'min_child_samples': 28, 'learning_rate': 0.00992481068796537}

# 2021/3/26 // n_estimators = 50, LabelEncoding
# lgb_best = {'lambda_l1': 4.302940448383378e-06, 'lambda_l2': 1.077771617437953e-08, 'num_leaves': 246, 'feature_fraction': 0.42965919004382874, 'bagging_fraction': 0.9224775786038552, 'bagging_freq': 6, 'min_child_samples': 59, 'learning_rate': 0.007573090063045077}
# update best score 

# 2021/3/26 // n_estimators = 50, OneHotEncoding

In [7]:
# NOTE: this n_estimators sweep is disabled by wrapping it in a triple-quoted
# string, so the cell only evaluates that string and the notebook echoes it
# back as the cell output.
# If enabled, it would set each candidate tree count in `lgb_best`, score it
# with `stratified_lgb(X_train, y_train, lgb_best)`, print the score, and
# finally store the best-scoring count in `lgb_best['n_estimators']`.
'''
Estimators=[80,100,120,150,180,200,210,220,250,280,300,350,400,500,800,1000,1200,1500,2000,2500, 3000, 4000, 5000]
optimized_est = 0
auc_status = 0
for i in Estimators:
    lgb_best['n_estimators']=i
    auc = stratified_lgb(X_train, y_train, lgb_best)
    if auc > auc_status:
        auc_status = auc
        optimized_est = i
    print("\n\n For Estimators = {} \t  AUC Score : {} \n\n".format(i,auc))

print(f'Best estimator = {optimized_est}, auc = {auc_status}')
lgb_best['n_estimators'] = optimized_est
'''

'\nEstimators=[80,100,120,150,180,200,210,220,250,280,300,350,400,500,800,1000,1200,1500,2000,2500, 3000, 4000, 5000]\noptimized_est = 0\nauc_status = 0\nfor i in Estimators:\n    lgb_best[\'n_estimators\']=i\n    auc = stratified_lgb(X_train, y_train, lgb_best)\n    if auc > auc_status:\n        auc_status = auc\n        optimized_est = i\n    print("\n\n For Estimators = {} \t  AUC Score : {} \n\n".format(i,auc))\n\nprint(f\'Best estimator = {optimized_est}, auc = {auc_status}\')\nlgb_best[\'n_estimators\'] = optimized_est\n'

# CatBoost Classification
* category columnsをfit時に選択すると、時間むっちゃかかる
* SMOTENC + CatBoostでは、 "Best is trial 45 with value: 0.13424544780540504." の程度

In [8]:
## CatBoost Classification
'''
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

def objective(trial):
    params = {
        #'iterations' : trial.suggest_int('iterations', 50, 3000),
        'iterations' : 1000,
        'loss_function': 'CrossEntropy',
        'depth' : trial.suggest_int('depth', 4, 10),                                       
        'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-6, 0.1),               
        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), 
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50)
    }

    # 学習
    model = CatBoostClassifier(**params)
    
    model.fit(X=X_tr, y=y_tr, eval_set=[(X_te, y_te)], use_best_model=True, early_stopping_rounds=50, verbose=False)
    y_pred = model.predict(X_te)
    
    return 1 - roc_auc_score(y_te, y_pred)
    
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)

study = optuna.create_study()
study.optimize(objective, n_trials=100, n_jobs=-1)
cat_best = study.best_params
print(cat_best)
'''

"\nimport optuna\nfrom catboost import CatBoostClassifier\nfrom sklearn.metrics import roc_auc_score\n\ndef objective(trial):\n    params = {\n        #'iterations' : trial.suggest_int('iterations', 50, 3000),\n        'iterations' : 1000,\n        'loss_function': 'CrossEntropy',\n        'depth' : trial.suggest_int('depth', 4, 10),                                       \n        'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-6, 0.1),               \n        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       \n        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), \n        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),\n        'od_wait' :trial.suggest_int('od_wait', 10, 50)\n    }\n\n    # 学習\n    model = CatBoostClassifier(**params)\n    \n    model.fit(X=X_tr, y=y_tr, eval_set=[(X_te, y_te)], use_best_model=True, early_stopping_rounds=50, verbose=False)\n    y_pred = 

In [9]:
## Predict
# Refit the tuned LightGBM configuration on each of 5 stratified folds and
# average the class-1 probabilities predicted for `ds_predict`.
lgb_best['n_estimators'] = 2000  # upper bound; early stopping can end training sooner
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=48)
preds = 0
n = 0  # not read anywhere below
for tr_idx, te_idx in kf.split(X_train, y_train):
    X_tr = X_train.iloc[tr_idx]
    y_tr = y_train.iloc[tr_idx]
    X_te = X_train.iloc[te_idx]
    y_te = y_train.iloc[te_idx]

    lgb_classifier = lgb.LGBMClassifier(**lgb_best)
    # The fold's held-out part is used only as the early-stopping set.
    lgb_classifier.fit(
        X_tr,
        y_tr,
        eval_set=[(X_te, y_te)],
        verbose=False,
        early_stopping_rounds=100,
    )

    # Each fold adds an equal 1/n_splits share of its predicted probability.
    fold_proba = lgb_classifier.predict_proba(ds_predict)[:, 1]
    preds += fold_proba / kf.n_splits


'''
predictor = CatBoostClassifier(**cat_best)
predictor.fit(X=X_tr, y=y_tr, eval_set=[(X_te, y_te)],cat_features=cat_col, 
               use_best_model=True, early_stopping_rounds=50, verbose=False)
    
y_pred = predictor(predict_ds)
'''


# Build the submission frame from the ids of the prediction rows and the
# fold-averaged class-1 probabilities, then write it without the index column.
# NOTE(review): the variable `id` shadows the Python builtin of the same name.
output = pd.DataFrame({'id': id, 'target': preds})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
