# tabular-playground-series-mar-2021

https://www.kaggle.com/c/tabular-playground-series-mar-2021

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

%matplotlib inline

In [3]:
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, roc_curve
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_union, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
import xgboost as xgb
import lightgbm as lgb

In [4]:
import optuna
import time

In [5]:
from sklearn.utils import shuffle

### ----------------Load data----------------------

In [18]:
ds = pd.read_csv('train.csv', delimiter=',')

In [19]:
columns_numeric = ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 
                   'cont6', 'cont7', 'cont8', 'cont9', 'cont10']
columns_cat = [col for col in ds.drop(['id', 'target'], axis=1).columns if col not in columns_numeric]

### ------------Train Test---------------

In [20]:
# ds_train, ds_test = train_test_split(ds, test_size=0.3, random_state=42, shuffle=True)

In [21]:
# ds.shape, ds_train.shape, ds_test.shape, 

In [22]:
# ds_train.target.value_counts(normalize=True)

### ---------- Oversampling--------------

In [23]:
# print('Add rows with target == 1:', ds.target.value_counts()[0] - ds.target.value_counts()[1], 'rows')
# print('Or rows with target == 1 multiply by 2 (', ds.target.value_counts()[0] / ds.target.value_counts()[1], ')')

In [24]:
# def oversampling_with_noise(df, noise=[1.0, 1.0]):
#     add_rows_index = df[df.target == 1].index
#     add_rows = df.loc[add_rows_index]
#     add_rows[columns_numeric] = add_rows[columns_numeric] * noise[0]
    
#     add_number_rows = df[df.target == 0].shape[0] - (df[df.target == 1].shape[0] * 2)
    
#     index_rows = np.random.choice(add_rows_index, add_number_rows)
#     add_rows_two = df.loc[index_rows]
#     add_rows_two[columns_numeric] = add_rows_two[columns_numeric] * noise[1]
       
#     df = df.append(add_rows)
#     df = df.append(add_rows_two)
    
#     df = shuffle(df)
#     return df

In [25]:
# %%time
# ds_over = oversampling_with_noise(ds, noise=[0.98, 1.02])

In [26]:
# ds_over.target.value_counts(), ds_over.target.value_counts(normalize=True)

In [27]:
# sns.countplot(ds_over.target);

In [28]:
# ds_over.head(3)

### ----------metrics------------

In [29]:
def print_metrics(actual, predict, title=''):
    precision = precision_score(actual, predict)
    recall = recall_score(actual, predict)
    f1 = f1_score(actual, predict)
        
    print('-'*15, title, '-'*15)
    print(f'Precision - {precision}')
    print(f'Recall    - {recall}')
    print(f'F1 score  - {f1}')
    print('-'*35)

In [30]:
def plot_roc_auc_curve(actual, predict):
    fpr, tpr, threshold = roc_curve(actual, predict)
    roc_auc = roc_auc_score(actual, predict)
    plt.figure(figsize=(4, 3))
    plt.plot(fpr, tpr, c='b')
    plt.plot([0, 1], [0, 1], c='red')
    plt.title(f'ROC AUC {roc_auc}')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

### ---------------Transform--------------

In [31]:
ds_transf = ds.copy(deep=True)

In [32]:
ds_transf.cont9 = np.log1p(ds_transf.cont9)

In [33]:
#scaler = StandardScaler().fit(ds_over[columns_numeric])

In [34]:
#ds_transf[columns_numeric] = scaler.transform(ds_over[columns_numeric])

### Label Encoder

In [35]:
le = LabelEncoder()

In [36]:
for col in columns_cat:
    ds_transf[col] = le.fit_transform(ds_transf[col])

In [37]:
ds_transf

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,0,8,0,1,1,33,0,44,54,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.675495,0.665915,0
1,1,0,8,0,0,4,33,8,48,3,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.656175,0.493729,0
2,2,0,10,0,0,4,33,0,30,38,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.256730,0.549452,0
3,3,0,10,0,2,4,33,0,50,3,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.565495,0.934242,0
4,4,0,8,6,1,4,33,2,32,54,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.305338,0.328960,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,0,13,5,0,4,45,0,19,48,...,0.662428,0.671927,0.390566,0.145840,0.262767,0.514248,0.519340,0.480842,0.688007,0
299996,499995,0,10,0,0,6,33,0,36,4,...,0.821657,0.620356,0.384891,0.735879,0.547731,0.726653,0.470575,0.243529,0.638939,0
299997,499996,0,6,12,0,7,33,2,37,43,...,0.407037,0.232436,0.832482,0.810663,0.596939,0.308821,0.373997,0.417410,0.452144,1
299998,499997,1,7,0,3,1,33,0,1,23,...,0.808045,0.630708,0.346898,0.735147,0.563488,0.609836,0.680430,0.276459,0.335822,0


### -------------RF-----------------

#### -------------OPTUNA-------------

In [45]:
scores = []
n = 1
def objective(trial, data=ds_transf.drop(['id', 'target'], axis=1), target=ds_transf.target):
    print(time.strftime('%X'), end=' ')
    params = {
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        
    }
    
    model = RandomForestClassifier(**params, n_estimators=500, n_jobs=-1, random_state=33)
    score = cross_val_score(estimator=model, X=data, y=target, scoring='roc_auc', cv=4, n_jobs=-1)
    scores.append(score)
    print(time.strftime('%X'), score, end='|')
    #n += 1
    return np.mean(score)

In [46]:
%%time
study = optuna.create_study(direction='maximize', study_name='RF')
study.optimize(objective, n_trials=10)

13:53:59 14:04:54 [0.88973128 0.88900929 0.88693961 0.88763142]|

[32m[I 2021-03-28 14:04:55,197][0m Finished trial#0 resulted in value: 0.888327898754066. Current best value is 0.888327898754066 with parameters: {'criterion': 'entropy', 'max_depth': 36, 'min_samples_split': 2, 'min_samples_leaf': 13, 'max_features': 'log2', 'bootstrap': True}.[0m


14:04:55 

[32m[I 2021-03-28 14:10:00,234][0m Finished trial#1 resulted in value: 0.8740479665615539. Current best value is 0.888327898754066 with parameters: {'criterion': 'entropy', 'max_depth': 36, 'min_samples_split': 2, 'min_samples_leaf': 13, 'max_features': 'log2', 'bootstrap': True}.[0m


14:10:00 [0.87589968 0.87411964 0.87291858 0.87325396]|14:10:00 

[32m[I 2021-03-28 15:02:32,475][0m Finished trial#2 resulted in value: 0.8528648231098914. Current best value is 0.888327898754066 with parameters: {'criterion': 'entropy', 'max_depth': 36, 'min_samples_split': 2, 'min_samples_leaf': 13, 'max_features': 'log2', 'bootstrap': True}.[0m


15:02:32 [0.85331919 0.85448566 0.85145225 0.85220218]|15:02:32 15:51:55 [0.88254338 0.88293972 0.88065622 0.880443  ]|

[32m[I 2021-03-28 15:51:57,906][0m Finished trial#3 resulted in value: 0.8816455792195244. Current best value is 0.888327898754066 with parameters: {'criterion': 'entropy', 'max_depth': 36, 'min_samples_split': 2, 'min_samples_leaf': 13, 'max_features': 'log2', 'bootstrap': True}.[0m


15:51:57 16:02:17 [0.8900269  0.88914835 0.88737861 0.88790467]|

[32m[I 2021-03-28 16:02:18,209][0m Finished trial#4 resulted in value: 0.8886146302291201. Current best value is 0.8886146302291201 with parameters: {'criterion': 'entropy', 'max_depth': 21, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}.[0m


16:02:18 

[32m[I 2021-03-28 16:20:48,031][0m Finished trial#5 resulted in value: 0.8889100681390897. Current best value is 0.8889100681390897 with parameters: {'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}.[0m


16:20:47 [0.89033287 0.889608   0.88755344 0.88814596]|16:20:48 

[32m[I 2021-03-28 16:30:01,020][0m Finished trial#6 resulted in value: 0.8877497769133187. Current best value is 0.8889100681390897 with parameters: {'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}.[0m


16:30:00 [0.88908358 0.88845264 0.8864141  0.88704879]|16:30:01 16:53:05 [0.89027066 0.88967268 0.88757504 0.88795599]|

[32m[I 2021-03-28 16:53:07,461][0m Finished trial#7 resulted in value: 0.8888685943485765. Current best value is 0.8889100681390897 with parameters: {'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}.[0m


16:53:07 

[32m[I 2021-03-28 17:01:21,917][0m Finished trial#8 resulted in value: 0.8782103188250473. Current best value is 0.8889100681390897 with parameters: {'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}.[0m


17:01:21 [0.87988279 0.87849408 0.877016   0.87744841]|17:01:21 

[32m[I 2021-03-28 17:09:19,070][0m Finished trial#9 resulted in value: 0.885837052365366. Current best value is 0.8889100681390897 with parameters: {'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}.[0m


17:09:18 [0.88732976 0.88621492 0.88451041 0.88529312]|CPU times: user 4.06 s, sys: 2.64 s, total: 6.7 s
Wall time: 3h 15min 20s


In [47]:
p_RF_Label_no_over = study.best_trial.params

In [49]:
with open('p_RF_Label_no_over.txt', 'w') as t:
    t.write(str(p_RF_Label_no_over))  

In [50]:
p_RF_Label_no_over

{'criterion': 'entropy',
 'max_depth': 33,
 'min_samples_split': 10,
 'min_samples_leaf': 10,
 'max_features': 'sqrt',
 'bootstrap': False}

In [52]:
scores = []
n = 1
def objective(trial, data=ds_transf.drop(['id', 'target'], axis=1), target=ds_transf.target):
    print(time.strftime('%X'), end=' ')
    params = {
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'max_depth': trial.suggest_int('max_depth', 30, 60),
        'min_samples_split': 10,
        'min_samples_leaf': 10,
        'max_features': 'sqrt',
        'bootstrap': False,
        
    }
    
    model = RandomForestClassifier(**params, n_estimators=300, n_jobs=-1, random_state=33)
    score = cross_val_score(estimator=model, X=data, y=target, scoring='roc_auc', cv=3, n_jobs=-1)
    scores.append(score)
    print(time.strftime('%X'), score, end='|')
    #n += 1
    return np.mean(score)

In [None]:
%%time
study = optuna.create_study(direction='maximize', study_name='RF')
study.optimize(objective, n_trials=10)

17:36:19 

In [51]:
model = RandomForestClassifier(n_estimators=152, random_state=36, n_jobs=-1)

In [52]:
model

RandomForestClassifier(n_estimators=152, n_jobs=-1, random_state=36)

In [55]:
p = model.set_params

In [56]:
p

<bound method BaseEstimator.set_params of RandomForestClassifier(n_estimators=152, n_jobs=-1, random_state=36)>

In [57]:
with open('params.txt', 'w') as t:
    t.write(str(p))

In [161]:
score = cross_val_score(estimator=model, X=ds_transf.drop(['id', 'target'], axis=1), y=ds_transf.target, cv=3)

In [162]:
score

array([0.89588114, 0.89821378, 0.89761092])

In [70]:
ds.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
       'cat16', 'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4',
       'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'target'],
      dtype='object')