In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from utils.storage import get_storage
import optuna

In [2]:
# Read application_train.csv into a DataFrame; the path is relative to the
# directory the notebook is run from.
application_train_df = pd.read_csv('./data/application_train.csv')

In [3]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
application_train_df.info()
# describe() returns a DataFrame, so it still needs print() to show up mid-cell.
print(application_train_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None
          SK_ID_CURR         TARGET   CNT_CHILDREN  AMT_INCOME_TOTAL  \
count  307511.000000  307511.000000  307511.000000      3.075110e+05   
mean   278180.518577       0.080729       0.417052      1.687979e+05   
std    102790.175348       0.272419       0.722121      2.371231e+05   
min    100002.000000       0.000000       0.000000      2.565000e+04   
25%    189145.500000       0.000000       0.000000      1.125000e+05   
50%    278202.000000       0.000000       0.000000      1.471500e+05   
75%    367142.500000       0.000000       1.000000      2.025000e+05   
max    456255.000000       1.000000      19.000000      1.170000e+08   

         AMT_CREDIT    AMT_ANNUITY  AMT_GOODS_PRICE  \
count  3.075110e+05  307499.000000     3.072330e+05   
mean   5.990260e+05   27108.

In [4]:
# Separate the label column from the features; SK_ID_CURR is dropped together
# with TARGET so neither is passed to the models as a feature.
y = application_train_df['TARGET']
train_df = application_train_df.drop(columns=['SK_ID_CURR', 'TARGET'])

In [5]:
def process_train_df(i_df):
    """Return a copy of ``i_df`` with text columns encoded and missing values filled.

    Per column:

    * object / string dtype: missing values become the literal category
      ``'other'``. Columns with fewer than 20 distinct values are one-hot
      encoded (new columns named ``<column>_<value>``, appended after the
      remaining columns); columns with 20 or more distinct values are
      replaced in place by ``LabelEncoder`` integer codes.
    * any other dtype: missing values are replaced by ``0``.

    The input frame is not modified, and the output has exactly the same rows
    and index as the input.

    Args:
        i_df: pandas DataFrame of raw features.

    Returns:
        A new pandas DataFrame with the transformations above applied.
    """
    train_df = i_df.copy()
    one_hot_columns = []

    for column in train_df.columns:
        series = train_df[column]
        # Treat pandas string dtypes like object columns so text is never sent
        # down the numeric fillna(0) path.
        is_text = (
            pd.api.types.is_object_dtype(series.dtype)
            or pd.api.types.is_string_dtype(series.dtype)
        )

        if not is_text:
            if series.isnull().any():
                train_df[column] = series.fillna(0)
            continue

        if series.isnull().any():
            series = series.fillna('other')
            train_df[column] = series

        if series.nunique(dropna=False) < 20:
            # Defer one-hot encoding: all low-cardinality columns are encoded
            # in a single pass after the loop.
            one_hot_columns.append(column)
        else:
            train_df[column] = LabelEncoder().fit_transform(series)

    if one_hot_columns:
        # A single get_dummies call replaces the per-column drop + join. It
        # keeps the same column order and names, but does not align on index
        # labels, so a non-unique index no longer multiplies rows, and the
        # frame is rebuilt once rather than once per categorical column.
        train_df = pd.get_dummies(train_df, columns=one_hot_columns)

    return train_df

In [6]:
# Build the feature matrix: process_train_df encodes the object columns and
# fills missing values on a copy, leaving train_df itself unchanged.
X = process_train_df(train_df)

In [7]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [8]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of a sampled classifier.

    Samples, in this order, the model family (``'RandomForest'`` or
    ``'XGBoost'``), ``max_depth`` (2 to 16) and ``n_estimators`` (50 to 4000)
    from ``trial``, builds the matching classifier with a fixed
    ``random_state``, and scores it with ``cross_val_score`` on the
    notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_int``
            (an Optuna trial).

    Returns:
        The mean of the per-fold ROC AUC scores.
    """
    classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'XGBoost'])

    # Hyperparameters shared by both model families; dict literals evaluate
    # top to bottom, so max_depth is still sampled before n_estimators.
    shared_params = {
        'max_depth': trial.suggest_int('max_depth', 2, 16),
        'n_estimators': trial.suggest_int('n_estimators', 50, 4000),
        'random_state': 1234,
    }

    if classifier_name != 'RandomForest':
        model = XGBClassifier(objective='binary:logistic', **shared_params)
    else:
        model = RandomForestClassifier(**shared_params)

    # These are ROC AUC scores (higher is better), not errors.
    auc_per_fold = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')
    return auc_per_fold.mean()

In [10]:
# Open the study in the storage returned by get_storage(). With
# load_if_exists=True, a study that already exists under this name is loaded
# and resumed rather than created anew.
study = optuna.create_study(
    study_name='home_credit_default_risk2',
    storage=get_storage(),
    direction='maximize',
    load_if_exists=True,
)
# Run 50 trials of the objective; the study maximises its return value.
study.optimize(objective, n_trials=50)

[32m[I 2021-03-13 23:07:58,215][0m A new study created in RDB with name: home_credit_default_risk[0m












[32m[I 2021-03-13 23:35:02,131][0m Trial 0 finished with value: 0.7033132454791557 and parameters: {'classifier': 'XGBoost', 'max_depth': 6, 'n_estimators': 2120}. Best is trial 1 with value: 0.723404.[0m
[32m[I 2021-03-13 23:40:42,238][0m Trial 3 finished with value: 0.7053502163971025 and parameters: {'classifier': 'RandomForest', 'max_depth': 3, 'n_estimators': 2176}. Best is trial 2 with value: 0.737007.[0m


KeyboardInterrupt: 

In [None]:
# Show the study's best trial as the cell output.
study.best_trial