In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from utils.storage import get_storage
import optuna

In [2]:
# Load application_train.csv from the local data directory into a DataFrame.
application_train_df = pd.read_csv("./data/application_train.csv")

In [3]:
# Load application_test.csv from the local data directory into a DataFrame.
application_test_df = pd.read_csv("./data/application_test.csv")

In [4]:
# Load bureau.csv from the local data directory into a DataFrame.
bureau_df = pd.read_csv("./data/bureau.csv")

In [5]:
# Load bureau_balance.csv from the local data directory into a DataFrame.
bureau_balance_df = pd.read_csv("./data/bureau_balance.csv")

In [6]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
application_train_df.info()
# Summary statistics of the numeric columns.
print(application_train_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None
          SK_ID_CURR         TARGET   CNT_CHILDREN  AMT_INCOME_TOTAL  \
count  307511.000000  307511.000000  307511.000000      3.075110e+05   
mean   278180.518577       0.080729       0.417052      1.687979e+05   
std    102790.175348       0.272419       0.722121      2.371231e+05   
min    100002.000000       0.000000       0.000000      2.565000e+04   
25%    189145.500000       0.000000       0.000000      1.125000e+05   
50%    278202.000000       0.000000       0.000000      1.471500e+05   
75%    367142.500000       0.000000       1.000000      2.025000e+05   
max    456255.000000       1.000000      19.000000      1.170000e+08   

         AMT_CREDIT    AMT_ANNUITY  AMT_GOODS_PRICE  \
count  3.075110e+05  307499.000000     3.072330e+05   
mean   5.990260e+05   27108.

In [7]:
# Mean MONTHS_BALANCE for each SK_ID_BUREAU.
# The string alias 'mean' requests the same groupby mean that np.mean resolved
# to, without relying on the deprecated NumPy-callable form of `aggfunc`.
bureau_balance_pivot_mean_df = bureau_balance_df.pivot_table(
    index='SK_ID_BUREAU',
    values='MONTHS_BALANCE',
    aggfunc='mean',
    fill_value=0,
)
# Row counts per STATUS value for each SK_ID_BUREAU (one column per STATUS);
# combinations that never occur are filled with 0.
bureau_balance_pivot_len_df = (
    bureau_balance_df
    .drop(columns='MONTHS_BALANCE')
    .pivot_table(index='SK_ID_BUREAU', columns='STATUS', aggfunc=len, fill_value=0)
)
# Side-by-side combination of both aggregates, aligned on SK_ID_BUREAU.
bureau_balance_pivot_df = pd.concat(
    [bureau_balance_pivot_mean_df, bureau_balance_pivot_len_df],
    axis='columns',
)

In [8]:
# Attach the per-SK_ID_BUREAU balance aggregates to every bureau record.
bureau_mered_df = bureau_df.merge(bureau_balance_pivot_df, how='left', on='SK_ID_BUREAU')

# Columns to average per SK_ID_CURR: the grouping key, the listed bureau
# fields, and every column contributed by bureau_balance_pivot_df.
bureau_pivot_mean_df_columns = [
    'SK_ID_CURR',
    'DAYS_CREDIT',
    'CREDIT_DAY_OVERDUE',
    'DAYS_CREDIT_ENDDATE',
    'DAYS_ENDDATE_FACT',
    'AMT_CREDIT_MAX_OVERDUE',
    'CNT_CREDIT_PROLONG',
    'AMT_CREDIT_SUM',
    'AMT_CREDIT_SUM_DEBT',
    'AMT_CREDIT_SUM_LIMIT',
    'AMT_CREDIT_SUM_OVERDUE',
    'DAYS_CREDIT_UPDATE',
    'AMT_ANNUITY',
] + bureau_balance_pivot_df.columns.tolist()

# The string alias 'mean' replaces the deprecated NumPy-callable np.mean and
# selects the same groupby mean.
bureau_pivot_mean_df = bureau_mered_df[bureau_pivot_mean_df_columns].pivot_table(
    index='SK_ID_CURR',
    aggfunc='mean',
    fill_value=0,
)
# Number of bureau records per CREDIT_ACTIVE value for each SK_ID_CURR.
bureau_pivot_CREDIT_ACTIVE_df = bureau_mered_df[['SK_ID_CURR', 'CREDIT_ACTIVE']].pivot_table(
    index='SK_ID_CURR',
    columns='CREDIT_ACTIVE',
    aggfunc=len,
    fill_value=0,
)
# Number of bureau records per CREDIT_TYPE value for each SK_ID_CURR.
bureau_pivot_CREDIT_TYPE_df = bureau_mered_df[['SK_ID_CURR', 'CREDIT_TYPE']].pivot_table(
    index='SK_ID_CURR',
    columns='CREDIT_TYPE',
    aggfunc=len,
    fill_value=0,
)
# One wide frame indexed by SK_ID_CURR holding all three aggregates.
bureau_pivot_df = pd.concat(
    [bureau_pivot_mean_df, bureau_pivot_CREDIT_ACTIVE_df, bureau_pivot_CREDIT_TYPE_df],
    axis='columns',
)
bureau_pivot_df

Unnamed: 0_level_0,0,1,2,3,4,5,AMT_ANNUITY,AMT_CREDIT_MAX_OVERDUE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,...,Interbank credit,Loan for business development,Loan for purchase of shares (margin lending),Loan for the purchase of equipment,Loan for working capital replenishment,Microloan,Mobile operator loan,Mortgage,Real estate loan,Unknown type of loan
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,4.428571,0.142857,0.0,0.0,0.0,0.0,3545.357143,0.000,2.076236e+05,85240.928571,...,0,0,0,0,0,0,0,0,0,0
100002,5.625000,3.375000,0.0,0.0,0.0,0.0,0.000000,1681.029,1.081319e+05,49156.200000,...,0,0,0,0,0,0,0,0,0,0
100003,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000,2.543501e+05,0.000000,...,0,0,0,0,0,0,0,0,0,0
100004,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000,9.451890e+04,0.000000,...,0,0,0,0,0,0,0,0,0,0
100005,4.666667,0.000000,0.0,0.0,0.0,0.0,1420.500000,0.000,2.190420e+05,189469.500000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456249,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,6147.000,2.841430e+05,16307.100000,...,0,0,0,0,0,0,0,0,0,0
456250,4.000000,0.000000,0.0,0.0,0.0,0.0,154567.965000,0.000,1.028820e+06,744013.365000,...,0,0,0,0,0,0,0,0,0,0
456253,11.750000,0.000000,0.0,0.0,0.0,0.0,58369.500000,0.000,9.900000e+05,448958.250000,...,0,0,0,0,0,0,0,0,0,0
456254,8.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000,4.500000e+04,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Target vector, taken before the TARGET column is dropped from the features.
y = application_train_df['TARGET']
# Left-join the per-applicant bureau aggregates, then remove the identifier
# and the target so that only feature columns remain.
train_df = (
    application_train_df
    .merge(bureau_pivot_df, how='left', on='SK_ID_CURR')
    .drop(columns=['SK_ID_CURR', 'TARGET'])
)

In [10]:
def process_train_df(i_df):
    """Return an encoded, NaN-free copy of ``i_df``; the input is not modified.

    Categorical columns (object or pandas string dtype):
      * missing values are replaced with the literal ``'other'``;
      * columns with fewer than 20 distinct values are one-hot encoded via
        ``pd.get_dummies`` (prefix = column name); the source column is
        removed and the dummy columns are appended at the right-hand side,
        in the order the source columns appeared;
      * all other categorical columns are replaced in place by
        ``LabelEncoder`` integer codes.

    Every other column has its missing values replaced with ``0``.

    Args:
        i_df: input ``pandas.DataFrame``.

    Returns:
        A new ``pandas.DataFrame`` with the transformations above applied.
    """
    train_df = i_df.copy()
    one_hot_frames = []   # dummy frames, in source-column order
    one_hot_columns = []  # source columns that the dummy frames replace

    for column in train_df.columns:
        series = train_df[column]
        # Accept pandas string dtypes as well as object so text columns are
        # never routed to the numeric fillna(0) branch.
        is_categorical = (
            pd.api.types.is_object_dtype(series.dtype)
            or pd.api.types.is_string_dtype(series.dtype)
        )
        if is_categorical:
            if series.isnull().any():
                series = series.fillna('other')
            if len(series.unique()) < 20:
                one_hot_frames.append(pd.get_dummies(series, prefix=column))
                one_hot_columns.append(column)
            else:
                train_df[column] = LabelEncoder().fit_transform(series)
        elif series.isnull().any():
            train_df[column] = series.fillna(0)

    # Swap all one-hot source columns for their dummies with a single concat
    # instead of copying the whole frame once per encoded column.
    if one_hot_frames:
        train_df = pd.concat(
            [train_df.drop(columns=one_hot_columns), *one_hot_frames],
            axis='columns',
        )
    return train_df

In [11]:
# Build the model input: categorical columns encoded, missing values filled.
X = process_train_df(train_df)

In [12]:
# Fit an XGBoost classifier (defaults apart from the seed) purely to obtain
# per-feature importances.
# NOTE(review): the model is fit on every row of X, i.e. before the
# train/test split performed further down in the notebook.
test_model = XGBClassifier(random_state=1234)
test_model.fit(X, y)
# One row per feature, indexed by column name.
feature_importance = pd.DataFrame(
    {"importance": test_model.feature_importances_},
    index=X.columns,
)
# feature_importance.sort_values("importance", ascending=False).plot(kind="bar", figsize=(100, 70))





In [13]:
# Feature selection: keep the first 190 rows after sorting by importance,
# highest first.
important_feature = (
    feature_importance
    .sort_values("importance", ascending=False)
    .iloc[:190]
)
print(len(important_feature))
print(important_feature)

190
                                      importance
EXT_SOURCE_3                            0.032779
NAME_INCOME_TYPE_Pensioner              0.031337
CODE_GENDER_M                           0.025896
NAME_EDUCATION_TYPE_Higher education    0.024196
EXT_SOURCE_2                            0.024156
...                                          ...
FLAG_DOCUMENT_15                        0.001428
HOUSETYPE_MODE_terraced house           0.001368
FLOORSMIN_MODE                          0.001360
OCCUPATION_TYPE_Secretaries             0.000000
FLOORSMAX_MODE                          0.000000

[190 rows x 1 columns]


In [19]:
# Restrict X to the selected feature columns, ordered as in important_feature.
X = X.loc[:, important_feature.index.tolist()]
#print(type(important_feature.index.tolist()))

In [20]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [21]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of a sampled classifier.

    Samples the model family (random forest or XGBoost), ``max_depth`` and
    ``n_estimators`` from ``trial``, builds the corresponding classifier and
    scores it on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: object providing ``suggest_categorical`` and ``suggest_int``.

    Returns:
        Mean of the ``roc_auc`` scores returned by ``cross_val_score``.
    """
    # The parameter names given to trial.suggest_* are left exactly as they were.
    # Model family: RandomForestClassifier or XGBClassifier.
    classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'XGBoost'])
    # Tree depth, searched from 2 to 16.
    max_depth = trial.suggest_int('max_depth', 2, 16)
    # Number of trees / boosting rounds, searched from 50 to 4000.
    n_estimators = trial.suggest_int('n_estimators', 50, 4000)

    if classifier_name == 'RandomForest':
        # n_jobs=-1 builds the trees on all available cores; random_state
        # still fixes which forest is built.
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=1234,
            n_jobs=-1,
        )
    else:
        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            objective='binary:logistic',
            random_state=1234,
        )

    # These are ROC AUC scores (higher is better), not errors.
    auc_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')

    return auc_scores.mean()  # Objective value recorded for this trial.

In [None]:
# Create the study in the storage returned by get_storage(), or reuse the one
# already stored under this name (load_if_exists=True). Higher objective
# values are better.
study = optuna.create_study(
    direction='maximize',
    study_name='home_credit_default_risk',
    storage=get_storage(),
    load_if_exists=True,
)
# Run 50 trials of the objective function.
study.optimize(objective, n_trials=50)

[32m[I 2021-03-18 00:29:43,580][0m A new study created in RDB with name: home_credit_default_risk[0m
