## Machine learning flow

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("application_train.csv")

In [3]:
data = data.select_dtypes('number')

In [4]:
data = data.replace(np.nan,data.mean()) 

In [5]:
data.isnull().sum()

SK_ID_CURR                    0
TARGET                        0
CNT_CHILDREN                  0
AMT_INCOME_TOTAL              0
AMT_CREDIT                    0
                             ..
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_QRT     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
Length: 106, dtype: int64

In [6]:
Y = data['TARGET']
X = data.drop(['TARGET'], axis=1).to_numpy()

In [7]:
from sklearn.model_selection import KFold

In [8]:
kf = KFold(n_splits=2)

In [9]:
kf.get_n_splits(X)

2

In [10]:
 for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]


TRAIN: [153756 153757 153758 ... 307508 307509 307510] TEST: [     0      1      2 ... 153753 153754 153755]
TRAIN: [     0      1      2 ... 153753 153754 153755] TEST: [153756 153757 153758 ... 307508 307509 307510]


## Grid search

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [17]:
# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
parameters = {'n_estimators':[1,3,5,7,10]}

In [14]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_news = scaler.transform(X_train)
X_test_news = scaler.transform(X_test)

In [18]:
# svc = svm.SVC()
rfc = RandomForestClassifier()

In [19]:
clf = GridSearchCV(rfc, parameters)

In [20]:
clf.fit(X_train_news, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [1, 3, 5, 7, 10]})

In [23]:
print("best score: {}, best para: {}".format(clf.best_score_,clf.best_params_))

best score: 0.9030801152258154, best para: {'n_estimators': 10}


## Survey from Kaggle Notebooks

* Grid search
* Gradient Boosting Machine
* Early Stopping


## Creating a model with high generalization performance

In [27]:
import xgboost as xgb

In [28]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, eval_metric="auc")

In [29]:
xgb_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)])



[0]	validation_0-auc:0.71150
[1]	validation_0-auc:0.71704
[2]	validation_0-auc:0.72097
[3]	validation_0-auc:0.72241
[4]	validation_0-auc:0.72389
[5]	validation_0-auc:0.72397
[6]	validation_0-auc:0.72605
[7]	validation_0-auc:0.72723
[8]	validation_0-auc:0.72991
[9]	validation_0-auc:0.73207
[10]	validation_0-auc:0.73274
[11]	validation_0-auc:0.73465
[12]	validation_0-auc:0.73642
[13]	validation_0-auc:0.73656
[14]	validation_0-auc:0.73903
[15]	validation_0-auc:0.73888
[16]	validation_0-auc:0.73948
[17]	validation_0-auc:0.73988
[18]	validation_0-auc:0.74003
[19]	validation_0-auc:0.74000
[20]	validation_0-auc:0.74054
[21]	validation_0-auc:0.74040
[22]	validation_0-auc:0.74034
[23]	validation_0-auc:0.74072
[24]	validation_0-auc:0.74057
[25]	validation_0-auc:0.74061
[26]	validation_0-auc:0.74128
[27]	validation_0-auc:0.74074
[28]	validation_0-auc:0.74074
[29]	validation_0-auc:0.73763
[30]	validation_0-auc:0.73813


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [30]:
from sklearn.metrics import auc, accuracy_score
y_pred = xgb_model.predict(X_test)

accuracy_score(y_test, y_pred)

0.9201066631979448

* I loaded dataset
* I splited it using kfold
* I used gridsearch to find model with best parameters
* I used xgboost to test the accuracy of model which is quite high


##  Final model selection

In [34]:
test_df = pd.read_csv('application_test.csv')
test_df = test_df.select_dtypes('number')
test_df = test_df.replace(np.nan,test_df.mean()) 
scaler = preprocessing.MinMaxScaler()
test_df_news =scaler.fit_transform(test_df)

test_reg_pred = clf.predict(test_df_news)

kgl_submission = pd.concat([test_df['SK_ID_CURR'], pd.Series(test_reg_pred, name='TARGET')], axis=1)
kgl_submission.to_csv('submission.csv', index=False)

The score is 0.5