In [1]:
import pandas as pd

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

import cleaning_functions as clean

In [2]:
# Load the training set from the CSV that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer='Code_challenge_train.csv')

In [3]:
# Run the project's cleaning routine (cleaning_functions.cleaning) over the loaded frame.
# NOTE(review): this rebinds `data` to its own cleaned version, so re-running the cell
# feeds already-cleaned data back into cleaning() — safe only if that helper is
# idempotent; confirm against cleaning_functions.
data = clean.cleaning(data)

In [4]:
# Sanity check after cleaning: count missing values per column (the single
# resulting column is labelled 0 by to_frame()) ...
nulls = data.isnull().sum().to_frame()
# ... and display only the columns that still contain at least one null.
nulls[nulls[0] != 0]

Unnamed: 0,0


In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column 'y' from the predictor columns.
y = data['y']
X = data.drop(columns='y')

# Hold-out split, stratified on the target so both partitions keep the same
# class balance; the fixed random_state makes the split repeatable.
# The test fraction is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=72019,
)

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [8]:
# Two-step estimator: standardise the features ('ss'), then fit an XGBoost
# classifier ('xg') with its default hyperparameters. The step names are the
# prefixes used later when addressing parameters as '<step>__<param>'.
ss_xg = Pipeline(
    steps=[
        ('ss', StandardScaler()),
        ('xg', XGBClassifier()),
    ]
)

In [9]:
# Baseline: fit the untuned scaler + XGBoost pipeline on the training partition.
ss_xg.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xg', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
    ...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

In [11]:
# Baseline score of the untuned pipeline on the held-out partition.
ss_xg.score(X=X_test, y=y_test)

0.9041

In [15]:
# Search space for the grid search. Every key is prefixed with 'xg__' so the
# value is routed to the pipeline step named 'xg' (the XGBoost classifier).
# 3 x 3 x 2 x 2 = 36 candidate combinations.
pipe_params = dict(
    xg__n_estimators=[100, 50, 114],
    xg__max_depth=[3, 1, 5],
    xg__learning_rate=[0.1, 0.5],
    xg__reg_alpha=[0, 0.3],
)

In [16]:
# Exhaustive search over pipe_params, scoring each candidate with 5-fold
# cross-validation on whatever data is later passed to fit().
gs = GridSearchCV(
    ss_xg,
    pipe_params,
    cv=5,
)

In [17]:
# Run the grid search on the training partition only; the held-out
# partition is not touched during tuning.
gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xg', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
    ...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'xg__n_estimators': [100, 50, 114], 'xg__max_depth': [3, 1, 5], 'xg__learning_rate': [0.1, 0.5], 'xg__reg_alpha': [0, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
# Parameter combination selected by the grid search.
# NOTE(review): in the recorded output every chosen value is the largest one
# offered in the grid (learning_rate 0.5, max_depth 5, n_estimators 114,
# reg_alpha 0.3), so the optimum may lie outside the searched range —
# consider widening the grid.
gs.best_params_

{'xg__learning_rate': 0.5,
 'xg__max_depth': 5,
 'xg__n_estimators': 114,
 'xg__reg_alpha': 0.3}

In [20]:
# Score of the tuned search object on the held-out partition, for comparison
# with the untuned baseline pipeline.
gs.score(X=X_test, y=y_test)

0.97

***

In [21]:
# Persist the fitted grid-search object to disk so it can be reloaded later
# without re-running the search.
import pickle
filename = 'model_2_xgboost.sav'
# A context manager guarantees the handle is flushed and closed even if
# pickling fails; the previous bare open() call never closed the file.
with open(filename, 'wb') as model_file:
    pickle.dump(gs, model_file)