# imports

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics
#import scikitplot as skplt
pd.options.mode.chained_assignment = None  # default='warn'

# helper functions

In [59]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    sns.set()
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


In [60]:
def precision_at_k( data, k=2000 ):
    # reset index
    data = data.reset_index( drop=True )

    # create ranking order
    data['ranking'] = data.index + 1 

    data['precision_at_k'] = data['response'].cumsum() / data['ranking']

    return data.loc[k, 'precision_at_k']

In [61]:
def recall_at_k( data, k=2000 ):
    # reset index
    data = data.reset_index( drop=True )

    # create ranking order
    data['ranking'] = data.index + 1 

    data['recall_at_k'] = data['response'].cumsum() / data['response'].sum()

    return data.loc[k, 'recall_at_k']

In [62]:
def performace(model_name,data,y_val,yhat_class,k):
    import sklearn.metrics
    prec = sklearn.metrics.precision_score(y_val,yhat_class)
    recal = sklearn.metrics.recall_score(y_val,yhat_class)
    recal_k = recall_at_k(data,k)
    prec_k = precision_at_k(data,k)
    
    return pd.DataFrame({'Model Name':model_name,
                        'Precison':prec,
                        'Recall':recal,
                        'k':k,
                        'Precison at k':prec_k,
                        'Recall at k':recal_k},index=[0])

In [63]:
def performace_cross_val(data,target,model,model_name,round_n=3,splits=4,shuffle_n=True,random=42,k=2000):
    
    import sklearn.model_selection as ms
    import sklearn.metrics
    import numpy as np
    skf = ms.StratifiedKFold(n_splits=splits,shuffle=shuffle_n,random_state=random)
    X = data
    y = X[target]
    X = X.drop(columns=[target,'id'])
    precision = []
    recall = []
    prec_k = []
    rec_k = []
    for train_index, test_index in skf.split(X,y):
        #train the model 
        model.fit(X.iloc[train_index],y.iloc[train_index])

        #predict the classification 
        yhat_class = model.predict( X.iloc[test_index] )

        #predict the probabilit
        yhat_proba = model.predict_proba(X.iloc[test_index])

        # precision and recall
        prec = sklearn.metrics.precision_score(y.iloc[test_index],yhat_class)
        rec = sklearn.metrics.recall_score(y.iloc[test_index],yhat_class)
        precision.append(prec)
        recall.append(rec)

        # sort the test dataframe by the probabiliti score of the model
        aux = X.iloc[test_index]
        aux[target] = y.iloc[test_index]
        aux['score'] = yhat_proba[:,1].tolist()

        # sorte by score
        aux = aux.sort_values('score',ascending=False)

        # precision and recall at k
        prec_k.append(precision_at_k(aux,k))
        rec_k.append(recall_at_k(aux,k))

    # return a dataset with the metrics    
    return pd.DataFrame({'Model name': model_name + " Cross_Val",
                                'PRECISION CROSS_VAL': np.round( np.mean( precision ), round_n ),
                                'PRECISON STD': np.round( np.std( precision ), round_n ),
                                'RECALL CROSS_VAL': np.round( np.mean( recall ), round_n ),
                                'RECALL STD': np.round( np.std( recall ), round_n ),
                                'K': k,
                                'PRECISION AT K CROSS_VAL': np.round( np.mean( prec_k ), round_n ),
                                'PRECISION AT K STD': np.round( np.std( prec_k ), round_n ),
                                'RECALL AT K CROSS_VAL': np.round( np.mean( rec_k), round_n ),
                                'RECALL AT K STD': np.round( np.std( rec_k )) },index=[0]) 

# Load data

In [64]:
df6 = df6 = pd.read_csv('data/df5.csv')

In [65]:
df6.head()

Unnamed: 0,id,age,region_code,policy_sales_channel,previously_insured,vintage,vehicle_age,vehicle_damage,annual_premium,response
0,1,0.369231,28.0,26.0,0,0.716263,3,1,0.574539,1
1,2,0.861538,3.0,26.0,0,0.598616,2,0,0.172636,0
2,3,0.415385,28.0,26.0,0,0.058824,3,1,0.449053,1
3,4,0.015385,11.0,152.0,1,0.66782,1,0,-0.113018,0
4,5,0.138462,41.0,152.0,1,0.100346,1,0,-0.178259,0


In [66]:
df6.dtypes

id                        int64
age                     float64
region_code             float64
policy_sales_channel    float64
previously_insured        int64
vintage                 float64
vehicle_age               int64
vehicle_damage            int64
annual_premium          float64
response                  int64
dtype: object

In [67]:
# train, val = train_test_split(df6,test_size=0.3,stratify=df6['response'],random_state=42)

# # train dataframes
# X_train = train.copy()
# y_train = X_train['response']
# X_train.drop(columns = ['response','id'],inplace=True)
# # validation dataframes
# x_val = val.copy()
# y_val = val['response']
# x_val.drop(columns = ['response','id'],inplace=True)

# Fine tuning

In [68]:
from hyperopt import tpe, Trials, hp, fmin, STATUS_OK

import warnings
warnings.filterwarnings('ignore')

In [69]:
# xgb_model = xgb.XGBClassifier(objective='binary:logistic',
#                              n_estimators=300,
#                              eta=0.01,
#                              max_depth=10,
#                              subsample=0.7,
#                              colsample_bytree=0.9)
# score = performace_cross_val(df6,'response',xgb_model,'XGB_MODEL',round_n=5,splits=4,k=20000)

In [70]:
# score

In [71]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.choice('n_estimators',np.arange(300,1000+1,100)),
        'seed': 42
    }


In [72]:
def objective_function(space):
    xgb_model = xgb.XGBClassifier(n_estimators =int(space['n_estimators']),
                                  max_depth = int(space['max_depth']), 
                                  gamma = space['gamma'],
                                  reg_alpha = int(space['reg_alpha']),
                                  min_child_weight=int(space['min_child_weight']),
                                  colsample_bytree=int(space['colsample_bytree']))      ii
    score = performace_cross_val(df6,'response',xgb_model,'XGB_MODEL',round_n=5,splits=6,k=20000)
    print(space)
    print(score['RECALL AT K CROSS_VAL'][0])
    return {'loss': -score['RECALL AT K CROSS_VAL'][0],'status':STATUS_OK}

In [73]:
tpe_algorithm = tpe.suggest
trials = Trials()
num_eval = 25

In [74]:
best_paramns = fmin(fn=objective_function,space=space,algo=tpe_algorithm,max_evals=num_eval,
                    trials=trials)

{'colsample_bytree': 0.8771031844161058, 'gamma': 7.127817745361825, 'max_depth': 17.0, 'min_child_weight': 7.0, 'n_estimators': 400, 'reg_alpha': 107.0, 'reg_lambda': 0.07600040628206517, 'seed': 42}
0.80529                                               
{'colsample_bytree': 0.6607655933753276, 'gamma': 3.400294666308536, 'max_depth': 6.0, 'min_child_weight': 5.0, 'n_estimators': 600, 'reg_alpha': 51.0, 'reg_lambda': 0.013881737677217454, 'seed': 42}
0.8082                                                                   
  8%|▊         | 2/25 [13:18<2:32:57, 399.03s/trial, best loss: -0.8082] 


KeyboardInterrupt: 

In [None]:
best_paramns

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3161d838-98a4-47ed-ae81-127ad2068af4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>