#### Import packages

In [50]:
from typing import List
from datetime import date
from production_training import ProductionTraining
import joblib
import pickle
import json
from training_and_evluation import TrainingAndEvaluation
import numpy as np 
import os
import pandas as pd 
from warnings import simplefilter
import inspect

# Suppress FutureWarning messages so they do not clutter the cell outputs.
simplefilter(action='ignore', category=FutureWarning)

# Project helpers used throughout the notebook:
#   tae - provides train_test_split for the hold-out split
#   pt  - reads the config, runs the grid search and evaluates predictions
tae = TrainingAndEvaluation()
pt = ProductionTraining()

### 1) Read configs 

In [51]:
# Location of the JSON production configuration (rewritten at the end of this notebook).
config_path = './production/configurations/production_configuration.txt'
# Parse the JSON into an object whose settings are read as attributes (e.g. configs.grid_search).
configs = pt.read_json_to_class(config_path)
print(f"Converting configs to class object for easy referral\n\n{configs}")

Converting configs to class object for easy referral

X(model_params=X(learning_rate=0.1, max_features='sqrt', subsample=0.8, random_state=10, model__loss='exponential', model__max_depth=10, model__max_features='auto', model__min_samples_leaf=20, model__min_samples_split=40, model__n_estimators=400), model_path='./production/trained_models/model_2021-04-23.pkl', grid_search=X(last_model_path='./production/grid_search_models/gread_search_2021-04-24.pkl', grid_params=X(n_estimators=[400, 500], max_depth=[12, 14], min_samples_split=[40, 30], min_samples_leaf=[20, 10], max_features=['sqrt'], loss=['exponential', 'deviance']), fixed_params=X(learning_rate=0.1, max_features='sqrt', subsample=0.8, random_state=10)), sanity=X(sanity_check_path='./production/sanity_check/sampels.txt'))


### 2) Load data

In [52]:
# The features which gave the best results.
lasso_features = [
    'has_burglar_alarm',
    'state',
    'previous_policies',
    'card_type',
    'portable_electronics',
    'square_ft',
    'product',
]

In [53]:
# Names of the candidate training CSVs under ./data; only `a` is loaded below.
a = 'train_resampled_previous_policies'
b = 'train_resampled_k_means_shuffled'
c = 'df_resampled'

# Resampled frame used for fitting, and the pre-encoding frame used for the hold-out split.
df_not_encoded = pd.read_csv(f'./data/{a}.csv', index_col=[0])
df_not_encoded_raw = pd.read_csv('./data/df_before_encoding.csv', index_col=[0])

# Identifier-like columns are removed from both frames (columns that are absent are ignored).
identifier_cols = ['id', 'user_id', 'postal_code']
df_not_encoded = df_not_encoded.drop(columns=identifier_cols, errors='ignore')
df_not_encoded_raw = df_not_encoded_raw.drop(columns=identifier_cols, errors='ignore')

# Only the test part of the split of the raw frame is kept; the train part is discarded
# because training uses the resampled frame instead.
# NOTE(review): confirm the resampled CSV was built only from the train part of this
# exact split (test_size=0.15, random_state=20); otherwise test rows leak into training.
_, X_test, _, y_test = tae.train_test_split(
    df=df_not_encoded_raw,
    labeled_col_name='label',
    test_size=0.15,
    random_state=20,
)

# Split the resampled frame into feature matrix and target vector.
X_train = df_not_encoded.drop(columns='label')
y_train = df_not_encoded['label']


### 3) Grid Search

###### 3.1) Params

In [54]:
# Map columns to types : Categorical, Numeric, Boolean
# Look up the dtype of every selected feature once, then bucket the features by type.
selected_features = df_not_encoded_raw[lasso_features]
cols = selected_features.columns
dtype_by_col = selected_features.dtypes

categorical_cols = [name for name, kind in dtype_by_col.items() if kind == 'object']
numerical_cols = [name for name, kind in dtype_by_col.items()
                  if kind == 'int64' or kind == 'float64']
bool_cols = [name for name, kind in dtype_by_col.items()
             if kind == 'bool' and name != 'label']

# Grid search params: every key gets the 'model__' prefix.
gs_params = {
    f'model__{key}': value
    for key, value in vars(configs.grid_search.grid_params).items()
}

# Fixed params: a plain copy, so later edits do not touch the configs object.
fixed_params = dict(vars(configs.grid_search.fixed_params))


In [55]:
# Print the three column groups in one call; sep='\n' reproduces the newline that
# each separate print() would have emitted.
column_group_report = (
    f'bool_cols\n\n {bool_cols}',
    f'\n\ncategorical_cols\n\n {categorical_cols}',
    f'\n\nnumerical_cols\n\n {numerical_cols}\n\n',
)
print(*column_group_report, sep='\n')
df_not_encoded_raw.info()

bool_cols

 ['has_burglar_alarm', 'portable_electronics']


categorical_cols

 ['state', 'card_type', 'product']


numerical_cols

 ['previous_policies', 'square_ft']


<class 'pandas.core.frame.DataFrame'>
Index: 12397 entries, -9.16005e+18_2.49201e+18 to -9.03879e+18_-6.86341e+18
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    12397 non-null  object 
 1   product                  12397 non-null  object 
 2   square_ft                12397 non-null  float64
 3   has_fire_alarm           12397 non-null  bool   
 4   has_burglar_alarm        12397 non-null  bool   
 5   portable_electronics     12397 non-null  bool   
 6   coast                    12397 non-null  int64  
 7   fire_housing_proximity   12397 non-null  int64  
 8   previous_policies        12397 non-null  int64  
 9   user_age                 12397 non-null  float64
 10  card_type                12396 

###### 3.2) Define pipeline

In [56]:
# Print the helper's source so the pipeline definition is visible inside the notebook.
pipeline_source = inspect.getsource(pt.run_gridsearchCV_with_pipline)
print(pipeline_source)

    @staticmethod
    def run_gridsearchCV_with_pipline(X_train: pd.DataFrame,
                                      y_train: pd.Series,
                                      numerical_cols: List[str],
                                      categorical_cols: List[str],
                                      bool_cols: List[str],
                                      fixed_params: dict,
                                      gs_params: dict):
        '''
        Run grid search with cross validation pipeline
        :param X_train: pd.DataFrame - feature matrix
        :param y_train: pd.Series - target vactore
        :param numerical_cols: List[str] - numerical features
        :param categorical_cols: List[str] - Categorical features
        :param bool_cols: List[str] - Boolean features
        :param fixed_params: dict - not for grid search
        :param gs_params: dict -  for grid search
        :return: GridSearchCV model (contain best model params)
        '''
        categorical_

###### 3.3) Run grid search with pipeline

In [57]:
# Collect the keyword arguments first so the inputs of the search are easy to inspect.
grid_search_inputs = {
    'X_train': X_train,
    'y_train': y_train,
    'numerical_cols': numerical_cols,
    'categorical_cols': categorical_cols,
    'bool_cols': bool_cols,
    'fixed_params': fixed_params,
    'gs_params': gs_params,
}
grid = pt.run_gridsearchCV_with_pipline(**grid_search_inputs)

###### 3.4) GridSearchCV - results

In [58]:
# Tabulate the per-candidate cross-validation results and show the first rows.
# NOTE(review): in the displayed output split0_test_score (~0.50) is far below the
# other two folds (~0.89-0.90) - check how the folds are built (ordering / shuffling).
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__loss,param_model__max_depth,param_model__max_features,param_model__min_samples_leaf,param_model__min_samples_split,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,7.344966,0.222615,0.702963,0.024634,exponential,12,sqrt,20,40,400,"{'model__loss': 'exponential', 'model__max_dep...",0.500317,0.890861,0.899783,0.763654,0.186243,24
1,9.797974,0.405647,0.866664,0.08594,exponential,12,sqrt,20,40,500,"{'model__loss': 'exponential', 'model__max_dep...",0.501586,0.891585,0.899752,0.764308,0.185802,16
2,8.006838,0.186057,0.666034,0.035801,exponential,12,sqrt,20,30,400,"{'model__loss': 'exponential', 'model__max_dep...",0.500317,0.890861,0.899783,0.763654,0.186243,24
3,10.535018,0.211123,0.831514,0.032263,exponential,12,sqrt,20,30,500,"{'model__loss': 'exponential', 'model__max_dep...",0.501586,0.891585,0.899752,0.764308,0.185802,16
4,10.107229,0.363475,0.77571,0.061486,exponential,12,sqrt,10,40,400,"{'model__loss': 'exponential', 'model__max_dep...",0.499048,0.89248,0.897991,0.763173,0.186778,32


In [59]:
# Show the parameter combination selected by the grid search (keys carry the 'model__' prefix).
print(f"--- Best params ---\n\n{grid.best_params_}")

--- Best params ---

{'model__loss': 'exponential', 'model__max_depth': 14, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 10, 'model__min_samples_split': 40, 'model__n_estimators': 400}


In [60]:
# Show the score reported by the grid search for the selected combination
# (presumably the mean cross-validated score - confirm the scoring used by the helper).
print(f"--- Best score ---\n\n{grid.best_score_}")

--- Best score ---

0.7693178387733149


##### 3.5) Combine params before full training

In [61]:
# Merge the winning grid-search values into the fixed params. The merge is done in
# place on purpose: `final_model_params` is the same dict object as `fixed_params`,
# and the config-saving cell further down stores that merged dict.
# NOTE(review): the merged dict mixes un-prefixed keys ('max_features') with
# 'model__'-prefixed ones ('model__max_features') - confirm the consumer expects both.
for param_name, param_value in grid.best_params_.items():
    fixed_params[param_name] = param_value
final_model_params = fixed_params

params_report = f"--- Final model params ---\n\n{final_model_params}"
print(params_report)

--- Final model params ---

{'learning_rate': 0.1, 'max_features': 'sqrt', 'subsample': 0.8, 'random_state': 10, 'model__loss': 'exponential', 'model__max_depth': 14, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 10, 'model__min_samples_split': 40, 'model__n_estimators': 400}


##### 3.6) Performance on test set

In [62]:
# converting bool to int (imputer cant dill with booleans)
X_test = X_test*1

# Predict for test set 
y_pred = grid.predict(X_test)
pt.evaluate(y_true=y_test, y_pred = y_pred)

{'f1': 0.1,
 'confusion_matrix': array([[1784,   34],
        [  38,    4]])}

### 4) Save model

In [66]:
# Record today's dated path on the configs object so the config-saving cell persists it.
configs.grid_search.last_model_path = f'./production/grid_search_models/gread_search_{date.today()}.pkl'

# Make sure the target directory exists; joblib.dump does not create missing folders.
os.makedirs(os.path.dirname(configs.grid_search.last_model_path), exist_ok=True)

# Persist the fitted grid-search object; the returned list of written files is displayed.
joblib.dump(grid, configs.grid_search.last_model_path)

['./production/grid_search_models/gread_search_2021-04-24.pkl']

### 5) Save configs

In [65]:
# Rebuild the configuration document from the live objects instead of hard-coded
# literals, so a grid or path edited in the config file is not silently overwritten
# with stale values when this cell runs.
confs = {
    # Fixed params merged with the best grid-search params (built in the
    # "Combine params" cell; a NameError here means that cell was not run).
    "model_params": final_model_params,
    "model_path": configs.model_path,
    "grid_search": {
        "last_model_path": configs.grid_search.last_model_path,
        # Search space and fixed params exactly as they were loaded from the config.
        "grid_params": dict(vars(configs.grid_search.grid_params)),
        "fixed_params": dict(vars(configs.grid_search.fixed_params)),
    },
    "sanity": {"sanity_check_path": configs.sanity.sanity_check_path},
}

# Overwrite the configuration file that was read at the top of the notebook.
with open(config_path, 'w') as outfile:
    json.dump(confs, outfile)

##### Save 5 requests for sanity check

In [70]:
# Keep the first five training rows as {index label: {column: value}} records that
# can be replayed as sanity-check requests.
sanity_samples = X_train.iloc[0:5, :].to_dict("index")

# Write to the path declared in the configuration rather than a duplicated literal.
# json.dump returns None, so its result is not bound to a name.
with open(configs.sanity.sanity_check_path, 'w') as outfile:
    json.dump(sanity_samples, outfile)