# Importing libraries

In [7]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from category_encoders import TargetEncoder

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer

# Reading file and tidying

In [2]:
df_train = pd.read_csv("../dataset/train.csv")
df_test = pd.read_csv("../dataset/test.csv")

In [3]:
y_train = df_train["risk_flag"]
x_train = df_train.drop("risk_flag", axis=1)

y_test = df_test["risk_flag"]
x_test = df_test.drop("risk_flag", axis=1)

In [4]:
def target_encoding(df_x, df_y):
    x = df_x.copy()
    
    # Target Encoding — categorical columns with high cardinality: profession, city, state
    profession_target_enc = TargetEncoder()
    x["profession_encoded"] = profession_target_enc.fit_transform(x["profession"], df_y)
    
    city_target_enc = TargetEncoder()
    x["city_encoded"] = city_target_enc.fit_transform(x["city"], df_y)
    
    state_target_enc = TargetEncoder()
    x["state_encoded"] = state_target_enc.fit_transform(x["state"], df_y)
    
    x.drop("profession", axis=1, inplace=True)
    x.drop("city", axis=1, inplace=True)
    x.drop("state", axis=1, inplace=True)
    return x

x_train = target_encoding(x_train, y_train)
x_test = target_encoding(x_test, y_test)

  elif pd.api.types.is_categorical(cols):


In [5]:
xgb = XGBClassifier(use_label_encoder=False, random_state=2021)
      
oversampler = SMOTE(random_state=2021)
x_train, y_train = oversampler.fit_resample(x_train, y_train)

xgb.fit(x_train, y_train, eval_metric='logloss')
y_pred = xgb.predict(x_test)

print(f"-----------------------TEST SCORES-----------------------")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
print()

-----------------------TEST SCORES-----------------------
Recall: 0.9537022100338765
Precision: 0.13576136128780397
F2-Score: 0.4325241795063136
Accuracy score: 0.24757936507936507
AUC Score: 0.551125442701606



In [24]:
# Creating parameter grid to search
learning_rates = [0.2, 0.3, 0.4]
max_depths = [12, 15, 20]
min_child_weights = [0.8, 1.0, 1.2]
gammas = [0.5, 1.0, 1.5]
colsample_bytrees = [0.6, 0.8, 1.0]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__min_child_weight': min_child_weights,
                'classifier__gamma': gammas,
                'classifier__colsample_bytree': colsample_bytrees
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__learning_rate [0.2, 0.3, 0.4]
classifier__max_depth [12, 15, 20]
classifier__min_child_weight [0.8, 1.0, 1.2]
classifier__gamma [0.5, 1.0, 1.5]
classifier__colsample_bytree [0.6, 0.8, 1.0]
-----------------
Total combinations: 243


In [25]:
y_train = df_train["risk_flag"]
x_train = df_train.drop("risk_flag", axis=1)

te_features = ['profession', 'city','state']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

xgb_randomgridsearch = RandomizedSearchCV(estimator = pipeline,
                                   param_distributions = params_grid,
                                   scoring = 'recall',
                                   cv = stratified_kfold,
                                   refit = True,
                                   n_jobs = -1,
                                   random_state = 2021)

xgb_randomgridsearch.fit( x_train, y_train.values.ravel() )

xgb_randomgridsearch.best_params_

# {'classifier__min_child_weight': 1.2,
#  'classifier__max_depth': 20,
#  'classifier__learning_rate': 0.3,
#  'classifier__gamma': 1.0,
#  'classifier__colsample_bytree': 1.0}

  elif pd.api.types.is_categorical(cols):




{'classifier__min_child_weight': 1.2,
 'classifier__max_depth': 20,
 'classifier__learning_rate': 0.3,
 'classifier__gamma': 1.0,
 'classifier__colsample_bytree': 1.0}



  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):




  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):




  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):




  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):




  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):




  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):




In [None]:
learning_rates = [0.2, 0.3, 0.4]
max_depths = [15, 20, 25]
min_child_weights = [1.0, 1.2, 1.4]
gammas = [0.5, 1, 1.5]
colsample_bytrees = [0.8, 1.0, 1.2]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__min_child_weight': min_child_weights,
                'classifier__gamma': gammas,
                'classifier__colsample_bytree': colsample_bytrees
              }

te_features = ['profession', 'city','state']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit( x_train, y_train.values.ravel() )

xgb_gridsearch.best_params_