# Algorithm Modifications

### Import rebalanced dataset

In [1]:
import pandas as pd
import os
import numpy as np

os.chdir('/Users/owenxoual/Desktop/DataHEC/quintenProject/unbalanced_data/dataset/raw')
!pwd

/Users/owenxoual/Desktop/DataHEC/quintenProject/unbalanced_data/dataset/raw


In [2]:
df = pd.read_csv("creditcard.csv")

### Load functions from function.py script

In [3]:
os.chdir('/Users/owenxoual/Desktop/DataHEC/quintenProject/unbalanced_data/src')
!pwd

/Users/owenxoual/Desktop/DataHEC/quintenProject/unbalanced_data/src


In [4]:
import function
import master

### Resample datasets

In [5]:
from function import smote_enn, smote_tomek, getdataset

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = getdataset(df)

# Resample the training dataset
X_res, y_res = smote_enn(X_train, y_train, strategy = 0.094083)
X_res1, y_res1 = smote_tomek(X_train, y_train, strategy = 0.003492)

In [6]:
print(type(y_res))
unique, count = np.unique(y_res, return_counts=True)
print(unique, count)

<class 'numpy.ndarray'>
[0 1] [191947  12078]


In [7]:
print(type(y_res1))
unique1, count1 = np.unique(y_res1, return_counts=True)
print(unique1, count1)

<class 'numpy.ndarray'>
[0 1] [198923    597]


### Random Forest - Algorithm modifications

#### Randomized Grid Search for first estimation of model hyperparameter

Create a dictionary of the most impactful model hyperparameters to tune for the Random Forest classifier

In [10]:
from function import grid_search_dict_RF

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 400, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 70, num = 7)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Weights associated to each class
class_weight = ('balanced', 'balanced_subsample')


random_grid_RF = grid_search_dict_RF(n_estimators, 
                                     max_features,
                                     max_depth,
                                     min_samples_split,
                                     min_samples_leaf,
                                     class_weight
                                     )

print(random_grid_RF)

{'n_estimators': [100, 200, 300, 400], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'class_weight': ('balanced', 'balanced_subsample')}


Perform a random grid search on the dictionary of parameters specified above to get a first look at the range of optimal values. These values will be passed in a cross-validated grid search (more brute force) with more precise ranges to find an optimal solution.

In [12]:
from function import grid_search_random
import joblib

print("----> Beginning Random Grid Search")
grid_search_RF = [grid_search_random(X_res,
                                   y_res,
                                   X_test,
                                   method = "RF",
                                   random_grid = random_grid_RF,
                                   n_iter = 200,
                                   cv = 3)]
print("----> Random Grid Search Completed!")
print(grid_search_RF)
best_model_RF = grid_search_RF[0]

filename_RF = 'saved_model_RF.sav'
joblib.dump(grid_search_RF, filename_RF)

----> Beginning Random Grid Search
Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=70, class_weight=balanced_subsample 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

#### Cross-validated grid search for final hyperparameter choice

With the first estimate of the optimal hyperparameters, we now create another dictionary that is precise around the range of this first estimate to pinpoint an optimal solution

In [None]:
'''
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 400, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 70, num = 7)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Weights associated to each class
class_weight = ('balanced', 'balanced_subsample')


grid_cv_RF = grid_search_dict_RF(n_estimators, 
                                 max_features,
                                 max_depth,
                                 min_samples_split,
                                 min_samples_leaf,
                                 class_weight
                                 )
'''

With this more focused dictionary of model hyperparameters, we launch a cross-validated grid search to find the optimal solution among the proposed dictionary of hyperparameters (given each resampled dataset).

In [None]:
'''
from function import grid_search_CV

print("----> Beginning Cross-validated Grid Search")
best_param_cv_RF = grid_search_CV(X_res,
                               y_res,
                               method = "RF",
                               random_grid = grid_cv_RF,
                               n_iter = 2,
                               cv = 2)
print("----> Cross-validated Grid Search Completed!")
print(best_param_cv_RF)
'''

### XGBoost - Algorithm modifications

In [15]:
from function import grid_search_dict_XGB

# Minimum sum of weights of all observations required in a child - used to control overfitting
min_child_weight = [1, 3, 5, 7]
# Minimum loss reduction required to make a further partition on a leaf node of the tree
gamma = [0.01, 0.2, 0.5, 1]
# Subsample ratio of columns used to build each tree of the model
colsample_bytree = [0.3, 0.6, 1.0]
# Maximum depth of a tree 
max_depth = [4, 8, 14, 20]
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 400, num = 4)]
# L2 Regularization term
reg_lambda = [1, 2, 3]
# Step size shrinkage used in update to prevents overfitting.
learning_rate = [0.1, 0.01, 0.001]

'''
# Prior probability for the minority class - considered unimpactful given the size of the trees
unique, count = np.unique(y_res, return_counts=True)
min_maj_ratio = count[1]/count[0]
base_score = (0.5, min_maj_ratio)
'''



random_grid_XGB = {'min_child_weight': min_child_weight,
                   'gamma': gamma,
                   'colsample_bytree': colsample_bytree,
                   'max_depth': max_depth,
                   'n_estimators': n_estimators,
                   'reg_lambda': reg_lambda,
                   'learning_rate': learning_rate
                   }

print(random_grid_XGB)

{'min_child_weight': [1, 3, 5, 7], 'gamma': [0.01, 0.2, 0.5, 1, 2], 'colsample_bytree': [0.3, 0.5, 0.7, 1.0], 'max_depth': [4, 8, 14, 20], 'n_estimators': [100, 200, 300, 400], 'reg_lambda': [1, 1.5, 2, 3], 'learning_rate': [0.1, 0.01, 0.001]}


Perform a random grid search on the dictionary of parameters specified above to get a first look at the range of optimal values. These values will be passed in a cross-validated grid search (more brute force) with more precise ranges to find an optimal solution.

In [17]:
print("----> Beginning Random Grid Search")
grid_search_XGB = [grid_search_random(X_res,
                                   y_res,
                                   X_test,
                                   method = "XGB",
                                   random_grid = random_grid_XGB,
                                   n_iter = 200,
                                   cv = 3)]
print("----> Random Grid Search Completed!")
print(grid_search_XGB)
best_model_XGB = grid_search_XGB[0]

filename_XGB = 'saved_model_XGB.sav'
joblib.dump(grid_search_XGB, filename_XGB)

----> Beginning Random Grid Search
Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] reg_lambda=1.5, n_estimators=100, min_child_weight=3, max_depth=4, learning_rate=0.001, gamma=0.2, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 