In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import random

In [4]:
data = pd.read_csv('Data/full_data.csv')
col_list = data.columns.tolist()
features = col_list.copy()
features.remove('y')
num_cols = [col for col in features if data[col].dtype=='int64']
data_le = pd.read_csv('Data/encoded_data.csv')

In [5]:
data_mm = data_le.copy()
for num_col in num_cols:
    scaler = MinMaxScaler()
    data_mm[num_col] = scaler.fit_transform(data_le[num_col].values.reshape(-1, 1))

In [6]:
def CVRfc(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare KStratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data.index.values, data['y']):
        # Prepare KNN model 
        model = RandomForestClassifier(*args, **kwargs)
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds 

In [7]:
CVRfc(df=data_le)

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.999896054072912
ROC_AUC ON TEST SCORE 0.7764705320312717
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.9997692519530665
ROC_AUC ON TEST SCORE 0.7750502513054518
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.9997860962566845
ROC_AUC ON TEST SCORE 0.7777020561782865
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.999822750768748
ROC_AUC ON TEST SCORE 0.7757575946029098
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.9998930481283422
ROC_AUC ON TEST SCORE 0.7809612232280239


([0.999896054072912,
  0.9997692519530665,
  0.9997860962566845,
  0.999822750768748,
  0.9998930481283422],
 [0.7764705320312717,
  0.7750502513054518,
  0.7777020561782865,
  0.7757575946029098,
  0.7809612232280239],
 [array([0, 0, 1, ..., 0, 0, 1]),
  array([0, 1, 0, ..., 0, 0, 1]),
  array([0, 0, 0, ..., 0, 0, 0]),
  array([0, 1, 0, ..., 0, 1, 0]),
  array([0, 0, 0, ..., 1, 0, 0])])

In [8]:
def Rfc_h_tuning(data):
    test_list = []
    for i in range(60):
        param_dict = {
            'n_estimators': random.randrange(50, 200, 10),
            'max_depth': random.randrange(5, 25, 2),
            'max_features': random.randrange(2, 15, 1),
            'min_samples_split': random.randrange(2, 10, 1)
        }
        param_dict['min_samples_leaf'] = random.randrange(1, param_dict['min_samples_split'])

        train_results, test_results, preds = CVRfc(df=data, if_print=False, **param_dict)
        if i % 5 == 0:
            print(param_dict.items(), np.mean(test_results))
        test_list.append([param_dict.items(), np.mean(test_results)])
        
    return test_list

In [11]:
tuned_list = Rfc_h_tuning(data_le)
sorted_list = sorted(tuned_list, key=lambda x: x[-1], reverse=True)

dict_items([('n_estimators', 110), ('max_depth', 21), ('max_features', 11), ('min_samples_split', 2), ('min_samples_leaf', 1)]) 0.7815912089195638
dict_items([('n_estimators', 130), ('max_depth', 19), ('max_features', 10), ('min_samples_split', 6), ('min_samples_leaf', 5)]) 0.7789097770125462
dict_items([('n_estimators', 190), ('max_depth', 23), ('max_features', 12), ('min_samples_split', 6), ('min_samples_leaf', 5)]) 0.778940522023204
dict_items([('n_estimators', 80), ('max_depth', 9), ('max_features', 9), ('min_samples_split', 5), ('min_samples_leaf', 4)]) 0.7544939706254447
dict_items([('n_estimators', 130), ('max_depth', 21), ('max_features', 7), ('min_samples_split', 7), ('min_samples_leaf', 3)]) 0.7793107068445777
dict_items([('n_estimators', 140), ('max_depth', 5), ('max_features', 12), ('min_samples_split', 3), ('min_samples_leaf', 2)]) 0.7446385074265078
dict_items([('n_estimators', 50), ('max_depth', 7), ('max_features', 13), ('min_samples_split', 6), ('min_samples_leaf', 3)]

In [13]:
sorted_list

[[dict_items([('n_estimators', 110), ('max_depth', 21), ('max_features', 11), ('min_samples_split', 2), ('min_samples_leaf', 1)]),
  0.7815912089195638],
 [dict_items([('n_estimators', 160), ('max_depth', 17), ('max_features', 7), ('min_samples_split', 2), ('min_samples_leaf', 1)]),
  0.7810283581778764],
 [dict_items([('n_estimators', 100), ('max_depth', 17), ('max_features', 13), ('min_samples_split', 7), ('min_samples_leaf', 1)]),
  0.7806364435994351],
 [dict_items([('n_estimators', 90), ('max_depth', 19), ('max_features', 5), ('min_samples_split', 4), ('min_samples_leaf', 2)]),
  0.7803546851993091],
 [dict_items([('n_estimators', 170), ('max_depth', 23), ('max_features', 6), ('min_samples_split', 6), ('min_samples_leaf', 5)]),
  0.7803131685043786],
 [dict_items([('n_estimators', 140), ('max_depth', 19), ('max_features', 6), ('min_samples_split', 8), ('min_samples_leaf', 4)]),
  0.7798835279905207],
 [dict_items([('n_estimators', 50), ('max_depth', 15), ('max_features', 10), ('mi