In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import random

In [7]:
data = pd.read_csv('full_data.csv')
col_list = data.columns.tolist()
features = col_list.copy()
features.remove('y')
num_cols = [col for col in features if data[col].dtype=='int64']
data_le = pd.read_csv('encoded_data.csv')

In [16]:
data_mm = data_le.copy()
for num_col in num_cols:
    scaler = MinMaxScaler()
    data_mm[num_col] = scaler.fit_transform(data_le[num_col].values.reshape(-1, 1))

In [9]:
def CVRfc(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare KStratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data.index.values, data['y']):
        # Prepare KNN model 
        model = RandomForestClassifier(*args, **kwargs)
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds 

In [5]:
CVRfc(df=data_le)

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.9998227336088584
ROC_AUC ON TEST SCORE 0.7750840956071594
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.9997692519530665
ROC_AUC ON TEST SCORE 0.7709149438593255
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.9998594052808116
ROC_AUC ON TEST SCORE 0.7772499723169894
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.999876226704577
ROC_AUC ON TEST SCORE 0.7706953609493175
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.999822750768748
ROC_AUC ON TEST SCORE 0.7806489795981826


([0.9998227336088584,
  0.9997692519530665,
  0.9998594052808116,
  0.999876226704577,
  0.999822750768748],
 [0.7750840956071594,
  0.7709149438593255,
  0.7772499723169894,
  0.7706953609493175,
  0.7806489795981826],
 [array([0, 0, 1, ..., 0, 0, 1]),
  array([0, 1, 0, ..., 0, 0, 1]),
  array([0, 1, 0, ..., 0, 0, 0]),
  array([0, 1, 0, ..., 0, 1, 0]),
  array([0, 0, 0, ..., 1, 0, 0])])

In [18]:
def Rfc_h_tuning(data):
    test_list = []
    for i in range(60):
        param_dict = {
            'n_estimators': random.randrange(50, 200, 10),
            'max_depth': random.randrange(5, 25, 2),
            'max_features': random.randrange(2, 15, 1),
            'min_samples_split': random.randrange(2, 10, 1)
        }
        param_dict['min_samples_leaf'] = random.randrange(1, param_dict['min_samples_split'])

        train_results, test_results, preds = CVRfc(df=data, if_print=False, **param_dict)
        if i % 5 == 0:
            print(param_dict.items(), np.mean(test_results))
        test_list.append([param_dict.items(), np.mean(test_results)])
        
    return test_list

In [11]:
tuned_list = Rfc_h_tuning()

dict_items([('n_estimators', 130), ('max_depth', 5), ('max_features', 5), ('min_samples_split', 6), ('min_samples_leaf', 4)]) 0.7410283059305712
dict_items([('n_estimators', 60), ('max_depth', 15), ('max_features', 10), ('min_samples_split', 3), ('min_samples_leaf', 2)]) 0.7795067647020287
dict_items([('n_estimators', 80), ('max_depth', 17), ('max_features', 3), ('min_samples_split', 6), ('min_samples_leaf', 5)]) 0.7744738859329147
dict_items([('n_estimators', 120), ('max_depth', 13), ('max_features', 11), ('min_samples_split', 8), ('min_samples_leaf', 6)]) 0.7748085581622572
dict_items([('n_estimators', 160), ('max_depth', 5), ('max_features', 14), ('min_samples_split', 4), ('min_samples_leaf', 3)]) 0.7456613160098896
dict_items([('n_estimators', 110), ('max_depth', 11), ('max_features', 7), ('min_samples_split', 6), ('min_samples_leaf', 2)]) 0.7690065149329117
dict_items([('n_estimators', 100), ('max_depth', 21), ('max_features', 11), ('min_samples_split', 5), ('min_samples_leaf', 4)

In [19]:
min_max_tuned = Rfc_h_tuning(data_mm)

dict_items([('n_estimators', 50), ('max_depth', 9), ('max_features', 4), ('min_samples_split', 6), ('min_samples_leaf', 4)]) 0.7496052016971919
dict_items([('n_estimators', 130), ('max_depth', 15), ('max_features', 13), ('min_samples_split', 3), ('min_samples_leaf', 2)]) 0.779383231966372
dict_items([('n_estimators', 130), ('max_depth', 19), ('max_features', 4), ('min_samples_split', 7), ('min_samples_leaf', 3)]) 0.7785869409789805
dict_items([('n_estimators', 130), ('max_depth', 15), ('max_features', 5), ('min_samples_split', 7), ('min_samples_leaf', 1)]) 0.7770575873389887
dict_items([('n_estimators', 80), ('max_depth', 15), ('max_features', 14), ('min_samples_split', 3), ('min_samples_leaf', 1)]) 0.7799326037131348
dict_items([('n_estimators', 180), ('max_depth', 23), ('max_features', 11), ('min_samples_split', 4), ('min_samples_leaf', 2)]) 0.7807318363822215
dict_items([('n_estimators', 120), ('max_depth', 9), ('max_features', 3), ('min_samples_split', 4), ('min_samples_leaf', 1)])

In [19]:
list1 = [1, 43, 45, 5, 67, 34,5 ]

In [31]:
print(param_dict.values())

dict_values([160, 21, 13, 5])


In [42]:
param_dict.items()

dict_items([('n_estimators', 160), ('max_depth', 21), ('max_features', 13), ('min_samples_split', 5)])