# 54. Models (tuning Random Forest)
## Contents
- Prerequisites
- Tuning Random Forest
---------------------------------------------------------
## Prerequisites

In [1]:
import time
import os
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tqdm import tqdm
import joblib
import warnings
from sklearn.model_selection import cross_val_score
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
file_path = 'C:/Users/Koen.Janssen/OneDrive/Documents/TILBURG UNIVERSITY/Thesis/Data/'
parties = np.load('00_parties.npy')
options = ['imbalanced','oversampling','undersampling','balancedsampling']
rs = 1

------------------------------------------------
<br>
<br>
<br>


## Hyperparameter tuning
### Random Forest
--------------------------------------------

In [2]:
dur = time.time()
options = ['imbalanced']
rf_results = pd.DataFrame(columns=['party', 'option', 'n_est', 'max_depth', 'min_split','rf_score'])
# -------------------------------------------------------------------------------------~----------------------------------------
for party in tqdm(parties, desc = 'Party loop'):                                       # for each party
    for option in options:                                                             # for each sampling method

# load training data 
# -------------------------------------------------------------------------------------~----------------------------------------
        X_train_url = file_path + f"20_models/{option}/" + f"{party}_X_train.csv"      # define X_train url
        y_train_url = file_path + f"20_models/{option}/" + f"{party}_y_train.csv"      # define y_train url
        X_train = pd.read_csv(X_train_url)                                             # read X_train
        X_train = X_train.drop(columns=['source', 'text','stemming_id','document_id']) # redefine X_train without columns
        y_train = pd.read_csv(y_train_url)                                             # read y_train
        
# load validation data
# -------------------------------------------------------------------------------------~----------------------------------------
        X_valid_url = file_path + f"20_models/{option}/" + f"{party}_X_valid.csv"      # define X_valid url
        y_valid_url = file_path + f"20_models/{option}/" + f"{party}_y_valid.csv"      # define y_valid url
        X_valid = pd.read_csv(X_valid_url)                                             # read X_valid
        X_valid = X_valid.drop(columns=['source', 'text','stemming_id','document_id']) # redefine X-valid without columns
        y_valid = pd.read_csv(y_valid_url)                                             # read y_valid
        
# Parameter tuning
# -------------------------------------------------------------------------------------~----------------------------------------    
        rf_param_grid = {'n_est': [250,500],
                         'max_depth': [25,50],
                         'min_split': [2,5]}                                           # define parameter grid
        for n_est in rf_param_grid['n_est']:                                           # for each n_est
            for max_depth in rf_param_grid['max_depth']:                               # for each max_depth
                for min_split in rf_param_grid['min_split']:                           # for each min_split
                    try:                                                               # try
                        rf = RandomForestClassifier(n_estimators=n_est, max_depth=max_depth, min_samples_split=min_split, random_state=rs)
                        rf.fit(X_train, y_train)                                       # fit model
                        rf_score = rf.score(X_valid, y_valid)                          # score model
                        rf_result = pd.DataFrame({'party': [party],
                                                'option': [option],
                                                'n_est': [n_est],
                                                'max_depth': [max_depth],
                                                'min_split': [min_split],
                                                'rf_score': [rf_score]})               # define result
                        rf_results = pd.concat([rf_results, rf_result], ignore_index=True)
                    except ValueError as e:                                            # if error
                        continue                                                       # continue

# -------------------------------------------------------------------------------------~----------------------------------------
display(rf_results)                                                                    # display results
print('\n---------------------------------------------------------------------------------------------------------------------')
print(f"Code duration: {round((time.time()  - dur),3)} seconds")    

Party loop: 100%|██████████| 18/18 [4:53:38<00:00, 978.78s/it]   


Unnamed: 0,party,option,n_est,max_depth,min_split,rf_score
0,50PLUS,imbalanced,250,25,2,0.805645
1,50PLUS,imbalanced,250,25,5,0.804655
2,50PLUS,imbalanced,250,50,2,0.805397
3,50PLUS,imbalanced,250,50,5,0.803664
4,50PLUS,imbalanced,500,25,2,0.806140
...,...,...,...,...,...,...
139,Volt,imbalanced,250,50,5,0.807587
140,Volt,imbalanced,500,25,2,0.807037
141,Volt,imbalanced,500,25,5,0.802639
142,Volt,imbalanced,500,50,2,0.805388



---------------------------------------------------------------------------------------------------------------------
Code duration: 17618.154 seconds


In [3]:
# -------------------------------------------------------------------------------------~----------------------------------------
best_rf = rf_results.sort_values(by=['party', 'rf_score'], ascending=[True, False])    # sort highest values per party
best_rf = best_rf.groupby('party').apply(lambda x: x.head(1))                          # define highest values per party
display(best_rf)                                                                       # display best values
avg_best_rf = best_rf['rf_score'].mean()                                               # define average best values
print("Average accuracy: ", avg_best_rf)                                               # print average best values
file_url  = file_path + '54_data_tuned_random_forest.csv' 
best_rf.to_csv(file_url, index=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,party,option,n_est,max_depth,min_split,rf_score
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
50PLUS,6,50PLUS,imbalanced,500,50,2,0.80713
BBB,104,BBB,imbalanced,250,25,2,0.841314
BIJ1,114,BIJ1,imbalanced,250,50,2,0.790968
BVNL,100,BVNL,imbalanced,500,25,2,0.692783
CDA,9,CDA,imbalanced,250,25,5,0.676357
CU,21,CU,imbalanced,500,25,5,0.691412
D66,31,D66,imbalanced,500,50,5,0.695104
DENK,38,DENK,imbalanced,500,50,2,0.840087
FVD,44,FVD,imbalanced,500,25,2,0.714566
GLPvdA,51,GLPvdA,imbalanced,250,50,5,0.822889


Average accuracy:  0.7543124566433861
