# 53. Models (tuning Logistic Regression)
## Contents
- Prerequisites
- Tuning Logistic Regression
---------------------------------------------------------
## Prerequisites

In [1]:
import time
import os
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tqdm import tqdm
import joblib
import warnings
from sklearn.model_selection import cross_val_score
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
file_path = 'C:/Users/Koen.Janssen/OneDrive/Documents/TILBURG UNIVERSITY/Thesis/Data/'
parties = np.load('00_parties.npy')
options = ['imbalanced','oversampling','undersampling','balancedsampling']
rs = 1

------------------------------------------------
<br>
<br>
<br>


## Hyperparameter tuning
### Logistic Regression
--------------------------------------------

In [2]:
dur = time.time()
options = ['imbalanced']
lr_results = pd.DataFrame(columns=['party', 'option','C','max_iter','lr_score'])
# -------------------------------------------------------------------------------------~----------------------------------------
for party in tqdm(parties, desc = 'Party loop'):                                       # for each party
    for option in options:                                                             # for each sampling method

# load training data 
# -------------------------------------------------------------------------------------~----------------------------------------
        X_train_url = file_path + f"20_models/{option}/" + f"{party}_X_train.csv"      # define X_train url
        y_train_url = file_path + f"20_models/{option}/" + f"{party}_y_train.csv"      # define y_train url
        X_train = pd.read_csv(X_train_url)                                             # read X_train
        X_train = X_train.drop(columns=['source', 'text','stemming_id','document_id']) # redefine X_train without columns
        y_train = pd.read_csv(y_train_url)                                             # read y_train
        
# load validation data
# -------------------------------------------------------------------------------------~----------------------------------------      
        X_valid_url = file_path + f"20_models/{option}/" + f"{party}_X_valid.csv"      # define X_valid url
        y_valid_url = file_path + f"20_models/{option}/" + f"{party}_y_valid.csv"      # define y_valid url
        X_valid = pd.read_csv(X_valid_url)                                             # read X_valid
        X_valid = X_valid.drop(columns=['source', 'text','stemming_id','document_id']) # redefine X-valid without columns
        y_valid = pd.read_csv(y_valid_url)                                             # read y_valid
        
# Parameter tuning
# -------------------------------------------------------------------------------------~----------------------------------------        
        lr_param_grid = {'C':           [0.001, 0.01, 0.1, 1, 10, 100,1000],       
                         'max_iter':    [10,100,1000,10000]}                           # define parameter grid
        for C in lr_param_grid['C']:                                                   # for each C value
            for max_iter in lr_param_grid['max_iter']:                                 # for each max_iter value
                try:                                                                   # try
                    lr = LogisticRegression(C=C, max_iter=max_iter, random_state=rs)   # define model, logistic regression
                    lr.fit(X_train, y_train)                                           # fit model
                    lr_score = lr.score(X_valid, y_valid)                              # score model
                    lr_result = pd.DataFrame({'party':    [party],
                                              'option':   [option],
                                              'C':        [C], 
                                              'max_iter': [max_iter],
                                              'lr_score': [lr_score]})                 # define result
                    lr_results = pd.concat([lr_results, lr_result], ignore_index=True) # redefine results
                except ValueError as e:                                                # if error
                    continue                                                           # continue

# -------------------------------------------------------------------------------------~----------------------------------------
display(lr_results)                                                                    # display results
print('\n---------------------------------------------------------------------------------------------------------------------')
print(f"Code duration: {round((time.time()  - dur),3)} seconds")    

Party loop: 100%|██████████| 18/18 [04:11<00:00, 13.97s/it]


Unnamed: 0,party,option,C,max_iter,lr_score
0,50PLUS,imbalanced,0.001,10,0.790295
1,50PLUS,imbalanced,0.001,100,0.796484
2,50PLUS,imbalanced,0.001,1000,0.796979
3,50PLUS,imbalanced,0.001,10000,0.796979
4,50PLUS,imbalanced,0.010,10,0.790295
...,...,...,...,...,...
499,Volt,imbalanced,100.000,10000,0.798791
500,Volt,imbalanced,1000.000,10,0.757009
501,Volt,imbalanced,1000.000,100,0.798241
502,Volt,imbalanced,1000.000,1000,0.800990



---------------------------------------------------------------------------------------------------------------------
Code duration: 251.521 seconds


In [4]:
# -------------------------------------------------------------------------------------~----------------------------------------
best_lr = lr_results.sort_values(by=['party', 'lr_score'], ascending=[True, False])    # sort highest values per party
best_lr = best_lr.groupby('party').apply(lambda x: x.head(1))                          # define highest values per party
display(best_lr)                                                                       # display best values
avg_best_lr = best_lr['lr_score'].mean()                                               # define average best values
print("Average accuracy: ", avg_best_lr)                                               # print average best values
file_url  = file_path + '53_data_tuned_logistic_regression.csv' 
best_lr.to_csv(file_url, index=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,party,option,C,max_iter,lr_score
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
50PLUS,18,50PLUS,imbalanced,10.0,1000,0.802179
BBB,365,BBB,imbalanced,0.001,100,0.837973
BIJ1,419,BIJ1,imbalanced,1000.0,10000,0.782581
BVNL,345,BVNL,imbalanced,0.1,100,0.680046
CDA,29,CDA,imbalanced,0.001,100,0.676357
CU,78,CU,imbalanced,100.0,1000,0.687045
D66,85,D66,imbalanced,0.001,100,0.682744
DENK,133,DENK,imbalanced,100.0,100,0.834264
FVD,154,FVD,imbalanced,1.0,1000,0.695194
GLPvdA,190,GLPvdA,imbalanced,100.0,1000,0.821195


Average accuracy:  0.7456245106521546
