# 55. Models (tuning Support Vector Machine)
## Contents
- Prerequisites
- Tuning Support Vector Machine
---------------------------------------------------------
## Prerequisites

In [1]:
import time
import os
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from tqdm import tqdm
import joblib
import warnings
from sklearn.model_selection import cross_val_score
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
file_path = 'C:/Users/Koen.Janssen/OneDrive/Documents/TILBURG UNIVERSITY/Thesis/Data/'
parties = np.load('00_parties.npy')
options = ['imbalanced','oversampling','undersampling','balancedsampling']
rs = 1

------------------------------------------------
<br>
<br>
<br>


## Hyperparameter tuning
### Support Vector Machine
--------------------------------------------

In [4]:
dur = time.time()
options = ['imbalanced']
svm_results = pd.DataFrame(columns=['party', 'option', 'C', 'kernel','gamma','svm_score'])
# -------------------------------------------------------------------------------------~----------------------------------------
for party in tqdm(parties, desc = 'Party loop'):                                       # for each party
    for option in options:                                                             # for each sampling method

# load training data 
# -------------------------------------------------------------------------------------~----------------------------------------
        X_train_url = file_path + f"20_models/{option}/" + f"{party}_X_train.csv"      # define X_train url
        y_train_url = file_path + f"20_models/{option}/" + f"{party}_y_train.csv"      # define y_train url
        X_train = pd.read_csv(X_train_url)                                             # read X_train
        X_train = X_train.drop(columns=['source', 'text','stemming_id','document_id']) # redefine X_train without columns
        y_train = pd.read_csv(y_train_url)                                             # read y_train
        
# load validation data
# -------------------------------------------------------------------------------------~----------------------------------------
        X_valid_url = file_path + f"20_models/{option}/" + f"{party}_X_valid.csv"      # define X_valid url
        y_valid_url = file_path + f"20_models/{option}/" + f"{party}_y_valid.csv"      # define y_valid url
        X_valid = pd.read_csv(X_valid_url)                                             # read X_valid
        X_valid = X_valid.drop(columns=['source', 'text','stemming_id','document_id']) # redefine X-valid without columns
        y_valid = pd.read_csv(y_valid_url)                                             # read y_valid
                
# Parameter tuning
# -------------------------------------------------------------------------------------~----------------------------------------
        svm_param_grid = {'C': [0.1,1, 10],
                          'kernel': ['rbf'],                                           
                          'gamma': ['scale', 0.1, 0.01]}                               # define parameter grid
        for C in svm_param_grid['C']:                                                  # for each C
            for kernel in svm_param_grid['kernel']:                                    # for each kernel
                for gamma in svm_param_grid['gamma']:                                  # for each gamma 
                    try:                                                                   # try
                        svm = SVC(C=C, kernel=kernel, random_state=rs)                     # define model
                        svm.fit(X_train, y_train.values.ravel())                           # fit model
                        svm_score = svm.score(X_valid, y_valid)                            # score model
                        svm_result = pd.DataFrame({'party': [party],
                                                   'option': [option],
                                                   'C': [C],
                                                   'kernel': [kernel],
                                                   'gamma': [gamma],
                                                   'svm_score': [svm_score]})              # define result
                        svm_results = pd.concat([svm_results, svm_result], ignore_index=True)
                    except ValueError as e:                                                # if error
                        continue                                                           # continue

# -------------------------------------------------------------------------------------~----------------------------------------
display(svm_results)                                                                   # display results
print('\n---------------------------------------------------------------------------------------------------------------------')
print(f"Code duration: {round((time.time()  - dur),3)} seconds")    

Party loop: 100%|██████████| 18/18 [37:00<00:00, 123.38s/it]


Unnamed: 0,party,option,C,kernel,gamma,svm_score
0,50PLUS,imbalanced,0.1,rbf,scale,0.790295
1,50PLUS,imbalanced,0.1,rbf,0.1,0.790295
2,50PLUS,imbalanced,0.1,rbf,0.01,0.790295
3,50PLUS,imbalanced,1.0,rbf,scale,0.790295
4,50PLUS,imbalanced,1.0,rbf,0.1,0.790295
...,...,...,...,...,...,...
157,Volt,imbalanced,1.0,rbf,0.1,0.757009
158,Volt,imbalanced,1.0,rbf,0.01,0.757009
159,Volt,imbalanced,10.0,rbf,scale,0.757009
160,Volt,imbalanced,10.0,rbf,0.1,0.757009



---------------------------------------------------------------------------------------------------------------------
Code duration: 2220.827 seconds


In [5]:
# -------------------------------------------------------------------------------------~----------------------------------------
best_svm = svm_results.sort_values(by=['party', 'svm_score'], ascending=[True, False]) # sort highest values per party
best_svm = best_svm.groupby('party').apply(lambda x: x.head(1))                        # define highest values per party
display(best_svm)                                                                      # display best values
avg_best_svm = best_svm['svm_score'].mean()                                            # define average best values
print("Average accuracy: ", avg_best_svm)                                              # print average best values
file_url  = file_path + '55_data_tuned_support_vector_machine.csv' 
best_svm.to_csv(file_url, index=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,party,option,C,kernel,gamma,svm_score
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
50PLUS,0,50PLUS,imbalanced,0.1,rbf,scale,0.790295
BBB,117,BBB,imbalanced,0.1,rbf,scale,0.830735
BIJ1,126,BIJ1,imbalanced,0.1,rbf,scale,0.758065
BVNL,108,BVNL,imbalanced,0.1,rbf,scale,0.63296
CDA,15,CDA,imbalanced,10.0,rbf,scale,0.544816
CU,24,CU,imbalanced,10.0,rbf,scale,0.571082
D66,33,D66,imbalanced,10.0,rbf,scale,0.557925
DENK,36,DENK,imbalanced,0.1,rbf,scale,0.8248
FVD,45,FVD,imbalanced,0.1,rbf,scale,0.605934
GLPvdA,54,GLPvdA,imbalanced,0.1,rbf,scale,0.801839


Average accuracy:  0.6932356457353035
