In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import time
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report 

# Parameter tuning

For Hyperparameter Tuning we considered GridSearchCV and RandomizedSearchCV. GridSearchCV evaluates all the options listed in the parameter grid, which can be computationally expensive. Therefore, we chose RandomizedSearchCV, which randomly chooses from different parameter combinations to find the best score.

In [10]:
combined_results = pd.read_csv('data/combined_results.csv')

## 1. Dataset Congressional Voting

In [11]:
congressional_voting_results = combined_results[combined_results['dataset'] == 'congression_voting']

Top 5 combinations based on F1 score:

In [12]:
congressional_voting_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
82,congression_voting,RandomForestClassifier,holdout,none,standard,16,16,0.954545,0.954678,0.954545,0.95438,0.012542,0.084082,0.0189,
88,congression_voting,RandomForestClassifier,holdout,none,robust,16,16,0.954545,0.954678,0.954545,0.95438,0.0,0.104615,0.0,
94,congression_voting,RandomForestClassifier,holdout,none,minmax,16,16,0.954545,0.954678,0.954545,0.95438,0.00074,0.08813,0.016628,
100,congression_voting,RandomForestClassifier,holdout,p-value,none,16,14,0.954545,0.954678,0.954545,0.95438,0.032598,0.083973,0.018123,
106,congression_voting,RandomForestClassifier,holdout,p-value,standard,16,14,0.954545,0.954678,0.954545,0.95438,0.03832,0.095003,0.0,


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: standard

In [13]:
# Read the preprocessed dataset
congressional_voting = pd.read_csv("./preprocessed-datasets/CongressionVoting_prepro.csv")
X = congressional_voting.drop("class", axis=1) 
y = congressional_voting['class'] 
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
} 

In [14]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

In [15]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = congressional_voting_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.9543804053607976
f1 score of the best estimator from RandomizedSearchCV: 0.9696969696969697


Our best combination (classifier: RandomForest, feature_selection: None, scaling: standard) for the congressional voting dataset had an f1 score = 0.95438. With RandomizedSearchCV we found the best parameter setting which result in a higher f1 score.  

## 2. Dataset Bank Marketing

In [16]:
bank_marketing_results = combined_results[combined_results['dataset'] == 'bank_marketing']

In [17]:
bank_marketing_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
221,bank_marketing,RandomForestClassifier,cv,none,minmax,34,34,0.93919,0.939728,0.93919,0.939171,1.2e-05,6.99521,0.288231,0.001061
220,bank_marketing,RandomForestClassifier,holdout,none,minmax,34,34,0.937389,0.938072,0.937389,0.937368,7e-06,6.563403,0.498113,
233,bank_marketing,RandomForestClassifier,cv,correlation,minmax,34,26,0.935441,0.936028,0.935441,0.935419,0.15921,7.780731,0.274147,0.00123
232,bank_marketing,RandomForestClassifier,holdout,correlation,minmax,34,26,0.933604,0.934403,0.933604,0.933577,0.149638,6.250977,0.42551,
227,bank_marketing,RandomForestClassifier,cv,p-value,minmax,34,30,0.931597,0.931801,0.931597,0.931589,0.731339,9.361306,0.421076,0.000658


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: minmax

Besides, we also apply SMOTE on the bank marketing dataset, just like in the baseline experimentations.

In [18]:
def apply_smote(df, target_column, k_neighbors=4, random_state=321):

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    X_resampled, y_resampled = sm.fit_resample(X, y)

    return X_resampled, y_resampled

In [19]:
# Read the preprocessed dataset
bank_marketing = pd.read_csv("./preprocessed-datasets/bank_marketing_prepro.csv")
target_column = 'class'
X,y = apply_smote(bank_marketing,target_column)
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
} 

In [20]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

In [22]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = bank_marketing_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.9391710746130112
f1 score of the best estimator from RandomizedSearchCV: 0.9372780628807765


The search space explored by RandomizedSearchCV did not include a set of hyperparameters that performed better than the ones in the original best model.

## 3. Dataset Wine Quality

In [27]:
wine_quality_results = combined_results[combined_results['dataset'] == 'wine_quality']

In [28]:
wine_quality_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
23,wine_quality,RandomForestClassifier,cv,none,minmax,13,13,0.891145,0.887836,0.891145,0.888645,0.005564,3.137737,0.063741,0.007228
41,wine_quality,RandomForestClassifier,cv,p-value,robust,13,11,0.89049,0.88729,0.89049,0.88801,0.058313,3.11937,0.066823,0.007146
11,wine_quality,RandomForestClassifier,cv,none,standard,13,13,0.889986,0.886811,0.889986,0.887407,0.006954,3.088176,0.062916,0.006602
5,wine_quality,RandomForestClassifier,cv,none,none,13,13,0.889835,0.886253,0.889835,0.887116,0.000399,3.160678,0.066656,0.006167
35,wine_quality,RandomForestClassifier,cv,p-value,standard,13,11,0.889483,0.886272,0.889483,0.887033,0.047879,3.140681,0.066032,0.006499


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: minmax

Besides, we also apply SMOTE on the wine quality dataset, just like in the baseline experimentations.

In [43]:
# Read the preprocessed dataset
wine_quality = pd.read_csv("./preprocessed-datasets/wine_quality_prepro.csv")
target_column = 'class'
X,y = apply_smote(wine_quality,target_column)
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 200, 500], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [5, 25, 50],
    'min_samples_split': [2, 5, 10],
}

In [51]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

In [52]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = wine_quality_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.8886451647615082
f1 score of the best estimator from RandomizedSearchCV: 0.8743748512466861


The search space explored by RandomizedSearchCV did not include a set of hyperparameters that performed better than the ones in the original best model.

## 4. Dataset Amazon Reviews

In [57]:
reviews_results = combined_results[combined_results['dataset'] == 'reviews']

Top 5 combinations based on F1 score:

In [58]:
reviews_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
154,reviews,RandomForestClassifier,holdout,none,standard,10000,10000,0.533333,0.633823,0.533333,0.524547,1.145058,1.584533,0.050707,
185,reviews,RandomForestClassifier,cv,p-value,robust,10000,6054,0.526667,0.581471,0.526667,0.511494,73.258305,1.383942,0.028825,0.051475
184,reviews,RandomForestClassifier,holdout,p-value,robust,10000,5797,0.52,0.60466,0.52,0.508507,68.972446,1.184847,0.03258,
191,reviews,RandomForestClassifier,cv,p-value,minmax,10000,6054,0.522667,0.584946,0.522667,0.507046,73.231496,1.372783,0.027929,0.054543
179,reviews,RandomForestClassifier,cv,p-value,standard,10000,6054,0.521333,0.592879,0.521333,0.506898,72.612922,1.398013,0.029681,0.038711


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: standard

In [59]:
# Read the preprocessed dataset
reviews = pd.read_csv("./preprocessed-datasets/Review_prepro.csv")
X = reviews.drop("class", axis=1) 
y = reviews['class'] 
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
} 

In [60]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 



In [63]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = reviews_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.5245474278807613
f1 score of the best estimator from RandomizedSearchCV: 0.4750727249027903
