In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Parameter tuning

For Hyperparameter Tuning we considered GridSearchCV and RandomizedSearchCV. GridSearchCV evaluates all the options listed in the parameter grid, which can be computationally expensive. Therefore, we chose RandomizedSearchCV, which randomly chooses from different parameter combinations to find the best score.

For each dataset, we carry out a parameter tuning for the best combination (highest f1 score) from the baseline. RandomForestClassifier gave the highest f1 score for each dataset, except for the Congressional Voting. In case of the Congressional Voting dataset, DecisionTreeClassifier and RandomForestClassifier both gave the highest f1 scores. For this dataset, we perform a parameter tuning for all 3 classifiers.

In [2]:
combined_results = pd.read_csv('data/combined_results.csv')

## 1. Dataset Congressional Voting

In [3]:
congressional_voting_results = combined_results[combined_results['dataset'] == 'congression_voting']

Top combinations based on F1 score:

In [4]:
congressional_voting_results.nlargest(10, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
82,congression_voting,RandomForestClassifier,holdout,none,standard,16,16,0.954545,0.954678,0.954545,0.95438,0.012542,0.084082,0.0189,
88,congression_voting,RandomForestClassifier,holdout,none,robust,16,16,0.954545,0.954678,0.954545,0.95438,0.0,0.104615,0.0,
94,congression_voting,RandomForestClassifier,holdout,none,minmax,16,16,0.954545,0.954678,0.954545,0.95438,0.00074,0.08813,0.016628,
100,congression_voting,RandomForestClassifier,holdout,p-value,none,16,14,0.954545,0.954678,0.954545,0.95438,0.032598,0.083973,0.018123,
106,congression_voting,RandomForestClassifier,holdout,p-value,standard,16,14,0.954545,0.954678,0.954545,0.95438,0.03832,0.095003,0.0,
110,congression_voting,DecisionTreeClassifier,holdout,p-value,robust,16,14,0.954545,0.954678,0.954545,0.95438,0.04763,0.000996,0.0,
112,congression_voting,RandomForestClassifier,holdout,p-value,robust,16,14,0.954545,0.954678,0.954545,0.95438,0.02452,0.099888,0.0,
116,congression_voting,DecisionTreeClassifier,holdout,p-value,minmax,16,14,0.954545,0.954678,0.954545,0.95438,0.047163,0.000966,0.0,
118,congression_voting,RandomForestClassifier,holdout,p-value,minmax,16,14,0.954545,0.954678,0.954545,0.95438,0.029113,0.096108,0.004035,
97,congression_voting,KNeighborsClassifier,cv,p-value,none,16,13,0.953911,0.954145,0.953911,0.953808,0.030002,0.000139,0.003349,0.014932


Best combination per classifier:

In [5]:
top_per_classifier = congressional_voting_results.groupby('classifier').apply(lambda group: group.nlargest(3, 'f1')).reset_index(drop=True)

top_per_classifier

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
0,congression_voting,DecisionTreeClassifier,holdout,p-value,robust,16,14,0.954545,0.954678,0.954545,0.95438,0.04763,0.000996,0.0,
1,congression_voting,DecisionTreeClassifier,holdout,p-value,minmax,16,14,0.954545,0.954678,0.954545,0.95438,0.047163,0.000966,0.0,
2,congression_voting,DecisionTreeClassifier,cv,p-value,robust,16,13,0.940063,0.940295,0.940063,0.939847,0.035642,0.0,0.0,0.011615
3,congression_voting,KNeighborsClassifier,cv,p-value,none,16,13,0.953911,0.954145,0.953911,0.953808,0.030002,0.000139,0.003349,0.014932
4,congression_voting,KNeighborsClassifier,cv,p-value,robust,16,13,0.953911,0.954145,0.953911,0.953808,0.03702,0.000171,0.000147,0.014932
5,congression_voting,KNeighborsClassifier,cv,p-value,minmax,16,13,0.953911,0.954145,0.953911,0.953808,0.033656,0.0,0.005726,0.014932
6,congression_voting,RandomForestClassifier,holdout,none,standard,16,16,0.954545,0.954678,0.954545,0.95438,0.012542,0.084082,0.0189,
7,congression_voting,RandomForestClassifier,holdout,none,robust,16,16,0.954545,0.954678,0.954545,0.95438,0.0,0.104615,0.0,
8,congression_voting,RandomForestClassifier,holdout,none,minmax,16,16,0.954545,0.954678,0.954545,0.95438,0.00074,0.08813,0.016628,


### 1.1. RandomForestClassifier
For parameter tuning we use one of the best combinations:
- feature_selection: None
- scaling: standard

In [6]:
# Read the preprocessed dataset
congressional_voting = pd.read_csv("./preprocessed-datasets/CongressionVoting_prepro.csv")
congressional_voting.set_index('ID', inplace=True)
X = congressional_voting.drop("class", axis=1) 
y = congressional_voting['class'] 
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
} 

In [7]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

In [8]:
cv_results = random_search.cv_results_

results_data = {
    'n_estimators': cv_results['param_n_estimators'],
    'max_features': cv_results['param_max_features'],
    'max_depth': cv_results['param_max_depth'],
    'min_samples_split': cv_results['param_min_samples_split'],
    'min_samples_leaf': cv_results['param_min_samples_leaf'],
    'F1_score': cv_results['mean_test_score'],
    'Runtime': cv_results['mean_fit_time'] + cv_results['mean_score_time']
}


results = pd.DataFrame(data=results_data)
results.sort_values(by=['F1_score'], ascending=False, inplace=True)
results.reset_index(drop=True, inplace=True)
results


Unnamed: 0,n_estimators,max_features,max_depth,min_samples_split,min_samples_leaf,F1_score,Runtime
0,25,sqrt,20,2,1,0.954133,0.063357
1,50,log2,20,2,2,0.9538,0.208429
2,100,sqrt,30,10,1,0.947393,0.205302
3,150,,10,10,1,0.947393,0.284539
4,50,sqrt,20,5,2,0.941087,0.119194
5,25,,20,10,2,0.940444,0.076667
6,25,sqrt,30,2,2,0.934476,0.268061
7,150,,10,2,1,0.934147,0.471824
8,150,log2,30,2,1,0.934058,0.369178
9,100,log2,10,5,4,0.92091,0.368389


In [9]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = congressional_voting_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.9543804053607976
f1 score of the best estimator from RandomizedSearchCV: 0.9543804053607975


One of the best combinations (classifier: RandomForest, feature_selection: None, scaling: standard) for the congressional voting dataset had an f1 score = 0.95438. With RandomizedSearchCV we found the best parameter setting which result in a higher f1 score.  

### 1.2. KNeighborsClassifier

For parameter tuning of the KNeighborsClassifier we use one of its the best combinations:
- feature_selection: p-value
- scaling: None

In [10]:
congressional_voting = pd.read_csv("./preprocessed-datasets/CongressionVoting_prepro.csv")
congressional_voting.set_index('ID', inplace=True)
X = congressional_voting.drop("class", axis=1) 
y = congressional_voting['class'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

# Feature selection
X_train = X_train.loc[:, X_train.apply(pd.Series.nunique) != 1]
p_values = []
for feature in X_train.columns:
    f, p = chi2(X_train[[feature]], y_train)
    p_values.append(p)

p_values = np.array(p_values).reshape(-1)
p_values = pd.Series(p_values, index=X_train.columns).sort_values()
selected_features = p_values[p_values < 0.05].index

X_train_preprocessed, X_test_preprocessed = X_train[selected_features], X_test[selected_features]

print("Without feature selection: ", X_train.shape[1])
print("With feature selection: ", len(selected_features))

# Define the parameter grid for RandomizedSearchCV
param_grid_knn = {
    'n_neighbors': [1, 5, 10, 50, 100],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

Without feature selection:  16
With feature selection:  14


In [11]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_grid_knn, scoring="f1_weighted") 
random_search.fit(X_train_preprocessed, y_train) 

In [12]:
cv_results = random_search.cv_results_

results_data = {
    'n_neighbors': cv_results['param_n_neighbors'],
    'weights': cv_results['param_weights'],
    'algorithm': cv_results['param_algorithm'],
    'p': cv_results['param_p'],
    'F1_score': cv_results['mean_test_score'],
    'Runtime': cv_results['mean_fit_time'] + cv_results['mean_score_time']
}


results = pd.DataFrame(data=results_data)
results.sort_values(by=['F1_score'], ascending=False, inplace=True)
results.reset_index(drop=True, inplace=True)
results

Unnamed: 0,n_neighbors,weights,algorithm,p,F1_score,Runtime
0,1,uniform,kd_tree,1,0.946949,0.016717
1,5,uniform,auto,2,0.94067,0.028429
2,1,uniform,ball_tree,2,0.939768,0.028522
3,10,distance,kd_tree,2,0.933809,0.011229
4,10,uniform,brute,2,0.933809,0.048381
5,100,distance,auto,1,0.933757,0.087939
6,5,uniform,brute,2,0.933605,0.013115
7,50,uniform,ball_tree,1,0.907951,0.026028
8,50,distance,ball_tree,1,0.907951,0.036733
9,50,distance,kd_tree,1,0.907951,0.009251


In [13]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_preprocessed)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = congressional_voting_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.9543804053607976
f1 score of the best estimator from RandomizedSearchCV: 0.9074675324675324


### 1.3. DecisionTreeClassifier

For parameter tuning of the KNeighborsClassifier we use one of its the best combinations:
- feature_selection: p-value
- scaling: robust

In [14]:
congressional_voting = pd.read_csv("./preprocessed-datasets/CongressionVoting_prepro.csv")
congressional_voting.set_index('ID', inplace=True)
X = congressional_voting.drop("class", axis=1) 
y = congressional_voting['class'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

# Feature selection
X_train = X_train.loc[:, X_train.apply(pd.Series.nunique) != 1]
p_values = []
for feature in X_train.columns:
    f, p = chi2(X_train[[feature]], y_train)
    p_values.append(p)

p_values = np.array(p_values).reshape(-1)
p_values = pd.Series(p_values, index=X_train.columns).sort_values()
selected_features = p_values[p_values < 0.05].index

X_train_preprocessed, X_test_preprocessed = X_train[selected_features], X_test[selected_features]

scaler = RobustScaler().fit(X_train_preprocessed)
X_train_scaled = scaler.transform(X_train_preprocessed)
X_test_scaled = scaler.transform(X_test_preprocessed)

# Define the parameter grid for RandomizedSearchCV
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 10, 20, 30],
    'min_samples_leaf': [1, 5, 10, 50, 100],
}

In [15]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_grid_dt, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

ValueError: Invalid parameter 'Runtime' for estimator DecisionTreeClassifier(criterion='entropy', max_depth=30, min_samples_leaf=50). Valid parameters are: ['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'].

In [None]:
cv_results = random_search.cv_results_

results_data = {
    'criterion': cv_results['param_criterion'],
    'max_depth': cv_results['param_max_depth'],
    'min_samples_leaf': cv_results['param_min_samples_leaf'],
    'F1_score': cv_results['mean_test_score'],
    'Runtime': cv_results['mean_fit_time'] + cv_results['mean_score_time']
}


results = pd.DataFrame(data=results_data)
results.sort_values(by=['F1_score'], ascending=False, inplace=True)
results.reset_index(drop=True, inplace=True)
results

Unnamed: 0,criterion,max_depth,min_samples_leaf,F1_score
0,gini,2,10,0.960873
1,entropy,30,5,0.933517
2,entropy,10,1,0.920801
3,gini,20,1,0.920539
4,gini,10,1,0.913915
5,entropy,20,1,0.907029
6,gini,30,50,0.855615
7,entropy,10,50,0.855615
8,gini,2,100,0.502562
9,entropy,2,100,0.502562


In [None]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

print(best_estimator)

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = congressional_voting_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

DecisionTreeClassifier(max_depth=2, min_samples_leaf=10)
f1 score of the best performing combination without parameter tuning: 0.9543804053607976
f1 score of the best estimator from RandomizedSearchCV: 0.9696969696969697


One of the best combinations (classifier: DecisionTree, feature_selection: p-value, scaling: robust) for the congressional voting dataset had an f1 score = 0.95438. With RandomizedSearchCV we found the best parameter setting which result in a higher f1 score.  

## 2. Dataset Bank Marketing

In [None]:
bank_marketing_results = combined_results[combined_results['dataset'] == 'bank_marketing']

In [None]:
bank_marketing_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
221,bank_marketing,RandomForestClassifier,cv,none,minmax,34,34,0.93919,0.939728,0.93919,0.939171,1.2e-05,6.99521,0.288231,0.001061
220,bank_marketing,RandomForestClassifier,holdout,none,minmax,34,34,0.937389,0.938072,0.937389,0.937368,7e-06,6.563403,0.498113,
233,bank_marketing,RandomForestClassifier,cv,correlation,minmax,34,26,0.935441,0.936028,0.935441,0.935419,0.15921,7.780731,0.274147,0.00123
232,bank_marketing,RandomForestClassifier,holdout,correlation,minmax,34,26,0.933604,0.934403,0.933604,0.933577,0.149638,6.250977,0.42551,
227,bank_marketing,RandomForestClassifier,cv,p-value,minmax,34,30,0.931597,0.931801,0.931597,0.931589,0.731339,9.361306,0.421076,0.000658


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: minmax

Besides, we also apply SMOTE on the bank marketing dataset, just like in the baseline experimentations.

In [None]:
def apply_smote(df, target_column, k_neighbors=4, random_state=321):

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    X_resampled, y_resampled = sm.fit_resample(X, y)

    return X_resampled, y_resampled

In [None]:
# Read the preprocessed dataset
bank_marketing = pd.read_csv("./preprocessed-datasets/bank_marketing_prepro.csv")
target_column = 'class'
X,y = apply_smote(bank_marketing,target_column)
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
} 

In [None]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

In [None]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = bank_marketing_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.9391710746130112
f1 score of the best estimator from RandomizedSearchCV: 0.9349736581403411


The search space explored by RandomizedSearchCV did not include a set of hyperparameters that performed better than the ones in the original best model.

## 3. Dataset Wine Quality

In [None]:
wine_quality_results = combined_results[combined_results['dataset'] == 'wine_quality']

In [None]:
wine_quality_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
23,wine_quality,RandomForestClassifier,cv,none,minmax,13,13,0.891145,0.887836,0.891145,0.888645,0.005564,3.137737,0.063741,0.007228
41,wine_quality,RandomForestClassifier,cv,p-value,robust,13,11,0.89049,0.88729,0.89049,0.88801,0.058313,3.11937,0.066823,0.007146
11,wine_quality,RandomForestClassifier,cv,none,standard,13,13,0.889986,0.886811,0.889986,0.887407,0.006954,3.088176,0.062916,0.006602
5,wine_quality,RandomForestClassifier,cv,none,none,13,13,0.889835,0.886253,0.889835,0.887116,0.000399,3.160678,0.066656,0.006167
35,wine_quality,RandomForestClassifier,cv,p-value,standard,13,11,0.889483,0.886272,0.889483,0.887033,0.047879,3.140681,0.066032,0.006499


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: minmax

Besides, we also apply SMOTE on the wine quality dataset, just like in the baseline experimentations.

In [None]:
# Read the preprocessed dataset
wine_quality = pd.read_csv("./preprocessed-datasets/wine_quality_prepro.csv")
target_column = 'class'
X,y = apply_smote(wine_quality,target_column)
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 200, 500], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [5, 25, 50],
    'min_samples_split': [2, 5, 10]
}

In [None]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 

In [None]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = wine_quality_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.8886451647615082
f1 score of the best estimator from RandomizedSearchCV: 0.8675855470727418


The search space explored by RandomizedSearchCV did not include a set of hyperparameters that performed better than the ones in the original best model.

## 4. Dataset Amazon Reviews

In [None]:
reviews_results = combined_results[combined_results['dataset'] == 'reviews']

Top 5 combinations based on F1 score:

In [None]:
reviews_results.nlargest(5, 'f1')

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
154,reviews,RandomForestClassifier,holdout,none,standard,10000,10000,0.533333,0.633823,0.533333,0.524547,1.145058,1.584533,0.050707,
185,reviews,RandomForestClassifier,cv,p-value,robust,10000,6054,0.526667,0.581471,0.526667,0.511494,73.258305,1.383942,0.028825,0.051475
184,reviews,RandomForestClassifier,holdout,p-value,robust,10000,5797,0.52,0.60466,0.52,0.508507,68.972446,1.184847,0.03258,
191,reviews,RandomForestClassifier,cv,p-value,minmax,10000,6054,0.522667,0.584946,0.522667,0.507046,73.231496,1.372783,0.027929,0.054543
179,reviews,RandomForestClassifier,cv,p-value,standard,10000,6054,0.521333,0.592879,0.521333,0.506898,72.612922,1.398013,0.029681,0.038711


For parameter tuning we use the best combination:
- classifier: RandomForest
- feature_selection: None
- scaling: standard

In [None]:
# Read the preprocessed dataset
reviews = pd.read_csv("./preprocessed-datasets/Review_prepro.csv")
X = reviews.drop("class", axis=1) 
y = reviews['class'] 
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18) 

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomizedSearchCV
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [10, 20, 30], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
} 

In [None]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, scoring="f1_weighted") 
random_search.fit(X_train_scaled, y_train) 



In [None]:
# Get the best estimator from RandomizedSearchCV
best_estimator = random_search.best_estimator_

# Evaluate the best estimator on the test set
y_pred_test = best_estimator.predict(X_test_scaled)
f1_test = metrics.f1_score(y_test, y_pred_test, average='weighted')

largest = reviews_results.nlargest(1, 'f1')
f1_value = largest['f1'].iloc[0]
print(f"f1 score of the best performing combination without parameter tuning: {f1_value}")
print(f"f1 score of the best estimator from RandomizedSearchCV: {f1_test}")

f1 score of the best performing combination without parameter tuning: 0.5245474278807613
f1 score of the best estimator from RandomizedSearchCV: 0.49317114693585284
