# The Role of Hyperparameters in Machine Learning Models and How to Tune Them (PSRM, 2023)
### Christian Arnold, Luka Biedebach, Andreas Küpfer, and Marcel Neunhoeffer

### *Code to replicate the results reported in Tables 2, A4, A5, A6, and A7*

In [2]:
#IMPORTS for Baselines
import numpy as np
np.set_printoptions(precision=15)
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer, SpanishStemmer

import nltk
nltk.download('stopwords')

# IMPORTS for CNN
import os
import tensorflow as tf
import keras.backend as K
import keras_tuner

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling1D, GlobalMaxPooling2D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from keras_tuner import HyperModel, Objective
from keras_tuner.tuners import RandomSearch, Hyperband
from keras.models import Sequential, Model
from keras.layers import Dense, concatenate
from keras.wrappers.scikit_learn import KerasClassifier

from gensim import models
import gensim.downloader as api

from utils.functions import *

# Set to True to tune and train the baseline models again; when False, the
# baseline cell below loads previously saved results from results/*.csv instead
rerun = True

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaskuepfer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: No module named 'functions'

### Naive Bayes, Random Forest, and SVM Models

In [None]:
# Tell the user which mode this cell runs in.
if rerun:
    print("Rerun flag true: tuning in progress...")
else:
    print("Rerun flag false: results loaded from files...")

# Five fixed seeds, one per train/test split, for reproducibility.
seeds = [20210101, 20210102, 20210103, 20210104, 20210105]

# Country name -> path of the corresponding tweet dataset.
countries = {
    "Venezuela": "data/vz-tweets_full.csv",
    "Ghana": "data/gh-tweets_full.csv",
    "Philippines": "data/ph-tweets_full.csv",
}

# All score tables (summary and "full" variants) share the same columns.
_SCORE_COLUMNS = ["Baseline", "Country", "Accuracy", "Precision", "Recall", "F1"]

# Empty frames that receive the out-of-sample scores of each baseline.
results_svc = pd.DataFrame(columns=_SCORE_COLUMNS)
results_svc_full = pd.DataFrame(columns=_SCORE_COLUMNS)

results_randomforest = pd.DataFrame(columns=_SCORE_COLUMNS)
results_randomforest_full = pd.DataFrame(columns=_SCORE_COLUMNS)

results_naivebayes = pd.DataFrame(columns=_SCORE_COLUMNS)
results_naivebayes_full = pd.DataFrame(columns=_SCORE_COLUMNS)

# Empty frames that receive the selected hyperparameters of every tuning run.
results_tuning_svc = pd.DataFrame(
    columns=["Country", "kernel", "C", "class_weight", "gamma", "Tuning F1", "OOS F1"])
results_tuning_randomforest = pd.DataFrame(
    columns=["Country", "max_depth", "n_estimators", "class_weight", "max_features", "Tuning F1", "OOS F1"])
results_tuning_naivebayes = pd.DataFrame(
    columns=["Country", "alpha", "Tuning F1", "OOS F1"])

# Row accumulators for the result tables. DataFrame.append was removed in
# pandas 2.0, so rows are collected in plain lists and the DataFrames are
# built once after the loop (which also avoids re-copying the frame per row).
rows_svc, rows_svc_full = [], []
rows_randomforest, rows_randomforest_full = [], []
rows_naivebayes, rows_naivebayes_full = [], []
rows_tuning_svc, rows_tuning_randomforest, rows_tuning_naivebayes = [], [], []


def _summary_row(baseline, country, scores):
    """Return one summary-table row for a model.

    `scores` is the list of per-seed score sequences collected for the model;
    the last four entries of each sequence are read as accuracy, precision,
    recall and F1 (in that order).
    """
    table = np.array(scores)
    # NOTE(review): `[0]` reports the first seed's value only, not an
    # aggregate over seeds -- kept exactly as before; confirm this is intended.
    return {"Baseline": baseline, "Country": country,
            "Accuracy": table[:, -4][0],
            "Precision": table[:, -3][0],
            "Recall": table[:, -2][0],
            "F1": table[:, -1][0]}


def _full_row(baseline, country, scores):
    """Return one row for the *_full tables holding the raw per-seed scores."""
    # NOTE(review): every metric column receives the complete score list
    # (all metrics, all seeds), as before -- confirm whether per-metric
    # slices were intended.
    return {"Baseline": baseline, "Country": country,
            "Accuracy": scores, "Precision": scores,
            "Recall": scores, "F1": scores}


def _tuning_row(country, names, tuning, scores):
    """Return one hyperparameter-table row for a single tuning run.

    `tuning` holds the selected value of each hyperparameter in `names`
    (same order) followed by the tuning F1; `scores[3]` is stored as the
    out-of-sample F1.
    """
    row = {"Country": country}
    row.update(zip(names, tuning))
    row["Tuning F1"] = tuning[len(names)]
    row["OOS F1"] = scores[3]
    return row


# loop over all countries
for country, path in countries.items():
    print("\nCurrent Country: " + country)
    # initialize stopwords and stemmer in correct language
    stops = set(stopwords.words("spanish")) if country == "Venezuela" else set(stopwords.words("english"))
    stemmer = SpanishStemmer() if country == "Venezuela" else EnglishStemmer()

    # per-country score lists, one entry per seed
    results_scores_current_svc = []
    results_scores_current_randomforest = []
    results_scores_current_naivebayes = []
    results_scores_current_untuned_svc = []
    results_scores_current_untuned_randomforest = []
    results_scores_current_untuned_naivebayes = []

    results_tuning_current_svc = []
    results_tuning_current_randomforest = []
    results_tuning_current_naivebayes = []

    # preprocess the data
    if rerun:
        data = preprocess_data(path, stops, stemmer)

    # loop over seeds, load data and tune/train baseline models
    for seed in seeds:
        if rerun:
            X_train_tfidf, X_test_tfidf, y_train, y_test = load_data(data, seed)

        # SVC Tuned
        print("SVC...")
        if rerun:
            result_scores, results_tuning_current = run_svc(X_train_tfidf, X_test_tfidf, y_train, y_test)
            results_scores_current_svc.append(result_scores)
            rows_tuning_svc.append(_tuning_row(
                country, ["kernel", "C", "class_weight", "gamma"],
                results_tuning_current, result_scores))
            print(result_scores)

        # SVC Untuned
        print("SVC Default...")
        if rerun:
            result_scores = run_svc(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
            results_scores_current_untuned_svc.append(result_scores)
            print(result_scores)

        # Random Forest Tuned
        print("Random Forest...")
        if rerun:
            result_scores, results_tuning_current = run_randomforest(X_train_tfidf, X_test_tfidf, y_train, y_test)
            results_scores_current_randomforest.append(result_scores)
            rows_tuning_randomforest.append(_tuning_row(
                country, ["max_depth", "n_estimators", "class_weight", "max_features"],
                results_tuning_current, result_scores))
            print(result_scores)

        # Random Forest Untuned
        print("Random Forest Default...")
        if rerun:
            result_scores = run_randomforest(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
            results_scores_current_untuned_randomforest.append(result_scores)
            print(result_scores)

        # Naive Bayes Tuned
        print("Naives Bayes...")
        if rerun:
            result_scores, results_tuning_current = run_naivebayes(X_train_tfidf, X_test_tfidf, y_train, y_test)
            results_scores_current_naivebayes.append(result_scores)
            rows_tuning_naivebayes.append(_tuning_row(
                country, ["alpha"], results_tuning_current, result_scores))
            print(result_scores)

        # Naive Bayes Untuned
        print("Naives Bayes Default...")
        if rerun:
            result_scores = run_naivebayes(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
            results_scores_current_untuned_naivebayes.append(result_scores)
            print(result_scores)

    if rerun:
        # SVC: tuned row first, then the default (untuned) row
        rows_svc.append(_summary_row("SVM Tuned", country, results_scores_current_svc))
        rows_svc_full.append(_full_row("SVM", country, results_scores_current_svc))
        rows_svc.append(_summary_row("SVM Default", country, results_scores_current_untuned_svc))
        rows_svc_full.append(_full_row("SVM Untuned", country, results_scores_current_untuned_svc))

        # Random Forest: tuned row first, then the default (untuned) row
        rows_randomforest.append(
            _summary_row("Random Forest Tuned", country, results_scores_current_randomforest))
        rows_randomforest_full.append(
            _full_row("Random Forest", country, results_scores_current_randomforest))
        rows_randomforest.append(
            _summary_row("Random Forest Default", country, results_scores_current_untuned_randomforest))
        rows_randomforest_full.append(
            _full_row("Random Forest Untuned", country, results_scores_current_untuned_randomforest))

        # Naive Bayes: tuned row first, then the default (untuned) row
        rows_naivebayes.append(
            _summary_row("Naive Bayes Tuned", country, results_scores_current_naivebayes))
        rows_naivebayes_full.append(
            _full_row("Naive Bayes", country, results_scores_current_naivebayes))
        rows_naivebayes.append(
            _summary_row("Naive Bayes Default", country, results_scores_current_untuned_naivebayes))
        # Labelled "Untuned" like the SVM and Random Forest rows; the label used
        # to be "Naive Bayes", which made it indistinguishable from the tuned row.
        rows_naivebayes_full.append(
            _full_row("Naive Bayes Untuned", country, results_scores_current_untuned_naivebayes))

if rerun:
    # Materialise the collected rows, keeping the column layout of the
    # (empty) result frames defined before the loop.
    results_svc = pd.DataFrame(rows_svc, columns=results_svc.columns)
    results_svc_full = pd.DataFrame(rows_svc_full, columns=results_svc_full.columns)
    results_randomforest = pd.DataFrame(rows_randomforest, columns=results_randomforest.columns)
    results_randomforest_full = pd.DataFrame(
        rows_randomforest_full, columns=results_randomforest_full.columns)
    results_naivebayes = pd.DataFrame(rows_naivebayes, columns=results_naivebayes.columns)
    results_naivebayes_full = pd.DataFrame(
        rows_naivebayes_full, columns=results_naivebayes_full.columns)
    results_tuning_svc = pd.DataFrame(rows_tuning_svc, columns=results_tuning_svc.columns)
    results_tuning_randomforest = pd.DataFrame(
        rows_tuning_randomforest, columns=results_tuning_randomforest.columns)
    results_tuning_naivebayes = pd.DataFrame(
        rows_tuning_naivebayes, columns=results_tuning_naivebayes.columns)

# Without a rerun, the summary scores come from the files of an earlier run.
if not rerun:
    results_svc, results_randomforest, results_naivebayes = (
        pd.read_csv(csv_path)
        for csv_path in ("results/svm_results.csv",
                         "results/randomforest_results.csv",
                         "results/naivebayes_results.csv")
    )

## Print and store detailed result scores
print("--- Begin Table 2 ---")
for summary in (results_naivebayes, results_randomforest, results_svc):
    print(summary[["Baseline", "Country", "F1"]].round(3))
print("--- End Table 2 ---")
for summary, csv_path in ((results_svc, "results/svm_results.csv"),
                          (results_randomforest, "results/randomforest_results.csv"),
                          (results_naivebayes, "results/naivebayes_results.csv")):
    summary.round(3).to_csv(csv_path, index=False)

# The per-seed "full" tables only exist when the models were actually run.
if rerun:
    for full_scores, csv_path in ((results_svc_full, "results/svm_results_full.csv"),
                                  (results_randomforest_full, "results/randomforest_results_full.csv"),
                                  (results_naivebayes_full, "results/naivebayes_results_full.csv")):
        full_scores.round(3).to_csv(csv_path, index=False)
else:
    # Without a rerun, the hyperparameter tables are read from file as well.
    results_tuning_svc, results_tuning_randomforest, results_tuning_naivebayes = (
        pd.read_csv(csv_path)
        for csv_path in ("results/svm_results_hyperparameter.csv",
                         "results/randomforest_results_hyperparameter.csv",
                         "results/naivebayes_results_hyperparameter.csv")
    )

# Print and store best hyperparameter combinations (5 tuning runs) for each country
for table_id, tuning_table in (("A4", results_tuning_naivebayes),
                               ("A5", results_tuning_svc),
                               ("A6", results_tuning_randomforest)):
    print("--- Begin Table " + table_id + " ---")
    print(tuning_table.round(3))
    print("--- End Table " + table_id + " ---")

for tuning_table, csv_path in ((results_tuning_svc, "results/svm_results_hyperparameter.csv"),
                               (results_tuning_randomforest, "results/randomforest_results_hyperparameter.csv"),
                               (results_tuning_naivebayes, "results/naivebayes_results_hyperparameter.csv")):
    tuning_table.round(3).to_csv(csv_path, index=False)


### CNN Model

In [None]:
# An empty CUDA_VISIBLE_DEVICES hides all GPUs from CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# load the pre-trained word2vec embeddings (Spanish and English)
w2v_es = models.KeyedVectors.load_word2vec_format('sbw_vectors.bin', binary=True)
w2v_en = api.load("word2vec-google-news-300")

# initialize dictionary for countries/datasets
countries = {"Ghana": "raw/gh-tweets.csv", "Philippines": "raw/ph-tweets.csv", "Venezuela": "raw/vz-tweets.csv"}

# run several times with different param settings and seeds
seeds = [20210101, 20210102, 20210103]

# initialize result objects
result_columns = ["parameters", "model", "accuracy", "precision", "recall", "f1"]
results_df = pd.DataFrame()
results = []

# loop over all countries
for country, path in countries.items():

    print("\nCurrent Country: " + country)

    # initialize stopwords and stemmer in correct language
    stops = set(stopwords.words("spanish")) if country == "Venezuela" else set(stopwords.words("english"))
    stemmer = SpanishStemmer() if country == "Venezuela" else EnglishStemmer()
    results_current = []

    # pick the pre-trained embeddings for the country's language
    # (TODO from the original authors: move this out of the loop)
    w2v = w2v_es if country == "Venezuela" else w2v_en

    # preprocess the data
    data = preprocess_data(path, stops, stemmer)

    words = list(w2v.index_to_key)
    vocab_len = len(w2v)

    for i, seed in enumerate(seeds, start=1):

        print("Run {i}/{n}".format(i=i, n=len(seeds)))

        # iterate through different random train test splits to capture model variation
        X_train_vec, X_train_tfidf, \
        X_test_vec, X_test_tfidf, \
        y_train_vec, y_test = embedding_transform(data, w2v, words, seed)

        tuner = tune_model_cv(X_train_vec, y_train_vec, model=country + str(seed), runs=2, epochs=200)

        # build model with best params
        best_hp = tuner.get_best_hyperparameters()[0]
        model = tuner.hypermodel.build(best_hp)

        # set class weight: each class is weighted by the relative frequency
        # of the other class, so the rarer class receives the larger weight
        ratio_1 = 1.0 - len(y_train_vec[y_train_vec == 1]) / float(len(y_train_vec))
        ratio_0 = 1.0 - ratio_1
        class_weight = {0: ratio_0, 1: ratio_1}

        # fit model (the same input is passed three times, once per model input)
        model.fit([X_train_vec, X_train_vec, X_train_vec], y_train_vec, epochs=200, batch_size=64, class_weight=class_weight)

        # classify sequences; predictions above 0.5 count as the positive class
        y_pred = model.predict([X_test_vec, X_test_vec, X_test_vec])
        y_pred = (y_pred > 0.5)

        stats = print_stats(y_test, y_pred, model="{c}_CNN_{p}".format(p="tuned params", c=country))
        # A HyperParameters object cannot be concatenated with a list, so the
        # chosen values are stored as a dict in the "parameters" column.
        # NOTE(review): assumes print_stats returns the five values
        # [model, accuracy, precision, recall, f1] -- confirm in utils.functions.
        results.append([best_hp.values] + list(stats))
        # checkpoint everything collected so far after each run
        pd.DataFrame(results, columns=result_columns).to_csv("temp_results_{c}_run_{p}.csv".format(p=i, c=country))

        print(results)

# combine all results and calculate summary statistics
results_df = pd.DataFrame(results, columns=result_columns)
# raw per-run results (previously written from `cnn_results` before it existed)
results_df.to_csv("results.csv")
# aggregate only the numeric metric columns; "parameters" holds dicts
cnn_results = results_df.groupby("model")[["accuracy", "precision", "recall", "f1"]].agg(["mean", "std"])
cnn_results.to_csv("final_results.csv")

print(cnn_results)