# The Role of Hyperparameters in Machine Learning Models and How to Tune Them (PSRM, 2023)
### Christian Arnold, Luka Biedebach, Andreas Küpfer, and Marcel Neunhoeffer

### *Code to replicate the information shown in Tables 2, A4, A5, and A6*

In [6]:
#IMPORTS for Baselines
from functions import load_data, preprocess_data, run_svc, run_dummy, run_randomforest, run_naivebayes
import numpy as np
np.set_printoptions(precision=15)
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer, SpanishStemmer

import nltk
nltk.download('stopwords')

# IMPORTS for CNN
import os
import tensorflow as tf
import keras.backend as K
import keras_tuner

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling1D, GlobalMaxPooling2D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from keras_tuner import HyperModel, Objective
from keras_tuner.tuners import RandomSearch, Hyperband
from keras.models import Sequential, Model
from keras.layers import Dense, concatenate
from keras.wrappers.scikit_learn import KerasClassifier

from gensim import models
import gensim.downloader as api

from functions import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaskuepfer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Naive Bayes, Random Forest, and SVM Models

In [7]:
# Tune, train and evaluate the three baseline classifiers (SVM, Random Forest,
# Naive Bayes) for every country and every seed, once with hyperparameter
# tuning and once with default settings, then write all results to results/.
#
# Rows are collected in plain lists and turned into DataFrames once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.

# set 5 different seeds for reproducibility
seeds = [20210101, 20210102, 20210103, 20210104, 20210105]

# initialize dictionary for countries/datasets
countries = {"Venezuela": "raw/vz-tweets_full.csv", "Ghana": "raw/gh-tweets_full.csv", "Philippines": "raw/ph-tweets_full.csv"}

# column layout shared by all score tables
score_columns = ["Baseline", "Country", "Accuracy", "Precision", "Recall", "F1"]

# One entry per baseline:
#   key -> (label in the result tables, label printed to the console,
#           runner imported from functions,
#           hyperparameter names in the order the runner reports them)
# Dict order fixes the execution order: SVC, Random Forest, Naive Bayes.
baselines = {
    "svc": ("SVM", "SVC", run_svc,
            ["kernel", "C", "class_weight", "gamma"]),
    "randomforest": ("Random Forest", "Random Forest", run_randomforest,
                     ["max_depth", "n_estimators", "class_weight", "max_features"]),
    "naivebayes": ("Naive Bayes", "Naives Bayes", run_naivebayes,
                   ["alpha"]),
}

# row buffers, one list of dicts per baseline
summary_rows = {key: [] for key in baselines}
full_rows = {key: [] for key in baselines}
tuning_rows = {key: [] for key in baselines}

# loop over all countries
for country, path in countries.items():
    print("\nCurrent Country: " + country)
    # initialize stopwords and stemmer in correct language
    stops = set(stopwords.words("spanish")) if country == "Venezuela" else set(stopwords.words("english"))
    stemmer = SpanishStemmer() if country == "Venezuela" else EnglishStemmer()

    # per-country score lists (one entry per seed)
    tuned_scores = {key: [] for key in baselines}
    untuned_scores = {key: [] for key in baselines}

    # preprocess the data
    data = preprocess_data(path, stops, stemmer)

    # loop over seeds, load data and tune/train baseline models
    for seed in seeds:
        X_train_tfidf, X_test_tfidf, y_train, y_test = load_data(data, seed)

        for key, (table_label, console_label, runner, param_names) in baselines.items():
            # Tuned: the runner returns the test scores plus the tuning
            # outcome (best hyperparameter values followed by the tuning F1).
            print(console_label + "...")
            result_scores, best_tuning = runner(X_train_tfidf, X_test_tfidf, y_train, y_test)
            tuned_scores[key].append(result_scores)
            tuning_row = {"Country": country}
            tuning_row.update(zip(param_names, best_tuning))
            tuning_row["Tuning F1"] = best_tuning[len(param_names)]
            tuning_row["OOS F1"] = result_scores[3]
            tuning_rows[key].append(tuning_row)
            print(result_scores)

            # Untuned: with tune=False the runner returns only the scores.
            print(console_label + " Untuned...")
            result_scores = runner(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
            untuned_scores[key].append(result_scores)
            print(result_scores)

    # one summary row and one "full" row per baseline variant for this country
    for key, (table_label, console_label, runner, param_names) in baselines.items():
        variants = ((table_label, tuned_scores[key]),
                    (table_label + " Untuned", untuned_scores[key]))
        for baseline_label, scores in variants:
            # NOTE(review): as in the original code, the summary uses the
            # scores of the FIRST seed only (not the mean over seeds) --
            # confirm this is what the published tables report.
            first_run = np.array(scores)[0]
            summary_rows[key].append({"Baseline": baseline_label, "Country": country,
                                      "Accuracy": first_run[-4],
                                      "Precision": first_run[-3],
                                      "Recall": first_run[-2],
                                      "F1": first_run[-1]})
            # NOTE(review): every metric column of the "full" tables holds the
            # same complete list of per-seed score vectors; kept unchanged so
            # the written CSV files keep their format.
            full_rows[key].append({"Baseline": baseline_label, "Country": country,
                                   "Accuracy": scores,
                                   "Precision": scores,
                                   "Recall": scores,
                                   "F1": scores})

# define dataframes to store results
results_svc = pd.DataFrame(summary_rows["svc"], columns=score_columns)
results_svc_full = pd.DataFrame(full_rows["svc"], columns=score_columns)

results_randomforest = pd.DataFrame(summary_rows["randomforest"], columns=score_columns)
results_randomforest_full = pd.DataFrame(full_rows["randomforest"], columns=score_columns)

results_naivebayes = pd.DataFrame(summary_rows["naivebayes"], columns=score_columns)
results_naivebayes_full = pd.DataFrame(full_rows["naivebayes"], columns=score_columns)

results_tuning_svc = pd.DataFrame(tuning_rows["svc"], columns=["Country", "kernel", "C", "class_weight", "gamma", "Tuning F1", "OOS F1"])
results_tuning_randomforest = pd.DataFrame(tuning_rows["randomforest"], columns=["Country", "max_depth", "n_estimators", "class_weight", "max_features", "Tuning F1", "OOS F1"])
results_tuning_naivebayes = pd.DataFrame(tuning_rows["naivebayes"], columns=["Country", "alpha", "Tuning F1", "OOS F1"])

# make sure the output directory exists before writing
os.makedirs("results", exist_ok=True)

## Store detailed result scores
print(results_svc)
print(results_randomforest)
print(results_naivebayes)
results_svc.round(3).to_csv("results/svm_results.csv", index=False)
results_randomforest.round(3).to_csv("results/randomforest_results.csv", index=False)
results_naivebayes.round(3).to_csv("results/naivebayes_results.csv", index=False)

results_svc_full.round(3).to_csv("results/svm_results_full.csv", index=False)
results_randomforest_full.round(3).to_csv("results/randomforest_results_full.csv", index=False)
results_naivebayes_full.round(3).to_csv("results/naivebayes_results_full.csv", index=False)

# Store best hyperparameter combinations (5 tuning runs) for each country
print(results_tuning_svc)
print(results_tuning_randomforest)
print(results_tuning_naivebayes)
results_tuning_svc.round(3).to_csv("results/svm_results_hyperparameter.csv", index=False)
results_tuning_randomforest.round(3).to_csv("results/randomforest_results_hyperparameter.csv", index=False)
results_tuning_naivebayes.round(3).to_csv("results/naivebayes_results_hyperparameter.csv", index=False)


Current Country: Venezuela
SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.5377622377622379 with params {'svc__C': 1.0, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9191090269636577, 0.36585365853658536, 0.6382978723404256, 0.46511627906976744]
SVC Untuned...
[0.9460726846424384, 0.6666666666666666, 0.0425531914893617, 0.08]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.4862738813474108 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 1000, 'randomforest__random_state': 20211010}
[0.9413833528722158, 0.46938775510204084, 0.48936170212765956, 0.47916666666666663]
Random Forest Untuned...
[0.9472450175849941, 0.5833333333333334, 0.14893617021276595, 0.23728813559322032]
Naives Bayes...
Fitting 5 folds for each of 100 can

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.5411752556444264 with params {'svc__C': 403.4287934927351, 'svc__class_weight': 'balanced', 'svc__gamma': 0.0001, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9273153575615475, 0.38461538461538464, 0.5319148936170213, 0.44642857142857145]
SVC Untuned...
[0.9472450175849941, 0.6, 0.1276595744680851, 0.21052631578947367]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.5051955826676062 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 150, 'randomforest__random_state': 20211010}
[0.9109026963657679, 0.2542372881355932, 0.3191489361702128, 0.28301886792452835]
Random Forest Untuned...
[0.9472450175849941, 0.5714285714285714, 0.1702127659574468, 0.2622950819672131]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.3212246301177243 with params {'naive

  _warn_prf(average, modifier, msg_start, len(result))


SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.5583062790591106 with params {'svc__C': 1.0, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9320046893317703, 0.42028985507246375, 0.6170212765957447, 0.5]
SVC Untuned...
[0.9460726846424384, 0.6, 0.06382978723404255, 0.11538461538461536]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.46898123119683577 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 400, 'randomforest__random_state': 20211010}
[0.9472450175849941, 0.5217391304347826, 0.5106382978723404, 0.5161290322580645]
Random Forest Untuned...
[0.9472450175849941, 0.5833333333333334, 0.14893617021276595, 0.23728813559322032]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score 

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.4989018138177184 with params {'svc__C': 148.4131591025766, 'svc__class_weight': 'balanced', 'svc__gamma': 'auto', 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9296600234466589, 0.41975308641975306, 0.723404255319149, 0.53125]
SVC Untuned...
[0.9531066822977726, 0.8181818181818182, 0.19148936170212766, 0.31034482758620685]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.4859400053652738 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 400, 'randomforest__random_state': 20211010}
[0.936694021101993, 0.4406779661016949, 0.5531914893617021, 0.49056603773584906]
Random Forest Untuned...
[0.9507620164126612, 0.6923076923076923, 0.19148936170212766, 0.30000000000000004]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.29013453392774535 with params {'

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.5268164678152867 with params {'svc__C': 54.598150033144236, 'svc__class_weight': 'balanced', 'svc__gamma': 0.001, 'svc__kernel': 'sigmoid', 'svc__random_state': 20211010}
[0.9378663540445487, 0.45714285714285713, 0.6808510638297872, 0.547008547008547]
SVC Untuned...
[0.9507620164126612, 1.0, 0.10638297872340426, 0.1923076923076923]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.4797324361131138 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 200, 'randomforest__random_state': 20211010}
[0.9320046893317703, 0.39622641509433965, 0.44680851063829785, 0.42000000000000004]
Random Forest Untuned...
[0.9531066822977726, 0.8888888888888888, 0.1702127659574468, 0.2857142857142857]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.3397329168721574 with params {'n

  _warn_prf(average, modifier, msg_start, len(result))


SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.674169622806332 with params {'svc__C': 20.085536923187668, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9645669291338582, 0.7058823529411765, 0.75, 0.7272727272727272]
SVC Untuned...
[0.9488188976377953, 0.8, 0.25, 0.38095238095238093]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.5991022914367862 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 100, 'randomforest__random_state': 20211010}
[0.9507874015748031, 0.6129032258064516, 0.59375, 0.6031746031746031]
Random Forest Untuned...
[0.9468503937007874, 0.7777777777777778, 0.21875, 0.34146341463414637]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.5118148872990834 wi

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.6661288515406163 with params {'svc__C': 2980.9579870417283, 'svc__class_weight': 'balanced', 'svc__gamma': 0.0001, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9468503937007874, 0.5714285714285714, 0.625, 0.5970149253731343]
SVC Untuned...
[0.9547244094488189, 0.8461538461538461, 0.34375, 0.4888888888888889]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.5919452225334578 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 200, 'randomforest__random_state': 20211010}
[0.9251968503937008, 0.425, 0.53125, 0.47222222222222215]
Random Forest Untuned...
[0.952755905511811, 0.7857142857142857, 0.34375, 0.4782608695652174]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.4574571811167556 with params {'naivebayes__alpha': 8.111308307896872e-07}
[0.956692

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.6566268242940857 with params {'svc__C': 2.718281828459045, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1, 'svc__kernel': 'sigmoid', 'svc__random_state': 20211010}
[0.9330708661417323, 0.4807692307692308, 0.78125, 0.5952380952380952]
SVC Untuned...
[0.9409448818897638, 0.5833333333333334, 0.21875, 0.31818181818181823]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.610888455482305 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 150, 'randomforest__random_state': 20211010}
[0.9389763779527559, 0.5135135135135135, 0.59375, 0.5507246376811593]
Random Forest Untuned...
[0.9330708661417323, 0.4, 0.125, 0.19047619047619047]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.45180690540538393 with params {'naivebayes__alpha': 5.336699231206313e-06}
[0.938976

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.6711242142276626 with params {'svc__C': 148.4131591025766, 'svc__class_weight': 'balanced', 'svc__gamma': 0.001, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9291338582677166, 0.46, 0.71875, 0.5609756097560976]
SVC Untuned...
[0.9606299212598425, 0.9285714285714286, 0.40625, 0.5652173913043478]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.5812121212121213 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 150, 'randomforest__random_state': 20211010}
[0.9251968503937008, 0.4318181818181818, 0.59375, 0.5]
Random Forest Untuned...
[0.9566929133858267, 0.9166666666666666, 0.34375, 0.5]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.44381353299699 with params {'naivebayes__alpha': 3.5111917342151275e-06}
[0.9586614173228346, 0.72, 0.5625, 0.6315

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.683830113241878 with params {'svc__C': 20.085536923187668, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'sigmoid', 'svc__random_state': 20211010}
[0.9468503937007874, 0.5581395348837209, 0.75, 0.6399999999999999]
SVC Untuned...
[0.9468503937007874, 0.7272727272727273, 0.25, 0.37209302325581395]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.5973762010347375 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 400, 'randomforest__random_state': 20211010}
[0.9409448818897638, 0.5294117647058824, 0.5625, 0.5454545454545455]
Random Forest Untuned...
[0.9429133858267716, 0.6363636363636364, 0.21875, 0.3255813953488372]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.4561482540483466 with params {'naivebayes__alpha': 6.579332246575682e-08}

  _warn_prf(average, modifier, msg_start, len(result))


SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.5206484731931174 with params {'svc__C': 2980.9579870417283, 'svc__class_weight': 'balanced', 'svc__gamma': 0.0001, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.948559670781893, 0.47058823529411764, 0.6956521739130435, 0.5614035087719297]
SVC Untuned...
[0.9629629629629629, 1.0, 0.21739130434782608, 0.3571428571428571]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.46231702510772277 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 1, 'randomforest__max_features': 'log2', 'randomforest__n_estimators': 400, 'randomforest__random_state': 20211010}
[0.9567901234567902, 1.0, 0.08695652173913043, 0.16]
Random Forest Untuned...
[0.9629629629629629, 0.8571428571428571, 0.2608695652173913, 0.4]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.481555533

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.5513322884012538 with params {'svc__C': 148.4131591025766, 'svc__class_weight': None, 'svc__gamma': 0.01, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9609053497942387, 0.7, 0.30434782608695654, 0.42424242424242425]
SVC Untuned...
[0.9567901234567902, 0.75, 0.13043478260869565, 0.22222222222222218]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.4721958501268846 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 1000, 'randomforest__random_state': 20211010}
[0.9423868312757202, 0.4, 0.43478260869565216, 0.41666666666666663]
Random Forest Untuned...
[0.9526748971193416, 0.5, 0.13043478260869565, 0.20689655172413793]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.4487388916930513 with params {'naivebayes__alpha': 1e-08}
[0.9547325102880658, 0.53

  _warn_prf(average, modifier, msg_start, len(result))


Best Tuning Score is 0.5495726495726496 with params {'svc__C': 20.085536923187668, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__random_state': 20211010}
[0.9567901234567902, 0.55, 0.4782608695652174, 0.5116279069767442]
SVC Untuned...
[0.9588477366255144, 0.8, 0.17391304347826086, 0.2857142857142857]
Random Forest...
Fitting 5 folds for each of 660 candidates, totalling 3300 fits
Best Tuning Score is 0.46595481227834173 with params {'randomforest__class_weight': 'balanced', 'randomforest__max_depth': 5, 'randomforest__max_features': 'sqrt', 'randomforest__n_estimators': 100, 'randomforest__random_state': 20211010}
[0.9444444444444444, 0.4, 0.34782608695652173, 0.37209302325581395]
Random Forest Untuned...
[0.9650205761316872, 0.875, 0.30434782608695654, 0.4516129032258065]
Naives Bayes...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Tuning Score is 0.4617736365801381 with params {'naivebayes__alpha': 1.2328467394420635e-09}
[0.9

### CNN Model

In [None]:
# Tune, train and evaluate the CNN for every country and every seed, write an
# intermediate CSV after each run, and finally store all runs plus their
# per-model mean / standard deviation.

# hide all CUDA devices from this process
os.environ["CUDA_VISIBLE_DEVICES"] = ""

#load pre-trained model for correct language
w2v_es = models.KeyedVectors.load_word2vec_format('sbw_vectors.bin', binary=True)
w2v_en = api.load("word2vec-google-news-300")

# initialize dictionary for countries/datasets
countries = {"Ghana": "raw/gh-tweets.csv", "Philippines": "raw/ph-tweets.csv", "Venezuela": "raw/vz-tweets.csv"}

#run several times with different param settings and seeds
seeds = [20210101, 20210102, 20210103]

#initialize result objects
result_columns = ["parameters", "model", "accuracy", "precision", "recall", "f1"]
results = []

#loop over all countries
for country, path in countries.items():

    print("\nCurrent Country: " + country)

    # initialize stopwords and stemmer in correct language
    stops = set(stopwords.words("spanish")) if country == "Venezuela" else set(stopwords.words("english"))
    stemmer = SpanishStemmer() if country == "Venezuela" else EnglishStemmer()

    # pick the pre-trained embeddings for the correct language
    # (both models are loaded once, before the loop)
    w2v = w2v_es if country == "Venezuela" else w2v_en

    #preprocess the data
    data = preprocess_data(path, stops, stemmer)

    words = list(w2v.index_to_key)

    # i is the 1-based run number within the current country
    for i, seed in enumerate(seeds, start=1):

        print("Run {i}/{n}".format(i=i, n=len(seeds)))

        #iterate through different random train test splits to capture model variation
        X_train_vec, X_train_tfidf, \
        X_test_vec, X_test_tfidf, \
        y_train_vec, y_test = embedding_transform(data, w2v, words, seed)

        tuner = tune_model_cv(X_train_vec, y_train_vec, model=country + str(seed), runs=2, epochs=200)

        #build model with best params
        best_hp = tuner.get_best_hyperparameters()[0]
        model = tuner.hypermodel.build(best_hp)

        # set class weight: each class is weighted by the share of the OTHER
        # class, so the class with fewer instances gets the larger weight
        ratio_1 = 1.0 - len(y_train_vec[y_train_vec == 1]) / float(len(y_train_vec))  # share of instances not labelled 1
        ratio_0 = 1.0 - ratio_1  # share of instances labelled 1
        class_weight = {0: ratio_0, 1: ratio_1}

        #fit model (the same embedded input is fed to all three model inputs)
        model.fit([X_train_vec, X_train_vec, X_train_vec], y_train_vec, epochs=200, batch_size=64, class_weight=class_weight)

        #classify sequences
        y_pred = model.predict([X_test_vec, X_test_vec, X_test_vec])
        y_pred = (y_pred > 0.5)

        stats = print_stats(y_test, y_pred, model="{c}_CNN_{p}".format(p="tuned params", c=country))
        # The original `best_hp + [...]` tried to add a HyperParameters object
        # to a list, which raises TypeError. Store the hyperparameter values
        # (a dict) followed by the statistics instead.
        # NOTE(review): assumes print_stats returns
        # [model, accuracy, precision, recall, f1], matching result_columns --
        # confirm against functions.py.
        results.append([best_hp.values, *stats])
        pd.DataFrame(results, columns=result_columns).to_csv("temp_results_{c}_run_{p}.csv".format(p=i, c=country))

        print(results)

#combine all results and calculate summary statistics
results_df = pd.DataFrame(results, columns=result_columns)
# write the raw per-run results (the original referenced cnn_results here,
# before it was defined, which raises NameError)
results_df.to_csv("results.csv")
# aggregate the numeric metric columns only; "parameters" holds dicts
cnn_results = results_df.groupby("model")[["accuracy", "precision", "recall", "f1"]].agg(["mean", "std"])
cnn_results.to_csv("final_results.csv")

print(cnn_results)