In [1]:
from functions import load_data, load_data_leakage, preprocess_data, run_svc, run_dummy, run_randomforest, run_naivebayes, run_svc_leakage
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer, SpanishStemmer

import nltk
nltk.download('stopwords')

# set 5 different seeds for reproducibility
seeds = [20210101, 20210102, 20210103, 20210104, 20210105]

# initialize dictionary for countries/datasets
countries = {"Venezuela": "raw/vz-tweets 2.csv", "Ghana": "raw/gh-tweets 2.csv", "Philippines": "raw/ph-tweets 2.csv"}

# define dataframe to store results
results_dummy = pd.DataFrame(
    columns=["Baseline", "Country", "Accuracy", "Accuracy Std. Dev.", "Precision", "Precision Std. Dev.", "Recall",
             "Recall Std. Dev.", "F1", "F1 Std. Dev."])
results_svc= pd.DataFrame(
    columns=["Baseline", "Country", "Accuracy", "Accuracy Std. Dev.", "Precision", "Precision Std. Dev.", "Recall",
             "Recall Std. Dev.", "F1", "F1 Std. Dev."])
results_randomforest = pd.DataFrame(
    columns=["Baseline", "Country", "Accuracy", "Accuracy Std. Dev.", "Precision", "Precision Std. Dev.", "Recall",
             "Recall Std. Dev.", "F1", "F1 Std. Dev."])
results_naivebayes = pd.DataFrame(
    columns=["Baseline", "Country", "Accuracy", "Accuracy Std. Dev.", "Precision", "Precision Std. Dev.", "Recall",
             "Recall Std. Dev.", "F1", "F1 Std. Dev."])
results_tuning_svc = pd.DataFrame(columns=["Country", "kernel", "C", "class_weight", "gamma", "Tuning F1"])
results_tuning_randomforest = pd.DataFrame(columns=["Country", "max_depth", "n_estimators", "class_weight", "max_features", "Tuning F1"])
results_tuning_naivebayes = pd.DataFrame(columns=["Country", "alpha"])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaskuepfer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# loop over all countries

for country, path in countries.items():
    print("\nCurrent Country: " + country)
    # initialize stopwords and stemmer in correct language
    stops = set(stopwords.words("spanish")) if country == "Venezuela" else set(stopwords.words("english"))
    stemmer = SpanishStemmer() if country == "Venezuela" else EnglishStemmer()
    
    results_scores_current_dummy = []
    results_scores_current_svc = []
    results_scores_current_randomforest = []
    results_scores_current_naivebayes = []
    results_scores_current_untuned_svc = []
    results_scores_current_untuned_randomforest = []
    results_scores_current_untuned_naivebayes = []
    
    results_tuning_current_svc = []
    results_tuning_current_randomforest = []
    results_tuning_current_naivebayes = []
    
    # preprocess the data
    data = preprocess_data(path, stops, stemmer)

    # loop over seeds, load data and tune/train baseline models
    for seed in seeds:
        X_train_tfidf, X_test_tfidf, y_train, y_test = load_data_leakage(data, seed)
        
        # Dummy Classifier
        #print("Dummy...")
        #result_scores = run_dummy(X_train_tfidf, X_test_tfidf, y_train, y_test)
        #results_scores_current_dummy.append(result_scores)
        #print(result_scores)

        # SVC Tuned
        print("SVC...")
        #result_scores, results_tuning_current = run_svc(X_train_tfidf, X_test_tfidf, y_train, y_test)
        result_scores, results_tuning_current = run_svc_leakage(X_train_tfidf, X_test_tfidf, y_train, y_test)
        results_scores_current_svc.append(result_scores)
        results_tuning_svc = results_tuning_svc.append({"Country": country,
                                                "kernel": results_tuning_current[0],
                                                "C": results_tuning_current[1],
                                                "class_weight": results_tuning_current[2],
                                                "gamma": results_tuning_current[3],
                                                "Tuning F1": results_tuning_current[4]}, ignore_index=True)
        print(result_scores)

        # SVC Untuned
        print("SVC Untuned...")
        result_scores = run_svc_leakage(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
        results_scores_current_untuned_svc.append(result_scores)
        print(result_scores)
        
        # Random Forest Tuned
        #print("Random Forest...")
        #result_scores, results_tuning_current = run_randomforest(X_train_tfidf, X_test_tfidf, y_train, y_test)
        #results_scores_current_randomforest.append(result_scores)
        #results_tuning_randomforest = results_tuning_randomforest.append({"Country": country,
        #                                        "max_depth": results_tuning_current[0],
        #                                        "n_estimators": results_tuning_current[1],
        #                                        "class_weight": results_tuning_current[2],
        #                                        "max_features": results_tuning_current[3],
        #                                        "Tuning F1": results_tuning_current[4]}, ignore_index=True)
        #print(result_scores)
        
        # Random Forest Untuned  
        #print("Random Forest Untuned...")
        #result_scores = run_randomforest(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
        #results_scores_current_untuned_randomforest.append(result_scores)
        #print(result_scores)
        
        # Naives Bayes Tuned
        #print("Naives Bayes...")
        #result_scores, results_tuning_current = run_naivebayes(X_train_tfidf, X_test_tfidf, y_train, y_test)
        #results_scores_current_naivebayes.append(result_scores)
        #results_tuning_naivebayes = results_tuning_naivebayes.append({"Country": country,
        #                                        "alpha": results_tuning_current[0]}, ignore_index=True)
        #print(result_scores)
        
        # Naives Bayes Untuned  
        #print("Naives Bayes Untuned...")
        #result_scores = run_naivebayes(X_train_tfidf, X_test_tfidf, y_train, y_test, tune = False)
        #results_scores_current_untuned_naivebayes.append(result_scores)
        #print(result_scores)
        
    # calculate means/standard deviations from results
    # Dummy Classifier
    #results_current_mean_dummy = np.array(results_scores_current_dummy).mean(axis=0)
    #results_std_dev_dummy = np.array(results_scores_current_dummy).std(axis=0)
    #results_dummy = results_dummy.append({"Baseline": "Dummy", "Country": country,
    #                          "Accuracy": results_current_mean_dummy[0],
    #                          "Accuracy Std. Dev.": results_std_dev_dummy[0],
    #                          "Precision": results_current_mean_dummy[1],
    #                          "Precision Std. Dev.": results_std_dev_dummy[1],
    #                          "Recall": results_current_mean_dummy[2],
    #                          "Recall Std. Dev.": results_std_dev_dummy[2],
    #                          "F1": results_current_mean_dummy[3],
    #                          "F1 Std. Dev.": results_std_dev_dummy[3]}, ignore_index=True)
                              
    # SVC Tuned
    #results_current_mean_svc = np.array(results_scores_current_svc).mean(axis=0)
    #results_std_dev_svc = np.array(results_scores_current_svc).std(axis=0)
    #results_svc = results_svc.append({"Baseline": "SVM", "Country": country,
    #                          "Accuracy": results_current_mean_svc[0],
    #                          "Accuracy Std. Dev.": results_std_dev_svc[0],
    #                          "Precision": results_current_mean_svc[1],
    #                          "Precision Std. Dev.": results_std_dev_svc[1],
    #                          "Recall": results_current_mean_svc[2],
    #                          "Recall Std. Dev.": results_std_dev_svc[2],
    #                          "F1": results_current_mean_svc[3],
    #                          "F1 Std. Dev.": results_std_dev_svc[3]}, ignore_index=True)
                              
    # SVC Untuned                              
    #results_current_mean_untuned_svc = np.array(results_scores_current_untuned_svc).mean(axis=0)
    #results_std_dev_untuned_svc = np.array(results_scores_current_untuned_svc).std(axis=0)
    #results_svc = results_svc.append({"Baseline": "SVM Untuned", "Country": country,
    #                          "Accuracy": results_current_mean_untuned_svc[0],
    #                          "Accuracy Std. Dev.": results_std_dev_untuned_svc[0],
    #                          "Precision": results_current_mean_untuned_svc[1],
    #                          "Precision Std. Dev.": results_std_dev_untuned_svc[1],
    #                          "Recall": results_current_mean_untuned_svc[2],
    #                          "Recall Std. Dev.": results_std_dev_untuned_svc[2],
    #                          "F1": results_current_mean_untuned_svc[3],
    #                          "F1 Std. Dev.": results_std_dev_untuned_svc[3]}, ignore_index=True)
                              
    # Random Forest Tuned
    #results_current_mean_randomforest = np.array(results_scores_current_randomforest).mean(axis=0)
    #results_std_dev_randomforest = np.array(results_scores_current_randomforest).std(axis=0)
    #results_randomforest = results_randomforest.append({"Baseline": "Random Forest", "Country": country,
    #                          "Accuracy": results_current_mean_randomforest[0],
    #                          "Accuracy Std. Dev.": results_std_dev_randomforest[0],
    #                          "Precision": results_current_mean_randomforest[1],
    #                          "Precision Std. Dev.": results_std_dev_randomforest[1],
    #                          "Recall": results_current_mean_randomforest[2],
    #                          "Recall Std. Dev.": results_std_dev_randomforest[2],
    #                          "F1": results_current_mean_randomforest[3],
    #                          "F1 Std. Dev.": results_std_dev_randomforest[3]}, ignore_index=True)
                              
    # Random Forest Untuned
    #results_current_mean_untuned_randomforest = np.array(results_scores_current_untuned_randomforest).mean(axis=0)
    #results_std_dev_untuned_randomforest = np.array(results_scores_current_untuned_randomforest).std(axis=0)
    #results_randomforest = results_randomforest.append({"Baseline": "Random Forest Untuned", "Country": country,
    #                          "Accuracy": results_current_mean_untuned_randomforest[0],
    #                          "Accuracy Std. Dev.": results_std_dev_untuned_randomforest[0],
    #                          "Precision": results_current_mean_untuned_randomforest[1],
    #                          "Precision Std. Dev.": results_std_dev_untuned_randomforest[1],
    #                          "Recall": results_current_mean_untuned_randomforest[2],
    #                          "Recall Std. Dev.": results_std_dev_untuned_randomforest[2],
    #                          "F1": results_current_mean_untuned_randomforest[3],
    #                          "F1 Std. Dev.": results_std_dev_untuned_randomforest[3]}, ignore_index=True)

    # Naive Bayes Tuned
    #results_current_mean_naivebayes = np.array(results_scores_current_naivebayes).mean(axis=0)
    #results_std_dev_naivebayes = np.array(results_scores_current_naivebayes).std(axis=0)
    #results_naivebayes = results_naivebayes.append({"Baseline": "Naive Bayes", "Country": country,
    #                          "Accuracy": results_current_mean_naivebayes[0],
    #                          "Accuracy Std. Dev.": results_std_dev_naivebayes[0],
    #                          "Precision": results_current_mean_naivebayes[1],
    #                          "Precision Std. Dev.": results_std_dev_naivebayes[1],
    #                          "Recall": results_current_mean_naivebayes[2],
    #                          "Recall Std. Dev.": results_std_dev_naivebayes[2],
    #                          "F1": results_current_mean_naivebayes[3],
    #                          "F1 Std. Dev.": results_std_dev_naivebayes[3]}, ignore_index=True)
                              
    # Naive Bayes Untuned
    #results_current_mean_untuned_naivebayes = np.array(results_scores_current_untuned_naivebayes).mean(axis=0)
    #results_std_dev_untuned_naivebayes = np.array(results_scores_current_untuned_naivebayes).std(axis=0)
    #results_naivebayes = results_naivebayes.append({"Baseline": "Naive Bayes Untuned", "Country": country,
    #                          "Accuracy": results_current_mean_untuned_naivebayes[0],
    #                          "Accuracy Std. Dev.": results_std_dev_untuned_naivebayes[0],
    #                          "Precision": results_current_mean_untuned_naivebayes[1],
    #                          "Precision Std. Dev.": results_std_dev_untuned_naivebayes[1],
    #                          "Recall": results_current_mean_untuned_naivebayes[2],
    #                          "Recall Std. Dev.": results_std_dev_untuned_naivebayes[2],
    #                          "F1": results_current_mean_untuned_naivebayes[3],
    #                          "F1 Std. Dev.": results_std_dev_untuned_naivebayes[3]}, ignore_index=True)


Current Country: Venezuela
SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.5413712860671427 with params {'svc__C': 7.38905609893065, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'sigmoid'}
[0.9573283858998145, 0.5, 0.7391304347826086, 0.5964912280701754]
SVC Untuned...
[0.9795918367346939, 1.0, 0.5217391304347826, 0.6857142857142856]
SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.5346364746364747 with params {'svc__C': 1.0, 'svc__class_weight': 'balanced', 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
[0.9662921348314607, 0.6666666666666666, 0.9714285714285714, 0.7906976744186046]
SVC Untuned...
[0.9662921348314607, 1.0, 0.4857142857142857, 0.6538461538461539]
SVC...
Fitting 5 folds for each of 484 candidates, totalling 2420 fits
Best Tuning Score is 0.5413712860671427 with params {'svc__C': 7.38905609893065, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel':

In [4]:
## Store detailed result scores
print(results_dummy)
print(results_svc)
print(results_randomforest)
print(results_naivebayes)
results_dummy.round(3).to_csv("dummy_results_tfidf.csv", index=False)
results_svc.round(3).to_csv("svm_results_tfidf.csv", index=False)
results_randomforest.round(3).to_csv("randomforest_results_tfidf.csv", index=False)
results_naivebayes.round(3).to_csv("naivebayes_results_tfidf.csv", index=False)

# Store best hyperparameter combinations (5 tuning runs) for each country
print(results_tuning_svc)
print(results_tuning_randomforest)
print(results_tuning_naivebayes)
results_tuning_svc.round(3).to_csv("svm_results_hyperparameter_tfidf.csv", index=False)
results_tuning_randomforest.round(3).to_csv("randomforest_results_hyperparameter_tfidf.csv", index=False)
results_tuning_naivebayes.round(3).to_csv("naivebayes_results_hyperparameter_tfidf.csv", index=False)

  Baseline      Country  Accuracy  Accuracy Std. Dev.  Precision  \
0    Dummy    Venezuela  0.502540            0.006586   0.055031   
1    Dummy        Ghana  0.489501            0.010016   0.064496   
2    Dummy  Philippines  0.485597            0.004115   0.047809   

   Precision Std. Dev.    Recall  Recall Std. Dev.        F1  F1 Std. Dev.  
0             0.006624  0.496454          0.059760  0.099080      0.011927  
1             0.009747  0.526042          0.079502  0.114903      0.017366  
2             0.003984  0.521739          0.043478  0.087591      0.007299  
      Baseline      Country  Accuracy  Accuracy Std. Dev.  Precision  \
0          SVM    Venezuela  0.936108            0.003814   0.438807   
1  SVM Untuned    Venezuela  0.948417            0.002621   0.714141   
2          SVM        Ghana  0.953740            0.003164   0.626350   
3  SVM Untuned        Ghana  0.949803            0.006404   0.776058   
4          SVM  Philippines  0.949588            0.009259  

In [16]:
results_dummy

Unnamed: 0,Baseline,Country,Accuracy,Accuracy Std. Dev.,Precision,Precision Std. Dev.,Recall,Recall Std. Dev.,F1,F1 Std. Dev.
0,Dummy,Venezuela,0.50254,0.006586,0.055031,0.006624,0.496454,0.05976,0.09908,0.011927
1,Dummy,Ghana,0.489501,0.010016,0.064496,0.009747,0.526042,0.079502,0.114903,0.017366
2,Dummy,Philippines,0.485597,0.004115,0.047809,0.003984,0.521739,0.043478,0.087591,0.007299


In [5]:
results = pd.concat([results_dummy, results_svc, results_randomforest, results_naivebayes])

In [6]:
results.sort_values(by=["Country", "F1"])

Unnamed: 0,Baseline,Country,Accuracy,Accuracy Std. Dev.,Precision,Precision Std. Dev.,Recall,Recall Std. Dev.,F1,F1 Std. Dev.
3,Naive Bayes Untuned,Ghana,0.937008,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Dummy,Ghana,0.489501,0.010016,0.064496,0.009747,0.526042,0.079502,0.114903,0.017366
3,Random Forest Untuned,Ghana,0.946194,0.006085,0.705484,0.126559,0.239583,0.075834,0.355207,0.09864
3,SVM Untuned,Ghana,0.949803,0.006404,0.776058,0.110115,0.28125,0.069877,0.411214,0.088561
2,Naive Bayes,Ghana,0.944226,0.00464,0.592325,0.071372,0.432292,0.066086,0.492157,0.028065
2,Random Forest,Ghana,0.948491,0.008761,0.628765,0.088291,0.458333,0.091406,0.526218,0.082644
2,SVM,Ghana,0.95374,0.003164,0.62635,0.030205,0.666667,0.023292,0.64507,0.015524
5,Naive Bayes Untuned,Philippines,0.954047,0.001534,0.5,0.5,0.028986,0.032407,0.054444,0.060144
2,Dummy,Philippines,0.485597,0.004115,0.047809,0.003984,0.521739,0.043478,0.087591,0.007299
5,Random Forest Untuned,Philippines,0.956447,0.00622,0.611772,0.199237,0.195652,0.082303,0.295287,0.116528


In [20]:
results_tuning_svc["Classifier"] = "SVC"
#results_tuning_randomforest["Classifier"] = "Random Forest"
#results_tuning_naivebayes["Classifier"] = "Naive Bayes"

results_tuning = pd.concat([results_tuning_svc])

In [21]:
results_tuning.groupby(['Country', 'Classifier'])['Tuning F1'].mean()

Country      Classifier
Ghana        SVC           0.668193
Philippines  SVC           0.549490
Venezuela    SVC           0.533502
Name: Tuning F1, dtype: float64

In [16]:
results_tuning

Unnamed: 0,Country,kernel,C,class_weight,gamma,Tuning F1,Classifier,max_depth,n_estimators,max_features,alpha
0,Venezuela,sigmoid,7.389056,balanced,0.01,0.541371,SVC,,,,
1,Venezuela,rbf,1.0,balanced,0.1,0.534636,SVC,,,,
2,Venezuela,sigmoid,7.389056,balanced,0.01,0.541371,SVC,,,,
3,Venezuela,rbf,1.0,balanced,0.1,0.558469,SVC,,,,
4,Venezuela,sigmoid,403.428793,balanced,auto,0.499276,SVC,,,,
5,Venezuela,sigmoid,7.389056,balanced,0.01,0.525889,SVC,,,,
6,Ghana,rbf,20.085537,balanced,0.01,0.666394,SVC,,,,
7,Ghana,rbf,148.413159,balanced,0.001,0.668644,SVC,,,,
8,Ghana,rbf,20.085537,balanced,0.01,0.666394,SVC,,,,
9,Ghana,sigmoid,7.389056,balanced,0.01,0.659561,SVC,,,,
