In [1]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#####################################################################
#                      Balanced vs Ranodm                           #
#####################################################################
#                                                                   #
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import pandas as pd
import numpy as np
from collections import Counter

from utilities.preprocess import Preproccesor
from utilities.attention_layer import Attention
from utilities.helping_functions import create_embedding_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.utils import shuffle

from keras.preprocessing.sequence import pad_sequences
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, Bidirectional, Dense, \
    LSTM, Conv1D, Dropout, concatenate
from keras import Input, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnmollas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnmollas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Loading our Binary Data and the binary external dataset D1: Davidson, Thomas, et al. "Automated hate speech detection and the problem of offensive language." Proceedings of the International AAAI Conference on Web and Social Media. Vol. 11. No. 1. 2017.

In [4]:
X, y = Preproccesor.load_data(True)
X_tweets, y_tweets = Preproccesor.load_external_data(True)
class_names = ['noHateSpeech', 'hateSpeech']

We will create two subsets (of the same size) of our binary dataset, one completely random, and one maintaining balance between classes. We will train then an SVM model and we will evaluate on the rest of our initial dataset. We will do the same for the external dataset.

In [19]:
acc_train = []
acc_valid = []
acc_tweets = []
f1_train = []
f1_valid = []
f1_tweets = []
f1_tweets_hate = []
f1_tweets_nohate = []
for iteration in range(10):
    print("Ready in", 10-iteration)
    data = shuffle(X, y, random_state=777 + iteration)
    X_temp = data[0]
    y_temp = data[1]
    c_0 = int((len(y_temp)-sum(y_temp))*0.875)
    c_1 = int((sum(y_temp))*0.875)

    x_val = X_temp[c_0+c_1:]  # We will test on this 12.5% of data
    y_val = y_temp[c_0+c_1:]

    # The rest available data are the 87.5% of the original
    x_rest = X_temp[:c_0+c_1]
    y_rest = y_temp[:c_0+c_1]

    c_0 = int((len(y_temp)-sum(y_temp))*0.75)  # We will select
    c_1 = int((sum(y_temp))*0.75)

    x_random = x_rest[:c_0+c_1]  # 75% of them randomly
    y_random = y_rest[:c_0+c_1]

    c_0 = int((len(y_temp)-sum(y_temp))*0.875)
    c_1 = int((sum(y_temp))*0.875)

    x_75 = []
    y_75 = []

    c_min = min(c_0, c_1)
    if c_min > int(len(y_temp)*0.75/2):
        c_min = int(len(y_temp)*0.75/2)
    c_0 = c_min
    c_1 = c_min
    for i in range(len(y_temp)):  # 75% of them maintaining class balance
        if y_temp[i] == 0 and c_0 > 0:
            x_75.append(X_temp[i])
            y_75.append(y_temp[i])
            c_0 = c_0 - 1
        elif y_temp[i] == 1 and c_1 > 0:
            x_75.append(X_temp[i])
            y_75.append(y_temp[i])
            c_1 = c_1 - 1
    training_data = {'Random:': [x_random, y_random], 'Balanced:': [x_75, y_75]}
    for k, v in training_data.items():
        x_train = v[0]
        y_train = v[1]

        vec = TfidfVectorizer(analyzer='word', max_features=5000,
                              ngram_range=(1, 2), stop_words='english')
        vec.fit(X_tweets)

        x_train = vec.transform(x_train)
        x_valid = vec.transform(x_val)
        x_tweets = vec.transform(X_tweets)

        svm = SVC(kernel='rbf')
        svm.fit(x_train, y_train)

        y_predict = svm.predict(x_train)
        # print("Train",k)
        acc_train.append([k, balanced_accuracy_score(y_train, y_predict)])
        f1_train.append([k, f1_score(y_train, y_predict, average='weighted')])

        y_predict = svm.predict(x_valid)
        # print("Valid",k)
        acc_valid.append([k, balanced_accuracy_score(y_val, y_predict)])
        f1_valid.append([k, f1_score(y_val, y_predict, average='weighted')])

        #y_predict = svm.predict(x_tweets)
        y_predict = svm.predict(x_tweets)

        # print("Tweets",k)
        acc_tweets.append([k, balanced_accuracy_score(y_tweets, y_predict)])
        f1_tweets.append([k, f1_score(y_tweets, y_predict, average='weighted')])
        f1_tweets_nohate.append(
            [k, f1_score(y_tweets, y_predict, average=None)[0]])
        f1_tweets_hate.append([k, f1_score(y_tweets, y_predict, average=None)[1]])
nnames = ["Accuracy on Train:", "Accuracy on Valid:", "Accuracy on Tweets:", "F1 on Train:",
          "F1 on Valid:", "F1 on Tweets:", "F1 on Tweets NonHate:", "F1 on Tweets Hate:"]
cc = 0
print("Printing results:")
for i in [acc_train, acc_valid, acc_tweets, f1_train, f1_valid, f1_tweets, f1_tweets_nohate, f1_tweets_hate]:
    r_i = []
    r_b = []
    for j in i:
        if j[0] == 'Random:':
            r_i.append(j[1])
        else:
            r_b.append(j[1])
    r_i = np.array(r_i)
    r_b = np.array(r_b)
    print(nnames[cc])
    print('  Random', r_i.mean(), r_i.std())
    print('  Balanced', r_b.mean(), r_b.std())
    cc = cc + 1

Ready in 10
Ready in 9
Ready in 8
Ready in 7
Ready in 6
Ready in 5
Ready in 4
Ready in 3
Ready in 2
Ready in 1
Accuracy on Train:
  Random 0.97966376499681 0.005378974684487675
  Balanced 0.9808823529411764 0.0038306280165492997
Accuracy on Valid:
  Random 0.6314528804791895 0.03931654165911837
  Balanced 0.6798714281580398 0.02167657082961586
Accuracy on Tweets:
  Random 0.5062 0.010989085494252916
  Balanced 0.43610000000000004 0.12387207110563707
F1 on Train:
  Random 0.9807072184575079 0.005128155255837031
  Balanced 0.9808812133263045 0.0038307833930770595
F1 on Valid:
  Random 0.6418969967121514 0.04890115884441745
  Balanced 0.6906148775114815 0.022944219034009755
F1 on Tweets:
  Random 0.3615152907320149 0.010478921171478238
  Balanced 0.3721146662326628 0.08249712158863223
F1 on Tweets NonHate:
  Random 0.6653220800604149 0.010088513290533238
  Balanced 0.5448483303218514 0.16317434416833318
F1 on Tweets Hate:
  Random 0.05770850140361501 0.018680498275611763
  Balanced 0.1993

The resulrs propose that better performance has the balanced subset. On both the rest of ETHOS Data, and the external data (on the hate class, which is the minority class as well in this dataset). 