## Imports

In [1]:
%env LD_LIBRARY_PATH=/home/zach/anaconda3/envs/research/lib

env: LD_LIBRARY_PATH=/home/zach/anaconda3/envs/research/lib


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import sklearn
import matplotlib.pyplot as plt
import re
import codecs
from tqdm import tqdm

## Data Import and Cleaning

In [3]:
train = pd.read_csv('data/hateval2019_en_train.csv')
test = pd.read_csv('data/hateval2019_en_test.csv')
val = pd.read_csv('data/hateval2019_en_dev.csv')

train = train.drop(['TR','AG'],1)
test = test.drop(['TR','AG'],1)
val = val.drop(['TR','AG'],1)

  train = train.drop(['TR','AG'],1)
  test = test.drop(['TR','AG'],1)
  val = val.drop(['TR','AG'],1)


In [4]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


2022-06-03 11:53:31.301186: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-03 11:53:31.319815: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-03 11:53:31.320044: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [5]:
train.head()

Unnamed: 0,id,text,HS
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1
1,202,Why would young fighting age men be the vast m...,1
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0
4,205,Orban in Brussels: European leaders are ignori...,0


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.regularizers import L1,L2, l1_l2
import io

In [7]:
def normalize_tweet(text):
    """
    Removes hashtags, @s, links, and punctuation
    :param text:Text to be cleaned
    :return: text with mentions, hashtages, and urls removes
    """
    processed_text = text.lower()
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www|t\.)\S+", "", processed_text)
    processed_text = re.sub(r"(?:\.|,|\?|-)", " ", processed_text)
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www|\.com)", "", processed_text)
    processed_text = re.sub(r'[^\w\s]', '', processed_text)
    processed_text = " ".join(processed_text.split())
    return processed_text

def x_y_split(data):
    """splits and X and y from dataframe

    Args:
        data:dataframe to split from

    Returns:
        tuple:X data, y data
    """
    X = data['text']
    X = X.apply(normalize_tweet)
    y = data['HS']
    return X, y

## Split sequences into train, validation, and test sets

In [8]:
#Split x and ys
x_train, y_train = x_y_split(train)
print(x_train)
#x_train.to_csv('data/x_train.csv')
#y_train.to_csv('data/y_train.csv')

0       hurray saving us in so many ways lockthemup bu...
1       why would young fighting age men be the vast m...
2       illegals dump their kids at the border like ro...
3       ny times nearly all white states pose an array...
4       orban in brussels european leaders are ignorin...
                              ...                        
8995                  i am proud to be a hysterical woman
8996    hollywood is complicit in the rape and sexual ...
8997    what a fucking cunt i hate seeing kids getting...
8998                                hysterical woman like
8999    nearly every woman i know has metoo in their f...
Name: text, Length: 9000, dtype: object


In [9]:
#Split x and ys
x_test, y_test = x_y_split(test)
print(x_test)
#x_test.to_csv('data/x_test.csv')
#y_test.to_csv('data/y_test.csv')

0       oh i could have gone on about taxes since the ...
1       several of the wild fires in california and co...
2       my question is how do you resettle a refugee a...
3       europe youve got a problem we must hurry and b...
4       this is outrageous stopillegalimmigration meri...
                              ...                        
2995    you can never take a l off a real bitch im hot...
2996    likes to call me a cunt a bitch but i tell him...
2997    1 never said you were taught 2 you called me b...
2998    if i see and know you a hoe why would i hit yo...
2999     you be chasing them hoes fuck what a bitch think
Name: text, Length: 3000, dtype: object


In [10]:
#Split x and ys
x_val, y_val = x_y_split(val)
print(x_val)
#x_val.to_csv('data/x_val.csv')
#y_val.to_csv('data/y_val.csv')

0      i swear im getting to places just in the nick ...
1      im an immigrant and trump is right on immigrat...
2      illegalimmigrants illegalaliens electoralsyste...
3      we have our own invasion issues with mexicans ...
4      worker charged with sexually molesting eight c...
                             ...                        
995                     you unfollowed me fuck you pussy
996    stfu bitch and you go make some satanic music ...
997    honey as a fellow white chick let me tell you ...
998    i hate bitches who talk about niggaz with kids...
999    you won the life time recipient for hysterical...
Name: text, Length: 1000, dtype: object


## Tokenizer

In [11]:
max_features = 15000
tokenizer = Tokenizer(num_words=max_features, split=' ', lower=True)
tokenizer.fit_on_texts(x_train)


def tokenize_and_pad(x_data, tokenizer=tokenizer, length=57):
    """
    Tokenizes and pads input
    :param x_data: X column of data
    :param tokenizer: fitted tokenizer
    :param length: length to pad
    :return: tokenized and padded x_data
    """
    x_data = tokenizer.texts_to_sequences(x_data)
    x_data = pad_sequences(x_data, maxlen=length)
    return x_data


x_train = tokenize_and_pad(x_train, tokenizer)


def split_and_tokenize(data, tokenizer=tokenizer):
    """
    Splits tokenizes and pads data
    :param data:
    :return: tupple of (X,y)
    """
    X, y = x_y_split(data)
    X = tokenize_and_pad(X)
    return X, y


x_val, y_val = split_and_tokenize(val)
x_test, y_test = split_and_tokenize(test)

In [12]:
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))

Number of unique words: 17392


In [13]:
word_index

{'the': 1,
 'to': 2,
 'a': 3,
 'you': 4,
 'and': 5,
 'of': 6,
 'in': 7,
 'is': 8,
 'for': 9,
 'i': 10,
 'are': 11,
 'not': 12,
 'that': 13,
 'on': 14,
 'bitch': 15,
 'this': 16,
 'it': 17,
 'all': 18,
 'your': 19,
 'they': 20,
 'be': 21,
 'with': 22,
 'refugees': 23,
 'have': 24,
 'women': 25,
 'we': 26,
 'me': 27,
 'immigrant': 28,
 'from': 29,
 'when': 30,
 'my': 31,
 'like': 32,
 'if': 33,
 'immigration': 34,
 'who': 35,
 'dont': 36,
 'but': 37,
 'their': 38,
 'no': 39,
 'about': 40,
 'so': 41,
 'illegal': 42,
 'as': 43,
 'by': 44,
 'will': 45,
 'up': 46,
 'at': 47,
 'our': 48,
 'just': 49,
 'migrants': 50,
 'do': 51,
 'its': 52,
 'what': 53,
 'or': 54,
 'men': 55,
 'get': 56,
 'people': 57,
 'u': 58,
 'them': 59,
 'her': 60,
 'an': 61,
 'woman': 62,
 'was': 63,
 'can': 64,
 'rape': 65,
 'cunt': 66,
 'how': 67,
 'more': 68,
 'out': 69,
 'go': 70,
 'whore': 71,
 'trump': 72,
 'immigrants': 73,
 'one': 74,
 'us': 75,
 'has': 76,
 'want': 77,
 'fuck': 78,
 'im': 79,
 'youre': 80,
 'why

## Embedding Matrix using Wikipedia Embeddings

In [None]:
#download Wikipedia 2014 embeddings from https://github.com/stanfordnlp/GloVe
#Load GLoVe embeddings; here I use embeddings with only 100 dimensions
embeddings_index = {}
f = codecs.open('data/glove.42B.300d.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))



531532it [00:21, 24429.64it/s]

In [None]:
embeddings_index

In [None]:
embed_dim = 300

nb_words = min(max_features, len(tokenizer.word_index))
words_not_found = []
embedding_matrix = np.zeros((nb_words,embed_dim))
word_index = tokenizer.word_index
for word, i in tqdm(word_index.items()):
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

## Baseline Model with GloVe

In [None]:
filepath = 'checkpoints/glove_model.hdf5'
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                                 monitor='val_accuracy',
                                                save_best_only=True,
                                                 verbose=1,)
callbacks = [checkpoint]


In [None]:
import tensorflow_addons as tfa
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping

f1 = tfa.metrics.F1Score(num_classes=1, average=None)
input_length=x_train.shape[1]
class MyHyperModel(kt.HyperModel):
    def __init__(self,embedding_matrix=embedding_matrix,max_features=max_features,embed_dim=embed_dim,input_length=input_length):
        self.embedding_matrix= embedding_matrix
        self.max_features = max_features
        self.embed_dim = embed_dim
        self.input_length = input_length

    def build(self, hp):
        lr = hp.Float("lr", min_value=1e-5, max_value=1e-2, sampling="log")
        dropoutLSTM = hp.Float('dropoutLSTM',min_value=0,max_value=1)
        dropout1 = hp.Float('dropout1',min_value=0,max_value=1)
        dropout2 = hp.Float('dropout2',min_value=0,max_value=1)
        wd = hp.Choice('wd', [0.0,0.01,0.001,0.1,0.005,0.05,.00001,.0001])
        l2_1 = hp.Choice('l2_1', [0.0,0.01,0.001,0.1,0.005,0.05,.00001,.0001])
        l2_2 = hp.Choice('l2_2', [0.0,0.01,0.001,0.1,0.005,0.05,.00001,.0001])
        l2_3 = hp.Choice('l2_3', [0.0,0.01,0.001,0.1,0.005,0.05,.00001,.0001])
        lstm_size = hp.Choice('lstm_size', [128,256,512,1024])
        dense_1_size = hp.Choice('dense_1_size', [128,256,512,1024])
        dense_2_size = hp.Choice('dense_2_size', [128,256,512,1024])

        optimizer = tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
        #input_length=x_train.shape[1]
        model = Sequential()
        model.add(Embedding(self.max_features,self.embed_dim,input_length=self.input_length, weights=[self.embedding_matrix],trainable=False))
        model.add(LSTM(lstm_size, dropout=dropoutLSTM,kernel_regularizer=L2(l2_1)))
        model.add(Dense(dense_1_size,activation='relu', kernel_regularizer=L2(l2_2)))
        model.add(Dropout(dropout1))
        model.add(Dense(dense_2_size,activation='relu', kernel_regularizer=L2(l2_3)))
        model.add(Dropout(dropout2))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=[f1,'accuracy'])
        return model


    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [64, 32, 128]),
            epochs = 50,
            verbose = 2,
            **kwargs,
        )
from kerastuner.tuners import RandomSearch

tuner = kt.RandomSearch(
    MyHyperModel(),
    objective= kt.Objective('val_f1_score',direction='max'),
    max_trials = 100,
    directory="checkpoints",
    project_name="GloVe_search",
)

In [None]:
tuner.search(x=x_train,
             y=y_train,
             validation_data=(x_val,y_val))

In [None]:
parameters = ['lr','dropoutLSTM','dropout1','dropout2','wd','l2_1','l2_2','l2_3','lstm_size','dense_1_size','dense_2_size',"batch_size"]

In [None]:
best_hps=tuner.get_best_hyperparameters()[0]

In [None]:
values = {}

for i in parameters:
    values[i] = best_hps.get(i)

values

In [None]:
model = tuner.get_best_models()[0]


In [None]:
from sklearn.metrics import classification_report

pred = model.predict(x_train)
pred = np.round(pred)
#print(pred)
print(classification_report(y_train, pred))


In [None]:
pred = model.predict(x_val)
pred = np.round(pred)
#print(pred)
print(classification_report(y_val, pred))


In [None]:
pred = model.predict(x_test)
pred = np.round(pred)
#print(pred)
print(classification_report(y_test, pred))