In [38]:
%env LD_LIBRARY_PATH= / home / zach / anaconda3 / envs / research / lib

env: LD_LIBRARY_PATH=/ home / zach / anaconda3 / envs / research / lib


In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import sklearn
import matplotlib.pyplot as plt
import re
import codecs
from tqdm import tqdm

In [40]:
## Data Import and Cleaning
train = pd.read_csv('data/hateval2019_en_train.csv')
test = pd.read_csv('data/hateval2019_en_test.csv')
val = pd.read_csv('data/hateval2019_en_dev.csv')

train = train.drop(['TR', 'AG'], 1)
test = test.drop(['TR', 'AG'], 1)
val = val.drop(['TR', 'AG'], 1)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


  train = train.drop(['TR', 'AG'], 1)
  test = test.drop(['TR', 'AG'], 1)
  val = val.drop(['TR', 'AG'], 1)


In [58]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Input, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.regularizers import L1, L2, l1_l2
import io


def normalize_tweet(text):
    """
    Removes hashtags, @s, links, and punctuation
    :param text:Text to be cleaned
    :return: text with mentions, hashtages, and urls removes
    """
    processed_text = text.lower()
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www|t\.)\S+", "", processed_text)
    processed_text = re.sub(r"(?:\.|,|\?|-)", " ", processed_text)
    processed_text = re.sub(r"(?:\@|http?\://|https?\://|www|\.com)", "", processed_text)
    processed_text = re.sub(r'[^\w\s]', '', processed_text)
    processed_text = " ".join(processed_text.split())
    return processed_text


def x_y_split(data):
    """splits and X and y from dataframe

    Args:
        data:dataframe to split from

    Returns:
        tuple:X data, y data
    """
    X = data['text']
    X = X.apply(normalize_tweet)
    y = data['HS']
    return X, y


In [42]:
## Split sequences into train, validation, and test sets
#Split x and ys
x_train, y_train = x_y_split(train)
print(x_train)
#x_train.to_csv('data/x_train.csv')
#y_train.to_csv('data/y_train.csv')
#Split x and ys
x_test, y_test = x_y_split(test)
print(x_test)
#x_test.to_csv('data/x_test.csv')
#y_test.to_csv('data/y_test.csv')
#Split x and ys
x_val, y_val = x_y_split(val)
print(x_val)
#x_val.to_csv('data/x_val.csv')
#y_val.to_csv('data/y_val.csv')
## Tokenizer
max_features = 15000
tokenizer = Tokenizer(num_words=max_features, split=' ', lower=True)
tokenizer.fit_on_texts(x_train)


def tokenize_and_pad(x_data, tokenizer=tokenizer, length=57):
    """
    Tokenizes and pads input
    :param x_data: X column of data
    :param tokenizer: fitted tokenizer
    :param length: length to pad
    :return: tokenized and padded x_data
    """
    x_data = tokenizer.texts_to_sequences(x_data)
    x_data = pad_sequences(x_data, maxlen=length)
    return x_data


x_train = tokenize_and_pad(x_train, tokenizer)


def split_and_tokenize(data, tokenizer=tokenizer):
    """
    Splits tokenizes and pads data
    :param data:
    :return: tupple of (X,y)
    """
    X, y = x_y_split(data)
    X = tokenize_and_pad(X)
    return X, y


x_val, y_val = split_and_tokenize(val)
x_test, y_test = split_and_tokenize(test)
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))
word_index

0       hurray saving us in so many ways lockthemup bu...
1       why would young fighting age men be the vast m...
2       illegals dump their kids at the border like ro...
3       ny times nearly all white states pose an array...
4       orban in brussels european leaders are ignorin...
                              ...                        
8995                  i am proud to be a hysterical woman
8996    hollywood is complicit in the rape and sexual ...
8997    what a fucking cunt i hate seeing kids getting...
8998                                hysterical woman like
8999    nearly every woman i know has metoo in their f...
Name: text, Length: 9000, dtype: object
0       oh i could have gone on about taxes since the ...
1       several of the wild fires in california and co...
2       my question is how do you resettle a refugee a...
3       europe youve got a problem we must hurry and b...
4       this is outrageous stopillegalimmigration meri...
                              ..

{'the': 1,
 'to': 2,
 'a': 3,
 'you': 4,
 'and': 5,
 'of': 6,
 'in': 7,
 'is': 8,
 'for': 9,
 'i': 10,
 'are': 11,
 'not': 12,
 'that': 13,
 'on': 14,
 'bitch': 15,
 'this': 16,
 'it': 17,
 'all': 18,
 'your': 19,
 'they': 20,
 'be': 21,
 'with': 22,
 'refugees': 23,
 'have': 24,
 'women': 25,
 'we': 26,
 'me': 27,
 'immigrant': 28,
 'from': 29,
 'when': 30,
 'my': 31,
 'like': 32,
 'if': 33,
 'immigration': 34,
 'who': 35,
 'dont': 36,
 'but': 37,
 'their': 38,
 'no': 39,
 'about': 40,
 'so': 41,
 'illegal': 42,
 'as': 43,
 'by': 44,
 'will': 45,
 'up': 46,
 'at': 47,
 'our': 48,
 'just': 49,
 'migrants': 50,
 'do': 51,
 'its': 52,
 'what': 53,
 'or': 54,
 'men': 55,
 'get': 56,
 'people': 57,
 'u': 58,
 'them': 59,
 'her': 60,
 'an': 61,
 'woman': 62,
 'was': 63,
 'can': 64,
 'rape': 65,
 'cunt': 66,
 'how': 67,
 'more': 68,
 'out': 69,
 'go': 70,
 'whore': 71,
 'trump': 72,
 'immigrants': 73,
 'one': 74,
 'us': 75,
 'has': 76,
 'want': 77,
 'fuck': 78,
 'im': 79,
 'youre': 80,
 'why

In [43]:
## Embedding Matrix using Wikipedia Embeddings
#download Wikipedia 2014 embeddings from https://github.com/stanfordnlp/GloVe
#Load GLoVe embeddings; here I use embeddings with only 100 dimensions
embeddings_index = {}
f = codecs.open('data/glove.42B.300d.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

embeddings_index
embed_dim = 300

nb_words = min(max_features, len(tokenizer.word_index))
words_not_found = []
embedding_matrix = np.zeros((nb_words, embed_dim))
word_index = tokenizer.word_index
for word, i in tqdm(word_index.items()):
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

1917495it [01:20, 23911.78it/s]


found 1917495 word vectors


100%|██████████| 17392/17392 [00:00<00:00, 634749.66it/s]

number of null word embeddings: 2456





In [44]:
print('running')
x_graph_train = pd.read_csv('data/extracted_annotations_train.csv')
print('train')

x_graph_val = pd.read_csv('data/extracted_annotations_val.csv')
print('val')

x_graph_test = pd.read_csv('data/extracted_annotations_test.csv')
print('test')

running
train
val
test


In [45]:
x_graph_train = x_graph_train['annotations']
x_graph_val = x_graph_val['annotations']
x_graph_test = x_graph_test['annotations']


In [46]:
x_graph_train = x_graph_train.apply(str)
x_graph_val = x_graph_val.apply(str)
x_graph_test = x_graph_test.apply(str)

In [47]:
max_features_graph = 15000
graph_tokenizer = Tokenizer(num_words=max_features_graph,split=' ', lower=True)

In [48]:
graph_tokenizer.fit_on_texts(x_graph_train)
max_graph_length = 2254

In [49]:
x_graph_train = tokenize_and_pad(x_graph_train,graph_tokenizer,max_graph_length)
x_graph_val = tokenize_and_pad(x_graph_val,graph_tokenizer,max_graph_length)
x_graph_test = tokenize_and_pad(x_graph_test,graph_tokenizer,max_graph_length)



In [60]:
def get_embedding_matrix(tokenizer=graph_tokenizer,max_features=max_features_graph):
    embeddings_index = {}
    f = codecs.open('data/glove.42B.300d.txt', encoding='utf-8')
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('found %s word vectors' % len(embeddings_index))

    embeddings_index
    embed_dim = 300

    nb_words = min(max_features, len(tokenizer.word_index))
    words_not_found = []
    embedding_matrix = np.zeros((nb_words, embed_dim))
    word_index = tokenizer.word_index
    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix

In [61]:
graph_embbeding_matrix = get_embedding_matrix()

1917495it [01:20, 23827.75it/s]


found 1917495 word vectors


100%|██████████| 62807/62807 [00:00<00:00, 1842269.56it/s]

number of null word embeddings: 238





In [52]:
x_graph_train

array([[    0,     0,     0, ..., 12249,     5,     1],
       [    0,     0,     0, ...,     4,    82,   575],
       [    0,     0,     0, ...,  3266,  1589,   117],
       ...,
       [    0,     0,     0, ...,  1362,     8,  2784],
       [    0,     0,     0, ...,    88,    49,   389],
       [    0,     0,     0, ...,     1,    34,    33]], dtype=int32)

In [63]:
lstm_out = 392
max_length = 57
input_graph = Input(shape=(max_graph_length,))
embbeding_graph =  Embedding(max_features_graph,embed_dim,input_length=max_graph_length, weights=[graph_embbeding_matrix],trainable=False)(input_graph)
lstm_graph = LSTM(lstm_out,dropout=0.6)(embbeding_graph)
graph_model = Model(inputs=input_graph,outputs =lstm_graph)


input_x = Input(shape=(max_length,))
embedding = Embedding(max_features,embed_dim,input_length=max_length, weights=[embedding_matrix],trainable=False)(input_x)
lstm = LSTM(lstm_out,dropout=0.6)(embedding)
normal_model = Model(inputs=input_x,outputs =lstm)

combined = concatenate([normal_model.output,graph_model.output])

x = Dense(256,activation='relu')(combined)
x = Dropout(.6)(x)
x = Dense(256,activation='relu')(x)
x = Dense(1,activation='sigmoid')(x)

model = Model(inputs=[normal_model.input,graph_model.input],outputs=x)

In [64]:
model.compile(loss=BinaryCrossentropy(),optimizer=Adam(learning_rate=0.0001),metrics=['accuracy'])
history = model.fit(x=[x_train,x_graph_train],
                    y=y_train,batch_size=32,
                    validation_data=([x_val,x_graph_val],y_val),
                    epochs=5)

Epoch 1/5


2022-05-28 15:19:16.093399: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
