In [None]:
import pandas as pd
import numpy as np
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data Preprocessing

In [None]:
# Load each split from its tab-separated file and immediately give it the
# two column names used throughout the notebook.
train = pd.read_csv('NLP_ass_train.tsv', sep='\t')
train.columns = ['text', 'label']

val = pd.read_csv('NLP_ass_valid.tsv', sep='\t')
val.columns = ['text', 'label']

test = pd.read_csv('NLP_ass_test.tsv', sep='\t')
test.columns = ['text', 'label']

In [None]:
# Preview the first rows of the training split (columns: text, label).
train.head()

Unnamed: 0,text,label
0,nawt yall niggers ignoring me,normal
1,<user> i am bit confused coz chinese ppl can n...,hatespeech
2,and this is why i end up with nigger trainee d...,hatespeech
3,nogs jews and dykes how enriching,offensive
4,guilty until proven innocent unless you are a ...,hatespeech


In [None]:
# Preview the first rows of the test split (columns: text, label).
test.head()

Unnamed: 0,text,label
0,laura loomer raped me while screaming at me in...,hatespeech
1,<user> what did the old lady do was she a nazi...,normal
2,as much as i appreciate a format of open discu...,normal
3,sex be so good a bitch be slow stroking and cr...,offensive
4,<user> <user> <user> with a room full of white...,normal


In [None]:
# Integer class id assigned to each label string in the 'label' column.
label2id = {'normal': 0, 'hatespeech': 1, 'offensive': 2}

# For every split: the raw texts as a plain list, and the labels as a NumPy
# array of class ids. A label missing from label2id raises KeyError.
train_text = list(train['text'])
train_labels = np.array([label2id[name] for name in train['label']])

val_text = list(val['text'])
val_labels = np.array([label2id[name] for name in val['label']])

test_text = list(test['text'])
test_labels = np.array([label2id[name] for name in test['label']])

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.

# Passed to Tokenizer(num_words=...) and used as the Embedding input dimension.
vocab_size = 25000
# Length of each embedding vector in the Embedding layer.
embedding_dim=300
# Target length given to pad_sequences(maxlen=...) and Embedding(input_length=...).
max_length = 170
# Passed to Tokenizer as oov_token; it appears at index 1 of word_index.
# NOTE(review): an empty string is an unusual placeholder -- confirm whether a
# visible marker (e.g. an "<OOV>"-style token) was intended.
oov_tok=""

In [None]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training texts only, so validation and test
# words never influence the word -> id mapping.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

# Show only the first 20 entries: displaying the whole mapping floods the
# cell output with the entire vocabulary.
dict(list(word_index.items())[:20])

{'': 1,
 'the': 2,
 'a': 3,
 'to': 4,
 'and': 5,
 'i': 6,
 'user': 7,
 'you': 8,
 'of': 9,
 'is': 10,
 'not': 11,
 'in': 12,
 'are': 13,
 'that': 14,
 'it': 15,
 'white': 16,
 'they': 17,
 'for': 18,
 'number': 19,
 'be': 20,
 'have': 21,
 'this': 22,
 'with': 23,
 'on': 24,
 'all': 25,
 'do': 26,
 'nigger': 27,
 'like': 28,
 'but': 29,
 'so': 30,
 'my': 31,
 'if': 32,
 'was': 33,
 'as': 34,
 'he': 35,
 'just': 36,
 'will': 37,
 'people': 38,
 'who': 39,
 'we': 40,
 'can': 41,
 'your': 42,
 'or': 43,
 'by': 44,
 'their': 45,
 'about': 46,
 'no': 47,
 'what': 48,
 'women': 49,
 'me': 50,
 'from': 51,
 'am': 52,
 'them': 53,
 'up': 54,
 'out': 55,
 'at': 56,
 'get': 57,
 'jews': 58,
 'one': 59,
 'how': 60,
 'there': 61,
 'when': 62,
 'bitch': 63,
 'muslim': 64,
 'kike': 65,
 'would': 66,
 'fucking': 67,
 'an': 68,
 'ghetto': 69,
 'his': 70,
 'she': 71,
 'retarded': 72,
 'black': 73,
 'more': 74,
 'why': 75,
 'shit': 76,
 'because': 77,
 'fuck': 78,
 'hate': 79,
 'only': 80,
 'has': 81,
 

In [None]:
# Step 1: map every text of every split to a list of token ids using the
# vocabulary fitted on the training texts.
train_seqs, val_seqs, test_seqs = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_text, val_text, test_text)
)

# Step 2: bring every id list to exactly max_length entries (pad_sequences
# defaults are used for the padding/truncation side).
train_pad_seqs, val_pad_seqs, test_pad_seqs = (
    pad_sequences(seqs, maxlen=max_length)
    for seqs in (train_seqs, val_seqs, test_seqs)
)

In [None]:
# Token ids of the first training text, before padding.
train_seqs[0]

[11942, 822, 87, 2783, 50]

In [None]:
# The same example after pad_sequences: a fixed-length row of max_length ids.
train_pad_seqs[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [None]:
# Class ids of the training split, as encoded through label2id.
train_labels

array([0, 1, 1, ..., 0, 1, 2])

In [None]:
from keras.utils import to_categorical

# Wrap the padded token-id matrices as int32 tensors for the Embedding layer.
train_padded_seqs = tf.convert_to_tensor(train_pad_seqs, dtype=tf.int32)
val_padded_seqs = tf.convert_to_tensor(val_pad_seqs, dtype=tf.int32)
test_padded_seqs = tf.convert_to_tensor(test_pad_seqs, dtype=tf.int32)

# Labels are class indices (0/1/2), so keep them integer: the sparse
# categorical cross-entropy loss and scikit-learn's classification metrics both
# work on integer class ids. tf.cast accepts NumPy arrays as well as tensors of
# any dtype, so re-running this cell is safe.
train_labels = tf.cast(train_labels, tf.int32)
val_labels = tf.cast(val_labels, tf.int32)
test_labels = tf.cast(test_labels, tf.int32)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

# Assemble the classifier layer by layer:
#   token ids -> embedding vectors -> SimpleRNN (64 units) -> dropout (0.3)
#   -> softmax over the 3 classes.
# NOTE(review): the Embedding layer is created without mask_zero, so the
# padding id 0 is processed by the RNN like any other token -- confirm this is
# intended.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(SimpleRNN(64))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 170, 300)          7500000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                23360     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 3)                 195       
                                                                 
Total params: 7523555 (28.70 MB)
Trainable params: 7523555 (28.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Stop training once the validation loss has not improved for 6 consecutive
# epochs, and roll the model back to the weights of its best epoch.
early_stopper = EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True,
)

In [None]:
# Labels are integer class ids (not one-hot), hence the *sparse* categorical
# cross-entropy loss; accuracy is tracked alongside it.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 30 epochs, checking the validation split after each one;
# early_stopper may end the run sooner.
model.fit(
    train_padded_seqs,
    train_labels,
    validation_data=(val_padded_seqs, val_labels),
    epochs=30,
    batch_size=32,
    callbacks=[early_stopper],
    verbose=1,
)

# Persist the model as it stands after fit() returns (early_stopper is
# configured with restore_best_weights=True).
model.save('best_model_ckpt.h5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


  saving_api.save_model(


In [None]:
from sklearn.metrics import f1_score
from keras.models import load_model

# Reload the checkpoint so the evaluation uses exactly what was saved to disk.
finetuned_model = load_model('best_model_ckpt.h5')

# predict() runs the test set through the model in batches instead of a single
# forward pass over every example, which keeps memory use bounded.
# Note: the last layer is a softmax, so despite the (historical) variable name
# these are class probabilities rather than raw logits.
logits = finetuned_model.predict(test_padded_seqs, verbose=0)
predictions = np.argmax(logits, axis=1)

test_loss, test_accuracy = finetuned_model.evaluate(test_padded_seqs, test_labels, verbose=1)

# scikit-learn works on NumPy arrays; convert the label tensor explicitly
# rather than relying on implicit tensor -> array coercion.
macro_f1 = f1_score(np.asarray(test_labels), predictions, average='macro')

print(f'Test Accuracy: {test_accuracy:.4f}, Macro-F1: {macro_f1:.4f}')

Test Accuracy: 0.6282, Macro-F1: 0.6022


In [None]:
def find_common_strings(sentences1, sentences2):
  """Count the distinct items that occur in both collections.

  Each argument is any iterable of hashable items (e.g. a list of strings).
  Duplicates within a collection are counted once, and matching is by exact
  equality.

  Returns:
    The number of distinct items present in both `sentences1` and
    `sentences2`.
  """
  shared = set(sentences1) & set(sentences2)
  return len(shared)

In [None]:
# Leakage check: count texts that occur verbatim (exact string match) in the
# test split and also in the training / validation split. A non-zero count
# means the same text is present in both splits.
print("Number of common sentences (train, test): ", find_common_strings(train_text, test_text))
print("Number of common sentences (val, test): ", find_common_strings(val_text, test_text))

Number of common sentences (train, test):  5
Number of common sentences (val, test):  1
