In [1]:
# Mounting drive, storing file paths
from google.colab import drive
drive.mount('/content/drive')
span_dataset = 'drive/MyDrive/NLP_Project/toxic_spans_data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

annotations_df = pd.read_csv(span_dataset + 'annotations.csv')
comments_df = pd.read_csv(span_dataset + 'comments.csv')

# character-wise, start of spans end of spans
spans_df = pd.read_csv(span_dataset + 'spans.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
annotations_df.head()

Unnamed: 0,annotation,comment_id,worker,country,all toxic,not toxic
0,0,5167187,868,USA,False,False
1,1,5167187,1316,USA,False,False
2,2,5167187,1295,USA,False,True
3,3,5167187,2856,USA,False,False
4,4,5521110,418,VEN,True,True


In [4]:

comments_df.head()

Unnamed: 0,comment_id,comment_text
0,239607,Yet call out all Muslims for the acts of a few...
1,239612,This bitch is nuts. Who would read a book by a...
2,240311,You're an idiot.
3,240400,"Nincompoop, that's a nice one! I'm partial to ..."
4,240461,testing purposes: \n\nyou are an idiot and i c...


In [5]:
spans_df.head()

Unnamed: 0,annotation,type,start,end
0,0,Insult,133,139
1,1,Insult,84,92
2,1,Insult,118,124
3,1,Insult,126,131
4,1,Insult,133,147


In [6]:
# marks each word within the given range as 1 the rest 0 for toxic span classification
def mark_ranges(words, ranges):
    result = []
    current_pos = 0
    for word in words:
        # Calculate the starting and ending character positions of the current word
        word_start = current_pos
        word_end = current_pos + len(word)
        

        # Determine whether the current word falls within any of the specified ranges
        in_range = any(start <= word_end and end >= word_start for start, end in ranges)
        
        # Append the mark for the current word to the result list
        result.append(1 if in_range else 0)
        
        # Update the current position to the end of the current word, including its space
        current_pos = word_end + 1
    return result

In [7]:
# make comment_id dict
def load_embeddings(filename):
    word2vec = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split()
            word = line[0]
            embedding = [float(x) for x in line[1:]]
            word2vec[word] = np.array(embedding)
    return word2vec



word2vec = load_embeddings('drive/MyDrive/NLP_Project/glove.6B.100d')
word2vec['< unk >'] = np.random.rand(100)
# comment_splits = comments_df['comment_text'].apply(lambda x: x.split())

word2idx = {word: i for i, word in enumerate(word2vec.keys())}
with open('drive/MyDrive/NLP_Project/word_to_idx_Glove.json',"w") as f:
    json.dump(word2idx,f)

In [8]:
word2idx['< unk >']
word2vec['< unk >']

array([0.91177531, 0.75569562, 0.10636393, 0.48018965, 0.80990404,
       0.24596056, 0.60122782, 0.65938439, 0.23378749, 0.70775804,
       0.67394522, 0.94115042, 0.62494239, 0.16100429, 0.1624047 ,
       0.72751045, 0.29704669, 0.55336274, 0.67152149, 0.68846305,
       0.12872764, 0.37422038, 0.99836368, 0.56226647, 0.83164647,
       0.45659317, 0.12183434, 0.567015  , 0.91668265, 0.48883059,
       0.01190737, 0.57272194, 0.74128202, 0.41171716, 0.72144421,
       0.29680581, 0.31900721, 0.61752724, 0.57784892, 0.35482662,
       0.84738751, 0.22206461, 0.06634975, 0.21313842, 0.72484913,
       0.71453575, 0.17790254, 0.43124212, 0.13251101, 0.42631795,
       0.35887561, 0.35297643, 0.92477963, 0.33395391, 0.59421651,
       0.95059033, 0.39335903, 0.44577459, 0.13858051, 0.19471507,
       0.27094732, 0.9802633 , 0.6233674 , 0.54856169, 0.03000612,
       0.56906439, 0.09774575, 0.29067296, 0.67796287, 0.69058409,
       0.71387961, 0.5826095 , 0.8042706 , 0.58327855, 0.88940

In [9]:
def create_embed_matrix(word2vec, word2idx):
    embedding_matrix = np.zeros((len(word2idx), 100))
    for i, word in enumerate(word2vec):
      embedding_matrix[i] = word2vec[word]
    return torch.tensor(embedding_matrix)

embedding_matrix = create_embed_matrix(word2vec, word2idx)

In [10]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize the text
    words = nltk.word_tokenize(text)
    # Remove stopwords (optional)
    # stop_words = set(stopwords.words('english'))
    # words = [word for word in words if word not in stop_words]
    # Lemmatize words (optional)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Reconstruct the preprocessed text
    preprocessed_text = ' '.join(words)

    return preprocessed_text

In [11]:
# creating a comment2idx dict storing the word splits of comments
idx2comment = dict(zip(comments_df['comment_id'], comments_df['comment_text'].apply(lambda x : preprocess_text(x).strip().split())))

# storing annotation index as key and list of range values associated with it
annotation_range_dict = {}
for annotation, start, end in zip(spans_df['annotation'], spans_df['start'], spans_df['end']):
    annotation_range_dict.setdefault(annotation, []).append((start, end))

# function to map words with embeddings
def map_word2vec(words, word2vec):
  embeddings = []
  for word in words: 
    if word.lower() in word2vec:
      embeddings.append(word2vec[word.lower()])
    else:
      embeddings.append(word2vec['< unk >'])
  return embeddings

def map_word2idx(words, word2idx):
  indices = []
  for word in words:
    if word.lower() in word2idx:
      indices.append(word2idx[word.lower()])
    else:
      indices.append(word2idx['< unk >'])
  return indices

In [12]:
df = pd.DataFrame({"annotations" : annotations_df['annotation'], "comment_id" : annotations_df['comment_id']})
df['comment_splits'] = df['comment_id'].map(idx2comment)
df['range_lists'] = df['annotations'].map(lambda x: annotation_range_dict.get(x, []))
df['toxic_spans'] = df.apply(lambda x: mark_ranges(x['comment_splits'], x['range_lists']), axis=1)
df['embedding_splits'] = df['comment_splits'].apply(lambda x : map_word2vec(x, word2vec))
df['comment_idx'] = df['comment_splits'].apply(lambda x : map_word2idx(x, word2idx))
df.head()

Unnamed: 0,annotations,comment_id,comment_splits,range_lists,toxic_spans,embedding_splits,comment_idx
0,0,5167187,"[that, s, right, they, are, not, normal, and, ...","[(133, 139)]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.093337, 0.19043, 0.68457, -0.41548, -0.22...","[12, 1534, 248, 39, 32, 36, 1973, 5, 41, 913, ..."
1,1,5167187,"[that, s, right, they, are, not, normal, and, ...","[(84, 92), (118, 124), (126, 131), (133, 147)]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.093337, 0.19043, 0.68457, -0.41548, -0.22...","[12, 1534, 248, 39, 32, 36, 1973, 5, 41, 913, ..."
2,2,5167187,"[that, s, right, they, are, not, normal, and, ...","[(118, 131), (133, 147)]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.093337, 0.19043, 0.68457, -0.41548, -0.22...","[12, 1534, 248, 39, 32, 36, 1973, 5, 41, 913, ..."
3,3,5167187,"[that, s, right, they, are, not, normal, and, ...","[(84, 92)]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[-0.093337, 0.19043, 0.68457, -0.41548, -0.22...","[12, 1534, 248, 39, 32, 36, 1973, 5, 41, 913, ..."
4,4,5521110,"[yep, this, crap, sound, like, it, from, a, li...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.30322, -0.091931, 0.89503, -0.44215, -0.32...","[45370, 37, 35404, 1507, 117, 20, 25, 7, 17543]"


In [13]:
# creating test train splits
X = df['comment_idx']
y = df['toxic_spans']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Reset the indices of X_train and y_train
X_train = X_train.values
y_train = y_train.values

# Reset the indices of X_test and y_test
X_test = X_test.values
y_test = y_test.values

In [14]:
# setting dataloaders
batch_size = 32

def pad_collate(batch):
    word_lens = torch.tensor([len(seq) for seq,_,_ in batch])
    word_pad = pad_sequence([torch.tensor(seq) for seq,_,_ in batch], batch_first=True, padding_value=0)
    # experiment with padding value
    label_pad = pad_sequence([torch.tensor(labels, dtype=torch.float32) for _,_,labels in batch], batch_first=True, padding_value=0)
    return word_pad, word_lens, label_pad

class ToxicDataset(Dataset):
    def __init__(self, words, labels):
        self.words = words
        self.labels = labels

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        sentence = self.words[idx]
        lengths = len(self.words[idx])
        labels = self.labels[idx]
        return sentence, lengths, labels


train_dataloader = DataLoader(ToxicDataset(X_train, y_train), batch_size=batch_size, collate_fn=pad_collate)
test_dataloader = DataLoader(ToxicDataset(X_test, y_test), batch_size=batch_size, collate_fn=pad_collate)

In [15]:
class BLSTM_Span_Model(torch.nn.Module):
    def __init__(self, embedding_matrix):
        super(BLSTM_Span_Model, self).__init__()
        self.input_dim = embedding_matrix.size()[1]

        self.hidden_dim = 256
        self.linear_output_dim = 128
        self.num_classes = 2
        self.dropout_factor = 0.33

        self.embedding = torch.nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)

        self.lstm = torch.nn.LSTM(self.input_dim, self.hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = torch.nn.Dropout(self.dropout_factor)
        self.linear = torch.nn.Linear(self.hidden_dim * 2, self.linear_output_dim)
        self.elu = torch.nn.ELU()
        self.classifier = torch.nn.Linear(self.linear_output_dim, 1)

    def forward(self, x, lengths):
        x = self.embedding(x)
        x = x.to(torch.float32)
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), enforce_sorted=False, batch_first=True)
        x, _ = self.lstm(x)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        x = self.dropout(x)
        x = self.linear(x)
        x = self.elu(x)
        x = self.classifier(x)
        return x.squeeze()

In [16]:
def train(model, train_dataloader, test_dataloader, epochs, loss_fn, optimizer, scheduler, device):
    best_dev_loss = float('inf')
    best_train_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for words, lengths, labels in train_dataloader:
            optimizer.zero_grad()
            words, labels, lengths = words.to(device), labels.to(device), lengths.to(device)
            pred = model(words, lengths)
            loss = loss_fn(pred, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.detach().item()
        model.eval()
        test_loss = 0.0
        for words, lengths, labels in test_dataloader:
            words, labels, lengths = words.to(device), labels.to(device), lengths.to(device)
            pred = model(words, lengths)
            loss = loss_fn(pred, labels)
            test_loss += loss.detach().item()
        training_loss = running_loss / len(train_dataloader)
        dev_loss = test_loss / len(test_dataloader)
        # scheduler.step()

        # Saves the model with the lowest training and dev losses for testing purposes
        # if training_loss < best_train_loss:
        #     torch.save(model, 'best_model_train.pt')
        #     best_train_loss = training_loss
        #     print(f"Model Saved with {best_train_loss}")
        # if dev_loss < best_dev_loss:
        #     torch.save(model, 'best_model_dev.pt')
        #     best_dev_loss = dev_loss

        print(f'Epoch {epoch}: Training Loss = {training_loss}, Test Loss = {dev_loss}')

        # if test_loss / len(test_dataloader) < best_dev_loss:
        #     best_dev_loss = test_loss / len(test_dataloader)


In [17]:
def get_balanced_bce_loss(y_train, device):
    # Calculate the number of samples in each class
    flat_list = np.array([x for y in y_train for x in y])
    num_positive = np.sum(flat_list)
    num_negative = len(flat_list) - num_positive

    # Calculate class weights
    weight_for_positive = num_negative / (num_positive + num_negative)
    weight_for_negative = num_positive / (num_positive + num_negative)

    # Pass the weights to BCEWithLogitsLoss
    pos_weight = torch.tensor([weight_for_positive / weight_for_negative], dtype=torch.float32)
    print(pos_weight)
    balanced_bce_loss = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))

    return balanced_bce_loss

In [22]:
'''
Ideas for improvement
- Class weighting, much more negative examples than positive examples
- Varying architectures, LSTM based
- Varying preprocessing strategies
'''

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Training on {device}")
epochs = 50
lr = .003

blstm_model = BLSTM_Span_Model(embedding_matrix)
blstm_model.to(device)
optimizer = torch.optim.Adam(blstm_model.parameters(), lr)
loss_fn = get_balanced_bce_loss(y_train, device)

# not used 
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

train(blstm_model, train_dataloader, test_dataloader, epochs, loss_fn, optimizer, scheduler, device)
torch.save(blstm_model.cpu(), 'drive/MyDrive/NLP_Project/blstm_span_model.pt')

Training on cuda
tensor([15.0522])
Epoch 0: Training Loss = 0.31473956718427115, Test Loss = 0.28179820042575465
Epoch 1: Training Loss = 0.2632695418215941, Test Loss = 0.2770774563689116
Epoch 2: Training Loss = 0.19962051362402933, Test Loss = 0.2909079077992478
Epoch 3: Training Loss = 0.16426380576313065, Test Loss = 0.339003836851557
Epoch 4: Training Loss = 0.14244134082716112, Test Loss = 0.3572695937680427
Epoch 5: Training Loss = 0.12989818540091735, Test Loss = 0.44687661273540513
Epoch 6: Training Loss = 0.12127801770021149, Test Loss = 0.4620884959347486
Epoch 7: Training Loss = 0.11153995309208672, Test Loss = 0.518740848891658
Epoch 8: Training Loss = 0.10614307931574674, Test Loss = 0.5853761541393568
Epoch 9: Training Loss = 0.09990778392851152, Test Loss = 0.666905722869535
Epoch 10: Training Loss = 0.09589826628429958, Test Loss = 0.7324954799484371
Epoch 11: Training Loss = 0.09383061766916236, Test Loss = 0.7083610137196564
Epoch 12: Training Loss = 0.0879303927745

In [21]:
lr = .003
model = torch.load('drive/MyDrive/NLP_Project/blstm_span_model.pt')
device = torch.device('cpu')
optimizer = torch.optim.Adam(model.parameters(), lr)
# loss_fn = get_balanced_bce_loss(y_train, device)

preds=[]
true=[]
true2=[]
model.eval()
for i, (words, lengths, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        words, lengths, labels = words.to(device),lengths.to(device),labels.to(device)
        logits = model(words,lengths).cpu()
#         mask = labels>=0 
# #         print(labels)
#         labels = labels[mask]
#         print("labels: ",labels)
        # logits = logits.permute(0,2,1)[mask].view(-1, tagset_size-1)
#         print("preds:", torch.argmax(logits,dim=1))
        true += labels.cpu()
        true2.extend(labels.cpu().numpy())
        # make sure to put it through sigmoid at the end
        p = np.round(torch.sigmoid(logits).cpu().detach().numpy())
        preds.extend(p)

In [23]:
print(f'Accuracy: {accuracy_score(torch.cat(true).numpy(), np.concatenate(preds))}')
print(f'F1 Score: {f1_score(torch.cat(true).numpy(), np.concatenate(preds))}')
print(f'Recall Score: {recall_score(torch.cat(true).numpy(), np.concatenate(preds))}')
print(f'Precision Score: {precision_score(torch.cat(true).numpy(), np.concatenate(preds))}')

Accuracy: 0.9506490890131787
F1 Score: 0.31146241025057214
Recall Score: 0.7209410097045034
Precision Score: 0.19863953412681323


In [24]:
preds=[]
true=[]
true2=[]
model.eval()
for i, (words, lengths, labels) in enumerate(test_dataloader):
        optimizer.zero_grad()
        words, lengths, labels = words.to(device),lengths.to(device),labels.to(device)
        logits = model(words,lengths).cpu()
#         mask = labels>=0 
# #         print(labels)
#         labels = labels[mask]
#         print("labels: ",labels)
        # logits = logits.permute(0,2,1)[mask].view(-1, tagset_size-1)
#         print("preds:", torch.argmax(logits,dim=1))
        true += labels.cpu()
        true2.extend(labels.cpu().numpy())
        # make sure to put it through sigmoid at the end
        p = np.round(torch.sigmoid(logits).cpu().detach().numpy())
        preds.extend(p)

In [25]:
print(f'Accuracy: {accuracy_score(torch.cat(true).numpy(), np.concatenate(preds))}')
print(f'F1 Score: {f1_score(torch.cat(true).numpy(), np.concatenate(preds))}')
print(f'Recall Score: {recall_score(torch.cat(true).numpy(), np.concatenate(preds))}')
print(f'Precision Score: {precision_score(torch.cat(true).numpy(), np.concatenate(preds))}')

Accuracy: 0.9423065137752115
F1 Score: 0.22933547589991585
Recall Score: 0.5627766824150887
Precision Score: 0.1440103571700556
