In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report
from sklearn.model_selection import GridSearchCV
import torch
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence
import torch.optim.lr_scheduler as lr_scheduler
import json
import math

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data if you haven't done it before
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def load_embeddings(filename):
    """Load whitespace-separated word vectors (GloVe text format) into a dict.

    Every non-blank line is read as a token followed by its vector
    components. Blank or whitespace-only lines are skipped instead of raising
    ``IndexError``.

    Args:
        filename: Path to a UTF-8 text file of embeddings.

    Returns:
        dict mapping each token (str) to a 1-D float64 ``numpy`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word2vec = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            word2vec[parts[0]] = np.array([float(x) for x in parts[1:]])
    return word2vec

word2vec = load_embeddings('drive/MyDrive/NLP_HW4_data/glove.6B.100d')

In [None]:
train_data = pd.read_csv('drive/MyDrive/NLP_Project_Data/train.csv')

In [None]:
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [None]:
train_data[train_data['toxic']==0][list(train_data.columns)[2:]].sum()

toxic              0
severe_toxic       0
obscene          523
threat            29
insult           533
identity_hate    103
dtype: int64

In [None]:
# Binary target: 1 when at least one of the six toxicity flags is set, else 0.
# The flag columns are selected by name rather than with `iloc[:, 2:]` so the
# cell is idempotent: on a re-run the freshly added `label` column is not
# folded into its own sum (which can produce a label of 2).
TRAIN_FLAG_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_data['label'] = np.ceil(
    train_data[TRAIN_FLAG_COLUMNS].sum(axis=1) / len(TRAIN_FLAG_COLUMNS)
).astype(int)

In [None]:
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,label
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,0


In [None]:
train_data['label'].value_counts()

0    143346
1     16225
Name: label, dtype: int64

In [None]:
def preprocess_text(text):
    """Normalise a raw comment into a lower-cased, lemmatised token string.

    Steps: lower-case, replace every character that is neither a word
    character nor whitespace with a space, tokenise with NLTK, lemmatise each
    token with WordNet and join the tokens with single spaces. Stop-word
    removal is intentionally not applied.

    Args:
        text: Raw comment. Non-string values (for example the float NaN that
            pandas uses for a missing cell) are treated as an empty comment.

    Returns:
        str: Space-separated lemmatised tokens; '' for non-string input.
    """
    if not isinstance(text, str):
        # `Series.apply` passes NaN for missing cells; without this guard
        # `text.lower()` raises AttributeError.
        return ''
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize the text
    words = nltk.word_tokenize(text)
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Reconstruct the preprocessed text
    preprocessed_text = ' '.join(words)

    return preprocessed_text

In [None]:
train_data['preprocessed_text'] = train_data['comment_text'].apply(preprocess_text)

In [None]:
# Take an independent copy of the two modelling columns. Without `.copy()`
# pandas treats the result as a possible view of `train_data` and emits
# SettingWithCopyWarning when a new column is assigned to it later.
processed_data = train_data[['preprocessed_text','label']].copy()
processed_data.head()

Unnamed: 0,preprocessed_text,label
0,explanation why the edits made under my userna...,0
1,d aww he match this background colour i m seem...,0
2,hey man i m really not trying to edit war it s...,0
3,more i can t make any real suggestion on impro...,0
4,you sir are my hero any chance you remember wh...,0


In [None]:
# Assign every GloVe token an integer id, starting at 2 and following the
# iteration order of `word2vec`. Ids 0 and 1 are left free; elsewhere in this
# notebook `text_to_indices` encodes out-of-vocabulary tokens as 0.
word_to_idx = {word: i+2 for i, word in enumerate(word2vec)}
# Persist the mapping so later sessions can reuse exactly the same ids.
with open('drive/MyDrive/NLP_Project_Data/word_to_idx_Glove.json',"w") as f:
    json.dump(word_to_idx,f)

In [None]:
with open('drive/MyDrive/NLP_Project_Data/word_to_idx_Glove.json',"r") as f:
    word_to_idx = json.load(f)

In [None]:
def text_to_indices(text, word_to_index):
    """Encode a whitespace-tokenised string as a list of vocabulary ids.

    Args:
        text: String whose tokens are separated by whitespace.
        word_to_index: Mapping from token to integer id.

    Returns:
        list: One id per token, in order; tokens missing from
        ``word_to_index`` are encoded as 0.
    """
    lookup = word_to_index.get
    return [lookup(token, 0) for token in text.split()]


In [None]:
processed_data['indexed_text'] = processed_data['preprocessed_text'].apply(lambda x: text_to_indices(x, word_to_idx))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['indexed_text'] = processed_data['preprocessed_text'].apply(lambda x: text_to_indices(x, word_to_idx))


In [None]:
# Save next to the other project artefacts on Drive. A later cell reads this
# file back from `/content/drive/MyDrive/NLP_Project_Data/`, so writing it to the
# Colab working directory would leave that cell reading a stale or missing file.
processed_data.to_csv('drive/MyDrive/NLP_Project_Data/processed_training_data_3.csv',index = False)

In [None]:
# Build the (vocabulary + 2) x dim lookup table used to initialise the
# embedding layer. The vector width is taken from the loaded GloVe vectors
# instead of being hard-coded.
embedding_dim = len(next(iter(word2vec.values())))
embedding_matrix = np.zeros((len(word_to_idx)+2, embedding_dim))

# Rows 0 and 1 do not correspond to GloVe tokens and get random vectors.
# A seeded generator makes them identical on every run.
_special_rows_rng = np.random.default_rng(0)
embedding_matrix[0] = _special_rows_rng.standard_normal(embedding_dim)
embedding_matrix[1] = _special_rows_rng.standard_normal(embedding_dim)

# Fill in embedding matrix with GloVe vectors. Each vector goes to the row
# recorded for its token in `word_to_idx`, so the table always agrees with the
# ids produced by `text_to_indices`, regardless of `word2vec` iteration order.
for word, idx in word_to_idx.items():
    embedding_matrix[idx] = word2vec[word]

In [None]:
import ast 

# Reload the processed training set from Drive.
data = pd.read_csv('/content/drive/MyDrive/NLP_Project_Data/processed_training_data_3.csv')
# The CSV holds each id list as text; parse it back into a Python list.
data['indexed_text'] = data['indexed_text'].apply(ast.literal_eval)
data.head()

Unnamed: 0,preprocessed_text,label,indexed_text
0,explanation why the edits made under my userna...,0,"[5293, 740, 2, 20409, 118, 126, 194, 84075, 16..."
1,d aww he match this background colour i m seem...,0,"[1970, 199073, 20, 552, 39, 2095, 8239, 43, 19..."
2,hey man i m really not trying to edit war it s...,0,"[7944, 302, 43, 1995, 590, 38, 597, 6, 16843, ..."
3,more i can t make any real suggestion on impro...,0,"[58, 43, 88, 2161, 161, 132, 569, 6984, 15, 36..."
4,you sir are my hero any chance you remember wh...,0,"[83, 2701, 34, 194, 3646, 132, 1021, 83, 2717,..."


In [None]:
def dynamic_padding(batch):
    """Collate ``(token_ids, label)`` items into padded batch tensors.

    Args:
        batch: Sequence of items where ``item[0]`` is a list of token ids and
            ``item[1]`` is the label.

    Returns:
        Tuple ``(inputs, labels, lengths)``: ``inputs`` holds the token ids
        right-padded with 0 to the longest sequence in the batch
        (batch-first), ``labels`` is a 1-D tensor of the labels and
        ``lengths`` is a 1-D tensor of the unpadded sequence lengths.
    """
    sequences = []
    targets = []
    seq_lengths = []
    for item in batch:
        sequence = torch.tensor(item[0])
        sequences.append(sequence)
        seq_lengths.append(len(sequence))
        targets.append(item[1])

    target_tensor = torch.tensor(targets)
    # Pad every sequence up to the longest one in this batch.
    padded = pad_sequence(sequences, batch_first=True)

    return padded, target_tensor, torch.tensor(seq_lengths)

In [None]:
# Convert the lookup table to a float32 tensor for `nn.Embedding.from_pretrained`.
# `torch.as_tensor` accepts both a NumPy array and an existing tensor, so this
# cell can be re-run without failing on the missing `Tensor.astype` method.
embedding_matrix = torch.as_tensor(embedding_matrix, dtype=torch.float32)
embedding_matrix.requires_grad = False

In [None]:
class Model1_project(nn.Module):
    """Bidirectional GRU binary classifier over frozen pretrained embeddings.

    Pipeline: embedding lookup -> packed bidirectional GRU -> mean over the
    time axis -> dropout -> linear -> sigmoid.

    Args:
        input_size: Unused; kept so existing call sites keep working.
        embedding_size: Width of each embedding vector (GRU input size).
        num_layers: Number of stacked GRU layers.
        hidden_size: GRU hidden size per direction.
        lstm_dropout: Dropout probability applied to the pooled features.
        output_size: Unused; kept so existing call sites keep working.
        final_output: Number of output units of the linear layer.
        pretrained_embeddings: Optional float tensor of shape
            ``(vocab, embedding_size)``. When omitted, the module-level
            ``embedding_matrix`` is used, as before.
        masked_pooling: When True, the time-axis mean divides by each
            sequence's own length, so a sequence's score no longer depends on
            the longest sequence in its batch. The default (False) keeps the
            original averaging over the padded batch length, so checkpoints
            trained with it produce unchanged outputs.
    """

    def __init__(self, input_size, embedding_size, num_layers, hidden_size, lstm_dropout, output_size, final_output,
                 pretrained_embeddings=None, masked_pooling=False):
        super(Model1_project, self).__init__()
        if pretrained_embeddings is None:
            # Backward-compatible default: fall back to the notebook-level table.
            pretrained_embeddings = embedding_matrix
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.blstm = nn.GRU(
            embedding_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True)
        self.linear = nn.Linear(2*hidden_size, final_output)
        self.dropout = nn.Dropout(p=lstm_dropout)

        self.sigmoid = nn.Sigmoid()
        self.masked_pooling = masked_pooling

    def forward(self, inputs, lengths):
        """Return sigmoid probabilities for a padded batch of token ids.

        Args:
            inputs: Integer tensor of token ids, batch-first.
            lengths: 1-D tensor with the unpadded length of each sequence.

        Returns:
            Tensor of probabilities with all size-1 axes squeezed out.
        """
        embedded = self.embedding(inputs)
        # pack_padded_sequence requires the lengths to live on the CPU.
        x = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        output, _ = self.blstm(x)
        output, _ = pad_packed_sequence(output, batch_first=True)
        # getattr: instances unpickled from checkpoints saved before this flag
        # existed do not carry the attribute.
        if getattr(self, 'masked_pooling', False):
            output = self.mean_pooling(output, lengths)
        else:
            output = self.mean_pooling(output)
        drop_output = self.dropout(output)
        linear_output = self.linear(drop_output)
        output = self.sigmoid(linear_output)

        # NOTE: squeeze() also removes the batch axis when the batch holds a
        # single sequence, so callers receive a 0-d tensor in that case.
        return output.squeeze()

    def mean_pooling(self, x, lengths=None):
        """Average ``x`` over the time axis (dim 1).

        Without ``lengths`` this is a plain mean over every time step,
        padding included. With ``lengths`` the per-sequence sum is divided by
        that sequence's own length; this relies on padded steps being zero,
        which is what ``pad_packed_sequence`` produces by default.
        """
        if lengths is None:
            return torch.mean(x,1)
        denom = lengths.to(device=x.device, dtype=x.dtype).clamp(min=1).unsqueeze(1)
        return x.sum(dim=1) / denom

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Number of tokens that have an id in the vocabulary mapping.
vocab_size = len(word_to_idx) 
tagset_size = 2

# Required hyperparams: 
EMBEDDING_DIM = 100
NUM_LSTM_LAYERS = 1
HIDDEN_DIM = 512
DROPOUT = 0.2
LIN_OUTPUT_DIM=64

# Changeable hyperparams
# NOTE: NUM_EPOCHS and LEARNING_RATE are not read by the training cells visible
# in this notebook, which hard-code 25 epochs and Adam with lr=5e-5.
NUM_EPOCHS =50
BATCH_SIZE = 64
LEARNING_RATE = 0.3
# Two classes are represented by a single output unit.
FINAL_OUTPUT = tagset_size-1
# Two extra rows on top of the vocabulary, matching the embedding table size.
INPUT_DIM = vocab_size +2 

# Shuffled training batches of (token ids, label) pairs, padded per batch.
train_loader = DataLoader(
    list(zip(data['indexed_text'],data['label'])), 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=dynamic_padding)
# dev_loader = DataLoader(
#     list(zip(dev_sentences,dev_labels)), 
#     batch_size=BATCH_SIZE, 
#     shuffle=True, 
#     collate_fn=dynamic_padding)

# model = Model1_project(INPUT_DIM, EMBEDDING_DIM, NUM_LSTM_LAYERS, HIDDEN_DIM, DROPOUT, LIN_OUTPUT_DIM, FINAL_OUTPUT)
# criterion = nn.CrossEntropyLoss(ignore_index=-1)
# Binary cross-entropy on probabilities (the model applies the sigmoid itself).
criterion  = nn.BCELoss()
# model.to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
# scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)


In [None]:
# Load a checkpoint from the current working directory.
# NOTE: the next cell rebinds `model` to a different checkpoint, so this load
# only matters if that cell is skipped.
model = torch.load('model1_project.pt')

In [None]:
# Load the checkpoint stored on Drive (the same file name a later cell saves to).
model = torch.load('/content/drive/MyDrive/NLP_Project_Data/model1_corrected_data_v9.pt')

In [None]:
model

Model1_project(
  (embedding): Embedding(400002, 100)
  (blstm): GRU(100, 512, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)

In [None]:
!pip install torchviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchviz
  Downloading torchviz-0.0.2.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... [?25l[?25hdone
  Created wheel for torchviz: filename=torchviz-0.0.2-py3-none-any.whl size=4147 sha256=b0d9abcbfd69973e12a3fa15c4a356fa1469abd7eb58398caa493722a809653f
  Stored in directory: /root/.cache/pip/wheels/29/65/6e/db2515eb1dc760fecd36b40d54df65c1e18534013f1c037e2e
Successfully built torchviz
Installing collected packages: torchviz
Successfully installed torchviz-0.0.2


In [None]:
!pip install onnx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnx
  Downloading onnx-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx
Successfully installed onnx-1.13.1


In [None]:
# Smoke test: pull one batch and run a single forward pass.
# `yhat` and `targets` are not used again in the cells visible here.
inputs,targets, lengths= next(iter(train_loader))
yhat = model(inputs,lengths)

In [None]:
# Move the model to the selected device and create the optimizer used by the
# training loop (Adam with a fixed learning rate of 5e-5).
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

In [None]:
# Train for 25 epochs and report the mean per-batch BCE loss of each epoch.
for epoch in range(25):
    train_loss = 0.0
    model.train()
    for inputs, labels, lengths in train_loader:
        optimizer.zero_grad()
        inputs,labels,lengths = inputs.to(device),labels.to(device),lengths.to(device)
        # The model applies the sigmoid itself, so these are probabilities.
        # reshape(-1) restores the batch axis when the final batch holds a
        # single comment: the model's squeeze() then returns a 0-d tensor and
        # BCELoss rejects the size mismatch with the (1,) label tensor.
        probs = model(inputs,lengths).reshape(-1)
        loss = criterion(probs, labels.float())
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch: {epoch}  Train Loss:{train_loss/len(train_loader)} ")

TypeError: ignored

In [None]:
# Save the whole model object to Drive. `model.cpu()` moves the module's
# parameters to the CPU in place, so the model has to be moved back with
# `model.to(device)` before it is used on the GPU again.
torch.save(model.cpu(),'drive/MyDrive/NLP_Project_Data/model1_corrected_data_v9.pt')

In [None]:
model = torch.load('drive/MyDrive/NLP_Project_Data/model1_corrected_data_v9.pt')

In [None]:
model.to(device)

Model1_project(
  (embedding): Embedding(400002, 100)
  (blstm): GRU(100, 512, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)

In [None]:
next(iter(train_loader))

(tensor([[ 11658,  40496,      5,  ...,      0,      0,      0],
         [    98,  13688,      8,  ...,      0,      0,      0],
         [  1303,  12695,    839,  ...,      0,      0,      0],
         ...,
         [     0,    839,   2732,  ...,      0,      0,      0],
         [    43,    255,     83,  ...,      0,      0,      0],
         [231558,    394,      0,  ...,      0,      0,      0]]),
 tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]),
 tensor([216,  59,  30,  10,  14,  85,  65,  43,  57,  24,  59,   5,  22,  48,
          52, 128,  13,  45,  68,  35,  16,  75,  43,  53,   6,  42, 219,  30,
         186,  32,  31, 129, 223,  20,  11, 349, 187,  44, 654, 129,  79,  29,
          38,  10,  21, 189,  43,  12,  32, 104, 179,  84, 116,   4, 140,  91,
         113,   7,  90,  34,   9,   7,  81,  12]))

In [None]:
# Rebuild the loader over the same training pairs without shuffling, so the
# collected predictions line up with the row order of `data`.
# NOTE: this rebinds `train_loader`; re-running the training cell afterwards
# would iterate in fixed order.
train_loader = DataLoader(
    list(zip(data['indexed_text'],data['label'])), 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=dynamic_padding)

In [None]:
# Collect rounded predictions and gold labels over `train_loader`.
# NOTE: this loader iterates the training data, so the scores computed from
# these lists measure fit to the training set, not held-out performance.
preds=[]
true=[]    # gold labels as plain Python ints
true2=[]   # gold labels as NumPy integer scalars
model.eval()
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for inputs, labels, lengths in train_loader:
        inputs,lengths = inputs.to(device),lengths.to(device)
        # reshape(-1) keeps a batch axis even when the final batch holds a
        # single comment (the model's squeeze() returns a 0-d tensor then,
        # which cannot be iterated by list.extend).
        probs = model(inputs,lengths).reshape(-1).cpu()
        labels = labels.cpu()
        true.extend(labels.tolist())
        true2.extend(labels.numpy())
        # Round the sigmoid outputs to the nearest class (0.0 or 1.0).
        preds.extend(np.round(probs.numpy()))

In [None]:
# Map any gold label equal to 2 back to the positive class (1), in place.
for position, label_value in enumerate(true):
    if label_value == 2:
        true[position] = 1

In [None]:
accuracy_score(true,preds)

0.998934643512919

In [None]:
#old
print(classification_report(true,preds))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99    143346
           1       0.94      0.82      0.88     16225

    accuracy                           0.98    159571
   macro avg       0.96      0.91      0.93    159571
weighted avg       0.98      0.98      0.98    159571



In [None]:
print(classification_report(true,preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    143346
           1       1.00      0.99      0.99     16225

    accuracy                           1.00    159571
   macro avg       1.00      1.00      1.00    159571
weighted avg       1.00      1.00      1.00    159571



In [None]:
list(data['label'][:5000])

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [None]:
preds[:5000]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [None]:
# Write the first 5000 predictions to a text file, one value per line.
with open("jigsaw_predictions.txt","w") as f:
  f.write("\n".join([str(x) for x in preds[:5000]]))

In [None]:
### code for getting predictions on test set

def preprocess_text(text):
    """Normalise a raw comment into a lower-cased, lemmatised token string.

    A function with this name is already defined earlier in this notebook;
    running this cell rebinds the name to this definition.

    Args:
        text: Raw comment string.

    Returns:
        str: The lemmatised tokens joined by single spaces.
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize the text
    words = nltk.word_tokenize(text)
    # Remove stopwords (optional)
    # stop_words = set(stopwords.words('english'))
    # words = [word for word in words if word not in stop_words]
    # Lemmatize words (optional)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Reconstruct the preprocessed text
    preprocessed_text = ' '.join(words)

    return preprocessed_text

##getting word embeddings index
# Reload the token -> id mapping from the JSON file on Drive.
with open('drive/MyDrive/NLP_Project_Data/word_to_idx_Glove.json',"r") as f:
    word_to_idx = json.load(f)

def sentence_dynamic_padding(batch):
    """Collate a flat list of token ids into one single-sequence batch.

    Args:
        batch: List of integer token ids; together they form one sequence.

    Returns:
        Tuple ``(inputs, lengths)`` where ``inputs`` has shape
        ``(1, len(batch))`` and ``lengths`` is a one-element tensor holding
        ``len(batch)``.
    """
    token_ids = torch.tensor(batch)
    n_tokens = len(token_ids)
    # Reshape to a single row before handing it to pad_sequence.
    as_row = token_ids.view(1,-1)
    padded = pad_sequence(as_row, batch_first=True)

    return padded, torch.tensor([n_tokens])


  
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Model1_project(nn.Module):
    """Bidirectional GRU binary classifier over frozen pretrained embeddings.

    A class with this name is already defined earlier in this notebook;
    running this cell rebinds the name to this definition. The embedding
    weights are read from the module-level ``embedding_matrix``.
    ``input_size`` and ``output_size`` are accepted but not used.
    """

    def __init__(self, input_size, embedding_size, num_layers, hidden_size, lstm_dropout, output_size, final_output):
        super(Model1_project, self).__init__()
        # Frozen lookup table initialised from the notebook-level matrix.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.blstm = nn.GRU(
            embedding_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True)
        # 2*hidden_size: forward and backward GRU outputs are concatenated.
        self.linear = nn.Linear(2*hidden_size, final_output)
        self.dropout = nn.Dropout(p=lstm_dropout)
        
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs, lengths):
        """Return sigmoid probabilities for padded token ids and their lengths."""
        embedded = self.embedding(inputs)
        # Pack so the GRU skips padded positions; lengths are moved to the CPU first.
        x = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        output, hidden = self.blstm(x)
        output, _ = pad_packed_sequence(output, batch_first=True)
        # output = torch.max(output, dim=1)
        # Mean over the time axis of the re-padded output (padded steps included).
        output = self.mean_pooling(output)
        drop_output = self.dropout(output)
        linear_output = self.linear(drop_output)
        output = self.sigmoid(linear_output)

        # squeeze() removes every size-1 axis, including a batch axis of size 1.
        return output.squeeze()
    def mean_pooling(self,x):
        """Average ``x`` over dimension 1."""
        return torch.mean(x,1)

# model = torch.load('model1_project.pt')
 
def text_to_indices(text, word_to_index):
    """Translate whitespace-separated tokens into their vocabulary ids.

    Tokens that are not keys of ``word_to_index`` are encoded as 0.
    """
    encoded = []
    for token in text.split():
        encoded.append(word_to_index.get(token, 0))
    return encoded


def generate_toxicity_token(input_sentence,model,word_to_index):
    """Predict the rounded toxicity score (0.0 or 1.0) for one sentence.

    The sentence is preprocessed, encoded and scored in a single forward
    pass. Feeding the token ids through a DataLoader instead would cut
    sentences longer than BATCH_SIZE tokens into chunks and keep only the
    prediction of the last chunk.

    Args:
        input_sentence: Raw text to classify.
        model: Trained model called as ``model(inputs, lengths)``.
        word_to_index: Mapping from token to vocabulary id.

    Returns:
        The ``np.round`` of the model output for the sentence, or an empty
        list when the sentence yields no tokens.
    """
    processed_text = preprocess_text(input_sentence)
    tokenized_text = text_to_indices(processed_text,word_to_index)
    if not tokenized_text:
        # Nothing to score; an empty list is also what an empty loop yields.
        return []

    model.eval()
    # One batch holding the complete sentence.
    inputs = torch.tensor([tokenized_text]).to(device)
    lengths = torch.tensor([len(tokenized_text)]).to(device)
    with torch.no_grad():
        output = model(inputs,lengths)
    return np.round(output.cpu().numpy())

In [None]:
def generate_toxicity_token(input_sentence,model,word_to_index):
    """Predict the rounded toxicity score (0.0 or 1.0) for one sentence.

    The sentence is preprocessed, encoded and scored in a single forward
    pass. Feeding the token ids through a DataLoader instead would cut
    sentences longer than BATCH_SIZE tokens into chunks and keep only the
    prediction of the last chunk.

    Args:
        input_sentence: Raw text to classify.
        model: Trained model called as ``model(inputs, lengths)``.
        word_to_index: Mapping from token to vocabulary id.

    Returns:
        The ``np.round`` of the model output for the sentence, or an empty
        list when the sentence yields no tokens.
    """
    processed_text = preprocess_text(input_sentence)
    tokenized_text = text_to_indices(processed_text,word_to_index)
    if not tokenized_text:
        # Nothing to score; an empty list is also what an empty loop yields.
        return []

    model.eval()
    # One batch holding the complete sentence.
    inputs = torch.tensor([tokenized_text]).to(device)
    lengths = torch.tensor([len(tokenized_text)]).to(device)
    with torch.no_grad():
        output = model(inputs,lengths)
    return np.round(output.cpu().numpy())

In [None]:
data['preprocessed_text'][0]

'explanation why the edits made under my username hardcore metallica fan were reverted they weren t vandalism just closure on some gas after i voted at new york doll fac and please don t remove the template from the talk page since i m retired now 89 205 38 27'

In [None]:
generate_toxicity_token(data['preprocessed_text'][0],model,word_to_idx)

0.0

In [None]:
def sentence_dynamic_padding(batch):
    """Collate a flat list of token ids into one single-sequence batch.

    Returns ``(inputs, lengths)``: the ids reshaped to one row, and a
    one-element tensor holding the number of ids in ``batch``.
    """

    inputs = torch.tensor(batch)#[torch.tensor(item) for item in batch]
    # print(inputs)
    # Number of token ids handed to this call.
    lengths = len(inputs)
    inputs = pad_sequence(inputs.view(1,-1), batch_first=True)
    
    return inputs, torch.tensor([lengths])

# Loader over the token ids of the first training comment. The dataset items
# are individual ids, so each DataLoader batch (up to BATCH_SIZE ids) is
# handed to the collate function as one sequence.
sentence_loader = DataLoader(
    list(data['indexed_text'][0]), 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=sentence_dynamic_padding)

In [None]:
# Score every batch produced by `sentence_loader` and print the rounded output.
for inputs,lengths in sentence_loader:
    inputs,lengths = inputs.to(device),lengths.to(device)
    output = model(inputs,lengths)
    print(np.round(output.cpu().detach().numpy()))

0.0


In [None]:
type(data['indexed_text'][0])

list

In [None]:
# NOTE(review): `forward` takes `inputs` and `lengths`, so calling the model
# with no arguments raises a TypeError. Looks like a leftover scratch cell.
model()

In [None]:
test_data = pd.read_csv('drive/MyDrive/NLP_Project_Data/test.csv')
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
test_data2 = pd.read_csv('drive/MyDrive/NLP_Project_Data/test_labels.csv')


In [None]:
test_data2.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [None]:
# Replace every value in the columns after `id` with its absolute value.
# NOTE(review): the preview of this frame shows -1 in every label column.
# In the Kaggle Jigsaw release -1 presumably marks rows that were excluded
# from scoring — confirm against the dataset description. If so, abs() turns
# those rows into positives for every category, and any metric computed from
# them is not meaningful; such rows should likely be removed from both test
# frames instead.
test_data2[list(test_data2.columns)[1:]]=test_data2[list(test_data2.columns)[1:]].apply(abs)
test_data2.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1,1,1,1,1,1
1,0000247867823ef7,1,1,1,1,1,1
2,00013b17ad220c46,1,1,1,1,1,1
3,00017563c3f7919a,1,1,1,1,1,1
4,00017695ad8997eb,1,1,1,1,1,1


In [None]:
# Collapse the six per-category flags into one binary label.
# The columns are selected by name: this frame has no `comment_text` column,
# so the positional slice `iloc[:, 2:]` would start at `severe_toxic` and
# silently leave `toxic` out of the sum.
TEST_FLAG_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
test_data2['label'] = np.ceil(
    test_data2[TEST_FLAG_COLUMNS].sum(axis=1) / len(TEST_FLAG_COLUMNS)
).astype(int)
test_data2.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,label
0,00001cee341fdb12,1,1,1,1,1,1,1
1,0000247867823ef7,1,1,1,1,1,1,1
2,00013b17ad220c46,1,1,1,1,1,1,1
3,00017563c3f7919a,1,1,1,1,1,1,1
4,00017695ad8997eb,1,1,1,1,1,1,1


In [None]:
def text_to_indices(text, word_to_index):
    """Return the vocabulary id of every whitespace-separated token in ``text``.

    Tokens without an entry in ``word_to_index`` are encoded as 0.
    """
    return list(map(lambda token: word_to_index.get(token, 0), text.split()))
### code for getting predictions on test set

def preprocess_text(text):
    """Normalise a raw comment into a lower-cased, lemmatised token string.

    A function with this name is already defined earlier in this notebook;
    running this cell rebinds the name to this definition.

    Args:
        text: Raw comment string.

    Returns:
        str: The lemmatised tokens joined by single spaces.
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize the text
    words = nltk.word_tokenize(text)
    # Remove stopwords (optional)
    # stop_words = set(stopwords.words('english'))
    # words = [word for word in words if word not in stop_words]
    # Lemmatize words (optional)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Reconstruct the preprocessed text
    preprocessed_text = ' '.join(words)

    return preprocessed_text

In [None]:
# Apply the same preprocessing and id encoding to the test comments.
test_data['processed_text'] = test_data['comment_text'].apply(preprocess_text)
test_data['indexed_text'] = test_data['processed_text'].apply(lambda x: text_to_indices(x, word_to_idx))

In [None]:
# Remove test comments whose encoding is empty (no tokens). `drop_list`
# keeps the removed index labels for reuse on the label frame.
has_no_tokens = test_data['indexed_text'].map(len) == 0
drop_list = test_data.index[has_no_tokens].tolist()
test_data.drop(drop_list,inplace = True)

In [None]:
# Drop the same index labels from the label frame.
# NOTE(review): assumes `test_data` and `test_data2` share the same row index
# for the same comment; the `id` columns are never compared — confirm.
test_data2.drop(drop_list,inplace = True)

In [None]:
# Pair each encoded test comment with its label by position and batch them.
# NOTE: shuffle=True only changes the order in which batches are evaluated.
test_loader = DataLoader(
    list(zip(test_data['indexed_text'],test_data2['label'])), 
    batch_size=64, 
    shuffle=True, 
    collate_fn=dynamic_padding)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
model.to(device)

Model1_project(
  (embedding): Embedding(400002, 100)
  (blstm): GRU(100, 512, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)

In [None]:
# Collect rounded predictions and gold labels over `test_loader`, then report
# accuracy and the per-class classification report.
preds=[]
true=[]    # gold labels as plain Python ints
true2=[]   # gold labels as NumPy integer scalars
model.eval()
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for inputs, labels, lengths in test_loader:
        inputs,lengths = inputs.to(device),lengths.to(device)
        # reshape(-1) keeps a batch axis even when the final batch holds a
        # single comment (the model's squeeze() returns a 0-d tensor then,
        # which cannot be iterated by list.extend).
        probs = model(inputs,lengths).reshape(-1).cpu()
        labels = labels.cpu()
        true.extend(labels.tolist())
        true2.extend(labels.numpy())
        # Round the sigmoid outputs to the nearest class (0.0 or 1.0).
        preds.extend(np.round(probs.numpy()))

# Map any gold label equal to 2 back to the positive class (1).
for i in range(len(true)):
    if true[i]==2:
      true[i]=1

print("Accuracy Score: ",accuracy_score(true,preds))
print(classification_report(true,preds))

Accuracy Score:  0.5282523510971787
              precision    recall  f1-score   support

           0       0.45      0.91      0.60     59426
           1       0.83      0.29      0.43     93694

    accuracy                           0.53    153120
   macro avg       0.64      0.60      0.51    153120
weighted avg       0.68      0.53      0.49    153120



In [None]:
# Load a different dataset of comments from the working directory.
# NOTE: this rebinds `test_data`, which until here held the Jigsaw test set.
test_data = pd.read_csv('pos_tags_dataset.csv')
test_data.head()

Unnamed: 0.1,Unnamed: 0,comments,cleaned_comments,pos_tag,words
0,0,A lot of the clients I work with use SVB. This...,a lot of the clients i work with use svb this ...,"['DT', 'NN', 'IN', 'DT', 'NNS', 'PRP', 'VBP', ...","['a', 'lot', 'of', 'the', 'clients', 'i', 'wor..."
1,1,Remember when Jim Cramer a month ago said on h...,remember when jim cramer a month ago said on h...,"['VB', 'WRB', 'NNP', 'NNP', 'DT', 'NN', 'RB', ...","['remember', 'when', 'jim', 'cramer', 'a', 'mo..."
2,2,"The FDIC only insures up to $250k, what happen...",the fdic only insures up to k what happens to ...,"['DT', 'NN', 'RB', 'VBZ', 'RP', 'TO', 'NN', 'W...","['the', 'fdic', 'only', 'insures', 'up', 'to',..."
3,3,"This is a pretty big deal , they had a huge am...",this is a pretty big deal they had a huge amou...,"['DT', 'VBZ', 'DT', 'RB', 'JJ', 'NN', 'PRP', '...","['this', 'is', 'a', 'pretty', 'big', 'deal', '..."
4,4,If anyone's wondering how a bank with 200+ bil...,if anyone s wondering how a bank with billion ...,"['IN', 'NN', 'VBZ', 'VBG', 'WRB', 'DT', 'NN', ...","['if', 'anyone', 's', 'wondering', 'how', 'a',..."


In [None]:
test_data['cleaned_comments'][7420]

' '

In [None]:
# Append the index labels of rows whose encoding is empty to `drop_list`.
# NOTE(review): for the frame loaded just above, the `indexed_text` column is
# only created in a later cell, so this cell depends on out-of-order
# execution. It also appends to the existing `drop_list` without resetting
# it, which is how repeated labels end up in the list.
for i in list(test_data.index):
  if len(test_data['indexed_text'][i])==0:
    drop_list.append(i)

In [None]:
drop_list

[43,
 516,
 1212,
 3010,
 4420,
 4424,
 4446,
 4467,
 5258,
 5642,
 6446,
 6449,
 7603,
 7977,
 9297,
 9307,
 9337,
 12684,
 14677,
 14752,
 14825,
 15226,
 15567,
 17573,
 17609,
 17778,
 17990,
 19731,
 19991,
 20434,
 21514,
 21546,
 28404,
 28465,
 28640,
 29654,
 30141,
 30851,
 32579,
 33144,
 34781,
 37059,
 37705,
 38422,
 38427,
 40254,
 40990,
 41013,
 41132,
 41140,
 41245,
 42600,
 43565,
 43906,
 44293,
 44436,
 47389,
 47605,
 48148,
 48427,
 50638,
 50921,
 53593,
 53696,
 55651,
 55837,
 56410,
 56492,
 57261,
 63709,
 7420,
 11251,
 14751,
 16472,
 40330,
 48348,
 51615,
 65252,
 7420,
 11251,
 14751,
 16472,
 40330,
 48348,
 51615,
 65252]

In [None]:
# Collect the index labels of rows that cannot be scored:
#   * missing comments, which are not strings (pandas reads them as NaN), and
#   * comments with no tokens at all (empty or whitespace-only), whose encoding
#     would be an empty sequence that the packed GRU input rejects.
# Iterating the Series yields the real index labels, and the explicit type
# check replaces a bare `except` that also swallowed unrelated errors.
drop_list = [
    row_label
    for row_label, comment in test_data['cleaned_comments'].items()
    if not isinstance(comment, str) or not comment.split()
]

In [None]:
# Record the dropped index labels, one per line, so the removed rows can be
# identified later.
with open("NaN_indices.txt","w") as f:
  output_str = '\n'.join([str(x) for x in drop_list])
  f.write(output_str)

In [None]:
# Remove the collected rows in place.
# NOTE: not idempotent — re-running raises KeyError once the labels are gone.
test_data.drop(drop_list,inplace = True)

In [None]:
# The `cleaned_comments` column is encoded directly; the notebook's own
# preprocessing step is left disabled for this dataset.
# test_data['processed_text'] = test_data['cleaned_comments'].apply(preprocess_text)
test_data['indexed_text'] = test_data['cleaned_comments'].apply(lambda x: text_to_indices(x, word_to_idx))

In [None]:
def sentence_dynamic_padding(batch):
    """Collate a list of token-id sequences into a padded batch (no labels).

    Args:
        batch: List of token-id lists, one per sentence.

    Returns:
        Tuple ``(inputs, lengths)``: ``inputs`` is the batch right-padded
        with 0 to its longest sequence (batch-first) and ``lengths`` is a
        1-D tensor with the unpadded length of each sequence.
    """
    tensors = list(map(torch.tensor, batch))
    seq_lengths = list(map(len, tensors))
    # Pad the input sequences to the maximum length in the batch
    padded = pad_sequence(tensors, batch_first=True)

    return padded, torch.tensor(seq_lengths)

In [None]:
# Batch the encoded comments in their original order so the predictions line
# up with the rows of `test_data`.
sentence_loader = DataLoader(
    list(test_data['indexed_text']), 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=sentence_dynamic_padding)

preds=[]
model.eval()
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for inputs,lengths in sentence_loader:
        inputs,lengths = inputs.to(device),lengths.to(device)
        # reshape(-1) keeps a batch axis when the final batch holds a single
        # comment; the model's squeeze() otherwise yields a 0-d result that
        # list.extend cannot iterate.
        output = model(inputs,lengths).reshape(-1)
        preds.extend(np.round(output.cpu().numpy()))

In [None]:
preds

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [None]:
# Write every prediction to a text file, one value per line, in loader order.
with open("comments_model1_outputs.txt","w") as f:
  f.write('\n'.join([str(x) for x in preds]))