In [None]:
# Install the pinned dependencies into the environment of the running kernel.
# `%pip` targets the kernel's interpreter, whereas `!pip` runs whichever pip is first on PATH.
%pip install -q torch==2.3.0 torchtext==0.18.0

In [None]:
import torch
import torch.nn as nn
import sklearn as sk
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
import torchtext.vocab as tvc
import nltk
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Download the NLTK resources used below: 'wordnet' for WordNetLemmatizer and
# 'stopwords' for stopwords.words('english').
nltk.download('wordnet')
nltk.download('stopwords')

In [8]:
# Lemmatizer that clean_data() applies to every token.
wordnet = WordNetLemmatizer()
# English stop words that clean_data() removes from every tweet.
stop_words = stopwords.words('english')
# Labeled training data; later cells read its "tweet" and "class" columns.
# NOTE(review): the name `data` is rebound to a CustomDataset in a later cell.
data = pd.read_csv("sample_data/labeled_data.csv")
def clean_data(text, stop_words, max_length, lemmatizer=None):
  """Turn a raw tweet into a list of lemmatized tokens without stop words.

  Steps: lowercase, drop apostrophes, replace every non-letter with a space,
  split on whitespace, remove stop words, truncate, lemmatize.

  Args:
    text: Raw tweet text. A non-string value (for example NaN from an empty
      CSV cell) yields an empty token list instead of raising.
    stop_words: Iterable of lowercase words to discard.
    max_length: Maximum number of tokens to keep.
    lemmatizer: Optional object exposing ``lemmatize(word)``. When omitted,
      the notebook-level ``wordnet`` lemmatizer is used.

  Returns:
    A list of at most ``max_length`` cleaned tokens.
  """
  if not isinstance(text, str):
    return []
  if lemmatizer is None:
    lemmatizer = wordnet
  # Set membership is O(1); a list would be rescanned for every token.
  stop_lookup = stop_words if isinstance(stop_words, (set, frozenset)) else set(stop_words)

  text = text.lower().replace("'", "")
  tokens = re.sub("[^a-zA-Z]", " ", text).split()
  # Lemmatization maps tokens one-to-one, so truncating first gives the same
  # result while skipping work on tokens that would be discarded anyway.
  kept = [word for word in tokens if word not in stop_lookup][:max_length]
  return [lemmatizer.lemmatize(word) for word in kept]
# Replace every raw tweet with its cleaned token list, keeping at most 1000 tokens.
data["tweet"] = data["tweet"].apply(
  lambda raw_tweet: clean_data(raw_tweet, stop_words, 1000)
)


print(type(data["tweet"]))

<class 'pandas.core.series.Series'>


In [9]:
# Plain Python lists: one token list per tweet and one class label per tweet.
dataset_x, dataset_y = data["tweet"].tolist(), data["class"].tolist()
# The whole labeled set is used for training; no split is made in this cell.
X_train, Y_train = dataset_x, dataset_y

In [None]:
min_freq = 2

unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [unk_token, pad_token]

# Wrap every label in its own one-element list (N rows of one label each).
# The training loop depends on this layout: it unwraps the label batch with `output[0]`.
Y_train = np.array(Y_train).reshape(-1,1).tolist()

# Vocabulary over the training tokens that occur at least `min_freq` times,
# with the special tokens added alongside them.
input_train_dataset = tvc.build_vocab_from_iterator(X_train, min_freq = min_freq, specials = special_tokens)
# Tokens missing from the vocabulary resolve to the index of <unk>.
input_train_dataset.set_default_index(input_train_dataset[unk_token])
# NOTE(review): with torchtext's default of placing specials first, <unk> should be
# index 0 and <pad> index 1, while later cells pad with 0 — confirm this is intended.

# Show a small sample instead of every label to keep the cell output readable.
print(Y_train[:5])
print(type(Y_train), len(Y_train))


In [11]:
# Persist the fitted vocabulary object to disk.
torch.save(input_train_dataset, 'vocabulary_hate_speech.pt')

In [None]:
# Convert every token list into a tensor of vocabulary indices.
# The dtype is explicit because torch.tensor([]) defaults to float32, which would
# happen for a tweet left empty by cleaning; embedding lookups need integer indices.
X_train_indices = [
  torch.tensor(input_train_dataset.lookup_indices(tokens), dtype = torch.long)
  for tokens in X_train
]
# Right-pad with 0 up to the longest sequence; result has shape (num_tweets, max_len).
X_train = nn.utils.rnn.pad_sequence(X_train_indices, padding_value = 0, batch_first = True)
X_train.shape


In [13]:
class CustomDataset(Dataset):
  """Map-style dataset that pairs each input with the label at the same position."""

  def __init__(self, x, y):
    # Both containers are stored as given and indexed with the same integer later.
    self.x, self.y = x, y

  def __len__(self):
    """Return the number of samples, measured on the inputs."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the ``(input, label)`` pair stored at position ``idx``."""
    sample, label = self.x[idx], self.y[idx]
    return sample, label
# NOTE(review): this rebinds `data`, which held the training DataFrame until now, so
# re-running an earlier cell that reads data["tweet"] / data["class"] will fail afterwards.
data = CustomDataset(X_train, Y_train)
# Shuffled mini-batches of 32 samples, loaded with 2 worker processes.
dataloader = DataLoader(data, batch_size = 32, shuffle = True, num_workers = 2)

In [None]:
#Build Model

class Encode(nn.Module):
  """LSTM classifier: embedding -> LSTM -> per-step linear -> linear over all time steps.

  Args:
    input_size: Vocabulary size (number of embedding rows).
    hidden_size: Width of the LSTM hidden state.
    output_size: Number of features ``fc`` produces per time step.
    num_layers: Number of stacked LSTM layers.
    embedding_size: Width of each embedding vector.
    dropout: Dropout probability applied to the embeddings and between LSTM layers.
    seq_len: Exact sequence length the model accepts. Defaults to 31, the length
      that was previously hard-coded.
    num_classes: Number of output logits. Defaults to 3, as previously hard-coded.

  With the default ``seq_len``/``num_classes`` and ``output_size = 1`` the layer
  shapes are identical to the earlier hard-coded version, so existing state
  dicts still load.
  """
  def __init__(self, input_size, hidden_size, output_size, num_layers,embedding_size, dropout,
               seq_len = 31, num_classes = 3):
    super(Encode, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.seq_len = seq_len
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.dropout = nn.Dropout(dropout)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
    self.ReLU = nn.ReLU()  # not used in forward(); kept so the attribute stays available
    self.fc = nn.Linear(hidden_size, output_size)
    # Consumes the flattened per-step outputs of one sample.
    self.fc1 = nn.Linear(seq_len * output_size, num_classes)

  def forward(self, x):
    """Compute class logits.

    Args:
      x: Integer index tensor of shape (batch, seq_len); a 1-D tensor of
        length seq_len is treated as a batch of one.

    Returns:
      Tensor of shape (batch, num_classes).

    Raises:
      ValueError: If the sequence dimension is not ``seq_len``. Without this
        check a flat reshape could silently mix values from different samples.
    """
    if x.dim() == 1:
      x = x.unsqueeze(0)
    if x.dim() != 2 or x.size(1) != self.seq_len:
      raise ValueError(
        f"expected input of shape (batch, {self.seq_len}), got {tuple(x.shape)}")
    embedding = self.dropout(self.embedding(x))
    output, _ = self.lstm(embedding)
    output = self.fc(output)
    # Flatten per sample so rows of the batch can never be interleaved.
    out = self.fc1(output.reshape(output.size(0), -1))
    return out

# Hyperparameters passed to Encode(...) in the next cell.
input_size = len(input_train_dataset)  # vocabulary size, i.e. number of embedding rows
hidden_size = 100  # width of the LSTM hidden state
output_size = 1  # features produced per time step by Encode.fc
num_layers = 2  # stacked LSTM layers
embedding_size = 128  # width of each embedding vector
dropout = 0.5  # dropout probability (embeddings and between LSTM layers)


In [15]:
# Build the classifier from the hyperparameters defined in the previous cell.
model = Encode(
  input_size = input_size,
  hidden_size = hidden_size,
  output_size = output_size,
  num_layers = num_layers,
  embedding_size = embedding_size,
  dropout = dropout,
)
# Cross-entropy over the model's logits, optimized with Adam.
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [16]:
# Restore model and optimizer state from an earlier run.
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one;
# nothing in this notebook moves the model off the CPU.
# NOTE(review): the training cell writes "check-point.pt", not this file name —
# confirm which checkpoint is meant to be resumed.
check_point = torch.load("check-point-hate-speech.pt", map_location = "cpu")
model.load_state_dict(check_point['model_state_dict'])
optimizer.load_state_dict(check_point['optimizer_state_dict'])

In [None]:
#Train model
start_epoch, end_epoch = 15, 20  # epoch numbers used for logging and stored in the checkpoint
log_every = 100                  # report and checkpoint every this many batches
checkpoint_path = "check-point.pt"
# NOTE(review): the resume cell loads "check-point-hate-speech.pt", not this path — confirm.

def _save_checkpoint(epoch, loss_value):
  """Write model and optimizer state plus the latest batch loss value to disk."""
  torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # The numeric loss, not the criterion module, so the file holds plain data.
            'loss': loss_value,
            }, checkpoint_path)

running_loss = 0.0
step = 0
last_loss = None

# Make sure dropout is active even if the model was switched to eval mode earlier.
model.train()
for epoch in range(start_epoch, end_epoch):
  for batch_x, batch_y in dataloader:
    # Each label is stored as a one-element list, so the batch arrives wrapped
    # in a one-item list; unwrap it to get the label tensor.
    batch_y = batch_y[0]
    optimizer.zero_grad()
    batch_loss = loss(model(batch_x), batch_y)
    batch_loss.backward()
    optimizer.step()

    last_loss = batch_loss.item()
    running_loss += last_loss
    step += 1
    if step % log_every == 0:
      # Running mean of the training loss since this cell started.
      print(f"Epoch: {epoch}, Loss: {running_loss/float(step)}")
      _save_checkpoint(epoch, last_loss)

# Keep the progress made after the last periodic checkpoint.
if last_loss is not None:
  _save_checkpoint(end_epoch - 1, last_loss)


In [None]:
# Read the test tweets and clean them with the same function used for the training tweets.
data_test = pd.read_csv("sample_data/testhatespeech.csv")
data_test["tweet"] = data_test["tweet"].apply(
  lambda raw_tweet: clean_data(raw_tweet, stop_words, 1000)
)


In [None]:
def minimize(text, max_length):
  """Return the first ``max_length`` items of ``text`` (all of it when it is shorter)."""
  return text[:max_length]
# Cap every cleaned test tweet at 31 tokens, the fixed length the model's reshape expects.
data_test["tweet"] = data_test["tweet"].apply(lambda tokens: minimize(tokens, 31))
# Plain Python lists of token lists and of the "Toxicity" labels.
X_test, Y_test = data_test["tweet"].tolist(), data_test["Toxicity"].tolist()


In [None]:
# Append one synthetic row of literal 'pad' tokens with label 0.
# Presumably this row has 31 entries so that pad_sequence pads every test tweet to
# length 31 — TODO confirm the count.
# NOTE(review): 'pad' is not the "<pad>" special token, so it resolves through the
# vocabulary's normal lookup; the row also becomes part of the evaluated test set,
# and re-running this cell appends another row.
X_test.append(['pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad','pad'])
Y_test.append(0)

In [None]:
# Wrap every label in its own one-element list; the next cell flattens it into a tensor.
Y_test = np.array(Y_test).reshape(-1,1).tolist()
# Convert tokens to vocabulary indices. The dtype is explicit because
# torch.tensor([]) defaults to float32, which would happen for an empty tweet.
X_test_indices = [
  torch.tensor(input_train_dataset.lookup_indices(tokens), dtype = torch.long)
  for tokens in X_test
]
# Right-pad with 0 up to the longest sequence.
X_test = nn.utils.rnn.pad_sequence(X_test_indices, padding_value = 0, batch_first = True)
# Show the shape only; displaying the whole tensor adds nothing readable.
X_test.shape

In [None]:
# Flatten the labels into a 1-D float tensor (as_tensor also accepts an existing tensor on re-run).
Y_test = torch.as_tensor(Y_test, dtype = torch.float32).reshape(-1)

# The last row is the synthetic all-'pad' sample appended in an earlier cell to fix
# the padded length; it is not real test data and must not count toward accuracy.
NUM_SYNTHETIC_ROWS = 1

# Evaluate with dropout disabled, then restore whatever mode the model was in.
was_training = model.training
model.eval()
try:
  with torch.no_grad():
    y_pred = model(X_test)
finally:
  model.train(was_training)

# Predicted class per sample; class 2 is folded into label 0, classes 0 and 1 are kept.
# NOTE(review): class 0 therefore also counts as label 0 — confirm this matches the
# meaning of the training "class" column and the test "Toxicity" column.
y_pred_cls = y_pred.argmax(dim = 1)
y_pred_cls = torch.where(y_pred_cls == 2, torch.zeros_like(y_pred_cls), y_pred_cls)

real_pred = y_pred_cls[:-NUM_SYNTHETIC_ROWS]
real_true = Y_test[:-NUM_SYNTHETIC_ROWS]
# Divide by the actual number of evaluated samples instead of a hard-coded count.
acc = real_pred.eq(real_true).float().mean()
print(acc.item()*100)

In [None]:
# Print tensors in full rather than in summarized form; applies to every tensor printed after this cell.
torch.set_printoptions(profile="full")


In [None]:

SEQ_LEN = 31  # fixed sequence length the model was built for

nlp_input = "I love this movie guys"
nlp_input = clean_data(nlp_input, stop_words, 1000)

# Truncate long inputs and right-pad short ones with index 0 (the value used to pad
# the training batches) so the tensor is always exactly SEQ_LEN long.
token_ids = input_train_dataset.lookup_indices(nlp_input)[:SEQ_LEN]
token_ids = token_ids + [0] * (SEQ_LEN - len(token_ids))
batch = torch.tensor(token_ids, dtype = torch.long).reshape(1, SEQ_LEN)

# Predict with dropout disabled, then restore the model's previous mode.
was_training = model.training
model.eval()
try:
  with torch.no_grad():
    print(torch.argmax(model(batch)))
finally:
  model.train(was_training)
