In [1]:
!pip install torch==2.3.0
!pip install torchtext==0.18.0



In [2]:
import torch
import torch.nn as nn
import sklearn as sk
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
import torchtext.vocab as tvc
import nltk
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer



In [7]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
wordnet = WordNetLemmatizer()
data = pd.read_csv("sample_data/train.csv")
def clean_data(text, stop_words, max_length):
  text = text.lower()
  text = text.replace("'","")
  text = re.sub("[^a-zA-Z]", " ", text)
  text = text.split()
  text = [word for word in text if word not in stop_words]

  text = [wordnet.lemmatize(word) for word in text]
  text = text[:max_length]
  text
  return text
data["text"] = data["text"].apply(clean_data, stop_words = ["http", "com"], max_length = 1000)


print(type(data["text"]))

<class 'pandas.core.series.Series'>


In [4]:
dataset_y = data["sentiment"].tolist()
dataset_x = data["text"].tolist()
X_train = dataset_x
Y_train = dataset_y

In [5]:
min_freq = 2

unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [unk_token, pad_token]
special_tokens_output = []
Y_train = np.array(Y_train).reshape(-1,1)
input_train_dataset = torch.load("movie-rating-LLM-input.pt")
output_train_dataset = torch.load("movie-rating-LLM-output.pt")
input_train_dataset.set_default_index(input_train_dataset[unk_token])

Y_train = Y_train.tolist()
len(output_train_dataset)

2

In [7]:
for i in range(len(Y_train)):
  X_train[i] = torch.tensor(input_train_dataset.lookup_indices(X_train[i]))
  Y_train[i] = torch.tensor(output_train_dataset.lookup_indices(Y_train[i]))
X_train = nn.utils.rnn.pad_sequence(X_train, padding_value = 0, batch_first = True)
X_train.shape


torch.Size([25000, 1000])

In [6]:
class CustomDataset(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y
  def __len__(self):
    return len(self.x)
  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]
data = CustomDataset(X_train, Y_train)
dataloader = DataLoader(data, batch_size = 32, num_workers = 2)

In [7]:
class Encode(nn.Module):
  def __init__(self, input_size, hidden_size, output_size, num_layers,embedding_size, dropout):
    super(Encode, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.dropout = nn.Dropout(dropout)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first = True, dropout = dropout)
    self.ReLU = nn.ReLU()
    self.fc = nn.Linear(hidden_size, output_size)
    self.fc1 = nn.Linear(1000, 1)

  def forward(self, x):
    embedding = self.dropout(self.embedding(x))
    output, (hidden, cell) = self.lstm(embedding)
    output = self.fc(output)
    out = self.fc1(output.reshape(-1,1000))
    out = torch.sigmoid(out)
    return out

input_size = len(input_train_dataset)
hidden_size = 100
output_size = 1
num_layers = 2
embedding_size = 128
dropout = 0.5
model = Encode(input_size, hidden_size, output_size, num_layers, embedding_size, dropout)

In [16]:
loss = nn.BCELoss()

from transformers import AdamW
from torch.optim import lr_scheduler

# Setting up optimizer
check_point = torch.load("check-point.pt")
model.load_state_dict(check_point['model_state_dict'])
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
optimizer.load_state_dict(check_point['optimizer_state_dict'])
num_epochs = 3
valid_loss = 0
count = 0
for epoch in range(12,15):
  for input, output in dataloader:
    y_pred = model(input)
    output = torch.tensor(output, dtype = torch.float32)
    l = loss(y_pred, output)
    valid_loss += l.item()
    count += 1
    l.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f"Epoch: {epoch}, Loss: {valid_loss/float(count)}")
  torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            'dictionary-input': input_train_dataset,
            'dictionary-output': output_train_dataset
            }, "check-point.pt")


  output = torch.tensor(output, dtype = torch.float32)


Epoch: 12, Loss: 0.06922490894794464
Epoch: 12, Loss: 0.04316394962370396
Epoch: 12, Loss: 0.05802527442574501
Epoch: 12, Loss: 0.04913770407438278
Epoch: 12, Loss: 0.04374914318323135
Epoch: 12, Loss: 0.04229325739045938
Epoch: 12, Loss: 0.062330130487680435
Epoch: 12, Loss: 0.06290094973519444
Epoch: 12, Loss: 0.06308882062633832
Epoch: 12, Loss: 0.06653873287141324
Epoch: 12, Loss: 0.08118574964729222
Epoch: 12, Loss: 0.11179482657462358
Epoch: 12, Loss: 0.11224749139868297
Epoch: 12, Loss: 0.11377218285841602
Epoch: 12, Loss: 0.10985500638683637
Epoch: 12, Loss: 0.10452110425103456
Epoch: 12, Loss: 0.10588105209171772
Epoch: 12, Loss: 0.10411515014453067
Epoch: 12, Loss: 0.10362685324722215
Epoch: 12, Loss: 0.1010638858191669
Epoch: 12, Loss: 0.0964707598045823
Epoch: 12, Loss: 0.09327155087058517
Epoch: 12, Loss: 0.09065943687101422
Epoch: 12, Loss: 0.09617214640214418
Epoch: 12, Loss: 0.09748644163832068
Epoch: 12, Loss: 0.09479152352119294
Epoch: 12, Loss: 0.09804793307557702
Ep

KeyboardInterrupt: 

In [80]:
data_test = pd.read_csv("sample_data/testsmall.csv")
data_test["text"] = data_test["text"].apply(clean_data, stop_words = ["http", "com"], max_length = 1000)


In [81]:
X_test = data_test["text"].tolist()
Y_test = data_test["sentiment"].tolist()


In [82]:
Y_test = np.array(Y_test).reshape(-1,1)
Y_test = Y_test.tolist()
for j in range(len(X_test)):
  X_test[j] = torch.tensor(input_train_dataset.lookup_indices(X_test[j]))
  Y_test[j] = torch.tensor(output_train_dataset.lookup_indices(Y_test[j]))
X_test = nn.utils.rnn.pad_sequence(X_test, padding_value = 0, batch_first = True)
X_test.shape

torch.Size([3197, 1000])

In [83]:
Y_test = torch.tensor(Y_test, dtype = torch.float32)

with torch.no_grad():
  y_pred = model(X_test)
  y_pred_cls = y_pred.round()
  y_pred_cls = y_pred_cls.reshape(-1)
  print(y_pred_cls)
  acc = y_pred_cls.eq(Y_test).sum() / float(3197)
  print(Y_test)
  print(acc.item()*100)

tensor([0., 0., 0.,  ..., 1., 1., 1.])
tensor([0., 0., 0.,  ..., 1., 1., 1.])
86.73756718635559


In [103]:
check_point = torch.load("check-point.pt")
model.load_state_dict(check_point['model_state_dict'])
nlp_input = "I not really like the movie. The movie was normal. The actors were awesome but not good"
nlp_input = clean_data(nlp_input, ["http", "com"], 1000)
print(nlp_input)
input = input_train_dataset.lookup_indices(nlp_input)
for m in range(1000 - len(nlp_input)):
  input.append(0)
with torch.no_grad():
  input = torch.tensor(input)
  print(model(input))


['i', 'not', 'really', 'like', 'the', 'movie', 'the', 'movie', 'wa', 'normal', 'the', 'actor', 'were', 'awesome', 'but', 'not', 'good']
tensor([[0.4773]])
