# Introduction


This is a sentiment analysis model based on a Long Short-Term Memory (LSTM) neural network. The model takes as input a sequence of words represented as integers and learns to predict the sentiment of the text. The architecture consists of an embedding layer that represents words as dense vectors, followed by two LSTM layers that process the sequence of embeddings and capture long-term dependencies: the first layer takes the embedded input sequence and produces an output sequence of hidden states, and the second layer takes that output sequence and produces a final output as well as the final hidden and cell states. A dropout layer is used to prevent overfitting (the parameter 0.3 specifies the fraction of units to drop), and a linear layer maps the LSTM output to a sentiment prediction. The output_dim parameter specifies the size of the output space, which in this case is 3 (for the 3 possible sentiment classes -1, 0, 1).
References are attached below.


# Imports

In [None]:
import pandas as pd

In [None]:
!pip install torchtext==0.6.0 --quiet
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator, TabularDataset, LabelField
import numpy as np
import pandas as pd
import spacy
import random
from torchtext.data.metrics import bleu_score
from pprint import pprint
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
import re

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Google Drive. The trailing slash is relied upon
# when the path is extended below.
base_dir = '/content/drive/MyDrive/CS779_NLP_COMP/2_Sentiment_Analysis/'

# Sub-folder of the project root that holds the data files.
folder_name = 'datasets'

# Directory prefix for every data file; it ends with '/' so that file names
# can be appended directly, e.g. file_path + "train.csv".
file_path = f"{base_dir}{folder_name}/"

In [None]:
# /content/drive/MyDrive/CS779_NLP_COMP/2_Sentiment_Analysis/datasets/train.csv
df = pd.read_csv(file_path+"train.csv")
df.head()

Unnamed: 0,text_id,sentence,gold_label
0,r1-0051002,"Cheers,\n\nDennis Nguyen\n416-879-6431",0
1,r1-0020356,May have to wait longer on holidays.,-1
2,r1-0058348,"I drove to vegas may 6th, to get my hair done.",0
3,r1-0080006,"In addition, I eat out often at various restau...",1
4,r1-0000827,Perhaps she was doing us a favor?,0


# Data Pre-processing

In [None]:
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

Number of rows:  92228
Number of columns:  3


In [None]:
import locale

_FORCED_ENCODING = "UTF-8"


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" regardless of the system locale.

    The ``do_setlocale`` argument is accepted only for signature
    compatibility with ``locale.getpreferredencoding`` and is ignored.
    """
    return _FORCED_ENCODING


# Replace the standard-library lookup with the fixed-encoding version.
# NOTE(review): presumably a workaround for the notebook's `!` shell commands
# failing on a non-UTF-8 preferred encoding -- confirm before removing.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!python -m spacy download en_core_web_sm --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
nlp_en = spacy.load("en_core_web_sm")

In [None]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp39-cp39-linux_x86_64.whl (1982.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 GB[0m [31m856.1 kB/s[0m eta [36m0:00:00[0m
[0mCollecting torchvision==0.9.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torchvision-0.9.0%2Bcu111-cp39-cp39-linux_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==0.8.0
  Downloading torchaudio-0.8.0-cp39-cp39-manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision, torchaudio
  Att

In [None]:

!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Tokenization: 
Breaking sentences into a list of words.

In [None]:
def preprocess_text_en(text):
    """Normalise an English sentence and split it into spaCy tokens.

    The text is lower-cased, runs of digits are removed, and every character
    that is neither a word character nor whitespace is stripped. If no ASCII
    letter survives the cleaning, the text is replaced by the placeholder
    "z" before tokenisation (presumably so that a sentence never yields an
    empty token list).

    Args:
        text: The raw sentence as a string.

    Returns:
        The list of token strings produced by ``nlp_en`` for the cleaned text.
    """
    without_digits = re.sub(r'\d+', '', text.lower())
    cleaned = re.sub(r'[^\w\s]', '', without_digits)

    has_letter = re.search('[a-zA-Z]', cleaned) is not None
    if not has_letter:
        cleaned = "z"

    return [token.text for token in nlp_en(cleaned)]


In [None]:
# Test Run
sample = "This is Sentiment Analysis model"
print(preprocess_text_en(sample))

['this', 'is', 'sentiment', 'analysis', 'model']


## Train/Test Split:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
df.drop("text_id", axis = 1, inplace = True)

In [None]:
df.head()

Unnamed: 0,sentence,gold_label
0,"Cheers,\n\nDennis Nguyen\n416-879-6431",0
1,May have to wait longer on holidays.,-1
2,"I drove to vegas may 6th, to get my hair done.",0
3,"In addition, I eat out often at various restau...",1
4,Perhaps she was doing us a favor?,0


In [None]:
# Split data into training and testing
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
train_df.to_csv('train.csv', index = False)
test_df.to_csv('test.csv', index = False)

In [None]:
train_df.head()

Unnamed: 0,sentence,gold_label
88226,Our son joined us in the X-Child room which we...,0
68999,One concern .,-1
49345,And the LPN Rebecca the others I can say diffr...,-1
35123,"In March, I ordered 7 patio cushion and 4 pill...",0
32949,We in a bit of hurry today so did not browse.,0


In [None]:
train_df["sentence"][:2]

88226    Our son joined us in the X-Child room which we...
68999                                        One concern .
Name: sentence, dtype: object

In [None]:
train_df["gold_label"][:2]

88226    0
68999   -1
Name: gold_label, dtype: int64

## Generate vocabulary
https://torchtext.readthedocs.io/en/latest/data.html

In [None]:
# Field for the `sentence` column: sentences are tokenised with
# preprocess_text_en, and "<sos>" / "<eos>" are configured as the start and
# end markers of every sequence.
text_field = Field(tokenize=preprocess_text_en,
               lower=True,
               init_token="<sos>",
               eos_token="<eos>")

# Field for the `gold_label` column. Labels are stored as floats; the
# training and evaluation code casts them with .long() before using them as
# class indices.
label_field = LabelField(dtype=torch.float)

# Load the train.csv / test.csv files written to the working directory by the
# split step. The header row is skipped and the two CSV columns are mapped, in
# order, to the fields listed below.
train_data, test_data = TabularDataset.splits(
                            path='.', 
                            train='train.csv', 
                            test='test.csv', 
                            format='csv', 
                            skip_header = True,
                            fields=[('sentence', text_field), ('gold_label', label_field)])

# build vocabulary
# Both vocabularies are built from the training split only.
text_field.build_vocab(train_data,min_freq=1)
label_field.build_vocab(train_data)

In [None]:
# Define iterator
# Number of examples per batch.
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build one iterator per split. Examples are sorted by token count within
# each batch (sort_key / sort_within_batch), and the batches are created on
# `device`.
train_iter, test_iter = BucketIterator.splits(
                                    (train_data, test_data), 
                                    batch_size=BATCH_SIZE, 
                                    sort_within_batch=True, 
                                    sort_key=lambda x: len(x.sentence),
                                    device=device)

# LSTM Model

LSTM(
  (embedding): Embedding(32085, 500)
  <br>
  (lstm1): LSTM(500, 512)
  <br>
  (lstm2): LSTM(512, 512)
  <br>
  (dropout): Dropout(p=0.3, inplace=False)
  <br>
  (fc): Linear(in_features=512, out_features=3, bias=True)
)

In [None]:
class LSTM(nn.Module):
    """Two-layer LSTM sentence classifier.

    Token indices are embedded, passed through two stacked single-layer
    LSTMs, and the final hidden state of the second LSTM is projected to
    ``output_dim`` class scores.

    Args:
        input_dim: Size of the vocabulary (number of embedding rows).
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden-state size of both LSTM layers.
        output_dim: Number of output classes.
        dropout: Dropout probability used after the embedding, after the
            first LSTM and before the linear layer. Defaults to 0.3.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout=0.3):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Compute class scores for a batch of token-index sequences.

        Args:
            x: LongTensor of token indices with shape (seq_len, batch); the
                LSTM layers are sequence-first (no ``batch_first``).

        Returns:
            Tensor of shape (batch, output_dim) with one score per class.
        """
        embedded = self.dropout(self.embedding(x))
        output1, _ = self.lstm1(embedded)
        output1 = self.dropout(output1)
        # Only the final hidden state feeds the classifier; the per-step
        # output sequence of the second LSTM is not needed.
        # NOTE(review): padded positions are processed like real tokens, so
        # for padded batches this state is taken after the padding.
        _, (hidden, _) = self.lstm2(output1)
        # Apply dropout to the tensor that is actually passed to `fc`.
        # Previously dropout was applied to the unused output sequence, so
        # the classifier input was never regularised.
        last_hidden = self.dropout(hidden[-1])
        return self.fc(last_hidden)

# initialize model
input_dim = len(text_field.vocab)
embedding_dim = 500
hidden_dim = 512
output_dim = 3
model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim).to(device)

In [None]:
print(model)

LSTM(
  (embedding): Embedding(32085, 500)
  (lstm1): LSTM(500, 512)
  (lstm2): LSTM(512, 512)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=3, bias=True)
)


In [None]:
# Loss Function and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# train model
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Args:
        model: Module that maps ``batch.sentence`` to class scores of shape
            (batch, num_classes).
        iterator: Iterable of batches exposing ``sentence`` and
            ``gold_label`` attributes; ``gold_label`` holds class indices.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, one_hot_targets)``.

    Returns:
        The mean loss over all batches as a float, or 0.0 if ``iterator``
        yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.sentence)

        # Build one-hot targets on the same device as the predictions. The
        # number of classes comes from the model output instead of being
        # hard-coded to 3.
        labels = batch.gold_label.long().to(predictions.device)
        targets = nn.functional.one_hot(
            labels, num_classes=predictions.shape[1]
        ).to(predictions.dtype)

        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0

# evaluate model
def evaluate(model, iterator, criterion):
    """Compute the mean loss and accuracy of ``model`` over ``iterator``.

    Args:
        model: Module that maps ``batch.sentence`` to class scores of shape
            (batch, num_classes).
        iterator: Iterable of batches exposing ``sentence`` and
            ``gold_label`` attributes; ``gold_label`` holds class indices.
        criterion: Loss called as ``criterion(predictions, one_hot_targets)``.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the loss averaged over batches and
        the fraction of correctly classified examples. ``(0.0, 0.0)`` is
        returned if ``iterator`` yielded no batches.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    num_batches = 0
    num_examples = 0
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.sentence)

            # One-hot targets sized from the model output rather than a
            # hard-coded class count, placed on the predictions' device.
            labels = batch.gold_label.long().to(predictions.device)
            targets = nn.functional.one_hot(
                labels, num_classes=predictions.shape[1]
            ).to(predictions.dtype)

            total_loss += criterion(predictions, targets).item()
            total_correct += (predictions.argmax(dim=1) == labels).sum().item()

            # Count what was actually seen so that any iterable of batches
            # works, not only iterators exposing `.dataset` and `len()`.
            num_batches += 1
            num_examples += labels.shape[0]

    if num_batches == 0 or num_examples == 0:
        return 0.0, 0.0
    return total_loss / num_batches, total_correct / num_examples


### Train and Evaluate model

In [None]:
# Number of full passes over the training iterator.
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    train(model, train_iter, optimizer, criterion)
    # After each epoch, score both splits with evaluate() (which switches the
    # model to eval mode) so the train and test figures are computed the same
    # way.
    train_loss, train_acc = evaluate(model, train_iter, criterion)
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    # Epochs are reported 1-based; accuracies are printed as percentages.
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.133 | Train Acc: 70.98%
	 Test Loss: 0.146 |  Test Acc: 67.96%
Epoch: 02
	Train Loss: 0.109 | Train Acc: 76.80%
	 Test Loss: 0.137 |  Test Acc: 70.37%
Epoch: 03
	Train Loss: 0.088 | Train Acc: 82.20%
	 Test Loss: 0.136 |  Test Acc: 71.35%
Epoch: 04
	Train Loss: 0.072 | Train Acc: 86.52%
	 Test Loss: 0.138 |  Test Acc: 70.52%
Epoch: 05
	Train Loss: 0.055 | Train Acc: 89.45%
	 Test Loss: 0.142 |  Test Acc: 70.76%


### Accuracy on train/test

In [None]:
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

	 Test Loss: 0.142 |  Test Acc: 70.76%


In [None]:
train_loss, train_acc = evaluate(model, train_iter, criterion)
print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

	Train Loss: 0.025 | Train Acc: 95.66%


### Save Model

In [None]:
# File name of the checkpoint inside the data directory. `file_path` already
# ends with '/', so the name must not start with one (a leading slash produced
# a doubled "//" in the saved path and disagreed with the name used when the
# checkpoint is loaded).
model_path = "model_5_new.pt"

# Save only the learned parameters (state dict), not the whole module object.
torch.save(model.state_dict(), file_path + model_path)

### Load Model

In [None]:
# Define the path where the saved model is located
model_path = "model_5_new.pt"

In [None]:
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint saved on a GPU runtime can also be restored on a CPU-only one.
model.load_state_dict(torch.load(file_path + model_path, map_location=device))

<All keys matched successfully>

# Prediction

### For Dev Data

In [None]:
# /content/drive/MyDrive/CS779_NLP_COMP/2_Sentiment_Analysis/datasets/dev.csv
dev_df = pd.read_csv(file_path+"dev.csv")
dev_df.head()

Unnamed: 0,text_id,sentence
0,r2-0017684,They were really quiet during lunch hour on a ...
1,r1-0056793,"They were, however, delicious and because my h..."
2,r1-0005378,We opted for a desert beverages as opposed to ...
3,r1-0065594,"Eat, sleep, repeat."
4,r1-0031164,I watched a number of those people who were wa...


In [None]:
dev_sentences = dev_df["sentence"].tolist()
type(dev_sentences)

list

In [None]:
import torch.nn.functional as F

In [None]:
def predict_sentiment(model, sentence):
    """Predict the sentiment class index for a single sentence.

    Args:
        model: Trained classifier that takes a LongTensor of token indices
            with shape (seq_len, 1) and returns class scores.
        sentence: Either a raw string, which is tokenised with
            ``preprocess_text_en``, or an iterable of token strings.

    Returns:
        A LongTensor of shape (1,) holding the predicted class index, i.e. an
        index into ``label_field.vocab.itos``.
    """
    if isinstance(sentence, str):
        sentence = preprocess_text_en(sentence)
    tokens = [token.lower() for token in sentence]

    # The text field wraps every training sequence in its init/eos markers,
    # so the same markers must be present at inference time. Without them the
    # model reads its final hidden state at a position it never saw during
    # training.
    if text_field.init_token is not None:
        tokens.insert(0, text_field.init_token)
    if text_field.eos_token is not None:
        tokens.append(text_field.eos_token)

    # NOTE(review): assumes vocab.stoi maps unseen tokens to the unknown-token
    # index rather than raising -- confirm for the torchtext version in use.
    text_to_indices = [text_field.vocab.stoi[token] for token in tokens]

    model.eval()
    with torch.no_grad():
        # Shape (seq_len, 1): a batch containing just this sentence.
        sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
        predictions = model(sentence_tensor)
        return torch.argmax(predictions, dim=1)

In [None]:
label_field.vocab.itos[:]

['0', '1', '-1']

In [None]:
# Count how many of the first 10000 rows of `df` are classified correctly.
# `df` was loaded from train.csv, so most of these rows were also part of the
# training split.
cnt = 0
for i in range(10000):
    an = predict_sentiment(model, df["sentence"][i])
    # Map the predicted class index back to its original label (-1, 0 or 1).
    predicted_label = int(label_field.vocab.itos[an[0]])
    gold_label = int(df["gold_label"][i])
    if gold_label == predicted_label:
        cnt += 1
    print(i, cnt)
print(cnt)


In [None]:
cnt/10000

0.7978

## Prediction For Test

In [None]:
# /content/drive/MyDrive/CS779_NLP_COMP/2_Sentiment_Analysis/datasets/TEST/test.csv
test_df = pd.read_csv(file_path+"TEST/test.csv")
test_df.head()

Unnamed: 0,text_id,sentence
0,r1-0086521,A helpful valet at the Bellagio said it was a ...
1,r1-0044715,"People often ask ""what happened to the human c..."
2,r1-0060690,He explained there would be a diagnostic fee o...
3,r1-0016852,I had initially purchased a massage on Groupon.
4,r2-0006040,Primarily do high-end cars as they get referra...


In [None]:
len(test_df)

5110

In [None]:
test_sentences = test_df["sentence"].tolist()
tl = len(test_sentences)
tl

5110

In [None]:
# Predict a label for every test sentence, in row order; each predicted class
# index is mapped back to its original integer label via the label vocabulary.
list_ans = [
    int(label_field.vocab.itos[predict_sentiment(model, test_df["sentence"][i])[0]])
    for i in range(tl)
]

In [None]:
list_ans[:5]

[1, 0, 0, 0, -1]

The number of predictions should be 5110 (one per test sentence).

In [None]:
len(list_ans)

5110

In [None]:
# Serialise the predictions one per line, in the order they were produced.
y_dev_pred_str = '\n'.join(str(int(pred)) for pred in list_ans)

# Write the submission file into the current working directory.
with open('answer_S_1.txt', 'w') as submission_file:
    submission_file.write(y_dev_pred_str)

# References

https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
<br>
https://drive.google.com/file/d/1D8dZAC3QAtAuPKfSb0qmBm1NxPcv4vUq/view
<br>
https://torchtext.readthedocs.io/en/latest/data.html