In [1]:
import csv
import random

In [2]:
reviews = []
sentiments = []

In [3]:
with open('Review.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header
    for row in reader:
        sentiments.append(1 if row[0] == 'Positive' else 0)
        reviews.append(row[1])

In [4]:
# Display a few random reviews
for i in range(5):
    index = random.randint(0, len(reviews) - 1)
    print(f"Review: {reviews[index]}")
    print(f"Sentiment: {'Positive' if sentiments[index] == 1 else 'Negative'}")
    print("-----")

Review: This excellent drama had me in suspense the whole time. I could not take my eyes off the screen for one second because every word kept connecting the pieces to this puzzling murder. This movie really touched me because it showed how sad and hard life can be. I really did cry in the end (which I don't want to give away!) It also let me realize how cruel and sickening people can be when it comes to murder.   The cast was also very good. The only bad cast member was the actress who played Anne Marie. The actress did a great job, but the director didn't. I say this because he found someone who didn't look a single bit like Anne Marie Fahey herself.
Sentiment: Positive
-----
Review: Robert Urich was a fine actor, and he makes this TV movie believable. I remember watching this film when I was 15, and when seeing it a second time my opinion stays the same. People lose who they were when enter this exclusive club, in a computer rich Californian town. Urich try's to figure out what is w

In [5]:
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import torch

In [6]:
# Tokenization
tokenizer = get_tokenizer('basic_english')
tokenized_reviews = [tokenizer(review) for review in reviews]

In [7]:
# Build vocabulary
counter = Counter()
for review in tokenized_reviews:
    counter.update(review)
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict, min_freq=1)

In [8]:
# Numericalize, pad, and split the data
def numericalize(tokenized_review, vocab):
    return [vocab[token] for token in tokenized_review]

numericalized_reviews = [numericalize(review, vocab) for review in tokenized_reviews]
padded_reviews = torch.nn.utils.rnn.pad_sequence([torch.tensor(review) for review in numericalized_reviews], batch_first=True)

In [9]:
# Split the data
train_reviews, val_reviews, train_sentiments, val_sentiments = train_test_split(padded_reviews, sentiments, test_size=0.2, random_state=42)

In [10]:
import torch.nn as nn

In [11]:
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden[-1])
        return output

In [12]:
# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
LEARNING_RATE = 0.001

In [13]:
# Initialize the model
model = SentimentModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [14]:
# Define the loss function and the optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy with logits
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [15]:
# Training parameters
EPOCHS = 5
BATCH_SIZE = 64

In [16]:
# Convert data to tensor format
train_reviews_tensor = torch.stack(list(train_reviews))
train_sentiments_tensor = torch.tensor(train_sentiments, dtype=torch.float32).view(-1, 1)
val_reviews_tensor = torch.stack(list(val_reviews))
val_sentiments_tensor = torch.tensor(val_sentiments, dtype=torch.float32).view(-1, 1)

In [None]:
# Training loop
for epoch in range(EPOCHS):
    for i in range(0, len(train_reviews), BATCH_SIZE):
        batch_reviews = train_reviews_tensor[i:i + BATCH_SIZE]
        batch_sentiments = train_sentiments_tensor[i:i + BATCH_SIZE]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_reviews)
        loss = criterion(outputs, batch_sentiments)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Print loss for every epoch
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {loss.item():.4f}")

Epoch 1/5, Loss: 0.6935
Epoch 2/5, Loss: 0.6932
Epoch 3/5, Loss: 0.6931


In [None]:
with torch.no_grad():
    val_outputs = model(val_reviews_tensor)
    val_loss = criterion(val_outputs, val_sentiments_tensor)
    val_predictions = torch.round(torch.sigmoid(val_outputs))
    accuracy = (val_predictions == val_sentiments_tensor).sum().float() / len(val_sentiments)

In [None]:
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
torch.save(model.state_dict(), 'path_to_store_model_state_dict.pth')

In [None]:
model = SentimentModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model.load_state_dict(torch.load('path_to_store_model_state_dict.pth'))
model.eval()  # Set the model to evaluation mode