In [1]:
import csv
import random

In [2]:
# Parallel containers filled by the CSV-loading cell: one review text and one
# 0/1 sentiment label per row.
reviews, sentiments = [], []

In [3]:
# Start from empty lists so re-running this cell does not append every row a
# second time.
reviews = []
sentiments = []

# newline='' hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are read back as a single value.
with open('Review.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header; the default avoids StopIteration on an empty file
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines
        # Column 0 is the label text and column 1 the review text; any label
        # other than 'Positive' is encoded as 0.
        sentiments.append(1 if row[0] == 'Positive' else 0)
        reviews.append(row[1])

In [4]:
# Display a few random reviews.
# random.sample draws distinct indices, so no review is printed twice, and the
# min() guard keeps the cell working when fewer than 5 reviews were loaded
# (including none at all).
for index in random.sample(range(len(reviews)), k=min(5, len(reviews))):
    print(f"Review: {reviews[index]}")
    print(f"Sentiment: {'Positive' if sentiments[index] == 1 else 'Negative'}")
    print("-----")

Review: Finally we get a TV series where we get to see the acting talent! Episode one was excellent! The script gave us a little more than usual, yeah, there was still the 'i'm not your father -i'm your father and omigod you cheated on me!' rubbish but the script allowed the actors to actually feel and live those real moments rather than show us what it would feel like if -like so many TV soaps do.   The camera work also gave us a little more than usual, there were no boring shots of repeated angles for hours yet there was no unnecessary'shots inside shots or hand-held camera crap' to add an 'artistic' edge it gave us what we needed to see and also some beautiful scenery pictures as well!   Nothing was over-dramatised or melodramatic they were real people in a real place dealing with real situations, the show lacked nothing in drama and was completely relevant. It was SUCH a relief to be exposed to real acting and so nice to let our country see just how talented our actors can be when 

In [None]:
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import torch

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Download the NLTK resources used by the preprocessing cells.
nltk.download('stopwords')  # stopwords.words('english')
nltk.download('wordnet')    # WordNetLemmatizer
# nltk.word_tokenize (used in preprocess_text) relies on the Punkt sentence
# tokenizer models; without them it raises LookupError on a fresh environment.
nltk.download('punkt')
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt';
# on older releases this request is expected to just report an unknown
# package — confirm against the pinned NLTK version.
nltk.download('punkt_tab')

In [None]:
# Shared helpers for the preprocessing step.
lemmatizer = WordNetLemmatizer()
# Stored as a set so each token's membership check is a hash lookup.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize, drop stop words and lemmatize.

    Args:
        text: Raw review string.

    Returns:
        List of lemmatized tokens, excluding any token found in ``stop_words``.
    """
    # A deletion table removes every character listed in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)

    kept = []
    for word in nltk.word_tokenize(cleaned):
        # The stop-word check is made on the raw token, before lemmatization.
        if word not in stop_words:
            kept.append(lemmatizer.lemmatize(word))
    return kept

# Run the preprocessing pipeline over every review
tokenized_reviews = list(map(preprocess_text, reviews))

In [None]:
# Tokenize every raw review with torchtext's 'basic_english' tokenizer.
# NOTE(review): this assignment replaces the tokenized_reviews produced by
# preprocess_text in the previous cell, so the stop-word removal and
# lemmatization done there never reach the vocabulary. Confirm which of the
# two pipelines is intended and drop the other.
tokenizer = get_tokenizer('basic_english')
tokenized_reviews = list(map(tokenizer, reviews))

In [None]:
# Build vocabulary
# The factory is imported under an alias so that binding the resulting Vocab
# object to the name `vocab` does not shadow the function it was built with;
# this keeps the cell re-runnable.
from torchtext.vocab import vocab as build_vocab

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

counter = Counter()
for review in tokenized_reviews:
    counter.update(review)

# most_common() returns (token, count) pairs ordered from most to least frequent.
sorted_by_freq_tuples = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Special tokens are inserted first, so '<pad>' takes index 0 — the value
# pad_sequence fills with by default — instead of that index belonging to the
# most frequent real token.
vocab = build_vocab(ordered_dict, min_freq=1, specials=[PAD_TOKEN, UNK_TOKEN])
# Tokens that are not in the vocabulary map to '<unk>' instead of raising.
vocab.set_default_index(vocab[UNK_TOKEN])

In [None]:
# Map tokens to vocabulary indices
def numericalize(tokenized_review, vocab):
    """Return the vocabulary index of each token, in order.

    Args:
        tokenized_review: Iterable of tokens.
        vocab: Object supporting ``vocab[token]`` lookup.

    Returns:
        List with one looked-up value per token.
    """
    indices = []
    for token in tokenized_review:
        indices.append(vocab[token])
    return indices

numericalized_reviews = [numericalize(review, vocab) for review in tokenized_reviews]

# dtype is explicit because torch.tensor([]) defaults to float32: an empty
# review would otherwise produce a float tensor among integer ones.
# padding_value=0 is pad_sequence's default, spelled out here.
# NOTE(review): index 0 should not belong to a real token, otherwise padding
# is indistinguishable from that token — check the vocabulary.
padded_reviews = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(review, dtype=torch.long) for review in numericalized_reviews],
    batch_first=True,
    padding_value=0,
)

In [None]:
# Hold out 20% of the padded reviews (and their labels) for validation; the
# fixed random_state makes the shuffle reproducible.
(train_reviews, val_reviews,
 train_sentiments, val_sentiments) = train_test_split(
    padded_reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
)

In [None]:
import torch.nn as nn

In [None]:
class SentimentModel(nn.Module):
    """Embedding -> LSTM -> linear head producing raw logits per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sequence.
        padding_idx: Optional token index used for trailing padding. When
            given, its embedding is kept at zero and the LSTM only reads the
            non-padded prefix of each sequence, so the final hidden state is
            the one after the last real token. With the default ``None``
            every position is treated as a real token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape (batch, output_dim) for a (batch, seq_len) index tensor."""
        embedded = self.embedding(text)
        if self.padding_idx is not None:
            # Count the real tokens in each row (padding is assumed to be
            # trailing). Lengths are clamped to 1 because packing rejects
            # zero-length sequences, and must live on the CPU.
            lengths = (text != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        _, (hidden, _) = self.lstm(embedded)
        # hidden is (num_layers, batch, hidden_dim); [-1] selects the last layer.
        output = self.fc(hidden[-1])
        return output

In [None]:
# Hyperparameters
VOCAB_SIZE = len(vocab)               # one embedding row per vocabulary entry
EMBEDDING_DIM, HIDDEN_DIM = 100, 256  # embedding width, LSTM hidden width
OUTPUT_DIM = 1                        # a single logit per review
LEARNING_RATE = 1e-3

In [None]:
# Initialize the model
# Seed PyTorch's RNG first so the randomly initialised weights do not change
# from one run of the notebook to the next.
torch.manual_seed(42)
model = SentimentModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [None]:
# Optimizer and loss
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# BCEWithLogitsLoss applies the sigmoid itself, so it is fed raw logits.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Training parameters: passes over the training set, and rows per mini-batch
EPOCHS, BATCH_SIZE = 3, 64

In [None]:
# Convert data to tensor format.
# Reviews: the rows of each split are stacked back into a single tensor.
train_reviews_tensor = torch.stack([row for row in train_reviews])
val_reviews_tensor = torch.stack([row for row in val_reviews])
# Sentiments: float32 column vectors of shape (N, 1).
train_sentiments_tensor = torch.tensor(train_sentiments, dtype=torch.float32).reshape(-1, 1)
val_sentiments_tensor = torch.tensor(val_sentiments, dtype=torch.float32).reshape(-1, 1)

In [None]:
# Training loop
# Put the model in training mode explicitly: a later cell switches it to eval
# mode, so re-running this cell must not depend on the mode it was left in.
model.train()
num_train = len(train_reviews_tensor)
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for i in range(0, num_train, BATCH_SIZE):
        batch_reviews = train_reviews_tensor[i:i + BATCH_SIZE]
        batch_sentiments = train_sentiments_tensor[i:i + BATCH_SIZE]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_reviews)
        loss = criterion(outputs, batch_sentiments)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        epoch_loss += loss.item() * len(batch_reviews)

    # Print the mean loss over the whole epoch (not just the last batch);
    # max() guards the division when the training set is empty.
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {epoch_loss / max(num_train, 1):.4f}")

In [None]:
# Evaluate on the validation split.
model.eval()  # switch off training-only behaviour before measuring
with torch.no_grad():
    # Run the forward pass in mini-batches: pushing the whole validation set
    # through the model at once needs memory proportional to its full size.
    val_outputs = torch.cat([
        model(val_reviews_tensor[start:start + BATCH_SIZE])
        for start in range(0, len(val_reviews_tensor), BATCH_SIZE)
    ])
    val_loss = criterion(val_outputs, val_sentiments_tensor)
    # sigmoid turns logits into probabilities; rounding thresholds them at 0.5.
    val_predictions = torch.round(torch.sigmoid(val_outputs))
    accuracy = (val_predictions == val_sentiments_tensor).sum().float() / len(val_sentiments)

In [None]:
# Report the validation metrics, one per line
print(
    f"Validation Loss: {val_loss:.4f}",
    f"Validation Accuracy: {accuracy:.4f}",
    sep="\n",
)

In [None]:
# Persist only the learned parameters (the state_dict), not the module object
torch.save(obj=model.state_dict(), f='model_state_dict.pth')

In [None]:
# Rebuild the architecture with the same hyperparameters, then restore the
# saved weights into it.
model = SentimentModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)
model.load_state_dict(torch.load('model_state_dict.pth'))
model.eval()  # evaluation mode for inference