In [9]:
# Install only if needed
!pip install pandas scikit-learn

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
import re



In [2]:
# Load datasets
train_df = pd.read_csv("/content/drive/MyDrive/Web Mining/balanced_train_data_updated.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Web Mining/balanced_test_data_updated.csv")

# Check sample
train_df.head()

Unnamed: 0,Summary,Text,Score
0,Not bad for a vegan gluten free bar,consistency like little debbie brownie taste l...,1
1,Nothing Special,bought coffee like dunkin d. also arabica bean...,1
2,Treats too Big for Small Dogs.,actual treats 3 times larger picture box . kin...,0
3,best kit for the money,got kit added 4lbs frozen strawberry 's garden...,2
4,Does the job...,disappointed product arrived one cover inside ...,1


In [3]:
#combined training of summary and text

# Combine Text and Summary
train_df['Combined'] = train_df['Summary'].fillna('') + " " + train_df['Text'].fillna('')
test_df['Combined'] = test_df['Summary'].fillna('') + " " + test_df['Text'].fillna('')


In [4]:
# Simple tokenizer using regex
def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text)
    return re.findall(r'\b\w+\b', text.lower())

# Build vocab
counter = Counter()
for text in train_df['Combined']:
    counter.update(tokenize(text))

# Create word-to-index mapping
vocab = {"<pad>": 0, "<unk>": 1}
for idx, word in enumerate(counter, start=2):
    vocab[word] = idx


In [5]:
MAX_LEN = 100  # max number of tokens

def encode_text(text):
    tokens = tokenize(text)
    ids = [vocab.get(token, vocab["<unk>"]) for token in tokens]
    if len(ids) < MAX_LEN:
        ids += [vocab["<pad>"]] * (MAX_LEN - len(ids))
    else:
        ids = ids[:MAX_LEN]
    return torch.tensor(ids)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        x = encode_text(self.texts[idx])
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

train_dataset = SentimentDataset(train_df['Text'].tolist(), train_df['Score'].tolist())
test_dataset = SentimentDataset(test_df['Text'].tolist(), test_df['Score'].tolist())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [6]:
class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMSentiment, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        return self.fc(hidden[-1])

In [7]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_DIM = 128
HIDDEN_DIM = 64
OUTPUT_DIM = train_df['Score'].nunique()  # e.g., 3 for pos/neg/neutral

model = LSTMSentiment(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
def train_model(model, dataloader):
    model.train()
    total_loss = 0
    for x_batch, y_batch in dataloader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(x_batch)
        loss = loss_fn(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

for epoch in range(25):
    loss = train_model(model, train_loader)
    print(f"Epoch {epoch+1}: Loss = {loss:.4f}")

Epoch 1: Loss = 1.0920
Epoch 2: Loss = 1.0132
Epoch 3: Loss = 0.6799
Epoch 4: Loss = 0.5116
Epoch 5: Loss = 0.3904
Epoch 6: Loss = 0.2928
Epoch 7: Loss = 0.2200
Epoch 8: Loss = 0.1694
Epoch 9: Loss = 0.1362
Epoch 10: Loss = 0.1122
Epoch 11: Loss = 0.0929
Epoch 12: Loss = 0.0825
Epoch 13: Loss = 0.0694
Epoch 14: Loss = 0.0619
Epoch 15: Loss = 0.0562
Epoch 16: Loss = 0.0511
Epoch 17: Loss = 0.0461
Epoch 18: Loss = 0.0436
Epoch 19: Loss = 0.0392
Epoch 20: Loss = 0.0369
Epoch 21: Loss = 0.0354
Epoch 22: Loss = 0.0306
Epoch 23: Loss = 0.0302
Epoch 24: Loss = 0.0289
Epoch 25: Loss = 0.0270


In [10]:
def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output = model(x_batch)
            preds = torch.argmax(output, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    return all_labels, all_preds

true_labels, pred_labels = evaluate_model(model, test_loader)

# Print accuracy
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(true_labels, pred_labels))


Accuracy: 0.7518
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      8527
           1       0.68      0.72      0.70      8528
           2       0.81      0.78      0.80      8528

    accuracy                           0.75     25583
   macro avg       0.75      0.75      0.75     25583
weighted avg       0.75      0.75      0.75     25583

