In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import re
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
from tqdm import tqdm

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df = pd.read_csv("../data/Sentiment_Analysis.csv")
df['text'] = df['text'].apply(clean_text)

texts = df['text'].tolist()
labels = df['sentiment'].values

In [4]:
class BPETokenizer:
    def __init__(self, vocab_size=2000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = []

    def get_stats(self, corpus):
        pairs = Counter()
        for word, freq in corpus.items():
            symbols = word.split()
            for i in range(len(symbols)-1):
                pairs[(symbols[i], symbols[i+1])] += freq
        return pairs

    def merge_vocab(self, pair, corpus):
        merged = {}
        bigram = ' '.join(pair)
        replacement = ''.join(pair)
        for word in corpus:
            new_word = word.replace(bigram, replacement)
            merged[new_word] = corpus[word]
        return merged

    def train(self, texts):
        corpus = Counter([' '.join(list(word)) + ' </w>' for text in texts for word in text.split()])
        for _ in tqdm(range(self.vocab_size)):
            pairs = self.get_stats(corpus)
            if not pairs:
                break
            best = max(pairs, key=pairs.get)
            corpus = self.merge_vocab(best, corpus)
            self.merges.append(best)

        tokens = set()
        for word in corpus:
            tokens.update(word.split())
        self.vocab = {tok: i for i, tok in enumerate(tokens)}

    def tokenize_word(self, word):
        word = list(word) + ['</w>']
        i = 0
        while i < len(word)-1:
            pair = (word[i], word[i+1])
            if pair in self.merges:
                word[i:i+2] = [''.join(pair)]
            else:
                i += 1
        return word

    def tokenize(self, text):
        tokens = []
        for word in text.split():
            tokens.extend(self.tokenize_word(word))
        return tokens

In [5]:
df2 = pd.read_csv("../data/imdb_dataset.csv")
all_texts = pd.concat([df['text'], df2['text'].apply(clean_text)]).tolist()

tokenizer = BPETokenizer(vocab_size=2000)
tokenizer.train(all_texts)

100%|██████████| 2000/2000 [26:20<00:00,  1.27it/s]


In [6]:
embeddings = KeyedVectors.load_word2vec_format("../outputs/custom_embeddings.vec")
embedding_dim = embeddings.vector_size

In [7]:
def vectorize(text):
    tokens = tokenizer.tokenize(text)
    vecs = [embeddings[t] for t in tokens if t in embeddings]
    if len(vecs) == 0:
        return np.zeros(embedding_dim)
    return np.mean(vecs, axis=0)

X = np.array([vectorize(t) for t in texts])
y = np.array(labels)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
class Classifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        return self.fc(x)

model = Classifier(embedding_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)

In [11]:
for epoch in range(10):
    optimizer.zero_grad()
    preds = model(X_train_t)
    loss = loss_fn(preds, y_train_t)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 0.6932
Epoch 2, Loss: 0.6916
Epoch 3, Loss: 0.6902
Epoch 4, Loss: 0.6890
Epoch 5, Loss: 0.6878
Epoch 6, Loss: 0.6867
Epoch 7, Loss: 0.6856
Epoch 8, Loss: 0.6845
Epoch 9, Loss: 0.6835
Epoch 10, Loss: 0.6825


In [12]:
torch.save(model.state_dict(), "../outputs/custom_model.pt")
print("✅ Model saved!")

✅ Model saved!


In [13]:
model.eval()
with torch.no_grad():
    preds = model(torch.tensor(X_test, dtype=torch.float32))
    preds = torch.argmax(preds, dim=1)

print(classification_report(y_test, preds.numpy()))

              precision    recall  f1-score   support

           0       0.57      0.54      0.56      8063
           1       0.56      0.59      0.57      7937

    accuracy                           0.57     16000
   macro avg       0.57      0.57      0.57     16000
weighted avg       0.57      0.57      0.57     16000

