In [1]:
import numpy as np
import pandas as pd
import json
from collections import Counter, defaultdict

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
print("üîπ Loading dataset...")
df = pd.read_csv("data/Sentiment_Analysis.csv")

print("üìä Available columns:", list(df.columns))

TEXT_CANDIDATES = ["text", "review", "sentence", "comment"]
LABEL_CANDIDATES = ["label", "sentiment", "polarity", "target"]

text_col = None
label_col = None

for c in TEXT_CANDIDATES:
    if c in df.columns:
        text_col = c
        break

for c in LABEL_CANDIDATES:
    if c in df.columns:
        label_col = c
        break

if text_col is None or label_col is None:
    raise ValueError("‚ùå Could not detect text or label column")

print(f"‚úÖ Text column: {text_col}")
print(f"‚úÖ Label column: {label_col}")
print(f"üìä Dataset size: {len(df)}")

üîπ Loading dataset...
üìä Available columns: ['sentiment', 'text']
‚úÖ Text column: text
‚úÖ Label column: sentiment
üìä Dataset size: 80000


In [3]:
class BPETokenizer:
    def __init__(self, vocab_size=200):
        self.vocab_size = vocab_size
        self.vocab = {}

    def get_stats(self, tokens):
        pairs = defaultdict(int)
        for word, freq in tokens.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i+1])] += freq
        return pairs

    def merge_vocab(self, pair, tokens):
        new_tokens = {}
        bigram = " ".join(pair)
        replacement = "".join(pair)
        for word in tokens:
            new_tokens[word.replace(bigram, replacement)] = tokens[word]
        return new_tokens

    def train(self, texts):
        tokens = Counter()
        for text in texts:
            for word in str(text).lower().split():
                tokens[" ".join(word) + " </w>"] += 1

        for _ in range(self.vocab_size):
            pairs = self.get_stats(tokens)
            if not pairs:
                break
            best = max(pairs, key=pairs.get)
            tokens = self.merge_vocab(best, tokens)

        vocab = set()
        for word in tokens:
            vocab.update(word.split())

        self.vocab = {t: i for i, t in enumerate(vocab)}

    def tokenize(self, text):
        output = []
        for word in str(text).lower().split():
            chars = list(word) + ["</w>"]
            i = 0
            while i < len(chars):
                j = len(chars)
                while j > i and "".join(chars[i:j]) not in self.vocab:
                    j -= 1
                output.append("".join(chars[i:j]))
                i = j
        return output

In [4]:
texts = df[text_col].astype(str).tolist()

tokenizer = BPETokenizer(vocab_size=200)

print("üîπ Training BPE tokenizer...")
tokenizer.train(texts)

with open("tokenizer/subword_vocab.json", "w", encoding="utf-8") as f:
    json.dump(tokenizer.vocab, f, indent=2)

print("‚úÖ Vocabulary saved to tokenizer/subword_vocab.json")

print("\nüîπ Sample Tokenization:\n")
for i in range(10):
    print(texts[i])
    print(tokenizer.tokenize(texts[i]))
    print("-" * 60)

üîπ Training BPE tokenizer...
‚úÖ Vocabulary saved to tokenizer/subword_vocab.json

üîπ Sample Tokenization:

And here is the rap song "African Warrior Queens", for which ChatGPT wrote the lyrics ü§é Yes, amateur but beautiful :)\n\n1/1 Œû 0.1 on KO ‚öîÔ∏è link below üîä sound on https://t.co/cyHY3m2qHy
['and</w>', 'here</w>', 'is</w>', 'the</w>', 'ra', 'p</w>', 's', 'on', 'g</w>', '"', 'a', 'f', 'rican</w>', 'war', 'ri', 'or</w>', 'qu', 'e', 'en', 's', '"', ',</w>', 'for</w>', 'which</w>', 'chatgpt</w>', 'w', 'ro', 'te</w>', 'the</w>', 'l', 'y', 'ric', 's</w>', 'ü§é', '</w>', 'y', 'es,</w>', 'amate', 'u', 'r</w>', 'but</w>', 'be', 'a', 'u', 'ti', 'fu', 'l</w>', ':', ')', '\\n', '\\n', '1', '/', '1', '</w>', 'Œæ', '</w>', '0', '.', '1', '</w>', 'on</w>', 'k', 'o</w>', '‚öî', 'Ô∏è', '</w>', 'lin', 'k</w>', 'below</w>', 'üîä', '</w>', 's', 'oun', 'd</w>', 'on</w>', 'h', 'tt', 'p', 's', ':', '/', '/', 't', '.', 'co', '/', 'c', 'y', 'h', 'y', '3', 'm', '2', 'q', 'h', 'y</w>']
-------

In [5]:
EMBED_DIM = 50
EPOCHS = 5
LR = 0.01

word2idx = tokenizer.vocab
embeddings_custom = np.random.randn(len(word2idx), EMBED_DIM)

print("üîπ Training custom embeddings...")

for epoch in range(EPOCHS):
    for text in texts:
        for word in str(text).lower().split():
            if word in word2idx:
                embeddings_custom[word2idx[word]] += LR
    print(f"Epoch {epoch+1} completed")

with open("embeddings/custom_embeddings.txt", "w", encoding="utf-8") as f:
    for word, idx in word2idx.items():
        vec = " ".join(map(str, embeddings_custom[idx]))
        f.write(f"{word} {vec}\n")

print("‚úÖ Custom embeddings saved")

üîπ Training custom embeddings...
Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
‚úÖ Custom embeddings saved


In [6]:
def load_embeddings(path):
    emb = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            emb[parts[0]] = np.array(parts[1:], dtype=float)
    return emb

emb_custom = load_embeddings("embeddings/custom_embeddings.txt")
EMBED_DIM = len(next(iter(emb_custom.values())))

def sentence_vector(text, emb):
    vecs = [emb[w] for w in str(text).lower().split() if w in emb]
    return np.mean(vecs, axis=0) if vecs else np.zeros(EMBED_DIM)

X_custom = np.array([sentence_vector(t, emb_custom) for t in df[text_col]])
y = df[label_col].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X_custom, y, test_size=0.2, random_state=42
)

model_custom = LogisticRegression(max_iter=1000)
model_custom.fit(X_train, y_train)

preds_custom = model_custom.predict(X_test)

print("‚úÖ RESULTS ‚Äî Custom Embeddings")
print("üéØ Accuracy:", accuracy_score(y_test, preds_custom))
print(classification_report(y_test, preds_custom))

‚úÖ RESULTS ‚Äî Custom Embeddings
üéØ Accuracy: 0.5810625
              precision    recall  f1-score   support

           0       0.59      0.55      0.57      8063
           1       0.57      0.61      0.59      7937

    accuracy                           0.58     16000
   macro avg       0.58      0.58      0.58     16000
weighted avg       0.58      0.58      0.58     16000



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
def load_glove(path):
    emb = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            values = line.strip().split()
            emb[values[0]] = np.array(values[1:], dtype=float)
    return emb

print("üîπ Loading GloVe embeddings...")
emb_glove = load_glove("embeddings/glove.6B.50d.txt")
EMBED_DIM = len(next(iter(emb_glove.values())))

üîπ Loading GloVe embeddings...


In [9]:
X_glove = np.array([sentence_vector(t, emb_glove) for t in df[text_col]])

X_train, X_test, y_train, y_test = train_test_split(
    X_glove, y, test_size=0.2, random_state=42
)

model_glove = LogisticRegression(max_iter=1000)
model_glove.fit(X_train, y_train)

preds_glove = model_glove.predict(X_test)

print("‚úÖ RESULTS ‚Äî GloVe Embeddings")
print("üéØ Accuracy:", accuracy_score(y_test, preds_glove))
print(classification_report(y_test, preds_glove))

‚úÖ RESULTS ‚Äî GloVe Embeddings
üéØ Accuracy: 0.6755625
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      8063
           1       0.67      0.67      0.67      7937

    accuracy                           0.68     16000
   macro avg       0.68      0.68      0.68     16000
weighted avg       0.68      0.68      0.68     16000



In [10]:
print("üìä FINAL COMPARISON SUMMARY")
print("-" * 40)
print("Custom Embeddings Accuracy :", accuracy_score(y_test, preds_custom))
print("GloVe Embeddings Accuracy  :", accuracy_score(y_test, preds_glove))
print("-" * 40)

print("""
Conclusion:
The model using pre-trained GloVe embeddings performs better than the
custom embeddings trained from scratch. This is expected because GloVe
embeddings are trained on very large corpora and capture richer semantic
relationships. However, the custom pipeline successfully demonstrates
the complete NLP workflow from tokenizer training to classification.
""")

üìä FINAL COMPARISON SUMMARY
----------------------------------------
Custom Embeddings Accuracy : 0.5810625
GloVe Embeddings Accuracy  : 0.6755625
----------------------------------------

Conclusion:
The model using pre-trained GloVe embeddings performs better than the
custom embeddings trained from scratch. This is expected because GloVe
embeddings are trained on very large corpora and capture richer semantic
relationships. However, the custom pipeline successfully demonstrates
the complete NLP workflow from tokenizer training to classification.

