In [1]:
from gensim.models import Word2Vec
import pandas as pd
import re
from collections import Counter
from tqdm import tqdm

In [2]:
# Read the two CSV datasets; only their `text` columns are used below.
df1 = pd.read_csv("../data/Sentiment_Analysis.csv")
df2 = pd.read_csv("../data/imdb_dataset.csv")

# Stack both text columns end-to-end into one flat Python list of documents.
text_columns = [df1['text'], df2['text']]
texts = pd.concat(text_columns).tolist()

In [3]:
def clean_text(text):
    """Normalize one raw document into lowercase, letters-only text.

    Steps, in order: lowercase, drop anything starting with ``http`` up to the
    next whitespace, delete every character that is not an ASCII letter or
    whitespace, then collapse whitespace runs to single spaces and trim.

    Note that non-letter characters are deleted outright rather than replaced
    by a space, so ``"don't"`` becomes ``"dont"``.

    Args:
        text: The raw document. Missing values (``None``/``NaN``/``pd.NA``),
            which a pandas text column may contain, are treated as an empty
            document; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # Without this guard a missing cell would crash on `.lower()`.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

# Normalize every document; `texts` is rebound to the cleaned list.
texts = list(map(clean_text, texts))

In [4]:
class BPETokenizer:
    """Minimal byte-pair-encoding (BPE) subword tokenizer.

    Words are represented as space-separated symbol strings ending in the
    end-of-word marker ``</w>`` (e.g. ``"l o w </w>"``). Training repeatedly
    fuses the most frequent adjacent symbol pair; tokenization replays the
    learned merges on new words.

    Attributes:
        vocab_size: Maximum number of merge steps performed by ``train``.
            Despite the name this is NOT the size of the resulting vocabulary.
        vocab: Mapping ``token -> integer id`` built by ``train``.
        merges: Learned merges as ``(left, right)`` tuples, earliest first.
    """

    def __init__(self, vocab_size=2000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = []
        # (merges list, its length, {pair: rank}) -- built lazily on first use.
        self._rank_cache = None

    @staticmethod
    def _merge_pair(symbols, pair):
        """Return ``symbols`` with each non-overlapping, left-to-right
        occurrence of the adjacent ``pair`` fused into a single symbol."""
        first, second = pair
        fused = first + second
        out = []
        i, last = 0, len(symbols) - 1
        while i <= last:
            if i < last and symbols[i] == first and symbols[i + 1] == second:
                out.append(fused)
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        return out

    def _get_merge_ranks(self):
        """Return a ``{pair: rank}`` lookup for ``self.merges`` (lower = earlier).

        The lookup is rebuilt whenever ``self.merges`` is rebound or changes
        length, so assigning a new merge list from outside is picked up.
        """
        cache = getattr(self, "_rank_cache", None)
        if (cache is None or cache[0] is not self.merges
                or cache[1] != len(self.merges)):
            ranks = {}
            for rank, pair in enumerate(self.merges):
                # Keep the earliest rank should a pair ever appear twice.
                ranks.setdefault(tuple(pair), rank)
            cache = (self.merges, len(self.merges), ranks)
            self._rank_cache = cache
        return cache[2]

    def get_stats(self, corpus):
        """Count adjacent symbol pairs, weighted by word frequency.

        Args:
            corpus: Mapping of space-separated symbol strings to frequencies.

        Returns:
            A ``Counter`` mapping ``(left, right)`` pairs to total frequency.
        """
        pairs = Counter()
        for word, freq in corpus.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[(left, right)] += freq
        return pairs

    def merge_vocab(self, pair, corpus):
        """Fuse every occurrence of ``pair`` in every word of ``corpus``.

        Merging is done symbol by symbol. A plain substring replace on the
        space-joined word would also match across symbol boundaries (pair
        ``('a', 'b')`` would wrongly turn ``"ca b"`` into ``"cab"``).

        Args:
            pair: The ``(left, right)`` symbols to fuse.
            corpus: Mapping of space-separated symbol strings to frequencies.

        Returns:
            A new dict with the merged words as keys; ``corpus`` is not mutated.
        """
        bigram = ' '.join(pair)
        merged = {}
        for word, freq in corpus.items():
            # Cheap pre-filter: the pair cannot occur as adjacent symbols
            # unless the joined form is a substring of the word.
            if bigram in word:
                word = ' '.join(self._merge_pair(word.split(), pair))
            merged[word] = merged.get(word, 0) + freq
        return merged

    def train(self, texts):
        """Learn BPE merges and build the vocabulary from ``texts``.

        Runs at most ``self.vocab_size`` merge steps, stopping early when no
        adjacent pair is left. Any previously learned merges are discarded so
        that calling ``train`` again does not duplicate them.

        Args:
            texts: Iterable of whitespace-tokenizable strings.
        """
        corpus = Counter(
            ' '.join(word) + ' </w>' for text in texts for word in text.split()
        )

        self.merges = []
        for _ in tqdm(range(self.vocab_size)):
            pairs = self.get_stats(corpus)
            if not pairs:
                break
            best = max(pairs, key=pairs.get)
            corpus = self.merge_vocab(best, corpus)
            self.merges.append(best)

        tokens = set()
        for word in corpus:
            tokens.update(word.split())
        # Sort so token ids are reproducible; set order varies between runs.
        self.vocab = {tok: i for i, tok in enumerate(sorted(tokens))}

    def tokenize_word(self, word):
        """Split one word into subword tokens using the learned merges.

        The word is broken into characters plus ``</w>``; then, repeatedly,
        the adjacent pair that was learned earliest is fused, until no
        adjacent pair is a known merge. This mirrors the priority order in
        which training applied the merges.

        Args:
            word: A single whitespace-free string.

        Returns:
            The list of subword tokens; the final one carries ``</w>``.
        """
        symbols = list(word) + ['</w>']
        ranks = self._get_merge_ranks()
        if not ranks:
            return symbols

        unknown = float('inf')
        while len(symbols) > 1:
            best = min(zip(symbols, symbols[1:]),
                       key=lambda p: ranks.get(p, unknown))
            if best not in ranks:
                break
            symbols = self._merge_pair(symbols, best)
        return symbols

    def tokenize(self, text):
        """Tokenize ``text`` word by word and return one flat token list."""
        tokens = []
        for word in text.split():
            tokens.extend(self.tokenize_word(word))
        return tokens

In [5]:
# Learn the BPE merges from the cleaned corpus. `vocab_size` bounds the number
# of merge steps the tokenizer performs during training.
NUM_MERGES = 2000
tokenizer = BPETokenizer(vocab_size=NUM_MERGES)
tokenizer.train(texts)

100%|██████████| 2000/2000 [23:31<00:00,  1.42it/s]


In [6]:
# Split every cleaned document into its list of BPE subword tokens.
tokenized_corpus = list(map(tokenizer.tokenize, texts))

In [7]:
# Hyperparameters for the subword embedding model.
w2v_params = dict(
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
)
model = Word2Vec(tokenized_corpus, **w2v_params)

# Persist the trained vectors (model.wv) in word2vec format.
embeddings_path = "../outputs/custom_embeddings.vec"
model.wv.save_word2vec_format(embeddings_path)
print("Embeddings trained and saved!")

Embeddings trained and saved!
