This is an approach to implement the word2vec paper (https://arxiv.org/abs/1301.3781) using GloVe embeddings (previously trained).
References / inspiration:
- [A Dummy's Guide to Word2Vec](https://medium.com/@manansuri/a-dummys-guide-to-word2vec-456444f3c673)
- [Word2vec from Scratch](https://jaketae.github.io/study/word2vec/)

# Additions: GloVe embeddings + Reddit dataset


In [None]:
!pip install gensim nltk datasets matplotlib umap-learn

import nltk
import gensim.downloader as api
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from datasets import load_dataset
from sklearn.manifold import TSNE
import umap

In [None]:
# Fetch the NLTK resources used later in the notebook:
#   - 'punkt'     : tokenizer data (word_tokenize is called in preprocess_text)
#   - 'stopwords' : stop-word lists (stopwords.words('english') is used for filtering)
nltk.download('punkt')
nltk.download('stopwords')


In [None]:
# Load pretrained GloVe embeddings through gensim's downloader API, using the
# "glove-wiki-gigaword-50" model identifier.
# NOTE(review): `glove_vectors` is not referenced by any later cell visible in
# this notebook; the Word2Vec model below is trained directly on the Reddit
# corpus. Confirm whether these vectors were meant to initialise or be
# compared against that model.
glove_vectors = api.load("glove-wiki-gigaword-50")

In [None]:
# Load Reddit dataset (subsample for efficiency).
# The split expression "train[:10%]" requests only the first 10% of the
# train split rather than the full dataset.
dataset = load_dataset("reddit", split="train[:10%]")

In [None]:
# Preprocessing function
# Cache for stop-word sets, keyed by language, so the NLTK corpus file is read
# only once instead of once per token.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text: str):
    """Turn a raw comment into a list of lowercase, stop-word-free tokens.

    Steps: lowercase the text, replace every non-word character with a space,
    collapse runs of whitespace, tokenize with NLTK's ``word_tokenize`` and
    drop tokens that appear in NLTK's English stop-word list.

    Args:
        text: The raw comment string.

    Returns:
        A list of token strings in their original order; empty if nothing
        survives the filtering.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Look the stop words up in a cached set: the original called
    # stopwords.words('english') for every token, re-reading the list and
    # scanning it linearly each time.
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(text) if word not in stop_words]

In [None]:
# Build the training corpus: one token list per Reddit comment.
import nltk
# Presumably required by word_tokenize on newer NLTK releases -- confirm.
nltk.download('punkt_tab')
# Non-string entries in the 'body' column are skipped.
string_comments = (entry for entry in dataset['body'] if isinstance(entry, str))
corpus = list(map(preprocess_text, string_comments))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Train a Word2Vec model on the tokenized Reddit corpus.
word2vec_model = Word2Vec(
    sentences=corpus,
    vector_size=50,  # dimensionality of the learned word vectors
    window=5,        # context window size
    min_count=5,     # minimum word frequency to be kept in the vocabulary
    workers=4,       # number of worker threads
    sg=1,            # 1 selects skip-gram (0 would be CBOW)
)


In [None]:
# Save the trained model to "word2vec_reddit.model" (relative to the current
# working directory) so it can be reused without retraining.
word2vec_model.save("word2vec_reddit.model")

In [None]:
# Sanity-check the trained embeddings by listing the nearest neighbours of "reddit".
# NOTE(review): this lookup presumably fails with KeyError if "reddit" did not
# reach min_count in the sampled corpus -- confirm on the data actually used.
similar_to_reddit = word2vec_model.wv.most_similar("reddit")
print("Words similar to 'reddit':", similar_to_reddit)

In [None]:
# Visualization of embeddings
def visualize_embeddings(model, num_words=100):
    """Show a labelled 2-D UMAP projection of a model's word vectors.

    Args:
        model: A trained model exposing ``wv.index_to_key`` and ``wv[word]``
            (e.g. the gensim Word2Vec model trained in this notebook).
        num_words: How many entries to take from the start of
            ``model.wv.index_to_key``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    keyed_vectors = model.wv
    labels = list(keyed_vectors.index_to_key)[:num_words]
    embedding_matrix = np.array([keyed_vectors[label] for label in labels])

    # Fixed random_state keeps the layout reproducible between runs.
    projector = umap.UMAP(n_components=2, random_state=42)
    coords = projector.fit_transform(embedding_matrix)
    xs, ys = coords[:, 0], coords[:, 1]

    plt.figure(figsize=(10, 6))
    plt.scatter(xs, ys, marker='o')

    # Annotate each point with the word it represents.
    for label, x, y in zip(labels, xs, ys):
        plt.text(x, y, label, fontsize=9)

    plt.show()

# Plot the trained model's embeddings using the default num_words=100.
visualize_embeddings(word2vec_model)