In [1]:
import sys
import os
import pandas as pd
import pickle

# Add project root to path so the `word2vec` package can be imported.
# ".." is resolved against the current working directory.
# The membership test keeps this idempotent: re-running the cell no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from word2vec.train_word2vec import train_word2vec
from word2vec.utils import generate_skipgram_pairs

# Read the cleaned dataset from CSV into a DataFrame.
df = pd.read_csv("../data/clean_dataset.csv")

# Restore the two pickled BPE models: first the one for source code,
# then the one for docstrings.
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project.
with open("../bpe/bpe_code.pkl", "rb") as code_model_file:
    bpe_code = pickle.load(code_model_file)

with open("../bpe/bpe_doc.pkl", "rb") as doc_model_file:
    bpe_doc = pickle.load(doc_model_file)

# Prepare code and docstring tokens from a sample of the dataset.
#
# The rows are drawn ONCE, with a fixed seed, so that:
#   * re-running the notebook tokenizes the same examples, and
#   * code_tokens[i] and doc_tokens[i] come from the same dataset row
#     (two independent, unseeded .sample() calls pick unrelated rows).
SAMPLE_SIZE = 500
SAMPLE_SEED = 42

# Cap the sample at len(df): DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement).
sample_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# NOTE(review): astype(str) turns missing values into the literal text "nan";
# confirm the cleaned dataset contains no missing code/docstring entries.
code_tokens = [bpe_code.encode(code) for code in sample_df['code'].astype(str)]
doc_tokens = [bpe_doc.encode(doc) for doc in sample_df['docstring'].astype(str)]

# Generate skip-gram pairs per token sequence and flatten them into one
# list per corpus (order follows the order of the token sequences).
code_pairs = [
    pair
    for token_seq in code_tokens
    for pair in generate_skipgram_pairs(token_seq)
]

doc_pairs = [
    pair
    for token_seq in doc_tokens
    for pair in generate_skipgram_pairs(token_seq)
]

# Build vocab
#
# Collect every token that appears in a (center, context) pair, in first-seen
# order. dict.fromkeys de-duplicates while preserving insertion order, so the
# resulting list is identical on every run for the same pairs. Iterating a
# set of strings instead yields an order that changes between interpreter
# sessions (hash randomization).
# NOTE(review): presumably train_word2vec assigns indices in list order, in
# which case a set-derived list gives a different mapping per run -- confirm.
code_vocab_list = list(dict.fromkeys(tok for c, o in code_pairs for tok in (c, o)))
doc_vocab_list = list(dict.fromkeys(tok for c, o in doc_pairs for tok in (c, o)))

# The original names stay bound to sets so any later code relying on
# set semantics (membership tests, set algebra) keeps working.
code_vocab = set(code_vocab_list)
doc_vocab = set(doc_vocab_list)

# Train
# Hyperparameters shared by both models, named once instead of repeated.
EMBED_DIM = 50
EPOCHS = 3

# Each call returns the trained model plus two lookup tables
# (named w2i_* / i2w_* here).
model_code, w2i_code, i2w_code = train_word2vec(
    code_pairs, code_vocab_list, embed_dim=EMBED_DIM, epochs=EPOCHS
)
model_doc, w2i_doc, i2w_doc = train_word2vec(
    doc_pairs, doc_vocab_list, embed_dim=EMBED_DIM, epochs=EPOCHS
)

Epoch 1, Loss: 5751112.5626
Epoch 2, Loss: 5747772.9629


KeyboardInterrupt: 