# 07 — Word2Vec Embeddings

Train Word2Vec (Skip-gram) embeddings on the training split of each pipeline (`standard` and `irony`) and generate document-level vectors by averaging the word vectors of each document.
These vectors will serve as input features for FFN, CNN, and RNN models.

**Output**:
- `models/word2vec/{standard,irony}/word2vec.model`
- `models/word2vec/{standard,irony}/doc_vectors_{train,test}.npy`
- `models/word2vec/{standard,irony}/labels_{train,test}.npy`

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import os

In [2]:
%load_ext watermark
%watermark -v -n -m -p numpy,pandas,gensim

Python implementation: CPython
Python version       : 3.12.12
IPython version      : 9.10.0

numpy : 1.26.4
pandas: 3.0.0
gensim: 4.4.0

Compiler    : Clang 17.0.0 (clang-1700.6.3.2)
OS          : Darwin
Release     : 25.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit



## 1. Helper Functions

In [3]:
def tokenize(texts):
    """Split each text into tokens on runs of whitespace.

    Every item is converted with ``str()`` first, so non-string values are
    tokenized by their string representation.

    Args:
        texts: Iterable of texts (or values convertible with ``str``).

    Returns:
        A list with one list of tokens per input item, in input order.
    """
    tokenized = []
    for text in texts:
        tokenized.append(str(text).split())
    return tokenized

def doc_vector(tokens_list, model, vector_size):
    """Average the word vectors of each document into one fixed-size vector.

    Args:
        tokens_list: Iterable of token lists, one per document.
        model: Object exposing ``model.wv`` that supports ``token in model.wv``
            and ``model.wv[token]`` (e.g. a trained gensim ``Word2Vec``).
        vector_size: Dimensionality of the word vectors.

    Returns:
        ``np.ndarray`` of shape ``(n_documents, vector_size)`` and dtype
        ``float32``. A document with no in-vocabulary token gets an all-zero
        row. An empty ``tokens_list`` yields shape ``(0, vector_size)``.
    """
    docs = list(tokens_list)
    # Pre-allocating keeps the result 2-D and float32 in every case. Building
    # it with np.array() from a list returned shape (0,) for empty input and
    # silently upcast the whole matrix to float64 as soon as one document fell
    # back to the float64 zero vector.
    doc_vecs = np.zeros((len(docs), vector_size), dtype=np.float32)
    for row, tokens in enumerate(docs):
        word_vecs = [model.wv[w] for w in tokens if w in model.wv]
        if word_vecs:
            doc_vecs[row] = np.mean(word_vecs, axis=0)
        # else: leave the row as zeros (no known token in this document)
    return doc_vecs

## 2. Train Word2Vec & Generate Document Vectors

In [4]:
# --- Word2Vec hyperparameters -------------------------------------------------
VECTOR_SIZE = 100   # dimensionality of the word (and document) vectors
WINDOW = 5          # context window size
MIN_COUNT = 2       # ignore tokens seen fewer than this many times
EPOCHS = 30         # training passes over the training tokens
SEED = 42           # RNG seed passed to Word2Vec
# gensim documents that a seeded run is only fully deterministic with a single
# worker thread; with several workers, thread scheduling reorders the updates
# and the saved vectors differ between runs. Reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED.
WORKERS = 1

# Pipeline name -> directory holding that pipeline's train.csv / test.csv
pipelines = {
    'standard': '../data/processed/standard',
    'irony':    '../data/processed/irony'
}

for name, data_dir in pipelines.items():
    print(f"\n{'='*20} Pipeline: {name} {'='*20}")

    # Load data; missing values become '' so empty texts tokenize to []
    train_df = pd.read_csv(f'{data_dir}/train.csv').fillna('')
    test_df  = pd.read_csv(f'{data_dir}/test.csv').fillna('')

    # Tokenize
    train_tokens = tokenize(train_df['text_clean'])
    test_tokens  = tokenize(test_df['text_clean'])

    # Train Word2Vec (Skip-gram) on the training split only, so the test split
    # does not influence the embeddings
    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=VECTOR_SIZE,
        window=WINDOW,
        min_count=MIN_COUNT,
        sg=1,  # Skip-gram
        workers=WORKERS,
        epochs=EPOCHS,
        seed=SEED
    )

    print(f"Vocabulary size: {len(w2v_model.wv)}")

    # Generate document vectors (mean of in-vocabulary word vectors)
    train_vecs = doc_vector(train_tokens, w2v_model, VECTOR_SIZE)
    test_vecs  = doc_vector(test_tokens, w2v_model, VECTOR_SIZE)

    print(f"Train vectors shape: {train_vecs.shape}")
    print(f"Test vectors shape:  {test_vecs.shape}")

    # Save model, document vectors and labels for the downstream notebooks
    output_dir = f'../models/word2vec/{name}'
    os.makedirs(output_dir, exist_ok=True)

    w2v_model.save(f'{output_dir}/word2vec.model')
    np.save(f'{output_dir}/doc_vectors_train.npy', train_vecs)
    np.save(f'{output_dir}/doc_vectors_test.npy', test_vecs)
    np.save(f'{output_dir}/labels_train.npy', train_df['label'].values)
    np.save(f'{output_dir}/labels_test.npy', test_df['label'].values)

    print(f"Saved to {output_dir}")




Vocabulary size: 2247
Train vectors shape: (2100, 100)
Test vectors shape:  (450, 100)
Saved to ../models/word2vec/standard



Vocabulary size: 2238
Train vectors shape: (2100, 100)
Test vectors shape:  (450, 100)
Saved to ../models/word2vec/irony


## 3. Inspect Embeddings

In [5]:
# Sanity-check the standard-pipeline embeddings: print the five nearest
# neighbours of a few key domain terms, or report terms missing from the vocabulary.
w2v_standard = Word2Vec.load('../models/word2vec/standard/word2vec.model')

probe_words = ['droga', 'marihuana', 'fumar', 'porro', 'quiero']
for probe in probe_words:
    if probe not in w2v_standard.wv:
        print(f"\n'{probe}' not in vocabulary")
        continue
    # most_similar yields (token, similarity) pairs; only the tokens are shown
    neighbours = [token for token, _score in w2v_standard.wv.most_similar(probe, topn=5)]
    print(f"\n'{probe}' → {neighbours}")


'droga' → [':marca_de_cruz:', 'droga.', 'conducir', 'otras', 'eres']

'marihuana' → ['multado', 'francia', 'pesos', 'todxs', 'justicia']

'fumar' → ['ácido,', 'mota', 'podemos', 'comer', 'drogadicta']

'porro' → ['porro.', 'un', 'hice', 'olvide', 'gordo']

'quiero' → ['ah', 'falta', 'siente', 'nose', 'pase']


## Summary
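Word2Vec embeddings were trained on both pipelines, and the document vectors were saved as `.npy` files for use by the FFN, CNN, and RNN notebooks. Note that the nearest-neighbour output above contains punctuation-attached tokens (e.g. `droga.`, `porro.`), because the whitespace tokenizer does not strip punctuation.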