# Word2Vec with Hyperparameter Optimization

Unlock the power of word embeddings with our Word2Vec implementation using the CBOW algorithm! 🌊 This project is designed for easy use and optimization. Here’s how to get started:

- **🚀 Quick Setup**: Clone the repository and install the required dependencies.
- **📊 Data Preparation**: Place the text8 corpus at `../data/text8`; the notebook tokenizes it on whitespace and keeps only words that appear at least 5 times.
- **🔧 Hyperparameter Tuning**: Adjust parameters like learning rate, batch size, and embedding dimensions to optimize performance.
- **📈 Experiment Tracking**: Use Weights & Biases (WandB) to track your experiments and visualize results effortlessly.
- **🎉 Run the Model**: Execute the training script to build and fine-tune your Word2Vec model.

Join us on this exciting journey of building and optimizing a powerful Word2Vec model! ✨

## **Step 1**: Setting Up the Training Parameters

In [2]:
# Cap on the number of (context, target) pairs built for each sweep run.
MAX_EXAMPLES = 1000000

# Search-space bounds sampled by the sweep (all drawn log-uniformly).
MIN_LR = 0.01
MAX_LR = 0.2
MIN_BATCH_SIZE = 200
MAX_BATCH_SIZE = 400
MIN_EMBEDDING_DIM = 512
MAX_EMBEDDING_DIM = 1024
# Window size counts words on EACH side of the target word.
MIN_WINDOW_SIZE = 2
MAX_WINDOW_SIZE = 4
MIN_WEIGHT_DECAY = 1e-6
MAX_WEIGHT_DECAY = 1e-4

# Training stops early once the epoch-average loss drops below this value.
MIN_LOSS_THRESHOLD = 2.0
# Upper bound on epochs per run.
MAX_EPOCHS = 10
# Max gradient norm used for clipping in the training loop.
MAX_GRAD_NORM = 1.0 
# Momentum for the SGD fallback optimizer only.
MOMENTUM = 0.9
# Number of runs the sweep agent executes.
SWEEP_COUNT = 2
# ReduceLROnPlateau settings: multiply the LR by FACTOR after PATIENCE
# epochs without improvement.
LR_SCHEDULER_FACTOR = 0.5
LR_SCHEDULER_PATIENCE = 2

# Placeholder path; the fine-tuning step assigns this name again before use.
# NOTE(review): the save step writes under "../artifacts/..." while this path
# starts at "artifacts/..." - confirm which working directory is intended.
FOLDER_TO_CONTINUE_TRAINING = "artifacts/word2vec_trained_model/..." 

# Optimizer names offered to the sweep; the training function also
# recognises 'adam' and treats any other value as SGD.
OPTIMIZERS = ['adamw']

## **Step 2**: Data Preparation

In [3]:
# Load and process text8 data
import collections

# Read the text8 dataset as one whitespace-separated token stream.
# The encoding is pinned so the read does not depend on the platform's
# default locale encoding.
with open('../data/text8', 'r', encoding='utf-8') as f:
    words = f.read().split()

print(f"Total words: {len(words):,}")

# Build vocabulary (words appearing at least 5 times).
# `vocabulary` maps word -> corpus count; indices follow its insertion order.
word_counts = collections.Counter(words)
vocabulary = {word: count for word, count in word_counts.items() if count >= 5}
word_to_index = {word: i for i, word in enumerate(vocabulary)}
index_to_word = {i: word for word, i in word_to_index.items()}

print(f"Vocabulary size: {len(vocabulary):,}")

# Convert the corpus to vocabulary indices. Words below the frequency
# cut-off are dropped; no other subsampling is applied here.
indexed_words = [word_to_index[word] for word in words if word in word_to_index]

print(f"Training words: {len(indexed_words):,}")
print(f"Sample words: {words[:10]}")

Total words: 17,005,207
Vocabulary size: 71,290
Training words: 16,718,844
Sample words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [4]:
import torch

def create_training_data(indexed_words, window_size, max_examples=None):
    """Build CBOW (context, target) pairs from a sequence of word indices.

    For every position ``i`` that has ``window_size`` neighbours on both
    sides, the context is the ``window_size`` indices before ``i`` followed
    by the ``window_size`` indices after it, and the target is the index at
    ``i``.

    Args:
        indexed_words: list of integer word indices (the corpus).
        window_size: number of context words taken on each side of the
            target; converted with ``int()``.
        max_examples: optional cap on the number of pairs. Pairs are taken
            from the start of the corpus.

    Returns:
        ``(contexts, targets)`` as ``torch.long`` tensors of shape
        ``(num_examples, 2 * window_size)`` and ``(num_examples,)``. When the
        corpus is too short for a single window both tensors are empty but
        keep that dtype and rank.

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    """
    window_size = int(window_size)
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    # Exclusive upper bound on the target position.
    end_idx = len(indexed_words) - window_size
    if max_examples is not None:
        end_idx = min(end_idx, max_examples + window_size)

    num_examples = max(0, end_idx - window_size)
    if num_examples == 0:
        return (torch.empty((0, 2 * window_size), dtype=torch.long),
                torch.empty((0,), dtype=torch.long))

    # Only the prefix that some window actually touches is converted.
    tokens = torch.tensor(indexed_words[:end_idx + window_size], dtype=torch.long)

    # Row k is the full window centred on position k + window_size, so the
    # sliding windows replace the per-example Python loop.
    windows = tokens.unfold(0, 2 * window_size + 1, 1)

    contexts = torch.cat(
        (windows[:, :window_size], windows[:, window_size + 1:]), dim=1
    )
    # clone() so the result does not keep the whole token buffer alive.
    targets = windows[:, window_size].clone()

    return contexts, targets


## **Step 3**: NN Model for training

In [5]:
# Simple Word2Vec Model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class SimpleWord2Vec(nn.Module):
    """CBOW-style network: embed context words, pool them, predict the target.

    The context embeddings are averaged, passed through ``tanh`` and projected
    by a linear layer to one logit per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

        # Xavier-uniform for both weight matrices, zero bias.
        for weight in (self.embeddings.weight, self.linear.weight):
            nn.init.xavier_uniform_(weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, context):
        """Return vocabulary logits for a batch of context index rows."""
        # Pool over dim 1, i.e. across the context words of each example.
        pooled = self.embeddings(context).mean(dim=1)
        return self.linear(torch.tanh(pooled))

print("✅ Model functions ready!")

✅ Model functions ready!


## **Step 4**: Hyperparameter tuning

In [6]:
# Hyperparameter Sweep
import wandb

def log_uniform_param(min_value, max_value):
    """Return a sweep parameter spec sampled log-uniformly in [min_value, max_value]."""
    return dict(
        distribution='log_uniform_values',
        min=min_value,
        max=max_value,
    )

# Sweep definition: Bayesian search that minimises the logged 'best_loss'.
# batch_size, embedding_dim and window_size are sampled as floats here and
# rounded to integers inside the training function.
sweep_config = dict(
    method='bayes',
    metric=dict(name='best_loss', goal='minimize'),
    parameters=dict(
        learning_rate=log_uniform_param(MIN_LR, MAX_LR),
        batch_size=log_uniform_param(MIN_BATCH_SIZE, MAX_BATCH_SIZE),
        embedding_dim=log_uniform_param(MIN_EMBEDDING_DIM, MAX_EMBEDDING_DIM),
        window_size=log_uniform_param(MIN_WINDOW_SIZE, MAX_WINDOW_SIZE),
        optimizer=dict(values=OPTIMIZERS),  # categorical choice
        weight_decay=log_uniform_param(MIN_WEIGHT_DECAY, MAX_WEIGHT_DECAY),
    ),
)

In [7]:
def _build_optimizer(name, parameters, learning_rate, weight_decay):
    """Return the optimizer selected by the sweep's ``optimizer`` value."""
    if name == 'adam':
        return optim.Adam(parameters, lr=learning_rate, weight_decay=weight_decay)
    if name == 'adamw':
        return optim.AdamW(parameters, lr=learning_rate, weight_decay=weight_decay)
    # Any other value falls back to SGD with momentum.
    return optim.SGD(parameters, lr=learning_rate, momentum=MOMENTUM,
                     weight_decay=weight_decay)


def train_model():
    """Train one model for a wandb sweep run and return its best epoch loss.

    Reads the sampled hyperparameters from ``wandb.config``, builds the
    training pairs, and trains for up to ``MAX_EPOCHS`` epochs, stopping early
    once the epoch-average loss drops below ``MIN_LOSS_THRESHOLD``. Per-epoch
    metrics are logged to wandb. The trained weights are not saved anywhere by
    this function; only metrics are recorded.

    Returns:
        The lowest epoch-average loss seen during the run.
    """
    # Using the run as a context manager guarantees it is finished (and
    # marked as failed) if anything below raises, instead of staying open.
    with wandb.init():
        config = wandb.config

        # The sweep samples these as floats; the code needs integers.
        batch_size = int(round(config.batch_size))
        embedding_dim = int(round(config.embedding_dim))
        window_size = int(round(config.window_size))

        print(f"Testing: lr={config.learning_rate:.4f}, batch={batch_size}, "
              f"embed={embedding_dim}, window={window_size}")

        # Create training data
        contexts, targets = create_training_data(
            indexed_words,
            window_size,
            max_examples=MAX_EXAMPLES
        )

        print(f"💪 Training with {len(contexts):,} examples (not all available data!)")

        # Setup training
        dataset = TensorDataset(contexts, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = SimpleWord2Vec(len(vocabulary), embedding_dim).to(device)

        optimizer = _build_optimizer(config.optimizer, model.parameters(),
                                     config.learning_rate, config.weight_decay)

        # Halve-on-plateau style schedule driven by the epoch-average loss.
        # The deprecated `verbose` flag is not passed; the current learning
        # rate is logged to wandb each epoch instead.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=LR_SCHEDULER_FACTOR,
            patience=LR_SCHEDULER_PATIENCE,
        )

        loss_function = nn.CrossEntropyLoss()

        # Training loop
        model.train()
        best_loss = float('inf')
        first_below_threshold_epoch = None

        for epoch in range(MAX_EPOCHS):
            epoch_loss = 0

            for batch_contexts, batch_targets in dataloader:
                batch_contexts = batch_contexts.to(device)
                batch_targets = batch_targets.to(device)

                optimizer.zero_grad()
                outputs = model(batch_contexts)
                loss = loss_function(outputs, batch_targets)
                loss.backward()

                # Gradient clipping for stability
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRAD_NORM)
                optimizer.step()

                epoch_loss += loss.item()

            avg_loss = epoch_loss / len(dataloader)
            best_loss = min(best_loss, avg_loss)

            # 'lr' is the rate used during this epoch (read before the
            # scheduler gets a chance to lower it).
            wandb.log({
                'epoch': epoch,
                'avg_loss': avg_loss,
                'best_loss': best_loss,
                'lr': optimizer.param_groups[0]['lr']
            })

            scheduler.step(avg_loss)

            # Early stop on convergence; the loop ends at the first epoch
            # below the threshold, so that epoch is also the one recorded.
            if avg_loss < MIN_LOSS_THRESHOLD:
                first_below_threshold_epoch = epoch
                break

        # Log the first epoch below the threshold
        if first_below_threshold_epoch is not None:
            wandb.log({'first_below_threshold_epoch': first_below_threshold_epoch})

        # Final result
        wandb.log({'final_best_loss': best_loss})

    return best_loss

print("🎯 Sweep configuration ready!")
print(f"Will test: {list(sweep_config['parameters'].keys())}")

🎯 Sweep configuration ready!
Will test: ['learning_rate', 'batch_size', 'embedding_dim', 'window_size', 'optimizer', 'weight_decay']


In [None]:
import os
from dotenv import load_dotenv
import wandb

# Load variables from a local .env file into the process environment.
load_dotenv()

# os.getenv returns None when WANDB_API_KEY is unset; that value is passed
# to wandb.login unchanged.
wandb_api_key = os.getenv('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

# Run the hyperparameter sweep
print("🚀 Starting hyperparameter sweep...")

# Register the sweep in the "word2vec" project and keep its ID; the
# model-saving step later stores this `sweep_id` alongside the model.
sweep_id = wandb.sweep(sweep_config, project="word2vec")
print(f"📊 Sweep ID: {sweep_id}")
print(f"🌐 Monitor at: https://wandb.ai/{wandb.api.default_entity}/word2vec/sweeps/{sweep_id}")

# Run SWEEP_COUNT experiments, each one a call to train_model
wandb.agent(sweep_id, train_model, count=SWEEP_COUNT)

print("✅ Sweep complete!")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/vscode/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoaopaesteves99[0m ([33mjoaopaesteves99-opensc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


🚀 Starting hyperparameter sweep...
Create sweep with ID: a7zt787a
Sweep URL: https://wandb.ai/joaopaesteves99-opensc/word2vec/sweeps/a7zt787a
📊 Sweep ID: a7zt787a
🌐 Monitor at: https://wandb.ai/joaopaesteves99-opensc/word2vec/sweeps/a7zt787a


[34m[1mwandb[0m: Agent Starting Run: g7xw85nw with config:
[34m[1mwandb[0m: 	batch_size: 305.9923764978384
[34m[1mwandb[0m: 	embedding_dim: 1006.9338913366272
[34m[1mwandb[0m: 	learning_rate: 0.03377521654937487
[34m[1mwandb[0m: 	optimizer: adamw
[34m[1mwandb[0m: 	weight_decay: 4.292177166716253e-05
[34m[1mwandb[0m: 	window_size: 2.8462799369857383


Testing: lr=0.0338, batch=306, embed=1007, window=3
💪 Training with 10,000,000 examples (not all available data!)


In [None]:
# Get the best parameters from local wandb runs (works without internet)
def _read_run_summary(run_dir):
    """Return the parsed wandb-summary.json of a run directory, or None if absent."""
    import json
    import os

    summary_file = os.path.join(run_dir, "files", "wandb-summary.json")
    if not os.path.exists(summary_file):
        return None
    with open(summary_file, 'r') as f:
        return json.load(f)


def _load_run_config(config_file):
    """Return the user-level config values stored in a run's config.yaml."""
    # Imported lazily so the run scan works even when no config is ever read.
    import yaml

    with open(config_file, 'r') as f:
        # safe_load yields None for an empty file.
        config_data = yaml.safe_load(f) or {}

    run_config = {}
    for key, value_info in config_data.items():
        if key.startswith('_'):  # Skip internal wandb keys
            continue
        # Entries are either {'value': ...} wrappers or plain values.
        if isinstance(value_info, dict) and 'value' in value_info:
            run_config[key] = value_info['value']
        else:
            run_config[key] = value_info
    return run_config


def get_best_parameters_and_model_from_local(sweep_id=None):
    """Pick the local wandb run with the lowest 'best_loss' and rebuild its model.

    Scans ``wandb/run-*`` directories under the current working directory,
    reads each run's summary, and selects the run with the smallest
    ``best_loss``. The hyperparameters of that run are read from its
    ``config.yaml``.

    NOTE: the returned model is constructed fresh with the best run's
    embedding size. No weights are loaded here, so it carries its initial
    (untrained) parameters.

    Args:
        sweep_id: optional filter string; only run directories whose name
            contains it are considered. NOTE(review): the match is against
            the directory name only - confirm a sweep ID actually appears
            there, otherwise pass a run ID or directory name.

    Returns:
        ``(best_config, model, run)`` where ``best_config`` is a dict with
        ``batch_size``, ``embedding_dim`` and ``window_size`` rounded to int,
        ``model`` is a new ``SimpleWord2Vec`` on the available device, and
        ``run`` exposes ``.name`` and ``.summary``. Returns
        ``(None, None, None)`` when no usable run or config is found.
    """
    import glob
    import os

    # Find all local wandb runs
    wandb_dir = "wandb"
    run_dirs = glob.glob(os.path.join(wandb_dir, "run-*"))

    if sweep_id:
        print(f"🎯 Looking for best run with sweep/run ID filter: {sweep_id}")
        print(f"📊 Found {len(run_dirs)} total local wandb runs...")
    else:
        print("🔍 Analyzing ALL local wandb runs...")
        print(f"📊 Found {len(run_dirs)} local wandb runs...")

    best_run_info = None
    best_loss = float('inf')
    best_run_dir = None
    matching_runs = 0

    for run_dir in run_dirs:
        run_name = os.path.basename(run_dir)

        # If a filter is given, skip runs whose directory name lacks it.
        if sweep_id and sweep_id not in run_name:
            continue

        try:
            summary = _read_run_summary(run_dir)
            if summary is None:
                continue

            # Runs that never logged best_loss count as infinitely bad.
            loss = summary.get('best_loss', float('inf'))

            sweep_indicator = " ✅" if sweep_id else ""
            print(f"  📋 {run_name}: loss = {loss:.4f}{sweep_indicator}")
        except (OSError, ValueError, TypeError) as e:
            # Unreadable file, invalid JSON, or a non-numeric loss value.
            print(f"  ⚠️  Could not read run {run_name}: {e}")
            continue

        matching_runs += 1
        if loss < best_loss:
            best_loss = loss
            best_run_dir = run_dir
            best_run_info = summary

    if best_run_dir is None:
        print("❌ No successful runs found in local wandb directory!")
        print(f"   Checked directory: {wandb_dir}")
        print(f"   Found {len(run_dirs)} run directories")
        return None, None, None

    run_name = os.path.basename(best_run_dir)
    if sweep_id:
        print(f"\n🏆 BEST RUN FOUND (filtered by '{sweep_id}'): {run_name} (Loss: {best_loss:.4f})")
        print(f"   📊 Matching runs: {matching_runs} out of {len(run_dirs)} total")
    else:
        print(f"\n🏆 BEST RUN FOUND (from all runs): {run_name} (Loss: {best_loss:.4f})")
    print(f"   📁 Run directory: {best_run_dir}")

    # Read the config file
    config_file = os.path.join(best_run_dir, "files", "config.yaml")
    if not os.path.exists(config_file):
        print(f"❌ Config file not found: {config_file}")
        return None, None, None

    best_config = _load_run_config(config_file)

    # The sweep stores these as floats; the model and loader need integers.
    integer_keys = ('batch_size', 'embedding_dim', 'window_size')
    missing = [key for key in integer_keys if key not in best_config]
    if missing:
        print(f"❌ Run config is missing expected parameters: {missing}")
        return None, None, None
    for key in integer_keys:
        best_config[key] = int(round(best_config[key]))

    print("📋 Best Config:")
    for key, value in best_config.items():
        print(f"   - {key}: {value}")

    # Check if we reached the threshold
    first_below_threshold = best_run_info.get('first_below_threshold_epoch', None)
    if first_below_threshold is not None:
        print(f"   ✅ Reached <{MIN_LOSS_THRESHOLD} at Epoch: {first_below_threshold}")

    print("\n🔄 Creating model with best parameters...")

    # New model instance sized like the best run (weights are NOT restored).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SimpleWord2Vec(len(vocabulary), best_config['embedding_dim']).to(device)

    print(f"✅ Model created with best parameters on {device}")
    print(f"   - Embedding dim: {best_config['embedding_dim']}")
    print(f"   - Vocab size: {len(vocabulary):,}")
    print(f"   - Final loss: {best_loss:.4f}")

    # Minimal stand-in for a wandb run: just the name and summary dict.
    class MockRun:
        def __init__(self, run_dir, summary):
            self.name = os.path.basename(run_dir)
            self.summary = summary

    mock_run = MockRun(best_run_dir, best_run_info)

    return best_config, model, mock_run

# Get best parameters and model from local wandb runs
# You can optionally specify a sweep_id or run identifier to filter results:
# best_config, best_model, best_run = get_best_parameters_and_model_from_local("vtjlni2h")  # specific sweep
# best_config, best_model, best_run = get_best_parameters_and_model_from_local("gbvymzcf")  # specific run suffix
print("🔍 Searching for best model in local wandb runs...")
best_config, best_model, best_run = get_best_parameters_and_model_from_local()


In [None]:
# Select the one to store
best_config, best_model, best_run = get_best_parameters_and_model_from_local('run-20250615_163750-lo5zpuyu')

## **Step 5**: Analyze the Best Model

In [None]:
import numpy as np

# Use the best model from hyperparameter sweep and run analysis
if best_config and best_model:
    print("🎯 Using best model from hyperparameter sweep for analysis...")
    
    # Get embeddings from the best model (this is from the sweep, already trained)
    embeddings = best_model.embeddings.weight.detach().cpu().numpy()
    print(f"📊 Embeddings shape: {embeddings.shape}")
    
    # Extract training information from the best run
    final_loss = best_run.summary.get('best_loss', 'Unknown')
    total_epochs = best_run.summary.get('epoch', 'Unknown') + 1 if best_run.summary.get('epoch') is not None else 'Unknown'
    
    print(f"📈 Best model loss: {final_loss}")
    print(f"📈 Training epochs: {total_epochs}")
    
    # Word similarity analysis function
    def find_similar_words(word, embeddings, word_to_index, index_to_word, top_k=5):
        """Find similar words using cosine similarity"""
        if word not in word_to_index:
            return f"'{word}' not in vocabulary"
        
        word_idx = word_to_index[word]
        word_embed = embeddings[word_idx]
        
        # Cosine similarity
        similarities = np.dot(embeddings, word_embed) / (
            np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_embed)
        )
        
        # Get top similar words
        top_indices = np.argsort(similarities)[::-1][1:top_k+1]  # Skip the word itself
        
        similar_words = [(index_to_word[idx], similarities[idx]) 
                        for idx in top_indices]
        return similar_words
    
    # Comprehensive word similarity test
    test_categories = {
        'Animals': ['dog', 'cat', 'bird', 'fish', 'horse'],
        'Countries': ['america', 'england', 'france', 'china', 'japan'],
        'Numbers': ['one', 'two', 'three', 'four', 'five'],
        'Colors': ['red', 'blue', 'green', 'yellow', 'black'],
        'Technology': ['computer', 'internet', 'software', 'technology'],
        'Emotions': ['happy', 'sad', 'love', 'angry', 'fear']
    }
    
    print("\n🔍 COMPREHENSIVE WORD SIMILARITY ANALYSIS")
    print("=" * 60)
    
    all_similarity_results = {}
    
    for category, words in test_categories.items():
        print(f"\n📋 {category.upper()}:")
        category_results = {}
        
        for word in words:
            similar = find_similar_words(word, embeddings, word_to_index, index_to_word, 5)
            if isinstance(similar, str):
                print(f"  {word}: {similar}")
                category_results[word] = similar
            else:
                similar_str = ', '.join([f"{w}({s:.3f})" for w, s in similar])
                print(f"  {word}: {similar_str}")
                category_results[word] = similar_str
        
        all_similarity_results[category] = category_results
    
    # Analogy testing (if possible)
    def word_analogy(word1, word2, word3, embeddings, word_to_index, index_to_word, top_k=5):
        """Perform word analogy: word1 is to word2 as word3 is to ?"""
        try:
            if not all(w in word_to_index for w in [word1, word2, word3]):
                missing = [w for w in [word1, word2, word3] if w not in word_to_index]
                return f"Words not in vocabulary: {missing}"
            
            # Get embeddings
            v1 = embeddings[word_to_index[word1]]
            v2 = embeddings[word_to_index[word2]]
            v3 = embeddings[word_to_index[word3]]
            
            # Calculate: word2 - word1 + word3
            target_vector = v2 - v1 + v3
            
            # Find most similar words
            similarities = np.dot(embeddings, target_vector) / (
                np.linalg.norm(embeddings, axis=1) * np.linalg.norm(target_vector)
            )
            
            # Get top candidates (excluding input words)
            exclude_indices = {word_to_index[w] for w in [word1, word2, word3]}
            candidates = []
            
            for idx in np.argsort(similarities)[::-1]:
                if idx not in exclude_indices:
                    candidates.append((index_to_word[idx], similarities[idx]))
                    if len(candidates) >= top_k:
                        break
            
            return candidates
        except Exception as e:
            return f"Error: {str(e)}"
    
    print(f"\n🧠 WORD ANALOGIES")
    print("=" * 40)
    
    analogy_tests = [
        ("king", "queen", "man"),  # king is to queen as man is to ?
        ("big", "bigger", "small"),  # big is to bigger as small is to ?
        ("good", "better", "bad"),  # good is to better as bad is to ?
    ]
    
    for word1, word2, word3 in analogy_tests:
        result = word_analogy(word1, word2, word3, embeddings, word_to_index, index_to_word)
        if isinstance(result, str):
            print(f"  {word1}:{word2} :: {word3}:? -> {result}")
        else:
            candidates = ', '.join([f"{w}({s:.3f})" for w, s in result[:3]])
            print(f"  {word1}:{word2} :: {word3}:? -> {candidates}")
    
    print(f"\n✅ Analysis complete! Model ready for saving and visualization.")
    
else:
    print("❌ No best config found. Run the sweep first!")

## **Step 6**: Save Best Model and Embeddings

In [None]:
# 💾 SAVE BEST MODEL AND EMBEDDINGS FOR FUTURE USE
# Persist embeddings, vocabulary, model state and a JSON summary of the
# selected model. Requires `embeddings`, `final_loss` and `total_epochs`
# from the analysis step.
if best_config and best_model and 'embeddings' in locals():
    print("💾 Saving best Word2Vec model and embeddings...")
    print("=" * 60)

    import pickle
    import json
    from datetime import datetime
    import os

    # One clock reading so the folder name and stored dates agree.
    saved_at = datetime.now()
    timestamp = saved_at.strftime("%Y%m%d_%H%M%S")
    training_date = saved_at.isoformat()

    # `sweep_id` only exists if the sweep was launched in this session; when
    # the best run was picked from local files it may be undefined.
    saved_sweep_id = globals().get('sweep_id')

    # Create a directory for saved models
    save_dir = os.path.join("../artifacts/word2vec_trained_model", f"best_run_{timestamp}")
    os.makedirs(save_dir, exist_ok=True)

    # 1. Save the embeddings as numpy array
    embeddings_path = os.path.join(save_dir, "word2vec_embeddings.npy")
    np.save(embeddings_path, embeddings)
    print(f"✅ Saved embeddings: {embeddings_path}")
    print(f"   Shape: {embeddings.shape}")

    # 2. Save vocabulary mappings
    vocab_path = os.path.join(save_dir, "word2vec_vocab.pkl")
    vocab_data = {
        'word_to_index': word_to_index,
        'index_to_word': index_to_word,
        'vocabulary': vocabulary,
        'vocab_size': len(vocabulary)
    }

    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab_data, f)
    print(f"✅ Saved vocabulary: {vocab_path}")
    print(f"   Vocabulary size: {len(vocabulary):,} words")

    # 3. Save the full model state together with the settings needed to
    #    rebuild the architecture and resume training.
    model_path = os.path.join(save_dir, "word2vec_model.pth")
    model_save_data = {
        'model_state_dict': best_model.state_dict(),
        'model_config': {
            'vocab_size': len(vocabulary),
            'embedding_dim': best_config['embedding_dim'],
            'architecture': 'SimpleWord2Vec_CBOW'
        },
        'training_config': best_config,
        'training_info': {
            'final_loss': final_loss,
            'total_epochs': total_epochs,
            # Configured cap, not a measured count of examples.
            'training_examples': MAX_EXAMPLES,
            'training_date': training_date,
            'sweep_id': saved_sweep_id,
            'best_run_name': best_run.name
        }
    }

    torch.save(model_save_data, model_path)
    print(f"✅ Saved model state: {model_path}")

    # 4. Save human-readable configuration
    config_path = os.path.join(save_dir, "model_info.json")
    model_info = {
        "model_name": "SimpleWord2Vec_CBOW_BestFromSweep",
        "embedding_dimensions": best_config['embedding_dim'],
        "vocabulary_size": len(vocabulary),
        "training_examples": MAX_EXAMPLES,
        "final_loss": final_loss,
        "training_epochs": total_epochs,
        "sweep_id": saved_sweep_id,
        "best_run_name": best_run.name,
        "best_hyperparameters": {
            "learning_rate": best_config['learning_rate'],
            "batch_size": best_config['batch_size'],
            "window_size": best_config['window_size'],
            "optimizer": best_config['optimizer'],
            "weight_decay": best_config['weight_decay']
        },
        "dataset": "text8",
        "training_date": training_date,
        "usage_instructions": {
            "load_embeddings": "embeddings = np.load('word2vec_embeddings.npy')",
            "load_vocab": "with open('word2vec_vocab.pkl', 'rb') as f: vocab = pickle.load(f)",
            "load_model": "checkpoint = torch.load('word2vec_model.pth')"
        }
    }

    with open(config_path, 'w') as f:
        json.dump(model_info, f, indent=2)
    print(f"✅ Saved model info: {config_path}")

    # Save the save directory path for visualization
    SAVED_MODEL_DIR = save_dir
    print(f"\n🎯 Model saved to: {SAVED_MODEL_DIR}")
    print("   This path will be used for visualization in the next step.")

else:
    print("❌ No model to save. Make sure the analysis step completed successfully.")

## **Step 7**: Visualize results

In [None]:
# 🎨 VISUALIZE WORD EMBEDDINGS FROM BEST MODEL
# Plot a hand-picked set of words in 3D (t-SNE and PCA) from the files
# written by the save step.
if 'SAVED_MODEL_DIR' in locals():
    print("🎨 Creating visualizations of word embeddings...")
    print("=" * 60)

    import os  # used for the paths below; not otherwise imported in this cell
    import numpy as np
    import pickle
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA

    # Load embeddings and vocabulary from saved model
    print(f"📂 Loading from: {SAVED_MODEL_DIR}")

    # Load embeddings
    viz_embeddings_path = os.path.join(SAVED_MODEL_DIR, "word2vec_embeddings.npy")
    viz_embeddings_all = np.load(viz_embeddings_path)
    print(f"✅ Loaded embeddings: {viz_embeddings_all.shape}")

    # Load vocabulary (a pickle this notebook wrote itself in the save step;
    # do not point this at pickle files from untrusted sources)
    viz_vocab_path = os.path.join(SAVED_MODEL_DIR, "word2vec_vocab.pkl")
    with open(viz_vocab_path, 'rb') as f:
        viz_vocab_data = pickle.load(f)

    viz_word_to_index = viz_vocab_data['word_to_index']
    viz_index_to_word = viz_vocab_data['index_to_word']
    print(f"✅ Loaded vocabulary: {len(viz_word_to_index):,} words")

    # Words to visualize
    viz_words = [
        'dog', 'cat', 'bird', 'fish', 'horse', 'cow', 'pig',
        'red', 'blue', 'green', 'yellow', 'black', 'white',
        'one', 'two', 'three', 'four', 'five', 'six',
        'america', 'england', 'france', 'china', 'city', 'country',
        'computer', 'internet', 'digital', 'software', 'technology',
        'happy', 'sad', 'love', 'angry', 'fear', 'joy'
    ]

    # Word categories
    word_categories = {
        'Animals': ['dog', 'cat', 'bird', 'fish', 'horse', 'cow', 'pig'],
        'Colors': ['red', 'blue', 'green', 'yellow', 'black', 'white'],
        'Numbers': ['one', 'two', 'three', 'four', 'five', 'six'],
        'Places': ['america', 'england', 'france', 'china', 'city', 'country'],
        'Tech': ['computer', 'internet', 'digital', 'software', 'technology'],
        'Emotions': ['happy', 'sad', 'love', 'angry', 'fear', 'joy']
    }

    # Color scheme: one color per category, cycling if categories outnumber colors
    colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'gray']

    # Resolve each word's color once. setdefault keeps the first category a
    # word appears in; words in no category fall back to black when plotted.
    word_colors = {}
    legend_elements = []
    for cat_index, (cat_name, cat_words) in enumerate(word_categories.items()):
        cat_color = colors[cat_index % len(colors)]
        for cat_word in cat_words:
            word_colors.setdefault(cat_word, cat_color)
        legend_elements.append(plt.Line2D([0], [0], marker='o', color='w',
                                          markerfacecolor=cat_color, markersize=8, label=cat_name))

    def plot_words_3d(points, labels, title, axis_labels):
        """Draw a labelled 3D scatter of `points` and return (figure, axes)."""
        figure = plt.figure(figsize=(15, 12))
        axes = figure.add_subplot(111, projection='3d')

        for label, (x, y, z) in zip(labels, points):
            axes.scatter(x, y, z, c=word_colors.get(label, 'black'), s=100, alpha=0.7)
            axes.text(x, y, z, label, fontsize=9)

        axes.set_title(title, fontsize=16)
        axes.set_xlabel(axis_labels[0])
        axes.set_ylabel(axis_labels[1])
        axes.set_zlabel(axis_labels[2])
        axes.legend(handles=legend_elements, loc='upper left')

        plt.tight_layout()
        return figure, axes

    # Filter available words
    available_words = [word for word in viz_words if word in viz_word_to_index]
    print(f"🔍 Found {len(available_words)} words in vocabulary for visualization")

    if len(available_words) < 10:
        print("❌ Not enough words for visualization")
    else:
        # Get embeddings for available words
        selected_embeddings = np.array([viz_embeddings_all[viz_word_to_index[word]] for word in available_words])
        print(f"📊 Visualization embeddings shape: {selected_embeddings.shape}")

        # 3D t-SNE; perplexity is capped below the number of plotted words
        print("🔄 Computing 3D t-SNE...")
        tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, len(available_words)-1))
        embeddings_3d = tsne.fit_transform(selected_embeddings)

        fig, ax = plot_words_3d(
            embeddings_3d,
            available_words,
            '3D Word Embeddings (t-SNE) - Best Model',
            ('Component 1', 'Component 2', 'Component 3'),
        )

        # Save t-SNE plot
        tsne_filename = f'{SAVED_MODEL_DIR}/embeddings_3d_tsne.png'
        plt.savefig(tsne_filename, dpi=300, bbox_inches='tight')
        print(f"✅ Saved t-SNE visualization: {tsne_filename}")
        plt.show()

        # 3D PCA
        print("🔄 Computing 3D PCA...")
        pca = PCA(n_components=3)
        embeddings_pca = pca.fit_transform(selected_embeddings)

        # Axis labels carry the share of variance each component explains.
        fig2, ax2 = plot_words_3d(
            embeddings_pca,
            available_words,
            '3D Word Embeddings (PCA) - Best Model',
            (f'PC1 ({pca.explained_variance_ratio_[0]:.1%})',
             f'PC2 ({pca.explained_variance_ratio_[1]:.1%})',
             f'PC3 ({pca.explained_variance_ratio_[2]:.1%})'),
        )

        # Save PCA plot
        pca_filename = f'{SAVED_MODEL_DIR}/embeddings_3d_pca.png'
        plt.savefig(pca_filename, dpi=300, bbox_inches='tight')
        print(f"✅ Saved PCA visualization: {pca_filename}")
        plt.show()

        print(f"📈 PCA explained variance: {sum(pca.explained_variance_ratio_):.1%}")
        print("\n🎉 All visualizations complete!")
        print(f"📁 Saved to model directory: {SAVED_MODEL_DIR}")
        print(f"   - {os.path.basename(tsne_filename)}")
        print(f"   - {os.path.basename(pca_filename)}")

else:
    print("❌ No saved model directory found. Please run the previous steps first.")

## **Step 8**: Fine-tuning model

In [None]:
# 🔄 LOAD TRAINED MODEL AND CONTINUE TRAINING
#
# Restores a previously saved Word2Vec run so training can be continued:
#   1. loads the pickled vocabulary mappings,
#   2. loads the model checkpoint and its stored configuration,
#   3. rebuilds the network and restores its weights,
#   4. builds a new optimizer from the stored hyperparameters,
#   5. regenerates the training DataLoader.
#
# Depends on names defined by earlier cells: SimpleWord2Vec,
# create_training_data, indexed_words, MAX_EXAMPLES and MOMENTUM.
print("🔄 Loading trained Word2Vec model for continued training...")
print("=" * 60)

import pickle
import json
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import wandb

# Path to saved model # Change this to your desired path
# NOTE(review): the save cell in this notebook writes under "../artifacts/...",
# while this path has no "../" prefix — confirm it resolves correctly from the
# notebook's working directory.
FOLDER_TO_CONTINUE_TRAINING = "artifacts/word2vec_trained_model/best_run_20250615_165915"

# 1. Load vocabulary
print("📖 Loading vocabulary...")
vocab_path = os.path.join(FOLDER_TO_CONTINUE_TRAINING, "word2vec_vocab.pkl")
# SECURITY: unpickling can execute arbitrary code stored in the file. Only
# point FOLDER_TO_CONTINUE_TRAINING at artifacts you produced yourself.
with open(vocab_path, 'rb') as f:
    vocab_data = pickle.load(f)

# These assignments replace the in-memory mappings built during data preparation.
word_to_index = vocab_data['word_to_index']
index_to_word = vocab_data['index_to_word']
vocabulary = vocab_data['vocabulary']

print(f"✅ Loaded vocabulary: {len(vocabulary):,} words")

# 2. Load model checkpoint
print("🔧 Loading model checkpoint...")
model_path = os.path.join(FOLDER_TO_CONTINUE_TRAINING, "word2vec_model.pth")
# Tensors are deserialized onto the CPU first; load_state_dict() below copies
# them into the model, which already lives on the target device.
# NOTE(review): the checkpoint also stores plain-Python config dicts. Recent
# PyTorch releases default torch.load to weights_only=True — confirm the
# stored objects are loadable under that default on your version.
checkpoint = torch.load(model_path, map_location='cpu')

model_config = checkpoint['model_config']
training_config = checkpoint['training_config']
training_info = checkpoint['training_info']

print("✅ Loaded model checkpoint")
print(f"   - Embedding dim: {model_config['embedding_dim']}")
print(f"   - Vocab size: {model_config['vocab_size']:,}")
print(f"   - Previous loss: {training_info['final_loss']:.4f}")
print(f"   - Previous epochs: {training_info['total_epochs']}")

# The embedding table is sized from the checkpoint, so a vocabulary file of a
# different size would map words onto the wrong (or non-existent) rows.
if len(vocabulary) != model_config['vocab_size']:
    print(f"⚠️  Vocabulary size ({len(vocabulary):,}) does not match "
          f"checkpoint vocab size ({model_config['vocab_size']:,})")

# 3. Recreate model architecture and load state
print("🏗️  Recreating model architecture...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleWord2Vec(model_config['vocab_size'], model_config['embedding_dim']).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

print(f"✅ Model loaded on {device}")

# 4. Set up optimizer with same parameters
# Only the hyperparameters are reused: no optimizer state is read from the
# checkpoint, so running statistics (Adam moments / SGD momentum buffers)
# start from scratch.
print("⚙️  Setting up optimizer...")
optimizer_name = training_config['optimizer']
if optimizer_name == 'adam':
    optimizer = optim.Adam(model.parameters(),
                           lr=training_config['learning_rate'],
                           weight_decay=training_config['weight_decay'])
elif optimizer_name == 'adamw':
    optimizer = optim.AdamW(model.parameters(),
                            lr=training_config['learning_rate'],
                            weight_decay=training_config['weight_decay'])
else:  # sgd (also the fallback for any unrecognised name)
    if optimizer_name != 'sgd':
        print(f"⚠️  Unknown optimizer '{optimizer_name}', falling back to SGD")
    # Use the notebook-wide MOMENTUM constant rather than a duplicated literal.
    optimizer = optim.SGD(model.parameters(),
                          lr=training_config['learning_rate'],
                          momentum=MOMENTUM,
                          weight_decay=training_config['weight_decay'])

print(f"✅ Optimizer: {training_config['optimizer']} (lr={training_config['learning_rate']})")

# 5. Create training data with same parameters
# NOTE(review): indexed_words comes from the data-preparation cell; it must
# have been encoded with the same word_to_index as the vocabulary loaded
# above — confirm before training.
print("📊 Creating training data...")
contexts, targets = create_training_data(
    indexed_words,
    training_config['window_size'],
    max_examples=MAX_EXAMPLES  # Same as before
)

dataset = TensorDataset(contexts, targets)
dataloader = DataLoader(dataset,
                        batch_size=training_config['batch_size'],
                        shuffle=True)

print(f"✅ Training data ready: {len(contexts):,} examples")
print(f"   - Window size: {training_config['window_size']}")
print(f"   - Batch size: {training_config['batch_size']}")

print("🚀 Ready for continued training!")


## **Step 9**: Saving the Fine-tuned Model

In [None]:
# 💾 SAVE BEST MODEL AND EMBEDDINGS FOR FUTURE USE
#
# Writes four artifacts into a new timestamped directory:
#   - word2vec_embeddings.npy : the `embeddings` array
#   - word2vec_vocab.pkl      : vocabulary mappings
#   - word2vec_model.pth      : state dict plus model/training metadata
#   - model_info.json         : human-readable summary
# and records the directory in SAVED_MODEL_DIR for the visualization cell.
#
# NOTE(review): this cell persists `best_model` / `best_config` / `embeddings`.
# The loading cell binds the restored network to `model`; confirm these names
# refer to the fine-tuned weights before relying on the saved files.

# Every notebook variable this cell reads. Checking for them up front means a
# skipped earlier step produces the friendly message below instead of a
# NameError raised while evaluating the condition.
_required_names = (
    'best_config',
    'best_model',
    'embeddings',
    'final_loss',
    'total_epochs',
    'sweep_id',
    'best_run',
)
missing_names = [name for name in _required_names if name not in globals()]

if not missing_names and best_config and best_model is not None:
    print("💾 Saving fine-tuned Word2Vec model and embeddings...")
    print("=" * 60)

    import pickle
    import json
    from datetime import datetime
    import os

    # Take a single timestamp so the directory name and the recorded
    # training dates all describe the same instant.
    saved_at = datetime.now()
    timestamp = saved_at.strftime("%Y%m%d_%H%M%S")
    training_date = saved_at.isoformat()

    # Create a directory for saved models
    save_dir = os.path.join("../artifacts/word2vec_trained_model", f"fine_tuned_{timestamp}")
    os.makedirs(save_dir, exist_ok=True)

    # 1. Save the embeddings as numpy array
    embeddings_path = os.path.join(save_dir, "word2vec_embeddings.npy")
    np.save(embeddings_path, embeddings)
    print(f"✅ Saved embeddings: {embeddings_path}")
    print(f"   Shape: {embeddings.shape}")

    # 2. Save vocabulary mappings
    vocab_path = os.path.join(save_dir, "word2vec_vocab.pkl")
    vocab_data = {
        'word_to_index': word_to_index,
        'index_to_word': index_to_word,
        'vocabulary': vocabulary,
        'vocab_size': len(vocabulary)
    }

    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab_data, f)
    print(f"✅ Saved vocabulary: {vocab_path}")
    print(f"   Vocabulary size: {len(vocabulary):,} words")

    # 3. Save the full model state
    # The key layout (model_config / training_config / training_info) is what
    # the loading cell reads back, so it must stay in sync with that cell.
    model_path = os.path.join(save_dir, "word2vec_model.pth")
    model_save_data = {
        'model_state_dict': best_model.state_dict(),
        'model_config': {
            'vocab_size': len(vocabulary),
            'embedding_dim': best_config['embedding_dim'],
            'architecture': 'SimpleWord2Vec_CBOW'
        },
        'training_config': best_config,
        'training_info': {
            'final_loss': final_loss,
            'total_epochs': total_epochs,
            'training_examples': MAX_EXAMPLES,
            'training_date': training_date,
            'sweep_id': sweep_id,
            'best_run_name': best_run.name
        }
    }

    torch.save(model_save_data, model_path)
    print(f"✅ Saved model state: {model_path}")

    # 4. Save human-readable configuration
    config_path = os.path.join(save_dir, "model_info.json")
    model_info = {
        "model_name": "SimpleWord2Vec_CBOW_BestFromSweep",
        "embedding_dimensions": best_config['embedding_dim'],
        "vocabulary_size": len(vocabulary),
        "training_examples": MAX_EXAMPLES,
        "final_loss": final_loss,
        "training_epochs": total_epochs,
        "sweep_id": sweep_id,
        "best_run_name": best_run.name,
        "best_hyperparameters": {
            "learning_rate": best_config['learning_rate'],
            "batch_size": best_config['batch_size'],
            "window_size": best_config['window_size'],
            "optimizer": best_config['optimizer'],
            "weight_decay": best_config['weight_decay']
        },
        "dataset": "text8",
        "training_date": training_date,
        "usage_instructions": {
            "load_embeddings": "embeddings = np.load('word2vec_embeddings.npy')",
            "load_vocab": "with open('word2vec_vocab.pkl', 'rb') as f: vocab = pickle.load(f)",
            "load_model": "checkpoint = torch.load('word2vec_model.pth')"
        }
    }

    with open(config_path, 'w') as f:
        json.dump(model_info, f, indent=2)
    print(f"✅ Saved model info: {config_path}")

    # Save the save directory path for visualization
    SAVED_MODEL_DIR = save_dir
    print(f"\n🎯 Model saved to: {SAVED_MODEL_DIR}")
    print("   This path will be used for visualization in the next step.")

else:
    print("❌ No model to save. Make sure the analysis step completed successfully.")
    if missing_names:
        # Tell the user exactly which prerequisite variables are absent.
        print(f"   Missing variables: {', '.join(missing_names)}")