## Word Analogy Task

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import umap
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK 'punkt' package (the downloader reports when it is already current)
nltk.download('punkt')

# Load the pickled embeddings bundle; it must provide the keys
# 'embeddings' and 'word2idx'.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# from a trusted source.
with open('word2vec_embeddings.pkl', 'rb') as f:
    pytorch_data = pickle.load(f)

# The file handle is no longer needed once the bundle is in memory.
pytorch_embeddings = pytorch_data['embeddings']
pytorch_word2idx = pytorch_data['word2idx']

[nltk_data] Downloading package punkt to /Users/junhohong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def word_analogy(word_a, word_b, word_c, embeddings, word2idx, top_n=10):
    """
    Solve word analogy: word_a - word_b + word_c = ?

    Every vocabulary word except the three query words is scored by its
    cosine similarity to the target vector ``word_a - word_b + word_c``.

    Args:
        word_a, word_b, word_c: query words, combined as ``a - b + c``.
        embeddings: row-indexable collection of vectors; ``embeddings[i]``
            is the vector of the word whose index is ``i``.
        word2idx: dict mapping each vocabulary word to its row index.
        top_n: maximum number of candidates to return.

    Returns:
        A ``(results, missing)`` pair:
        - if all query words are in ``word2idx``:
          ``([(word, similarity), ...], None)``, sorted by descending
          similarity and truncated to ``top_n`` entries;
        - otherwise ``(None, missing)`` where ``missing`` lists the absent
          query words in argument order.
    """
    # Check if all words exist
    missing = [w for w in (word_a, word_b, word_c) if w not in word2idx]
    if missing:
        return None, missing

    # Compute target vector: a - b + c
    target_vec = (
        embeddings[word2idx[word_a]]
        - embeddings[word2idx[word_b]]
        + embeddings[word2idx[word_c]]
    )

    # Loop invariants: the target norm and the set of words to skip.
    target_norm = np.linalg.norm(target_vec)
    excluded = {word_a, word_b, word_c}

    # Compute cosine similarity with all remaining words
    similarities = []
    for word, idx in word2idx.items():
        if word in excluded:
            continue
        vec = embeddings[idx]
        denom = target_norm * np.linalg.norm(vec)
        # Cosine similarity is undefined for a zero-length vector; score it
        # as 0.0 instead of NaN so the sort below stays well-defined.
        sim = np.dot(target_vec, vec) / denom if denom > 0 else 0.0
        similarities.append((word, sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n], None

# Define analogy test cases
# Each entry is (word_a, word_b, word_c, expected, category) and is evaluated
# as word_a - word_b + word_c = expected, i.e. word_a relates to word_b the
# way expected relates to word_c (king : man :: queen : woman).
analogies = [
    # Gender relationships
    ("king", "man", "woman", "queen", "Gender"),
    ("brother", "man", "woman", "sister", "Gender"),
    ("uncle", "man", "woman", "aunt", "Gender"),
    ("boy", "man", "woman", "girl", "Gender"),

    # Geography
    ("paris", "france", "italy", "rome", "Geography"),
    ("london", "england", "france", "paris", "Geography"),
    ("tokyo", "japan", "china", "beijing", "Geography"),

    # Comparative: the inflected form goes first so that the offset
    # (comparative - base) is added to word_c, not subtracted from it.
    ("better", "good", "bad", "worse", "Comparative"),
    ("bigger", "big", "small", "smaller", "Comparative"),

    # Verb tenses (past - present + present = past)
    ("walked", "walk", "talk", "talked", "Verb Tense"),
    ("went", "go", "do", "did", "Verb Tense"),

    # Plurals (plural - singular + singular = plural)
    ("dogs", "dog", "cat", "cats", "Plural"),
    ("men", "man", "woman", "women", "Plural"),
]

# Track results
success = 0          # analogies whose expected word appears in the top 10
total = 0            # analogies actually evaluated (all query words known)
missing_vocab = 0    # analogies skipped because a query word is unknown
# Per-category counters, filled during the same pass so that no analogy has
# to be solved twice. 'total' counts every analogy of the category,
# including those skipped for missing vocabulary.
categories = {}

print()
for word_a, word_b, word_c, expected, category in analogies:
    stats = categories.setdefault(category, {'total': 0, 'success': 0})
    stats['total'] += 1

    result, missing = word_analogy(word_a, word_b, word_c, pytorch_embeddings, pytorch_word2idx, top_n=10)

    print(f"{word_a} - {word_b} + {word_c} = ? (Expected: {expected})")

    if missing:
        print(f"  Missing vocabulary: {missing}\n")
        missing_vocab += 1
        continue

    # Show top 5
    top5 = result[:5]
    for i, (word, sim) in enumerate(top5, 1):
        marker = "✓" if word == expected else " "
        print(f"  {marker} {i}. {word:15s} ({sim:.3f})")

    # Check success: a hit anywhere in the top 10 counts
    predicted_words = [w for w, _ in result]
    if expected in predicted_words:
        rank = predicted_words.index(expected) + 1
        if rank > 5:
            # The hit is not visible in the top-5 listing above, so report it.
            print(f"  (Found at rank {rank})")
        success += 1
        stats['success'] += 1

    total += 1
    print()

# Summary
print("="*80)
print("SUMMARY")
print("="*80)
# Guard against every analogy having been skipped for missing vocabulary.
accuracy_pct = success / total * 100 if total else 0.0
print(f"Accuracy: {success}/{total} ({accuracy_pct:.1f}%)")
print(f"Missing vocabulary: {missing_vocab}")

# Category breakdown
print(f"\n{'Category':<15s} {'Accuracy'}")
print("-" * 30)
for category, stats in sorted(categories.items()):
    rate = stats['success'] / stats['total'] * 100
    print(f"{category:<15s} {stats['success']}/{stats['total']} ({rate:.0f}%)")


king - man + woman = ? (Expected: queen)
  ✓ 1. queen           (0.582)
    2. prince          (0.582)
    3. daughter        (0.571)
    4. throne          (0.542)
    5. heir            (0.538)

brother - man + woman = ? (Expected: sister)
    1. daughter        (0.723)
    2. mother          (0.647)
    3. wife            (0.625)
  ✓ 4. sister          (0.624)
    5. married         (0.614)

uncle - man + woman = ? (Expected: aunt)
  ✓ 1. aunt            (0.629)
    2. grandfather     (0.621)
    3. mother          (0.609)
    4. grandmother     (0.600)
    5. eldest          (0.597)

boy - man + woman = ? (Expected: girl)
    1. daddy           (0.450)
    2. girlfriend      (0.439)
  ✓ 3. girl            (0.433)
    4. teenagers       (0.431)
    5. partner         (0.409)

paris - france + italy = ? (Expected: rome)
    1. genoa           (0.523)
    2. turin           (0.498)
    3. villa           (0.487)
    4. vienna          (0.473)
    5. ttingen         (0.469)

london - 