# Placeholder Notebook

Fill in code & analysis here.

In [1]:
# —— notebooks/eval_cbow.ipynb: 全新评估代码 —— 

import random
import numpy as np
from gensim.models import Word2Vec
import pathlib

# 1. 加载模型和数据
model = Word2Vec.load("../outputs/cbow_model")
comments = pathlib.Path("../data/clean/comments.txt").read_text(encoding="utf-8").splitlines()
sentences = [line.split() for line in comments]

# 2. 定义高效 Top-k 精度函数
def topk_acc(k, trials=1000):
    correct = 0
    total = 0

    for _ in range(trials):
        # 随机选择一句，并过滤 OOV
        sent = random.choice(sentences)
        sent = [w for w in sent if w in model.wv.key_to_index]
        if len(sent) < 3:
            continue
        
        # 随机掩码一个词
        idx = random.randrange(1, len(sent) - 1)
        target = sent[idx]
        context = sent[:idx] + sent[idx+1:]
        if not context:
            continue

        # 3. 计算上下文向量平均
        ctx_vec = np.mean([model.wv[w] for w in context], axis=0)

        # 4. 用 gensim 内置方法一次性找 top-k
        topk = [word for word, _ in model.wv.similar_by_vector(ctx_vec, topn=k)]

        total += 1
        if target in topk:
            correct += 1

    return correct / total if total else 0.0

# 5. 输出结果
print("▶ Starting CBOW evaluation…")
print(f"Top-1 Accuracy: {topk_acc(1):.4f}")
print(f"Top-3 Accuracy: {topk_acc(3):.4f}")


▶ Starting CBOW evaluation…
Top-1 Accuracy: 0.0020
Top-3 Accuracy: 0.0060
