# performance evaluation

This notebook evaluates the trained embedding model, i.e. how well it performs on practical tasks such as word analogies and word-similarity benchmarks.

In [1]:
from gensim.models import Word2Vec
import os
import urllib.request
import zipfile

First, load the model you want to evaluate.

In [2]:
# Location of the saved Word2Vec model, given relative to the current working directory.
model_path = '../../../embedding_gen/embeddings/w2v_pfv_100_5_f_5/model'
# Resolve to an absolute path before handing it to the loader.
full_path = os.path.abspath(model_path)
model = Word2Vec.load(full_path)
# Keep a handle on the model's `wv` attribute; the evaluation cells call their
# `evaluate_*` methods on this object.
wordvecs = model.wv

## word similarity and analogy

These benchmarks evaluate how well the model captures semantic and syntactic relationships.

### google analogies

In [24]:
# Google analogy benchmark (questions-words.txt), fetched from the gensim test data.
analogy_url = "https://raw.githubusercontent.com/piskvorky/gensim/refs/heads/develop/gensim/test/test_data/questions-words.txt"
analogy_path = os.path.join("eval", "questions-words.txt")

# Download only once; the local copy acts as a cache for later runs.
if not os.path.exists(analogy_path):
    # urlretrieve does not create missing parent directories, so make sure
    # the target folder exists before downloading (fresh checkout case).
    os.makedirs(os.path.dirname(analogy_path), exist_ok=True)
    urllib.request.urlretrieve(analogy_url, analogy_path)

# The first element of the returned result is printed as the analogy score.
analogy_res = wordvecs.evaluate_word_analogies(analogy_path)
print(analogy_res[0])

0.44464203763194127


### wordsim-353

In [21]:
# WordSim-353 word-pair similarity benchmark, fetched from the gensim test data.
ws_url = "https://raw.githubusercontent.com/piskvorky/gensim/refs/heads/develop/gensim/test/test_data/wordsim353.tsv"
ws_path = os.path.join("eval", "wordsim353.txt")

# Download only once; the local copy acts as a cache for later runs.
if not os.path.exists(ws_path):
    # urlretrieve does not create missing parent directories, so make sure
    # the target folder exists before downloading (fresh checkout case).
    os.makedirs(os.path.dirname(ws_path), exist_ok=True)
    urllib.request.urlretrieve(ws_url, ws_path)

# Prints the full result tuple returned by evaluate_word_pairs.
ws_res = wordvecs.evaluate_word_pairs(ws_path)
print(ws_res)

(PearsonRResult(statistic=0.6460058078485802, pvalue=4.4623722878998506e-43), SignificanceResult(statistic=0.6682090256703265, pvalue=5.238914856995817e-47), 0.0)


### simlex-999

In [22]:
# SimLex-999 word-pair similarity benchmark, fetched from the gensim test data.
simlex_url = "https://raw.githubusercontent.com/piskvorky/gensim/refs/heads/develop/gensim/test/test_data/simlex999.txt"
simlex_path = os.path.join("eval", "simlex999.txt")

# Download only once; the local copy acts as a cache for later runs.
if not os.path.exists(simlex_path):
    # urlretrieve does not create missing parent directories, so make sure
    # the target folder exists before downloading (fresh checkout case).
    os.makedirs(os.path.dirname(simlex_path), exist_ok=True)
    urllib.request.urlretrieve(simlex_url, simlex_path)

# Prints the full result tuple returned by evaluate_word_pairs.
simlex_res = wordvecs.evaluate_word_pairs(simlex_path)
print(simlex_res)

(PearsonRResult(statistic=0.3768184128980168, pvalue=4.6683703044622626e-35), SignificanceResult(statistic=0.35686805603251986, pvalue=2.262765902862798e-31), 0.0)


### MEN dataset

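Here some preprocessing is needed: the raw MEN file is space-separated with scores on a 0–50 scale, so it is rewritten as a tab-separated file with the scores rescaled to [0, 10].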

In [29]:
# Rewrite the raw MEN file ("word1 word2 score" per line) as a tab-separated
# file with the score rescaled, for use with evaluate_word_pairs.
input_file = "eval/men.txt"
output_file = "eval/men_normalized.txt"

# Scale factors of the normalization: raw scores are divided by MEN_MAX_SCORE
# and multiplied by TARGET_MAX_SCORE.
MEN_MAX_SCORE = 50
TARGET_MAX_SCORE = 10

# Use an explicit encoding so the result does not depend on the platform
# default; gensim reads word-pair files as UTF-8 unless told otherwise.
with open(input_file, "r", encoding="utf-8") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        # Split on any whitespace so trailing spaces or tabs do not break
        # parsing; the last field is the score, the first two are the words.
        fields = line.split()
        if len(fields) < 3:
            # Blank or malformed line (fewer than two words plus a score): skip.
            continue
        score = float(fields[-1])
        normalized_score = (score * TARGET_MAX_SCORE) / MEN_MAX_SCORE  # normalize to [0,10]
        outfile.write(f"{fields[0]}\t{fields[1]}\t{normalized_score:.2f}\n")

print(f"normalized scores saved to {output_file}")

normalized scores saved to eval/men_normalized.txt


In [30]:
# Evaluate on the normalized, tab-separated MEN file (same path the
# normalization cell writes to).
men_path = "eval/men_normalized.txt"
men_res = wordvecs.evaluate_word_pairs(men_path)
# Prints the full result tuple returned by evaluate_word_pairs.
print(men_res)

(PearsonRResult(statistic=0.7007690670824924, pvalue=0.0), SignificanceResult(statistic=0.7073827886513595, pvalue=0.0), 0.0)
