In [1]:
import gensim.downloader
from gensim.models import KeyedVectors
import os
import pandas as pd
import scipy.stats
from tqdm import tqdm

In [None]:
# Load a pretrained model from the local cache, or download it and cache it.
def get_pretrained_vectors(model_name):
    """Return the vectors for ``model_name``, caching them under ``pretrained_models/``.

    If ``pretrained_models/<model_name>.kv`` already exists it is opened with
    ``KeyedVectors.load(..., mmap='r')``. Otherwise the model is fetched with
    ``gensim.downloader.load`` and saved to that path so that later calls hit
    the cache.

    Args:
        model_name: Name passed to ``gensim.downloader.load``; also used as
            the cache file stem.

    Returns:
        The loaded vectors object.
    """
    filepath=f'pretrained_models/{model_name}.kv'
    if os.path.exists(filepath):
        return KeyedVectors.load(filepath,mmap='r')
    print(f'downloading {model_name}')
    vectors=gensim.downloader.load(model_name)
    # Create the cache directory on demand: without it the save below fails on
    # a fresh checkout and the finished download is thrown away.
    os.makedirs(os.path.dirname(filepath),exist_ok=True)
    vectors.save(filepath)
    print(f'{model_name} saved')
    return vectors

In [None]:
#载入测试数据，与step3相同
def test_data_prepare(filename):
    with open(filename,'r',encoding='utf-8') as infile:
        if 'WordSim' in filename:
            pair_gold=[]
            for line in infile:
                line=line.strip()
                if not line.startswith('Word 1'):
                    line_elements=line.split(',')
                    pair_gold.append(((line_elements[0].lower(),line_elements[1].lower()),float(line_elements[2])))
            return pair_gold
        if 'SimLex' in filename:
            pair_gold=[]
            for line in infile:
                line=line.strip()
                if not line.startswith('word1'):
                    line_elements=line.split('\t')
                    pair_gold.append(((line_elements[0].lower(),line_elements[1].lower()),float(line_elements[3])))
            return pair_gold
        if 'Analogy' in filename:
            sem_positives_negative=[]
            sem_answers=[]
            mor_positives_negative=[]
            mor_answers=[]
            category=None
            for line in infile:
                line=line.strip()
                if line.startswith(':'):
                    category=1 if 'gram' in line else 0
                else:
                    line_elements=line.split()
                    if category==0:
                        sem_positives_negative.append(([line_elements[2].lower(),line_elements[1].lower()],line_elements[0].lower()))
                        sem_answers.append(line_elements[3].lower())
                    else:
                        mor_positives_negative.append(([line_elements[2].lower(),line_elements[1].lower()],line_elements[0].lower()))
                        mor_answers.append(line_elements[3].lower())        
            return sem_positives_negative,sem_answers,mor_positives_negative,mor_answers

In [None]:
# Evaluate word-similarity correlation (essentially the same as in step3).
def similarity_test(pair_gold,vectors_list):
    """Correlate each model's similarities with the gold similarity scores.

    For every model, ``similarity(word1, word2)`` is computed for each gold
    pair; pairs that raise ``KeyError`` are skipped. Pearson and Spearman
    correlations are then computed over the remaining pairs.

    Args:
        pair_gold: Sequence of ``((word1, word2), gold_score)``.
        vectors_list: Iterable of objects exposing ``similarity(w1, w2)``.

    Returns:
        ``(r_values, p_r_values, rho_values, p_rho_values, test_coverage)``:
        five lists with one entry per model, in ``vectors_list`` order.
        Coverage is the fraction of gold pairs that could be scored. When
        fewer than two pairs could be scored the four correlation entries are
        NaN; when ``pair_gold`` is empty the coverage is NaN as well.
    """
    undefined=float('nan')
    r_values,p_r_values,rho_values,p_rho_values,test_coverage=[],[],[],[],[]
    test_pair_num=len(pair_gold)
    for test_vectors in vectors_list:
        gold_sim=[]
        pred_sim=[]
        for pair,gold in pair_gold:
            try:
                pred=test_vectors.similarity(*pair)
            except KeyError:
                # Pair is not scorable by this model; it only lowers coverage.
                continue
            gold_sim.append(gold)
            pred_sim.append(pred)
        valid_pair_num=len(gold_sim)
        if valid_pair_num>=2:
            r,p_r=scipy.stats.pearsonr(gold_sim,pred_sim)
            rho,p_rho=scipy.stats.spearmanr(gold_sim,pred_sim)
        else:
            # pearsonr raises on fewer than two samples; report "undefined"
            # instead of aborting the evaluation of the remaining models.
            r=p_r=rho=p_rho=undefined
        coverage=valid_pair_num/test_pair_num if test_pair_num else undefined
        r_values.append(r)
        p_r_values.append(p_r)
        rho_values.append(rho)
        p_rho_values.append(p_rho)
        test_coverage.append(coverage)
    return r_values,p_r_values,rho_values,p_rho_values,test_coverage

In [None]:
# Essentially the same as in step3.
def wordsim_test(model_info,vectors_list,test_data_path):
    """Score every model on the WordSim data and record the results in ``model_info``.

    The gold pairs are read with ``test_data_prepare(test_data_path)`` and
    handed to ``similarity_test``; its five per-model result lists are stored
    under the ``wordsim_*`` keys of ``model_info``, which is modified in
    place. Prints a completion message and returns ``None``.
    """
    gold_pairs=test_data_prepare(test_data_path)
    pearson,pearson_p,spearman,spearman_p,coverage=similarity_test(pair_gold=gold_pairs,vectors_list=vectors_list)
    results_by_column={
        'wordsim_r':pearson,
        'wordsim_p_r':pearson_p,
        'wordsim_rho':spearman,
        'wordsim_prho':spearman_p,
        'wordsim_coverage':coverage,
    }
    for column_name,column_values in results_by_column.items():
        model_info[column_name]=column_values
    print('wordsim测试完成')

In [None]:
# Essentially the same as in step3.
def simlex_test(model_info,vectors_list,test_data_path):
    """Score every model on the SimLex data and record the results in ``model_info``.

    The gold pairs are read with ``test_data_prepare(test_data_path)`` and
    handed to ``similarity_test``; its five per-model result lists are stored
    under the ``simlex_*`` keys of ``model_info``, which is modified in
    place. Prints a completion message and returns ``None``.
    """
    gold_pairs=test_data_prepare(test_data_path)
    pearson,pearson_p,spearman,spearman_p,coverage=similarity_test(pair_gold=gold_pairs,vectors_list=vectors_list)
    results_by_column={
        'simlex_r':pearson,
        'simlex_p_r':pearson_p,
        'simlex_rho':spearman,
        'simlex_prho':spearman_p,
        'simlex_coverage':coverage,
    }
    for column_name,column_values in results_by_column.items():
        model_info[column_name]=column_values
    print('simlex测试完成')

In [None]:
# Evaluate analogical reasoning (essentially the same as in step3).
def analogy_test(model_info,vectors_list,test_data_path):
    """Score every model on the analogy questions and record the results.

    Questions are read with ``test_data_prepare(test_data_path)``, which
    yields a semantic and a morphological group. For each model, a question
    counts as covered when ``most_similar`` does not raise ``KeyError``, and
    as correct when the top-1 prediction equals the expected answer.
    Accuracy is computed over covered questions only.

    Args:
        model_info: Mapping-like object (e.g. a DataFrame) modified in place.
            Receives the keys ``analogy_sem``, ``analogy_sem_coverage``,
            ``analogy_mor``, ``analogy_mor_coverage``, ``analogy_all`` and
            ``analogy_all_coverage``, each holding one value per model.
        vectors_list: Sequence of objects exposing
            ``most_similar(positive=..., negative=..., topn=...)``.
        test_data_path: Path of the analogy question file.

    Returns:
        None. A ratio whose denominator is zero (no covered questions, or an
        empty group) is reported as NaN instead of raising
        ``ZeroDivisionError``.
    """
    sem_positives_negative,sem_answers,mor_positives_negative,mor_answers=test_data_prepare(test_data_path)
    sem_num,mor_num=len(sem_positives_negative),len(mor_positives_negative)
    all_num=sem_num+mor_num
    sem_accuracy,mor_accuracy,all_accuracy=[],[],[]
    sem_coverage,mor_coverage,all_coverage=[],[],[]
    for model_index,test_vectors in enumerate(vectors_list,start=1):
        sem_correct,sem_valid_num=_score_analogy_questions(
            test_vectors,sem_positives_negative,sem_answers,desc='analogy_test_sem')
        mor_correct,mor_valid_num=_score_analogy_questions(
            test_vectors,mor_positives_negative,mor_answers,desc='analogy_test_mor')
        all_valid_num=sem_valid_num+mor_valid_num
        sem_accuracy.append(_ratio_or_nan(sem_correct,sem_valid_num))
        mor_accuracy.append(_ratio_or_nan(mor_correct,mor_valid_num))
        all_accuracy.append(_ratio_or_nan(sem_correct+mor_correct,all_valid_num))
        sem_coverage.append(_ratio_or_nan(sem_valid_num,sem_num))
        mor_coverage.append(_ratio_or_nan(mor_valid_num,mor_num))
        all_coverage.append(_ratio_or_nan(all_valid_num,all_num))
        print(f"Analogy:模型{model_index}/{len(vectors_list)}测试完成")
    model_info['analogy_sem']=sem_accuracy
    model_info['analogy_sem_coverage']=sem_coverage
    model_info['analogy_mor']=mor_accuracy
    model_info['analogy_mor_coverage']=mor_coverage
    model_info['analogy_all']=all_accuracy
    model_info['analogy_all_coverage']=all_coverage
    print('analogy测试完成')


def _ratio_or_nan(numerator,denominator):
    """Return ``numerator/denominator``, or NaN when the denominator is zero."""
    return numerator/denominator if denominator else float('nan')


def _score_analogy_questions(test_vectors,positives_negative,answers,desc):
    """Return ``(correct, covered)`` counts for one group of analogy questions."""
    correct=0
    covered=0
    progress=tqdm(zip(positives_negative,answers),total=len(positives_negative),desc=desc)
    for (positive_pair,negative_word),answer in progress:
        try:
            predicted=test_vectors.most_similar(positive=positive_pair,negative=[negative_word],topn=1)[0][0]
        except KeyError:
            # Question is not answerable by this model; it only lowers coverage.
            continue
        covered+=1
        if predicted==answer:
            correct+=1
    return correct,covered

In [None]:
#测试预训练模型词向量并保存结果
def test_pretrained_models(vectors_list,modelnames):
    model_info=pd.DataFrame(modelnames,columns=['model_name'])
    wordsim_test(model_info,vectors_list,test_data_path='test_data/WordSim-353.csv')
    simlex_test(model_info,vectors_list,test_data_path='test_data/SimLex-999.txt')
    analogy_test(model_info,vectors_list,test_data_path='test_data/GoogleAnalogy.txt')
    model_info.to_csv('pretrainedmodel_info.csv',encoding='utf-8',index=False)
    print('结果已保存为pretrainedmodel_info.csv')
    print(model_info)


In [9]:
# Make sure the local cache directory exists, then load (or download and
# cache) the three pretrained models, in this order.
os.makedirs('pretrained_models',exist_ok=True)
word2vec_google_vectors,glove_wiki_vectors,fasttext_wiki_vectors=map(
    get_pretrained_vectors,
    (
        'word2vec-google-news-300',
        'glove-wiki-gigaword-300',
        'fasttext-wiki-news-subwords-300',
    ),
)

In [10]:
# Model names and their loaded vectors, kept in the same order so that each
# result row lines up with the right model.
modelname_list=[
    'word2vec-google-news-300',
    'glove-wiki-gigaword-300',
    'fasttext-wiki-news-subwords-300',
]
vectors_list=[
    word2vec_google_vectors,
    glove_wiki_vectors,
    fasttext_wiki_vectors,
]
test_pretrained_models(modelnames=modelname_list,vectors_list=vectors_list)

wordsim测试完成
simlex测试完成


analogy_test_sem: 100%|██████████| 8869/8869 [06:02<00:00, 24.47it/s] 
analogy_test_mor: 100%|██████████| 10675/10675 [16:26<00:00, 10.82it/s]


Analogy:模型1/3测试完成


analogy_test_sem: 100%|██████████| 8869/8869 [02:00<00:00, 73.52it/s]
analogy_test_mor: 100%|██████████| 10675/10675 [02:32<00:00, 70.15it/s]


Analogy:模型2/3测试完成


analogy_test_sem: 100%|██████████| 8869/8869 [02:56<00:00, 50.19it/s] 
analogy_test_mor: 100%|██████████| 10675/10675 [06:09<00:00, 28.85it/s]


Analogy:模型3/3测试完成
analogy测试完成
结果已保存为pretrainedmodel_info.csv
                        model_name  wordsim_r   wordsim_p_r  wordsim_rho  \
0         word2vec-google-news-300   0.649271  2.822799e-43     0.694122   
1          glove-wiki-gigaword-300   0.604076  1.752305e-36     0.608535   
2  fasttext-wiki-news-subwords-300   0.699496  3.687972e-53     0.697190   

   wordsim_prho  wordsim_coverage  simlex_r    simlex_p_r  simlex_rho  \
0  1.287506e-51          0.991501  0.453928  6.173059e-52    0.441966   
1  3.879630e-37          1.000000  0.388952  1.987671e-37    0.370500   
2  1.115469e-52          1.000000  0.473067  7.587646e-57    0.440908   

    simlex_prho  simlex_coverage  analogy_sem  analogy_sem_coverage  \
0  5.068222e-49              1.0     0.245799              0.416056   
1  7.320031e-34              1.0     0.774383              1.000000   
2  9.055669e-49              1.0     0.375822              0.582929   

   analogy_mor  analogy_mor_coverage  analogy_all  analo