In [1]:
import os
import pandas as pd
from gensim.models import Word2Vec
import scipy.stats
from tqdm import tqdm

In [None]:
# Read model hyper-parameters back out of the parameter-encoded model file names.
def read_model_infos(dir):
    """Collect metadata for every ``*.model`` file directly inside ``dir``.

    The file stem (name without the trailing ``.model``) is split on ``_``.
    Field 0 is stored as ``style``; for fields 1, 2 and 3 the text after the
    last ``-`` is stored as ``windows``, ``vec_size`` and ``epochs``.

    Parameters
    ----------
    dir : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        One row per model file with the columns ``style``, ``windows``,
        ``vec_size``, ``epochs`` and ``path``. Parsed values are kept as
        strings. Rows are ordered by file name, and the columns are present
        even when no model file is found.

    Raises
    ------
    IndexError
        If a ``.model`` file name has fewer than four ``_``-separated fields.
    """
    columns=['style','windows','vec_size','epochs','path']
    model_suffix='.model'
    model_infos=[]
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and every per-model result column added later) reproducible.
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(model_suffix):
            continue
        # Strip only the trailing suffix so a dot inside a parameter value
        # does not truncate the stem.
        modelname=filename[:-len(model_suffix)]
        params=modelname.split('_')
        model_infos.append({
            'style':params[0],
            'windows':params[1].split('-')[-1],
            'vec_size':params[2].split('-')[-1],
            'epochs':params[3].split('-')[-1],
            'path':f'''{dir}/{filename}''',
        })
    # Passing the column list keeps the schema stable for an empty directory.
    return pd.DataFrame(model_infos,columns=columns)

In [None]:
#载入测试数据并统一输出格式
def test_data_prepare(filename):
    with open(filename,'r',encoding='utf-8') as infile:
        if 'WordSim' in filename: #针对wordsim
            pair_gold=[]
            for line in infile:
                line=line.strip()
                if not line.startswith('Word 1'):
                    line_elements=line.split(',')
                    pair_gold.append(((line_elements[0].lower(),line_elements[1].lower()),float(line_elements[2])))
            return pair_gold
        if 'SimLex' in filename: #针对simlex
            pair_gold=[]
            for line in infile:
                line=line.strip()
                if not line.startswith('word1'):
                    line_elements=line.split('\t')
                    pair_gold.append(((line_elements[0].lower(),line_elements[1].lower()),float(line_elements[3])))
            return pair_gold
        if 'Analogy' in filename: #针对google analogy
            sem_positives_negative=[]
            sem_answers=[]
            mor_positives_negative=[]
            mor_answers=[]
            category=None
            for line in infile:
                line=line.strip()
                if line.startswith(':'):
                    category=1 if 'gram' in line else 0
                else:
                    line_elements=line.split()
                    if category==0:
                        sem_positives_negative.append(([line_elements[2].lower(),line_elements[1].lower()],line_elements[0].lower()))
                        sem_answers.append(line_elements[3].lower())
                    else:
                        mor_positives_negative.append(([line_elements[2].lower(),line_elements[1].lower()],line_elements[0].lower()))
                        mor_answers.append(line_elements[3].lower())        
            return sem_positives_negative,sem_answers,mor_positives_negative,mor_answers


In [None]:
# Evaluate how well each model reproduces the gold similarity scores.
def similarity_test(pair_gold,model_paths):
    """Correlate model similarities with gold scores for every model.

    Each path is loaded with ``Word2Vec.load`` and every word pair is scored
    with ``wv.similarity``. Pairs that raise ``KeyError`` (out-of-vocabulary)
    are left out of the correlation and lower the coverage instead.

    Parameters
    ----------
    pair_gold : sequence of ((str, str), float)
        Word pairs with their gold similarity score.
    model_paths : iterable of str
        Paths of the saved models to evaluate.

    Returns
    -------
    tuple of five lists
        ``(r_values, p_r_values, rho_values, p_rho_values, test_coverage)``,
        one entry per model: Pearson r and its p-value, Spearman rho and its
        p-value, and the fraction of pairs that could be scored. When a model
        scores fewer than two pairs the four correlation entries are ``nan``
        instead of raising; coverage is ``nan`` when ``pair_gold`` is empty.
    """
    nan=float('nan')
    r_values,p_r_values,rho_values,p_rho_values=[],[],[],[]
    test_coverage=[] # coverage is tracked because of OOV pairs
    test_pair_num=len(pair_gold)
    for filepath in model_paths:
        test_vectors=Word2Vec.load(filepath).wv
        gold_sim=[]
        pred_sim=[]
        for pair,gold in pair_gold:
            try:
                pred=test_vectors.similarity(*pair)
            except KeyError: # OOV pairs are excluded from the test
                continue
            gold_sim.append(gold)
            pred_sim.append(pred)
        if len(gold_sim)>=2:
            r,p_r=scipy.stats.pearsonr(gold_sim,pred_sim)
            rho,p_rho=scipy.stats.spearmanr(gold_sim,pred_sim)
        else:
            # A correlation is undefined for fewer than two points; record nan
            # so one low-coverage model cannot abort the whole sweep.
            r=p_r=rho=p_rho=nan
        coverage=len(gold_sim)/test_pair_num if test_pair_num else nan
        r_values.append(r)
        p_r_values.append(p_r)
        rho_values.append(rho)
        p_rho_values.append(p_rho)
        test_coverage.append(coverage)
    return r_values,p_r_values,rho_values,p_rho_values,test_coverage

In [5]:
def wordsim_test(model_info_dataframe,data_path):
    """Run the WordSim similarity benchmark and store the results in place.

    Loads the gold pairs from ``data_path`` with ``test_data_prepare``, scores
    every model listed in ``model_info_dataframe['path']`` with
    ``similarity_test`` and writes one new column per metric onto
    ``model_info_dataframe``. Nothing is returned.
    """
    gold_pairs=test_data_prepare(data_path)
    r_vals,p_r_vals,rho_vals,p_rho_vals,coverage_vals=similarity_test(
        pair_gold=gold_pairs,
        model_paths=model_info_dataframe['path'],
    )
    # Insertion order below is the column order on the frame.
    new_columns={
        'wordsim_r':r_vals,
        'wordsim_p_r':p_r_vals,
        'wordsim_rho':rho_vals,
        'wordsim_prho':p_rho_vals,
        'wordsim_coverage':coverage_vals,
    }
    for column_name,column_values in new_columns.items():
        model_info_dataframe[column_name]=column_values
    print('wordsim测试完成')

In [6]:
def simlex_test(model_info_dataframe,data_path):
    """Run the SimLex similarity benchmark and store the results in place.

    Loads the gold pairs from ``data_path`` with ``test_data_prepare``, scores
    every model listed in ``model_info_dataframe['path']`` with
    ``similarity_test`` and writes one new column per metric onto
    ``model_info_dataframe``. Nothing is returned.
    """
    gold_pairs=test_data_prepare(data_path)
    r_vals,p_r_vals,rho_vals,p_rho_vals,coverage_vals=similarity_test(
        pair_gold=gold_pairs,
        model_paths=model_info_dataframe['path'],
    )
    # Insertion order below is the column order on the frame.
    new_columns={
        'simlex_r':r_vals,
        'simlex_p_r':p_r_vals,
        'simlex_rho':rho_vals,
        'simlex_prho':p_rho_vals,
        'simlex_coverage':coverage_vals,
    }
    for column_name,column_values in new_columns.items():
        model_info_dataframe[column_name]=column_values
    print('simlex测试完成')

In [None]:
# Evaluate analogical-reasoning ability.
def _count_analogy_hits(vectors,questions,answers):
    """Return ``(correct, oov)`` counts for one list of analogy questions."""
    correct=0
    oov=0
    for (positive_pair,negative_word),answer in zip(questions,answers):
        try:
            predicted=vectors.most_similar(positive=positive_pair,negative=[negative_word],topn=1)[0][0]
        except KeyError: # OOV questions are excluded from the test
            oov+=1
            continue
        if predicted==answer:
            correct+=1
    return correct,oov


def _ratio_or_nan(numerator,denominator):
    """Return ``numerator/denominator``, or ``nan`` when the denominator is 0."""
    return numerator/denominator if denominator else float('nan')


def analogy_test(model_info_dataframe,data_path):
    """Run the analogy benchmark for every model and store the results in place.

    The questions come from ``test_data_prepare(data_path)`` as a semantic and
    a morphological set. Each model in ``model_info_dataframe['path']`` is
    loaded with ``Word2Vec.load`` and asked for its single most similar word
    per question; a question that raises ``KeyError`` (out-of-vocabulary) is
    left out of the accuracy and lowers the coverage instead.

    Six columns are written onto ``model_info_dataframe``: accuracy and
    coverage for the semantic set, the morphological set and both combined.
    Accuracy is hits divided by answerable questions; coverage is answerable
    questions divided by all questions. A ratio whose denominator is zero
    (no questions in a set, or all of them OOV) is stored as ``nan`` instead
    of raising ``ZeroDivisionError``. Nothing is returned.
    """
    sem_positives_negative,sem_answers,mor_positives_negative,mor_answers=test_data_prepare(data_path)
    model_paths=model_info_dataframe['path']
    sem_num,mor_num=len(sem_positives_negative),len(mor_positives_negative)
    all_num=sem_num+mor_num
    sem_accuracy,mor_accuracy,all_accuracy=[],[],[]
    sem_coverage,mor_coverage,all_coverage=[],[],[] # coverage is tracked because of OOV questions
    for round_number,filepath in enumerate(model_paths,start=1):
        test_vectors=Word2Vec.load(filepath).wv
        sem_correct,sem_oov=_count_analogy_hits(test_vectors,sem_positives_negative,sem_answers)
        mor_correct,mor_oov=_count_analogy_hits(test_vectors,mor_positives_negative,mor_answers)
        sem_valid_num=sem_num-sem_oov
        mor_valid_num=mor_num-mor_oov
        all_valid_num=sem_valid_num+mor_valid_num
        sem_accuracy.append(_ratio_or_nan(sem_correct,sem_valid_num))
        mor_accuracy.append(_ratio_or_nan(mor_correct,mor_valid_num))
        all_accuracy.append(_ratio_or_nan(sem_correct+mor_correct,all_valid_num))
        sem_coverage.append(_ratio_or_nan(sem_valid_num,sem_num))
        mor_coverage.append(_ratio_or_nan(mor_valid_num,mor_num))
        all_coverage.append(_ratio_or_nan(all_valid_num,all_num))
        print(f"Analogy:模型{round_number}/{len(model_paths)}测试完成")
    model_info_dataframe['analogy_sem']=sem_accuracy
    model_info_dataframe['analogy_sem_coverage']=sem_coverage
    model_info_dataframe['analogy_mor']=mor_accuracy
    model_info_dataframe['analogy_mor_coverage']=mor_coverage
    model_info_dataframe['analogy_all']=all_accuracy
    model_info_dataframe['analogy_all_coverage']=all_coverage
    print('analogy测试完成')

In [None]:
# Similarity and analogy results differ a lot, so each gets its own "best model" rule.
# This one picks the best model under the similarity benchmarks.
def get_best_similarity(model_info_dataframe,top_num,significance_level=0.05):
    """Return the single best model according to WordSim and SimLex.

    Selection rule:

    1. Per benchmark, keep the models whose Pearson and Spearman p-values are
       both below ``significance_level`` and take the ``top_num`` best of them
       (ranked by rho, then r, descending).
    2. Keep only the models that made both top lists.
    3. Rank those by ``simlex_rho``, ``simlex_r``, ``wordsim_rho``,
       ``wordsim_r`` (descending, SimLex first) and return the first one.

    Parameters
    ----------
    model_info_dataframe : pandas.DataFrame
        Frame holding the ``wordsim_*`` and ``simlex_*`` result columns.
    top_num : int
        Size of each per-benchmark top list.
    significance_level : float, optional
        p-value threshold applied in step 1 (default 0.05).

    Returns
    -------
    pandas.DataFrame
        A one-row frame with the winner, or an empty frame when no model is
        in both top lists.
    """
    wordsim_significant=(model_info_dataframe['wordsim_p_r']<significance_level)&(model_info_dataframe['wordsim_prho']<significance_level)
    simlex_significant=(model_info_dataframe['simlex_p_r']<significance_level)&(model_info_dataframe['simlex_prho']<significance_level)
    top_wordsim=model_info_dataframe[wordsim_significant].sort_values(by=['wordsim_rho','wordsim_r'],ascending=False).head(top_num)
    top_simlex=model_info_dataframe[simlex_significant].sort_values(by=['simlex_rho','simlex_r'],ascending=False).head(top_num)
    # Intersect on the frame's own row labels. The previous version read an
    # 'index' column (present only after a reset_index() call) and then used
    # those values as .loc labels, which fails or selects the wrong rows
    # whenever that column is missing or out of sync with the labels.
    in_both_top_lists=model_info_dataframe.index.isin(top_wordsim.index)&model_info_dataframe.index.isin(top_simlex.index)
    candidates=model_info_dataframe[in_both_top_lists]
    return candidates.sort_values(by=['simlex_rho','simlex_r','wordsim_rho','wordsim_r'],ascending=False).head(1)

In [None]:
# Similarity and analogy results differ a lot, so each gets its own "best model" rule.
# This one picks the best model under the analogy benchmark (overall accuracy first).
def get_best_analogy(model_info_dataframe):
    """Return a one-row frame holding the model with the highest ``analogy_all``."""
    ranked_by_overall_accuracy=model_info_dataframe.sort_values(by=['analogy_all'],ascending=False)
    return ranked_by_overall_accuracy.iloc[:1]

In [None]:
# Select the best models and save their details.
def get_best_models(model_info_dataframe,top_num_for_similarity):
    """Pick the best similarity model and the best analogy model and save them.

    The two one-row results (similarity winner first, analogy winner second)
    are stacked into one frame, written to ``bestmodels_info.csv`` in the
    current working directory without the index, and returned.
    """
    winners=[
        get_best_similarity(model_info_dataframe,top_num_for_similarity),
        get_best_analogy(model_info_dataframe),
    ]
    best_models_df=pd.concat(winners,axis=0)
    best_models_df.to_csv('bestmodels_info.csv',encoding='utf-8',index=False)
    print('最佳模型信息已保存至bestmodels_info.csv')
    return best_models_df

In [9]:
# Discover the trained models, run the three benchmarks in order
# (each one adds its result columns to ModelInfo_df in place), then save everything.
ModelInfo_df=read_model_infos('models')
benchmark_runs=(
    (wordsim_test,'test_data/WordSim-353.csv'),
    (simlex_test,'test_data/SimLex-999.txt'),
    (analogy_test,'test_data/GoogleAnalogy.txt'),
)
with tqdm(total=len(benchmark_runs),desc='Testing') as pbar:
    for run_benchmark,benchmark_path in benchmark_runs:
        run_benchmark(model_info_dataframe=ModelInfo_df,data_path=benchmark_path)
        pbar.update(1)
# reset_index() turns the row labels into an 'index' column so they are written to the CSV.
ModelInfo_df=ModelInfo_df.reset_index()
ModelInfo_df.to_csv('allmodel_info.csv',encoding='utf-8',index=False)

Testing:  33%|███▎      | 1/3 [00:09<00:19,  9.83s/it]

wordsim测试完成


Testing:  67%|██████▋   | 2/3 [00:19<00:09,  9.83s/it]

simlex测试完成
Analogy:模型1/36测试完成
Analogy:模型2/36测试完成
Analogy:模型3/36测试完成
Analogy:模型4/36测试完成
Analogy:模型5/36测试完成
Analogy:模型6/36测试完成
Analogy:模型7/36测试完成
Analogy:模型8/36测试完成
Analogy:模型9/36测试完成
Analogy:模型10/36测试完成
Analogy:模型11/36测试完成
Analogy:模型12/36测试完成
Analogy:模型13/36测试完成
Analogy:模型14/36测试完成
Analogy:模型15/36测试完成
Analogy:模型16/36测试完成
Analogy:模型17/36测试完成
Analogy:模型18/36测试完成
Analogy:模型19/36测试完成
Analogy:模型20/36测试完成
Analogy:模型21/36测试完成
Analogy:模型22/36测试完成
Analogy:模型23/36测试完成
Analogy:模型24/36测试完成
Analogy:模型25/36测试完成
Analogy:模型26/36测试完成
Analogy:模型27/36测试完成
Analogy:模型28/36测试完成
Analogy:模型29/36测试完成
Analogy:模型30/36测试完成
Analogy:模型31/36测试完成
Analogy:模型32/36测试完成
Analogy:模型33/36测试完成
Analogy:模型34/36测试完成
Analogy:模型35/36测试完成


Testing: 100%|██████████| 3/3 [12:13<00:00, 244.57s/it]

Analogy:模型36/36测试完成
analogy测试完成





In [None]:
# Pick and save the best models; a model must rank in the top 15 on both
# similarity benchmarks to be eligible as the similarity winner.
best_models_df=get_best_models(
    model_info_dataframe=ModelInfo_df,
    top_num_for_similarity=15,
)

最佳模型信息已保存至bestmodels_info.csv
