# Word Embedding Evaluation

--- Last edited: 2024-09-26 ---

In [6]:
import collections
from datetime import date
import glob
import json
import os
from pathlib import Path
import pickle
import random
import re
import string
import sys
import time
import csv
import shutil
import requests
from tqdm.notebook import tqdm
import ast

import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

import nltk
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer

import numpy as np
import pandas as pd
from scipy.spatial import distance
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import sklearn
from sklearn import cluster
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

import spacy
import spacy_transformers
from spacy.pipeline import EntityRuler
from spacy.training.example import Example
from spacy.scorer import Scorer
from spacy.tokens import DocBin
from spacy.training import offsets_to_biluo_tags

from thefuzz import fuzz
from thefuzz import process

import networkx as nx

import mantel

In [7]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document at *file* and return the result."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def save_data(file, data):
    """Serialise *data* as JSON (4-space indent) into *file*, overwriting it.

    The file is written as UTF-8; non-ASCII characters are emitted as
    ``\\uXXXX`` escapes because ``json.dump`` is used with its defaults.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [3]:
# English corpus: build one entity-by-entity cosine-similarity matrix per model
# set (A and B) and condense both for the Mantel test in the next cell.
base_dir = 'C:/Users/Brottrager/Documents/Diss/sec_lit/ENG/'
path_results = 'C:/Users/Brottrager/Documents/Diss/sec_lit/ENG/20240919_wordembeddings'

dict_text_clustered = load_data(base_dir + '20240919_ENG_dict_all_entities_WORK_OF_ART_final.json')
dict_persons_clustered = load_data(base_dir + '20240919_ENG_dict_all_entities_PERSON_final.json')

# Entity tokens have the form "<key>_<values[1]>".
# NOTE(review): presumably values[1] is a label stored with each entity --
# confirm against the JSON schema.
entities_per = [str(key) + '_' + values[1] for key, values in dict_persons_clustered.items()]
entities_text = [str(key) + '_' + values[1] for key, values in dict_text_clustered.items()]

entities = set(entities_per + entities_text)


def _load_keyed_vectors(folder):
    """Return ``(files, models)`` for every ``*.kv`` file directly inside *folder*."""
    kv_files = list(Path(folder).glob('*.kv'))
    if not kv_files:
        # Fail here instead of with an obscure error in cosine_similarity later.
        raise FileNotFoundError(f'No *.kv files found in {folder}')
    return kv_files, [gensim.models.KeyedVectors.load(str(kv_file)) for kv_file in kv_files]


def _collect_entity_vectors(models, entities):
    """Map each entity to the list of its vectors, one per model that contains it."""
    collected = {}
    for model in models:
        for key in entities:
            if key in model.key_to_index:
                collected.setdefault(key, []).append(model[key])
    if not collected:
        raise ValueError('None of the entities occur in the loaded models')
    return collected


def _mean_vectors(vectors):
    """Average the per-model vectors of every entity component-wise (float32)."""
    # NOTE(review): this averages vectors coming from different model files;
    # it presumes those runs share a comparable vector space -- confirm.
    return {key: np.average(np.array(vecs, dtype='float32'), axis=0)
            for key, vecs in vectors.items()}


def _similarity_frame(similarities, ids):
    """Return *similarities* as a DataFrame labelled with *ids* and a zero diagonal."""
    # squareform() only accepts matrices whose diagonal is zero, so the
    # self-similarities are blanked on a copy of the matrix.
    zero_diagonal = np.array(similarities, copy=True)
    np.fill_diagonal(zero_diagonal, 0)
    return pd.DataFrame(zero_diagonal, index=ids, columns=ids)


# ---- model set A ----
model_dir = Path(path_results) / 'modelA_iteration=100'
files, models = _load_keyed_vectors(model_dir)

vectors_A = _collect_entity_vectors(models, entities)
vectors_mean_A = _mean_vectors(vectors_A)
idsinEmbedding_A = list(vectors_mean_A)
vectors_ls_A = list(vectors_mean_A.values())

similarities_A = cosine_similarity(vectors_ls_A)
data_A = _similarity_frame(similarities_A, idsinEmbedding_A)

# ---- model set B ----
model_dir = Path(path_results) / 'modelB_iteration=100'
files, models = _load_keyed_vectors(model_dir)

vectors_B = _collect_entity_vectors(models, entities)
vectors_mean_B = _mean_vectors(vectors_B)
idsinEmbedding_B = list(vectors_mean_B)
vectors_ls_B = list(vectors_mean_B.values())

similarities_B = cosine_similarity(vectors_ls_B)
data_B = _similarity_frame(similarities_B, idsinEmbedding_B)

# The Mantel test compares the matrices cell by cell, so both must describe the
# same entities in the same order. Restrict them to the shared IDs before
# sorting; otherwise differing vocabularies would misalign the comparison.
shared_ids = sorted(set(idsinEmbedding_A) & set(idsinEmbedding_B))
if not shared_ids:
    raise ValueError('Model sets A and B have no entity in common')
if len(shared_ids) != len(idsinEmbedding_A) or len(shared_ids) != len(idsinEmbedding_B):
    print(f'Comparing {len(shared_ids)} shared entities '
          f'(A has {len(idsinEmbedding_A)}, B has {len(idsinEmbedding_B)})')

sorted_matrix_A = data_A.loc[shared_ids, shared_ids].sort_index(axis=0).sort_index(axis=1)
sorted_matrix_B = data_B.loc[shared_ids, shared_ids].sort_index(axis=0).sort_index(axis=1)

# Condensed (upper-triangle) form, as expected by mantel.test().
dist1 = distance.squareform(sorted_matrix_A.values)
dist2 = distance.squareform(sorted_matrix_B.values)

In [24]:
# Mantel test between the two condensed matrices: Pearson correlation,
# 10000 permutations, one-sided ("upper" tail).
# dist1/dist2 hold cosine similarities rather than distances; Pearson's r is
# unchanged when both inputs are mapped through the same 1 - x transform.
result = mantel.test(dist1, dist2, perms=10000, method='pearson', tail='upper')

In [56]:
result

MantelResult(0.9995510780547443, 0.0001, 99.11978477309117)

In [9]:
# German corpus: build one entity-by-entity cosine-similarity matrix per model
# set (A and B) and condense both for the Mantel test in the next cell.
base_dir = 'C:/Users/Brottrager/Documents/Diss/sec_lit/GER/'
path_results = 'C:/Users/Brottrager/Documents/Diss/sec_lit/GER/20240919_wordembeddings'

dict_text_clustered = load_data(base_dir + '20240919_GER_dict_all_entities_WORK_OF_ART_final.json')
dict_persons_clustered = load_data(base_dir + '20240919_GER_dict_all_entities_PERSON_final.json')

# Entity tokens have the form "<key>_<values[1]>".
# NOTE(review): presumably values[1] is a label stored with each entity --
# confirm against the JSON schema.
entities_per = [str(key) + '_' + values[1] for key, values in dict_persons_clustered.items()]
entities_text = [str(key) + '_' + values[1] for key, values in dict_text_clustered.items()]

entities = set(entities_per + entities_text)


def _load_keyed_vectors(folder):
    """Return ``(files, models)`` for every ``*.kv`` file directly inside *folder*."""
    kv_files = list(Path(folder).glob('*.kv'))
    if not kv_files:
        # Fail here instead of with an obscure error in cosine_similarity later.
        raise FileNotFoundError(f'No *.kv files found in {folder}')
    return kv_files, [gensim.models.KeyedVectors.load(str(kv_file)) for kv_file in kv_files]


def _collect_entity_vectors(models, entities):
    """Map each entity to the list of its vectors, one per model that contains it."""
    collected = {}
    for model in models:
        for key in entities:
            if key in model.key_to_index:
                collected.setdefault(key, []).append(model[key])
    if not collected:
        raise ValueError('None of the entities occur in the loaded models')
    return collected


def _mean_vectors(vectors):
    """Average the per-model vectors of every entity component-wise (float32)."""
    # NOTE(review): this averages vectors coming from different model files;
    # it presumes those runs share a comparable vector space -- confirm.
    return {key: np.average(np.array(vecs, dtype='float32'), axis=0)
            for key, vecs in vectors.items()}


def _similarity_frame(similarities, ids):
    """Return *similarities* as a DataFrame labelled with *ids* and a zero diagonal."""
    # squareform() only accepts matrices whose diagonal is zero, so the
    # self-similarities are blanked on a copy of the matrix.
    zero_diagonal = np.array(similarities, copy=True)
    np.fill_diagonal(zero_diagonal, 0)
    return pd.DataFrame(zero_diagonal, index=ids, columns=ids)


# ---- model set A ----
model_dir = Path(path_results) / 'modelA_iteration=100'
files, models = _load_keyed_vectors(model_dir)

vectors_A = _collect_entity_vectors(models, entities)
vectors_mean_A = _mean_vectors(vectors_A)
idsinEmbedding_A = list(vectors_mean_A)
vectors_ls_A = list(vectors_mean_A.values())

similarities_A = cosine_similarity(vectors_ls_A)
data_A = _similarity_frame(similarities_A, idsinEmbedding_A)

# ---- model set B ----
model_dir = Path(path_results) / 'modelB_iteration=100'
files, models = _load_keyed_vectors(model_dir)

vectors_B = _collect_entity_vectors(models, entities)
vectors_mean_B = _mean_vectors(vectors_B)
idsinEmbedding_B = list(vectors_mean_B)
vectors_ls_B = list(vectors_mean_B.values())

similarities_B = cosine_similarity(vectors_ls_B)
data_B = _similarity_frame(similarities_B, idsinEmbedding_B)

# The Mantel test compares the matrices cell by cell, so both must describe the
# same entities in the same order. Restrict them to the shared IDs before
# sorting; otherwise differing vocabularies would misalign the comparison.
shared_ids = sorted(set(idsinEmbedding_A) & set(idsinEmbedding_B))
if not shared_ids:
    raise ValueError('Model sets A and B have no entity in common')
if len(shared_ids) != len(idsinEmbedding_A) or len(shared_ids) != len(idsinEmbedding_B):
    print(f'Comparing {len(shared_ids)} shared entities '
          f'(A has {len(idsinEmbedding_A)}, B has {len(idsinEmbedding_B)})')

sorted_matrix_A = data_A.loc[shared_ids, shared_ids].sort_index(axis=0).sort_index(axis=1)
sorted_matrix_B = data_B.loc[shared_ids, shared_ids].sort_index(axis=0).sort_index(axis=1)

# Condensed (upper-triangle) form, as expected by mantel.test().
dist1 = distance.squareform(sorted_matrix_A.values)
dist2 = distance.squareform(sorted_matrix_B.values)

In [10]:
# Mantel test between the two condensed matrices: Pearson correlation,
# 10000 permutations, one-sided ("upper" tail).
# dist1/dist2 hold cosine similarities rather than distances; Pearson's r is
# unchanged when both inputs are mapped through the same 1 - x transform.
result = mantel.test(dist1, dist2, perms=10000, method='pearson', tail='upper')

In [11]:
result

MantelResult(0.9991035290433951, 0.0001, 99.14614945587492)