# NER Workflow for English Literary Histories

--- Last edited: 2024-09-26 ---

This workflow is designed to extract named entities of persons and works of art by fine-tuning the existing spaCy NER module. After testing the fine-tuned NER with a manually compiled gold standard, the [spaCy wrapper](https://spacy.io/universe/project/spacyfishing) for  [entity-fishing](https://github.com/kermitt2/entity-fishing) is used to disambiguate the found entities.

### Required Packages

In [1]:
import collections
from datetime import date
import glob
import json
import os
from pathlib import Path
import pickle
import random
import re
import string
import sys
import time
import csv
import shutil
import requests
from tqdm.notebook import tqdm
import ast

import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

import nltk
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import sklearn
from sklearn import cluster
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

import spacy
import spacy_transformers
from spacy.pipeline import EntityRuler
from spacy.training.example import Example
from spacy.scorer import Scorer
from spacy.tokens import DocBin
from spacy.training import offsets_to_biluo_tags

from thefuzz import fuzz
from thefuzz import process

import networkx as nx

### Functions

In [2]:
def load_data(file):
    with open(file, "r", encoding = "utf-8") as f:
        data = json.load(f)
    return(data)

def save_data(file, data):
    with open(file, "w", encoding = "utf-8") as f:
        json.dump(data, f, indent = 4)

In [3]:
def test_model(model, text):
    doc = nlp(text)
    results = []
    for ent in doc.ents:
        results.append((ent.text, ent.label_))
    return(results)

In [4]:
def is_instance_of_human(wikidata_id):
    # URL to fetch the entity data
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return False

    data = response.json()
    
    # Extract the entity data
    entity_data = data['entities'].get(wikidata_id, {})
    
    # Extract claims (properties)
    claims = entity_data.get('claims', {})
    
    # Check for "instance of" property (P31)
    if 'P31' in claims:
        for claim in claims['P31']:
            mainsnak = claim.get('mainsnak', {})
            datavalue = mainsnak.get('datavalue', {})
            value = datavalue.get('value', {})
            if value.get('id') == 'Q5':  # Q5 is the ID for 'human'
                return True
    
    return False

In [5]:
def is_irrelevant_occupation(wikidata_id):
    import requests

    # URL to fetch the entity data
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return False

    data = response.json()
    
    # Extract the entity data
    entity_data = data['entities'].get(wikidata_id, {})
    
    # Extract claims (properties)
    claims = entity_data.get('claims', {})
    
    # List of relevant IDs for athletes, baseball players, American football players, racehorse trainers, basketball players, TV personalities, rappers, jockeys, footballers, golfers, cyclists, handball players, swimmers, and wrestlers
    occupation_ids = [
        'Q937857',    # 'athlete'
        'Q2066131',   # 'sports player'
        'Q10871364',  # 'baseball player'
        'Q19204627',  # 'American football player'
        'Q1622272',   # 'racehorse trainer'
        'Q3665646',   # 'basketball player'
        'Q28389',     # 'TV personality'
        'Q196452',    # 'rapper'
        'Q37226',     # 'jockey'
        'Q2736',      # 'footballer (soccer player)'
        'Q11513337',  # 'golfer'
        'Q14960',     # 'cyclist'
        'Q192061',    # 'handball player'
        'Q24488',     # 'swimmer'
        'Q183'        # 'wrestler'
    ]
    
    # Check for "instance of" property (P31) or "occupation" property (P106)
    for prop in ['P31', 'P106']:
        if prop in claims:
            for claim in claims[prop]:
                mainsnak = claim.get('mainsnak', {})
                datavalue = mainsnak.get('datavalue', {})
                value = datavalue.get('value', {})
                if value.get('id') in occupation_ids:
                    return True

    return False


In [6]:
def is_instance_of_literarywork(wikidata_id):
    # URL to fetch the entity data
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return False

    data = response.json()
    
    # Extract the entity data
    entity_data = data['entities'].get(wikidata_id, {})
    
    # Extract claims (properties)
    claims = entity_data.get('claims', {})
    
    # Check for "instance of" property (P31)
    if 'P31' in claims:
        for claim in claims['P31']:
            mainsnak = claim.get('mainsnak', {})
            datavalue = mainsnak.get('datavalue', {})
            value = datavalue.get('value', {})
            if value.get('id') == 'Q7725634':  # Q7725634 is the ID for 'literary work'
                return True
            if value.get('id') == 'Q47461344':  # Q47461344 is the ID for 'written work'
                return True
            if value.get('id') == 'Q838948':  # Q838948 is the ID for 'work of art'
                return True
            if value.get('id') == 'Q116476516':  # Q116476516 is the ID for 'dramatic work'
                return True
    
    return False

### Finetuning spaCy's NER for a Corpus of Literary Historical Texts

In [7]:
#today = re.sub('-', '', str(date.today()))
today = "20240919"

In [8]:
base_dir = 'C:/Users/Brottrager/Documents/Diss/sec_lit/ENG/'
model_path = 'C:/Users/Brottrager/Documents/Diss/code/python/bespokeNER/output_ENG/'
path_results = 'C:/Users/Brottrager/Documents/Diss/sec_lit/ENG/' + today + '_wordembeddings'

In [9]:
if not os.path.exists(path_results):
    os.makedirs(path_results)

The manually annotated texts are imported, split into sentencens, and then turned into training data in json format.

In [10]:
corpus_path = base_dir + 'lit_histories_annotated'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

In [11]:
nlp = spacy.load("en_core_web_lg")

In [12]:
nlp.max_length = 4000000 

In [13]:
config = {"punct_chars": ['!', '.', '?']}
nlp.add_pipe("sentencizer", config=config, before="parser")

<spacy.pipeline.sentencizer.Sentencizer at 0x213261b2990>

In [None]:
sentences = []
entities_matches = []
for file in files:
    #print('File ' + str(counter) + ' is being processed!')
    all_paragraphs = ''
    with open (file, encoding = "utf-8") as f:
        text = f.read().split("\n")
        for paragraph in text:
            if paragraph != "":
                doc = nlp(paragraph)
                for sent in doc.sents:
                    newString = str(sent)
                    entities_sent_matches = []
                    num_entities = sum(1 for _ in re.finditer('<.+?>', str(newString)))
                    while num_entities > 0: 
                        entities = re.finditer('<.+?>', str(newString))
                        entities_accum = []
                        for e in entities:
                            #print(e)
                            entity_name = re.sub('<(.+?);.+?>', '\\1', e.group())
                            tag = re.sub('<.+(WORK_OF_ART|PERSON)>', '\\1', e.group())
                            newString = newString[0:e.start()] + entity_name + newString[e.end():len(newString)]
                            entity_name = re.sub(r'([\(\)\[\]])', r'\\\1', entity_name)
                            entities_accum.append([e.start(), e.start()+len(entity_name), tag])
                            num_entities = num_entities - 1 
                            break
                        entities_sent_matches.append(entities_accum)
                    entities_matches.append([x for xs in entities_sent_matches for x in xs])
                    sentences.append(newString)

In [None]:
TRAIN_DATA = []
counter = 0
for ent in entities_matches:
    if len(ent) != 0:
        TRAIN_DATA.append([sentences[counter], {"entities":  ent}])
    counter += 1

In [None]:
save_data(base_dir + today + '_final_manual_training_data_ENG.json', TRAIN_DATA)

In [None]:
GOLD_STANDARD = load_data(base_dir + '20231010_gold_standard_complete.json')

In [None]:
TRAIN_DATA = load_data(base_dir + today + '_final_manual_training_data_ENG.json')

#### Training

In [None]:
db = DocBin() # create a DocBin object

for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk(today + "_final_manual_train_ENG.spacy") # save the docbin object

The model is trained in the command line:

https://spacy.io/usage/training#config

CMD

python -m spacy init fill-config base_config_ENG.cfg config.cfg

python -m spacy train config.cfg --output ./output_ENG --paths.train ./20240821_final_manual_train_ENG.spacy --paths.dev ./20240821_final_manual_train_ENG.spacy 

In [None]:
trained = spacy.load(model_path + 'model-best') 
trained.max_length = 3000000

#### Testing

To evaluate the best fine-tuned model, the model's performance is compared to the basic nlp model and a combined nlp+fine-tuned model.

In [None]:
scorer = Scorer()

In [None]:
examples_nlp = []
for text, annotations in GOLD_STANDARD:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    example.predicted = nlp(str(example.predicted))
    examples_nlp.append(example)

In [None]:
examples_trained = []
for text, annotations in GOLD_STANDARD:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    example.predicted = trained(str(example.predicted))
    examples_trained.append(example)

In [None]:
dict_scores = [scorer.score(examples_nlp), 
               scorer.score(examples_trained)]

In [None]:
p_worksofart = []
r_worksofart = []
f_worksofart = []

p_person = []
r_person = []
f_person = []

dicts = ['Base NLP model', 'Fine-tuned model']

i = 0
for d in dict_scores:
    print('========== ' + dicts[i] + ' ==========\n')
    
    print('WORK_OF_ART')
    print('Precision: ' + str(d['ents_per_type']['WORK_OF_ART']['p']))
    print('Recall: ' + str(d['ents_per_type']['WORK_OF_ART']['r']))
    print('F1: ' + str(d['ents_per_type']['WORK_OF_ART']['f']))
    print('\n')
    
    print('PERSON')
    print('Precision: ' + str(d['ents_per_type']['PERSON']['p']))
    print('Recall: ' + str(d['ents_per_type']['PERSON']['r']))
    print('F1: ' + str(d['ents_per_type']['PERSON']['f']))
    print('\n')
    
    i += 1

### Entity Fishing

For the entity fishing, an instance of entity-fishing has to be run locally with Docker (see this [tutorial](https://nerd.readthedocs.io/en/latest/docker.html)). After pulling the image from Docker Hub and downloading the necessary data resources, the container can be run with this git command:

docker run --rm -p 8090:8090 -p 8091:8091 \\

  -v C:/entity-fishing/data/db-kb/db-kb:/opt/entity-fishing/data/db/db-kb \\
  
  -v C:/entity-fishing/data/db-en/db-en:/opt/entity-fishing/data/db/db-en \\
  
  -v C:/entity-fishing/data/db-de/db-de:/opt/entity-fishing/data/db/db-de \\
  
  grobid/entity-fishing:0.0.6

In [None]:
trained.add_pipe("entityfishing", config={"extra_info": True, "language": "en", "api_ef_base": "http://localhost:8090/service"})

In [None]:
corpus_path = base_dir + 'lit_histories_preprocessed'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

In [None]:
newpath = base_dir + today + '_txt_search&replace_model=trained'
if not os.path.exists(newpath):
    os.makedirs(newpath)

#### Search & Replace Logic

In this first iteration, the NER tagger is used on all files that have not yet been annotated. If an entity is detected, it is replaced by a sequence of the detected string, a normalised entity name, its wikiID (if applicable), and a tag for PERSON or WORK_OF_ART, placed in pointed brackets.

In [None]:
persons = []
works_of_art = []
for file in tqdm(files):
    all_paragraphs = ''
    with open (file, encoding = "utf-8") as f:
        text = f.read().split("\n")
        for paragraph in text:
            doc = trained(paragraph)
            newString = paragraph
            for e in reversed(doc.ents): #reversed to not modify the offsets of other entities when substituting
                if e.label_ == "PERSON":
                    start = e.start_char
                    end = start + len(e.text)
                    if str(e._.normal_term).upper() != 'NONE':
                        normalised = str(e._.normal_term).upper().replace(' ', '_')
                        normalised = normalised.replace(',', '')
                        newString = newString[:start] + '<' + e.text + '; ' + normalised + '; ' + str(e._.kb_qid) + '; ' + e.label_ + '>' +  newString[end:]
                        works_of_art.append([e.text + "," + str(e._.normal_term).upper().replace(' ', '_')  + "," + str(e._.kb_qid) + "," + e.label_])
                    else:
                        newString = newString[:start] + '<' + e.text + '; ' + e.text.upper().replace(' ', '_') + '; noWikiID; ' + e.label_ +'>' + newString[end:]
                        works_of_art.append([e.text + "," + e.text.upper().replace(' ', '_')  + "," + str(e._.kb_qid) + "," + e.label_])
                if e.label_ == "WORK_OF_ART":
                    start = e.start_char
                    end = start + len(e.text)
                    if str(e._.normal_term).upper() != 'NONE':
                        normalised = str(e._.normal_term).upper().replace(' ', '_')
                        normalised = normalised.replace(',', '')
                        newString = newString[:start] + '<' + e.text + '; ' + normalised + '; ' + str(e._.kb_qid) + '; ' + e.label_ + '>' +  newString[end:]
                        works_of_art.append([e.text + "," + str(e._.normal_term).upper().replace(' ', '_')  + "," + str(e._.kb_qid) + "," + e.label_])
                    else:
                        newString = newString[:start] + '<' + e.text + '; ' + e.text.upper().replace(' ', '_') + '; noWikiID; ' + e.label_ +'>' + newString[end:]
                        works_of_art.append([e.text + "," + e.text.upper().replace(' ', '_')  + "," + str(e._.kb_qid) + "," + e.label_])

            all_paragraphs = all_paragraphs + '\n' + newString
                
        file_name = re.sub('.+lit_histories_preprocessed.(.+.txt)', '\\1', str(file))
        file_name = newpath + '/' + file_name
        f = open(file_name, "w",  encoding = "utf-8")
        f.write(all_paragraphs)
        f.close()

In [None]:
newpath = base_dir + today + '_final_corpus_ENG'
if not os.path.exists(newpath):
    os.makedirs(newpath)

In [None]:
corpus_path = base_dir + today + '_txt_search&replace_model=trained'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

The newly annotated files are now copied into a new directory, together with the manually annotated files. Here, I also went through a first round of manual correction, replacing obvious mis-identifications.

In [None]:
for file in files:
    file_name = re.sub('.+txt_search&replace_model=trained.(.+.txt)', '\\1', str(file))
    new_file_name = newpath + '/' + file_name
    shutil.copyfile(file, new_file_name)

In [None]:
annotated = base_dir + 'lit_histories_annotated'
corpus_dir = Path(annotated).glob('*.txt')
files = list(corpus_dir)

In [None]:
for file in files:
    file_name = re.sub('.+lit_histories_annotated.(.+.txt)', '\\1', str(file))
    new_file_name = newpath + '/' + file_name
    shutil.copyfile(file, new_file_name)

In [None]:
corpus_path = base_dir + today + '_final_corpus_ENG'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

To prepare a consolidation of name variations, all detected entities that have a wikiID are extracted from the text files---both automatically and manually annotated---and are stored in two dictionaries. 

In [None]:
dict_per = {}
dict_text = {}

for file in tqdm(files):
    with open (file, encoding = "utf-8") as f:
        text = f.read()
        matches = re.findall( r'<.+?>', text)
        for m in matches:
            if len(re.findall(';', m)) > 1:
                key = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\1', m)
                wikiname = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\2', m)
                wikiID = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\3', m)
                tag = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\4', m)
                if tag == 'PERSON':
                    if wikiID != 'noWikiID':
                        if wikiname in dict_per:
                            if key in dict_per[wikiname][0]:
                                dict_per[wikiname][3] += 1
                            else:
                                dict_per[wikiname][0].append(key)
                                dict_per[wikiname][3] += 1
                        else:
                            dict_per[wikiname] = [[key], wikiID, tag, 1]
                elif tag == 'WORK_OF_ART':
                    if wikiID != 'noWikiID':
                        if wikiname in dict_text:
                            if key in dict_text[wikiname][0]:
                                dict_text[wikiname][3] += 1
                            else:
                                dict_text[wikiname][0].append(key)
                                dict_text[wikiname][3] += 1
                        else:
                            dict_text[wikiname] = [[key], wikiID, tag, 1]

In [None]:
keys = list(dict_per.keys())
keys.sort()
dict_per = {i: dict_per[i] for i in keys}

In [None]:
keys = list(dict_text.keys())
keys.sort()
dict_text = {i: dict_text[i] for i in keys}

These dictionaries are now filtered: dict_per_filtered should only include entries that are linked to a person, dict_text_filtered only entries that are a written work, literary work, and so on. This helps to exclude non-sensical but common mis-identifications (city names, race horses, athletes, you name it). 

In [None]:
dict_per_filtered = {}

for key, values in tqdm(dict_per.items()):
    if is_instance_of_human(values[1]) == True:
        if is_irrelevant_occupation(values[1]) == False:
            dict_per_filtered[key] = values    

In [None]:
dict_text_filtered = {}

for key, values in tqdm(dict_text.items()):
    if is_instance_of_literarywork(values[1]) == True:
        dict_text_filtered[key] = values

In [None]:
save_data(base_dir + today + '_ENG_dict_entities_PER.json', dict_per_filtered)
save_data(base_dir + today + '_ENG_dict_entities_WORK_OF_ART.json', dict_text_filtered)

In [None]:
#dict_per_filtered = load_data(base_dir + today +'_ENG_dict_entities_PER.json')
#dict_text_filtered = load_data(base_dir + today + '_ENG_dict_entities_WORK_OF_ART.json')

The dictionaries are now inverted; with the different name variations as keys and the wikinames as values.

In [None]:
inverted_dict_per = {}

for key, values in dict_per_filtered.items():
    for value in values[0]:
        if value in inverted_dict_per:
            inverted_dict_per.pop(value)
        else:
            inverted_dict_per[value] = key

In [None]:
inverted_dict_text = {}

for key, values in dict_text_filtered.items():
    for value in values[0]:
        if value in inverted_dict_text:
            inverted_dict_text.pop(value)
        else:
            inverted_dict_text[value] = key

In [None]:
consolidated_dir = base_dir + today + '_final_corpus_ENG_consolidated'
if not os.path.exists(consolidated_dir):
    os.makedirs(consolidated_dir)

In the second iteration, all entities that have not been attributed a wikiID are compared to existing entries in both dictioniaries and are, if applicable, updated to match the dictionary information. If an entity has been deleted from a dictionary (because, for example, is_instance_of_human yielded False), the entity is also compared to existing disambiguated entities or updated to a 'noWikiID' entry.

In [None]:
for file in tqdm(files):
    with open (file, encoding = "utf-8") as f:
        text = f.read().split("\n")
        newText = ''
        for paragraph in text:
            newString = paragraph
            entities = re.finditer('<.+?>', newString)
            if entities is not None:
                for e in reversed(list(entities)):
                    key = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\1', e.group())
                    wikiname = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\2', e.group())
                    wikiID = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\3', e.group())
                    tag = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\4', e.group())
                    if tag == 'PERSON':
                        if wikiID == 'noWikiID':
                            if wikiname in dict_per_filtered:
                                wikiID = dict_per_filtered[wikiname][2]
                            elif key in inverted_dict_per:
                                wikiname = inverted_dict_per[key]
                                wikiID = dict_per_filtered[inverted_dict_per[key]][1]
                            else:
                                fuzzy_matches = process.extract(key, inverted_dict_per.keys())
                                if fuzzy_matches[0][1] >= 95:
                                    wikiname = inverted_dict_per[fuzzy_matches[0][0]]
                                    wikiID = dict_per_filtered[inverted_dict_per[fuzzy_matches[0][0]]][1]
                                    
                            newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]

                        else:
                            if wikiname not in dict_per_filtered:
                                if key in inverted_dict_per:
                                    wikiname = inverted_dict_per[key]
                                    wikiID = dict_per_filtered[inverted_dict_per[key]][1]
                                else:
                                    wikiname = key.upper().replace(' ', '_')
                                    wikiname = wikiname.replace(',', '')
                                    wikiID = 'noWikiID'
                                    
                            newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
                    if tag == 'WORK_OF_ART':
                        if wikiID == 'noWikiID':
                            if wikiname in dict_text_filtered:
                                wikiID = dict_text_filtered[wikiname][2]
                            elif key in inverted_dict_text:
                                wikiname = inverted_dict_text[key]
                                wikiID = dict_text_filtered[inverted_dict_text[key]][1]
                            else:
                                fuzzy_matches = process.extract(key, inverted_dict_text.keys())
                                if fuzzy_matches[0][1] >= 95:
                                    wikiname = inverted_dict_text[fuzzy_matches[0][0]]
                                    wikiID = dict_text_filtered[inverted_dict_text[fuzzy_matches[0][0]]][1]
                                    
                            newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
                        
                        else:
                            if wikiname not in dict_text_filtered:
                                if key in inverted_dict_text:
                                    wikiname = inverted_dict_text[key]
                                    wikiID = dict_text_filtered[inverted_dict_text[key]][1]
                                else:
                                    wikiname = key.upper().replace(' ', '_')
                                    wikiname = wikiname.replace(',', '')
                                    wikiID = 'noWikiID'
                                    
                                newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
            
            newText = newText + '\n' + newString
            
    file_name = re.sub('.+ENG.(.+.txt)', '\\1', str(file))
    file_name = consolidated_dir + '/' + file_name
    f = open(file_name, "w",  encoding = "utf-8")
    f.write(newText)
    f.close()

In [None]:
corpus_path = base_dir + today + '_final_corpus_ENG_consolidated'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

After the consolidation, some last manual corrections have been made. Based on the consolidated text files two new dictionaries are compiled that now include all detected entities, with or without wikiIDs, for persons and literary texts, respectively. They are sorted alphabetically, and then exported as json and csv files. 

<Walter Scott; WALTER_SCOTT_(CANADIAN_POLITICIAN); Q1386411; PERSON> --> <Walter Scott; WALTER_SCOTT; Q79025; PERSON>

<Wordsworth; WORDSWORTH_(RAPPER); Q4020823; PERSON> --> <Wordsworth; WILLIAM_WORDSWORTH; Q45546; PERSON>

<Michael Smith; MICHAEL_SMITH_(DARTS_PLAYER); Q6834464; PERSON> --> <Michael Smith; MICHAEL_SMITH; noWikiID; PERSON>

<Charles Read; CHARLES_READ_(SQUASH_PLAYER); Q5081957; PERSON> --> <Charles Read; CHARLES_READ; noWikiID; PERSON>

<Declan Hughes; DECLAN_HUGHES_(SNOOKER_PLAYER); Q4502680; PERSON> --> <Declan Hughes; DECLAN_HUGHES; noWikiID; PERSON>

<Sydney Lee; SYDNEY_LEE_(SNOOKER_PLAYER); Q22097886; PERSON> --> <Sydney Lee; SYDNEY_LEE; noWikiID; PERSON>

<Alec Brown; ALEC_BROWN_(SNOOKER_PLAYER); Q4096095; PERSON> --> <Alec Brown; ALEC_BROWN; noWikiID; PERSON>

<George Walker; GEORGE_WALKER_(CHESS_PLAYER); Q1508508; PERSON> --> <George Walker; GEORGE_WALKER; noWikiID; PERSON>

<Robert Owen; ROBERT_OWEN_(DARTS_PLAYER); Q39074894; PERSON> --> <Robert Owen; ROBERT_OWEN; noWikiID; PERSON>

<John Bowles; JOHN_BOWLES_(DARTS_PLAYER); Q6222702; PERSON> --> <John Bowles; JOHN_BOWLES; noWikiID; PERSON>

<Trevelyan; JOHN_TREVELYAN_(CHESS_PLAYER); Q67641500; PERSON> --> <Trevelyan; JOHN_TREVELYAN; noWikiID; PERSON>

<Bob Quinn; BOB_QUINN_(BASEBALL_BORN_1870); Q4933713; PERSON> --> <Bob Quinn; BOB_QUINN_(BASEBALL_BORN_1870); Q4933713; PERSON>

<Hess; DAVID_HESS_(BASEBALL); Q50578172; PERSON> --> <Hess; MYRA_HESS; Q269848; PERSON>

<Alan Richardson; ALAN_RICHARDSON_(FOOTBALLER_BORN_1965); Q4707627; PERSON> --> <Alan Richardson; ALAN_RICHARDSON; noWikiID; PERSON>

<Liam O'Flaherty; LIAM_O'FLAHERTY_(FOOTBALLER); Q6539752; PERSON> --> <Liam O'Flaherty; LIAM_O'FLAHERTY; Q370973; PERSON>

<Joyce; JOYCE_(SINGER); Q3084419; PERSON> --> <Joyce; JAMES_JOYCE; Q6882; PERSON>

<John Tomlin; JOHN_TOMLIN_(AMERICAN_FOOTBALL); Q6260995; PERSON> --> <John Tomlin; JOHN_TOMLIN; noWikiID; PERSON>

<Peter Bell; PETER_BELL_(FOOTBALLER_BORN_1976); Q7172757; PERSON> --> <Peter Bell; PETER_BELL_(FOOTBALLER_BORN_1976); Q7172757; PERSON>

<Richard Head; RICHARD_HEAD_(FOOTBALLER); Q7326351; PERSON> --> <Richard Head; RICHARD_HEAD; noWikiID; PERSON>

<John Parry; JOHN_PARRY_(AMERICAN_FOOTBALL_OFFICIAL); Q16196206; PERSON> --> <John Parry; JOHN_PARRY; noWikiID; PERSON>

<Hugh O'Neill; HUGH_O'NEILL_(CANADIAN_FOOTBALL); Q5932150; PERSON> --> <Hugh O'Neill; HUGH_O'NEILL; noWikiID; PERSON>

<John O'Keeffe; JOHN_O'KEEFFE_(GAELIC_FOOTBALLER); Q6250925; PERSON> --> <John O'Keeffe; JOHN_O'KEEFFE; noWikiID; PERSON>

<Tom Hickey; TOM_HICKEY_(FOOTBALLER_BORN_1991); Q7816187; PERSON> --> <Tom Hickey; TOM_HICKEY; noWikiID; PERSON>

<McGuinness; TONY_MCGUINNESS_(FOOTBALLER); Q7822935; PERSON> --> <McGuinness; MCGUINNESS; noWikiID; PERSON>

<John Rosenberg; JOHN_ROSENBERG_(AMERICAN_FOOTBALL); Q6255640; PERSON> --> <John Rosenberg; JOHN_ROSENBERG; noWikiID; PERSON>

<John Bender; JOHN_BENDER_(GRIDIRON_FOOTBALL); Q6221502; PERSON> --> <John Bender; JOHN_BENDER; noWikiID; PERSON>

<James Parsons; JAMES_PARSONS_(FOOTBALLER); Q29568417; PERSON> --> <James Parsons; JAMES_PARSONS; noWikiID; PERSON>

<George Petrie; GEORGE_PETRIE_(AMERICAN_FOOTBALL); Q5543373; PERSON> --> <George Petrie; GEORGE_PETRIE; noWikiID; PERSON>

In [None]:
dict_per = {}
dict_text = {}

for file in tqdm(files):
    with open (file, encoding = "utf-8") as f:
        text = f.read()
        matches = re.findall( r'<.+?>', text)
        for m in matches:
            if len(re.findall(';', m)) > 1:
                key = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\1', m)
                wikiname = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\2', m)
                wikiID = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\3', m)
                tag = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\4', m)
                if tag == 'PERSON':
                    if wikiname in dict_per:
                        if key in dict_per[wikiname][0]:
                            dict_per[wikiname][3] += 1
                        else:
                            dict_per[wikiname][0].append(key)
                            dict_per[wikiname][3] += 1
                    else:
                        dict_per[wikiname] = [[key], wikiID, tag, 1]
                elif tag == 'WORK_OF_ART':
                    if wikiname in dict_text:
                        if key in dict_text[wikiname][0]:
                            dict_text[wikiname][3] += 1
                        else:
                            dict_text[wikiname][0].append(key)
                            dict_text[wikiname][3] += 1
                    else:
                        dict_text[wikiname] = [[key], wikiID, tag, 1]

In [None]:
keys = list(dict_per.keys())
keys.sort()
dict_per = {i: dict_per[i] for i in keys}

In [None]:
keys = list(dict_text.keys())
keys.sort()
dict_text = {i: dict_text[i] for i in keys}

In [None]:
save_data(base_dir + today + '_ENG_dict_all_entities_PER.json', dict_per)
save_data(base_dir + today + '_ENG_dict_all_entities_WORK_OF_ART.json', dict_text)

In [None]:
#dict_per = load_data(base_dir + today + '_ENG_dict_all_entities_PER.json')
#dict_text = load_data(base_dir + today + '_ENG_dict_all_entities_WORK_OF_ART.json')

In [None]:
titles = list(dict_text.keys())
details = list(dict_text.values())

# Convert to DataFrame
df = pd.DataFrame(details, columns=['variations', 'wikiID', 'tag', 'count'])
df.insert(0, 'wikiname', titles)

In [None]:
df.to_csv(base_dir + today + '_texts_df_tobeclustered.csv', encoding='utf8')

In [None]:
persons = list(dict_per.keys())
details = list(dict_per.values())

# Convert to DataFrame
df = pd.DataFrame(details, columns=['variations', 'wikiID', 'tag', 'count'])
df.insert(0, 'wikiname', persons)

In [None]:
df.to_csv(base_dir + today + '_persons_df_tobeclustered.csv', encoding='utf8')

### OpenRefine

OpenRefine: export as xslx, save as csv

In [None]:
df_texts = pd.read_csv(base_dir + today + '-texts-df-clustered.csv', encoding='utf8', sep=';')
df_persons = pd.read_csv(base_dir + today + '-persons-df-clustered.csv', encoding='utf8', sep=';')

In [None]:
df_texts['variations'] = df_texts['variations'].apply(ast.literal_eval)
df_persons['variations'] = df_persons['variations'].apply(ast.literal_eval)

In [None]:
dict_text_clustered = {}

for index, row in df_texts.iterrows():
    if row.iloc[1] in dict_text_clustered: # is wikiname in dict?
        dict_text_clustered[row.iloc[1]][0] = dict_text_clustered[row.iloc[1]][0] + row.iloc[2] # if yes, then add variations and check if wikiID can be added
        if row.iloc[3] != "noWikiID": 
            dict_text_clustered[row.iloc[1]][1] = row.iloc[3]
        dict_text_clustered[row.iloc[1]][3] += row.iloc[5] # add up the count
    else:
        dict_text_clustered[row.iloc[1]] = [row.iloc[2], row.iloc[3], row.iloc[4], row.iloc[5]] # wikiname, variations, wikiID; tag, count

In [None]:
dict_persons_clustered = {}

for index, row in df_persons.iterrows():
    if row.iloc[1] in dict_persons_clustered: # is wikiname in dict?
        dict_persons_clustered[row.iloc[1]][0] = dict_persons_clustered[row.iloc[1]][0] + row.iloc[2] # if yes, then add variations and check if wikiID can be added
        if row.iloc[3] != "noWikiID": 
            dict_persons_clustered[row.iloc[1]][1] = row.iloc[3]
        dict_persons_clustered[row.iloc[1]][3] += row.iloc[5] # add up the count
    else:
        dict_persons_clustered[row.iloc[1]] = [row.iloc[2], row.iloc[3], row.iloc[4], row.iloc[5]] # wikiname, variations, wikiID; tag, count

In [None]:
#save_data(base_dir + today + '_ENG_dict_all_entities_WORK_OF_ART_final.json', dict_text_clustered)
#save_data(base_dir + today + '_ENG_dict_all_entities_PERSON_final.json', dict_persons_clustered)

In [11]:
dict_text_clustered = load_data(base_dir + today + '_ENG_dict_all_entities_WORK_OF_ART_final.json')
dict_persons_clustered = load_data(base_dir + today + '_ENG_dict_all_entities_PERSON_final.json')

In [12]:
len(dict_text_clustered)

15577

In [13]:
dict_text_clustered_inverted = {}

for key, values in dict_text_clustered.items():
    for value in values[0]:
        dict_text_clustered_inverted[value] = key

In [14]:
dict_persons_clustered_inverted = {}

for key, values in dict_persons_clustered.items():
    for value in values[0]:
        dict_persons_clustered_inverted[value] = key

In [None]:
clustered_dir = base_dir + today + '_final_corpus_ENG_clustered'
if not os.path.exists(clustered_dir):
    os.makedirs(clustered_dir)

In [None]:
co_citation_dict = {}

for file in tqdm(files):
    file_short = re.sub('(.+consolidated)\\\\(.+?).txt', '\\2', str(file))
    co_citation_dict[file_short] = {'texts': {}, 'persons': {}}
    with open (file, encoding = "utf-8") as f:
        text = f.read().split("\n")
        newText = ''
        for paragraph in text:
            newString = paragraph
            entities = re.finditer('<.+?>', newString)
            if entities is not None:
                for e in reversed(list(entities)):
                    key = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\1', e.group())
                    wikiname = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\2', e.group())
                    wikiID = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\3', e.group())
                    tag = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\4', e.group())

                    if tag == 'PERSON':
                        if wikiname not in co_citation_dict[file_short]['persons']:
                            co_citation_dict[file_short]['persons'][wikiname] = 1
                        else:
                            co_citation_dict[file_short]['persons'][wikiname] += 1
                    if tag == 'WORK_OF_ART':
                        if wikiname not in co_citation_dict[file_short]['texts']:
                            co_citation_dict[file_short]['texts'][wikiname] = 1
                        else:
                            co_citation_dict[file_short]['texts'][wikiname] += 1
                    
                    if tag == 'PERSON':
                        if wikiname in dict_persons_clustered: # using the unfiltered version of the dict, but still fitlering out entities with a freq of 1 if they don't have a wikiID
                            newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
                        else:
                            if key in dict_persons_clustered_inverted: # if an entity doesn't appear in the dictionary, the title variation is looked up in the inverted dictionary and the entity name is, if applicable, replaced
                                wikiname = dict_persons_clustered_inverted[key]
                                wikiID = dict_persons_clustered[dict_persons_clustered_inverted[key]][1]
                                newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
                            else:
                                newString = newString[0:e.start()] + key + newString[e.end():len(newString)]
                                
                    if tag == 'WORK_OF_ART':
                        if wikiname in dict_text_clustered: # using the unfiltered version of the dict, but still fitlering out entities with a freq of 1 if they don't have a wikiID
                            newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
                        else:
                            if key in dict_text_clustered_inverted: # if an entity doesn't appear in the dictionary, the title variation is looked up in the inverted dictionary and the entity name is, if applicable, replaced
                                wikiname = dict_text_clustered_inverted[key]
                                wikiID = dict_text_clustered[dict_text_clustered_inverted[key]][1]
                                newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]
                            else:
                                newString = newString[0:e.start()] + key + newString[e.end():len(newString)]

            newText = newText + '\n' + newString
            
    file_name = re.sub('.+consolidated.(.+.txt)', '\\1', str(file))
    file_name = clustered_dir + '/' + file_name
    f = open(file_name, "w",  encoding = "utf-8")
    f.write(newText)
    f.close()

In [None]:
save_data(base_dir + today + '_co_citation_dict.json', co_citation_dict)

In [None]:
corpus_path = base_dir + today + '_final_corpus_ENG_clustered'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

In [None]:
replaced_dir = base_dir + today + '_final_corpus_ENG_replaced'
if not os.path.exists(replaced_dir):
    os.makedirs(replaced_dir)

In [None]:
for file in tqdm(files):
    with open (file, encoding = "utf-8") as f:
        text = f.read().split("\n")
        newText = ''
        for paragraph in text:
            newString = paragraph
            entities = re.finditer('<.+?>', newString)
            if entities is not None:
                for e in reversed(list(entities)):
                    key = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\1', e.group())
                    wikiname = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\2', e.group())
                    wikiID = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\3', e.group())
                    tag = re.sub('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>', '\\4', e.group())
                    
                    newString = newString[0:e.start()] + wikiname + '_' + wikiID + newString[e.end():len(newString)]

            newText = newText + '\n' + newString

    file_name = re.sub('.+ENG_clustered.(.+.txt)', '\\1', str(file))
    file_name = replaced_dir + '/' + file_name
    f = open(file_name, "w",  encoding = "utf-8")
    f.write(newText)
    f.close()

### Word Embedding

In [15]:
entities_per = []

for key, values in dict_persons_clustered.items():
    entity = str(key) + '_' + values[1]
    entities_per.append(entity)

entities_text = []

for key, values in dict_text_clustered.items():
    entity = str(key) + '_' + values[1]
    entities_text.append(entity)

In [16]:
entities = set(entities_per + entities_text)

In [17]:
len(entities_text)

15577

In [None]:
fileList = glob.glob(os.path.join(os.getcwd(), replaced_dir, '*.txt'))
 
lithist = []
for file_path in fileList:
    with open(file_path, encoding = 'utf8') as file:
        lithist.append(file.read())

In [29]:
escape = "!#$%&'()*+–, -./:;<=>?@[\]^`{|}~’‘“”"

In [None]:
tokens = []

num_tokens = 0
for hist in tqdm(lithist):
    doc = nlp(hist)
    for sent in doc.sents:
        tokens_in_sent = []
        for token in sent:
            
            if token.text.startswith('\n'):
                continue
            if token.text[0].isdigit():
                continue
            if token.text in escape:
                continue
              
            if token.text not in entities:
                tok = re.sub('[\d!#$%&\(\)*\+, -\./:;<=>?@\[\]^`\{\|\}~’‘“”]+', '', token.text)
                tokens_in_sent.append(tok.lower())
            else:
                tokens_in_sent.append(token.text)

        tokens.append(tokens_in_sent)
        num_tokens += len(tokens_in_sent)

In [11]:
num_tokens = 0

for tok in tokens:
    num_tokens += len(tok)

In [12]:
num_tokens

4076326

In [None]:
#with open(path_results + '\\' + today + '_tokens_preprocessed.pickle', 'wb') as output_file:
#    pickle.dump(tokens, output_file)

In [18]:
with open(path_results + '\\' + today + '_tokens_preprocessed.pickle',  'rb') as input_file:
    tokens = pickle.load(input_file)

In [19]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [20]:
min_count = 5
vector_size = 100
window = 10
sg = 1
epochs = 5

In [21]:
iterations = 100

In [None]:
we_results = path_results + '\\modelA_iteration=' + str(iterations)
if not os.path.exists(we_results):
    os.makedirs(we_results)

In [None]:
models = []
for i in range(1, iterations+1):
    model = Word2Vec(tokens, min_count=min_count, vector_size=vector_size, window=window, sg=sg, epochs=epochs)
    model.wv.save(path_results + '\\modelA_iteration=' + str(iterations) + '\\' + today + '_' + str(i) + '_window=' + str(window) + '_skipgram_defaultparams_model_vectors.kv')
    models.append(model)

In [None]:
we_results = path_results + '\\modelB_iteration=' + str(iterations)
if not os.path.exists(we_results):
    os.makedirs(we_results)

In [None]:
for i in range(1, iterations+1):
    model = Word2Vec(tokens, min_count=min_count, vector_size=vector_size, window=window, sg=sg, epochs=epochs)
    model.wv.save(path_results + '\\modelB_iteration=' + str(iterations) + '\\' + today + '_' + str(i) + '_window=' + str(window) + '_skipgram_defaultparams_model_vectors.kv')

### Networks

#### Network A: All entities

In [22]:
model_dir = Path(path_results + '\\modelA_iteration=' + str(iterations)).glob('*.kv')
files = list(model_dir)

In [23]:
models = []

for file in files:
    models.append(gensim.models.KeyedVectors.load(str(file)))

2025-01-30 16:35:51,463 : INFO : loading KeyedVectors object from C:\Users\Brottrager\Documents\Diss\sec_lit\ENG\20240919_wordembeddings\modelA_iteration=100\20240919_100_window=10_skipgram_defaultparams_model_vectors.kv
2025-01-30 16:35:51,549 : INFO : KeyedVectors lifecycle event {'fname': 'C:\\Users\\Brottrager\\Documents\\Diss\\sec_lit\\ENG\\20240919_wordembeddings\\modelA_iteration=100\\20240919_100_window=10_skipgram_defaultparams_model_vectors.kv', 'datetime': '2025-01-30T16:35:51.549429', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'loaded'}
2025-01-30 16:35:51,550 : INFO : loading KeyedVectors object from C:\Users\Brottrager\Documents\Diss\sec_lit\ENG\20240919_wordembeddings\modelA_iteration=100\20240919_10_window=10_skipgram_defaultparams_model_vectors.kv
2025-01-30 16:35:51,607 : INFO : KeyedVectors lifecycle event {'fname': 'C:\\Users\\Brottr

In [24]:
vectors = {}

for model in models:
    for key in entities:
        if key in model.key_to_index:
            if key not in vectors:
                vectors[key] = [model[key]]
            else:
                vectors[key].append(model[key])

In [25]:
vectors_mean = {}
idsinEmbedding = []
nodes = {}

for key, values in vectors.items():
    array = np.array(vectors[key], dtype='float32')
    vectors_mean[key] = np.average(array, axis=0)
    idsinEmbedding.append(key)
    if key in entities_per:
        nodes[key] = {'label': 'PERSON'}
    elif key in entities_text:
        nodes[key] = {'label': 'WORK_OF_ART'}   

In [27]:
save_data(base_dir + today + '_idsinEmbedding.json', idsinEmbedding)

In [26]:
idsinEmbedding = load_data(base_dir + today + '_idsinEmbedding.json')

In [27]:
len(idsinEmbedding)

3589

In [28]:
vectors_ls = []

for key, values in vectors_mean.items():
    vectors_ls.append(vectors_mean[key])

In [29]:
similarities = cosine_similarity(vectors_ls)

In [30]:
data = pd.DataFrame(similarities)

In [31]:
for i in range(0, len(data)):
    for j in range(0,len(data)):
        if i == j:
            data.iloc[i,j] = 0

In [32]:
len(idsinEmbedding)

3589

In [33]:
data.columns = idsinEmbedding
data.index = idsinEmbedding

In [37]:
data.to_csv(path_results + '//fulladjacencymatrix.csv', encoding='utf8')

Minimum Spanning Tree 

In [41]:
G = nx.from_pandas_adjacency(data)

In [42]:
with open(path_results + '//fullnetwork_linklist.csv', mode='w', newline='', encoding='utf8') as file:
    writer = csv.writer(file)
    # Write header (optional)
    writer.writerow(['Node1', 'Node2'])
    
    # Write each edge as a row
    for u, v in G.edges():
        writer.writerow([u, v])

In [40]:
nx.set_node_attributes(G, nodes)

In [41]:
algorithm = 'kruskal'

In [42]:
for u, v, data in G.edges(data=True):
    data['weight'] = 1 - data['weight'] # similarity vs minimum weight

mst = nx.minimum_spanning_tree(G, algorithm=algorithm)

for u, v, data in mst.edges(data=True):
    data['weight'] = 1 - data['weight']

In [43]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_word_similarity_edges_highsimilarity.csv"

with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    
    for u, v, data in mst.edges(data=True):
        writer.writerow([u, v, data['weight']])

#### Network: PER

In [46]:
vectors_per_ls = []
idsinEmbedding_per = []

for key, values in vectors_mean.items():
    if key in entities_per:
        vectors_per_ls.append(vectors_mean[key])
        idsinEmbedding_per.append(key)

In [47]:
len(idsinEmbedding_per)

2628

In [48]:
save_data(base_dir + today + '_idsinEmbedding_PER.json', idsinEmbedding_per)

In [49]:
similarities_per = cosine_similarity(vectors_per_ls)

In [50]:
data_per = pd.DataFrame(similarities_per)

In [51]:
for i in range(0, len(data_per)):
    for j in range(0,len(data_per)):
        if i == j:
            data_per.iloc[i,j] = 0

In [52]:
data_per.columns = idsinEmbedding_per
data_per.index = idsinEmbedding_per

Minimum Spanning Tree

In [53]:
G_per = nx.from_pandas_adjacency(data_per)

In [54]:
for u, v, data in G_per.edges(data=True):
    data['weight'] = 1 - data['weight'] # similarity vs minimum weight

mst_per = nx.minimum_spanning_tree(G_per, algorithm=algorithm)

for u, v, data in mst_per.edges(data=True):
    data['weight'] = 1 - data['weight']

In [55]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity_PER.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_word_similarity_edges_highsimilarity_PER.csv"

with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    
    for u, v, data in mst_per.edges(data=True):
        writer.writerow([u, v, data['weight']])

#### Network: WORK_OF_ART

In [34]:
vectors_text_ls = []
idsinEmbedding_text = []

for key, values in vectors_mean.items():
    if key in entities_text:
        vectors_text_ls.append(vectors_mean[key])
        idsinEmbedding_text.append(key)

In [35]:
len(idsinEmbedding_text)

1056

In [34]:
save_data(base_dir + today + '_idsinEmbedding_TEXT.json', idsinEmbedding_text)

In [36]:
similarities_text = cosine_similarity(vectors_text_ls)

In [37]:
data_texts = pd.DataFrame(similarities_text)

In [38]:
for i in range(0, len(data_texts)):
    for j in range(0,len(data_texts)):
        if i == j:
            data_texts.iloc[i,j] = 0

In [39]:
data_texts.columns = idsinEmbedding_text
data_texts.index = idsinEmbedding_text

In [40]:
data_texts.to_csv(path_results + '\\' + today + '_window=' + str(window) + 'texts_wordembedding_adjacency.csv', encoding='utf8')

Minimum Spanning Tree

In [39]:
G_texts = nx.from_pandas_adjacency(data_texts)

In [42]:
for u, v, data in G_texts.edges(data=True):
    data['weight'] = 1 - data['weight'] # similarity vs minimum weight

mst_texts = nx.minimum_spanning_tree(G_texts, algorithm=algorithm)

for u, v, data in mst_texts.edges(data=True):
    data['weight'] = 1 - data['weight']

In [43]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity_WORK_OF_ART.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_word_similarity_edges_highsimilarity_WORK_OF_ART.csv"

with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    
    for u, v, data in mst_texts.edges(data=True):
        writer.writerow([u, v, data['weight']])