# NER Workflow for German Literary Histories

--- Last edited: 2024-10-01 ---

This workflow is designed to extract named entities of persons and works of art by fine-tuning the existing spaCy NER module. After testing the fine-tuned NER with a manually compiled gold standard, the [spaCy wrapper](https://spacy.io/universe/project/spacyfishing) for [entity-fishing](https://github.com/kermitt2/entity-fishing) is used to disambiguate the found entities.

In [1]:
import collections
from datetime import date
import glob
import json
import os
from pathlib import Path
import pickle
import random
import re
import string
import sys
import time
import csv
import shutil
import requests
from tqdm.notebook import tqdm
import ast

import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.models import KeyedVectors

import networkx as nx

import nltk
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm

import sklearn
from sklearn import cluster
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

import spacy
import spacy_transformers
from spacy.pipeline import EntityRuler
from spacy.training.example import Example
from spacy.scorer import Scorer
from spacy.tokens import DocBin
from spacy.training import offsets_to_biluo_tags

from thefuzz import fuzz
from thefuzz import process

### Functions

In [2]:
def load_data(file):
    """Parse the UTF-8 encoded JSON file at ``file`` and return its content."""
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def save_data(file, data):
    """Write ``data`` as JSON (indented by 4 spaces) to ``file`` using UTF-8."""
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(obj=data, fp=handle, indent=4)

In [3]:
def test_model(model, text):
    doc = nlp(text)
    results = []
    for ent in doc.ents:
        results.append((ent.text, ent.label_))
    return(results)

In [4]:
def query_entity_fishing(text, language='de'):
    """POST ``text`` to the entity-fishing service at the global ``api_url``.

    The payload is a JSON object with the keys ``text`` and ``language``.
    Returns the decoded JSON response. On an HTTP error status, any other
    request failure or an undecodable body, a message is printed and an
    empty dict is returned instead.
    """
    try:
        payload = json.dumps({
            "text": text,
            "language": language,  # Specify language if needed
        })
        reply = requests.post(
            api_url,
            headers={'Content-Type': 'application/json'},
            data=payload,
        )
        # Turn 4xx/5xx status codes into an HTTPError
        reply.raise_for_status()
        parsed = reply.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Handle HTTP errors
    except requests.exceptions.RequestException as req_err:
        print(f"Request error: {req_err}")  # Handle other request errors
    except ValueError as json_err:
        print(f"JSON decoding error: {json_err}")  # Handle JSON decoding errors
    else:
        return parsed
    return {}

In [5]:
def get_wikidata_label(wikidataid, language='de'):
    """Look up the label of a Wikidata entity via the ``wbgetentities`` API.

    Parameters
    ----------
    wikidataid : str
        Entity ID passed as ``ids`` to the API.
    language : str
        Language code of the requested label (default ``'de'``).

    Returns
    -------
    The label string, or ``None`` when the response holds no label in that
    language or when the request or the JSON decoding fails (a message is
    printed in the failure cases).
    """
    query = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidataid,
        "props": "labels",
        "languages": language,
    }

    try:
        reply = requests.get("https://www.wikidata.org/w/api.php", params=query)
        reply.raise_for_status()

        # Walk entities -> <id> -> labels -> <language> -> value, falling
        # back to an empty dict at every level that is missing.
        entities = reply.json().get("entities", {})
        label_entry = entities.get(wikidataid, {}).get("labels", {}).get(language, {})
        return label_entry.get("value", None)
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Handle HTTP errors
    except requests.exceptions.RequestException as req_err:
        print(f"Request error: {req_err}")  # Handle other request errors
    except ValueError as json_err:
        print(f"JSON decoding error: {json_err}")  # Handle JSON decoding errors
    return None

In [6]:
def is_instance_of_human(wikidata_id):
    """Return True if the entity has an "instance of" (P31) claim with value Q5.

    The entity JSON is fetched from Wikidata's ``Special:EntityData``
    endpoint. A non-200 response, or an entity without a matching claim,
    yields False.
    """
    response = requests.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    )
    if response.status_code != 200:
        return False

    entity = response.json()['entities'].get(wikidata_id, {})
    instance_claims = entity.get('claims', {}).get('P31', [])

    # Q5 is the ID for 'human'
    return any(
        claim.get('mainsnak', {}).get('datavalue', {}).get('value', {}).get('id') == 'Q5'
        for claim in instance_claims
    )

In [7]:
def is_irrelevant_occupation(wikidata_id):
    """Return True if the entity carries one of the excluded occupation IDs.

    The entity JSON is fetched from Wikidata's ``Special:EntityData``
    endpoint and its "instance of" (P31) and "occupation" (P106) claims are
    compared against ``occupation_ids``. A non-200 response, or an entity
    without a matching claim, yields False.
    """
    import requests

    # URL to fetch the entity data
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return False

    data = response.json()
    
    # Extract the entity data
    entity_data = data['entities'].get(wikidata_id, {})
    
    # Extract claims (properties)
    claims = entity_data.get('claims', {})
    
    # List of relevant IDs for athletes, baseball players, American football players, racehorse trainers, basketball players, TV personalities, rappers, jockeys, footballers, golfers, cyclists, handball players, swimmers, and wrestlers
    # NOTE(review): the labels in the trailing comments were not verified
    # here, and several IDs appear not to match them. To my knowledge Q183 is
    # 'Germany', Q1622272 'university teacher', Q37226 'teacher', Q28389
    # 'screenwriter', Q2736 'association football' (the sport) and Q937857
    # 'association football player'. Because P106 is checked, such IDs would
    # exclude e.g. authors who also taught -- TODO confirm every ID against
    # Wikidata before relying on this filter.
    occupation_ids = [
        'Q937857',    # 'athlete'
        'Q2066131',   # 'sports player'
        'Q10871364',  # 'baseball player'
        'Q19204627',  # 'American football player'
        'Q1622272',   # 'racehorse trainer'
        'Q3665646',   # 'basketball player'
        'Q28389',     # 'TV personality'
        'Q196452',    # 'rapper'
        'Q37226',     # 'jockey'
        'Q2736',      # 'footballer (soccer player)'
        'Q11513337',  # 'golfer'
        'Q14960',     # 'cyclist'
        'Q192061',    # 'handball player'
        'Q24488',     # 'swimmer'
        'Q183'        # 'wrestler'
    ]
    
    # Check for "instance of" property (P31) or "occupation" property (P106)
    for prop in ['P31', 'P106']:
        if prop in claims:
            for claim in claims[prop]:
                # Missing levels fall back to {} so value.get('id') is None
                mainsnak = claim.get('mainsnak', {})
                datavalue = mainsnak.get('datavalue', {})
                value = datavalue.get('value', {})
                if value.get('id') in occupation_ids:
                    return True

    return False


In [8]:
def is_instance_of_literarywork(wikidata_id):
    """Return True if the entity is an instance (P31) of an accepted work class.

    The entity JSON is fetched from Wikidata's ``Special:EntityData``
    endpoint. Only direct P31 values are compared; a non-200 response or
    an entity without a matching claim yields False.
    """
    accepted_classes = {
        'Q7725634',    # Q7725634 is the ID for 'literary work'
        'Q47461344',   # Q47461344 is the ID for 'written work'
        'Q838948',     # Q838948 is the ID for 'work of art'
        'Q116476516',  # Q116476516 is the ID for 'dramatic work'
    }

    response = requests.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    )
    if response.status_code != 200:
        return False

    claims = response.json()['entities'].get(wikidata_id, {}).get('claims', {})

    for claim in claims.get('P31', []):
        target = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
        if target.get('id') in accepted_classes:
            return True

    return False

### Finetuning spaCy's NER for a Corpus of Literary Historical Texts

In [9]:
# Date stamp (YYYYMMDD) used in the names of the files and directories this
# notebook reads and writes. It is pinned to a fixed value here; the
# commented-out line would derive it from the current date instead.
#today = re.sub('-', '', str(date.today()))
today = "20240919"

In [10]:
# Root folder of the German secondary-literature corpus
base_dir = 'C:/Users/Brottrager/Documents/Diss/sec_lit/GER/'
# Output folder of the spaCy training run
model_path = 'C:/Users/Brottrager/Documents/Diss/code/python/bespokeNER/output_GER/'
# Date-stamped results folder inside the corpus root
path_results = f'{base_dir}{today}_wordembeddings'

In [11]:
# Create the results folder; exist_ok avoids the check-then-create race and
# makes re-running the cell safe.
os.makedirs(path_results, exist_ok=True)

The manually annotated texts are imported, split into sentences, and then turned into training data in JSON format.

In [12]:
# Collect the manually annotated text files.
corpus_path = base_dir + 'lit_histories_annotated'
corpus_dir = Path(corpus_path).glob('*.txt')
# glob() yields entries in arbitrary, OS-dependent order; sorting keeps the
# derived training data reproducible across runs and machines.
files = sorted(corpus_dir)

In [13]:
# Base pipeline: the pretrained "de_core_news_lg" package, loaded by name.
nlp = spacy.load("de_core_news_lg")

In [14]:
# Raise the pipeline's limit on the length of a single input text.
nlp.max_length = 4000000

In [15]:
# Rule-based sentence splitting on '!', '.' and '?'. The component is
# inserted before the parser so that it runs first in the pipeline.
config = {"punct_chars": ['!', '.', '?']}
nlp.add_pipe("sentencizer", config=config, before="parser")

<spacy.pipeline.sentencizer.Sentencizer at 0x2305dcb7550>

In [74]:
def _strip_inline_annotations(sentence):
    """Remove inline ``<name; ...; TAG>`` markup from ``sentence``.

    Every ``<...>`` match is replaced by the text before its first ``;``
    (the entity as it appears in the running text).

    Returns
    -------
    tuple
        ``(clean_text, spans)`` where ``spans`` is a list of
        ``[start, end, tag]`` character offsets into ``clean_text``, in
        left-to-right order.
    """
    pieces = []
    spans = []
    consumed = 0   # offset in ``sentence`` up to which text has been copied
    clean_len = 0  # length of the cleaned text assembled so far
    for match in re.finditer('<.+?>', sentence):
        markup = match.group()
        # Markup without ';' is left unchanged by this substitution.
        entity_name = re.sub('<(.+?);.+?>', '\\1', markup)
        # The label is the trailing WORK_OF_ART / PER field.
        tag = re.sub('<.+(WORK_OF_ART|PER)>', '\\1', markup)

        untouched = sentence[consumed:match.start()]
        pieces.append(untouched)
        clean_len += len(untouched)

        # Offsets are measured on the text that is actually written out.
        # (Measuring a bracket-escaped copy of the name made spans for
        # names containing ()[] too long.)
        spans.append([clean_len, clean_len + len(entity_name), tag])
        pieces.append(entity_name)
        clean_len += len(entity_name)
        consumed = match.end()
    pieces.append(sentence[consumed:])
    return ''.join(pieces), spans


# sentences[i] is the cleaned sentence, entities_matches[i] its entity spans.
sentences = []
entities_matches = []
for file in tqdm(files):
    with open(file, encoding="utf-8") as f:
        text = f.read().split("\n")
    for paragraph in text:
        if paragraph == "":
            continue
        doc = nlp(paragraph)
        for sent in doc.sents:
            newString, entities_sent_matches = _strip_inline_annotations(str(sent))
            entities_matches.append(entities_sent_matches)
            sentences.append(newString)

  0%|          | 0/6 [00:00<?, ?it/s]

In [75]:
# Pair every sentence with its entity spans, keeping only sentences that
# contain at least one annotated entity.
TRAIN_DATA = [
    [sentence, {"entities": spans}]
    for sentence, spans in zip(sentences, entities_matches)
    if len(spans) != 0
]

In [76]:
# Persist the training examples as date-stamped JSON in the corpus root.
save_data(base_dir + today + '_final_manual_training_data_GER.json', TRAIN_DATA)

In [20]:
# Gold standard used for evaluation below; iterated there as
# (text, annotations) pairs.
GOLD_STANDARD = load_data(base_dir + '20240531_gold_standard_complete.json')

In [126]:
# Reload the training examples from disk (allows resuming without re-parsing).
TRAIN_DATA = load_data(base_dir + today + '_final_manual_training_data_GER.json')

#### Training

In [127]:
# Convert the (text, {"entities": [...]}) pairs into spaCy's binary format.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    kept_spans = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the character offsets inwards to token boundaries;
        # char_span returns None when no span can be built.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            kept_spans.append(span)
            continue
        print("Skipping entity")
    doc.ents = kept_spans
    db.add(doc)

# Written relative to the current working directory
db.to_disk(today + "_final_manual_train_GER.spacy")

  0%|          | 0/13672 [00:00<?, ?it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

https://spacy.io/usage/training#config

CMD

python -m spacy init fill-config base_config_GER.cfg config.cfg


python -m spacy train config.cfg --output ./output_GER --paths.train ./20240821_final_manual_train_GER.spacy --paths.dev ./20240821_final_manual_train_GER.spacy 

In [17]:
# Load the best checkpoint written by `spacy train` into the output folder.
trained = spacy.load(model_path + 'model-best')
trained.max_length = 3000000

#### Testing

In [21]:
# Scorer instance shared by both evaluations below.
scorer = Scorer()

In [22]:
# Evaluation examples for the base model: gold annotations as reference,
# the base pipeline's output as prediction.
examples_nlp = []
for gold_text, gold_annotations in GOLD_STANDARD:
    gold_example = Example.from_dict(nlp.make_doc(gold_text), gold_annotations)
    gold_example.predicted = nlp(str(gold_example.predicted))
    examples_nlp.append(gold_example)


Heinrich Heine (1797..." with entities "[[0, 12, 'WORK_OF_ART'], [14, 28, 'PER'], [30, 44,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

Thomas Mann (18..." with entities "[[35, 46, 'PER'], [200, 209, 'WORK_OF_ART']]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


In [23]:
# Evaluation examples for the fine-tuned model: gold annotations as
# reference, the fine-tuned pipeline's output as prediction.
examples_trained = []
for gold_text, gold_annotations in GOLD_STANDARD:
    gold_example = Example.from_dict(nlp.make_doc(gold_text), gold_annotations)
    gold_example.predicted = trained(str(gold_example.predicted))
    examples_trained.append(gold_example)

In [24]:
# Score dictionaries in the order [base model, fine-tuned model]; the
# reporting cell below relies on this order.
dict_scores = [scorer.score(examples_nlp), 
               scorer.score(examples_trained)]

In [25]:
p_worksofart = []
r_worksofart = []
f_worksofart = []

p_person = []
r_person = []
f_person = []

# Headings for the entries of dict_scores, in the same order
dicts = ['Base NLP model', 'Fine-tuned model']

# (printed heading, label key in 'ents_per_type')
report_sections = (('WORK_OF_ART', 'WORK_OF_ART'), ('PERSON', 'PER'))
# (printed caption, score key)
report_fields = (('Precision', 'p'), ('Recall', 'r'), ('F1', 'f'))

for model_name, d in zip(dicts, dict_scores):
    print('========== ' + model_name + ' ==========\n')

    for heading, label in report_sections:
        print(heading)
        for caption, field in report_fields:
            print(caption + ': ' + str(d['ents_per_type'][label][field]))
        print('\n')


WORK_OF_ART
Precision: 0.0
Recall: 0.0
F1: 0.0


PERSON
Precision: 0.7861080485115767
Recall: 0.9235751295336787
F1: 0.8493150684931506



WORK_OF_ART
Precision: 0.7034883720930233
Recall: 0.7492260061919505
F1: 0.7256371814092952


PERSON
Precision: 0.8846625766871166
Recall: 0.9339378238341969
F1: 0.9086326402016384




### Entity fishing

docker run --rm -p 8090:8090 -p 8091:8091 \

-v C:/entity-fishing/data/db-kb/db-kb:/opt/entity-fishing/data/db/db-kb \

-v C:/entity-fishing/data/db-en/db-en:/opt/entity-fishing/data/db/db-en \

-v C:/entity-fishing/data/db-de/db-de:/opt/entity-fishing/data/db/db-de \

grobid/entity-fishing:0.0.6


In [18]:
# Append the entity-fishing linker to the fine-tuned pipeline. It talks to
# the service at localhost:8090 (the port published by the docker command
# above) and is configured for German.
trained.add_pipe("entityfishing", config={"extra_info": True, "filter_statements":['P569', 'P570'], "language": "de", "api_ef_base": "http://localhost:8090/service"})

<spacyfishing.entity_fishing_linker.EntityFishing at 0x20a26b39c10>

In [16]:
# Collect the preprocessed (not yet annotated) text files.
corpus_path = base_dir + 'lit_histories_preprocessed'
corpus_dir = Path(corpus_path).glob('*.txt')
# glob() yields entries in arbitrary, OS-dependent order; sort for
# reproducible processing order.
files = sorted(corpus_dir)

In [17]:
# Output folder for the automatically annotated texts.
newpath = base_dir + today + '_txt_search&replace_model=trained'
# exist_ok avoids the check-then-create race and makes re-running safe.
os.makedirs(newpath, exist_ok=True)

#### Search & Replace Logic

In [None]:
# Annotate every preprocessed file with the fine-tuned, entity-linked
# pipeline. Each PER / WORK_OF_ART entity is replaced in the text by
#   <surface form; NORMALISED_NAME; wikidata id or noWikiID; PERSON|WORK_OF_ART>
# and additionally recorded as a one-element list holding a comma-joined
# string in `persons` or `works_of_art`.
persons = []
works_of_art = []
for file in tqdm(files):
    with open(file, encoding='utf-8') as f:
        text = f.read().split('\n')

    annotated_paragraphs = []
    for paragraph in text:
        doc = trained(paragraph)
        newString = paragraph
        # reversed to not modify the offsets of other entities when substituting
        for e in reversed(doc.ents):
            if e.label_ == 'PER':
                # Person records go to `persons` (they were previously
                # appended to `works_of_art`, leaving `persons` empty).
                label, collected = 'PERSON', persons
            elif e.label_ == 'WORK_OF_ART':
                label, collected = e.label_, works_of_art
            else:
                continue

            start = e.start_char
            end = start + len(e.text)
            # normal_term is None when the linker found no match; compared
            # via its string form as in the rest of this notebook.
            if str(e._.normal_term).upper() != 'NONE':
                normalised = str(e._.normal_term).upper().replace(' ', '_')
                # Commas are stripped only in the inline tag, not in the record.
                inline = '<' + e.text + '; ' + normalised.replace(',', '') + '; ' + str(e._.kb_qid) + '; ' + label + '>'
                collected.append([e.text + "," + normalised + "," + str(e._.kb_qid) + "," + label])
            else:
                fallback = e.text.upper().replace(' ', '_')
                inline = '<' + e.text + '; ' + fallback + '; noWikiID; ' + label + '>'
                collected.append([e.text + "," + fallback + "," + str(e._.kb_qid) + "," + label])
            newString = newString[:start] + inline + newString[end:]

        annotated_paragraphs.append(newString)

    # Every paragraph is preceded by '\n' (so the file starts with a newline),
    # matching the layout the later cells were run on.
    all_paragraphs = ''.join('\n' + p for p in annotated_paragraphs)

    # Path.name is the final path component, i.e. the bare file name.
    file_name = newpath + '/' + Path(file).name
    with open(file_name, 'w', encoding='utf-8') as out:
        out.write(all_paragraphs)

In [18]:
# Folder that will hold the combined (automatic + manual) corpus.
newpath = base_dir + today + '_final_corpus_GER'
# exist_ok avoids the check-then-create race and makes re-running safe.
os.makedirs(newpath, exist_ok=True)

In [19]:
# Collect the automatically annotated files written by the previous step.
corpus_path = base_dir + today + '_txt_search&replace_model=trained'
corpus_dir = Path(corpus_path).glob('*.txt')
# glob() order is arbitrary; sort for a reproducible processing order.
files = sorted(corpus_dir)

The newly annotated files are now copied into a new directory, together with the manually annotated files. Here, I also went through a first round of manual correction, replacing obvious mis-identifications.

In [23]:
# Copy the automatically annotated files into the final corpus folder.
for file in files:
    # Path.name is the final path component; this replaces a regex whose
    # unescaped '.' and greedy '.+' could cut file names at the wrong place.
    file_name = Path(file).name
    new_file_name = newpath + '/' + file_name
    shutil.copyfile(file, new_file_name)

In [24]:
# Collect the manually annotated files.
annotated = base_dir + 'lit_histories_annotated'
corpus_dir = Path(annotated).glob('*.txt')
# glob() order is arbitrary; sort for a reproducible processing order.
files = sorted(corpus_dir)

In [25]:
# Copy the manually annotated files into the final corpus folder as well.
for file in files:
    # Path.name is the final path component; this replaces a regex whose
    # unescaped '.' and greedy '.+' could cut file names at the wrong place.
    file_name = Path(file).name
    new_file_name = newpath + '/' + file_name
    shutil.copyfile(file, new_file_name)

In [192]:
# Collect all files of the combined corpus.
corpus_path = base_dir + today + '_final_corpus_GER'
corpus_dir = Path(corpus_path).glob('*.txt')
# glob() order is arbitrary; sorting fixes the order in which entities are
# first seen, and with it the order of name variations in the dictionaries.
files = sorted(corpus_dir)

Replace all "PER" tags with "PERSON".

To prepare a consolidation of name variations, all detected entities that have a wikiID are extracted from the text files---both automatically and manually annotated---and are stored in two dictionaries. 

In [193]:
# Collect all linked entities (those with a wikiID) from the corpus.
# Entry layout: wikiname -> [[surface variations], wikiID, tag, occurrence count]
dict_per = {}
dict_text = {}
collectors = {'PERSON': dict_per, 'WORK_OF_ART': dict_text}

# <surface form; WIKINAME; wikiID or noWikiID; tag>
# NOTE(review): '[A-Z]' only admits wikinames starting with an ASCII capital;
# names starting with an umlaut, digit or quote do not match and are skipped
# silently -- confirm this is intended.
entity_markup = re.compile('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>')

for file in tqdm(files):
    with open(file, encoding="utf-8") as f:
        text = f.read()
    for m in re.findall(r'<.+?>', text):
        if m.count(';') <= 1:
            continue
        parsed = entity_markup.fullmatch(m)
        if parsed is None:
            continue
        key, wikiname, wikiID, tag = parsed.groups()
        if wikiID == 'noWikiID':
            continue
        collector = collectors[tag]
        if wikiname not in collector:
            collector[wikiname] = [[key], wikiID, tag, 1]
            continue
        entry = collector[wikiname]
        if key not in entry[0]:
            entry[0].append(key)
        entry[3] += 1

  0%|          | 0/29 [00:00<?, ?it/s]

In [194]:
# Rebuild the dictionary with its keys in ascending order.
dict_per = {name: dict_per[name] for name in sorted(dict_per)}

In [195]:
# Rebuild the dictionary with its keys in ascending order.
dict_text = {name: dict_text[name] for name in sorted(dict_text)}

These dictionaries are now filtered: dict_per_filtered should only include entries that are linked to a person, dict_text_filtered only entries that are a written work, literary work, and so on. This helps to exclude nonsensical but common mis-identifications (city names, race horses, athletes, you name it).

In [196]:
# Keep only persons whose Wikidata item is a human and whose claims do not
# include one of the excluded occupations.
dict_per_filtered = {}

for key, values in tqdm(dict_per.items()):
    wikidata_id = values[1]
    if is_instance_of_human(wikidata_id) and not is_irrelevant_occupation(wikidata_id):
        dict_per_filtered[key] = values

  0%|          | 0/9039 [00:00<?, ?it/s]

In [72]:
# Keep only works whose Wikidata item is an instance of an accepted work class.
dict_text_filtered = {}

for key, values in tqdm(dict_text.items()):
    wikidata_id = values[1]
    if is_instance_of_literarywork(wikidata_id):
        dict_text_filtered[key] = values

  0%|          | 0/6662 [00:00<?, ?it/s]

In [199]:
# Persist the filtered dictionaries so the Wikidata lookups need not be repeated.
save_data(base_dir + today + '_GER_dict_entities_PER.json', dict_per_filtered)
save_data(base_dir + today + '_GER_dict_entities_WORK_OF_ART.json', dict_text_filtered)

In [25]:
#dict_per_filtered = load_data(base_dir + today + '_GER_dict_entities_PER.json')
#dict_text_filtered = load_data(base_dir + today + '_GER_dict_entities_WORK_OF_ART.json')

The dictionaries are now inverted, with the different name variations as keys and the wikinames as values. Variations that belong to more than one wikiname are ambiguous and are left out.

In [200]:
# Map each surface variation to its wikiname. A variation that occurs under
# more than one wikiname is ambiguous and must not be used for look-ups.
inverted_dict_per = {}
ambiguous_per = set()

for key, values in dict_per_filtered.items():
    for value in values[0]:
        if value in ambiguous_per:
            # Already known to be ambiguous. Without this guard a third
            # occurrence would re-insert the variation after it was removed.
            continue
        if value in inverted_dict_per:
            del inverted_dict_per[value]
            ambiguous_per.add(value)
        else:
            inverted_dict_per[value] = key

In [201]:
# Map each surface variation to its wikiname. A variation that occurs under
# more than one wikiname is ambiguous and must not be used for look-ups.
inverted_dict_text = {}
ambiguous_text = set()

for key, values in dict_text_filtered.items():
    for value in values[0]:
        if value in ambiguous_text:
            # Already known to be ambiguous. Without this guard a third
            # occurrence would re-insert the variation after it was removed.
            continue
        if value in inverted_dict_text:
            del inverted_dict_text[value]
            ambiguous_text.add(value)
        else:
            inverted_dict_text[value] = key

In [202]:
# Output folder for the consolidated corpus.
consolidated_dir = base_dir + today + '_final_corpus_GER_consolidated'
# exist_ok avoids the check-then-create race and makes re-running safe.
os.makedirs(consolidated_dir, exist_ok=True)

In the second iteration, all entities that have not been attributed a wikiID are compared to existing entries in both dictionaries and are, if applicable, updated to match the dictionary information. If an entity has been deleted from a dictionary (because, for example, is_instance_of_human yielded False), the entity is also compared to existing disambiguated entities or updated to a 'noWikiID' entry.

In [203]:
# <surface form; WIKINAME; wikiID or noWikiID; tag>
_ENTITY_MARKUP = re.compile('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>')


def _resolve_entity(key, wikiname, wikiID, filtered, inverted, fuzzy_cache):
    """Reconcile one annotated entity with the filtered dictionaries.

    Parameters
    ----------
    key, wikiname, wikiID : str
        Surface form, normalised name and ID (or 'noWikiID') from the tag.
    filtered : dict
        wikiname -> [[variations], wikiID, tag, count]
    inverted : dict
        unambiguous surface variation -> wikiname
    fuzzy_cache : dict
        Memo of fuzzy look-ups, keyed by surface form.

    Returns
    -------
    tuple
        The ``(wikiname, wikiID)`` to write back.
    """
    if wikiID == 'noWikiID':
        if wikiname in filtered:
            # Index 1 is the wikiID (index 2 is the tag).
            wikiID = filtered[wikiname][1]
        elif key in inverted:
            wikiname = inverted[key]
            wikiID = filtered[wikiname][1]
        else:
            # Fuzzy matching is expensive and surface forms repeat often,
            # so remember the result per surface form.
            if key not in fuzzy_cache:
                fuzzy_cache[key] = process.extract(key, inverted.keys())
            fuzzy_matches = fuzzy_cache[key]
            # An empty dictionary yields no candidates at all.
            if fuzzy_matches and fuzzy_matches[0][1] >= 95:
                wikiname = inverted[fuzzy_matches[0][0]]
                wikiID = filtered[wikiname][1]
    elif wikiname not in filtered:
        # The linked entity was filtered out: fall back to another known
        # entity with this surface form, or to an unlinked entry.
        if key in inverted:
            wikiname = inverted[key]
            wikiID = filtered[wikiname][1]
        else:
            wikiname = key.upper().replace(' ', '_').replace(',', '')
            wikiID = 'noWikiID'
    return wikiname, wikiID


# tag -> (filtered dictionary, inverted dictionary, fuzzy look-up cache)
_lookups = {
    'PERSON': (dict_per_filtered, inverted_dict_per, {}),
    'WORK_OF_ART': (dict_text_filtered, inverted_dict_text, {}),
}

for file in tqdm(files):
    with open(file, encoding="utf-8") as f:
        text = f.read().split("\n")

    consolidated_paragraphs = []
    for paragraph in text:
        newString = paragraph
        # Processed back to front so earlier offsets stay valid.
        for e in reversed(list(re.finditer('<.+?>', paragraph))):
            parsed = _ENTITY_MARKUP.fullmatch(e.group())
            if parsed is None:
                # Not a well-formed entity tag; leave the text untouched.
                continue
            key, wikiname, wikiID, tag = parsed.groups()
            filtered, inverted, fuzzy_cache = _lookups[tag]
            wikiname, wikiID = _resolve_entity(key, wikiname, wikiID, filtered, inverted, fuzzy_cache)
            newString = newString[0:e.start()] + '<' + key + '; ' + wikiname + '; ' + wikiID + '; ' + tag + '>' + newString[e.end():len(newString)]

        consolidated_paragraphs.append(newString)

    # Every paragraph is preceded by '\n', as in the files this step reads.
    newText = ''.join('\n' + p for p in consolidated_paragraphs)

    # Path.name is the final path component. The former greedy pattern
    # '.+GER.(.+.txt)' cut off file names that themselves contain "GER".
    file_name = consolidated_dir + '/' + Path(file).name
    with open(file_name, "w", encoding="utf-8") as out:
        out.write(newText)

  0%|          | 0/29 [00:00<?, ?it/s]

In [204]:
# Collect the consolidated files.
corpus_path = base_dir + today + '_final_corpus_GER_consolidated'
corpus_dir = Path(corpus_path).glob('*.txt')
# glob() order is arbitrary; sorting fixes the order in which entities are
# first seen, and with it the order of name variations in the dictionaries.
files = sorted(corpus_dir)

Based on the consolidated text files, two new dictionaries are compiled that now include all detected entities, with or without wikiIDs, for persons and literary texts, respectively. They are sorted alphabetically and then exported as JSON and CSV files.

In [205]:
# Collect all entities from the consolidated corpus, linked or not.
# Entry layout: wikiname -> [[surface variations], wikiID, tag, occurrence count]
dict_per = {}
dict_text = {}
collectors = {'PERSON': dict_per, 'WORK_OF_ART': dict_text}

# <surface form; WIKINAME; wikiID or noWikiID; tag>
# NOTE(review): '[A-Z]' only admits wikinames starting with an ASCII capital;
# names starting with an umlaut, digit or quote do not match and are skipped
# silently -- confirm this is intended.
entity_markup = re.compile('<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>')

for file in tqdm(files):
    with open(file, encoding="utf-8") as f:
        text = f.read()
    for m in re.findall(r'<.+?>', text):
        if m.count(';') <= 1:
            continue
        parsed = entity_markup.fullmatch(m)
        if parsed is None:
            continue
        key, wikiname, wikiID, tag = parsed.groups()
        collector = collectors[tag]
        if wikiname not in collector:
            collector[wikiname] = [[key], wikiID, tag, 1]
            continue
        entry = collector[wikiname]
        if key not in entry[0]:
            entry[0].append(key)
        entry[3] += 1

  0%|          | 0/29 [00:00<?, ?it/s]

In [206]:
# Rebuild the dictionary with its keys in ascending order.
dict_per = {name: dict_per[name] for name in sorted(dict_per)}

In [207]:
# Rebuild the dictionary with its keys in ascending order.
dict_text = {name: dict_text[name] for name in sorted(dict_text)}

In [208]:
# Persist the complete entity dictionaries (with and without wikiIDs).
save_data(base_dir + today + '_GER_dict_all_entities_PER.json', dict_per)
save_data(base_dir + today + '_GER_dict_all_entities_WORK_OF_ART.json', dict_text)

In [16]:
#dict_per = load_data(base_dir + today + '_GER_dict_all_entities_PER.json')
#dict_text = load_data(base_dir + today + '_GER_dict_all_entities_WORK_OF_ART.json')

In [210]:
# One row per work of art: the dictionary key becomes the 'wikiname' column
# and the four-element detail list is spread over the remaining columns.
titles = [*dict_text]
details = [*dict_text.values()]

df = pd.DataFrame(details, columns=['variations', 'wikiID', 'tag', 'count'])
df.insert(0, 'wikiname', titles)

In [103]:
# Export the work-of-art table for manual clustering (see the OpenRefine step below).
df.to_csv(base_dir + today + '_texts_df_tobeclustered.csv', encoding='utf8')

In [213]:
# One row per person: the dictionary key becomes the 'wikiname' column and
# the four-element detail list is spread over the remaining columns.
persons = [*dict_per]
details = [*dict_per.values()]

df = pd.DataFrame(details, columns=['variations', 'wikiID', 'tag', 'count'])
df.insert(0, 'wikiname', persons)

In [214]:
# Export the person table for manual clustering (see the OpenRefine step below).
df.to_csv(base_dir + today + '_persons_df_tobeclustered.csv', encoding='utf8')

### OpenRefine

OpenRefine: export as xlsx, then save as csv

In [216]:
# Re-import the two tables produced by the OpenRefine step; these files are
# read as semicolon-separated UTF-8.
csv_options = {'encoding': 'utf8', 'sep': ';'}
df_texts = pd.read_csv(base_dir + today + '-texts-df-clustered.csv', **csv_options)
df_persons = pd.read_csv(base_dir + today + '-persons-df-clustered.csv', **csv_options)

In [217]:
# The 'variations' column arrives as the textual form of a Python list;
# literal_eval turns each cell back into a real list.
for frame in (df_texts, df_persons):
    frame['variations'] = frame['variations'].apply(ast.literal_eval)

In [218]:
dict_text_clustered = {}

# Rows are read by position: 1 = wikiname, 2 = variations (list), 3 = wikiID,
# 4 = tag, 5 = count. Rows sharing a wikiname are merged into one entry of
# the form [variations, wikiID, tag, count].
for index, row in df_texts.iterrows():
    name = row.iloc[1]

    if name not in dict_text_clustered:
        dict_text_clustered[name] = [row.iloc[2], row.iloc[3], row.iloc[4], row.iloc[5]]
        continue

    entry = dict_text_clustered[name]
    entry[0] = entry[0] + row.iloc[2]  # append this row's variations
    if row.iloc[3] != "noWikiID":
        entry[1] = row.iloc[3]  # a real wikiID replaces whatever was stored
    entry[3] += row.iloc[5]  # add up the count

In [219]:
dict_persons_clustered = {}

# Rows are read by position: 1 = wikiname, 2 = variations (list), 3 = wikiID,
# 4 = tag, 5 = count. Rows sharing a wikiname are merged into one entry of
# the form [variations, wikiID, tag, count].
for index, row in df_persons.iterrows():
    name = row.iloc[1]

    if name not in dict_persons_clustered:
        dict_persons_clustered[name] = [row.iloc[2], row.iloc[3], row.iloc[4], row.iloc[5]]
        continue

    entry = dict_persons_clustered[name]
    entry[0] = entry[0] + row.iloc[2]  # append this row's variations
    if row.iloc[3] != "noWikiID":
        entry[1] = row.iloc[3]  # a real wikiID replaces whatever was stored
    entry[3] += row.iloc[5]  # add up the count

In [220]:
#save_data(base_dir + today + '_GER_dict_all_entities_WORK_OF_ART_final.json', dict_text_clustered)
#save_data(base_dir + today + '_GER_dict_all_entities_PERSON_final.json', dict_persons_clustered)

In [16]:
# Reload the final clustered dictionaries from JSON (wikiname -> [variations, wikiID, tag, count]).
dict_text_clustered = load_data(base_dir + today + '_GER_dict_all_entities_WORK_OF_ART_final.json')
dict_persons_clustered = load_data(base_dir + today + '_GER_dict_all_entities_PERSON_final.json')

In [17]:
# Reverse lookup: surface variation -> wikiname. When a variation is listed
# under several wikinames, the last one encountered wins.
dict_text_clustered_inverted = {
    variation: wikiname
    for wikiname, details in dict_text_clustered.items()
    for variation in details[0]
}

In [18]:
# Reverse lookup: surface variation -> wikiname. When a variation is listed
# under several wikinames, the last one encountered wins.
dict_persons_clustered_inverted = {
    variation: wikiname
    for wikiname, details in dict_persons_clustered.items()
    for variation in details[0]
}

In [19]:
# Output folder for the corpus with cluster-normalised entity tags.
clustered_dir = base_dir + today + '_final_corpus_GER_clustered'
# exist_ok=True: nothing happens if the folder is already there, without a
# separate existence check.
os.makedirs(clustered_dir, exist_ok=True)

In [228]:
co_citation_dict = {}

# Tag layout: <surface form; wikiname; wikiID or noWikiID; PERSON|WORK_OF_ART>.
# Compiled once and parsed in a single pass instead of four re.sub calls.
entity_tag_re = re.compile(
    r'<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>'
)
# Per tag type: (bucket in co_citation_dict, clustered dictionary,
# surface variation -> wikiname lookup).
tag_resources = {
    'PERSON': ('persons', dict_persons_clustered, dict_persons_clustered_inverted),
    'WORK_OF_ART': ('texts', dict_text_clustered, dict_text_clustered_inverted),
}

for file in tqdm(files):
    source_path = Path(file)
    # Path.stem / Path.name work with any path separator; the previous
    # backslash-based regex only matched Windows-style paths.
    file_short = source_path.stem
    co_citation_dict[file_short] = {'texts': {}, 'persons': {}}

    with open(source_path, encoding="utf-8") as f:
        paragraphs = f.read().split("\n")

    rewritten = []
    for paragraph in paragraphs:
        newString = paragraph
        # Work right-to-left so the offsets of earlier matches stay valid
        # while later parts of the string are being spliced.
        for e in reversed(list(re.finditer('<.+?>', paragraph))):
            parsed = entity_tag_re.fullmatch(e.group())
            if parsed is None:
                # not an entity tag: leave the span untouched
                continue
            key, wikiname, wikiID, tag = parsed.groups()
            bucket, clustered, inverted = tag_resources[tag]

            # NOTE(review): the count uses the wikiname exactly as written in
            # the tag, i.e. before the lookup below may rename it or drop the
            # tag. Confirm this is intended.
            counts = co_citation_dict[file_short][bucket]
            counts[wikiname] = counts.get(wikiname, 0) + 1

            if wikiname not in clustered:
                if key not in inverted:
                    # unknown entity: keep only the surface form
                    newString = newString[:e.start()] + key + newString[e.end():]
                    continue
                # known surface variation: switch to its clustered name and ID
                wikiname = inverted[key]
                wikiID = clustered[wikiname][1]

            new_tag = '<' + key + '; ' + str(wikiname) + '; ' + wikiID + '; ' + tag + '>'
            newString = newString[:e.start()] + new_tag + newString[e.end():]

        rewritten.append(newString)

    file_name = clustered_dir + '/' + source_path.name
    # Joining with '\n' reproduces the input line structure exactly, without
    # the extra leading newline that string accumulation used to add.
    with open(file_name, "w", encoding="utf-8") as out:
        out.write('\n'.join(rewritten))

  0%|          | 0/29 [00:00<?, ?it/s]

In [229]:
# Persist the per-file entity counts (file -> {'texts': {...}, 'persons': {...}}).
save_data(base_dir + today + '_co_citation_dict.json', co_citation_dict)

In [230]:
# From here on `files` lists the .txt files of the clustered corpus folder.
corpus_path = base_dir + today + '_final_corpus_GER_clustered'
corpus_dir = Path(corpus_path).glob('*.txt')
files = [*corpus_dir]

In [231]:
# Output folder for the corpus in which tags are replaced by entity ids.
replaced_dir = base_dir + today + '_final_corpus_GER_replaced'
# exist_ok=True: nothing happens if the folder is already there, without a
# separate existence check.
os.makedirs(replaced_dir, exist_ok=True)

In [232]:
# Tag layout: <surface form; wikiname; wikiID or noWikiID; PERSON|WORK_OF_ART>.
entity_tag_re = re.compile(
    r'<(.+?); ([A-Z].+?); (Q.+?|noWikiID); (PERSON|WORK_OF_ART)>'
)

# Replace every entity tag with the single token "<wikiname>_<wikiID>".
for file in tqdm(files):
    source_path = Path(file)
    with open(source_path, encoding="utf-8") as f:
        paragraphs = f.read().split("\n")

    rewritten = []
    for paragraph in paragraphs:
        newString = paragraph
        # Work right-to-left so the offsets of earlier matches stay valid.
        for e in reversed(list(re.finditer('<.+?>', paragraph))):
            parsed = entity_tag_re.fullmatch(e.group())
            if parsed is None:
                # Not an entity tag. Previously such spans were rewritten as
                # "<span>_<span>" because the failed substitutions returned
                # the whole span for both name and ID; now they stay as they are.
                # NOTE(review): tags whose wikiname does not start with A-Z
                # (e.g. an umlaut) also end up here. Confirm whether the
                # pattern should accept them.
                continue
            key, wikiname, wikiID, tag = parsed.groups()
            newString = newString[:e.start()] + wikiname + '_' + wikiID + newString[e.end():]
        rewritten.append(newString)

    # Path.name works with any path separator.
    file_name = replaced_dir + '/' + source_path.name
    # Joining with '\n' avoids adding an extra leading newline to the file.
    with open(file_name, "w", encoding="utf-8") as out:
        out.write('\n'.join(rewritten))

  0%|          | 0/29 [00:00<?, ?it/s]

### Word Embedding

In [19]:
# Entity ids have the same "<wikiname>_<wikiID>" form that replaces the tags
# in the corpus.
entities_per = [
    str(wikiname) + '_' + details[1]
    for wikiname, details in dict_persons_clustered.items()
]

entities_text = [
    str(wikiname) + '_' + details[1]
    for wikiname, details in dict_text_clustered.items()
]

In [20]:
len(entities_per)

15491

In [21]:
len(entities_text)

33779

In [22]:
# All entity ids of both types, as a set for fast membership tests.
entities = set(entities_per).union(entities_text)

In [237]:
# Read every text of the tag-replaced corpus into memory.
txt_pattern = os.path.join(os.getcwd(), replaced_dir, '*.txt')
fileList = glob.glob(txt_pattern)

lithist = []
for file_path in fileList:
    with open(file_path, encoding = 'utf8') as file:
        content = file.read()
    lithist.append(content)

In [238]:
# Punctuation characters used to filter tokens. The backslash is written as
# "\\" so the literal no longer relies on the invalid escape sequence "\]";
# the resulting string is unchanged.
escape = "!#$%&'()*+–, -./:;<=>?@[\\]^`{|}~’‘“”"

In [239]:
tokens = []

num_tokens = 0

# Digits and punctuation stripped from ordinary words. Written as a raw
# string (no invalid escape sequences) and compiled once instead of per token.
strip_chars_re = re.compile(r'[\d!#$%&\(\)*\+, -\./:;<=>?@\[\]^`\{\|\}~’‘“”]+')

# Build one token list per sentence; entity ids are kept verbatim, all other
# tokens are cleaned and lower-cased.
for hist in tqdm(lithist):
    doc = nlp(hist)
    for sent in doc.sents:
        tokens_in_sent = []
        for token in sent:
            word = token.text

            # Skip line breaks, tokens starting with a digit, and punctuation.
            # `word in escape` is a substring test because escape is a str.
            if word.startswith('\n') or word[0].isdigit() or word in escape:
                continue

            if word in entities:
                tokens_in_sent.append(word)
            else:
                # NOTE(review): a word made up only of stripped characters
                # becomes '' here and is still appended. Confirm whether empty
                # tokens should be dropped.
                tokens_in_sent.append(strip_chars_re.sub('', word).lower())

        tokens.append(tokens_in_sent)
        num_tokens += len(tokens_in_sent)

  0%|          | 0/29 [00:00<?, ?it/s]

In [240]:
num_tokens

3652548

In [241]:
#with open(path_results + '\\' + today + '_tokens_preprocessed.pickle', 'wb') as output_file:
#    pickle.dump(tokens, output_file)

In [23]:
# Reload the preprocessed token lists from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this workflow.
with open(path_results + '\\' + today + '_tokens_preprocessed.pickle',  'rb') as input_file:
    tokens = pickle.load(input_file)

In [24]:
import logging

# Log at INFO level so gensim's training progress is shown.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

In [25]:
# Word2Vec hyperparameters shared by all training runs below.
vector_size = 100
window = 10
min_count = 5
epochs = 5
sg = 1  # 1 selects skip-gram (the output file names say "skipgram")

In [26]:
# Number of models trained per ensemble (used as the loop bound and in folder names).
iterations = 100

In [32]:
# Train a single model on the sentence token lists.
model = Word2Vec(
    sentences=tokens,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    sg=sg,
    epochs=epochs,
)

2024-10-18 13:28:21,167 : INFO : collecting all words and their counts
2024-10-18 13:28:21,168 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-18 13:28:21,247 : INFO : PROGRESS: at sentence #10000, processed 277781 words, keeping 38708 word types
2024-10-18 13:28:21,322 : INFO : PROGRESS: at sentence #20000, processed 573185 words, keeping 63626 word types
2024-10-18 13:28:21,396 : INFO : PROGRESS: at sentence #30000, processed 856107 words, keeping 86692 word types
2024-10-18 13:28:21,481 : INFO : PROGRESS: at sentence #40000, processed 1117523 words, keeping 104095 word types
2024-10-18 13:28:21,574 : INFO : PROGRESS: at sentence #50000, processed 1404857 words, keeping 119278 word types
2024-10-18 13:28:21,642 : INFO : PROGRESS: at sentence #60000, processed 1682965 words, keeping 133037 word types
2024-10-18 13:28:21,706 : INFO : PROGRESS: at sentence #70000, processed 1911646 words, keeping 148099 word types
2024-10-18 13:28:21,773 : INFO : PROGR

In [31]:
# Folder that receives the vectors of ensemble A.
we_results = path_results + '\\modelA_iteration=' + str(iterations)
# exist_ok=True: nothing happens if the folder is already there, without a
# separate existence check.
os.makedirs(we_results, exist_ok=True)

In [247]:
# Train `iterations` models with identical settings and store each model's
# word vectors in the ensemble A folder.
models = []
output_dir = path_results + '\\modelA_iteration=' + str(iterations)
file_suffix = '_window=' + str(window) + '_skipgram_defaultparams_model_vectors.kv'

for run in range(1, iterations + 1):
    model = Word2Vec(
        tokens,
        min_count=min_count,
        vector_size=vector_size,
        window=window,
        sg=sg,
        epochs=epochs,
    )
    model.wv.save(output_dir + '\\' + today + '_' + str(run) + file_suffix)
    models.append(model)

2024-10-02 17:40:14,024 : INFO : collecting all words and their counts
2024-10-02 17:40:14,026 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-02 17:40:14,144 : INFO : PROGRESS: at sentence #10000, processed 277781 words, keeping 38708 word types
2024-10-02 17:40:14,269 : INFO : PROGRESS: at sentence #20000, processed 573185 words, keeping 63626 word types
2024-10-02 17:40:14,393 : INFO : PROGRESS: at sentence #30000, processed 856107 words, keeping 86692 word types
2024-10-02 17:40:14,508 : INFO : PROGRESS: at sentence #40000, processed 1117523 words, keeping 104095 word types
2024-10-02 17:40:14,636 : INFO : PROGRESS: at sentence #50000, processed 1404857 words, keeping 119278 word types
2024-10-02 17:40:14,750 : INFO : PROGRESS: at sentence #60000, processed 1682965 words, keeping 133037 word types
2024-10-02 17:40:14,860 : INFO : PROGRESS: at sentence #70000, processed 1911646 words, keeping 148099 word types
2024-10-02 17:40:14,986 : INFO : PROGR

In [248]:
# Folder that receives the vectors of ensemble B.
we_results = path_results + '\\modelB_iteration=' + str(iterations)
# exist_ok=True: nothing happens if the folder is already there, without a
# separate existence check.
os.makedirs(we_results, exist_ok=True)

In [249]:
# Train `iterations` models with identical settings and store each model's
# word vectors in the ensemble B folder.
models = []
output_dir = path_results + '\\modelB_iteration=' + str(iterations)
file_suffix = '_window=' + str(window) + '_skipgram_defaultparams_model_vectors.kv'

for run in range(1, iterations + 1):
    model = Word2Vec(
        tokens,
        min_count=min_count,
        vector_size=vector_size,
        window=window,
        sg=sg,
        epochs=epochs,
    )
    model.wv.save(output_dir + '\\' + today + '_' + str(run) + file_suffix)
    models.append(model)

2024-10-02 20:12:15,168 : INFO : collecting all words and their counts
2024-10-02 20:12:15,168 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-02 20:12:15,275 : INFO : PROGRESS: at sentence #10000, processed 277781 words, keeping 38708 word types
2024-10-02 20:12:15,383 : INFO : PROGRESS: at sentence #20000, processed 573185 words, keeping 63626 word types
2024-10-02 20:12:15,508 : INFO : PROGRESS: at sentence #30000, processed 856107 words, keeping 86692 word types
2024-10-02 20:12:15,616 : INFO : PROGRESS: at sentence #40000, processed 1117523 words, keeping 104095 word types
2024-10-02 20:12:15,721 : INFO : PROGRESS: at sentence #50000, processed 1404857 words, keeping 119278 word types
2024-10-02 20:12:15,832 : INFO : PROGRESS: at sentence #60000, processed 1682965 words, keeping 133037 word types
2024-10-02 20:12:15,929 : INFO : PROGRESS: at sentence #70000, processed 1911646 words, keeping 148099 word types
2024-10-02 20:12:16,034 : INFO : PROGR

### Networks

#### Network A: All entities

In [27]:
# From here on `files` lists the saved .kv vector files of ensemble A.
model_dir = Path(path_results + '\\modelA_iteration=' + str(iterations)).glob('*.kv')
files = [*model_dir]

In [28]:
# Load the stored word vectors of every run.
models = [gensim.models.KeyedVectors.load(str(vector_file)) for vector_file in files]

2025-01-30 17:08:10,185 : INFO : loading KeyedVectors object from C:\Users\Brottrager\Documents\Diss\sec_lit\GER\20240919_wordembeddings\modelA_iteration=100\20240919_100_window=10_skipgram_defaultparams_model_vectors.kv
2025-01-30 17:08:10,302 : INFO : KeyedVectors lifecycle event {'fname': 'C:\\Users\\Brottrager\\Documents\\Diss\\sec_lit\\GER\\20240919_wordembeddings\\modelA_iteration=100\\20240919_100_window=10_skipgram_defaultparams_model_vectors.kv', 'datetime': '2025-01-30T17:08:10.302978', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'loaded'}
2025-01-30 17:08:10,304 : INFO : loading KeyedVectors object from C:\Users\Brottrager\Documents\Diss\sec_lit\GER\20240919_wordembeddings\modelA_iteration=100\20240919_10_window=10_skipgram_defaultparams_model_vectors.kv
2025-01-30 17:08:10,391 : INFO : KeyedVectors lifecycle event {'fname': 'C:\\Users\\Brottr

In [29]:
# Collect, for every entity id, its vector from each model that contains it.
# `entities` is a set of strings, whose iteration order can differ between
# Python sessions; iterating in sorted order makes the key order of `vectors`
# (and of everything derived from it) reproducible.
vectors = {}

for key in sorted(entities):
    found = [model[key] for model in models if key in model.key_to_index]
    if found:
        vectors[key] = found

In [30]:
vectors_mean = {}
idsinEmbedding = []
nodes = {}

# Sets give constant-time membership tests; the lists hold several thousand ids.
per_lookup = set(entities_per)
text_lookup = set(entities_text)

# Average each entity's vectors over the models and record its type label.
for key, stacked in vectors.items():
    vectors_mean[key] = np.average(np.array(stacked, dtype='float32'), axis=0)
    idsinEmbedding.append(key)
    # an id present in both lists is labelled PERSON
    if key in per_lookup:
        nodes[key] = {'label': 'PERSON'}
    elif key in text_lookup:
        nodes[key] = {'label': 'WORK_OF_ART'}

In [39]:
# Store the entity ids in the order in which their mean vectors were computed.
save_data(base_dir + today + '_idsinEmbedding.json', idsinEmbedding)

In [31]:
# NOTE(review): this replaces the in-memory id list with the stored one; the ids are later
# used as row/column labels, so their order must match the order of the vectors being compared.
idsinEmbedding = load_data(base_dir + today + '_idsinEmbedding.json')

In [32]:
len(idsinEmbedding)

3054

In [33]:
# Build the matrix rows in the order of idsinEmbedding, because that list
# becomes the row/column labels of the similarity table. Relying on the
# insertion order of vectors_mean can mislabel rows when idsinEmbedding was
# reloaded from disk. An id without a mean vector raises KeyError.
vectors_ls = [vectors_mean[key] for key in idsinEmbedding]

In [34]:
# Pairwise cosine similarity between all mean entity vectors.
similarities = cosine_similarity(vectors_ls)

In [35]:
# Wrap the similarity matrix in a DataFrame so it can be labelled and used as an adjacency table.
data = pd.DataFrame(similarities)

In [36]:
# Set each entity's similarity to itself to 0. Only the n diagonal cells are
# visited instead of testing all n*n index pairs.
for i in range(len(data)):
    data.iat[i, i] = 0

In [37]:
# Label rows and columns with the entity ids.
data.index = idsinEmbedding
data.columns = idsinEmbedding

Minimum Spanning Tree 

In [48]:
# Build a graph from the labelled similarity table (cell values become edge weights).
G = nx.from_pandas_adjacency(data)

In [49]:
# Attach the PERSON / WORK_OF_ART label stored in `nodes` to the graph nodes.
nx.set_node_attributes(G, nodes)

In [50]:
# Spanning-tree algorithm passed to networkx in the cells below.
algorithm = 'kruskal'

In [51]:
# A minimum spanning tree keeps the lightest edges, so similarities are first
# turned into distances (1 - similarity). The loop variable is called `attrs`
# so it does not overwrite the DataFrame `data`.
for u, v, attrs in G.edges(data=True):
    attrs['weight'] = 1 - attrs['weight'] # similarity vs minimum weight

mst = nx.minimum_spanning_tree(G, algorithm=algorithm)

# Convert the tree's weights back to similarities; this loop only visits the
# edges of `mst`.
for u, v, attrs in mst.edges(data=True):
    attrs['weight'] = 1 - attrs['weight']

In [52]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_word_similarity_edges_highsimilarity.csv"

# Write the spanning tree as an edge list (Source, Target, Weight).
# edges(data='weight') yields (u, v, weight) triples directly, so no loop
# variable named `data` overwrites the DataFrame of that name.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    writer.writerows(mst.edges(data='weight'))

#### Network: PER

In [53]:
# Keep only the person entities, preserving the order of vectors_mean.
# A set gives constant-time membership tests instead of scanning the list.
per_lookup = set(entities_per)

idsinEmbedding_per = [key for key in vectors_mean if key in per_lookup]
vectors_per_ls = [vectors_mean[key] for key in idsinEmbedding_per]

In [54]:
len(idsinEmbedding_per)

2110

In [55]:
# Pairwise cosine similarity between the mean vectors of all person entities.
similarities_per = cosine_similarity(vectors_per_ls)

In [56]:
# Store the person ids in the same order as the rows of the similarity matrix.
save_data(base_dir + today + '_idsinEmbedding_PER.json', idsinEmbedding_per)

In [57]:
# Wrap the person similarity matrix in a DataFrame so it can be labelled.
data_per = pd.DataFrame(similarities_per)

In [58]:
# Set each entity's similarity to itself to 0. Only the n diagonal cells are
# visited instead of testing all n*n index pairs.
for i in range(len(data_per)):
    data_per.iat[i, i] = 0

In [59]:
# Label rows and columns with the person ids.
data_per.index = idsinEmbedding_per
data_per.columns = idsinEmbedding_per

Minimum Spanning Tree

In [60]:
# Build the person-only graph from the labelled similarity table.
G_per = nx.from_pandas_adjacency(data_per)

In [61]:
# A minimum spanning tree keeps the lightest edges, so similarities are first
# turned into distances (1 - similarity). The loop variable is called `attrs`
# so it does not overwrite the DataFrame `data`.
for u, v, attrs in G_per.edges(data=True):
    attrs['weight'] = 1 - attrs['weight'] # similarity vs minimum weight

mst_per = nx.minimum_spanning_tree(G_per, algorithm=algorithm)

# Convert the tree's weights back to similarities; this loop only visits the
# edges of `mst_per`.
for u, v, attrs in mst_per.edges(data=True):
    attrs['weight'] = 1 - attrs['weight']

In [62]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity_PER.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_word_similarity_edges_highsimilarity_PER.csv"

# Write the person spanning tree as an edge list (Source, Target, Weight).
# edges(data='weight') yields (u, v, weight) triples directly, so no loop
# variable named `data` overwrites the DataFrame of that name.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    writer.writerows(mst_per.edges(data='weight'))

#### Network: WORK_OF_ART

In [38]:
# Keep only the work-of-art entities, preserving the order of vectors_mean.
# A set gives constant-time membership tests instead of scanning the list.
text_lookup = set(entities_text)

idsinEmbedding_text = [key for key in vectors_mean if key in text_lookup]
vectors_text_ls = [vectors_mean[key] for key in idsinEmbedding_text]

In [39]:
len(idsinEmbedding_text)

1264

In [65]:
# Store the work-of-art ids in the order in which they were collected.
save_data(base_dir + today + '_idsinEmbedding_TEXT.json', idsinEmbedding_text)

In [40]:
# Pairwise cosine similarity between the mean vectors of all work-of-art entities.
similarities_text = cosine_similarity(vectors_text_ls)

In [41]:
# Wrap the work-of-art similarity matrix in a DataFrame so it can be labelled.
data_texts = pd.DataFrame(similarities_text)

In [42]:
# Set each entity's similarity to itself to 0. Only the n diagonal cells are
# visited instead of testing all n*n index pairs.
for i in range(len(data_texts)):
    data_texts.iat[i, i] = 0

In [43]:
# Label rows and columns with the work-of-art ids.
data_texts.index = idsinEmbedding_text
data_texts.columns = idsinEmbedding_text

In [44]:
# Export the labelled similarity table.
# NOTE(review): there is no separator between the window value and 'texts' in the file name.
data_texts.to_csv(path_results + '\\' + today + '_window=' + str(window) + 'texts_wordembedding_adjacency.csv', encoding='utf8')

Minimum Spanning Tree

In [40]:
# Build the work-of-art graph from the labelled similarity table.
G_texts = nx.from_pandas_adjacency(data_texts)

In [71]:
# A minimum spanning tree keeps the lightest edges, so similarities are first
# turned into distances (1 - similarity). The loop variable is called `attrs`
# so it does not overwrite the DataFrame `data`.
for u, v, attrs in G_texts.edges(data=True):
    attrs['weight'] = 1 - attrs['weight'] # similarity vs minimum weight

mst_texts = nx.minimum_spanning_tree(G_texts, algorithm=algorithm)

# Convert the tree's weights back to similarities; this loop only visits the
# edges of `mst_texts`.
for u, v, attrs in mst_texts.edges(data=True):
    attrs['weight'] = 1 - attrs['weight']

In [72]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity_WORK_OF_ART.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_word_similarity_edges_highsimilarity_WORK_OF_ART.csv"

# Write the work-of-art spanning tree as an edge list (Source, Target, Weight).
# edges(data='weight') yields (u, v, weight) triples directly, so no loop
# variable named `data` overwrites the DataFrame of that name.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    writer.writerows(mst_texts.edges(data='weight'))

In [41]:
#filename = path_results + '\\' + today + '_min_count' + str(min_count) + '_vector_size' + str(vector_size) + '_window' + str(window) + '_epochs' + str(epochs) + '_' + str(algorithm) + "_word_similarity_edges_highsimilarity_WORK_OF_ART.csv"
filename = path_results + '\\' + today + '_window=' + str(window) + "_defaultparams_skipgram_mean5_all_WORK_OF_ART.csv"

# Write every edge of the full work-of-art graph (Source, Target, Weight).
# NOTE(review): the spanning-tree cell rewrites the weights of G_texts to
# 1 - similarity and does not convert them back on G_texts itself, so this
# file presumably holds similarities only if that cell has not been run on
# the current G_texts. Confirm before relying on it.
# edges(data='weight') yields (u, v, weight) triples directly, so no loop
# variable named `data` overwrites the DataFrame of that name.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Source', 'Target', 'Weight'])
    writer.writerows(G_texts.edges(data='weight'))