# Disambiguating German Repositories

--- Last edited: 2024-09-27 ---

In [1]:
import collections
from datetime import date
import glob
import json
import os
from pathlib import Path
import pickle
import random
import requests
import re
import string
import sys
import time
from tqdm import tqdm
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import qwikidata
from qwikidata.sparql  import return_sparql_query_results

from urllib.parse import urlencode

from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from lxml import etree

from thefuzz import fuzz
from thefuzz import process

In [2]:
def load_data(file):
    """Read the UTF-8 encoded JSON file at ``file`` and return the parsed object."""
    with open(file, mode="r", encoding="utf-8") as source:
        return json.load(source)

def save_data(file, data):
    """Serialise ``data`` as JSON (4-space indent) into the UTF-8 file ``file``.

    An existing file at that path is overwritten.
    """
    with open(file, mode="w", encoding="utf-8") as target:
        json.dump(data, target, indent=4)

In [3]:
def query_entity_fishing(text, language='de', timeout=60):
    """Send ``text`` to the Entity Fishing disambiguation service.

    The request goes to the module-level ``api_url`` as a JSON body
    containing the text and its language code.

    Parameters
    ----------
    text : str
        The text (here: a work title) to disambiguate.
    language : str, optional
        Language code passed to the service (default ``'de'``).
    timeout : float or tuple, optional
        Seconds to wait for the service before giving up (default 60).
        Without a timeout a single stalled request would block the whole
        batch loop indefinitely.

    Returns
    -------
    dict
        The parsed JSON response, or an empty dict if the request failed,
        timed out, or the response was not valid JSON. Failures are printed,
        not raised, so that callers can keep iterating.
    """
    headers = {
        'Content-Type': 'application/json'
    }
    data = {
        "text": text,
        "language": language  # Specify language if needed
    }
    try:
        # Send a POST request to the Entity Fishing API
        response = requests.post(
            api_url, headers=headers, data=json.dumps(data), timeout=timeout
        )
        # Raise an error for bad HTTP status codes
        response.raise_for_status()
        # Parse the JSON response
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Handle HTTP errors
        return {}
    except requests.exceptions.RequestException as req_err:
        # Also covers connection errors and timeouts
        print(f"Request error: {req_err}")  # Handle other request errors
        return {}
    except ValueError as json_err:
        print(f"JSON decoding error: {json_err}")  # Handle JSON decoding errors
        return {}

In [4]:
def get_wikidata_label(wikidataid, language='de', timeout=30):
    """Fetch the label of a Wikidata entity in one language.

    Uses the ``wbgetentities`` action of the Wikidata API, restricted to
    labels in ``language``.

    Parameters
    ----------
    wikidataid : str
        Entity ID, e.g. ``'Q12345'``.
    language : str, optional
        Language code of the label to return (default ``'de'``).
    timeout : float or tuple, optional
        Seconds to wait for the API before giving up (default 30), so that a
        stalled connection cannot block a batch loop indefinitely.

    Returns
    -------
    str or None
        The label, or ``None`` if the entity has no label in that language
        or the request/JSON decoding failed (failures are printed).
    """
    # Define the Wikidata API URL
    url = "https://www.wikidata.org/w/api.php"

    # Set the parameters for the API request
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidataid,
        "props": "labels",
        "languages": language
    }

    try:
        # Send the request to the Wikidata API
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()

        # Parse the JSON response
        data = response.json()

        # Extract the label; every level falls back to an empty mapping
        entity = data.get("entities", {}).get(wikidataid, {})
        labels = entity.get("labels", {})
        label = labels.get(language, {}).get("value", None)

        return label
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Handle HTTP errors
        return None
    except requests.exceptions.RequestException as req_err:
        # Also covers connection errors and timeouts
        print(f"Request error: {req_err}")  # Handle other request errors
        return None
    except ValueError as json_err:
        print(f"JSON decoding error: {json_err}")  # Handle JSON decoding errors
        return None

In [5]:
def is_instance_of_literarywork(wikidata_id):
    """Tell whether a Wikidata entity is declared as some kind of written/art work.

    Downloads the entity's JSON from Special:EntityData and inspects its
    "instance of" (P31) claims. Returns ``True`` as soon as one claim points
    to an accepted class, ``False`` otherwise (including any non-200 response).
    """
    response = requests.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    )
    if response.status_code != 200:
        return False

    accepted_classes = (
        'Q7725634',    # literary work
        'Q47461344',   # written work
        'Q838948',     # work of art
        'Q116476516',  # dramatic work
    )

    entity = response.json()['entities'].get(wikidata_id, {})
    instance_claims = entity.get('claims', {}).get('P31', [])

    for claim in instance_claims:
        # Missing levels fall back to empty dicts, so the ID ends up as None
        target = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
        if target.get('id') in accepted_classes:
            return True

    return False

In [6]:
def get_publication_date(wikidata_id):
    """Return the publication year of a Wikidata entity as a string.

    Downloads the entity's JSON from Special:EntityData and reads the first
    "publication date" (P577) claim.

    Parameters
    ----------
    wikidata_id : str
        Entity ID, e.g. ``'Q12345'``.

    Returns
    -------
    str or None
        The year digits as they appear in the timestamp (e.g. ``'1808'``).
        Years before the common era keep their minus sign (e.g. ``'-0400'``).
        ``None`` if the request did not return HTTP 200, the entity has no
        P577 claim, or the claim carries no time value.
    """
    # URL to fetch the entity data
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        return None

    data = response.json()

    # Extract the entity data
    entity_data = data['entities'].get(wikidata_id, {})

    # Extract claims (properties)
    claims = entity_data.get('claims', {})

    # Check for "publication date" property (P577)
    if 'P577' in claims:
        # Get the first publication date
        publication_date_claim = claims['P577'][0]
        mainsnak = publication_date_claim.get('mainsnak', {})
        datavalue = mainsnak.get('datavalue', {})
        value = datavalue.get('value', {})

        # Extract the time value
        publication_date = value.get('time')
        if publication_date:
            # Timestamps look like +YYYY-MM-DDT00:00:00Z, or -YYYY-... for BCE.
            # Strip the sign before splitting on '-': splitting a BCE value
            # directly would yield an empty first field.
            sign = '-' if publication_date.startswith('-') else ''
            year = sign + publication_date.lstrip('+-').split('-')[0]
            return year

    return None

In [7]:
def get_genres_and_forms_of_creative_work(wikidata_id):
    """Collect genre and form-of-creative-work labels for a Wikidata entity.

    Downloads the entity's JSON from Special:EntityData, then resolves every
    "genre" (P136) and "form of creative work" (P7937) target to a label via
    ``get_label``. Targets without an ID or without a label are skipped.

    Returns
    -------
    tuple
        ``(genres, forms_of_creative_work)`` as two lists of label strings,
        or ``(None, None)`` when the request did not return HTTP 200.
    """
    response = requests.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    )
    if response.status_code != 200:
        return None, None

    claims = response.json()['entities'].get(wikidata_id, {}).get('claims', {})

    def labels_for(property_id):
        # Resolve each claim target of one property to its label, in claim order
        found = []
        for claim in claims.get(property_id, []):
            target = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
            target_id = target.get('id')
            if not target_id:
                continue
            target_label = get_label(target_id)
            if target_label:
                found.append(target_label)
        return found

    genres = labels_for('P136')                   # genre
    forms_of_creative_work = labels_for('P7937')  # form of creative work
    return genres, forms_of_creative_work

In [8]:
def get_label(entity_id):
    """Return a human-readable label for a Wikidata entity.

    Downloads the entity's JSON from Special:EntityData. The English label is
    preferred; if there is none, the first label in the response is used.
    Returns ``None`` on a non-200 response or when the entity has no labels.
    """
    response = requests.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
    )
    if response.status_code != 200:
        return None

    labels = response.json()['entities'].get(entity_id, {}).get('labels', {})

    if 'en' in labels:
        return labels['en']['value']

    # No English label: fall back to whichever label comes first
    for fallback in labels.values():
        return fallback['value']

    return None

### Disambiguation

In [9]:
# Endpoint of the Entity Fishing disambiguation service; read as a global by query_entity_fishing().
api_url = "http://localhost:8090/service/disambiguate"

In [10]:
# Date stamp (YYYYMMDD) embedded in the output file names below.
# It is pinned to a fixed run date; the commented line derives it from the current date instead.
#today = re.sub('-', '', str(date.today()))
today = "20240926"

In [11]:
# Load the dictionary of detected WORK_OF_ART entities.
# Later cells index each value as [0] = title variations, [1] = wikiID, [3] = frequency; keys are used as "wikiname".
dict_text = load_data('C:/Users/Brottrager/Documents/Diss/sec_lit/GER/20240919_GER_dict_all_entities_WORK_OF_ART.json')

In [12]:
# Invert dict_text: map every title variation back to the entry it belongs to.
# NOTE(review): the first entry for a variation is stored as a flat list
# [wikiname, values[1], values[2]]; further entries for the same variation are
# appended to that flat list as nested lists. Later cells rely on [0] being the
# first wikiname, so the mixed shape is kept as is.
inverted_dict_text = {}

for wikiname, entry in dict_text.items():
    for variation in entry[0]:
        record = [wikiname, entry[1], entry[2]]
        if variation not in inverted_dict_text:
            inverted_dict_text[variation] = record
        else:
            inverted_dict_text[variation].append(record)

#### Textgrid

In [13]:
# Collect all XML files of the downloaded Textgrid "Digitale Bibliothek" dump.
textgrid_path = 'C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/Digitale-Bibliothek-Literatur'
textgrid_dir = Path(textgrid_path).glob('*.xml')
# glob() returns a one-shot generator; materialise it so the list can be reused.
files = list(textgrid_dir)

In [14]:
# Output directory for the individual texts split out of the Textgrid files.
textgrid_texts_path = 'C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/' + today + '_individual_texts'
# exist_ok makes the cell safely re-runnable without a separate existence check.
os.makedirs(textgrid_texts_path, exist_ok=True)

In [15]:
# Prefix-to-URI mapping passed to every XPath query on the TEI documents below.
namespaces = {
    'tei': 'http://www.tei-c.org/ns/1.0'
}

In [17]:
# Split every Textgrid author file into one XML file per text and record
# title -> [author, path of the written file] in textgrid_dict.
textgrid_dict = {}

for xml_path in tqdm(files):
    tree = etree.parse(xml_path)
    author = tree.xpath('//tei:author', namespaces=namespaces)
    # Without an author element (or with an empty one) no file name can be built
    if not author or author[0].text is None:
        continue
    author_name = author[0].text.strip()

    # "Surname, Forename" -> "Surname_Forename" for use in file names
    author_content = re.sub(',', '_', author_name)
    author_content = re.sub(' ', '-', author_content)
    author_content = re.sub('_-', '_', author_content)

    for text in tree.xpath('//tei:text', namespaces=namespaces):
        title = text.xpath('.//tei:head[@type="h2"]', namespaces=namespaces)
        # Only texts with an h2 heading that carries direct text are exported
        if not title or title[0].text is None:
            continue
        title_text = title[0].text.strip()

        # Drop everything except word characters, whitespace and hyphens
        title_content = re.sub(r'[^\w\s-]', '', title_text).replace(' ', '-')
        filename = f'{textgrid_texts_path}/{author_content}_{title_content}.xml'

        new_root = etree.Element("text")
        new_head = etree.SubElement(new_root, "head")
        new_head.text = title_text

        # Every descendant element becomes a <p> holding that element's own text
        for paragraph in text.xpath('.//*', namespaces=namespaces):
            new_paragraph = etree.SubElement(new_root, "p")
            new_paragraph.text = paragraph.text

        # Separate handle name so the loop variable is not shadowed
        with open(filename, 'w', encoding='utf-8') as out_file:
            out_file.write(etree.tostring(new_root, encoding='unicode', pretty_print=True))

        # Texts sharing a title overwrite each other here (last one wins)
        textgrid_dict[title_text] = [author_name, filename]

  0%|          | 0/691 [00:00<?, ?it/s]

In [None]:
# Link every Textgrid title to Wikidata via Entity Fishing.
# Result: title -> [WIKINAME, wikiID, path of the individual text file],
# kept only for entities that Wikidata classifies as a literary/art work.
wikified_textgrid = {}
for title in tqdm(textgrid_dict.keys()):
    response = query_entity_fishing(title)
    if 'entities' not in response or not response['entities']:
        continue

    # Get the first disambiguated entity
    first_entity = response['entities'][0]
    if 'wikidataId' not in first_entity.keys():
        continue
    wikiID = first_entity['wikidataId']

    # Filter first: the label request is only needed for entities that are kept
    if not is_instance_of_literarywork(wikiID):
        continue

    wikiname = get_wikidata_label(wikiID)
    if wikiname is None:
        # No German label available: fall back to the repository title
        wikiname = title
    wikiname = re.sub(' ', '_', wikiname.upper())
    wikified_textgrid[title] = [wikiname, wikiID, textgrid_dict[title][1]]

In [18]:
# The wikification result is not recomputed here: the save is commented out and the file written on 20240926 is loaded instead.
#save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/' + today + '_wikified_textgrid.json', wikified_textgrid)
wikified_textgrid = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/20240926_wikified_textgrid.json')

In [19]:
# Group the wikified Textgrid titles by Wikidata ID:
# wikiID -> [[repository titles], WIKINAME, [file paths]]
wikified_textgrid_inverted = {}

for title, values in wikified_textgrid.items():
    wikiname, wikiID, repo_ref = values[0], values[1], values[2]
    if wikiID in wikified_textgrid_inverted:
        wikified_textgrid_inverted[wikiID][0].append(title)
        # The path belongs in the third slot, not in the list of titles
        wikified_textgrid_inverted[wikiID][2].append(repo_ref)
    else:
        wikified_textgrid_inverted[wikiID] = [[title], wikiname, [repo_ref]]

In [20]:
# Fuzzy-match every title variation from the secondary literature against the
# Textgrid titles; keep the best candidate when its score is at least 95.
# Result: variation -> [best matching Textgrid title, score]
fuzzy_matched_textgrid = {}

for variation, _ in tqdm(inverted_dict_text.items()):
    if variation in wikified_textgrid:
        continue
    candidates = process.extract(variation, textgrid_dict.keys())
    best_title, best_score = candidates[0][0], candidates[0][1]
    if best_score >= 95:
        fuzzy_matched_textgrid[variation] = [best_title, best_score]

  0%|          | 0/35973 [00:00<?, ?it/s]

In [21]:
# Persist the fuzzy matches; the commented line reloads an earlier run instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/' + today + '_fuzzy_matched_textgrid.json', fuzzy_matched_textgrid)
#fuzzy_matched_textgrid = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/20240821_fuzzy_matched_textgrid.json')

In [22]:
# Re-key the fuzzy matches by wikiname and attach the entity information from
# dict_text. Only the first matching variation per wikiname is kept.
fuzzy_matched_textgrid_wikiname = {}

for variation, match in tqdm(fuzzy_matched_textgrid.items()):
    wikiname = inverted_dict_text[variation][0]
    if wikiname in fuzzy_matched_textgrid_wikiname:
        continue
    entity = dict_text[wikiname]
    repo_title, ratio = match[0], match[1]
    fuzzy_matched_textgrid_wikiname[wikiname] = {
        'variations': entity[0],
        'wikiID': entity[1],
        'freq': entity[3],
        'repo_title': repo_title,
        'repoID': textgrid_dict[repo_title][1],  # path of the individual text file
        'fuzzy_ratio': ratio,
    }

  0%|          | 0/1140 [00:00<?, ?it/s]

In [23]:
# Persist the wikiname-keyed Textgrid matches; the commented line reloads them instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/' + today + '_fuzzy_matched_textgrid_wikiname_final.json', fuzzy_matched_textgrid_wikiname)
#fuzzy_matched_textgrid_wikiname = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/textgrid/' + today + '_fuzzy_matched_textgrid_wikiname_final.json')

#### Gutenberg-DE

In [24]:
# Page of Projekt Gutenberg-DE that the title list is scraped from.
all_titles_html = 'https://www.projekt-gutenberg.org/info/texte/allworka.html'

In [25]:
# Download the title index and parse it; the HTTP status is not checked here.
page = requests.get(all_titles_html)
soup = BeautifulSoup(page.content, 'html.parser')

In [26]:
# Build title -> link from the <dd> entries that follow the first <dt> of the
# index page. Tabs and carriage returns are stripped from the titles; entries
# without a link are skipped.
gutenberg_dict = {}

first_dt = soup.find('dt')
for entry in first_dt.find_next_siblings('dd'):
    anchor = entry.find('a')
    if not anchor:
        continue
    cleaned_title = re.sub('[\t\r]+', '', anchor.text)
    gutenberg_dict[cleaned_title] = anchor['href']

In [32]:
# Link every Gutenberg-DE title to Wikidata via Entity Fishing.
# Result: title -> [WIKINAME, wikiID, link],
# kept only for entities that Wikidata classifies as a literary/art work.
wikified_gutenberg = {}
for title in tqdm(gutenberg_dict.keys()):
    response = query_entity_fishing(title)
    if 'entities' not in response or not response['entities']:
        continue

    # Get the first disambiguated entity
    first_entity = response['entities'][0]
    if 'wikidataId' not in first_entity.keys():
        continue
    wikiID = first_entity['wikidataId']

    # Filter first: the label request is only needed for entities that are kept
    if not is_instance_of_literarywork(wikiID):
        continue

    wikiname = get_wikidata_label(wikiID)
    if wikiname is None:
        # No German label available: fall back to the repository title
        wikiname = title
    wikiname = re.sub(' ', '_', wikiname.upper())
    wikified_gutenberg[title] = [wikiname, wikiID, gutenberg_dict[title]]

  0%|          | 0/11642 [00:00<?, ?it/s]

HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localh

In [27]:
# The wikification result is not recomputed here: the save is commented out and the file written on 20240821 is loaded instead.
#save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/GutenbergDE/' + today + '_wikified_gutenberg.json', wikified_gutenberg)
wikified_gutenberg = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/GutenbergDE/20240821_wikified_gutenberg.json')

In [28]:
# Group the wikified Gutenberg-DE titles by Wikidata ID:
# wikiID -> [[repository titles], WIKINAME, [links]]
wikified_gutenberg_inverted = {}

for title, values in wikified_gutenberg.items():
    wikiname, wikiID, repo_ref = values[0], values[1], values[2]
    if wikiID in wikified_gutenberg_inverted:
        wikified_gutenberg_inverted[wikiID][0].append(title)
        # The link belongs in the third slot, not in the list of titles
        wikified_gutenberg_inverted[wikiID][2].append(repo_ref)
    else:
        wikified_gutenberg_inverted[wikiID] = [[title], wikiname, [repo_ref]]

In [29]:
# Fuzzy-match every title variation from the secondary literature against the
# Gutenberg-DE titles; keep the best candidate when its score is at least 95.
# Result: variation -> [best matching Gutenberg-DE title, score]
fuzzy_matched_gutenberg = {}

for variation, _ in tqdm(inverted_dict_text.items()):
    if variation in wikified_gutenberg:
        continue
    candidates = process.extract(variation, gutenberg_dict.keys())
    best_title, best_score = candidates[0][0], candidates[0][1]
    if best_score >= 95:
        fuzzy_matched_gutenberg[variation] = [best_title, best_score]

  0%|          | 0/35973 [00:00<?, ?it/s]

In [30]:
# Persist the fuzzy matches; the commented line reloads them instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/GutenbergDE/' + today + '_fuzzy_matched_gutenberg.json', fuzzy_matched_gutenberg)
#fuzzy_matched_gutenberg = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/GutenbergDE/' + today + '_fuzzy_matched_gutenberg.json')

In [31]:
# Re-key the fuzzy matches by wikiname and attach the entity information from
# dict_text. Only the first matching variation per wikiname is kept.
fuzzy_matched_gutenberg_wikiname = {}

for variation, match in tqdm(fuzzy_matched_gutenberg.items()):
    wikiname = inverted_dict_text[variation][0]
    if wikiname in fuzzy_matched_gutenberg_wikiname:
        continue
    entity = dict_text[wikiname]
    repo_title, ratio = match[0], match[1]
    fuzzy_matched_gutenberg_wikiname[wikiname] = {
        'variations': entity[0],
        'wikiID': entity[1],
        'freq': entity[3],
        'repo_title': repo_title,
        'repoID': gutenberg_dict[repo_title],  # link scraped from the title index
        'fuzzy_ratio': ratio,
    }

  0%|          | 0/1679 [00:00<?, ?it/s]

In [32]:
# Persist the wikiname-keyed Gutenberg-DE matches; the commented line reloads them instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/GutenbergDE/' + today + '_fuzzy_matched_gutenberg_wikiname_final.json', fuzzy_matched_gutenberg_wikiname)
#fuzzy_matched_gutenberg_wikiname = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/GutenbergDE/' + today + '_fuzzy_matched_gutenberg_wikiname_final.json')

#### Zeno

In [33]:
# Base URL of the Zeno index pages; one upper-case letter is appended per request below.
zeno_base_html = 'http://www.zeno.org/Lesesaal/M/'

In [34]:
# The letters A-Z, used to enumerate the Zeno index pages.
abc = string.ascii_uppercase

In [35]:
# Scrape the Zeno index pages A-Z into author -> [[title, link], ...].
# Rows with class 'lsAutor' set the current author; following rows with class
# 'lsProdukt' are attributed to that author.
authors_works = {}

for letter in abc:
    page = requests.get(zeno_base_html + letter)
    soup = BeautifulSoup(page.content, 'html.parser')

    current_author = None
    for row in soup.find_all('tr'):
        row_classes = row.get('class', [])

        if 'lsAutor' in row_classes:
            # Author row: remember the name and make sure it has a work list
            author_tag = row.find('span', class_='lsAutor')
            if author_tag:
                current_author = author_tag.text.strip()
                authors_works.setdefault(current_author, [])
            continue

        if 'lsProdukt' not in row_classes or not current_author:
            continue

        # Work row: title and link sit in the anchor inside <td class="lsTitel">
        title_td = row.find('td', class_='lsTitel')
        title_tag = title_td.find('a') if title_td else None
        if title_tag:
            authors_works[current_author].append([title_tag.text.strip(), title_tag['href']])

In [36]:
# Flatten author -> works into title -> [author, link].
# Works sharing a title overwrite each other (last one wins).
zeno_dict = {
    work[0]: [author, work[1]]
    for author, works in authors_works.items()
    for work in works
}

In [49]:
# Link every Zeno title to Wikidata via Entity Fishing.
# Result: title -> [WIKINAME, wikiID, link],
# kept only for entities that Wikidata classifies as a literary/art work.
wikified_zeno = {}
for title in tqdm(zeno_dict.keys()):
    response = query_entity_fishing(title)
    if 'entities' not in response or not response['entities']:
        continue

    # Get the first disambiguated entity
    first_entity = response['entities'][0]
    if 'wikidataId' not in first_entity.keys():
        continue
    wikiID = first_entity['wikidataId']

    # Filter first: the label request is only needed for entities that are kept
    if not is_instance_of_literarywork(wikiID):
        continue

    wikiname = get_wikidata_label(wikiID)
    if wikiname is None:
        # No German label available: fall back to the repository title
        wikiname = title
    wikiname = re.sub(' ', '_', wikiname.upper())
    wikified_zeno[title] = [wikiname, wikiID, zeno_dict[title][1]]

  0%|          | 0/2294 [00:00<?, ?it/s]

HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localhost:8090/service/disambiguate
HTTP error occurred: 400 Client Error: Bad Request for url: http://localh

In [37]:
# The wikification result is not recomputed here: the save is commented out and the file written on 20240821 is loaded instead.
#save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/zeno/' + today + '_wikified_zeno.json', wikified_zeno)
wikified_zeno = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/zeno/20240821_wikified_zeno.json')

In [38]:
# Group the wikified Zeno titles by Wikidata ID:
# wikiID -> [[repository titles], WIKINAME, [links]]
wikified_zeno_inverted = {}

for title, values in wikified_zeno.items():
    wikiname, wikiID, repo_ref = values[0], values[1], values[2]
    if wikiID in wikified_zeno_inverted:
        wikified_zeno_inverted[wikiID][0].append(title)
        # The link belongs in the third slot, not in the list of titles
        wikified_zeno_inverted[wikiID][2].append(repo_ref)
    else:
        wikified_zeno_inverted[wikiID] = [[title], wikiname, [repo_ref]]

In [39]:
# Fuzzy-match every title variation from the secondary literature against the
# Zeno titles; keep the best candidate when its score is at least 95.
# Result: variation -> [best matching Zeno title, score]
fuzzy_matched_zeno = {}

for variation, _ in tqdm(inverted_dict_text.items()):
    if variation in wikified_zeno:
        continue
    candidates = process.extract(variation, zeno_dict.keys())
    best_title, best_score = candidates[0][0], candidates[0][1]
    if best_score >= 95:
        fuzzy_matched_zeno[variation] = [best_title, best_score]

  0%|          | 0/35973 [00:00<?, ?it/s]

In [40]:
# Persist the fuzzy matches; the commented line reloads them instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/zeno/' + today + '_fuzzy_matched_zeno.json', fuzzy_matched_zeno)
#fuzzy_matched_zeno = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/zeno/' + today + '_fuzzy_matched_zeno.json')

In [41]:
# Re-key the fuzzy matches by wikiname and attach the entity information from
# dict_text. Only the first matching variation per wikiname is kept.
fuzzy_matched_zeno_wikiname = {}

for variation, match in tqdm(fuzzy_matched_zeno.items()):
    wikiname = inverted_dict_text[variation][0]
    if wikiname in fuzzy_matched_zeno_wikiname:
        continue
    entity = dict_text[wikiname]
    repo_title, ratio = match[0], match[1]
    fuzzy_matched_zeno_wikiname[wikiname] = {
        'variations': entity[0],
        'wikiID': entity[1],
        'freq': entity[3],
        'repo_title': repo_title,
        'repoID': zeno_dict[repo_title][1],  # link scraped from the Zeno index
        'fuzzy_ratio': ratio,
    }

  0%|          | 0/602 [00:00<?, ?it/s]

In [42]:
# Persist the wikiname-keyed Zeno matches; the commented line reloads them instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/zeno/' + today + '_fuzzy_matched_zeno_wikiname_final.json', fuzzy_matched_zeno_wikiname)
#fuzzy_matched_zeno_wikiname = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/zeno/' + today + '_fuzzy_matched_zeno_wikiname_final.json')

#### Consolidate with Existing Metadata Table

In [43]:
# Existing metadata table of the German corpus (semicolon-separated); its 'title' column is used for matching below.
metadata_old = pd.read_csv('C:/Users/Brottrager/Documents/Diss/metadata/ger/database/GER_texts_meta.csv', sep=';')

In [44]:
# One dict per table row (column name -> value).
metadata = metadata_old.to_dict('records')

In [45]:
# Index the metadata rows by title: title -> all remaining columns of that row.
# Rows sharing a title overwrite each other (last one wins).
metadata_dict = {
    record['title']: {column: cell for column, cell in record.items() if column != 'title'}
    for record in metadata
}

In [46]:
# Fuzzy-match every title of the metadata table against the known title
# variations; keep the best candidate when its score is at least 95.
# Result: matched variation -> [metadata title, score]
fuzzy_matched_metadata = {}

for title in tqdm(metadata_old['title']):
    candidates = process.extract(title, inverted_dict_text.keys())
    best_variation, best_score = candidates[0][0], candidates[0][1]
    if best_score >= 95:
        fuzzy_matched_metadata[best_variation] = [title, best_score]

  0%|          | 0/547 [00:00<?, ?it/s]

In [47]:
# Persist the metadata matches; the commented line reloads them instead of recomputing.
# (The reload path previously pointed to the ENG folder although the file is saved under GER.)
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_fuzzy_matched_metadata.json', fuzzy_matched_metadata)
#fuzzy_matched_metadata = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_fuzzy_matched_metadata.json')

### Data Consolidation and Matching

In [48]:
# Consolidate, per detected entity (wikiname), the repository match, the
# existing corpus metadata and the entity information from dict_text.
#
# dict_all_info:        wikiname -> consolidated record
# dict_all_info_wikiID: wikiID   -> wikinames of repository-matched entities
#                                   sharing that ID (first one owns the record)
dict_all_info = {}
dict_all_info_wikiID = {}
empty_wiki_info = {'repo_title': np.nan, 'repoID': np.nan, 'fuzzy_ratio': np.nan}
empty_additional_info = {'author_viaf': np.nan, 'id': np.nan, 'file_name': np.nan, 'pub_year': np.nan, 'repo': np.nan}

# Repositories in order of priority: the first one containing the entity wins.
repo_matches = [
    ('Textgrid', fuzzy_matched_textgrid_wikiname),
    ('Gutenberg-DE', fuzzy_matched_gutenberg_wikiname),
    ('Zeno', fuzzy_matched_zeno_wikiname),
]

for key, values in tqdm(dict_text.items()):

    # Pick the highest-priority repository that has a match for this entity
    repo_name, repo_info = next(
        ((name, matches[key]) for name, matches in repo_matches if key in matches),
        (np.nan, None),
    )

    if repo_info is not None:
        wikiID = repo_info['wikiID']

        if wikiID != 'noWikiID':
            if wikiID in dict_all_info_wikiID:
                # Same Wikidata entity already recorded under another name:
                # add this frequency to the existing record and drop the entry.
                dict_all_info_wikiID[wikiID].append(key)
                existing_key = dict_all_info_wikiID[wikiID][0]
                dict_all_info[existing_key]['freq'] += dict_text[key][3]
                continue
            dict_all_info_wikiID[wikiID] = [key]

        base_info = dict(repo_info)
    else:
        # NO MATCHES: entity information only, repository fields left empty
        base_info = {'variations': values[0], 'wikiID': values[1], 'freq': values[3], **empty_wiki_info}

    # Existing metadata of the first variation that was matched to the table.
    # An entity without any variation simply gets the empty metadata below.
    additional_info = next(
        (metadata_dict[fuzzy_matched_metadata[v][0]]
         for v in base_info['variations'] if v in fuzzy_matched_metadata),
        None,
    )

    if additional_info is not None:
        dict_all_info[key] = {**base_info, **additional_info}
        # Overrides the metadata table's own 'repo' value (NaN without a repository match)
        dict_all_info[key]['repo'] = repo_name
    else:
        # NOTE(review): a repository match without metadata keeps 'repo' = NaN
        # although repo_title/repoID are filled — confirm this is intended.
        dict_all_info[key] = {**base_info, **empty_additional_info}

    dict_all_info[key]['genre'] = np.nan

  0%|          | 0/35203 [00:00<?, ?it/s]

If an entry has a wikiID, Wikidata is queried for its genre attributions and, where the publication year is still missing, its publication year.

In [49]:
# Enrich every entry that has a Wikidata ID: always fetch the genre/form
# labels, and fetch the publication year only where none is known yet.
for entry in tqdm(dict_all_info.values()):
    wiki_id = entry['wikiID']
    if wiki_id == 'noWikiID':
        continue
    entry['genre'] = get_genres_and_forms_of_creative_work(wiki_id)
    if pd.isna(entry['pub_year']):
        entry['pub_year'] = get_publication_date(wiki_id)

  0%|          | 0/35199 [00:00<?, ?it/s]

In [50]:
# Flag entries as prose (1) when any of their Wikidata genre/form labels
# contains one of the marker substrings; everything else gets 0.
# NOTE(review): this is plain substring matching, so e.g. 'story' also hits
# labels containing 'history'.
prose_markers = ('novel', 'story', 'stories', 'novella')

for key, values in tqdm(dict_all_info.items()):
    values['prose'] = 0
    genre = values['genre']
    # Only entries enriched from Wikidata hold a (genres, forms) tuple
    if not isinstance(genre, tuple) or genre == (None, None):
        continue
    joined_labels = ' '.join(
        label for labels in genre for label in labels if isinstance(label, str)
    )
    if any(marker in joined_labels for marker in prose_markers):
        values['prose'] = 1

  0%|          | 0/35199 [00:00<?, ?it/s]

In [51]:
# Persist the consolidated dictionary; the commented line reloads it instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_dict_all_info.json', dict_all_info)
#dict_all_info = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_dict_all_info.json')

In [52]:
# One row per entity; the dictionary key becomes the 'wikiname' column.
df = (
    pd.DataFrame.from_dict(dict_all_info, orient='index')
    .reset_index()
    .rename(columns={'index': 'wikiname'})
)

In [53]:
# Export the table for manual correction (semicolon-separated; the row index is written as the first column).
df.to_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_df_all_info.csv', encoding='utf8', sep=';')

### Manual Correction

In [79]:
# Load the manually corrected table of 2024-08-21 and index it by wikiname:
# wikiname -> {column: value}
df_manual = pd.read_csv(
    'C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/20240821_matched_entities_texts_final.csv',
    encoding='utf8',
    sep=';',
).set_index('wikiname')
dict_manual = df_manual.to_dict(orient='index')

  df_manual = pd.read_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/20240821_matched_entities_texts_final.csv', encoding='utf8', sep=';')


In [80]:
# Carry the manual corrections over into the freshly consolidated records.
# The records are shared with dict_all_info (no copy is made), so the new
# fields show up there as well.
dict_all_info_manuallycorrected = {}
manual_fields = ('pub_year', 'genre', 'repoID', 'prose_fiction', 'comment_exclusion', 'done')

for key, values in tqdm(dict_all_info.items()):
    # Defaults for entries that were never reviewed manually
    values['prose_fiction'] = 0
    values['done'] = 0
    dict_all_info_manuallycorrected[key] = values

    if key in dict_manual:
        manual_record = dict_manual[key]
        for field in manual_fields:
            values[field] = manual_record[field]

  0%|          | 0/35199 [00:00<?, ?it/s]

In [81]:
# One row per entity; the dictionary key becomes the 'wikiname' column.
df = (
    pd.DataFrame.from_dict(dict_all_info_manuallycorrected, orient='index')
    .reset_index()
    .rename(columns={'index': 'wikiname'})
)

In [82]:
# Load the manually corrected table of 2024-09-20 and index it by wikiname:
# wikiname -> {column: value}. This replaces the earlier df_manual/dict_manual.
df_manual = pd.read_csv(
    'C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/20240920_matched_entities_texts_final.csv',
    encoding='utf8',
    sep=';',
).set_index('wikiname')
dict_manual = df_manual.to_dict(orient='index')

In [83]:
# Apply the later round of manual corrections to the entries that exist in
# the consolidated dictionary; their 'done' flag is set to the string 'NA'.
corrected_fields = ('pub_year', 'genre', 'repoID', 'prose_fiction', 'comment_exclusion')

for key, manual_record in tqdm(dict_manual.items()):
    if key not in dict_all_info_manuallycorrected:
        continue
    target = dict_all_info_manuallycorrected[key]
    for field in corrected_fields:
        target[field] = manual_record[field]
    target['done'] = 'NA'

  0%|          | 0/18838 [00:00<?, ?it/s]

In [84]:
# One row per entity; the dictionary key becomes the 'wikiname' column.
df = (
    pd.DataFrame.from_dict(dict_all_info_manuallycorrected, orient='index')
    .reset_index()
    .rename(columns={'index': 'wikiname'})
)

In [85]:
# Export the merged table (semicolon-separated; the row index is written as the first column).
df.to_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_matched_entities_texts_final.csv', encoding='utf8', sep=';')

Manual check

In [12]:
# Load the '_ALL' variant of the final table; that file is not written anywhere in this notebook.
df_manuallycorrected = pd.read_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_matched_entities_texts_final_ALL.csv', encoding='utf8', sep=';')

  df_manuallycorrected = pd.read_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_matched_entities_texts_final_ALL.csv', encoding='utf8', sep=';')


In [13]:
# One dict per table row (column name -> value).
manuallycorrected = df_manuallycorrected.to_dict('records')

In [14]:
# Index the rows by wikiname, leaving out rows marked as duplicates.
# A 'title' column, if present, is dropped from the stored record.
dict_manuallycorrected = {
    record['wikiname']: {column: cell for column, cell in record.items() if column != 'title'}
    for record in manuallycorrected
    if record['comment_exclusion'] != 'duplicate'
}

In [15]:
# Number of entries that carry a Wikidata ID.
counter_wiki_entities = sum(
    1 for record in dict_manuallycorrected.values() if record['wikiID'] != 'noWikiID'
)

In [16]:
# Total number of entries after removing duplicates.
len(dict_manuallycorrected)

35176

In [17]:
# Share of entries that carry a Wikidata ID.
n = len(dict_manuallycorrected)
p = counter_wiki_entities/n

print(p)

0.039430293381851264


Only about 4% of all detected text titles have a corresponding Wikidata entry.

In [18]:
# Load the corpus list (semicolon-separated).
df_filtered = pd.read_csv('C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/corpora/GER_corpus.csv', encoding='utf8', sep=';')

In [19]:
# wikiname -> {column: value} for every text in the corpus list.
filtered = df_filtered.set_index("wikiname").to_dict(orient="index")

In [20]:
# Number of corpus texts that carry a Wikidata ID.
counter_wiki_entities = sum(
    1 for record in filtered.values() if record['wikiID'] != 'noWikiID'
)

In [21]:
# Number of texts in the corpus list.
len(filtered)

571

In [22]:
# Share of corpus texts that carry a Wikidata ID.
n = len(filtered)
p = counter_wiki_entities/n

print(p)

0.4413309982486865


About 44% of the text titles in the corpus list have a corresponding Wikidata entry.

In [50]:
# Persist the corpus dictionary as JSON.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/GER/0_scraping/' + today + '_dict_corpus.json', filtered)