# Disambiguating English Repositories

--- Last edited: 2024-09-26 ---

In [2]:
import collections
from datetime import date
import glob
import json
import os
from pathlib import Path
import pickle
import random
import requests
import re
import string
import sys
import time
from tqdm import tqdm
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import shutil

import qwikidata
from qwikidata.sparql  import return_sparql_query_results

from urllib.parse import urlencode

from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from lxml import etree

from thefuzz import fuzz
from thefuzz import process

### Functions

In [3]:
def load_data(file):
    """Parse the UTF-8 encoded JSON file at ``file`` and return its content."""
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def save_data(file, data):
    """Write ``data`` as JSON (4-space indentation, UTF-8) to ``file``, replacing any existing content."""
    with open(file, mode="w", encoding="utf-8") as out:
        json.dump(data, out, indent=4)

In [4]:
def query_entity_fishing(text, language='en'):
    """Send ``text`` to the Entity Fishing disambiguation endpoint.

    The request is POSTed as a JSON body to the module-level ``api_url``.

    Returns the decoded JSON response, or an empty dict when the request
    fails, the server answers with an error status, or the body cannot be
    decoded (the error is printed in each case).
    """
    try:
        payload = json.dumps({"text": text, "language": language})
        reply = requests.post(
            api_url,
            headers={'Content-Type': 'application/json'},
            data=payload,
        )
        # Turn 4xx/5xx answers into an HTTPError
        reply.raise_for_status()
        return reply.json()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return {}
    except requests.exceptions.RequestException as req_err:
        print(f"Request error: {req_err}")
        return {}
    except ValueError as json_err:
        print(f"JSON decoding error: {json_err}")
        return {}

In [5]:
def get_wikidata_label(wikidataid, language='en'):
    """Fetch the label of a Wikidata entity in ``language``.

    Uses the ``wbgetentities`` action of the Wikidata API.

    Returns the label string, or ``None`` when the entity has no label in
    that language or when the request / JSON decoding fails (the error is
    printed).
    """
    query = {
        "action": "wbgetentities",
        "format": "json",
        "ids": wikidataid,
        "props": "labels",
        "languages": language,
    }

    try:
        reply = requests.get("https://www.wikidata.org/w/api.php", params=query)
        reply.raise_for_status()

        # Walk entities -> <id> -> labels -> <language> -> value, tolerating gaps
        entities = reply.json().get("entities", {})
        label_entry = entities.get(wikidataid, {}).get("labels", {}).get(language, {})
        return label_entry.get("value", None)
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None
    except requests.exceptions.RequestException as req_err:
        print(f"Request error: {req_err}")
        return None
    except ValueError as json_err:
        print(f"JSON decoding error: {json_err}")
        return None

In [6]:
def is_instance_of_literarywork(wikidata_id):
    """Tell whether a Wikidata entity is an instance of an accepted work class.

    Downloads the entity's JSON from ``Special:EntityData`` and inspects its
    "instance of" (P31) claims. Returns ``True`` as soon as one claim points
    to one of the accepted classes, ``False`` otherwise (including when the
    HTTP status is not 200).
    """
    accepted_classes = {
        'Q7725634',    # literary work
        'Q47461344',   # written work
        'Q838948',     # work of art
        'Q116476516',  # dramatic work
    }

    response = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json")
    if response.status_code != 200:
        return False

    claims = response.json()['entities'].get(wikidata_id, {}).get('claims', {})

    for claim in claims.get('P31', []):
        target = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
        if target.get('id') in accepted_classes:
            return True

    return False

In [7]:
def get_publication_date(wikidata_id):
    """Return the year of the first "publication date" (P577) claim of an entity.

    Downloads the entity's JSON from ``Special:EntityData``.

    Returns the year as a string with the leading ``+`` removed (e.g.
    ``'1813'``); years before the common era keep their minus sign (e.g.
    ``'-0400'``). Returns ``None`` when the HTTP status is not 200, when the
    entity has no P577 claim, or when the first claim carries no time value.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    response = requests.get(url)

    if response.status_code != 200:
        return None

    claims = response.json()['entities'].get(wikidata_id, {}).get('claims', {})

    publication_claims = claims.get('P577')
    if not publication_claims:
        return None

    # Only the first publication date is considered
    value = publication_claims[0].get('mainsnak', {}).get('datavalue', {}).get('value', {})
    publication_date = value.get('time')
    if not publication_date:
        return None

    # Time strings look like "+YYYY-MM-DDT00:00:00Z" and carry a leading sign.
    # Strip the sign before splitting on '-': otherwise a BCE timestamp
    # ("-0400-01-01T...") would yield an empty year.
    sign = '-' if publication_date.startswith('-') else ''
    year = publication_date.lstrip('+-').split('-')[0]
    return sign + year

In [8]:
def get_genres_and_forms_of_creative_work(wikidata_id):
    """Collect the labels of an entity's genres and forms of creative work.

    Downloads the entity's JSON from ``Special:EntityData`` and resolves the
    targets of its "genre" (P136) and "form of creative work" (P7937) claims
    to labels via ``get_label``.

    Returns a tuple ``(genres, forms_of_creative_work)`` of two lists of
    labels, or ``(None, None)`` when the HTTP status is not 200.
    """
    response = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json")
    if response.status_code != 200:
        return None, None

    claims = response.json()['entities'].get(wikidata_id, {}).get('claims', {})

    # One label list per property, filled in this order: genre, then form
    labels_by_property = {'P136': [], 'P7937': []}

    for property_id, labels in labels_by_property.items():
        for claim in claims.get(property_id, []):
            target = claim.get('mainsnak', {}).get('datavalue', {}).get('value', {})
            target_id = target.get('id')
            if not target_id:
                continue
            # Resolve the target ID to a label; targets without a label are dropped
            target_label = get_label(target_id)
            if target_label:
                labels.append(target_label)

    return labels_by_property['P136'], labels_by_property['P7937']

In [9]:
def get_label(entity_id):
    """Return a label for a Wikidata entity, preferring English.

    Downloads the entity's JSON from ``Special:EntityData``. Returns the
    English label if there is one, otherwise the first label found, and
    ``None`` when there are no labels or the HTTP status is not 200.
    """
    response = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json")
    if response.status_code != 200:
        return None

    labels = response.json()['entities'].get(entity_id, {}).get('labels', {})
    if not labels:
        return None

    chosen = labels['en'] if 'en' in labels else next(iter(labels.values()))
    return chosen['value']

### Disambiguation

In [10]:
# Endpoint read by query_entity_fishing(); points at an Entity Fishing service on localhost, port 8090.
api_url = "http://localhost:8090/service/disambiguate"

In [11]:
# Date stamp used in the names of the files written and read below.
# It is pinned to a fixed date; the commented-out line derives it from the current date instead.
#today = re.sub('-', '', str(date.today()))
today = "20240926"

First, the Gutenberg US index is wikified in order to link its entries, where possible, with Wikidata IDs. The index can be accessed at https://www.gutenberg.org/dirs/GUTINDEX.ALL; after saving it as plain text and deleting the header, the footer, and all non-breaking spaces ('<0xa0>'), it can be read in as a text file and then processed into a dictionary structure.

In [11]:
# Read the cleaned Gutenberg index and split it into lines.
# Empty lines are kept on purpose: the parsing cell below uses them as entry separators.
with open ('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/GUTINDEX.txt', encoding = "utf-8") as f:
    gutenberg_titles = f.read().split("\n")

In [24]:
# Parse the index lines into {title: [gutenberg_id, author]}.
# Only the first line after an empty line is treated as an entry; all other lines are ignored.
# Entries with the same title overwrite each other (the last one wins).
gutenberg_dict = {}

next_is_entry = False
for entry in gutenberg_titles:
    if entry == '':
        next_is_entry = True
        continue
    if next_is_entry:
        # Raw strings are required here: '\d' in a normal string literal is an
        # invalid escape sequence and triggers a warning on recent Python versions.
        # An entry line ends in "<spaces><number>"; if it does not, both values stay the full line.
        gutenberg_id = re.sub(r'(.+?) +(\d+)$', r'\2', entry)
        gutenberg_title = re.sub(r'(.+?) +(\d+)$', r'\1', entry)
        # Split "Title, by Author"; without ", by " both values stay the full title.
        title = re.sub(r'(.+?), by .+', r'\1', gutenberg_title)
        gutenberg_author = re.sub(r'(.+?), by (.+?)', r'\2', gutenberg_title)
        gutenberg_dict[title] = [gutenberg_id, gutenberg_author]
        next_is_entry = False

In [None]:
# Wikify every Gutenberg title with Entity Fishing and keep only hits that pass is_instance_of_literarywork().
# Result: {title: [WIKINAME, wikidataId, gutenberg_id]}.
# NOTE(review): `already_wikified_gutenberg` is not defined in the visible part of this notebook —
# presumably it holds the results of an earlier run; confirm it is loaded before running this cell.
wikified_gutenberg = {}
for title in tqdm(gutenberg_dict.keys()):
    if title not in already_wikified_gutenberg:
        response = query_entity_fishing(title)
        if 'entities' in response and response['entities']:
            # Get the first disambiguated entity
            first_entity = response['entities'][0]
            if 'wikidataId' in first_entity.keys():
                wikiID = first_entity['wikidataId']
                wikiname = get_wikidata_label(wikiID)
                # Fall back to the Gutenberg title when no label could be retrieved
                if wikiname is None:
                    wikiname = title
                # Normalise the name: upper case, spaces replaced by underscores
                wikiname = re.sub(' ', '_', wikiname.upper())
                if is_instance_of_literarywork(wikiID):
                    wikified_gutenberg[title] = [wikiname, wikiID, gutenberg_dict[title][0]]
    else:
        # Reuse the stored result instead of querying the services again
        wikified_gutenberg[title] = already_wikified_gutenberg[title]

In [33]:
# Load the wikified index stored on 2024-07-18 instead of recomputing it; the save call is kept commented out.
#save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/' + today + '_wikified_gutenberg.json', wikified_gutenberg)
wikified_gutenberg = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/20240718_wikified_gutenberg.json')

In [34]:
# Invert the wikified index: wikidataId -> [[titles], WIKINAME, [gutenberg_ids]].
# Several Gutenberg titles can resolve to the same Wikidata entity.
wikified_gutenberg_inverted = {}

for key, values in wikified_gutenberg.items():
    if values[1] in wikified_gutenberg_inverted:
        wikified_gutenberg_inverted[values[1]][0].append(key)
        # The Gutenberg ID belongs in the ID list (slot 2), not in the title list (slot 0)
        wikified_gutenberg_inverted[values[1]][2].append(values[2])
    else:
        wikified_gutenberg_inverted[values[1]] = [[key], values[0], [values[2]]]

Now, the wikified Gutenberg entries are compared to the entities detected in the literary histories.

In [241]:
# Entities detected in the literary histories, keyed by wikiname.
# Later cells read index 0 of each value as the list of title variations, index 1 as the wikiID and index 3 as the frequency.
dict_text = load_data('C:/Users/Brottrager/Documents/Diss/sec_lit/ENG/20240919_ENG_dict_all_entities_WORK_OF_ART_final.json')

The detected text titles are now fuzzy matched with the entries extracted from the Gutenberg US index to link the detected text titles with full texts. 

In [36]:
# Invert dict_text: title variation -> [wikiname, values[1], values[2]].
# NOTE(review): the two branches store different shapes. The first occurrence of a variation stores a flat
# list [key, values[1], values[2]]; every further occurrence appends a nested list to that flat list.
# Later code reads inverted_dict_text[...][0] as the wikiname, i.e. always the first entity that used the
# variation — confirm this is intended before changing the structure.
inverted_dict_text = {}

for key, values in dict_text.items():
    for value in values[0]:
        if value in inverted_dict_text:
            inverted_dict_text[value].append([key, values[1], values[2]]) 
        else:
            inverted_dict_text[value] = [key, values[1], values[2]]

In [38]:
# Fuzzy-match every detected title variation against the Gutenberg titles.
# Result: {variation: [best_gutenberg_title, ratio]}, restricted to ratios of at least 95.
fuzzy_matched_gutenberg = {}

for key in tqdm(inverted_dict_text):
    result = process.extract(key, gutenberg_dict.keys())
    best_title, best_ratio = result[0][0], result[0][1]
    if best_ratio < 95:
        continue  # only matches with a ratio of 95 or more are retained
    fuzzy_matched_gutenberg[key] = [best_title, best_ratio]

  0%|          | 0/16862 [00:00<?, ?it/s]

In [39]:
# Persist the fuzzy matches; the commented-out line reloads a result stored on 2024-08-21 instead.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/' + today + '_fuzzy_matched_gutenberg_final.json', fuzzy_matched_gutenberg)
#fuzzy_matched_gutenberg = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/20240821_fuzzy_matched_gutenberg_final.json')

In [40]:
# Re-key the fuzzy matches by wikiname and attach the entity information from dict_text.
# When several variations of one wikiname matched, only the first match is kept.
fuzzy_matched_gutenberg_wikiname = {}

for key, values in tqdm(fuzzy_matched_gutenberg.items()):
    wikiname = inverted_dict_text[key][0]
    if wikiname in fuzzy_matched_gutenberg_wikiname:
        continue
    entity = dict_text[wikiname]
    fuzzy_matched_gutenberg_wikiname[wikiname] = {
        'variations': entity[0],
        'wikiID': entity[1],
        'freq': entity[3],
        'gutenberg_title': values[0],
        'gutenbergID': gutenberg_dict[values[0]][0],
        'fuzzy_ratio': values[1],
    }

  0%|          | 0/2182 [00:00<?, ?it/s]

In [41]:
# Persist the wikiname-keyed Gutenberg matches.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/' + today + '_fuzzy_matched_gutenberg_wikiname_final.json', fuzzy_matched_gutenberg_wikiname)

In a previous iteration of canon compilation, I had already manually compiled a metadata table of possibly relevant texts. This metadata is now also transferred into a dictionary structure and wikified.

In [42]:
# Manually compiled metadata table (semicolon-separated CSV).
metadata_old = pd.read_csv('C:/Users/Brottrager/Documents/Diss/metadata/eng/database/ENG_texts_meta.csv', sep=';')

In [43]:
# One dict per table row: {column: value}.
metadata = metadata_old.to_dict('records')

In [44]:
# Index the metadata rows by title: {title: {all other columns}}.
# Rows sharing a title overwrite each other (the last one wins).
metadata_dict = {
    record['title']: {column: cell for column, cell in record.items() if column != 'title'}
    for record in metadata
}

In [45]:
# Fuzzy-match every metadata title against the detected title variations.
# Result: {best_matching_variation: [metadata_title, ratio]}, restricted to ratios of at least 95.
fuzzy_matched_metadata = {}

for title in tqdm(metadata_old['title']):
    result = process.extract(title, inverted_dict_text.keys())
    best_variation, best_ratio = result[0][0], result[0][1]
    if best_ratio < 95:
        continue
    fuzzy_matched_metadata[best_variation] = [title, best_ratio]

  0%|          | 0/605 [00:00<?, ?it/s]

In [46]:
# Persist the metadata matches; the commented-out line reloads them instead of recomputing.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_fuzzy_matched_metadata.json', fuzzy_matched_metadata)
#fuzzy_matched_metadata = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_fuzzy_matched_metadata.json')

In this iteration, all detected text titles are processed, compared to the compiled dictionary, and checked for duplicates. The resulting dictionary summarises all the available information. 

### Data Consolidation and Matching

In [47]:
# Consolidate everything known about each detected title into dict_all_info:
#   entity info (variations, wikiID, freq) + Gutenberg match + manual metadata + empty 'genre' slot.
# dict_all_info_wikiID maps a wikiID to the keys of all Gutenberg-matched entries sharing it;
# such duplicates are merged into the first entry by adding up their frequencies.
dict_all_info = {}
dict_all_info_wikiID = {}
empty_wiki_info = {'gutenberg_title': np.nan, 'gutenbergID': np.nan, 'fuzzy_ratio': np.nan}
empty_additional_info = {'author_viaf': np.nan, 'id': np.nan, 'file_name': np.nan, 'pub_year': np.nan}

for key, values in tqdm(dict_text.items()):
    if key in fuzzy_matched_gutenberg_wikiname:
        base_info = fuzzy_matched_gutenberg_wikiname[key]
        wikiID = values[1]

        if wikiID != 'noWikiID':
            if wikiID in dict_all_info_wikiID:
                # Duplicate entity: only add its frequency to the first entry with this wikiID
                dict_all_info_wikiID[wikiID].append(key)
                existing_key = dict_all_info_wikiID[wikiID][0]
                dict_all_info[existing_key]['freq'] += values[3]
                continue
            dict_all_info_wikiID[wikiID] = [key]

        variations = base_info['variations']
    else:
        # No Gutenberg match: entity info plus empty Gutenberg columns
        base_info = {'variations': values[0], 'wikiID': values[1], 'freq': values[3], **empty_wiki_info}
        variations = values[0]

    # Default to the empty metadata columns *before* scanning the variations, so that an
    # entry with an empty variation list still receives a complete record.
    additional_info = empty_additional_info
    for v in variations:
        if v in fuzzy_matched_metadata:
            # The first variation with a metadata match wins
            additional_info = metadata_dict[fuzzy_matched_metadata[v][0]]
            break

    # 'genre' is filled in later; assigning it last resets any value carried over from the metadata
    dict_all_info[key] = {**base_info, **additional_info, 'genre': np.nan}
        

  0%|          | 0/15577 [00:00<?, ?it/s]

If an entry has a wikiID, wikidata is queried for genre attributions and publication years. 

In [49]:
# Enrich every entry that has a wikiID with Wikidata information:
# 'genre' becomes the (genres, forms) tuple; 'pub_year' is only filled in when it is still missing.
for key, values in tqdm(dict_all_info.items()):
    entity_id = values['wikiID']
    if entity_id == 'noWikiID':
        continue
    values['genre'] = get_genres_and_forms_of_creative_work(entity_id)
    if pd.isna(values['pub_year']):
        values['pub_year'] = get_publication_date(entity_id)

  0%|          | 0/15569 [00:00<?, ?it/s]

In [50]:
# Flag entries whose Wikidata genre / form labels point to prose fiction ('prose' = 1, otherwise 0).
# NOTE(review): this is a substring test, so a label such as "history" also matches 'story' —
# confirm whether whole-word matching is wanted.
for key, values in tqdm(dict_all_info.items()):
    values['prose'] = 0
    genre = values['genre']
    if not isinstance(genre, tuple):
        continue
    # A failed Wikidata lookup yields (None, None); skip such empty parts instead of iterating over None
    labels = [element for sublist in genre if sublist for element in sublist if isinstance(element, str)]
    joined = ' '.join(labels)
    if any(marker in joined for marker in ('novel', 'story', 'stories', 'novella')):
        values['prose'] = 1

  0%|          | 0/15569 [00:00<?, ?it/s]

In [11]:
# Reload the consolidated dictionary from disk; the save call is kept commented out.
#save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_dict_all_info.json', dict_all_info)
dict_all_info = load_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_dict_all_info.json')

In [52]:
# One row per entry; the dictionary key becomes the 'wikiname' column.
df = pd.DataFrame.from_dict(dict_all_info, orient='index')
df = df.reset_index().rename(columns={'index': 'wikiname'})

In [53]:
# Export the table as a semicolon-separated CSV (the numeric row index is written as the first column).
df.to_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_df_all_info.csv', encoding='utf8', sep=';')

### Manual Correction

previous iteration

In [12]:
# Load the manually corrected table stored on 2024-08-21 and index it by wikiname:
# dict_manual = {wikiname: {column: value}}.
df_manual = pd.read_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/20240821_matched_entities_texts_final.csv', encoding='utf8', sep=';')
df_manual.set_index('wikiname', inplace=True)
dict_manual = df_manual.to_dict(orient='index')

In [13]:
# Carry the manual corrections over into the current data.
# Every entry gets 'prose_fiction' = 0 unless the manual table provides a value; for entries present in
# the manual table the listed fields are taken from there.
# The entries are the same dict objects as in dict_all_info, so dict_all_info is updated as well.
dict_all_info_manuallycorrected = {}
corrected_fields = ('pub_year', 'genre', 'gutenbergID', 'prose_fiction', 'comment_exclusion')

for key, values in tqdm(dict_all_info.items()):
    values['prose_fiction'] = 0
    if key in dict_manual:
        manual_entry = dict_manual[key]
        for field in corrected_fields:
            values[field] = manual_entry[field]
    dict_all_info_manuallycorrected[key] = values

  0%|          | 0/15569 [00:00<?, ?it/s]

In [14]:
# One row per entry; the dictionary key becomes the 'wikiname' column.
df = pd.DataFrame.from_dict(dict_all_info_manuallycorrected, orient='index')
df = df.reset_index().rename(columns={'index': 'wikiname'})

In [15]:
# Export the merged table as a semicolon-separated CSV.
df.to_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_matched_entities_texts_final.csv', encoding='utf8', sep=';')

Manual check

In [12]:
# Read the "_ALL" version of the matched-entities table (semicolon-separated CSV).
df_manuallycorrected = pd.read_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_matched_entities_texts_final_ALL.csv', encoding='utf8', sep=';')

In [13]:
# One dict per table row: {column: value}.
manuallycorrected = df_manuallycorrected.to_dict('records')

In [14]:
# Index the rows by wikiname, dropping rows marked as 'duplicate' as well as the
# 'wikiname' and 'Unnamed: 0' columns.
dict_manuallycorrected = {
    record['wikiname']: {
        column: cell
        for column, cell in record.items()
        if column not in ['wikiname', 'Unnamed: 0']
    }
    for record in manuallycorrected
    if record['comment_exclusion'] != 'duplicate'
}

In [15]:
# Number of entries whose wikiID is anything other than the placeholder 'noWikiID'.
counter_wiki_entities = sum(
    1 for values in dict_manuallycorrected.values() if values['wikiID'] != 'noWikiID'
)

In [16]:
# Total number of entries after removing duplicates.
len(dict_manuallycorrected)

15552

In [17]:
# Share of entries that have a wikiID (raises ZeroDivisionError if the dictionary is empty).
n = len(dict_manuallycorrected)
p = counter_wiki_entities/n

print(p)

0.14737654320987653


Only 15% of all detected text titles have a corresponding wikidata entry. 

In [13]:
# Read the corpus list (semicolon-separated CSV).
df_filtered = pd.read_csv('C:/Users/Brottrager/Documents/Diss/RelatingTheUnread/corpora/ENG_corpus.csv', encoding='utf8', sep=';')

In [14]:
# Index the corpus list by wikiname: {wikiname: {column: value}}.
filtered = df_filtered.set_index("wikiname").to_dict(orient="index")

In [15]:
# Number of corpus entries whose wikiID is anything other than the placeholder 'noWikiID'.
counter_wiki_entities = sum(
    1 for values in filtered.values() if values['wikiID'] != 'noWikiID'
)

In [16]:
# Number of entries in the corpus list.
len(filtered)

679

In [17]:
# Share of corpus entries that have a wikiID (raises ZeroDivisionError if the corpus list is empty).
n = len(filtered)
p = counter_wiki_entities/n

print(p)

0.6406480117820325


64% of all detected text titles in the corpus list have a corresponding wikidata entry. 

In [18]:
# Persist the corpus dictionary as JSON.
save_data('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_dict_corpus.json', filtered)

In [18]:
# Base URL of the Gutenberg plain-text files; the trailing comment shows an example of the path appended to it.
txt_link = 'https://www.gutenberg.org/files/' #'1342/1342-0.txt'

In [19]:
# Build one download URL per corpus entry that has a Gutenberg ID: <txt_link><id>/<id>-0.txt, one per line.
# NOTE(review): `dict_filtered` is not defined in the visible cells (the cell above creates `filtered`) —
# confirm which name is intended.
# NOTE(review): the test compares against the string 'nan'; a float NaN as produced by pandas would pass
# it and end up in a URL — verify the type of `gutenbergID`.
filtered_gutenberg_html = []

for key, values in dict_filtered.items():
    gutenbergID = values['gutenbergID']
    if gutenbergID != 'nan':
        filtered_gutenberg_html.append(str(txt_link) + str(gutenbergID) + '/' + str(gutenbergID) + '-0.txt' + '\n')

In [42]:
# Write the URL list to disk; it is the --input-file of the wget call documented below.
with open('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/filtered_gutenberg_html.txt', 'w') as file:
    file.writelines(filtered_gutenberg_html)

CMD:

wget -w 2 -m -H --no-if-modified-since --input-file C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/filtered_gutenberg_html.txt

Powershell:

Get-ChildItem "C:\www.gutenberg.org\files" -Recurse -Filter "*.txt" | Copy-Item -Destination "C:\Users\Brottrager\Documents\Diss\corpora\20240714\ENG\0_scraping\GutenbergUS\txt"

In [20]:
# List the .txt files located directly in the GutenbergUS/txt folder (non-recursive glob).
corpus_path = 'C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/txt'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

In [21]:
# Map Gutenberg ID (as a string) -> path of the downloaded "<id>-0.txt" file.
gutenberg_files_dict = {}

for file in files:
    # Match on the bare file name so the result does not depend on the path separator,
    # and avoid rebinding the builtin `id`. Files that do not follow the "<digits>-0.txt"
    # naming scheme are skipped.
    id_match = re.fullmatch(r'(\d+)-0\.txt', file.name)
    if id_match:
        gutenberg_files_dict[id_match.group(1)] = file

In [46]:
# Split the corpus entries into those whose Gutenberg text has been downloaded (matched_dict)
# and those that still need a text source (to_do_dict).
to_do_dict = {}
matched_dict = {}

for key, values in dict_filtered.items():
    gutenbergID = values['gutenbergID']
    # An entry counts as matched only if it has an ID and a file with that ID was downloaded
    if gutenbergID != 'nan' and gutenbergID in gutenberg_files_dict:
        matched_dict[key] = values
    else:
        to_do_dict[key] = values

In [48]:
# One row per entry that still lacks a text; the dictionary key becomes the 'wikiname' column.
df = pd.DataFrame.from_dict(to_do_dict, orient='index')
df = df.reset_index().rename(columns={'index': 'wikiname'})

In [49]:
# Export the to-do list as a semicolon-separated CSV.
df.to_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_to_do.csv', encoding='utf8', sep=';')

In [35]:
# Read the "_done" table (semicolon-separated CSV).
df_done = pd.read_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_done.csv', encoding='utf8', sep=';')

In [36]:
# One dict per table row: {column: value}.
done = df_done.to_dict('records')

In [37]:
# Index the rows by wikiname, keeping only rows without an exclusion comment.
dict_done = {
    record['wikiname']: {column: cell for column, cell in record.items() if column != 'wikiname'}
    for record in done
    if pd.isna(record['comment_exclusion'])
}

In [56]:
# Second download round: collect the stored entry for every done entry whose Gutenberg ID is known
# to gutenberg_html_dict and write the entries to the --input-file of the second wget call.
# NOTE(review): `gutenberg_html_dict` is not defined in the visible cells — confirm where it is built.
# NOTE(review): writelines() adds no line breaks, so the entries presumably already end in '\n' — verify.
filtered_gutenberg_html = []

for key, values in dict_done.items():
    gutenbergID = values['gutenbergID']
    if gutenbergID in gutenberg_html_dict:
        filtered_gutenberg_html.append(gutenberg_html_dict[gutenbergID])
    
with open('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/filtered_gutenberg_secondround_html.txt', 'w') as file:
    file.writelines(filtered_gutenberg_html)

CMD:

wget -w 2 -m -H --no-if-modified-since --input-file C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/filtered_gutenberg_secondround_html.txt

Powershell:

Get-ChildItem "C:\www.gutenberg.org\cache" -Recurse -Filter "*.txt" | Copy-Item -Destination "C:\Users\Brottrager\Documents\Diss\corpora\20240714\ENG\0_scraping\GutenbergUS\txt_added_unnamed"

In [44]:
# Final corpus dictionary: take the entry from dict_done where there is one,
# otherwise keep the entry from dict_filtered.
corpus_dict = {
    key: dict_done.get(key, values)
    for key, values in dict_filtered.items()
}

In [15]:
# Target folder for the renamed text files; it is created if it does not exist yet.
txt_dir =  'C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/txt_added_named'
if not os.path.exists(txt_dir):
    os.makedirs(txt_dir)

In [16]:
# List the .txt files located directly in the txt_added_unnamed folder (non-recursive glob).
corpus_path = 'C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/GutenbergUS/txt_added_unnamed'
corpus_dir = Path(corpus_path).glob('*.txt')
files = list(corpus_dir)

In [None]:
# Map Gutenberg ID (as a string) -> path of the downloaded text file.
# Accepted file names: optional lower-case prefix, the ID, then "-0.txt" or ".txt"
# (e.g. "1342-0.txt", "pg1342.txt"). If several files share an ID, the last one listed wins.
gutenberg_txt_dict = {}

for file in files:
    # Raw-string pattern ('\.' is an invalid escape in a normal string literal), matched on the bare
    # file name so it does not depend on the path separator; the builtin `id` is no longer rebound.
    # Files that do not follow the naming scheme are skipped.
    id_match = re.fullmatch(r'[a-z]*(\d+)(-0\.txt|\.txt)', file.name)
    if id_match:
        gutenberg_txt_dict[id_match.group(1)] = file

In [47]:
# Copy every available text into txt_dir as "<wikiname>_<wikiID>.txt" (quotes removed from the
# wikiname) and record per entry whether a text source exists ('text_source').
# The entries are the same dict objects as in corpus_dict, so the flag is set there as well.
matched_dict = {}

for key, values in corpus_dict.items():
    gutenbergID = values['gutenbergID']
    wikiname = key.replace('"', '').replace("'", '')
    wikiID = values['wikiID']

    has_text = gutenbergID != 'nan' and gutenbergID in gutenberg_txt_dict
    if has_text:
        shutil.copy(gutenberg_txt_dict[gutenbergID], txt_dir + '/' + wikiname + '_' + wikiID + '.txt')

    matched_dict[key] = values
    matched_dict[key]['text_source'] = has_text


In [48]:
# One row per corpus entry; the dictionary key becomes the 'wikiname' column.
df = pd.DataFrame.from_dict(matched_dict, orient='index')
df = df.reset_index().rename(columns={'index': 'wikiname'})

In [49]:
# Export the final corpus list, including the 'text_source' flag, as a semicolon-separated CSV.
df.to_csv('C:/Users/Brottrager/Documents/Diss/corpora/20240714/ENG/0_scraping/' + today + '_corpus_list_matchedTXT.csv', encoding='utf8', sep=';')