In [80]:
import pandas as pd
import json
import re
import os
import subprocess
import bibtexparser
from tqdm import tqdm

In [None]:
# TODO: write a function that normalizes the author names from our annotations
# to fit the format generated by anystyle.io. Check the examples below and look for the pattern.

In [75]:
# Exploratory run: parse the footnotes of the "Royal_Companies*" article(s) with
# anystyle and collect [author, title] pairs per footnote number.
folder_path = "../all_data_articles"

for filename in os.listdir(folder_path):
    if filename.endswith(".json") and filename.startswith("Royal_Companies"):
        file_path = os.path.join(folder_path, filename)
        # Context manager so the file handle is closed as soon as the JSON is read
        with open(file_path, "r", encoding="utf-8") as article_file:
            article = json.load(article_file)

        # Maps footnote number -> list of [author, title] pairs
        extraction = {}

        for number, footnote in article["footnotes"].items():
            # A footnote may cite several works separated by ';'
            references = re.split(';',footnote)

            author_title_list = []

            for reference in references:
                command = ['ruby', 'anystyle.rb', str(reference)]
                bibtex = subprocess.run(command, stdout=subprocess.PIPE, text=True).stdout
                entries = bibtexparser.loads(bibtex).entries

                # Nothing parsed for this reference: skip it instead of
                # failing with an IndexError on entries[0]
                if not entries:
                    continue
                parsed_bibtex = entries[0]

                # Extract title, prioritizing 'booktitle' if both 'title' and 'booktitle' are present
                title = parsed_bibtex.get('booktitle', parsed_bibtex.get('title', None))

                # Extract author information
                author = parsed_bibtex.get('author', None)

                # Append author and title pair to the list
                author_title_list.append([author, title])

            # Store the list in the extraction dictionary with the footnote number as the key
            extraction[number] = author_title_list

            # Only the footnotes up to number "8" are processed in this exploratory run
            if number == "8":
                break
                

In [76]:
# Inspect the per-footnote [author, title] pairs collected by the loop above.
extraction 

{'1': [['Paris',
   'Z/1d/85, fo. 1v, summary register documenting the policy on the Amitié'],
  [None,
   'fos 4v–6, record of the arbitration dispute between the two companies']],
 '2': [[None,
   'The key register from the Royal Insurance Company that is used is its arbitration register: AN, Z/1d/84. The key bundle of documents for the Parisian admiralty court is AN, Z/1d/109']],
 '3': [[None, 'Marine Insurance: Origins and Institutions, 1300–1850'],
  ['Kingston, C.',
   'Governance and Institutional Change in Marine Insurance, 1350–1850’']],
 '4': [[None,
   'The only semi-extensive treatment of the Royal Insurance Company in the past century has been Louis-Augustin Boiteux’s brief and imbalanced study in L.-A. Boiteux, L’assurance maritime à Paris sous le règne de Louis XIV']],
 '5': [['Wubs-Mrozewicz, J.',
   'Conflict Management and Interdisciplinary History: Presentation of a New Project and an Analytical Model’'],
  [None,
   'Conflict Management in the Mediterranean and the 

Loading the annotated data and converting it to a set of triplets

In [56]:
def file_finder(file_str: str) -> "str | None":
    """
    Find the article JSON in ``../all_data_articles`` that belongs to an annotation file.

    The DOI suffix is taken from the annotation file name (the text after the
    last ``_`` and before the first following ``.``). It is looked up in the
    ``DOI`` column of ``../data/titles_doi.csv``, and the matching ``Title`` is
    turned into a file name (spaces -> underscores, ``.json`` appended). Only the
    first third of that name is used as a prefix when scanning the folder.

    Parameters
    ----------
    file_str : str
        Name of the annotation file; its last ``_``-separated part must start
        with the DOI suffix.

    Returns
    -------
    str or None
        The *file name* (not the full path) of the first matching ``.json``
        article, or ``None`` when the DOI is not listed in the CSV or no
        article file matches the title prefix.
    """
    title_doi = "../data/titles_doi.csv"
    folder_path = "../all_data_articles"

    # extract the doi suffix from the file name
    doi = file_str.split("_")[-1].split(".")[0]

    # find the row in the csv file where the doi column ends with the doi;
    # na=False keeps rows with a missing DOI from producing an invalid (NaN) mask
    df = pd.read_csv(title_doi)
    doi_row = df[df["DOI"].str.endswith(doi, na=False)]
    if doi_row.empty:
        return None

    # build the expected file name from the title of the first matching row
    title_json = doi_row["Title"].values[0].replace(" ", "_") + ".json"
    # only the first third is compared (prefix match against the stored file names)
    prefix = title_json[: len(title_json) // 3]

    for filename in os.listdir(folder_path):
        if filename.endswith(".json") and filename.startswith(prefix):
            return filename

    return None


In [109]:
def load_annotations(file_str: str) -> pd.DataFrame:
    """
    Load one annotation spreadsheet from ``../data/annotated``.

    Missing cells and cells holding the literal string ``"nan"`` are both
    replaced with ``None``.

    Parameters
    ----------
    file_str : str
        File name of the Excel annotation file inside ``../data/annotated``.

    Returns
    -------
    pd.DataFrame
        The sheet content. All columns are returned with ``object`` dtype so
        that ``None`` can be stored in every column.
    """
    folder_path = "../data/annotated"

    file_path = os.path.join(folder_path, file_str)
    df = pd.read_excel(file_path)

    # replace missing values with None; the cast to object is required because
    # numeric columns cannot hold None and would silently keep NaN instead
    df = df.astype(object).where(df.notna(), None)

    # replace values marked with the string "nan" with None (dict form states
    # the replacement value explicitly)
    df = df.replace({"nan": None})

    return df

In [None]:
def convert_author_format(author: str) -> str:
    """
    Convert an annotated author string to the "Surname, Initials" format
    produced by anystyle.

    Authors joined by " and " are converted one by one, e.g.
    ``"M. Young and P. Willmott"`` -> ``"Young, M. and Willmott, P."``.

    The conversion is deliberately conservative: a name is only reordered when
    it starts with one or more initials (``"B."``, ``"L.-A."``) followed by the
    surname. Names without leading initials, and names that already contain a
    comma, are returned unchanged (apart from surrounding whitespace).

    Parameters
    ----------
    author : str
        Author string as written in the annotations. Non-string values
        (e.g. ``None`` for a missing author) are returned as they are.

    Returns
    -------
    str
        The author string in "Surname, Initials" order.
    """
    if not isinstance(author, str):
        return author

    # one or more initials ("B.", "L.-A.", "W. J.") followed by the surname
    initials_then_surname = re.compile(r"((?:(?:[^\W\d_]\.-?)+\s+)+)(.+)")

    def _flip(name: str) -> str:
        name = name.strip()
        # a comma means the name is already inverted (or is a list we cannot split safely)
        if "," in name:
            return name
        match = initials_then_surname.fullmatch(name)
        if match is None:
            return name
        given = " ".join(match.group(1).split())
        return f"{match.group(2)}, {given}"

    return " and ".join(_flip(part) for part in re.split(r"\s+and\s+", author))

In [107]:
def df_to_triplets(df: pd.DataFrame) -> set:
    """
    Collect the distinct (Footnote, Authors, Title) triplets of a dataframe.

    Each row contributes one tuple built from its "Footnote", "Authors" and
    "Title" columns; duplicate rows collapse into a single set element.
    """
    # Walk the frame positionally, one row at a time
    rows = (df.iloc[position] for position in range(len(df)))
    return {(row["Footnote"], row["Authors"], row["Title"]) for row in rows}

In [97]:
def dict_to_triplets(extraction: dict) -> set:
    """
    Flatten a footnote dictionary into a set of (footnote, author, title) triplets.

    ``extraction`` maps a footnote number to a list of references, where each
    reference holds the author at index 0 and the title at index 1. The footnote
    number is converted to ``int`` in the resulting tuples.
    """
    return {
        (int(footnote_number), pair[0], pair[1])
        for footnote_number, pairs in extraction.items()
        for pair in pairs
    }

In [92]:
def _parse_reference(reference: str):
    """
    Run anystyle on a single reference string.

    Returns the ``[author, title]`` pair of the first parsed BibTeX entry, or
    ``None`` when anystyle produced no entry at all.
    """
    command = ['ruby', 'anystyle.rb', str(reference)]
    bibtex = subprocess.run(command, stdout=subprocess.PIPE, text=True).stdout
    entries = bibtexparser.loads(bibtex).entries
    if not entries:
        return None
    parsed_bibtex = entries[0]

    # Extract title, prioritizing 'booktitle' if both 'title' and 'booktitle' are present
    title = parsed_bibtex.get('booktitle', parsed_bibtex.get('title', None))

    # Extract author information
    author = parsed_bibtex.get('author', None)

    return [author, title]


def information_extraction(file_path: str) -> set:
    """
    Extract (footnote number, author, title) triplets from an article JSON file.

    Each footnote under the ``"footnotes"`` key is split on ``;`` into
    references, and every reference is parsed with anystyle. A footnote starting
    with "Ibid" is replaced by the footnote before it.

    Parameters
    ----------
    file_path : str
        Path to the article JSON file.

    Returns
    -------
    set
        The triplets produced by ``dict_to_triplets`` from the parsed footnotes.
        References that are blank or that anystyle cannot parse contribute
        nothing.
    """
    # Context manager so the file handle is closed as soon as the JSON is read
    with open(file_path, "r", encoding="utf-8") as article_file:
        article = json.load(article_file)

    extraction = {}

    prev_footnote = None

    for number, footnote in tqdm(article["footnotes"].items()):

        # If the footnote is ibid, use the previous footnote. When there is no
        # previous footnote (ibid in the very first one) the text is kept as is.
        if footnote.startswith("Ibid") and prev_footnote is not None:
            footnote = prev_footnote

        # Store the footnote for the next iteration
        prev_footnote = footnote

        author_title_list = []

        for reference in re.split(';', footnote):
            # A trailing ';' leaves an empty piece: nothing to parse
            if not reference.strip():
                continue

            author_title = _parse_reference(reference)
            if author_title is not None:
                author_title_list.append(author_title)

        # Store the list in the extraction dictionary with the footnote number as the key
        extraction[number] = author_title_list

    return dict_to_triplets(extraction)

In [104]:
# Compare annotations with the anystyle extraction for the first annotated
# article that has a matching JSON file.
path_annotations = "../data/annotated"
articles_path = "../all_data_articles"

for filename in os.listdir(path_annotations):
    print(filename)

    # Name of the article JSON that belongs to this annotation file
    title_json = file_finder(filename)
    if title_json is None:
        # no matching article: try the next annotation file
        continue

    # Build the path from the matched article rather than relying on a
    # `file_path` variable left over from an earlier cell
    file_path = os.path.join(articles_path, title_json)

    df = load_annotations(filename)
    triplets = df_to_triplets(df)
    extraction = information_extraction(file_path)

    # only the first annotated article is processed for now
    break

Labels - https___doi.org_10.1093_ehr_cew052.xlsx


100%|██████████| 148/148 [01:05<00:00,  2.25it/s]


In [105]:
# Show the triplets returned by information_extraction in the previous cell.
extraction

{('1', '', 'What is Living and What is Dead in Socialism’'),
 ('10',
  'Jackson, B.',
  'Equality and the British Left: A Study in Progressive Political Thought'),
 ('100', 'Naughton, B.', 'The Spiv’'),
 ('101', 'Hinton, Mass', None),
 ('102', 'Rudd, D.', 'Naughton, William John Francis (1910–1992)’'),
 ('103', 'Madge, C.', 'Commentary’'),
 ('104', 'Naughton', 'The Spiv’'),
 ('105',
  None,
  'Hilton played up the dirtiness of his characters as a direct attack on George Orwell’s cultural politics of smell and class sensibility'),
 ('106', 'Naughton', 'The Spiv’'),
 ('107', 'Naughton', 'The Spiv’'),
 ('108', 'Madge', 'Drinking in Bolton’'),
 ('109',
  'Black, L.',
  'The Political Culture of the Left in Affluent Britain, 1951–64'),
 ('109',
  'Middleton, S.',
  '“Affluence” and the Left in Britain, c .1958–1974’'),
 ('11', 'Brooke, Labour’s War', None),
 ('110',
  'Hornsey, R.',
  'The Spiv and the Architect: Unruly Life in Postwar London'),
 ('111',
  'Jackson, Equality and Left, Briti

In [108]:
# Show the triplets built from the annotation dataframe, for comparison with the extraction.
df_to_triplets(df)

{(1, 'Young and E. Shils', 'What is Living and What is Dead in Socialism’'),
 (10,
  'B. Jackson',
  'Equality and the British Left: A Study in Progressive Political Thought'),
 (100, 'B. Naughton', 'The Spiv’, Pilot Papers'),
 (101, 'Hinton', 'Mass - Observers'),
 (102, 'D. Rudd', 'Naughton, William John Francis (1910–1992)'),
 (103, 'C. Madge', 'Commentary’, Pilot Papers'),
 (104, 'Naughton', 'The Spiv'),
 (105, 'Hilton', 'English Ways'),
 (106, 'Naughton', 'The Spiv'),
 (107, 'Naughton', 'The Spiv'),
 (108, 'Madge', 'Drinking in Bolton'),
 (109,
  'L. Black',
  'The Political Culture of the Left in Affluent Britain, 1951–64'),
 (109, 'S. Middleton', '“Affluence” and the Left in Britain, c .1958–1974'),
 (11, 'Brooke', 'Labour’s War'),
 (110,
  'R. Hornsey',
  'The Spiv and the Architect: Unruly Life in Postwar London'),
 (111, 'Jackson', 'Equality and the British Left'),
 (111,
  'L. Black',
  'Social Democracy as a Way of Life: Fellowship and the Socialist Union, 1951–9'),
 (112, '

In [None]:
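# Example of the author-format mismatch for the same reference (footnote 113):
# annotation: (113, 'M. Young and P. Willmott', 'Family and Kinship in East London'),
# anystyle:   ('113', 'Young, M. and Willmott, P.', 'Family and Kinship in East London'),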