In [1]:
import pandas as pd
import json
import re
import os
import subprocess
import bibtexparser
from tqdm import tqdm
import re
from flair.data import Sentence
from flair.nn import Classifier
from difflib import SequenceMatcher


In [2]:
#TODO: write a function that normalizes the names of the authors from our annotations
# to fit the format generated by anystyle.io check below for examples and look for the pattern

In [2]:
def file_finder(file_str: str) -> str:
    """
    This function takes a file name and returns the path to the file in the all_data_articles.
    """
    title_doi = "../data/titles_doi.csv"
    folder_path = "../all_data_articles"
    
    #extract the doi from the file name
    doi = file_str.split("_")[-1].split(".")[0]

    # find the row in the csv file where the doi column ends with the doi
    df = pd.read_csv(title_doi)
    doi_row = df[df["DOI"].str.endswith(doi)]

    # extract the title from the row
    title_json = doi_row["Title"].values[0].replace(" ", "_") + ".json"

    for filename in os.listdir(folder_path):
        if filename.endswith(".json") and filename.startswith(title_json[:int(len(title_json)/3)]):
            return filename


In [3]:
def load_annotations(file_str: str) -> pd.DataFrame:
    """
    This function takes a file name and returns the annotations from the file.
    And also replaces missing values with None.
    """
    folder_path = "../data/annotated"

    file_path = os.path.join(folder_path, file_str)
    df = pd.read_excel(file_path)

    # replace missing values with None
    df = df.where(pd.notnull(df), None)

    # replace values marked with nan with None
    df = df.replace("nan", None)
    
    return df

In [4]:
def format_author_name(name):
    # TODO: probably need to handle more cases
    if name is None:
        return None
    if ' and ' in name:
        # Handle multiple authors
        authors = name.split(' and ')
        formatted_authors = [format_author_name(author) for author in authors]
        return ' and '.join(formatted_authors)
    else:
        parts = name.split()
        # Handle case where there is a middle initial
        if len(parts) == 3:
            return f"{parts[1]}, {parts[0]} {parts[2]}"
        # Handle case where there is no middle initial
        elif len(parts) == 2:
            return f"{parts[1]}, {parts[0]}"
        else:
            return name

In [5]:
def df_to_triplets(df: pd.DataFrame, format_author = True) -> set:
    """
    This function takes a dataframe and returns a set of triplets.
    """
    triplets = set()
    for i in range(len(df)):
        if format_author:
            triplet = (df.iloc[i]["Footnote"], format_author_name(df.iloc[i]["Authors"]), df.iloc[i]["Title"])
        else:
            triplet = (df.iloc[i]["Footnote"], df.iloc[i]["Authors"], df.iloc[i]["Title"])
        triplets.add(triplet)
    return triplets

In [6]:
def dict_to_triplets(extraction: dict) -> set:
    """
    Converts a dictionary of footnotes to a set of triplets
    """
    triplets = set()

    for number, references in extraction.items():
        for reference in references:
            author = reference[0]
            title = reference[1]

            if author == "":
                author = None

            if title == "":
                title = None
            
            triplets.add((int(number), author, title))

    return triplets

In [20]:
def information_extraction(file_path: str) -> set:
    """
    This function takes a file path and returns a set of triplets.
    """
    path = "../all_data_articles"
    file_path = os.path.join(path, file_path)
    article = json.load(open(file_path, "r"))
    extraction = {}

    prev_footnote = None

    for number, footnote in tqdm(article["footnotes"].items()):

        # If the footnote is ibid, use the previous footnote
        if footnote.startswith("Ibid"):
            footnote = prev_footnote

        # Store the footnote for the next iteration
        prev_footnote = footnote
        
        references = re.split('; see |; |. See |, see | .See also',footnote)
        
        author_title_list = []

        for reference in references:

            command = ['ruby', 'anystyle.rb', str(reference)]
            bibtex = subprocess.run(command, stdout=subprocess.PIPE, text=True).stdout
            parsed_bibtex = bibtexparser.loads(bibtex).entries
            if parsed_bibtex:
                parsed_bibtex = parsed_bibtex[0]
            else:
                #print(f"No valid BibTeX entry found in: {bibtex}, set to empty dict")
                parsed_bibtex = {}

            #print(parsed_bibtex)

            # Extract title, prioritizing 'booktitle' if both 'title' and 'booktitle' are present
            title = parsed_bibtex.get('booktitle', parsed_bibtex.get('title', None))

            # Extract author information
            author = parsed_bibtex.get('author', None)

            if author is not None or title is not None:
                # Append author and title pair to the list
                author_title_list.append([author, title])

        # Store the list in the extraction dictionary with the footnote number as the key
        extraction[number] = author_title_list

    return dict_to_triplets(extraction)

In [8]:
def calculate_scores(triplets, extractions):
    TP = len(triplets & extractions)  # Intersection of triplets and extractions
    FP = len(extractions - triplets)  # Elements in extractions but not in triplets
    FN = len(triplets - extractions)  # Elements in triplets but not in extractions

    recall = TP / (TP + FN) if TP + FN != 0 else 0
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    return recall, precision, f_score

In [9]:
# TODO: add some how to check for similarity between the authors and titles from the two sets. if for footnote number x 
# the authors and titles are very similar then we can assume that the extraction is correct.

def calculate_similarity(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

def evaluate_extraction(set1, set2, threshold=0.95):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for triplet1 in set1:
        footnote1, author1, title1 = triplet1
        author1 = author1 if author1 is not None else ""
        title1 = title1 if title1 is not None else ""
        concat_str1 = str(author1) + " " + str(title1)
        found_match = False

        for triplet2 in set2:
            footnote2, author2, title2 = triplet2
            author2 = author2 if author2 is not None else ""
            title2 = title2 if title2 is not None else ""
            concat_str2 = str(author2) + " " + str(title2)

            # Check for footnote number and similarity
            if footnote1 == footnote2 and calculate_similarity(concat_str1, concat_str2) >= threshold:
                found_match = True
                break

        if found_match:
            true_positives += 1
        else:
            false_negatives += 1

    false_positives = len(set2) - true_positives

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f_score

In [21]:
path_annotations = "../data/annotated"

for filename in os.listdir(path_annotations):
    print(filename)
    title_json = file_finder(filename)
    df = load_annotations(filename)
    triplets = df_to_triplets(df, format_author=True)
    extraction = information_extraction(title_json)
    recall, precision, f_score = evaluate_extraction(triplets, extraction, threshold=0.9)
    print("\n")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F-Score: {f_score}")
    print("-"* 50)
    print("\n")
    #break

Labels - https___doi.org_10.1093_ehr_ceab280.xlsx


 21%|██        | 21/100 [00:19<01:09,  1.14it/s]Entry type patent not standard. Not considered.
100%|██████████| 100/100 [01:24<00:00,  1.18it/s]




Recall: 0.39568345323741005
Precision: 0.23404255319148937
F-Score: 0.29411764705882354
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead103.xlsx


  5%|▌         | 6/115 [00:07<03:28,  1.91s/it]Entry type webpage not standard. Not considered.
 21%|██        | 24/115 [00:34<01:57,  1.30s/it]Entry type thesis not standard. Not considered.
Entry type thesis not standard. Not considered.
 26%|██▌       | 30/115 [00:47<01:59,  1.41s/it]Entry type thesis not standard. Not considered.
100%|██████████| 115/115 [02:19<00:00,  1.21s/it]




Recall: 0.5024630541871922
Precision: 0.5074626865671642
F-Score: 0.504950495049505
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_ceac260.xlsx


  7%|▋         | 12/175 [00:18<04:01,  1.48s/it]Entry type thesis not standard. Not considered.
 99%|█████████▉| 174/175 [03:24<00:00,  1.18it/s]Entry type webpage not standard. Not considered.
100%|██████████| 175/175 [03:25<00:00,  1.17s/it]




Recall: 0.391304347826087
Precision: 0.4439461883408072
F-Score: 0.41596638655462187
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead065.xlsx


100%|██████████| 142/142 [03:08<00:00,  1.33s/it]




Recall: 0.29411764705882354
Precision: 0.31390134529147984
F-Score: 0.3036876355748373
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead080.xlsx


100%|██████████| 133/133 [02:34<00:00,  1.16s/it]




Recall: 0.6979166666666666
Precision: 0.7362637362637363
F-Score: 0.716577540106952
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cew052.xlsx


100%|██████████| 148/148 [02:35<00:00,  1.05s/it]




Recall: 0.5526315789473685
Precision: 0.5412371134020618
F-Score: 0.5468750000000001
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead107.xlsx


 97%|█████████▋| 120/124 [02:51<00:04,  1.23s/it]Entry type thesis not standard. Not considered.
100%|██████████| 124/124 [02:58<00:00,  1.44s/it]




Recall: 0.4337899543378995
Precision: 0.4846938775510204
F-Score: 0.45783132530120485
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead004.xlsx


100%|██████████| 171/171 [03:03<00:00,  1.07s/it]



Recall: 0.5155555555555555
Precision: 0.5345622119815668
F-Score: 0.5248868778280543
--------------------------------------------------





In [15]:
# TODO: implement a simple approach with a just regrex to extract the author and title, and a simple split with ";"

def extract_citations(file_path: str) -> set:
    path = "../all_data_articles"
    file_path = os.path.join(path, file_path)
    article = json.load(open(file_path, "r"))
    
    citations = set()
    prev_footnote = None

    for footnote_number, footnote_text in tqdm(article["footnotes"].items()):
        # If the footnote is ibid, use the previous footnote
        if footnote_text.startswith("Ibid"):
            footnote_text = prev_footnote
        
        prev_footnote = footnote_text

        # Split the footnote into individual citations
        individual_citations = re.split('; see |; |. See |, see | .See also', footnote_text)
        
        for citation_text in individual_citations:
            # Regular expression to extract authors and titles
            # TODO: try a better pattern
            pattern = re.compile(r'^(.+?),\s+(.+?)[,|(]')

            match = pattern.match(citation_text)
            
            if match:
                author = match.group(1)
                title = match.group(2)
                citations.add((int(footnote_number), author, title))


    return citations

 

In [16]:
path_annotations = "../data/annotated"

for filename in os.listdir(path_annotations):
    print(filename)
    title_json = file_finder(filename)
    df = load_annotations(filename)
    triplets = df_to_triplets(df, format_author=False)
    extraction = extract_citations(title_json)
    recall, precision, f_score = evaluate_extraction(triplets, extraction, threshold=0.9)
    print("\n")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F-Score: {f_score}")
    print("-"* 50)
    print("\n")
    #break

Labels - https___doi.org_10.1093_ehr_ceab280.xlsx


100%|██████████| 100/100 [00:00<00:00, 102826.77it/s]




Recall: 0.49193548387096775
Precision: 0.25957446808510637
F-Score: 0.3398328690807799
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead103.xlsx


100%|██████████| 115/115 [00:00<00:00, 95400.51it/s]



Recall: 0.41899441340782123
Precision: 0.373134328358209
F-Score: 0.3947368421052632
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_ceac260.xlsx



100%|██████████| 175/175 [00:00<00:00, 107499.00it/s]




Recall: 0.5677966101694916
Precision: 0.600896860986547
F-Score: 0.5838779956427016
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead065.xlsx


100%|██████████| 142/142 [00:00<00:00, 73584.28it/s]




Recall: 0.32558139534883723
Precision: 0.31390134529147984
F-Score: 0.31963470319634707
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead080.xlsx


100%|██████████| 133/133 [00:00<00:00, 80080.74it/s]




Recall: 0.4887640449438202
Precision: 0.47802197802197804
F-Score: 0.48333333333333334
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cew052.xlsx


100%|██████████| 148/148 [00:00<00:00, 145444.47it/s]




Recall: 0.5113636363636364
Precision: 0.4639175257731959
F-Score: 0.4864864864864865
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead107.xlsx


100%|██████████| 124/124 [00:00<00:00, 94185.75it/s]




Recall: 0.4911242603550296
Precision: 0.42346938775510207
F-Score: 0.4547945205479452
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead004.xlsx


100%|██████████| 171/171 [00:00<00:00, 151825.99it/s]




Recall: 0.5968586387434555
Precision: 0.5253456221198156
F-Score: 0.5588235294117647
--------------------------------------------------


In [12]:
path_annotations = "../data/annotated"

for filename in os.listdir(path_annotations):
    print(filename)
    title_json = file_finder(filename)
    df = load_annotations(filename)
    triplets = df_to_triplets(df, format_author=False)
    extraction = extract_citations(title_json)
    recall, precision, f_score = evaluate_extraction(triplets, extraction, threshold=0.9)
    print("\n")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F-Score: {f_score}")
    print("-"* 50)
    print("\n")
    #break

Labels - https___doi.org_10.1093_ehr_ceab280.xlsx


100%|██████████| 100/100 [00:00<00:00, 103155.53it/s]



Recall: 0.49193548387096775
Precision: 0.25957446808510637
F-Score: 0.3398328690807799
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead103.xlsx



100%|██████████| 115/115 [00:00<00:00, 72762.85it/s]




Recall: 0.41899441340782123
Precision: 0.373134328358209
F-Score: 0.3947368421052632
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_ceac260.xlsx


100%|██████████| 175/175 [00:00<00:00, 75615.86it/s]




Recall: 0.559322033898305
Precision: 0.5919282511210763
F-Score: 0.5751633986928104
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead065.xlsx


100%|██████████| 142/142 [00:00<00:00, 73421.00it/s]




Recall: 0.32558139534883723
Precision: 0.31390134529147984
F-Score: 0.31963470319634707
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead080.xlsx


100%|██████████| 133/133 [00:00<00:00, 123008.25it/s]




Recall: 0.4887640449438202
Precision: 0.47802197802197804
F-Score: 0.48333333333333334
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cew052.xlsx


100%|██████████| 148/148 [00:00<00:00, 149760.43it/s]




Recall: 0.5113636363636364
Precision: 0.4639175257731959
F-Score: 0.4864864864864865
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead107.xlsx


100%|██████████| 124/124 [00:00<00:00, 76159.57it/s]




Recall: 0.4911242603550296
Precision: 0.42346938775510207
F-Score: 0.4547945205479452
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead004.xlsx


100%|██████████| 171/171 [00:00<00:00, 145069.98it/s]




Recall: 0.5968586387434555
Precision: 0.5253456221198156
F-Score: 0.5588235294117647
--------------------------------------------------


In [19]:
# TODO: write a function that extracts the author and title from the footnote using a tagger (i.e. flair)

def tagger_information_extraction(file_path: str, tagger) -> set:
    path = "../all_data_articles"
    file_path = os.path.join(path, file_path)
    article = json.load(open(file_path, "r"))
    
    citations = set()
    prev_footnote = None

    for footnote_number, footnote_text in tqdm(article["footnotes"].items()):
        # If the footnote is ibid, use the previous footnote
        if footnote_text.startswith("Ibid"):
            footnote_text = prev_footnote
        
        prev_footnote = footnote_text

        # Split the footnote into individual citations
        individual_citations = re.split('; see |; |. See |, see | .See also', footnote_text)

        for citation_text in individual_citations:
            
            author = None

            sentence = Sentence(citation_text)
            tagger.predict(sentence)
            for span in sentence.get_spans('ner'):
                if span.tag == "PERSON" or span.tag == "ORG":
                    if author is None:
                        author = span.text
                    else:
                        author += ("and " + span.text)
                if span.tag == "WORK_OF_ART":
                    citations.add((int(footnote_number), author, span.text))
                    author = None

                
    return citations


In [18]:
path_annotations = "../data/annotated"
# load the NER tagger
tagger = Classifier.load('ner-ontonotes-large')

for filename in os.listdir(path_annotations):
    print(filename)
    title_json = file_finder(filename)
    df = load_annotations(filename)
    triplets = df_to_triplets(df, format_author=False)
    extraction = tagger_information_extraction(title_json, tagger=tagger)
    recall, precision, f_score = evaluate_extraction(triplets, extraction, threshold=0.90)
    print("\n")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F-Score: {f_score}")
    print("-"* 50)
    print("\n")

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2023-12-18 21:01:15,904 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
Labels - https___doi.org_10.1093_ehr_ceab280.xlsx


100%|██████████| 100/100 [00:04<00:00, 20.34it/s]




Recall: 0.48951048951048953
Precision: 0.2978723404255319
F-Score: 0.37037037037037035
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead103.xlsx


100%|██████████| 115/115 [00:06<00:00, 18.59it/s]




Recall: 0.3941176470588235
Precision: 0.3333333333333333
F-Score: 0.36118598382749323
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_ceac260.xlsx


100%|██████████| 175/175 [00:07<00:00, 22.06it/s]




Recall: 0.5180722891566265
Precision: 0.38565022421524664
F-Score: 0.44215938303341906
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead065.xlsx


100%|██████████| 142/142 [00:06<00:00, 20.66it/s]




Recall: 0.4114583333333333
Precision: 0.3542600896860987
F-Score: 0.380722891566265
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead080.xlsx


100%|██████████| 133/133 [00:05<00:00, 23.73it/s]




Recall: 0.6763005780346821
Precision: 0.6428571428571429
F-Score: 0.6591549295774648
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cew052.xlsx


100%|██████████| 148/148 [00:05<00:00, 27.03it/s]




Recall: 0.5829383886255924
Precision: 0.634020618556701
F-Score: 0.6074074074074074
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead107.xlsx


100%|██████████| 124/124 [00:07<00:00, 16.79it/s]




Recall: 0.5125628140703518
Precision: 0.5204081632653061
F-Score: 0.5164556962025317
--------------------------------------------------


Labels - https___doi.org_10.1093_ehr_cead004.xlsx


100%|██████████| 171/171 [00:06<00:00, 25.45it/s]



Recall: 0.5513513513513514
Precision: 0.4700460829493088
F-Score: 0.5074626865671642
--------------------------------------------------






In [16]:
def extract_triplets(triplets):
    return list(filter(lambda triplet: 10 < triplet[0] < 20, triplets))

In [17]:
extract_triplets(extraction)

[(12, None, '–1461'),
 (15, None, 'Bulletin of the Institute of Historical Research'),
 (15,
  'William Clerionetand Richard of Gloucesterand Katherineand A.J. Pollard',
  'Lord Fitzhugh’s Rising in 1470’'),
 (12,
  'R.A. Griffiths',
  'The Reign of King Henry VI: The Exercise of Royal Authority'),
 (11, 'S. Federico', 'The Imaginary Society: Women in 1381’'),
 (13, None, '1300–1348'),
 (13, 'B. Hanawalt', 'Crime and Conflict in English Communities'),
 (17, 'Warwickand Pollard', '‘Lord Fitzhugh’s Rising’'),
 (19,
  'Richard Salkeld',
  'Calendar of the Patent Rolls Preserved in the Public Record Office')]

In [18]:
extract_triplets(triplets)

[(11.0, 'S. Federico', 'The Imaginary Society: Women in 1381'),
 (16.0, None, None),
 (17.0, 'Pollard', 'Lord Fitzhugh’s Rising’'),
 (14.0, None, None),
 (13.0, 'B. Hanawalt', 'rime and Conflict in English Communities, 1300–1348'),
 (15.0, 'A.J. Pollard', 'Lord Fitzhugh’s Rising in 1470'),
 (18.0, 'Pollard', 'Lord Fitzhugh’s Rising’'),
 (12.0,
  'R.A. Griffiths',
  'The Reign of King Henry VI: The Exercise of Royal Authority, 1422–1461'),
 (19.0, None, None)]

In [19]:
#(113, 'M. Young and P. Willmott', 'Family and Kinship in East London'),
#('113', 'Young, M. and Willmott, P.', 'Family and Kinship in East London'),