In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from thefuzz import process
from pathlib import Path
import xml.etree.ElementTree as ET

In [2]:
# Load the CEUR paper metadata that the following cells enrich with affiliation data
df_ceur = pd.read_parquet("../data/metadata_CEUR.parquet")

In [5]:
# Load the ISO 3166-1 table so that country code abbreviations can be resolved to country names

with open("../data/iso3166-1.json", 'r', encoding="utf-8") as file:
    Country_codes = json.load(file)


# Lookup table: alpha-2 country code -> country name
country_codes_reformatted = {
    entry["alpha_2"]: entry["name"] for entry in Country_codes["3166-1"]
}

In [6]:
# Extract metadata from OpenAlex: author names, institution names and the country of each institution

def calculate_most_active_countries_institutions(country_codes_dict,df,path):
    """Attach the OpenAlex affiliation triples of every work to ``df``.

    Args:
        country_codes_dict: Mapping from country code (as used in the OpenAlex
            ``country_code`` field) to a country name.
        df: DataFrame with an ``ID`` column whose values match the top-level
            keys of the OpenAlex JSON file.
        path: Path to a JSON file mapping work ID -> OpenAlex work record.

    Returns:
        The inner join of ``df`` with a new ``Countries`` column. Each cell is
        a list of unique ``[country, institution, author]`` triples in order
        of first appearance. Rows of ``df`` without a record in the JSON file
        are dropped by the inner join.
    """
    with open(path, 'r', encoding="utf-8") as file:
        works = json.load(file)

    records = []
    for work_id, work in works.items():
        triples = []
        seen = set()  # constant-time duplicate check; `triples` keeps the order
        for authorship in work["authorships"]:
            for institution in authorship["institutions"]:
                code = institution["country_code"]
                institution_name = institution["display_name"]
                if code is None or institution_name is None:
                    continue
                # Fall back to the raw code instead of raising KeyError when
                # the code is missing from the lookup table.
                country = country_codes_dict.get(code, code)
                author_name = authorship["author"]["display_name"]
                key = (country, institution_name, author_name)
                if key not in seen:
                    seen.add(key)
                    triples.append([country, institution_name, author_name])

        records.append([work_id, triples])

    df_countries = pd.DataFrame(records, columns=["ID", "Countries"])
    return pd.merge(df, df_countries, how="inner", on="ID")


In [7]:
# Attach the OpenAlex-derived [country, institution, author] triples to each CEUR paper.
# The function uses an inner join on "ID", so papers without an OpenAlex record are dropped.
# NOTE(review): df_ceur is overwritten with the merge result; re-running this cell without
# reloading the frame first merges onto an already-merged frame.
df_ceur = calculate_most_active_countries_institutions(country_codes_reformatted, df_ceur, path="../data/OpenAlex_CEUR.json" )

In [8]:
# Inspect the merged frame; the "Countries" column holds the OpenAlex triples
df_ceur

Unnamed: 0,PubYear,CEUR Title,Volume,filename,Title,Section,Authors,url,ID,Countries
0,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090001.pdf,Task 1 of the CLEF eHealth Evaluation Lab 2016...,CLEFeHealth,"[Hanna Suominen, Liyuan Zhou, Lorraine Goeurio...",https://ceur-ws.org/Vol-1609/16090001.pdf,ceur_1019,"[[Australia, Data61, Hanna Suominen], [France,..."
1,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090028.pdf,Clinical Information Extraction at the CLEF eH...,CLEFeHealth,"[Aurélie Névéol, Kevin Bretonnel Cohen, Cyril ...",https://ceur-ws.org/Vol-1609/16090028.pdf,ceur_1021,"[[France, Laboratoire d'Informatique pour la M..."
2,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090043.pdf,Comparison of Several Word embedding Sources f...,CLEFeHealth,"[Julie Budaher, Mohannad Almasri, Lorraine Goe...",https://ceur-ws.org/Vol-1609/16090043.pdf,ceur_1022,[]
3,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090047.pdf,SIBM at CLEF eHealth Evaluation Lab 2016: Extr...,CLEFeHealth,"[Chloé Cabot, Lina F. Soualmia, Badisse Dahamn...",https://ceur-ws.org/Vol-1609/16090047.pdf,ceur_1023,"[[France, Laboratoire d'Informatique, du Trait..."
4,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090061.pdf,ECSTRA-INSERM @ CLEF eHealth2016-task 2: ICD10...,CLEFeHealth,"[Mohamed Dermouche, Vincent Looten, Rémi Flico...",https://ceur-ws.org/Vol-1609/16090061.pdf,ceur_1024,"[[France, Sorbonne Université, Vincent Looten]..."
...,...,...,...,...,...,...,...,...,...,...
2853,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-183.pdf,Solutions for Fine-grained and Long-tailed Sna...,LifeCLEF: Biodiversity identification and pred...,"[Cheng Zou, Furong Xu, Meng Wang, Wen Li, Yuan...",https://ceur-ws.org/Vol-3180/paper-183.pdf,ceur_3250,[]
2854,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-235.pdf,Overview of the CLEF 2022 SimpleText Task 1: P...,SimpleText: Automatic Simplification of Scient...,"[Eric Sanjuan, Stéphane Huet, Jaap Kamps, Lian...",https://ceur-ws.org/Vol-3180/paper-235.pdf,ceur_3302,"[[France, Laboratoire Informatique d'Avignon, ..."
2855,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-236.pdf,Overview of the CLEF 2022 SimpleText Task 2: C...,SimpleText: Automatic Simplification of Scient...,"[Liana Ermakova, Irina Ovchinnikov, Jaap Kamps...",https://ceur-ws.org/Vol-3180/paper-236.pdf,ceur_3303,"[[Singapore, Ministry of Manpower, И. Г. Овчин..."
2856,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-237.pdf,Overview of the CLEF 2022 SimpleText Task 3: Q...,SimpleText: Automatic Simplification of Scient...,"[Liana Ermakova, Irina Ovchinnikov, Jaap Kamps...",https://ceur-ws.org/Vol-3180/paper-237.pdf,ceur_3304,"[[Russian Federation, Sechenov University, Iri..."


In [9]:
import os

def find_all_subdirectories(basisverzeichnis):
    """Return the paths of all directories nested below ``basisverzeichnis``.

    Args:
        basisverzeichnis: Root directory to walk (it is not itself part of
            the result).

    Returns:
        A list of directory paths, each joined onto the directory it was
        found in, in the order ``os.walk`` yields them.
    """
    subdirectories = []
    # Walk the directory passed in by the caller, not a module-level global.
    for root, dirs, _files in os.walk(basisverzeichnis):
        subdirectories.extend(os.path.join(root, name) for name in dirs)
    return subdirectories

# Root directory containing the GROBID TEI-XML conversions of the CEUR PDFs
basedirectory = "../data/resources/XML_CEUR/"

# Get a list of all subdirectories
subdirectories = find_all_subdirectories(basedirectory)

# Drop the first 24 entries of the walk result.
# NOTE(review): presumably these are the top-level volume directories, which must not
# compete with the section directories during fuzzy matching. The index is hard-coded,
# so confirm it still matches the directory layout whenever volumes are added or removed.
subdirectories = subdirectories[24:]


In [10]:
# Keep only the last path component of each subdirectory; these names are the fuzzy-matching candidates
list_of_subdirectory_names = [os.path.basename(pfad) for pfad in subdirectories]

In [11]:
# One single-entry dict {section title: best-matching directory name} per distinct section title
match_list = []

# Fuzzy matching is costly, so each distinct section title is resolved once
# instead of once per paper. unique() keeps first-appearance order, so the
# resulting list is the same as when iterating over every row.
for name_1 in df_ceur["Section"].unique():
    best_match = process.extractOne(name_1, list_of_subdirectory_names)

    # extractOne returns (candidate, score); only the candidate is needed
    if best_match:
        name_2 = best_match[0]
        match_list.append({name_1: name_2})



In [12]:
def replace_section(section, match_list):
    """Look up the replacement for ``section`` in a list of mapping dicts.

    The first dict in ``match_list`` that contains ``section`` as a key
    supplies the result; if none does, ``section`` is returned unchanged.
    """
    first_hit = next((mapping for mapping in match_list if section in mapping), None)
    if first_hit is None:
        return section
    return first_hit[section]


# Map every section title to its fuzzy-matched directory name so it can be used in file paths;
# titles without an entry in match_list are kept unchanged
df_ceur['Section_not_whitespace'] = df_ceur['Section'].apply(lambda x: replace_section(x, match_list))


In [13]:
# Build, for every paper, the path of its GROBID TEI-XML file:
# <XML root>/<Volume>/<matched section directory>/<filename with .pdf replaced by .tei.xml>

paths = [
    "../data/resources/XML_CEUR/" + volume + "/" + section_dir + "/" + pdf_name.replace(".pdf", ".tei.xml")
    for volume, section_dir, pdf_name in zip(
        df_ceur["Volume"], df_ceur["Section_not_whitespace"], df_ceur["filename"]
    )
]

In [14]:
# Store the TEI-XML path of each paper next to its metadata
df_ceur["filepath"] = paths

In [15]:

# Extract author names, institutions and the institutions' countries from the TEI-XML files produced by GROBID

def _element_text(parent, xpath, ns):
    """Return the text of the first element matching ``xpath`` below ``parent``, or "" if it is missing or empty."""
    elem = parent.find(xpath, namespaces=ns)
    if elem is None or elem.text is None:
        return ""
    return elem.text


def extract_authors_info(path):
    """Collect ``[country, organisation, author name]`` triples from one TEI-XML file.

    Only authors listed below the first ``sourceDesc`` element are considered.
    One triple is emitted per affiliation that has both a non-empty
    ``orgName`` and a non-empty ``address/country``.

    Args:
        path: Location of the TEI-XML file.

    Returns:
        A list of ``[country, organisation, author name]`` lists; empty when
        the file has no ``sourceDesc`` element or no usable affiliation.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    ns = {'ns0': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(Path(path)).getroot()

    source_desc_elem = root.find('.//ns0:sourceDesc', namespaces=ns)
    if source_desc_elem is None:
        return []

    authors_info = []
    for author_elem in source_desc_elem.findall('.//ns0:author', namespaces=ns):
        forename = _element_text(author_elem, './ns0:persName/ns0:forename', ns)
        surname = _element_text(author_elem, './ns0:persName/ns0:surname', ns)
        # Join only the parts that exist: an empty element must not show up as
        # the literal "None", and a missing part must not leave a stray space.
        author_name = " ".join(part for part in (forename, surname) if part)

        for aff_elem in author_elem.findall('./ns0:affiliation', namespaces=ns):
            org_name = _element_text(aff_elem, './ns0:orgName', ns)
            address_elem = aff_elem.find('./ns0:address', namespaces=ns)
            if address_elem is None:
                continue
            country = _element_text(address_elem, './ns0:country', ns)
            # An affiliation is only usable when both pieces are present
            if org_name and country:
                authors_info.append([country, org_name, author_name])
    return authors_info

In [16]:

list_metadata_grobid_extract = []
# (filepath, error) pairs for files that could not be read or parsed, kept for inspection
grobid_failures = []

# Extract the metadata for every paper based on its TEI-XML path.
# Extraction stays best-effort: a failing file yields an empty list. Catching
# Exception rather than using a bare except keeps KeyboardInterrupt working,
# and the failures are recorded instead of silently discarded.
for filepath in df_ceur["filepath"]:
    try:
        author_infos = extract_authors_info(filepath)
    except Exception as exc:
        grobid_failures.append((filepath, repr(exc)))
        author_infos = []
    list_metadata_grobid_extract.append(author_infos)

In [17]:
# Store the GROBID-extracted [country, organisation, author] lists next to the OpenAlex ones
df_ceur["grobid"] = list_metadata_grobid_extract

In [18]:
# Inspect the frame with both affiliation sources ("Countries" from OpenAlex, "grobid" from the TEI files)
df_ceur

Unnamed: 0,PubYear,CEUR Title,Volume,filename,Title,Section,Authors,url,ID,Countries,Section_not_whitespace,filepath,grobid
0,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090001.pdf,Task 1 of the CLEF eHealth Evaluation Lab 2016...,CLEFeHealth,"[Hanna Suominen, Liyuan Zhou, Lorraine Goeurio...",https://ceur-ws.org/Vol-1609/16090001.pdf,ceur_1019,"[[Australia, Data61, Hanna Suominen], [France,...",CLEFeHealth,../data/resources/XML_CEUR/Vol-1609/CLEFeHealt...,"[[Australia, University of Turku, Hanna Suomin..."
1,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090028.pdf,Clinical Information Extraction at the CLEF eH...,CLEFeHealth,"[Aurélie Névéol, Kevin Bretonnel Cohen, Cyril ...",https://ceur-ws.org/Vol-1609/16090028.pdf,ceur_1021,"[[France, Laboratoire d'Informatique pour la M...",CLEFeHealth,../data/resources/XML_CEUR/Vol-1609/CLEFeHealt...,"[[France, LIMSI, Aurélie Névéol], [France, LIM..."
2,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090043.pdf,Comparison of Several Word embedding Sources f...,CLEFeHealth,"[Julie Budaher, Mohannad Almasri, Lorraine Goe...",https://ceur-ws.org/Vol-1609/16090043.pdf,ceur_1022,[],CLEFeHealth,../data/resources/XML_CEUR/Vol-1609/CLEFeHealt...,[]
3,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090047.pdf,SIBM at CLEF eHealth Evaluation Lab 2016: Extr...,CLEFeHealth,"[Chloé Cabot, Lina F. Soualmia, Badisse Dahamn...",https://ceur-ws.org/Vol-1609/16090047.pdf,ceur_1023,"[[France, Laboratoire d'Informatique, du Trait...",CLEFeHealth,../data/resources/XML_CEUR/Vol-1609/CLEFeHealt...,"[[France, SIBM, Chloé Cabot], [France, SIBM, L..."
4,2016,Working Notes of CLEF 2016 - Conference and La...,Vol-1609,16090061.pdf,ECSTRA-INSERM @ CLEF eHealth2016-task 2: ICD10...,CLEFeHealth,"[Mohamed Dermouche, Vincent Looten, Rémi Flico...",https://ceur-ws.org/Vol-1609/16090061.pdf,ceur_1024,"[[France, Sorbonne Université, Vincent Looten]...",CLEFeHealth,../data/resources/XML_CEUR/Vol-1609/CLEFeHealt...,"[[France, U1153 Epidemiology and Biostatistics..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2853,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-183.pdf,Solutions for Fine-grained and Long-tailed Sna...,LifeCLEF: Biodiversity identification and pred...,"[Cheng Zou, Furong Xu, Meng Wang, Wen Li, Yuan...",https://ceur-ws.org/Vol-3180/paper-183.pdf,ceur_3250,[],LifeCLEF__Biodiversity_identification_and_pred...,../data/resources/XML_CEUR/Vol-3180/LifeCLEF__...,"[[China, Ant Group, Cheng Zou], [China, Ant Gr..."
2854,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-235.pdf,Overview of the CLEF 2022 SimpleText Task 1: P...,SimpleText: Automatic Simplification of Scient...,"[Eric Sanjuan, Stéphane Huet, Jaap Kamps, Lian...",https://ceur-ws.org/Vol-3180/paper-235.pdf,ceur_3302,"[[France, Laboratoire Informatique d'Avignon, ...",SimpleText__Automatic_Simplification_of_Scient...,../data/resources/XML_CEUR/Vol-3180/SimpleText...,"[[France, Avignon Université, Eric Sanjuan], [..."
2855,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-236.pdf,Overview of the CLEF 2022 SimpleText Task 2: C...,SimpleText: Automatic Simplification of Scient...,"[Liana Ermakova, Irina Ovchinnikov, Jaap Kamps...",https://ceur-ws.org/Vol-3180/paper-236.pdf,ceur_3303,"[[Singapore, Ministry of Manpower, И. Г. Овчин...",SimpleText__Automatic_Simplification_of_Scient...,../data/resources/XML_CEUR/Vol-3180/SimpleText...,"[[France, Université de Bretagne Occidentale, ..."
2856,2022,Proceedings of the Working Notes of CLEF 2022 ...,Vol-3180,paper-237.pdf,Overview of the CLEF 2022 SimpleText Task 3: Q...,SimpleText: Automatic Simplification of Scient...,"[Liana Ermakova, Irina Ovchinnikov, Jaap Kamps...",https://ceur-ws.org/Vol-3180/paper-237.pdf,ceur_3304,"[[Russian Federation, Sechenov University, Iri...",SimpleText__Automatic_Simplification_of_Scient...,../data/resources/XML_CEUR/Vol-3180/SimpleText...,"[[France, Université de Bretagne Occidentale, ..."


In [19]:
# Persist the enriched frame, including the OpenAlex and GROBID affiliation columns
df_ceur.to_parquet("../data/Network_preparations_CEUR.parquet")