In [17]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
from pathlib import Path
import xml.etree.ElementTree as ET


In [18]:
df_lncs = pd.read_parquet("../data/metadata_LNCS.parquet")
df_lncs.loc[df_lncs['ID'] == "lncs_649", 'Section'] = "CLEF at SemEval 2007"

In [19]:
# Read iso file to resolve country name abbrevations

with open("../data/iso3166-1.json", 'r', encoding="utf-8") as file:
    Country_codes = json.load(file)


country_codes_reformatted = {}
for i in Country_codes["3166-1"]:
    country_codes_reformatted[i["alpha_2"]] = i["name"]

In [20]:
# Extract metadata informations from OpenAlex regarding author names, institution names and location of the institution

def calculate_most_active_countries_institutions(country_codes_dict,df,path):
    with open(path, 'r', encoding="utf-8") as file:
        OpenAlexJson = json.load(file)

    countries_list = []
    for i in OpenAlexJson:
        country_codes = []
        for j in OpenAlexJson[i]["authorships"]:
            for k in j["institutions"]:
                if k["country_code"] is not None and k["display_name"] is not None:
                    if [country_codes_dict[k["country_code"]], k["display_name"], j["author"]["display_name"]] not in country_codes:
                        country_codes.append([country_codes_dict[k["country_code"]], k["display_name"], j["author"]["display_name"]])
        countries_list.append([i, country_codes])

    df_countries = pd.DataFrame(countries_list, columns=["ID", "Countries"])
    df = pd.merge(df, df_countries, how ="inner", right_on="ID",left_on="ID")
    return df


In [21]:
df_lncs = calculate_most_active_countries_institutions(country_codes_reformatted, df_lncs, path="../data/OpenAlex_LNCS.json" )

In [22]:
# Create paths to the location of the XML-transformed LNCS PDFs from GROBID

paths = []
for i, j in df_lncs.iterrows():
    path = "../data/resources/XML_LNCS/" + j["Book Subtitle"] + "/" + j["Section"]
    if j["Subsection"] == "Uncategorized":
        path = path + "/" + j["Filename"].replace(".pdf", ".tei.xml")
    else:
        path = path + "/" + j["Section"] + "/" + j["Subsection"] + "/" + j["Filename"].replace(".pdf", ".tei.xml")

    paths.append(path)

In [23]:
df_lncs["filepath"] = paths

In [24]:
df_lncs

Unnamed: 0,PubYear,Book Subtitle,Book Title,Filename,Title,Section,Subsection,DOI,Citation count,Authors & Affiliations,ID,Countries,filepath
0,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_2.pdf,CLEF 2005: Ad Hoc Track Overview,Part I. Multilingual Textual Document Retrival...,Uncategorized,https://doi.org/10.1007/11878773_2,17,"[['Department of Information Engineering, Univ...",lncs_2,"[[Italy, University of Padua, Giorgio Maria Di...",../data/resources/XML_LNCS/6th Workshop of the...
1,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_3.pdf,Ad-Hoc Mono- and Bilingual Retrieval Experimen...,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_3,Not found,"[['Information Science, University of Hildeshe...",lncs_3,"[[Germany, University of Hildesheim, René Hack...",../data/resources/XML_LNCS/6th Workshop of the...
2,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_4.pdf,MIRACLE at Ad-Hoc CLEF 2005: Merging and Combi...,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_4,4,"[['Universidad Politécnica de Madrid, \xa0', [...",lncs_4,"[[Spain, Universidad Politécnica de Madrid, Jo...",../data/resources/XML_LNCS/6th Workshop of the...
3,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_5.pdf,The XLDB Group at the CLEF 2005 Ad-Hoc Task,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_5,3,"[['Departamento de Informática, Grupo XLDB, Fa...",lncs_5,"[[Portugal, University of Lisbon, Nuno Cardoso...",../data/resources/XML_LNCS/6th Workshop of the...
4,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_6.pdf,Thomson Legal and Regulatory Experiments at CL...,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_6,1,"[['Thomson Legal and Regulatory, 610 Opperman ...",lncs_6,"[[United States, Thomson Reuters (United State...",../data/resources/XML_LNCS/6th Workshop of the...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_28.pdf,Recent Trends in Digital Text Forensics and It...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_28,15,"[['Web Technology and Information Systems, Bau...",lncs_1339,"[[Germany, Bauhaus-Universität Weimar, Tim Gol...",../data/resources/XML_LNCS/4th International C...
1221,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_29.pdf,QA4MRE 2011-2013: Overview of Question Answeri...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_29,13,"[['NLP&IR Group, UNED, Spain', ['Anselmo Peñas...",lncs_1340,"[[Spain, National University of Distance Educa...",../data/resources/XML_LNCS/4th International C...
1222,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_30.pdf,Multilingual Question Answering over Linked Da...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_30,36,"[['CITEC, Universität Bielefeld, Germany', ['P...",lncs_1341,"[[Germany, Bielefeld University, Philipp Cimia...",../data/resources/XML_LNCS/4th International C...
1223,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_31.pdf,Overview of RepLab 2013: Evaluating Online Rep...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_31,49,"[['UNED NLP and IR Group, Juan del Rosal, 16, ...",lncs_1342,"[[Spain, National University of Distance Educa...",../data/resources/XML_LNCS/4th International C...


In [None]:
# Function to extract the required informations like author names, institutions and corresponding countries to the institutions from the XML-files transformed with GROBID

def extract_authors_info(path):
    authors_info = []
    file_path = Path(path)
    tree = ET.parse(file_path)
    root = tree.getroot()
    ns = {'ns0': 'http://www.tei-c.org/ns/1.0'}

    source_desc_elem = root.find('.//ns0:sourceDesc', namespaces=ns)
    if source_desc_elem is None:
        return authors_info
    
    for author_elem in source_desc_elem.findall('.//ns0:author', namespaces=ns):
        
        # Extract first name and surname of the author
        forename_elem = author_elem.find('./ns0:persName/ns0:forename', namespaces=ns)
        if forename_elem is not None:
            forename = forename_elem.text
        else:
            forename = ""
        
        surname_elem = author_elem.find('./ns0:persName/ns0:surname', namespaces=ns)
        if surname_elem is not None:
            surname = surname_elem.text
        else:
            surname = ""
        
        author_name = f"{forename} {surname}"
        
        for aff_elem in author_elem.findall('./ns0:affiliation', namespaces=ns):
            org_name_elem = aff_elem.find('./ns0:orgName', namespaces=ns)
            if org_name_elem is not None:
                org_name = org_name_elem.text
            else:
                org_name = ""
            address_elem = aff_elem.find('./ns0:address', namespaces=ns)
            if address_elem is not None:
                country_elem = address_elem.find('./ns0:country', namespaces=ns)
                if country_elem is not None and country_elem.text is not None and org_name != "":
                    authors_info.append([country_elem.text, org_name, author_name])
    return authors_info

In [39]:

list_metadata_grobid_extract = []

# Extracting the metadata based on the paths 

for i,j in df_lncs.iterrows():
    try:
        author_infos = extract_authors_info(j["filepath"])
        list_metadata_grobid_extract.append(author_infos)
    except:
        list_metadata_grobid_extract.append([])

Russia
Russia
USA
Russia
Russia
Italy
Italy
Italy
Germany
Germany
Germany
Germany
Germany
Germany
Germany
France
France
Spain
Spain
Spain
Ethiopia
France
Ethiopia
Germany
The Netherlands
The Netherlands
Portugal
Spain
Portugal
Portugal
Germany
The Netherlands
Italy
Italy
Denmark
Denmark
Denmark
Denmark
Belarus
Canada
Canada
Greece
Sweden
Greece
Sweden
Greece
USA
USA
USA
USA
Greece
Greece
Greece
Greece
Greece
Spain
Spain
Spain
Greece
Italy
Qatar
Qatar
Qatar
Qatar
Qatar
Bulgaria
Jordan
Bulgaria
Qatar
Australia
Australia
Vietnam
The Netherlands
Germany
Germany
The Netherlands
The Netherlands
Australia
Australia
Australia
Japan
Australia
Australia
Australia
Australia
Australia
France
Australia
Australia
Finland
Ireland
Spain
Spain
Australia
Italy
France
Italy
Australia
Australia
Spain
Switzerland
Spain
Switzerland
Switzerland
Switzerland
Romania
Switzerland
France
USA
USA
USA
USA
Belarus
Belarus
Belarus
Germany
Germany
Ireland
Ireland
Ireland
Italy
Norway
Norway
Vietnam
Austria
Ireland
Nor

In [32]:
list_metadata_grobid_extract

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [['Russia', 'St. Petersburg State University', 'Pavel Efimov'],
  ['Russia', 'Ural Federal University', 'Pavel Braslavski'],
  ['Russia', 'JetBrains Research', 'Pavel Braslavski']],
 [['Italy',
   'Department of Mathematics "Tullio Levi-Civita"',
   'Marco Ferrante'],
  ['Italy', 'Department of Information Engineering', 'Nicola Ferro'],
  ['Italy', 'Department of Information Engineering', 'Luca Piazzon']],
 [['Germany',
   'Berlin School of

In [33]:
df_lncs["grobid"] = list_metadata_grobid_extract

In [34]:
df_lncs

Unnamed: 0,PubYear,Book Subtitle,Book Title,Filename,Title,Section,Subsection,DOI,Citation count,Authors & Affiliations,ID,Countries,filepath,grobid
0,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_2.pdf,CLEF 2005: Ad Hoc Track Overview,Part I. Multilingual Textual Document Retrival...,Uncategorized,https://doi.org/10.1007/11878773_2,17,"[['Department of Information Engineering, Univ...",lncs_2,"[[Italy, University of Padua, Giorgio Maria Di...",../data/resources/XML_LNCS/6th Workshop of the...,[]
1,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_3.pdf,Ad-Hoc Mono- and Bilingual Retrieval Experimen...,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_3,Not found,"[['Information Science, University of Hildeshe...",lncs_3,"[[Germany, University of Hildesheim, René Hack...",../data/resources/XML_LNCS/6th Workshop of the...,[]
2,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_4.pdf,MIRACLE at Ad-Hoc CLEF 2005: Merging and Combi...,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_4,4,"[['Universidad Politécnica de Madrid, \xa0', [...",lncs_4,"[[Spain, Universidad Politécnica de Madrid, Jo...",../data/resources/XML_LNCS/6th Workshop of the...,[]
3,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_5.pdf,The XLDB Group at the CLEF 2005 Ad-Hoc Task,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_5,3,"[['Departamento de Informática, Grupo XLDB, Fa...",lncs_5,"[[Portugal, University of Lisbon, Nuno Cardoso...",../data/resources/XML_LNCS/6th Workshop of the...,[]
4,2005,6th Workshop of the Cross-Language Evaluation ...,Accessing Multilingual Information Repositories,11878773_6.pdf,Thomson Legal and Regulatory Experiments at CL...,Cross-Language and More,Uncategorized,https://doi.org/10.1007/11878773_6,1,"[['Thomson Legal and Regulatory, 610 Opperman ...",lncs_6,"[[United States, Thomson Reuters (United State...",../data/resources/XML_LNCS/6th Workshop of the...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_28.pdf,Recent Trends in Digital Text Forensics and It...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_28,15,"[['Web Technology and Information Systems, Bau...",lncs_1339,"[[Germany, Bauhaus-Universität Weimar, Tim Gol...",../data/resources/XML_LNCS/4th International C...,"[[Germany, Web Technology and Information Syst..."
1221,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_29.pdf,QA4MRE 2011-2013: Overview of Question Answeri...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_29,13,"[['NLP&IR Group, UNED, Spain', ['Anselmo Peñas...",lncs_1340,"[[Spain, National University of Distance Educa...",../data/resources/XML_LNCS/4th International C...,"[[Spain, NLP&IR Group, Anselmo Pen ˜as], [USA,..."
1222,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_30.pdf,Multilingual Question Answering over Linked Da...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_30,36,"[['CITEC, Universität Bielefeld, Germany', ['P...",lncs_1341,"[[Germany, Bielefeld University, Philipp Cimia...",../data/resources/XML_LNCS/4th International C...,"[[Germany, CITEC, Philipp Cimiano], [Ireland, ..."
1223,2013,4th International Conference of the CLEF Initi...,Information Access Evaluation. Multilinguality...,978-3-642-40802-1_31.pdf,Overview of RepLab 2013: Evaluating Online Rep...,Lab Overviews,Uncategorized,https://doi.org/10.1007/978-3-642-40802-1_31,49,"[['UNED NLP and IR Group, Juan del Rosal, 16, ...",lncs_1342,"[[Spain, National University of Distance Educa...",../data/resources/XML_LNCS/4th International C...,"[[Spain, UNED NLP and IR Group Juan del Rosal,..."


In [35]:
df_lncs.to_parquet("../data/Network_preparations_LNCS.parquet")