In [30]:
import pandas as pd
import json
import xml.etree.ElementTree as ET
from pathlib import Path

In [31]:
# Load the TREC paper metadata table; later cells rely on its "ID" and "filepath" columns.
df_trec = pd.read_parquet("../data/metadata_TREC.parquet")

In [32]:
# Read the ISO 3166-1 file to resolve country-code abbreviations to country names

with open("../data/iso3166-1.json", 'r', encoding="utf-8") as file:
    Country_codes = json.load(file)

# Map every alpha-2 code listed under "3166-1" to its country name.
country_codes_reformatted = {
    entry["alpha_2"]: entry["name"] for entry in Country_codes["3166-1"]
}

In [33]:
# Extract metadata from OpenAlex: author names, institution names, and the country of each institution

def calculate_most_active_countries_institutions(country_codes_dict,df,path):
    """Attach OpenAlex affiliation triples to ``df`` as a ``Countries`` column.

    Reads the JSON file at ``path`` (a mapping of paper ID -> OpenAlex work
    record) and collects, for every paper, the distinct
    ``[country name, institution name, author name]`` triples found in its
    authorships, in first-seen order.

    Parameters
    ----------
    country_codes_dict : dict
        Maps the country codes used in the OpenAlex records to country names.
        A code that is missing from this mapping is kept as-is instead of
        raising ``KeyError``.
    df : pandas.DataFrame
        Frame with an ``ID`` column whose values match the keys of the JSON file.
    path : str or path-like
        Location of the OpenAlex JSON file.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``df`` with the collected triples on ``ID``; rows of
        ``df`` whose ID is absent from the JSON file are therefore dropped.
    """
    with open(path, 'r', encoding="utf-8") as file:
        openalex_works = json.load(file)

    rows = []
    for work_id, work in openalex_works.items():
        triples = []
        seen = set()  # hashable mirror of `triples` for O(1) duplicate checks
        # Tolerate records whose "authorships"/"institutions" are missing or null.
        for authorship in work.get("authorships") or []:
            for institution in authorship.get("institutions") or []:
                code = institution["country_code"]
                institution_name = institution["display_name"]
                if code is None or institution_name is None:
                    continue
                # Fall back to the raw code when the mapping does not know it,
                # so one unexpected code cannot abort the whole extraction.
                country = country_codes_dict.get(code, code)
                triple = (country, institution_name, authorship["author"]["display_name"])
                if triple not in seen:
                    seen.add(triple)
                    triples.append(list(triple))
        rows.append([work_id, triples])

    df_countries = pd.DataFrame(rows, columns=["ID", "Countries"])
    return pd.merge(df, df_countries, how="inner", on="ID")


In [34]:
# Add the OpenAlex-derived "Countries" column to the TREC metadata.
# NOTE(review): df_trec is overwritten with the merge result, so re-running this cell without
# reloading df_trec merges a second time — presumably yielding suffixed duplicate columns; confirm.
df_trec = calculate_most_active_countries_institutions(country_codes_reformatted, df_trec, path="../data/OpenAlex_TREC.json" )

In [35]:
# Spot-check the TEI file path stored for the first paper.
# NOTE(review): the displayed path sits under trec25/ although this row's URL points to trec9 — confirm the filepath mapping.
df_trec.iloc[0]["filepath"]  

'../data/resources/XML_TREC/trec25/overview_9.tei.xml'

In [36]:
# Spot-check the TEI file path stored for the second paper.
# NOTE(review): the displayed path sits under trec11/ although this row's URL points to trec9 — confirm the filepath mapping.
df_trec.iloc[1]["filepath"]  

'../data/resources/XML_TREC/trec11/trec9-clir-overview.tei.xml'

In [37]:
# Inspect the merged frame, including the new "Countries" column.
df_trec

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath,ID,Classification,Countries
0,2000,http://trec.nist.gov/pubs/trec9/papers/overvie...,"[Ellen M. Voorhees, Donna Harman]",Overview of the Ninth Text REtrieval Conferenc...,Uncategorized,overview_9.pdf,../data/resources/XML_TREC/trec25/overview_9.t...,trec_1,Uncategorized,[]
1,2000,http://trec.nist.gov/pubs/trec9/papers/trec9-c...,"[Fredric C. Gey, Aitao Chen]",TREC-9 Cross-Language Information Retrieval (E...,Uncategorized,trec9-clir-overview.pdf,../data/resources/XML_TREC/trec11/trec9-clir-o...,trec_2,Uncategorized,[]
2,2000,http://trec.nist.gov/pubs/trec9/papers/filteri...,"[Stephen E. Robertson, David A. Hull]",The TREC-9 Filtering Track Final Report.,Uncategorized,filtering_new.pdf,../data/resources/XML_TREC/trec28/filtering_ne...,trec_3,Uncategorized,"[[United States, Microsoft (United States), St..."
3,2000,http://trec.nist.gov/pubs/trec9/papers/t9irep.pdf,"[William R. Hersh, Paul Over]",The TREC-9 Interactive Track Report.,Uncategorized,t9irep.pdf,../data/resources/XML_TREC/trec21/t9irep.tei.xml,trec_4,Uncategorized,[]
4,2000,http://trec.nist.gov/pubs/trec9/papers/liggett...,"[Walter Liggett, Chris Buckley]",Query Expansion Seen Through Return Order of R...,Uncategorized,liggett.pdf,../data/resources/XML_TREC/trec10/liggett.tei.xml,trec_5,Uncategorized,"[[Egypt, National Institute of Standards, Walt..."
...,...,...,...,...,...,...,...,...,...,...
1705,2016,https://trec.nist.gov/pubs/trec25/papers/IRIT-...,"[Gia-Hung Nguyen, Laure Soulier, Lynda Tamine,...",IRIT @ TREC 2016 Clinical Decision Support Track,Participant,IRIT-CL.pdf,../data/resources/XML_TREC/trec25/Participant/...,trec_1966,Participant,"[[France, Université Toulouse - Jean Jaurès, G..."
1706,2012,https://trec.nist.gov/pubs/trec21/papers/UTAus...,"[Hyun Joon Jung, Matthew Lease]",UT Austin in the TREC 2012 Crowdsourcing Track...,Participant,UTAustin.crowd.final.pdf,../data/resources/XML_TREC/trec21/Participant/...,trec_1967,Participant,"[[United States, The University of Texas at Au..."
1707,2007,https://trec.nist.gov/pubs/trec16/papers/umelb...,"[William Webber, Vo Ngoc Anh, Alistair Moffat]",The University of Melbourne in the Million Que...,Participant,umelbourne.ngoc-ahn.MQ.final.pdf,../data/resources/XML_TREC/trec16/Participant/...,trec_1969,Participant,[]
1708,2020,https://trec.nist.gov/pubs/trec29/papers/OVERV...,"[Asia J. Biega, Fernando Diaz, Michael D. Ekst...",Overview of the TREC 2020 Fair Ranking Track∗,Overview,OVERVIEW.FR.pdf,../data/resources/XML_TREC/trec29/Overview/OVE...,trec_1970,Overview,[]


In [39]:
# Extract author names, institutions, and the institutions' countries from the GROBID-generated TEI XML files

def _tei_text(elem):
    """Return the stripped text of an XML element, or "" if the element or its text is missing."""
    if elem is None or elem.text is None:
        return ""
    return elem.text.strip()


def extract_authors_info(path):
    """Collect ``[country, organisation, author name]`` triples from a TEI XML file.

    Only ``author`` elements below the first ``sourceDesc`` element are
    considered. For each author, every direct ``affiliation`` child that has
    both a non-empty ``orgName`` and a non-empty ``address/country`` yields one
    triple. Only the first ``forename`` and the first ``orgName`` of an
    author/affiliation are used.

    Parameters
    ----------
    path : str or path-like
        Location of the TEI XML file.

    Returns
    -------
    list of list of str
        One ``[country, organisation, author name]`` entry per usable
        affiliation; empty if the file has no ``sourceDesc`` element.

    Raises
    ------
    OSError
        If the file cannot be opened.
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    authors_info = []
    ns = {'ns0': 'http://www.tei-c.org/ns/1.0'}
    root = ET.parse(Path(path)).getroot()
    source_desc_elem = root.find('.//ns0:sourceDesc', namespaces=ns)
    if source_desc_elem is None:
        return authors_info

    for author_elem in source_desc_elem.findall('.//ns0:author', namespaces=ns):
        forename = _tei_text(author_elem.find('./ns0:persName/ns0:forename', namespaces=ns))
        surname = _tei_text(author_elem.find('./ns0:persName/ns0:surname', namespaces=ns))
        # Join only the parts that exist, so an empty element never produces
        # "None Surname" or a name with a stray leading/trailing space.
        author_name = " ".join(part for part in (forename, surname) if part)

        # Extract affiliations of the author
        for aff_elem in author_elem.findall('./ns0:affiliation', namespaces=ns):
            org_name = _tei_text(aff_elem.find('./ns0:orgName', namespaces=ns))
            address_elem = aff_elem.find('./ns0:address', namespaces=ns)
            if address_elem is None:
                continue
            country = _tei_text(address_elem.find('./ns0:country', namespaces=ns))
            # An affiliation is only usable when both organisation and country are known.
            if org_name and country:
                authors_info.append([country, org_name, author_name])

    return authors_info

In [40]:
list_metadata_grobid_extract = []
# (path, error) pairs for files that could not be processed, kept for inspection
grobid_failed_paths = []

# Extract the GROBID metadata for every paper, in row order, based on its stored file path
for tei_path in df_trec["filepath"]:
    try:
        list_metadata_grobid_extract.append(extract_authors_info(tei_path))
    except Exception as exc:
        # Best effort: an unreadable or missing file yields an empty result so the list
        # stays aligned with df_trec, but the failure is recorded instead of being hidden.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
        grobid_failed_paths.append((tei_path, repr(exc)))
        list_metadata_grobid_extract.append([])

print(f"{len(grobid_failed_paths)} of {len(list_metadata_grobid_extract)} TEI files could not be processed")

In [42]:
# Store the GROBID-extracted affiliation lists as a new column (one list per row, in row order).
df_trec["grobid"] = list_metadata_grobid_extract

In [47]:
# Inspect the frame with both the OpenAlex ("Countries") and GROBID ("grobid") columns.
df_trec

Unnamed: 0,PubYear,url,Authors,Title,Section,filename,filepath,ID,Classification,Countries,grobid
0,2000,http://trec.nist.gov/pubs/trec9/papers/overvie...,"[Ellen M. Voorhees, Donna Harman]",Overview of the Ninth Text REtrieval Conferenc...,Uncategorized,overview_9.pdf,../data/resources/XML_TREC/trec25/overview_9.t...,trec_1,Uncategorized,[],[]
1,2000,http://trec.nist.gov/pubs/trec9/papers/trec9-c...,"[Fredric C. Gey, Aitao Chen]",TREC-9 Cross-Language Information Retrieval (E...,Uncategorized,trec9-clir-overview.pdf,../data/resources/XML_TREC/trec11/trec9-clir-o...,trec_2,Uncategorized,[],[]
2,2000,http://trec.nist.gov/pubs/trec9/papers/filteri...,"[Stephen E. Robertson, David A. Hull]",The TREC-9 Filtering Track Final Report.,Uncategorized,filtering_new.pdf,../data/resources/XML_TREC/trec28/filtering_ne...,trec_3,Uncategorized,"[[United States, Microsoft (United States), St...",[]
3,2000,http://trec.nist.gov/pubs/trec9/papers/t9irep.pdf,"[William R. Hersh, Paul Over]",The TREC-9 Interactive Track Report.,Uncategorized,t9irep.pdf,../data/resources/XML_TREC/trec21/t9irep.tei.xml,trec_4,Uncategorized,[],[]
4,2000,http://trec.nist.gov/pubs/trec9/papers/liggett...,"[Walter Liggett, Chris Buckley]",Query Expansion Seen Through Return Order of R...,Uncategorized,liggett.pdf,../data/resources/XML_TREC/trec10/liggett.tei.xml,trec_5,Uncategorized,"[[Egypt, National Institute of Standards, Walt...",[]
...,...,...,...,...,...,...,...,...,...,...,...
1705,2016,https://trec.nist.gov/pubs/trec25/papers/IRIT-...,"[Gia-Hung Nguyen, Laure Soulier, Lynda Tamine,...",IRIT @ TREC 2016 Clinical Decision Support Track,Participant,IRIT-CL.pdf,../data/resources/XML_TREC/trec25/Participant/...,trec_1966,Participant,"[[France, Université Toulouse - Jean Jaurès, G...","[[France, France, IRIT, Gia-Hung Nguyen], [Fra..."
1706,2012,https://trec.nist.gov/pubs/trec21/papers/UTAus...,"[Hyun Joon Jung, Matthew Lease]",UT Austin in the TREC 2012 Crowdsourcing Track...,Participant,UTAustin.crowd.final.pdf,../data/resources/XML_TREC/trec21/Participant/...,trec_1967,Participant,"[[United States, The University of Texas at Au...",[]
1707,2007,https://trec.nist.gov/pubs/trec16/papers/umelb...,"[William Webber, Vo Ngoc Anh, Alistair Moffat]",The University of Melbourne in the Million Que...,Participant,umelbourne.ngoc-ahn.MQ.final.pdf,../data/resources/XML_TREC/trec16/Participant/...,trec_1969,Participant,[],"[[Australia, Department of Computer Science an..."
1708,2020,https://trec.nist.gov/pubs/trec29/papers/OVERV...,"[Asia J. Biega, Fernando Diaz, Michael D. Ekst...",Overview of the TREC 2020 Fair Ranking Track∗,Overview,OVERVIEW.FR.pdf,../data/resources/XML_TREC/trec29/Overview/OVE...,trec_1970,Overview,[],[]


In [48]:
# Persist the enriched metadata; presumably consumed by a later network-preparation step, judging by the file name.
df_trec.to_parquet("../data/Network_preparations_TREC.parquet")