In [1]:
import pandas as pd 
import numpy as np 
import pathlib as pl 
import requests
import country_converter as coco 
from module import *

In [2]:
# define path 
DIR = pl.Path("../..")

data = DIR / "data"
internal = DIR / "data" / "ersilia"
external = DIR / "data" / "external"
PROC_DIR = DIR / "src" / "processed"

In [3]:
# import internal data
pub = pd.read_csv(internal / "publications.csv")

In [4]:
# Get the SDGs and Open Access status for Ersilia's publication from OpenAlex API
pub_info = {}
for slug, id in zip(pub.Slug,pub.OpenAlexID):
    response = get_work(id)
    sdg = response["sustainable_development_goals"]
    oa_status = response['open_access']['is_oa']
    if len(sdg) == 0:
        pub_info[slug] = (None, oa_status)
    else:
        pub_info[slug] = (sdg[0]['display_name'], oa_status)

pub_info_df = pd.DataFrame(pub_info).T.reset_index().rename(columns={'index': 'slug', 0: 'sdg', 1: 'open'})
pub_info_df.head()

Unnamed: 0,slug,sdg,open
0,structural-systems-pharmacology,Good health and well-being,False
1,chemocentric-view,Good health and well-being,True
2,similar-binding-pockets,Good health and well-being,True
3,oncogenomic-landscape,,True
4,rationalizing-drug-response,Good health and well-being,False


In [5]:
# Map open accessibility 
open_map = {
    True: "Open",
    False: 'Closed'
}
pub_info_df['open'] = pub_info_df['open'].map(open_map)
pub_info_df.head()

Unnamed: 0,slug,sdg,open
0,structural-systems-pharmacology,Good health and well-being,Closed
1,chemocentric-view,Good health and well-being,Open
2,similar-binding-pockets,Good health and well-being,Open
3,oncogenomic-landscape,,Open
4,rationalizing-drug-response,Good health and well-being,Closed


In [6]:
pub_info_merge_ersilia = pub.merge(pub_info_df, how='left', left_on='Slug', right_on = 'slug').drop(columns=['slug','Abstract']).fillna("None") #dropping Abstract since it's causing issues uploading to Looker
#pub_info_merge_ersilia.to_csv(PROC_DIR/"publication_sdg_open.csv", index=False)
pub_info_merge_ersilia.to_csv(PROC_DIR/"publication_merged.csv", index=False)

In [7]:
# Get required information on the citations
# information contains: citation name (cited_in_paper), SDG, SDG score, Publication Date, Primary Topic

# Initialising
citation_info = {}
author_info = {}
for slug, id in zip(pub.Slug,pub.OpenAlexID):
    # Request and extract info from response from OpenAlex API
    work_cited_url = get_cited_url(id)
    response = get_cited(work_cited_url)
    info = get_citation_info(response)
    authors = get_author_info(response)
    # Transform response into DataFrame
    df1 = pd.DataFrame(info).T.reset_index().rename(columns={'index': 'cited_in_paper', 0: 'sdg_goal', 1:'score', 2:'publication_date', 3:'primary_topic'})
    df1['Slug'] = [slug] * len(df1)

    df2 = pd.DataFrame(authors).T.reset_index().rename(columns={'index': 'cited_in_paper', 0:'Institution', 1:'Country', 2: 'Author Name', 3: 'Author ID'})
    try:
        df2['cited_in_paper'] = df2['cited_in_paper'].str.split("_").str[0]
    except: 
        pass
    df2['Slug'] = [slug] * len(df2)

    # Store in dictionary
    citation_info[slug] = df1
    author_info[slug] = df2

# Concat all dataframes in the dictionary for citation info
citation_info_df = pd.concat(citation_info.values(), axis=0).reset_index(drop=True).rename(columns={'Slug':'slug'})
citation_info_df['publication_date'] = pd.to_datetime(citation_info_df['publication_date'])
citation_info_df['sdg_goal'] = citation_info_df['sdg_goal'].astype(str)

# Concat all dataframes in the dictionary for authors info
author_df = pd.concat(author_info.values(), axis=0).reset_index(drop=True).rename(columns={'Slug':'slug'})


In [8]:
citation_info_df.head()

Unnamed: 0,cited_in_paper,sdg_goal,score,publication_date,primary_topic,slug
0,In silico methods to address polypharmacology:...,Good health and well-being,0.45,2016-02-01,Computational Methods in Drug Discovery,structural-systems-pharmacology
1,Before and after AlphaFold2: An overview of pr...,,,2023-02-28,Protein Structure Prediction and Analysis,structural-systems-pharmacology
2,Towards a detailed atlas of protein–protein in...,,,2013-12-01,Analysis of Gene Interaction Networks,structural-systems-pharmacology
3,Alternative modulation of protein–protein inte...,,,2015-12-01,Targeted Protein Degradation in Biomedical Res...,structural-systems-pharmacology
4,Machine Learning Toxicity Prediction: Latest A...,Good health and well-being,0.67,2022-12-13,Computational Methods in Drug Discovery,structural-systems-pharmacology


In [9]:
author_df.head()

Unnamed: 0,cited_in_paper,Institution,Country,Author Name,Author ID,slug
0,In silico methods to address polypharmacology:...,,[IT],Antonio Lavecchia,https://openalex.org/A5083327774,structural-systems-pharmacology
1,In silico methods to address polypharmacology:...,,[IT],Carmen Cerchia,https://openalex.org/A5031786233,structural-systems-pharmacology
2,Before and after AlphaFold2: An overview of pr...,Universidade de São Paulo,BR,Letícia M. F. Bertoline,https://openalex.org/A5060313675,structural-systems-pharmacology
3,Before and after AlphaFold2: An overview of pr...,Universidade de São Paulo,BR,Angélica N. Lima,https://openalex.org/A5048517462,structural-systems-pharmacology
4,Before and after AlphaFold2: An overview of pr...,Universidade de São Paulo,BR,José Eduardo Krieger,https://openalex.org/A5069859243,structural-systems-pharmacology


In [10]:
# Save as csv file
citation_info_df.drop(columns=['score']).to_csv(PROC_DIR/"citation_information.csv", index=False)

In [11]:
((pd.merge(left = citation_info_df.groupby(citation_info_df.publication_date.dt.year).agg('count')['slug'].reset_index(), 
          right = pub_info_merge_ersilia.groupby(pub_info_merge_ersilia.Year).agg('count')['Slug'].reset_index(), 
          how='outer', 
          left_on='publication_date', 
          right_on='Year')
          .drop(columns=['Year'])
          .fillna(0)
          .rename(columns={'publication_date': 'Publication Year',
                           'slug': 'Citation',
                           'Slug': 'Publication'}))
                           .melt(id_vars='Publication Year')
                           .rename(columns={
                               'variable': 'Type',
                               'value': 'Count'
                           }).to_csv(PROC_DIR/"summary_of_publication_and_citation_by_year.csv", index=False))

In [12]:
# Mapping the country code to country short name
author_flatten_df = author_df.copy().explode(['Country']).fillna(value = {'Country': "None"}).reset_index(drop=True)
author_flatten_df['short_name'] = author_flatten_df.Country.apply(lambda x: coco.convert(names = x, to='name_short', not_found=None))

None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found in regex
None not found i

In [13]:
country_to_region_mapping = pd.read_csv(external/"geo"/"country_mapping.csv")
country_to_region_mapping.head()

region_mapping = {
    "GS" : "Global South",
    "GN" : "Global North"
}

In [14]:
author_flatten_with_region_df = author_flatten_df.merge(country_to_region_mapping, how='left', left_on="short_name", right_on="Countries").drop(columns='Countries')
author_flatten_with_region_df['Global North/ Global South classification'] = author_flatten_with_region_df['Global North/ Global South classification'].map(region_mapping)
author_flatten_with_region_df.rename(columns={"Global North/ Global South classification":"Regions"}, inplace=True)
author_flatten_with_region_df

Unnamed: 0,cited_in_paper,Institution,Country,Author Name,Author ID,slug,short_name,Regions
0,In silico methods to address polypharmacology:...,,IT,Antonio Lavecchia,https://openalex.org/A5083327774,structural-systems-pharmacology,Italy,Global North
1,In silico methods to address polypharmacology:...,,IT,Carmen Cerchia,https://openalex.org/A5031786233,structural-systems-pharmacology,Italy,Global North
2,Before and after AlphaFold2: An overview of pr...,Universidade de São Paulo,BR,Letícia M. F. Bertoline,https://openalex.org/A5060313675,structural-systems-pharmacology,Brazil,Global South
3,Before and after AlphaFold2: An overview of pr...,Universidade de São Paulo,BR,Angélica N. Lima,https://openalex.org/A5048517462,structural-systems-pharmacology,Brazil,Global South
4,Before and after AlphaFold2: An overview of pr...,Universidade de São Paulo,BR,José Eduardo Krieger,https://openalex.org/A5069859243,structural-systems-pharmacology,Brazil,Global South
...,...,...,...,...,...,...,...,...
5872,Worldwide study of the taste of bitter medicin...,Monell Chemical Senses Center,US,Riley D Herriman,https://openalex.org/A5087531977,ai-tailor-drugs-africa-comment,United States,Global North
5873,Worldwide study of the taste of bitter medicin...,National Institute on Alcohol Abuse and Alcoho...,US,Paule V. Joseph,https://openalex.org/A5058203848,ai-tailor-drugs-africa-comment,United States,Global North
5874,Worldwide study of the taste of bitter medicin...,Countess of Chester Hospital,GB,Oghogho Braimah,https://openalex.org/A5095921631,ai-tailor-drugs-africa-comment,United Kingdom,Global North
5875,Worldwide study of the taste of bitter medicin...,Monell Chemical Senses Center,US,Danielle R. Reed,https://openalex.org/A5074936451,ai-tailor-drugs-africa-comment,United States,Global North


In [15]:
# Merge with citation information to get publication and topic
author_flatten_with_region_df.merge(citation_info_df[['cited_in_paper', 'publication_date', 'primary_topic']].drop_duplicates(subset=['cited_in_paper'], keep="first"), how="left", left_on="cited_in_paper", right_on="cited_in_paper").sort_values(by='publication_date', ascending=True).drop_duplicates(subset='Author ID', keep='first').to_csv(PROC_DIR/"author_information.csv", index=False)