In [0]:
#First, I will discuss how the affiliation data is collected. The papers come from 5 major data sources: biorxiv, medrxiv, elsevier, pmc and czi.
#Combining the PubMed (PMC) API with Selenium scrapers customized for the major sites (medRxiv, bioRxiv, ScienceDirect) gives a good fill rate.

In [0]:
# Environment setup for the Selenium scrapers: refresh the apt index, install the
# chromium-chromedriver package, then install the selenium Python package.
!apt update
!apt install chromium-chromedriver
!pip install selenium

In [0]:
!pip install ratelimit

In [0]:
from selenium import webdriver

# Chrome options for running without a display inside a container/Colab VM.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# The driver path is no longer passed positionally: Selenium 4 removed that argument,
# while Selenium 3 already defaults to the 'chromedriver' executable on PATH.
wd = webdriver.Chrome(options=options)


def find_el(xpath):
    """Return all elements of the page currently loaded in `wd` that match `xpath`.

    Replaces the `find_elements_by_xpath` alias (removed in Selenium 4); the
    generic `find_elements('xpath', ...)` form works on Selenium 3 and 4 alike.
    """
    return wd.find_elements('xpath', xpath)


# Wait up to 30 s for elements to appear and for pages to finish loading.
wd.implicitly_wait(30)
wd.set_page_load_timeout(30)

In [0]:
#The various scraping functions
def node_text(node):
    """Return the text that belongs directly to `node`.

    Concatenates the node's own leading text with the tail text that follows
    each direct child. Text *inside* child elements is not included.
    """
    pieces = [node.text or '']
    pieces.extend(child.tail for child in node if child.tail is not None)
    return ''.join(pieces)

def scrape_else(data):
  """Scrape author affiliations from the article page a DOI resolves to.

  Loads ``https://doi.org/<data>`` in the shared Selenium driver ``wd``, clicks the
  first ``show-hide-details`` button and collects the text of every
  ``dl.affiliation > dd`` element.

  Args:
    data: DOI string, appended to ``https://doi.org/``.

  Returns:
    List of affiliation strings. An empty list is returned when the scrape fails
    for any reason (page load error, button not found, non-string DOI, ...).
  """
  try:
    wd.get("https://doi.org/"+data)
    # The button must be clicked before reading the affiliation list; a missing
    # button raises IndexError and is treated as "no affiliations".
    find_el("//button[@class='show-hide-details u-font-sans']")[0].click()
    return [node.text for node in find_el("//dl[@class='affiliation']/dd")]
  except Exception:
    # Best-effort scrape. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt/SystemExit can still stop a long scraping run.
    return []

# These names are needed when the decorators below are evaluated; the notebook
# only imports them in a much later cell, so import them here as well
# (re-importing later is harmless).
import requests
import xml.etree.ElementTree as ElementTree
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=1, period=1)
def czi_affiliations(data):
  """Fetch the affiliation strings of one PubMed Central article via NCBI efetch.

  Calls are rate-limited by the ``limits(calls=1, period=1)`` / ``sleep_and_retry``
  decorators.

  Args:
    data: PMC identifier, with or without the ``PMC`` prefix (e.g. ``'PMC7102527'``).

  Returns:
    One string per ``<aff>`` element of the returned XML, flattened with
    ``node_text``. An empty list when ``data`` is not a string (e.g. NaN), the
    request fails or returns an error status, or the response is not valid XML.
  """
  if not isinstance(data, str):
    return []
  # Remove the literal prefix; str.lstrip('PMC') would strip a *set* of characters.
  pmc_id = data[3:] if data.startswith('PMC') else data
  try:
    response = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={'db': 'pmc', 'id': pmc_id},
        timeout=30)  # never hang a worker on a stalled connection
    response.raise_for_status()
    root = ElementTree.fromstring(response.content)
  except (requests.RequestException, ElementTree.ParseError):
    # Best-effort lookup: a failed fetch simply yields no affiliations.
    return []
  return [node_text(aff) for aff in root.iter('aff')]

def scrape_bior(data):
  """Scrape author affiliations from the ``.article-info`` view of a preprint.

  Resolves ``https://doi.org/<data>`` in the shared Selenium driver ``wd``, then
  loads the resolved URL with ``.article-info`` appended and collects the text of
  every ``ol.affiliation-list > li > address > span`` element.

  Args:
    data: DOI string, appended to ``https://doi.org/``.

  Returns:
    List of affiliation strings; an empty list when the scrape fails for any
    reason (page load error, non-string DOI, ...).
  """
  try:
    wd.get("https://doi.org/"+data)
    # The DOI redirects to the article page; its URL is the base of the info view.
    wd.get(wd.current_url+".article-info")
    return [node.text
            for node in find_el("//ol[@class='affiliation-list']/li/address/span")]
  except Exception:
    # Best-effort scrape. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt/SystemExit can still stop a long scraping run.
    return []

#medRxiv also follows the bioRxiv page structure, so the same function (scrape_bior) can be used for it

In [0]:
#The scraping results are not shown here because the scrapers were run multi-threaded in a different environment

In [0]:
#For websites that redirect to PDFs
import os
# d maps an article's DOI to the list of affiliation strings scraped from its
# ScienceDirect page; c counts the files visited so far (printed as progress).
d=dict()
c=0
root_dir = '/content/gdrive/My Drive'
for root,dirs, files in os.walk(root_dir):
  for file in files:
    c+=1
    print(c)
    # Only handle real PDF files. The previous `'pdf' in file` test matched the
    # substring anywhere in the name.
    stem, ext = os.path.splitext(file)
    if ext.lower() != '.pdf':
      continue
    # Remove the literal 'PII' prefix. str.lstrip('PII') / rstrip('.pdf') strip
    # *sets of characters* and could eat leading/trailing letters of the id itself.
    name = stem[len('PII'):] if stem.startswith('PII') else stem
    l=[]
    try:
      wd.get("https://www.sciencedirect.com/science/article/pii/"+name)
      t1 = find_el("//a[@class='doi']")[0]
      find_el("//button[@class='show-hide-details u-font-sans']")[0].click()
      for i in find_el("//dl[@class='affiliation']/dd"):
        l.append(i.text)
      # Key the result by the bare DOI (link text without the resolver prefix).
      doi = t1.text
      if doi.startswith('https://doi.org/'):
        doi = doi[len('https://doi.org/'):]
      d[doi]=l
    except Exception as err:
      # Best-effort: skip pages that fail to load or lack the expected elements,
      # but say so instead of failing silently (and let Ctrl-C through).
      print('skipped', name, '-', type(err).__name__)


In [0]:
#From here on, we will discuss the geolocation part
# Build libpostal from source: install the build tools, clone the repository,
# then bootstrap, configure (data directory /opt/libpostal), compile and install.
!sudo apt-get install curl autoconf automake libtool python-dev pkg-config
!git clone https://github.com/openvenues/libpostal
# Bare `cd` relies on IPython automagic to change the notebook's working directory.
cd libpostal
!./bootstrap.sh
!sudo mkdir /opt/libpostal 
!./configure --datadir=/opt/libpostal
!make 
!sudo make install

In [0]:
pip install postal

Collecting postal
  Using cached https://files.pythonhosted.org/packages/56/f7/69ca5d374077e23aa9a51ecd4031222ca9dfb7d19c95d7691f024e2e27ef/postal-1.1.8.tar.gz
Building wheels for collected packages: postal
  Building wheel for postal (setup.py) ... [?25l[?25hdone
  Created wheel for postal: filename=postal-1.1.8-cp36-cp36m-linux_x86_64.whl size=162432 sha256=a175cfb9fb8d0c610e63d36be7bf63d5528d690587d04a20d98011e0794f0129
  Stored in directory: /root/.cache/pip/wheels/07/a1/d6/5d48641ee6f7c494c35534661d367b966b2b398e3789f48ffc
Successfully built postal
Installing collected packages: postal
Successfully installed postal-1.1.8


In [0]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
# NOTE(review): the PDF-scraping cell earlier in the notebook walks
# '/content/gdrive/My Drive', so on a fresh kernel this mount has to run before it.
drive.mount('/content/gdrive')

In [0]:
import pandas as pd
import numpy as np

In [0]:
#Reading in the COVID affiliates data previously created, https://github.com/isdapro/CovidClinicalTrials/blob/master/Datasets/covid_affiliates.csv
# NOTE(review): the `affiliate` column is parsed with literal_eval in the next cell,
# so it is presumably stored as a stringified Python list - confirm against the file.
df = pd.read_csv("covid_affiliates.csv")

In [0]:
from ast import literal_eval
# Turn the stringified values of `affiliate` back into Python objects.
# Values that are not strings (i.e. already parsed) are left untouched, so
# re-running this cell no longer raises inside literal_eval.
df['affiliate'] = df['affiliate'].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value)

In [0]:
from geopy.geocoders import Nominatim, GeoNames, DataBC, ArcGIS, TomTom, Bing, GoogleV3
import os
# Take the Bing Maps key from the BING_API_KEY environment variable so the real key
# never has to be written into the notebook; when the variable is unset this
# falls back to the original placeholder string.
geolocator = Bing(api_key=os.environ.get("BING_API_KEY", "BING_API_KEY"))

In [0]:
def extract_country(data):
  """Derive the countries mentioned in a list of affiliation strings.

  Each affiliation is parsed with ``parse_address``. When the parse contains a
  ``country`` component, that text is used as-is. Otherwise the ``city``
  component - or, if there is no city, the ``house`` component - is geocoded with
  the module-level ``geolocator`` and the ``countryRegion`` of the result is used.

  Args:
    data: iterable of affiliation strings.

  Returns:
    List of the distinct country strings found (order unspecified). Affiliations
    that cannot be resolved contribute nothing.
  """
  # parse_address is not imported anywhere else in the notebook; importing it here
  # keeps the function self-contained (and importable inside worker processes).
  from postal.parser import parse_address

  countries = set()
  for el in data:
    # parse_address yields (text, label) pairs; index the text by its label.
    components = {label: text for text, label in parse_address(el)}
    if 'country' in components:
      countries.add(components['country'])
      continue
    # No explicit country: geocode the city, else the house/institution name.
    query = next((components[key] for key in ('city', 'house') if key in components), None)
    if query is None:
      continue
    try:
      countries.add(geolocator.geocode(query).raw['address']['countryRegion'])
    except Exception:
      # Best-effort lookup (e.g. no match or a geocoder error): skip this affiliation.
      pass
  return list(countries)

In [0]:
!pip install pandarallel

In [0]:
from pandarallel import pandarallel

In [0]:
# Start pandarallel so that `parallel_apply` is available on pandas objects
# (the log output below reports the number of workers used).
pandarallel.initialize()

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [0]:
from tqdm import tqdm
# Register tqdm's pandas integration.
tqdm.pandas()

In [0]:
def set_cleaner(data):
  """Lower-case every string in `data` and return the distinct results as a list."""
  return list({item.lower() for item in data})

In [0]:
# Resolve each paper's affiliations to countries, then lower-case and
# de-duplicate the country names.
df['geolocated_country'] = (
    df['affiliate']
    .parallel_apply(extract_country)
    .parallel_apply(set_cleaner)
)

In [0]:
# Preview the first 50 rows, including the new geolocated_country column.
df.head(50)

Unnamed: 0.1,Unnamed: 0,Id,doi,title,affiliate,geolocated_country
0,0,044d1e54d0a62dcd6234694e2a7ce1aa48fa7601,10.1016/j.tmaid.2020.101571,Outbreak of novel Corona virus (2019-nCoV); im...,[],[]
1,1,a386909bd060eac84e6dad7e1dc90986baf1be27,10.1016/j.tmaid.2020.101573,Fatal human coronavirus 229E (HCoV-229E) and R...,[Research Group Infectious Diseases and Infect...,[colombia]
2,2,d9e4c9b6b809ddc1f9dc8787f77368334e1e538b,10.1016/j.tmaid.2020.101575,"Coronavirus infections reported by ProMED, Feb...","[Incubator in Zoonosis (SIZOO), Biodiversity a...","[saudi arabia, colombia, nepal]"
3,3,3cca30d1cc54ef4a38a3dda706059bea64906e6b,10.1016/j.tmaid.2020.101577,Coronavirus 2019-nCoV: Is the genie already ou...,"[Instituto de Microbiologia, Faculdade de Medi...","[the netherlands, portugal]"
4,4,f8da9e99d451df2a2b0177cc553bb82c61f712f0,10.1016/j.jhin.2020.02.002,"Novel coronavirus, poor quarantine, and the ri...",[],[]
5,5,af4b55f98a0bd3ff40552a86fbf52ad1ff9d68d5,10.1016/s2215-0366(20)30077-8,Online mental health services in China during ...,[],[]
6,6,6f07f5dc2599b774baffd5a80e0b0a59662b7f35,10.1016/s1473-3099(20)30110-9,Initiation of a new infection control system f...,[],[]
7,7,e740f0be0666572862dc8c736b998f527d326224,10.1016/j.ijid.2020.02.025,The basic reproduction number of novel coronav...,"[JC School of Public Health and Primary Care, ...","[china, usa]"
8,8,32d7635948b54f9ce6f019847fd57fe95306f721,10.1016/j.ijid.2020.02.024,"Comments on ""Preliminary estimation of the bas...","[School of Mathematical Science, University of...",[australia]
9,9,c843c9f3d123553ed90bed163426cd6ef071587f,10.1016/j.cmi.2020.02.011,First atypical case of 2019 novel coronavirus ...,[],[]


In [0]:
# Persist the geolocated table.
# NOTE(review): this writes the index as an extra column; the 'Unnamed: 0' /
# 'Unnamed: 0.1' columns in the preview above look like indexes from earlier
# round-trips - consider index=False once downstream readers are checked.
df.to_csv("geolocated_affiliations.csv")

In [0]:
# Load the linked clinical-trials table.
# NOTE(review): `linked_trials` is not referenced again in the visible part of the notebook.
linked_trials = pd.read_csv("ClinicalTrialsLinked.csv")

In [0]:
#Running the geolocation logic on the VT SHA papers
# Load the sha list and keep a single row per sha (first occurrence wins).
sha_vt = pd.read_csv("sha_vt.csv").drop_duplicates(subset="sha")

In [0]:
# Number of distinct sha values left after de-duplication.
len(set(sha_vt.sha))

253

In [0]:
# Load the paper metadata table.
# NOTE(review): the next cell reads metadata.csv again, so this load is redundant.
metadata = pd.read_csv("metadata.csv")

In [0]:
#We need the DOI also for getting affiliate data so some merges are necessary
metadata = pd.read_csv("metadata.csv")
metadata = metadata.dropna(subset=['sha'])
# One metadata row can list several ';'-separated hashes: split them and stack so
# that every hash gets its own entry (still labelled with the original row index).
s = metadata['sha'].str.split(';', expand=True).stack()
# Strip the whitespace around each hash - "a; b" would otherwise produce " b",
# which can never match a sha in sha_vt - and drop empty fragments.
s = s.dropna().str.strip()
s = s[s != '']
s.index = s.index.droplevel(-1)
s.name = 'sha'
# Replace the multi-valued column by the one-hash-per-row series.
metadata = metadata.drop(columns='sha')
cleaned_meta = metadata.join(s)
merged = sha_vt.merge(cleaned_meta, on='sha')

In [0]:
#Just avoiding work for papers that we have already extracted the data for
# Left join keeps every VT paper; affiliate/country stay empty where `df` has no match.
merged2 = pd.merge(merged, df, how='left', left_on='sha', right_on='Id')

In [0]:
# Keep only the identifier columns plus the affiliation/country columns.
compact_columns = ['sha','doi_x','pmcid','affiliate','geolocated_country']
merged_compact = merged2[compact_columns]

In [0]:
!pip install ratelimit

In [0]:
import requests
from tqdm import tqdm
tqdm.pandas()
import xml.etree.ElementTree as ElementTree
from ratelimit import limits, sleep_and_retry
def node_text(node):
    """Return the text that belongs directly to `node`.

    Joins the node's own leading text with the tail text following each direct
    child; text inside the child elements themselves is left out.
    """
    own_text = node.text if node.text else ''
    tails = [child.tail for child in node if child.tail is not None]
    return own_text + ''.join(tails)

@sleep_and_retry
@limits(calls=1, period=1)
def affiliations(data):
  """Fetch the affiliation strings of one PubMed Central article via NCBI efetch.

  Calls are rate-limited by the ``limits(calls=1, period=1)`` / ``sleep_and_retry``
  decorators.

  Args:
    data: PMC identifier, with or without the ``PMC`` prefix (e.g. ``'PMC7102527'``).

  Returns:
    One string per ``<aff>`` element of the returned XML, flattened with
    ``node_text``. An empty list when ``data`` is not a string (e.g. NaN), the
    request fails or returns an error status, or the response is not valid XML.
  """
  if not isinstance(data, str):
    return []
  # Remove the literal prefix; str.lstrip('PMC') would strip a *set* of characters.
  pmc_id = data[3:] if data.startswith('PMC') else data
  try:
    response = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={'db': 'pmc', 'id': pmc_id},
        timeout=30)  # never hang a worker on a stalled connection
    response.raise_for_status()
    root = ElementTree.fromstring(response.content)
  except (requests.RequestException, ElementTree.ParseError):
    # Best-effort lookup: a failed fetch simply yields no affiliations.
    return []
  return [node_text(aff) for aff in root.iter('aff')]

In [0]:
def fun(data):
  """Fill in the missing affiliation and country data of one paper row.

  Rows that found no match in the previously geolocated table carry a non-list
  placeholder (NaN) in ``affiliate`` / ``geolocated_country``. For those rows the
  affiliations are fetched from PMC via ``affiliations`` and then run through the
  same ``extract_country`` + ``set_cleaner`` steps used for the main table, so
  every returned row holds a list in both columns.

  Args:
    data: one row (Series) with at least ``pmcid``, ``affiliate`` and
      ``geolocated_country``.

  Returns:
    A copy of the row with ``affiliate`` and ``geolocated_country`` filled in.
  """
  data = data.copy()  # do not modify the row object handed in by apply
  if not isinstance(data.affiliate,list):
    data.affiliate = affiliations(data.pmcid)
  if not isinstance(data.geolocated_country,list):
    # Previously left as NaN for freshly fetched rows, which broke the later
    # len()-based fill-rate count.
    data.geolocated_country = set_cleaner(extract_country(data.affiliate))
  return data
 

In [0]:
# Apply `fun` to every row (axis=1) in parallel via pandarallel.
merged_compact2 = merged_compact.parallel_apply(fun,axis=1)

In [0]:
#80% fill rate
# Distribution of the number of countries found per paper (0 means nothing resolved).
merged_compact2['geolocated_country'].map(len).value_counts()

In [0]:
# Final deliverable: one row per sha with its resolved countries.
final_columns = ['sha','geolocated_country']
df_final = merged_compact2[final_columns]

In [0]:
# Display the final sha -> countries table.
df_final

Unnamed: 0,sha,geolocated_country
0,00c71f72eb837e2342dbda8761664e221e2a03df,[]
1,05192151667b1bb4e3405de11f6e4ae2f844e7c5,[china]
2,056610b4981cee0efc485ed3f978f541fbe55f54,[uk]
3,0792384d074cef963c808eacf3c63e3654776a2a,[]
4,0c36cc3dc9a1632f0fbf14372f376f89521718c5,[germany]
5,0dca038ac8dfd45a921b7cd9f1c90a8554799e23,[]
6,0dd1ab1f5811f3a1fe36f45577846243b3cf14f8,[china]
7,0eadf5a901c0d89fad2c202990056556be103e12,[china.]
8,0f6a5070fd5f9bd5d46c3bbd4545c55982e726c3,[france]
9,1007a3041cb5ae9e3e6f09712d81277c02ed3dbf,[]


In [0]:
# Write the final table to disk (the index is written as the first column).
df_final.to_csv("drug_geolocated.csv")