# Prep Website Data

Prep datafiles for use in specific sections of the website


In [2]:
import numpy as np
import pandas as pd
import pydeck as pdk
import os
from os.path import join
import matplotlib.pyplot as plt
import seaborn as sns

import random
import json

In [29]:
# folder that receives the site-ready files written by this notebook
outputDir = '../websiteData'
# folder the source data files are read from
dataDir = '../data'

In [3]:
# Load the article metadata, stored as one JSON object per line.
# Only records that carry an 'isValid' key are kept.
arts = []
with open(join(dataDir, 'articleMetadata.json'), encoding='utf-8') as f:
    for line in f:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        thisArt = json.loads(line)
        # NOTE(review): this tests for the presence of the key, not its value -- confirm that is intended
        if 'isValid' in thisArt:
            arts.append(thisArt)

## Hero

Create a sample of article titles used to drive the hero animation. This sample should be manually spot checked afterward to prune out titles that are less obviously related to the current pandemic. 

In [25]:
def _cleanText(text):
    """ return text with newlines removed and surrounding whitespace stripped; falsy values (None, '') are returned unchanged """
    if not text:
        return text
    return text.replace('\n', '').strip()


# get a random sample of journal articles (seeded so the same sample is drawn on every run)
random.seed(101101)
# never ask for more articles than were loaded: random.sample raises ValueError otherwise
sampleArts = random.sample(arts, min(1000, len(arts)))

# extract the relevant metadata (title, journal name, pub date) for each
heroData = [
    {
        'title': _cleanText(a['title']),
        'journal': _cleanText(a['journalTitle']),
        'pubDate': a['pubDate'],
    }
    for a in sampleArts
]

# and save to csv
heroData_df = pd.DataFrame(heroData)
heroData_df.to_csv(join(outputDir, 'heroData.csv'), index=False)

In [22]:
# sanity check: one row per sampled article, three columns (title, journal, pubDate)
heroData_df.shape

(1000, 3)

## Section I

### Papers per year

For the visualization showing the number of coronavirus-related papers per year, from the late 1960s through 2020. 

No processing steps required here, just a note about how that data was acquired: 

The `papersPerYear.csv` file was obtained by going to PubMed Central and using the COVID-19 search string below, without the publication date range:

```
"COVID-19"[All Fields] OR "COVID-19"[MeSH Terms] OR "COVID-19 Vaccines"[All Fields] OR "COVID-19 Vaccines"[MeSH Terms] OR "COVID-19 serotherapy"[All Fields] OR "COVID-19 Nucleic Acid Testing"[All Fields] OR "covid-19 nucleic acid testing"[MeSH Terms] OR "COVID-19 Serological Testing"[All Fields] OR "covid-19 serological testing"[MeSH Terms] OR "COVID-19 Testing"[All Fields] OR "covid-19 testing"[MeSH Terms] OR "SARS-CoV-2"[All Fields] OR "sars-cov-2"[MeSH Terms] OR "Severe Acute Respiratory Syndrome Coronavirus 2"[All Fields] OR "NCOV"[All Fields] OR "2019 NCOV"[All Fields] OR (("coronavirus"[MeSH Terms] OR "coronavirus"[All Fields] OR "COV"[All Fields]) ) 
```

Next, using the returned results, I applied a custom filter (using the left-hand panel) to define a date range for each year from 1965 onwards. For instance, for 1965 I set the custom date range to `1965/01/01` through `1965/12/31`. 

I manually recorded the number of articles returned by this filter, and repeated the process for every year thereafter. 



## Section II

### Collaboration map

In order to store this data as efficiently as possible, create separate datasets for:

* a main geoIDs dataset. One row per unique geoID, stores all of the relevant info (addr, state, country, lat, lng) for each geoID. This dataset should be easily indexable

* the collaborations. For each collaboration, we need to know (at minimum) the date, src geoID idx, dst geoID idx

* total article stats by date. For each date throughout 2020, store the number of (authors? papers? collaborations?) accumulated by that date



In [16]:
### Create the main geoIDs dataset
# read the geocode table and store each row's index label in an explicit 'idx' column
cityGeocodes_df = (
    pd.read_csv(join(dataDir, 'cityGeocodes.csv'))
    .assign(idx=lambda geo: geo.index)
)

# the site version keeps only the index and the coordinates
siteGeoIDs = cityGeocodes_df.loc[:, ['idx', 'lat', 'lng']]

# write to web folder; float values are written with 2 decimal places
siteGeoIDs.to_csv(
    join(outputDir, 'geoIDs.csv'),
    float_format='%.2f',
    index=False,
)

In [21]:
def getGeoIdx(geoID):
    """ look up geoID in the cityGeocodes_df dataframe and return the index label of its first matching row """
    isMatch = cityGeocodes_df['geoID'].eq(geoID)
    matchingIdx = cityGeocodes_df.index[isMatch]
    # a geoID is expected to match exactly one row; an unknown geoID raises IndexError here
    return matchingIdx.tolist()[0]

This step takes ~10min to run

In [28]:
### create the collaborations dataset
collabs_df = pd.read_csv(join(dataDir, 'processed/collaborations.csv'))

# Build the geoID -> row index lookup once and map both geoID columns through it.
# This gives the same result as calling getGeoIdx on every row, but avoids
# re-scanning cityGeocodes_df once per collaboration.
# keep='first' mirrors getGeoIdx returning the first match if a geoID is repeated.
geoLookup_df = cityGeocodes_df.dropna(subset=['geoID']).drop_duplicates(subset='geoID', keep='first')
geoIdxByID = dict(zip(geoLookup_df['geoID'], geoLookup_df.index))

# add the srcIdx and dstIdx cols to each row
for geoCol, idxCol in (('geoID_A', 'srcIdx'), ('geoID_B', 'dstIdx')):
    mappedIdx = collabs_df[geoCol].map(geoIdxByID)
    # fail loudly (and say which IDs) rather than writing NaN indices to the site data
    unknownIDs = collabs_df.loc[mappedIdx.isna(), geoCol].unique()
    if len(unknownIDs) > 0:
        raise KeyError('{} {} value(s) not found in cityGeocodes_df, e.g. {}'.format(
            len(unknownIDs), geoCol, list(unknownIDs[:5])))
    collabs_df[idxCol] = mappedIdx

# save a version for site, keeping only the needed columns, ordered by publication date
siteCollabs = collabs_df[['pubDate', 'srcIdx', 'dstIdx']].sort_values('pubDate')

# write to web folder
siteCollabs.to_csv(join(outputDir, 'collabsByDate.csv'), index=False)

In [25]:
# preview the first few rows of the collaborations table
collabs_df.head()

Unnamed: 0,PMCID,pubDate,geoID_A,geoID_B,lat_A,lng_A,fmtAddr_A,lat_B,lng_B,fmtAddr_B,srcIdx
0,PMC7581440,2020-10-23,ChIJ7faBuL7or0cRYDuslG2sJQQ,ChIJAVkDPzdOqEcRcDteW0YgIQQ,52.480909,10.550783,"38518 Gifhorn, Germany",52.520007,13.404954,"Berlin, Germany",7036
1,PMC7581502,2020-10-23,ChIJc8r44c9unUcRDZsdKH0cIJ0,ChIJuRMYfoNhsUcRoDrWe_I9JgQ,47.269212,11.404102,"Innsbruck, Austria",53.551085,9.993682,"Hamburg, Germany",177
2,PMC7581692,2020-10-23,ChIJmb1k2ko-eUgRqdwTAv26rVE,ChIJra6o8IHuBUgRMO0NHlI3DQQ,53.800755,-1.549077,"Leeds, UK",47.218371,-1.553621,"Nantes, France",324
3,PMC7581692,2020-10-23,ChIJmb1k2ko-eUgRqdwTAv26rVE,ChIJsZ3dJQevthIRAuiUKHRWh60,53.800755,-1.549077,"Leeds, UK",43.610769,3.876716,"Montpellier, France",324
4,PMC7581692,2020-10-23,ChIJmb1k2ko-eUgRqdwTAv26rVE,ChIJt2BwZIrfekgRAW4XP28E3EI,53.800755,-1.549077,"Leeds, UK",53.408371,-2.991573,"Liverpool, UK",324


In [10]:
# spot check: index label(s) of the cityGeocodes_df row(s) matching a single geoID
cityGeocodes_df.index[cityGeocodes_df['geoID'] == 'ChIJU3OqqNAywkcRUG1NL6uZAAQ'].tolist()

[8039]