In [1]:
import json
import os
import pandas as pd

In [2]:
metadata_cleaned = pd.read_csv("../Data/Cord 19/metadata_cleaned.csv")

In [3]:
map_ = pd.read_csv("../Data/PageRankCord19/references_map.csv")

In [4]:
map_.head()

Unnamed: 0,from_paper,to_paper
0,sequence requirements for rna strand transfer ...,recombination in rna viruses and in virus resi...
1,sequence requirements for rna strand transfer ...,minimal templates directing accurate initiatio...
2,sequence requirements for rna strand transfer ...,engineering the largest rna virus genome as an...
3,sequence requirements for rna strand transfer ...,an in vitro system for the leaderprimed transc...
4,sequence requirements for rna strand transfer ...,subgenomic negative strand rna function during...


In [5]:
# Top ranked papers by citations
map_.groupby('to_paper').count().sort_values(ascending=False, by='from_paper').head(10)

Unnamed: 0_level_0,from_paper
to_paper,Unnamed: 1_level_1
isolation of a novel coronavirus from a man with pneumonia in saudi arabia,771
a novel coronavirus from patients with pneumonia in china,729
identification of a novel coronavirus in patients with severe acute respiratory syndrome,698
a novel coronavirus associated with severe acute respiratory syndrome,683
early transmission dynamics in wuhan china of novel coronavirus infected pneumonia,680
epidemiological and clinical characteristics of cases of novel coronavirus pneumonia in wuhan china a descriptive study,667
clinical characteristics of coronavirus disease in china,652
clinical features of patients infected with novel coronavirus in wuhan,647
coronavirus as a possible cause of severe acute respiratory syndrome,538
a pneumonia outbreak associated with a new coronavirus of probable bat origin,507


In [6]:
# Least cited papers
map_.groupby('to_paper').count().sort_values(ascending=True, by='from_paper').head(10)

Unnamed: 0_level_0,from_paper
to_paper,Unnamed: 1_level_1
the effects of prenatal exposure to predictable or unpredictable stress on early development in the rat,0
hiv dementia and the basal ganglia,0
registrar general annual report of the registrar general for england and wales,0
association between early onset parkinson s disease and mutations in the parkin gene,0
synaptic density in human frontal cortexdevelopmental changes and effect of aging,0
echoes of as new yorkers try to keep calm but we can t quite carry on,0
covid got ya in a childcare jam clarkmalissa and i want to hear your experiences for a research study click for min survey looking for married couples with a kid under specifically,0
adrenomedullary catecholamine release in the fetus and newborn secretory mechanisms and their role in stress and survival,0
human brain dopamine receptors in children and aging adults,0
with americans hunkering down some wonder if coronavirus could lead to a baby boom the boston globe,0


In [7]:
encodings_folder = '../Encodings/Research Paper Encodings/'

In [107]:
titles = set()
for idx, row in map_.iterrows():
    to_title = row['to_paper']
    from_title = row['from_paper']
    titles.add(to_title)
    titles.add(from_title)

In [108]:
titles_id = {title: id for id, title in enumerate(titles)}

In [109]:
with open(os.path.join(encodings_folder, 'titles_id.json'), 'w') as f:
    json.dump(titles_id, f)

In [110]:
id_based_map = map_.copy()
id_based_map['to_paper'] = id_based_map['to_paper'].map(titles_id)
id_based_map['from_paper'] = id_based_map['from_paper'].map(titles_id)

In [111]:
id_based_map.head()

Unnamed: 0,from_paper,to_paper
0,867447,634718
1,867447,42072
2,867447,67122
3,867447,355235
4,867447,12036


In [112]:
id_based_map[['to_paper', 'from_paper']].to_csv(os.path.join(encodings_folder, 'id_based_references_map.csv'), index=False)

In [113]:
len(titles)

960890

### Reading and Transforming Results

In [8]:
results_path = "..\\Page Rank Results\\citation_rank_output.csv"
citation_ranks = pd.read_csv(results_path, header=None, names=['paper_id', 'citation_rank'])

In [9]:
citation_ranks.head()

Unnamed: 0,paper_id,citation_rank
0,820761,0.005979
1,902017,0.003455
2,544952,0.001806
3,881244,0.001091
4,26432,0.001031


In [10]:
titles_id = {}
with open(os.path.join(encodings_folder, 'titles_id.json'), 'r') as f:
    titles_id = json.load(f)

id_titles = {id: title for title, id in titles_id.items()}

In [11]:
citation_ranks["paper_title"] = citation_ranks['paper_id'].map(id_titles)

In [18]:
citation_ranks.sort_values(ascending=False, by='citation_rank')

Unnamed: 0,paper_id,citation_rank,paper_title
0,820761,5.979350e-03,viral glycoproteins biological role and applic...
1,902017,3.454510e-03,bibliography of the current world literature
2,544952,1.806490e-03,author index volumes
3,881244,1.091290e-03,spezielle arzneimitteltherapie in der schwange...
4,26432,1.031200e-03,cumulative contents for
...,...,...,...
341999,909756,4.701500e-07,transmigration of human cd monocytes across th...
342000,909757,4.701500e-07,phase i clinical trials with three formulation...
342001,909758,4.701500e-07,chlamydial persistence beyond the biphasic par...
342002,909759,4.701500e-07,authors address e caui public health laboratory


In [19]:
num_papers = len(citation_ranks) - 1
citation_ranks["Score"] = 0
for i in range(10):
    start_idx = int(i * 0.1 * num_papers)
    end_idx = int((i + 1) * 0.1 * num_papers)
    citation_ranks.loc[start_idx:end_idx, "Score"] = 10 - i


In [20]:
citation_ranks.head()

Unnamed: 0,paper_id,citation_rank,paper_title,Score
0,820761,0.005979,viral glycoproteins biological role and applic...,10
1,902017,0.003455,bibliography of the current world literature,10
2,544952,0.001806,author index volumes,10
3,881244,0.001091,spezielle arzneimitteltherapie in der schwange...,10
4,26432,0.001031,cumulative contents for,10


In [25]:
len(citation_ranks[citation_ranks["Score"] == 1])

96090

In [26]:
len(citation_ranks[citation_ranks["Score"] == 1])

96090

In [27]:
citation_ranks[["paper_title", "citation_rank", "Score"]].to_csv("../Page Rank Results/citation_ranks_with_scores.csv", index=False)