In [1]:
import pandas as pd
import json
import glob
from tqdm import tqdm
import sys, os
from fuzzywuzzy import fuzz
import requests
import aiohttp
import asyncio
import nest_asyncio
import numpy as np

In [3]:
nest_asyncio.apply()
import logging

# Set up the logger
logger = logging.getLogger("WOS-MATCH")
sh = logging.StreamHandler()
logger.addHandler(sh)
logger.setLevel(20)

In [9]:
async def fetch(paper, es_end_point):

    #
    # Sub routines
    #
    def make_query(paper):
        """
        Make a query for the ElasticSearch
        
        Parameters
        ----------
        paper: dict 
            with keys, title, authors, year
        
        Returns
        -------
        query: str
            query for the ElasticSearch
        paper_queried: dict
            paper metadata to make the query
        """

        paper_queried = {"title": "", "author_list": [], "year": np.nan}

        original_title = str(paper["title"]).lower()
        original_author_list = paper.get("authors", [])
        original_authors = " ".join(
            original_author_list[: np.minimum(3, len(original_author_list))]
        )
        
        # Initialize clause place holders
        should_clause = []
        must_clause = []

        # Title
        title_query = '{"match": {"titles.title._VALUE":"%s"}}' % original_title
        must_clause += [title_query]
        
        # Authors
        for name in original_author_list:
            should_clause += ['{"match": {"name.display_name":"%s"}}' % name]
        
        # Year of publication
        year = np.nan
        if paper["year"] is not None:
            if np.isnan(paper["year"]) == False:
                year = int(paper["year"])
                must_clause += ['{"match": {"pub_info._pubyear":"%d"}}' % year]

        if (len(must_clause) + len(should_clause)) == 1:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        elif len(should_clause) == 0:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        else:
            query = (
                '{"size":5,"query": { "bool":{ "must":[%s], "should":[%s], "minimum_should_match": 1 } }}'
                % (",".join(must_clause), ",".join(should_clause))
            )
        paper_queried["title"] = original_title
        paper_queried["author_list"] = original_author_list
        paper_queried["year"] = year

        return query, paper_queried
    
    def make_query_pmid(paper):
        """
        Make a query for the ElasticSearch
        
        Parameters
        ----------
        paper: dict 
            with keys, title, authors, year
        
        Returns
        -------
        query: str
            query for the ElasticSearch
        paper_queried: dict
            paper metadata to make the query
        """
        paper_queried = {"title": "", "author_list": [], "pmid": np.nan, "year": np.nan}

        original_title = str(paper["title"]).lower()
        original_author_list = paper.get("authors", [])
        original_authors = " ".join(
            original_author_list[: np.minimum(3, len(original_author_list))]
        )
        
        # Initialize clause place holders
        should_clause = []
        must_clause = []

        # Title
        title_query = '{"match": {"titles.title._VALUE":"%s"}}' % original_title
        must_clause += [title_query]
        
        # Authors
        for name in original_author_list:
            should_clause += ['{"match": {"name.display_name":"%s"}}' % name]
        
        # Matching by pmid
        pmid = np.nan
        if paper["pmid_CORD"] is not None:
            if np.isnan(paper["pmid_CORD"]) == False:
                pmid = int(paper["pmid_CORD"])
                must_clause += ['{"match": {"identifier._value":"%d"}}' % pmid]
        
        year = np.nan
        if paper["year"] is not None:
            if np.isnan(paper["year"]) == False:
                year = int(paper["year"])
                must_clause += ['{"match": {"pub_info._pubyear":"%d"}}' % year]

        if (len(must_clause) + len(should_clause)) == 1:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        elif len(should_clause) == 0:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        else:
            query = (
                '{"size":5,"query": { "bool":{ "must":[%s], "should":[%s], "minimum_should_match": 1 } }}'
                % (",".join(must_clause), ",".join(should_clause))
            )
        paper_queried["title"] = original_title
        paper_queried["author_list"] = original_author_list
        paper_queried["pmid"] = pmid
        paper_queried["year"] = year

        return query, paper_queried
    
    def parse_response(_response):
        """
        Parse the response from the ElasticSearch
        
        Parameters
        ----------
        _response: response
            response from the ElasticSearch
        
        Returns
        -------
        results: list
            List of matches. Each match is a dict object containing authors, title, journal, identifier, year and score.
        """

        _response = json.loads(_response)
        try:
            hits = _response.get("hits", None)
        except:
            print(_response)
        #print(_response)
        if hits is None:
            return []

        hits = hits["hits"]
        results = []
        for hit in hits:
            score = hit["_score"]
            doc = hit["_source"]
            journal = ""
            title = ""
            #print(doc.get("titles", None))
            if doc.get("titles", None) is not None:
                for d in doc["titles"]["title"]:
                    if d["_type"] == "source":
                        journal = d["_VALUE"]
                    if d["_type"] == "item":
                        title = d["_VALUE"]

            identifier = doc.get("identifier", [[]])
            UID = doc.get("UID", "")

            authors = []
            if doc.get("name", None) is not None:
                for d in doc["name"]:
                    authors += [d["display_name"]]

            year = doc.get("pub_info", [{"_pubyear": np.nan}])["_pubyear"]
            #print(doc.get("pub_info", [{"_pubyear": np.nan}]))
            results += [
                {
                    "authors": authors,
                    "title": str(title),
                    "journal": journal,
                    "identifier": identifier,
                    "year": year,
                    "score": score,
                    "UID": UID,
                }
            ]
        return results

    def calculate_similarity(results, paper_queried):
        """
        Calculate the similarity between the hit and query
        
        Parameters
        ----------
        results: list
            result given by parse_response 
        paper_queried:
            dict object that contains the metadata to make the query
            
        Returns
        -------
        matches: list
            List of dict. Each dict consists of "match" and "rank", where match contains the information on the paper found by ElasticSarch
            The "rank" indicates the rank of the match starting from 0 (best match)
        """
        if len(results) == 0:
            return []

        # Compute the similarity for the secondary check
        for rid, res in enumerate(results):
            year_similarity = - np.abs(
                paper_queried["year"]
                - (
                    paper_queried["year"]
                    if np.isnan(res.get("year"))
                    else res.get("year")
                )
            )
            
            pmid_similarity = - np.abs(
                paper_queried["pmid"]
                - (
                    paper_queried["pmid"]
                    if not res.get("identifier") in ["pmid"]
                    else res.get("identifier")["_value"]
                )
            )
            title_similarity = fuzz.ratio(
                paper_queried["title"], res.get("title", " ").lower()
            )
            authorlist = " ".join(paper_queried["author_list"])
            #print(rid, authorlist)
            if not authorlist:  #catch empty authorlist
                author_similarity = -1
            else:
                author_similarity = fuzz.token_sort_ratio(
                    authorlist,
                    " ".join(res.get("authors", [])).lower(),
                )
            results[rid]["score"] = {
                "title": title_similarity,
                "author": author_similarity,
                "year": pmid_similarity,
            }

        results = sorted(
            results,
            key=lambda x: -(
                x["score"]["title"]
                + x["score"]["author"]
                + x["score"]["year"]
            ),
        )
        
        #if results[0]["score"]["title"] < 80:
        best_hit = results[0]
        if len(results) > 1:
            second_hit = results[1]
        else:
            second_hit = {"score": {"title": -1, "author": -1, "year": -1}}

        matches = [{"match": best_hit, "rank": 0}, {"match": second_hit, "rank": 1}]
        return matches

    #
    # Main routine
    #
    query, paper_queried = make_query_pmid(paper)
    #print(query)
    try:
        headers = {
            "Content-Type": "application/json",
        }
        semaphore = asyncio.Semaphore(1)
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    es_end_point, headers=headers, data=query.encode("utf-8")
                ) as response:
                    results0 = parse_response(await response.text())
                    results = calculate_similarity(results0, paper_queried)
                    return results

    except Exception as e:
        logger.error("Unable to get due to {}.".format(e.__class__))
        return -1
    
async def paper2doi(paper_list, es_end_point):
    semaphore = asyncio.Semaphore(10)
    async def sem_task(task):
        async with semaphore:
            await task
    ret = await asyncio.gather(*[fetch(paper, es_end_point) for paper in paper_list])
    return ret

In [20]:
meta = pd.read_csv("/N/project/rcsc/raw_data/mag-2020-09-01/CORD19-09-01/CORD-19-MAGinsts-LitCovid.csv", escapechar='\\')
meta.drop(["year"], inplace=True, axis=1) 
meta.count()

cord_uid                 242164
Unnamed: 0               242164
sha                       94862
source_x                 242164
title                    242108
doi                      158784
pmcid                    101673
pubmed_id                139396
license                  242164
abstract                 171771
publish_time             242107
authors                  234852
journal                  227648
mag_id                        0
who_covidence_id          68828
arxiv_id                   3197
pdf_json_files            94862
pmc_json_files            74137
url                      173596
s2_id                    210817
pmcAuthors                74137
pmcAffiliation            74137
pdfAuthors                89969
pdfAffiliation            89969
pmid                      65978
annotations               65978
MAGids                   167331
authorids                167331
authorOrders             167331
affiliationids           167331
affiliationNormalized    167331
affiliat

In [50]:
try:
    authors = []
    inputList = eval(meta.loc[22,"pmcAuthors"])
    for author in inputList:
        authors.append(author["last"]+", "+author["first"][0])              
except:
    try: 
        authors = []
        inputList = eval(meta.loc[22,"pdfAuthors"])
        for author in inputList:
            authors.append(author["last"]+", "+author["first"][0])      
    except:
        authors = []
authors

['De Groot, A']

In [10]:
from datetime import datetime

es_end_point = "http://{user}:{password}@{endpoint}".format(
        user="yan30", password="", endpoint="iuni2.carbonate.uits.iu.edu:9200/wos_covid/_search/"
    )

# Initialize the counter
first_write = True
paper_count = 0
identified_count = 0
    
for papers in pd.read_csv("/N/project/rcsc/raw_data/mag-2020-09-01/CORD19-09-01/CORD-19-MAGinsts-LitCovid.csv", escapechar='\\', chunksize=500):
        # Convert pandas data frame into list
        paper_list = papers.to_dict("records")
        for i in range(len(paper_list)):
            pid = papers.index[i]
            try:
                authors = []
                inputList = eval(papers.loc[pid,"pmcAuthors"])
                for author in inputList:
                    authors.append(author["last"]+", "+author["first"][0])              
            except:
                try: 
                    authors = []
                    inputList = eval(papers.loc[pid,"pdfAuthors"])
                    for author in inputList:
                        authors.append(author["last"]+", "+author["first"][0])      
                except:
                    authors = []
            try:    
                date = datetime.strptime(papers.loc[pid,"publish_time"], "%Y-%m-%d")
            except:
                date = datetime.strptime("2005-01-01", "%Y-%m-%d")
            
            paper_list[i]["authors"] = authors
            paper_list[i]["pid"] = pid
            paper_list[i]["year"] = date.year
            try: 
                paper_list[i]["pmid_CORD"] = int(float(papers.loc[pid,"pubmed_id"]))
            except:
                paper_list[i]["pmid_CORD"] = np.nan
            
            for new_column in ["doi", "pmid", "xref_doi", "art_no"]:
                paper_list[i][new_column] = ""

        # Search
        loop = asyncio.get_event_loop()
        result = asyncio.run(paper2doi(paper_list, es_end_point))        
        # Set identifier if found
        for i in range(len(result)):
            for match in result[i]:  # result contans the best match and secondary math
                if match["rank"] == 0:  # if the match is the best match
                    for identifier in match["match"]["identifier"]:
                        if identifier: #catch empty identifier list
                            if identifier["_type"] in ["doi", "pmid", "xref_doi", "art_no"]:
                                paper_list[i][identifier["_type"]] = identifier["_value"]
                    paper_list[i]["UID"] = match["match"]["UID"]
                    paper_list[i]["WoStitle"] = match["match"]["title"]
                    paper_list[i]["WoSauthors"] = match["match"]["authors"]
                    paper_list[i]["WoSjournal"] = match["match"]["journal"]
                    identified_count += 1
                for sim_type in ["title", "author", "year"]:
                    paper_list[i]["score_%s_%d" % (sim_type, match["rank"])] = match[
                        "match"
                    ]["score"][sim_type]
                    
        papers = pd.DataFrame(paper_list)
        # Save this chunk
        if first_write:
            first_write = False
            papers.to_csv("testOutput.csv", index=False, mode="w")
        else:
            papers.to_csv("testOutput.csv", index=False, header=False, mode="a")

        # Logging
        paper_count += papers.shape[0]
        info = "{identified}/{total} identified".format(
            identified=identified_count, total=paper_count
        )
        logger.info(info)
#papers

238/500 identified
481/1000 identified
725/1500 identified
979/2000 identified
1194/2500 identified
1430/3000 identified
1659/3500 identified
1879/4000 identified
2095/4500 identified
2335/5000 identified
2563/5500 identified
2792/6000 identified
3028/6500 identified
3254/7000 identified
3492/7500 identified
3716/8000 identified
3941/8500 identified
4167/9000 identified
4383/9500 identified
4627/10000 identified
4850/10500 identified
5085/11000 identified
5301/11500 identified
5531/12000 identified
5766/12500 identified
5979/13000 identified
6209/13500 identified
6441/14000 identified
6671/14500 identified
6926/15000 identified
7162/15500 identified
7401/16000 identified
7652/16500 identified
7896/17000 identified
8147/17500 identified
8374/18000 identified
8590/18500 identified
8825/19000 identified
9046/19500 identified
9279/20000 identified
9513/20500 identified
9745/21000 identified
9978/21500 identified
10202/22000 identified
10428/22500 identified
10643/23000 identified
10888/235

In [4]:
# for aiohttp.client_exceptions.ClientConnectorError'> debugging
#first_write = True
#paper_count = 0
#identified_count = 0
for papers in pd.read_csv("/N/project/rcsc/raw_data/mag-2020-09-01/CORD19-09-01/CORD-19-MAGinsts-LitCovid.csv", escapechar='\\', chunksize=1000, skiprows=range(1, 100000), nrows = 200000):
        # Convert pandas data frame into list
        paper_list = papers.to_dict("records")
        for i in range(len(paper_list)):
            pid = papers.index[i]
            try:
                authors = []
                inputList = eval(papers.loc[pid,"pdfAuthors"])
                for author in inputList:
                    authors.append(author["last"]+", "+author["first"][0])              
            except:
                authors = []
            try:    
                date = datetime.strptime(papers.loc[pid,"publish_time"], "%Y-%m-%d")
            except:
                date = datetime.strptime("9999-01-01", "%Y-%m-%d")
            
            paper_list[i]["authors"] = authors
            paper_list[i]["pid"] = pid
            paper_list[i]["year"] = date.year
            
            for new_column in ["doi", "pmid", "xref_doi", "art_no"]:
                paper_list[i][new_column] = ""

        # Search
        loop = asyncio.get_event_loop()
        result = asyncio.run(paper2doi(paper_list, es_end_point))        
        # Set identifier if found
        for i in range(len(result)):
            for match in result[i]:  # result contans the best match and secondary math
                if match["rank"] == 0:  # if the match is the best match
                    for identifier in match["match"]["identifier"]:
                        if identifier: #catch empty identifier list
                            if identifier["_type"] in ["doi", "pmid", "xref_doi", "art_no"]:
                                paper_list[i][identifier["_type"]] = identifier["_value"]
                    paper_list[i]["UID"] = match["match"]["UID"]
                    identified_count += 1

                for sim_type in ["title", "author", "year"]:
                    paper_list[i]["score_%s_%d" % (sim_type, match["rank"])] = match[
                        "match"
                    ]["score"][sim_type]
                    
        papers2 = pd.DataFrame(paper_list)
        # Save this chunk
        if first_write:
            first_write = False
            papers2.to_csv("testOutput2.csv", index=False, mode="w")
        else:
            papers2.to_csv("testOutput2.csv", index=False, header=False, mode="a")

        # Logging
        paper_count += papers2.shape[0]
        info = "{identified}/{total} identified".format(
            identified=identified_count, total=paper_count
        )
        logger.info(info)
        
#matched = pd.read_csv("testOutput.csv")
#matched.drop(matched.tail(1).index,inplace=True) 
#matched2 = pd.read_csv("testOutput2.csv")
#WoSmatched = pd.concat([matched, matched2], axis=0)

NameError: name 'datetime' is not defined

In [92]:
meta0 = pd.read_csv('testOutput.csv')
meta0.drop(["year", "pid"], inplace=True, axis=1) 
#papers.dtypes
meta0.groupby('score_year_0').count()

Unnamed: 0_level_0,cord_uid,Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,...,art_no,UID,WoStitle,WoSauthors,WoSjournal,score_title_0,score_author_0,score_title_1,score_author_1,score_year_1
score_year_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,12128,12128,9584,12128,12128,10743,10383,12128,12128,9214,...,2935,12128,12128,12128,12128,12128,12128,12128,12128,12128


In [97]:
meta1.loc[(meta1["score_year_0"]!=0)]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,GRIDids,pmid_CORD,xref_doi,art_no,UID,WoSauthors,WoSjournal,score_title_0,score_author_0,score_year_0
0,00a19z5i,8e3673efd4b53eabbbe603faddc34a111de80484,Elsevier; Medline; PMC,Testing the modularity of the N-terminal amphi...,,PMC7111807,1.62268e+07,els-covid,Abstract The N-terminal region of the picornav...,2006-01-20,...,grid.10419.3d;grid.94365.3d;grid.6612.3;grid.6...,1.62268e+07,,,,,,,,
1,02rcmt0g,,Medline,Activation of Egr-1 expression in astrocytes b...,,,2.04147e+07,unk,Human immunodeficiency virus type 1 (HIV-1) Ta...,2011,...,grid.257410.5;grid.257410.5;grid.257410.5;grid...,2.04147e+07,,,,,,,,
2,034w5afv,5b6bbda2fc6183d26b212e74e697d6e389278c3a,Medline; PMC,Dissolution Advantage of Nitazoxanide Cocrysta...,,PMC7022799,3.18817e+07,cc-by,The effect of hydroxypropyl methylcellulose (H...,2019-12-25,...,grid.412873.b;grid.412873.b;grid.412873.b;grid...,3.18817e+07,,,,,,,,
3,03er0xjy,,Medline,PANI/BaFe12O19@Halloysite ternary composites a...,,,3.28187e+07,unk,A three-phase PANI/BaFe12O19@Hal heterostructu...,2020-08-06,...,grid.216417.7;grid.216417.7;grid.216417.7;grid...,3.28187e+07,,,,,,,,
4,04uf62g7,,Medline,Thin-section computed tomography findings in 1...,,,2.82737e+07,unk,"Background To date, there has been no computed...",2017,...,grid.460942.d,2.82737e+07,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242159,zz01t9fq,,PMC,Editor's Note,10.1007/s10489-020-01721-4,PMC7273714,,cc-by-nc-nd,,2020-06-05,...,,,,,WOS:000530233800002,['[Anonymous]'],APPLIED INTELLIGENCE,37.0,-1.0,
242160,zz3walri,109fba60f390ba2be39c77da6e04266294a8540f,Medline; PMC,Pandemic response protocol of a non-frontline ...,,PMC7255825,3.24682e+07,no-cc,"BACKGROUND: 3,181,642 cases and 224,301 deaths...",2020-05-29,...,,3.24682e+07,,,,,,,,
242161,zz74v7n3,,WHO,Cardiovascular manifestations in severe and cr...,,,,unk,,2020,...,grid.8547.e;grid.8547.e;grid.8547.e;grid.8547....,,,,WOS:000235640000003,"['Sheng, WH', 'Chiang, BL', 'Chang, SC', 'Ho, ...",JOURNAL OF THE FORMOSAN MEDICAL ASSOCIATION,51.0,-1.0,
242162,zz74v7n3,,WHO,Cardiovascular manifestations in severe and cr...,,,,unk,BACKGROUND: Severe acute respiratory syndrome ...,2020,...,grid.8547.e;grid.8547.e;grid.8547.e;grid.8547....,,,,WOS:000235640000003,"['Sheng, WH', 'Chiang, BL', 'Chang, SC', 'Ho, ...",JOURNAL OF THE FORMOSAN MEDICAL ASSOCIATION,51.0,-1.0,


In [109]:
meta1 = meta0.drop(["Unnamed: 0", "score_title_1", "score_author_1", "score_year_1"], axis=1)
#meta1.loc[meta1["pmid_CORD"].isnull(),["UID","WoSauthors","WoSjournal"]] = [np.NaN,np.NaN,np.NaN]
meta1.loc[(meta1["score_year_0"]==0),"pmid_CORD"] = "pmidMatch"
meta1.loc[meta1["pmid_CORD"].isnull(),"pmid_CORD"] = np.NaN
meta1.count()

cord_uid                 242164
sha                       94862
source_x                 242164
title                    242108
doi                       74870
pmcid                    101673
pubmed_id                139396
license                  242164
abstract                 171771
publish_time             242107
authors                  242164
journal                  227648
mag_id                        0
who_covidence_id          68828
arxiv_id                   3197
pdf_json_files            94862
pmc_json_files            74137
url                      173596
s2_id                    210817
pmcAuthors                74137
pmcAffiliation            74137
pdfAuthors                89969
pdfAffiliation            89969
pmid                      92400
annotations               65978
MAGids                   167331
authorids                167331
authorOrders             167331
affiliationids           167331
affiliationNormalized    167331
affiliationNames         167331
GRIDids 

In [110]:
meta1.loc[(meta1["score_year_0"]!=0)&((meta1["score_title_0"]<70)|(meta1["score_author_0"]<70)),["pmid_CORD","UID","WoSauthors","WoSjournal"]] = [np.NaN,np.NaN,np.NaN,np.NaN]
meta1.count()

cord_uid                 242164
sha                       94862
source_x                 242164
title                    242108
doi                       74870
pmcid                    101673
pubmed_id                139396
license                  242164
abstract                 171771
publish_time             242107
authors                  242164
journal                  227648
mag_id                        0
who_covidence_id          68828
arxiv_id                   3197
pdf_json_files            94862
pmc_json_files            74137
url                      173596
s2_id                    210817
pmcAuthors                74137
pmcAffiliation            74137
pdfAuthors                89969
pdfAffiliation            89969
pmid                      92400
annotations               65978
MAGids                   167331
authorids                167331
authorOrders             167331
affiliationids           167331
affiliationNormalized    167331
affiliationNames         167331
GRIDids 

In [99]:
meta1.astype(str).groupby(['UID']).count().sort_values('cord_uid',ascending=False)

Unnamed: 0_level_0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,affiliationNames,GRIDids,pmid_CORD,xref_doi,art_no,WoSauthors,WoSjournal,score_title_0,score_author_0,score_year_0
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,229774,229774,229774,229774,229774,229774,229774,229774,229774,229774,...,229774,229774,229774,229774,229774,229774,229774,229774,229774,229774
WOS:000528949200002,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
WOS:000529753800001,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
WOS:000525326200063,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
WOS:000230291500035,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOS:000347590900009,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
WOS:000347596500010,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
WOS:000347601900017,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
WOS:000347601900019,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [111]:
meta1[(meta1['UID'].notnull())][["title", "WoStitle", "score_title_0", "authors", "WoSauthors", "score_author_0", "journal", "WoSjournal", "doi", "pmid_CORD"]].to_csv("testOutput4.csv", index=False, mode="w")

In [112]:
meta1.to_csv("CORD-19-MAGinsts-LitCovid-WoS.csv", index=False, mode="w")

In [None]:
def issue_paper_ids():
    """
    Paper ID generator
    """
    paper_id = 0
    while True:
        yield paper_id
        paper_id = paper_id + 1


def extract_from_json(
    filename,
    paper_id_generator,
    focal_field_list=[
        "issn",
        "authors",
        "cord_paper_id",
        "paper_id",
    ],
):
    """
    Extract the bibliography from the json file
    """
    with open(filename, "r") as f:
        data = json.load(f)

    paper = {f: None for f in focal_field_list}
    paper["cord_paper_id"] = data["paper_id"]
    paper["paper_id"] = next(paper_id_generator) 
    paper["authors"] = data["metadata"]["authors"] 

    raw_references = [
        v
        for k, v in data["bib_entries"].items()
        if ("BIBREF" in k) and v["year"] is not None
    ]  # Extract all references
    references = []
    for i, raw_ref in enumerate(raw_references):
        ref = {fi: raw_ref.get(fi, None) for fi in focal_field_list}
        doi = raw_ref["other_ids"].get("DOI", [None])
        ref["DOI"] = doi[0] if len(doi) > 0 else None
        references += [ref]

    for i, r in enumerate(references):
        references[i]["paper_id"] = next(paper_id_generator)

    return paper, references

In [None]:
os.chdir("/N/project/rcsc/raw_data/2020_09_02/")
os.getcwd()
meta = pd.read_csv('metadata.csv')
meta
#paper, references = extract_from_json(meta['pmc_json_files'].iloc[0], paper_id_generator)

In [None]:
metaCount = meta.groupby(['source_x']).count()
metaCount.sort_values('cord_uid',ascending=False)

In [None]:
meta0Count = meta0.groupby(['source_x']).count()
meta0Count.sort_values('cord_uid',ascending=False)

In [None]:
pd.set_option('display.max_colwidth', 2)
meta['pdf_json_files']

In [None]:
def extract_meta_json(filename):
    """
    Extract the metadata from the json file
    """
    try:
        with open(filename, "r") as f:
            data = json.load(f)
        return data["metadata"]["authors"]
    except OSError as e:
        return None

meta["pmcAuthors"] = meta['pmc_json_files'].dropna().apply(extract_meta_json)

def get_Affiliations(authorList):
    """
    Extract the metadata from the json file
    """
    AffiliationsList = [d.get('affiliation') for d in authorList]
    return AffiliationsList
#get_Affiliations(meta['pmcAuthors'].iloc[0])  
meta['pmcAffiliation'] = meta['pmcAuthors'].dropna().apply(get_Affiliations)

In [None]:
pd.reset_option('display.max_colwidth')
meta

In [None]:
meta

In [None]:
#get_Affiliations(meta['pmcAuthors'].iloc[0])  
meta['pdfAuthors'] = meta['pdf_json_files'].dropna().apply(extract_meta_json)
#meta['pdfAffiliation'] = meta['pdfAuthors'].dropna().apply(get_Affiliations)
meta

In [None]:
#meta['pdfAffiliation'] = meta['pdfAuthors'].dropna().apply(get_Affiliations)
meta.to_csv("/N/project/rcsc/shared_space/RCSCdata/metaInstitutes-09-01.csv")
#paper_refs.to_csv("papersPDFrefs.csv")

In [None]:
meta = pd.read_csv("/N/project/rcsc/raw_data/mag-2020-09-01/CORD19-09-01/CORD-19-MAGinsts-LitCovid.csv", escapechar='\\', encoding='utf-8')
meta

In [11]:
LitCovid = pd.read_csv("/N/project/rcsc/shared_space/RCSCdata/09132020.litcovid.export.tsv", sep = "\t")
LitCovid

Unnamed: 0,pmid,title,journal
0,32916755,Clinical endodontic management during the COVI...,Int Endod J
1,32916744,[COVID-19 associated pneumonia despite repeate...,Pneumologie
2,32916743,[Renin-Angiotensin-System (RAS) and COVID-19 -...,Pneumologie
3,32916697,"RE: ""COVID-19 IN HEALTH-CARE WORKERS: A LIVING...",Am J Epidemiol
4,32916692,"Covid19: Unless one gets everyone to act, poli...",PLoS One
...,...,...,...
14459,32148172,Timely development of vaccines against SARS-Co...,Emerg Microbes Infect
14460,32147944,Clinical strategies for treating pediatric can...,Pediatr Blood Cancer
14461,32147890,Emergency Management of the Prevention and Con...,Acad Emerg Med
14462,32147731,Diagnosis and Management of First Case of COVI...,Clin Infect Dis


In [None]:
LitMatched = pd.merge(LitCovid, meta, left_on=['pmid'], right_on=['pubmed_id'], how='left')
LitMatched.count()

In [None]:
LitMatchedT = pd.merge(LitCovid, meta, left_on=['title'], right_on=['title'], how='left')
LitMatchedT.count()

In [None]:
import json

with open('/N/project/rcsc/raw_data/litcovid2pubtator-2020-09-13.json') as f:
    LitCovidPub = json.load(f)
# Output: {'name': 'Bob', 'languages': ['English', 'Fench']}
LitCovidPub[1][10]

In [None]:
LitCovidPub[1][100]

In [None]:
count = 0
for key in LitCovidPub[1]:
    count += 1
    #print(key)
count

In [None]:
rows = [] 
# appending rows
i = 0
for key in LitCovidPub[1]:
    row = {}
    row['pmid'] = key['pmid']
    row['year'] = key['year']
    row['annotations'] = key['accessions']
    rows.append(row) 
df = pd.DataFrame(rows) 
df

In [None]:
metaLit = pd.merge(meta, df, left_on=['pubmed_id'], right_on=['pmid'], how='left').applymap(str)
temp = metaLit[['cord_uid','pmid','year','annotations']].applymap(str)
temp['pmid'] = metaLit.groupby(['cord_uid'])['pmid'].transform(lambda x: ';'.join(x)).replace('nan', np.nan)
#temp['year'] = metaLit.groupby(['cord_uid'])['year'].transform(lambda x: ';'.join(x)).replace('nan', np.nan)
temp['annotations'] = metaLit.groupby(['cord_uid'])['annotations'].transform(lambda x: ';'.join(x)).replace('nan', np.nan)
temp2 = temp.drop_duplicates()
temp2.count()

In [None]:
metaLit2 = pd.merge(meta, temp2, left_on=['cord_uid'], right_on=['cord_uid'], how='left')
metaLit2.count()

In [None]:
#metaLit2.to_csv("/N/project/rcsc/shared_space/RCSCdata/metaLitCovid-09-13.csv")

In [None]:
LitPubMatched = pd.merge(df, meta, left_on=['pmid'], right_on=['pubmed_id'], how='left')
len(LitPubMatched.pmid.value_counts())

In [None]:
Matched = pd.merge(LitCovid, df, left_on=['pmid'], right_on=['pmid'], how='left')
Matched.count()

In [None]:
paper_id_generator = issue_paper_ids()
paper_refs = []
papers = []
for filename in tqdm(FILE_LIST):

    # Extract bibliography from json
    paper, references = extract_from_json(filename, paper_id_generator)

    # Record the citations between the paper and its references
    paper_refs += [(paper["paper_id"], ref["paper_id"]) for ref in references]

    # Record extracted papers
    papers += [pd.DataFrame([paper] + references)]

# Pack into the pandas DataFrame
papers = pd.concat(papers, ignore_index=True)
paper_refs = pd.DataFrame(paper_refs, columns=["source", "target"])

# Save
papers.to_csv("papersPMC.csv")
paper_refs.to_csv("papersPMCrefs.csv")

In [None]:
f = open('document_parses/pdf_json/6b0567729c2143a66d737eb0a2f63f2dce2e5a7d.json', "r")
json.load(f)

In [None]:
import requests

resp = requests.get('https://api.semanticscholar.org/v1/paper/d1aafb70c066a2068b02786f8929fd9c900897fb')
if resp.status_code != 200:
    # This means something went wrong.
    raise ApiError('GET /tasks/ {}'.format(resp.status_code))
#resp.json()['references']
resp.json()

def get_SS_json(url):
    """
    Extract the metadata from SemanticScholar REST API
    """
    resp = requests.get('https://api.semanticscholar.org/v1/paper/'+url)
    if resp.status_code != 200:
        # This means something went wrong.
        #return None
        return ('status Codes: '+str(resp.status_code))
    else :    
        return resp.json()['references']

meta["SSRefs"] = meta['sha'].dropna().apply(get_SS_json)

In [None]:
import requests

es_end_point = "http://{user}:{password}@{endpoint}".format(
        user="yan30", password="", endpoint="iuni2.carbonate.uits.iu.edu:9200/wos_covid/_search/"
    )

resp = requests.get(es_end_point)
if resp.status_code != 200:
    # This means something went wrong.
    raise ApiError('GET /tasks/ {}'.format(resp.status_code))
#resp.json()['references']
resp.json()