In [1]:
import pandas as pd
import json
import glob
from tqdm import tqdm
import sys, os
from fuzzywuzzy import fuzz
import requests
import aiohttp
import asyncio
import nest_asyncio
import numpy as np

In [3]:
nest_asyncio.apply()
import logging

# Set up the logger
logger = logging.getLogger("WOS-MATCH")
sh = logging.StreamHandler()
logger.addHandler(sh)
logger.setLevel(20)

In [9]:
async def fetch(paper, es_end_point):

    #
    # Sub routines
    #
    def make_query(paper):
        """
        Make a query for the ElasticSearch
        
        Parameters
        ----------
        paper: dict 
            with keys, title, authors, year
        
        Returns
        -------
        query: str
            query for the ElasticSearch
        paper_queried: dict
            paper metadata to make the query
        """

        paper_queried = {"title": "", "author_list": [], "year": np.nan}

        original_title = str(paper["title"]).lower()
        original_author_list = paper.get("authors", [])
        original_authors = " ".join(
            original_author_list[: np.minimum(3, len(original_author_list))]
        )
        
        # Initialize clause place holders
        should_clause = []
        must_clause = []

        # Title
        title_query = '{"match": {"titles.title._VALUE":"%s"}}' % original_title
        must_clause += [title_query]
        
        # Authors
        for name in original_author_list:
            should_clause += ['{"match": {"name.display_name":"%s"}}' % name]
        
        # Year of publication
        year = np.nan
        if paper["year"] is not None:
            if np.isnan(paper["year"]) == False:
                year = int(paper["year"])
                must_clause += ['{"match": {"pub_info._pubyear":"%d"}}' % year]

        if (len(must_clause) + len(should_clause)) == 1:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        elif len(should_clause) == 0:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        else:
            query = (
                '{"size":5,"query": { "bool":{ "must":[%s], "should":[%s], "minimum_should_match": 1 } }}'
                % (",".join(must_clause), ",".join(should_clause))
            )
        paper_queried["title"] = original_title
        paper_queried["author_list"] = original_author_list
        paper_queried["year"] = year

        return query, paper_queried
    
    def make_query_pmid(paper):
        """
        Make a query for the ElasticSearch
        
        Parameters
        ----------
        paper: dict 
            with keys, title, authors, year
        
        Returns
        -------
        query: str
            query for the ElasticSearch
        paper_queried: dict
            paper metadata to make the query
        """
        paper_queried = {"title": "", "author_list": [], "pmid": np.nan, "year": np.nan}

        original_title = str(paper["title"]).lower()
        original_author_list = paper.get("authors", [])
        original_authors = " ".join(
            original_author_list[: np.minimum(3, len(original_author_list))]
        )
        
        # Initialize clause place holders
        should_clause = []
        must_clause = []

        # Title
        title_query = '{"match": {"titles.title._VALUE":"%s"}}' % original_title
        must_clause += [title_query]
        
        # Authors
        for name in original_author_list:
            should_clause += ['{"match": {"name.display_name":"%s"}}' % name]
        
        # Matching by pmid
        pmid = np.nan
        if paper["pmid_CORD"] is not None:
            if np.isnan(paper["pmid_CORD"]) == False:
                pmid = int(paper["pmid_CORD"])
                must_clause += ['{"match": {"identifier._value":"%d"}}' % pmid]
        
        year = np.nan
        if paper["year"] is not None:
            if np.isnan(paper["year"]) == False:
                year = int(paper["year"])
                must_clause += ['{"match": {"pub_info._pubyear":"%d"}}' % year]

        if (len(must_clause) + len(should_clause)) == 1:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        elif len(should_clause) == 0:
            # query = '{"size":%d,"query": %s }' %  (num, title_query)
            query = '{"size":5,"query": { "bool":{ "must":[%s] } }}' % (
                ",".join(must_clause)
            )
        else:
            query = (
                '{"size":5,"query": { "bool":{ "must":[%s], "should":[%s], "minimum_should_match": 1 } }}'
                % (",".join(must_clause), ",".join(should_clause))
            )
        paper_queried["title"] = original_title
        paper_queried["author_list"] = original_author_list
        paper_queried["pmid"] = pmid
        paper_queried["year"] = year

        return query, paper_queried
    
    def parse_response(_response):
        """
        Parse the response from the ElasticSearch
        
        Parameters
        ----------
        _response: response
            response from the ElasticSearch
        
        Returns
        -------
        results: list
            List of matches. Each match is a dict object containing authors, title, journal, identifier, year and score.
        """

        _response = json.loads(_response)
        try:
            hits = _response.get("hits", None)
        except:
            print(_response)
        #print(_response)
        if hits is None:
            return []

        hits = hits["hits"]
        results = []
        for hit in hits:
            score = hit["_score"]
            doc = hit["_source"]
            journal = ""
            title = ""
            #print(doc.get("titles", None))
            if doc.get("titles", None) is not None:
                for d in doc["titles"]["title"]:
                    if d["_type"] == "source":
                        journal = d["_VALUE"]
                    if d["_type"] == "item":
                        title = d["_VALUE"]

            identifier = doc.get("identifier", [[]])
            UID = doc.get("UID", "")

            authors = []
            if doc.get("name", None) is not None:
                for d in doc["name"]:
                    authors += [d["display_name"]]

            year = doc.get("pub_info", [{"_pubyear": np.nan}])["_pubyear"]
            #print(doc.get("pub_info", [{"_pubyear": np.nan}]))
            results += [
                {
                    "authors": authors,
                    "title": str(title),
                    "journal": journal,
                    "identifier": identifier,
                    "year": year,
                    "score": score,
                    "UID": UID,
                }
            ]
        return results

    def calculate_similarity(results, paper_queried):
        """
        Calculate the similarity between the hit and query
        
        Parameters
        ----------
        results: list
            result given by parse_response 
        paper_queried:
            dict object that contains the metadata to make the query
            
        Returns
        -------
        matches: list
            List of dict. Each dict consists of "match" and "rank", where match contains the information on the paper found by ElasticSarch
            The "rank" indicates the rank of the match starting from 0 (best match)
        """
        if len(results) == 0:
            return []

        # Compute the similarity for the secondary check
        for rid, res in enumerate(results):
            year_similarity = - np.abs(
                paper_queried["year"]
                - (
                    paper_queried["year"]
                    if np.isnan(res.get("year"))
                    else res.get("year")
                )
            )
            
            pmid_similarity = - np.abs(
                paper_queried["pmid"]
                - (
                    paper_queried["pmid"]
                    if not res.get("identifier") in ["pmid"]
                    else res.get("identifier")["_value"]
                )
            )
            title_similarity = fuzz.ratio(
                paper_queried["title"], res.get("title", " ").lower()
            )
            authorlist = " ".join(paper_queried["author_list"])
            #print(rid, authorlist)
            if not authorlist:  #catch empty authorlist
                author_similarity = -1
            else:
                author_similarity = fuzz.token_sort_ratio(
                    authorlist,
                    " ".join(res.get("authors", [])).lower(),
                )
            results[rid]["score"] = {
                "title": title_similarity,
                "author": author_similarity,
                "year": pmid_similarity,
            }

        results = sorted(
            results,
            key=lambda x: -(
                x["score"]["title"]
                + x["score"]["author"]
                + x["score"]["year"]
            ),
        )
        
        #if results[0]["score"]["title"] < 80:
        best_hit = results[0]
        if len(results) > 1:
            second_hit = results[1]
        else:
            second_hit = {"score": {"title": -1, "author": -1, "year": -1}}

        matches = [{"match": best_hit, "rank": 0}, {"match": second_hit, "rank": 1}]
        return matches

    #
    # Main routine
    #
    query, paper_queried = make_query_pmid(paper)
    #print(query)
    try:
        headers = {
            "Content-Type": "application/json",
        }
        semaphore = asyncio.Semaphore(1)
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    es_end_point, headers=headers, data=query.encode("utf-8")
                ) as response:
                    results0 = parse_response(await response.text())
                    results = calculate_similarity(results0, paper_queried)
                    return results

    except Exception as e:
        logger.error("Unable to get due to {}.".format(e.__class__))
        return -1
    
async def paper2doi(paper_list, es_end_point):
    semaphore = asyncio.Semaphore(10)
    async def sem_task(task):
        async with semaphore:
            await task
    ret = await asyncio.gather(*[fetch(paper, es_end_point) for paper in paper_list])
    return ret

In [10]:
from datetime import datetime

es_end_point = "http://{user}:{password}@{endpoint}".format(
        user="yan30", password="", endpoint="iuni2.carbonate.uits.iu.edu:9200/wos_covid/_search/"
    )

# Initialize the counter
first_write = True
paper_count = 0
identified_count = 0
    
for papers in pd.read_csv("/N/project/rcsc/raw_data/mag-2020-09-01/CORD19-09-01/CORD-19-MAGinsts-LitCovid.csv", escapechar='\\', chunksize=500):
        # Convert pandas data frame into list
        paper_list = papers.to_dict("records")
        for i in range(len(paper_list)):
            pid = papers.index[i]
            try:
                authors = []
                inputList = eval(papers.loc[pid,"pmcAuthors"])
                for author in inputList:
                    authors.append(author["last"]+", "+author["first"][0])              
            except:
                try: 
                    authors = []
                    inputList = eval(papers.loc[pid,"pdfAuthors"])
                    for author in inputList:
                        authors.append(author["last"]+", "+author["first"][0])      
                except:
                    authors = []
            try:    
                date = datetime.strptime(papers.loc[pid,"publish_time"], "%Y-%m-%d")
            except:
                date = datetime.strptime("2005-01-01", "%Y-%m-%d")
            
            paper_list[i]["authors"] = authors
            paper_list[i]["pid"] = pid
            paper_list[i]["year"] = date.year
            try: 
                paper_list[i]["pmid_CORD"] = int(float(papers.loc[pid,"pubmed_id"]))
            except:
                paper_list[i]["pmid_CORD"] = np.nan
            
            for new_column in ["doi", "pmid", "xref_doi", "art_no"]:
                paper_list[i][new_column] = ""

        # Search
        loop = asyncio.get_event_loop()
        result = asyncio.run(paper2doi(paper_list, es_end_point))        
        # Set identifier if found
        for i in range(len(result)):
            for match in result[i]:  # result contans the best match and secondary math
                if match["rank"] == 0:  # if the match is the best match
                    for identifier in match["match"]["identifier"]:
                        if identifier: #catch empty identifier list
                            if identifier["_type"] in ["doi", "pmid", "xref_doi", "art_no"]:
                                paper_list[i][identifier["_type"]] = identifier["_value"]
                    paper_list[i]["UID"] = match["match"]["UID"]
                    paper_list[i]["WoStitle"] = match["match"]["title"]
                    paper_list[i]["WoSauthors"] = match["match"]["authors"]
                    paper_list[i]["WoSjournal"] = match["match"]["journal"]
                    identified_count += 1
                for sim_type in ["title", "author", "year"]:
                    paper_list[i]["score_%s_%d" % (sim_type, match["rank"])] = match[
                        "match"
                    ]["score"][sim_type]
                    
        papers = pd.DataFrame(paper_list)
        # Save this chunk
        if first_write:
            first_write = False
            papers.to_csv("testOutput.csv", index=False, mode="w")
        else:
            papers.to_csv("testOutput.csv", index=False, header=False, mode="a")

        # Logging
        paper_count += papers.shape[0]
        info = "{identified}/{total} identified".format(
            identified=identified_count, total=paper_count
        )
        logger.info(info)
#papers

238/500 identified
481/1000 identified
725/1500 identified
979/2000 identified
1194/2500 identified
1430/3000 identified
1659/3500 identified
1879/4000 identified
2095/4500 identified
2335/5000 identified
2563/5500 identified
2792/6000 identified
3028/6500 identified
3254/7000 identified
3492/7500 identified
3716/8000 identified
3941/8500 identified
4167/9000 identified
4383/9500 identified
4627/10000 identified
4850/10500 identified
5085/11000 identified
5301/11500 identified
5531/12000 identified
5766/12500 identified
5979/13000 identified
6209/13500 identified
6441/14000 identified
6671/14500 identified
6926/15000 identified
7162/15500 identified
7401/16000 identified
7652/16500 identified
7896/17000 identified
8147/17500 identified
8374/18000 identified
8590/18500 identified
8825/19000 identified
9046/19500 identified
9279/20000 identified
9513/20500 identified
9745/21000 identified
9978/21500 identified
10202/22000 identified
10428/22500 identified
10643/23000 identified
10888/235

In [None]:
def issue_paper_ids():
    """
    Paper ID generator
    """
    paper_id = 0
    while True:
        yield paper_id
        paper_id = paper_id + 1


def extract_from_json(
    filename,
    paper_id_generator,
    focal_field_list=[
        "issn",
        "authors",
        "cord_paper_id",
        "paper_id",
    ],
):
    """
    Extract the bibliography from the json file
    """
    with open(filename, "r") as f:
        data = json.load(f)

    paper = {f: None for f in focal_field_list}
    paper["cord_paper_id"] = data["paper_id"]
    paper["paper_id"] = next(paper_id_generator) 
    paper["authors"] = data["metadata"]["authors"] 

    raw_references = [
        v
        for k, v in data["bib_entries"].items()
        if ("BIBREF" in k) and v["year"] is not None
    ]  # Extract all references
    references = []
    for i, raw_ref in enumerate(raw_references):
        ref = {fi: raw_ref.get(fi, None) for fi in focal_field_list}
        doi = raw_ref["other_ids"].get("DOI", [None])
        ref["DOI"] = doi[0] if len(doi) > 0 else None
        references += [ref]

    for i, r in enumerate(references):
        references[i]["paper_id"] = next(paper_id_generator)

    return paper, references

In [None]:
def extract_meta_json(filename):
    """
    Extract the metadata from the json file
    """
    try:
        with open(filename, "r") as f:
            data = json.load(f)
        return data["metadata"]["authors"]
    except OSError as e:
        return None

meta["pmcAuthors"] = meta['pmc_json_files'].dropna().apply(extract_meta_json)

def get_Affiliations(authorList):
    """
    Extract the metadata from the json file
    """
    AffiliationsList = [d.get('affiliation') for d in authorList]
    return AffiliationsList
#get_Affiliations(meta['pmcAuthors'].iloc[0])  
meta['pmcAffiliation'] = meta['pmcAuthors'].dropna().apply(get_Affiliations)

In [2]:
meta0 = pd.read_csv('/N/project/rcsc/raw_data/mag-2020-09-01/CORD19-09-01/CORD-19-MAGinsts-LitCovid-WoS.csv')
meta0.count()

  interactivity=interactivity, compiler=compiler, result=result)


cord_uid                 242164
sha                       94862
source_x                 242164
title                    242108
doi                       74870
pmcid                    101673
pubmed_id                139396
license                  242164
abstract                 171771
publish_time             242107
authors                  242164
journal                  227648
mag_id                        0
who_covidence_id          68828
arxiv_id                   3197
pdf_json_files            94862
pmc_json_files            74137
url                      173596
s2_id                    210817
pmcAuthors                74137
pmcAffiliation            74137
pdfAuthors                89969
pdfAffiliation            89969
pmid                      92400
annotations               65978
MAGids                   167331
authorids                167331
authorOrders             167331
affiliationids           167331
affiliationNormalized    167331
affiliationNames         167331
GRIDids 

In [11]:
gTruth = pd.read_csv('VincentDisambiguation.txt', escapechar='\\', sep="\t")
gTruth["WoSid"] = "WOS:"+gTruth["UT_Code"]
gTruth.count()

ID_researcher    961033
UT_Code          961033
author_order     961033
Author_name      961033
WoSid            961033
dtype: int64

In [15]:
paperCount = gTruth.groupby(['WoSid']).count()
paperCount.sort_values('ID_researcher',ascending=False)

Unnamed: 0_level_0,ID_researcher,UT_Code,author_order,Author_name
WoSid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WOS:000313336500004,53,53,53,53
WOS:000314407400017,51,51,51,51
WOS:000322785400002,50,50,50,50
WOS:000320609000001,50,50,50,50
WOS:000321208400075,50,50,50,50
...,...,...,...,...
WOS:000262646600009,1,1,1,1
WOS:000262646600007,1,1,1,1
WOS:000262646600005,1,1,1,1
WOS:000262646500063,1,1,1,1


In [16]:
paperMatched = pd.merge(meta0, paperCount, left_on=['UID'], right_on=['WoSid'], how='left')
paperMatched.count()

cord_uid                 242164
sha                       94862
source_x                 242164
title                    242108
doi                       74870
pmcid                    101673
pubmed_id                139396
license                  242164
abstract                 171771
publish_time             242107
authors                  242164
journal                  227648
mag_id                        0
who_covidence_id          68828
arxiv_id                   3197
pdf_json_files            94862
pmc_json_files            74137
url                      173596
s2_id                    210817
pmcAuthors                74137
pmcAffiliation            74137
pdfAuthors                89969
pdfAffiliation            89969
pmid                      92400
annotations               65978
MAGids                   167331
authorids                167331
authorOrders             167331
affiliationids           167331
affiliationNormalized    167331
affiliationNames         167331
GRIDids 

In [18]:
paperMatched.dropna(subset=['UT_Code'],inplace=True)
paperMatched.count()

cord_uid                 5863
sha                       415
source_x                 5863
title                    5863
doi                      3172
pmcid                     559
pubmed_id                 246
license                  5863
abstract                 3431
publish_time             5859
authors                  5863
journal                  5659
mag_id                      0
who_covidence_id         5272
arxiv_id                    3
pdf_json_files            415
pmc_json_files            299
url                       597
s2_id                    5556
pmcAuthors                299
pmcAffiliation            299
pdfAuthors                401
pdfAffiliation            401
pmid                     5528
annotations              1641
MAGids                   3220
authorids                3220
authorOrders             3220
affiliationids           3220
affiliationNormalized    3220
affiliationNames         3220
GRIDids                  3112
xref_doi                 1889
art_no    

In [None]:
import requests

resp = requests.get('https://api.semanticscholar.org/v1/paper/d1aafb70c066a2068b02786f8929fd9c900897fb')
if resp.status_code != 200:
    # This means something went wrong.
    raise ApiError('GET /tasks/ {}'.format(resp.status_code))
#resp.json()['references']
resp.json()

def get_SS_json(url):
    """
    Extract the metadata from SemanticScholar REST API
    """
    resp = requests.get('https://api.semanticscholar.org/v1/paper/'+url)
    if resp.status_code != 200:
        # This means something went wrong.
        #return None
        return ('status Codes: '+str(resp.status_code))
    else :    
        return resp.json()['references']

meta["SSRefs"] = meta['sha'].dropna().apply(get_SS_json)

In [None]:
import requests

es_end_point = "http://{user}:{password}@{endpoint}".format(
        user="yan30", password="", endpoint="iuni2.carbonate.uits.iu.edu:9200/wos_covid/_search/"
    )

resp = requests.get(es_end_point)
if resp.status_code != 200:
    # This means something went wrong.
    raise ApiError('GET /tasks/ {}'.format(resp.status_code))
#resp.json()['references']
resp.json()