## Guidelines

Follow these guidelines to get the list of entities which have cited retracted articles.
* Run all the (Pre) blocks 
* Run block (1) to read the list of retracted articles from a CSV dataset
* Run block (2) to get the citing entities from MAG
* Run block (3) to dump the results of the previous step


In [2]:
import requests
import json
import pandas as pd
import os
import sys
import urllib.parse
import nltk
import string

## MAG configurations

In [3]:
## MICROSOFT ACADEMIC GRAPH APPROACH
#https://www.microsoft.com/en-us/research/project/academic-knowledge/

#AUTH
# Credentials are read from the environment when available
# (MAG_USERNAME / MAG_PKEY / MAG_SKEY); the literals below are kept only as
# backward-compatible fallbacks.
# NOTE(review): the fallback values are real-looking secrets committed to the
# notebook -- rotate them and drop the defaults once the environment is set up.
USERNAME = os.environ.get("MAG_USERNAME", "acdh2020")
PKEY = os.environ.get("MAG_PKEY", "6d1214a4c1f140eeab9606ab63671e83")
SKEY = os.environ.get("MAG_SKEY", "a518ae1f71bd4811a7e9d25e621736e2")


#CALLS
# Base URL of the "evaluate" endpoint; the query expression is appended to it.
MAG_API_QUERY = "https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr="
HEADERS = {
    "Host": "api.labs.cognitive.microsoft.com",
    "Ocp-Apim-Subscription-Key": PKEY
}

# Query expression templates. "<VAR>" is replaced by a single value, while
# "<VAR_0>", "<VAR_1>", ... are replaced positionally from a list of values.
#NOTE Write the &attributes in sorted order
EXPR = {
    "magid":"Id=<VAR>",
    "title":"Ti='<VAR>'",
    "doi":"DOI='<VAR>'",
    "rid":"RId=<VAR>",
    "author": "Composite(AA.AuN='<VAR>')",
    "composite": "And(Ti='<VAR_0>',Composite(AA.AuN='<VAR_1>'),Y=<VAR_2>)",
}

# For each result type: (attributes query suffix, list of extraction paths).
# The i-th path is used to fill the i-th attribute named in the suffix, so the
# two must be listed in the same order. In a path, "<ALL>" means "every element
# of the list at this level".
RESULTS ={
    "title": ("&attributes=Ti",[["entities"]]),
    "magid": ("&attributes=Id",[["entities","<ALL>","Id"]]),
    "meta": ("&attributes=DOI,D,DN,AA.DAuN,PB,VFN",[["entities","<ALL>","DOI"],["entities","<ALL>","D"],["entities","<ALL>","DN"],["entities",0,"AA","<ALL>","DAuN"],["entities","<ALL>","PB"],["entities","<ALL>","VFN"]]),
#    "author": "&attributes=AA.AuN,AA.AuId,AA.DAfN",
#    "affiliation": "&attributes=AA.DAfN,AW,DOI,F.FN",
    "citations_num": ("&attributes=CC", [["entities",0,"CC"]])
}

# Human-readable labels for the numeric publication type code.
Pt_mapping = {0:"Unknown", 1:"Journal article", 2:"Patent", 3:"Conference paper", 4:"Book chapter", 5:"Book", 6:"Book reference entry", 7:"Dataset", 8:"Repository"}

# TESTING MAG: COMMENT
#r = requests.get(MAG_API_QUERY + EXPR["doi"].replace("<VAR>","10.1007/s11192-019-03217-6")+ RESULTS["citation_context"], headers=HEADERS) 
#json.loads(r.text)
#query = f"expr=And(Ti='{title}')&attributes=Id,DOI,AA.AuN,AA.AuId,Y,RId"
#url_mag = f"https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?{query}"

## COCI and MAG methods

In [23]:
##https://opencitations.net/index/coci/api/v1/citations/10.1002/adfm.201505328
# Log of the MAG request URLs built by call_mag: each URL is appended here
# before the paging parameters (count/offset) are added to it.
LOG_CALLS = []
def get_results(req_json, attr):
    """Extract a value from a decoded JSON structure by following a path.

    Parameters:
        req_json: the decoded JSON value (dict / list), or the error sentinel
            "REQUEST_ERROR".
        attr: non-empty list of path steps. Each step is a dict key, a list
            index, or the wildcard "<ALL>", which applies the remaining steps
            to every element of the current value.

    Returns:
        * ["REQUEST_ERROR"] when req_json is (or contains) the error sentinel;
        * a list with one entry per element when the path goes through "<ALL>";
        * the value found at the end of the path otherwise;
        * None when a key/index along the path is missing, at any depth, or
          when req_json is not a container at all.
    """
    try:
        if "REQUEST_ERROR" in req_json:
            return ["REQUEST_ERROR"]
    except TypeError:
        # Not a container (e.g. None or a number): nothing can be extracted.
        return None

    step, remaining = attr[0], attr[1:]

    if step == "<ALL>":
        if len(remaining) == 0:
            return req_json
        try:
            return [get_results(item, remaining) for item in req_json]
        except TypeError:
            return None

    try:
        value = req_json[step]
    except (KeyError, IndexError, TypeError):
        # Missing keys are reported the same way for intermediate and final
        # steps, so callers only have to deal with None.
        return None

    if len(remaining) == 0:
        return value
    return get_results(value, remaining)

def call_mag(q_val, expr = "doi", results_type = "citations"):
    """Query the MAG evaluate endpoint and collect the requested attributes.

    Parameters:
        q_val: value substituted into the EXPR[expr] template. A list fills
            "<VAR_0>", "<VAR_1>", ... positionally; anything else fills "<VAR>".
        expr: key of EXPR selecting the query template. An empty string
            disables the call.
        results_type: key of RESULTS selecting the attributes to request and
            how to extract them, or None to get the raw pages back.
            NOTE(review): the default "citations" is not among the RESULTS
            keys visible in this notebook, so callers have to pass this
            argument explicitly (a KeyError is raised otherwise).

    Returns:
        * None when expr is "";
        * {"value": [page, ...]} when results_type is None;
        * otherwise a dict mapping each requested attribute name to the list of
          values gathered over all pages. The lists may contain the string
          "REQUEST_ERROR" when a request failed.

    Every request URL (without paging parameters) is appended to LOG_CALLS.
    """
    PAGE_SIZE = 10

    # returns a list of results in json format. E.G. [{},{},{}...]
    # the list might contain also the value REQUEST_ERROR
    def fetch_all_pages(req_call):
        pages = []
        offset_num = 0
        # Iterative paging: no recursion limit to hit on long result sets.
        while True:
            try:
                get_call = req_call + "&count=" + str(PAGE_SIZE) + "&offset=" + str(offset_num * PAGE_SIZE)
                r = requests.get(get_call, headers=HEADERS)
                r_json = json.loads(r.text)
            except Exception:
                pages.append("REQUEST_ERROR")
                return pages

            if not isinstance(r_json, dict):
                pages.append("REQUEST_ERROR")
                return pages
            if len(r_json.keys()) == 0:
                return pages
            if "entities" not in r_json:
                # An answer without "entities" (e.g. an error payload) never
                # becomes empty, so paging further would not terminate.
                pages.append("REQUEST_ERROR")
                return pages
            if len(r_json["entities"]) == 0:
                return pages

            pages.append(r_json)
            offset_num += 1

    if expr == "":
        return None

    q_results_att = ""
    if results_type is not None:
        q_results_att = RESULTS[results_type][0]

    # Build the query expression from the template
    if isinstance(q_val, list):
        expressions = EXPR[expr]
        for v_index, a_q_val in enumerate(q_val):
            expressions = expressions.replace("<VAR_" + str(v_index) + ">", str(a_q_val))
    else:
        expressions = EXPR[expr].replace("<VAR>", str(q_val))

    req_call = MAG_API_QUERY + expressions + q_results_att
    LOG_CALLS.append(req_call)
    r_json_list = fetch_all_pages(req_call)

    # Raw mode: decided before RESULTS is indexed with the result type
    if results_type is None:
        return {"value": r_json_list}

    # Define the keys in the results object
    keys_att = q_results_att.replace("&attributes=", "").split(",")
    results = {k: [] for k in keys_att}

    # Populate the results object
    for r_json in r_json_list:
        for index, att_result in enumerate(RESULTS[results_type][1]):
            extracted = get_results(r_json, att_result)
            if extracted is None:
                continue
            if not isinstance(extracted, list):
                # Scalar attributes (e.g. a count) are stored as one-item lists
                extracted = [extracted]
            results[keys_att[index]] += extracted

    return results

def call_coci_citaions(doi):
    """Return the DOIs of the entities citing `doi` according to COCI.

    Parameters:
        doi: the DOI of the cited entity, as a string.

    Returns:
        The list of the "citing" values found in the COCI answer, or the string
        "unknown" when no usable DOI was given or the request/decoding failed.
    """
    res = "unknown"
    # Missing values (None, NaN, ...) are treated like an empty DOI
    if not isinstance(doi, str) or doi == "":
        return res

    req_call = "https://opencitations.net/index/coci/api/v1/citations/"+doi
    try:
        r = requests.get(req_call)
        r_json = json.loads(r.text)
        res = get_results(r_json, ["<ALL>","citing"])
    except Exception:
        # Best effort: failures keep the "unknown" marker. Only Exception is
        # caught so that the surrounding loop can still be interrupted.
        pass
    return res

def call_coci_metadata(doi):
    """Return the COCI metadata record of `doi`.

    Parameters:
        doi: the DOI to look up, as a string.

    Returns:
        The first element of the COCI answer (None when the answer has no
        first element), or the string "unknown" when no usable DOI was given or
        the request/decoding failed.
    """
    res = "unknown"
    # Missing values (None, NaN, ...) are treated like an empty DOI
    if not isinstance(doi, str) or doi == "":
        return res

    req_call = "https://opencitations.net/index/coci/api/v1/metadata/"+doi
    try:
        r = requests.get(req_call)
        r_json = json.loads(r.text)
        res = get_results(r_json, [0])
    except Exception:
        # Best effort: failures keep the "unknown" marker. Only Exception is
        # caught so that the surrounding loop can still be interrupted.
        pass
    return res

## 1) Read all the items from the csv

In [None]:
import csv

# Load every row of the dataset as a dict keyed by the column names found in
# the first line of the file.
items_l = []
# newline='' leaves line-ending handling to the csv module, so that quoted
# fields containing line breaks are parsed correctly.
with open('retracted_items.csv', mode='r', newline='') as file:
    csvFile = csv.reader(file)
    header = next(csvFile, None)  # None when the file is empty
    if header is not None:
        for row in csvFile:
            if not row:
                # csv.reader yields [] for blank lines: nothing to store
                continue
            items_l.append({h: row[index] for index, h in enumerate(header)})

## 2) Get citations from MAG

In [None]:
# For every retracted item: find its MAG id(s), then the MAG ids of the entities
# citing it. The results are written back into the row dicts of items_l.
# STARTING_INDEX allows resuming from a given position of items_l.
STARTING_INDEX = 0
for index,row in enumerate(items_l[STARTING_INDEX:]):
    print("Progress: "+str(STARTING_INDEX + index), end="\r", flush=True)
    ids = None

    try:
        # Check if there is a DOI value
        if row["doi"] != "":
            res = call_mag(row["doi"].upper(), expr = "doi", results_type="magid")
        # In case no DOI found we query MAG using Title + Authors + Year of publication
        else:
            title = urllib.parse.quote(row["title"].lower().translate(str.maketrans('', '', string.punctuation)))
            author = urllib.parse.quote(row["authors"].lower().replace(";;",", ").translate(str.maketrans('', '', string.punctuation)))
            # presumably the date is formatted as dd/mm/yyyy -- TODO confirm
            year = row["date"][6:]
            res = call_mag([title,author,year],expr = "composite",results_type="magid")
        ids = res["Id"]
    except Exception:
        # if any error occur when searching for the retracted entity -> it is labelled as "unavailable" below
        ids = None

    # Normalise to a list of usable ids: missing values and the error marker
    # must not be used as MAG ids in the next queries
    if ids is None:
        ids = []
    elif not isinstance(ids, list):
        ids = [ids]
    ids = [str(t) for t in ids if t is not None and t != "REQUEST_ERROR"]

    # In this case no MAGID found for the retracted article
    if len(ids) == 0:
        row["mag_id"] = "unavailable"
        row["mag_citing_entities"] = "unavailable"
        row["mag_num_cits"] = "unavailable"
        continue

    row["mag_id"] = ";;".join(ids)

    # For each MAGID we get its citing entities (i.e. MAGIDs)
    set_results = set()
    for an_id in ids:
        res = call_mag(an_id, expr = "rid", results_type="magid")
        if res["Id"] != None:
            # A failed page leaves the error marker in the list: it is not a
            # citing entity, so the count may be partial in that case
            set_results.update(c for c in res["Id"] if c is not None and c != "REQUEST_ERROR")

    row["mag_num_cits"] = len(set_results)
    row["mag_citing_entities"] = ";;".join([str(a) for a in set_results])

## 3) Dump the results of (2)

In [None]:
# Turn the enriched rows into a table and store it as 'mag_cits.csv'
# (the index column is not written).
df_items = pd.DataFrame(data=items_l)
df_items.to_csv(path_or_buf='mag_cits.csv', index=False)

## 4) Collect the citing entities metadata 

In [None]:
l_mag_citing = []
# Everything is read as text: identifiers must not be turned into floats by
# the type inference of read_csv.
df_mag = pd.read_csv('mag_cits.csv', dtype=str)
df_mag = df_mag[["mag_id","mag_citing_entities","date","retraction_date"]]
# Keep only the items having both a MAG id and at least one citing entity;
# rows labelled "unavailable" carry no id that could be looked up.
df_mag = df_mag[df_mag.mag_id.notnull() & df_mag.mag_citing_entities.notnull()]
df_mag = df_mag[(df_mag.mag_id != "unavailable") & (df_mag.mag_citing_entities != "unavailable")]
# Work on an independent copy so that the column assignments below do not
# target a view of the frame read from disk.
df_mag = df_mag.copy()
# "id1;;id2;;..." -> ["id1", "id2", ...]
df_mag["mag_citing_entities"] = df_mag["mag_citing_entities"].apply(lambda x : str(x).split(";;"))
# Keep what follows the first 6 characters of each date
# (presumably the year of a dd/mm/yyyy date -- TODO confirm)
df_mag["date"] = df_mag["date"].apply(lambda x : str(x)[6:])
df_mag["retraction_date"] = df_mag["retraction_date"].apply(lambda x : ";; ".join([a[6:] for a in str(x).split(";; ")]))

In [None]:
# For every citing MAG id of every retracted item, fetch its metadata from MAG
# and append one record to l_mag_citing.
EXCLUDE = "2151119690"
STARTING_INDEX = 0
# Values that may appear in the id lists without being MAG ids
NOT_AN_ID = {"", "nan", "None", "unavailable", "REQUEST_ERROR"}

def _first_or_empty(values):
    """Return the first element of `values`, or "" when it is empty."""
    return values[0] if len(values) > 0 else ""

for index,row in enumerate(df_mag.values[STARTING_INDEX:]):
    print("Progress: "+str(STARTING_INDEX + index + 1)+ " out of "+str(len(df_mag.values)), end="\r", flush=True)
    # row = [mag_id, mag_citing_entities, date, retraction_date]
    if str(row[0]) == EXCLUDE:
        continue
    for citing_magid in row[1]:
        if str(citing_magid) in NOT_AN_ID:
            continue
        res = call_mag(citing_magid,expr = "magid",results_type="meta")
        # Author names missing in the answer come back as None: they are
        # left out so that the join cannot fail
        authors = [a for a in res["AA.DAuN"] if isinstance(a, str)]
        l_mag_citing.append({
            "citing_doi": _first_or_empty(res["DOI"]),
            "citing_publication_date": _first_or_empty(res["D"]),
            "citing_title": _first_or_empty(res["DN"]),
            "citing_author": " ;; ".join(authors),
            "citing_venue": _first_or_empty(res["VFN"]),
            "citing_publisher": _first_or_empty(res["PB"]),
            "cited_magid": row[0],
            "cited_publication_year": row[2],
            "cited_retraction_year": row[3],
        })
    

## 5) Dump the results of (4)

In [None]:
# Store the metadata of the citing entities as 'mag_cits_meta.csv'.
df_items = pd.DataFrame(l_mag_citing)
# to_csv returns None when given a path: its result must not replace df_items
df_items.to_csv('mag_cits_meta.csv',index=False)

## 6) Get citations from COCI

In [11]:
# Items to look up in COCI: only those having a DOI. The "date" column is
# selected twice on purpose, since the next cell reads the columns by position
# (2 -> publication date, 5 -> publication year).
df_items = (
    pd.read_csv('mag_cits.csv')
    [["title","doi","date","authors","mag_id","date","retraction_date"]]
    .dropna(subset=["doi"])
)

In [12]:
# For every item having a DOI, ask COCI for its citing DOIs and store one
# record per item in l_coci_citing.
l_coci_citing = []
for index,row in enumerate(df_items.values):
    print("Progress: "+str(index + 1)+ " out of "+str(len(df_items.values)), end="\r", flush=True)
    res = call_coci_citaions(row[1])
    if isinstance(res, list):
        citing_entities = [c for c in res if isinstance(c, str)]
        coci_citing_entities = ";; ".join(citing_entities)
        coci_num_citations = len(citing_entities)
    else:
        # The lookup failed (the string "unknown" is returned): joining that
        # string would interleave the separator between its characters
        coci_citing_entities = None
        coci_num_citations = None
    # NOTE(review): positions 5 and 6 hold the same values as "date" and
    # "retraction_date" in mag_cits.csv, i.e. they are not reduced to a year
    # here -- confirm whether they should be
    l_coci_citing.append(
        {
            "cited_title": row[0],
            "cited_doi": row[1],
            "cited_publication_date": row[2],
            "cited_author": row[3],
            "cited_mag_id": row[4],
            "cited_publication_year": row[5],
            "cited_retraction_year": row[6],
            "coci_citing_entities": coci_citing_entities,
            "coci_num_citations": coci_num_citations
        }
    )

Progress: 264 out of 264

### -> Dump the results

In [13]:
# Store the COCI citation records as 'coci_cits.csv' (no index column).
df_coci = pd.DataFrame(data=l_coci_citing)
df_coci.to_csv(path_or_buf='coci_cits.csv', index=False)

## 7) Compare COCI with MAG and collect citing entities metadata

In [14]:
# Lower-cased DOIs of all the citing entities found through MAG
df_mag_cits = pd.read_csv('mag_cits_meta.csv')
df_mag_cits = df_mag_cits["citing_doi"].dropna()
mag_citing_doi = {a_doi.lower() for a_doi in df_mag_cits}
# COCI records having at least one citing entity
df_coci_cits = pd.read_csv('coci_cits.csv')
df_coci_cits = df_coci_cits.dropna(subset=["coci_citing_entities"])

In [None]:
# For every citing DOI found by COCI whose DOI is not among the MAG citing
# DOIs, fetch its metadata from COCI. DOIs whose metadata could not be
# retrieved are collected in `errors`.
total = 0
not_in_mag = []
coci_dois = set()
errors = []
for index,row in enumerate(df_coci_cits.values):
    # position 7 -> "coci_citing_entities"
    citing_dois = row[7].split(";; ")
    total += len(citing_dois)
    print("Progress: "+str(index + 1)+ " out of "+str(len(df_coci_cits.values))+" (total citations elaborated = "+str(total)+")", end="\r", flush=True)
    for a_doi in citing_dois:
        if a_doi.lower() in mag_citing_doi:
            continue
        try:
            print("get metadata from coci of: "+str(a_doi))
            res = call_coci_metadata(a_doi)
            not_in_mag.append(
                {
                    "citing_doi": a_doi,
                    "citing_title": res["title"] if res["title"] else "",
                    "citing_author": res["author"],
                    "citing_publication_year": res["year"],
                    "cited_doi": row[1],
                    "cited_publication_year": row[5],
                    "cited_retraction_year": row[6]
                }
            )
        except Exception:
            # Only Exception is caught: interrupting the kernel still stops
            # the loop instead of being recorded as a failed DOI
            print("Error with DOI: "+str(a_doi))
            errors.append(a_doi)

### -> Dump the results

In [27]:
# Store the metadata collected in `not_in_mag` as a CSV file (no index column).
# NOTE(review): the file written here is 'coci_cits_meta(not_in_mag)_errors.csv',
# while the comparison cell reads 'coci_cits_meta(not_in_mag).csv' -- confirm
# which name is intended, otherwise a run on a clean folder fails there.
df_coci_not_in_mag = pd.DataFrame(not_in_mag)
df_coci_not_in_mag.to_csv('coci_cits_meta(not_in_mag)_errors.csv',index=False) 

## --> Make a comparison

In [29]:
# Load the two metadata tables to compare; the comment above each read lists
# the expected columns of the file.
# NOTE(review): 'coci_cits_meta(not_in_mag).csv' is not the name used by the
# dump cell above ('coci_cits_meta(not_in_mag)_errors.csv') -- confirm.
#citing_doi	citing_publication_date	citing_title	citing_author	citing_venue	citing_publisher	cited_magid	cited_publication_year	cited_retraction_year
df_mag_cits_meta = pd.read_csv('mag_cits_meta.csv')
#citing_doi	citing_title	citing_author	citing_publication_year	cited_doi	cited_publication_year	cited_retraction_year
df_coci_cits_meta = pd.read_csv('coci_cits_meta(not_in_mag).csv')

In [52]:
# Look for entities that are in both tables under different DOIs, by comparing
# their titles: two titles match when the words they share are at least
# PERCENTAGE of the words of the shorter title.
print("MAG cits: ", len(df_mag_cits_meta),"\nCOCI cits: ", len(df_coci_cits_meta)) 
s_compare = set()
mag_title = list(df_mag_cits_meta["citing_title"])
# Missing titles are read as NaN (a float) and cannot be split into words
mag_title = [set(t.split(" ")) for t in mag_title if isinstance(t, str)]
print("list of: ",len(mag_title))

PERCENTAGE = 0.8
# The words of each COCI title (column 1) are computed once, not once per MAG title
coci_title = [(row[1], set(row[1].split(" "))) for row in df_coci_cits_meta.values if isinstance(row[1], str)]
for title in mag_title:
    for a_coci_title, coci_words in coci_title:
        int_set = coci_words.intersection(title)
        # Both sizes are numbers of distinct words, so that the ratio is
        # relative to the shorter of the two titles
        if len(int_set)/min(len(coci_words),len(title)) >= PERCENTAGE:
            print(a_coci_title,title,int_set)
            break

MAG cits:  816 
COCI cits:  417
list of:  816
0.8333333333333334 0.8
Redistribution Preferences And Low Socioeconomic Status {'Low', 'and', 'Preferences', 'Status', 'Socioeconomic', 'Redistribution'} {'Low', 'Preferences', 'Status', 'Socioeconomic', 'Redistribution'}
0.875 0.8
A Smart Audio On Demand Application On Android Systems {'Smart', 'Demand', 'A', 'on', 'Systems', 'Android', 'Audio', 'Application'} {'Smart', 'Demand', 'A', 'Systems', 'Android', 'Audio', 'Application'}


## DRAFT

In [None]:
# Scratch check of the composite (title + author + year) lookup.
# Unlike the lookup of section 2, punctuation is not stripped from the title
# and the author before they are URL-quoted.
title = urllib.parse.quote("'Killing the Dead' in Surmanci: About the Local Sources of “the War” in Bosnia".lower())
author = urllib.parse.quote("Max Bax".lower())
call_mag([title,author,1996],expr = "composite",results_type="magid")

In [None]:
# Scratch check of the DOI lookup (the DOI is upper-cased, as in section 2).
# Other DOIs used for testing:
# COCI: 10.1007/S11192-019-03217-6
# Fabio: 10.1016/J.WEBSEM.2012.08.001 
# 10.1207/s15326934crj1801_10

call_mag("10.1007/s11059-012-0120-6".upper(),expr = "doi",results_type="magid")
#call_mag(2343320592,expr = "magid",results_type="title")

In [None]:
# Scratch check of the metadata lookup for a single MAG id.
#2130836170
call_mag("2130836170",expr = "magid",results_type="meta")

In [None]:
# Scratch check of the composite lookup with the same normalisation as in
# section 2: lower-case, punctuation removed, then URL-quoted.
title = urllib.parse.quote("Must the Relation of Substantial Composition Be a Mode William of Ockham’s Answers".lower().translate(str.maketrans('', '', string.punctuation)))
author = urllib.parse.quote("Magali Roques".lower().replace(";;",", ").translate(str.maketrans('', '', string.punctuation)))
year = "2018"
call_mag([title,author,year],expr = "composite",results_type="magid")