In [4]:
import csv
import re
import requests
import os
from datetime import datetime
import pandas as pd 
import numpy as np
from collections import defaultdict
import pprint

# A methodology for the generation and annotation of an in-text citations collection based on entities citing a retracted document: the Wakefield et al. retraction case

The construction of the in-text reference dataset is based on four phases; each phase enriches the final dataset with new attributes. The following table shows, for each phase, its input and the generated attributes to embed in the dataset.  

![](img/tab.png)

In [5]:
## Useful functions
def write_list(l,file_path, header= True, initial_pos= 0):
    """Write a list of dictionaries to ``file_path`` as comma-separated text.

    The file is always created (or truncated). When ``l`` is empty the file
    is left empty: no header and no rows are written.

    Args:
        l: list of dictionaries, one per output row. The header is taken from
            the keys of ``l[0]``; each row is written in the order of its own
            keys, so all dictionaries are expected to share the same keys.
        file_path: path of the file to write.
        header: when true, the keys of ``l[0]`` are written (unquoted) as the
            first line.
        initial_pos: index of the first element of ``l`` written as a row.

    Every value is converted with ``str`` and wrapped in double quotes; double
    quotes inside a value are doubled so the field stays a single CSV field
    when the file is read back.
    """
    # The context manager guarantees the handle is flushed and closed, even on
    # error. UTF-8 matches the default encoding pandas.read_csv assumes.
    with open(file_path, "w+", encoding="utf-8") as f:
        if len(l) == 0:
            return

        #header
        if header:
            f.write(",".join(str(k_header) for k_header in l[0].keys()) + "\n")

        #content
        for l_index in range(initial_pos, len(l)):
            row = l[l_index]
            quoted_values = [
                '"' + str(row[k_att]).replace('"', '""') + '"'
                for k_att in row
            ]
            f.write(",".join(quoted_values) + "\n")
        
def merge_two_dicts(x, y):
    """Return a shallow copy of ``x`` updated with the entries of ``y``.

    Neither argument is modified. When a key is present in both, the value
    from ``y`` is the one kept in the result.
    """
    merged = x.copy()
    merged.update(y)
    return merged

def df_to_dict_list(a_df, extra_keys = {}, fields = []):
    """Convert the rows of a dataframe into a list of dictionaries.

    Args:
        a_df: dataframe whose rows are converted, in iteration order.
        extra_keys: key/value pairs added to every produced dictionary; on a
            key clash they take precedence over the value read from the row.
        fields: names of the columns to copy from each row; names that are
            not present in the row are skipped.

    Returns:
        One dictionary per row. A dataframe without rows yields an empty
        list (no placeholder/header entry is generated).
    """
    records = []
    for _, row in a_df.iterrows():
        picked = {name: row[name] for name in fields if name in row}
        records.append(merge_two_dicts(picked, extra_keys))
    return records

In [7]:
# CONSTANTS
# Relative paths of the CSV files this notebook writes and later reads back.
# Final dataset: the merge of all the step results
CITS = "data/cits.csv"
# Step A: metadata of the citing documents (doi, title, year, source_id, source_title)
STEP_A = "data/step_a.csv"
# Step B: ISSN and ISBN source sheets combined into one file
STEP_B = "data/step_b.csv"
# Step B.1: sources whose source_id starts with "issn"
STEP_B_1 = "data/step_b_1.csv"
# Step B.2: sources whose source_id starts with "isbn"
STEP_B_2 = "data/step_b_2.csv"
# Step C: one row per citing DOI with the abstract / section / context columns
STEP_C = "data/step_c.csv"
# Step D: doi + context with the "mention_retraction" column
STEP_D = "data/step_d.csv"
# Step E: doi + context with the "intent" and "sentiment" columns
STEP_E = "data/step_e.csv"

## A) Identifying and retrieving the resources 
#### Input: the retracted article by Wakefield et al.
* Use the DOI value of the retracted article: 10.1016/S0140-6736(97)11096-0 

#### Output: Creates a dataset with the following attributes (A.1) DOI; (A.2) year of publication; (A.3) title; (A.4) ISSN/ISBN; (A.5) whether it is retracted
* Query the COCI dataset ([https://opencitations.net/index/coci](https://opencitations.net/index/coci)) to get (A.1), (A.2), (A.3), (A.4). At the time of this elaboration we used the latest available version of COCI: the November 2018 dump (46,534,705 bibliographic resources, 445,826,118 citation links).  
**Note:** all the COCI API calls used refer to the same version.
* Query RetractionWatch database ([http://retractiondatabase.org/](http://retractiondatabase.org/))

In [6]:
# DOI of the retracted article whose citing documents are collected
RET_ART_DOI = "10.1016/S0140-6736(97)11096-0"
# Base URL of the COCI REST API; the operation name and the queried value are appended to it
COCI_API = "https://opencitations.net/index/coci/api/v1/"

def call_api_coci(operation, vals, fields, params=""):
    """Query the COCI REST API once per value and collect the answers.

    For each value the URL ``COCI_API + operation + "/" + value + params`` is
    requested and only the first record of the JSON answer is considered.

    Args:
        operation: name of the API operation (e.g. ``"metadata"``).
        vals: iterable of values to query (DOIs in this notebook). It is
            read only; the caller's list is left untouched.
        fields: ``"*"`` to keep the whole first record, otherwise an iterable
            of field names to extract; a requested field that is missing from
            the record is set to ``None``.
        params: string appended verbatim to the URL (e.g. a query string).

    Returns:
        A dictionary mapping each queried value to its extracted data. A value
        for which the API returns an empty answer is mapped to ``{}``. If a
        value appears more than once, the result of its last request is kept.
    """
    results = {}

    # Iterate over a snapshot instead of popping from `vals` and recursing:
    # the caller's list is not emptied as a side effect, and long lists no
    # longer need one stack frame per value (which hits the recursion limit).
    for val_key in list(vals):
        item = {}
        r = requests.get(COCI_API + str(operation) + "/" + str(val_key) + str(params))
        records = r.json()  # parse the body once per response
        if len(records) > 0:
            first_record = records[0]
            if fields == "*":
                item = first_record
            else:
                for f in fields:
                    item[f] = first_record[f] if f in first_record else None
        results[val_key] = item

    return results

# All the citations in COCI
# NOTE(review): the `json=array(...)` parameter presumably makes the API return
# the "citation" field as a list of DOIs (it is sliced and iterated below) - confirm.
ret_meta = call_api_coci(
    "metadata",
    [RET_ART_DOI],
    ["citation"],
    '?json=array("; ",citation,doi)',
)
coci_cits = ret_meta[RET_ART_DOI]["citation"]

# ---- <TEST> ----- COMMENT  
# Keeps only the first 10 citing DOIs; comment this out for a full run
coci_cits = coci_cits[:10]
# ---- </TEST> ----- COMMENT  

# Get the metadata of citing document ("*" keeps every field of the record)
coci_cits_meta = call_api_coci("metadata", coci_cits, "*")

#write the partial results of this step: one row per citing document
step_a_data = [
    {
        "doi": meta["doi"],
        "title": meta["title"],
        "year": meta["year"],
        "source_id": meta["source_id"],
        "source_title": meta["source_title"],
    }
    for meta in coci_cits_meta.values()
]

write_list(step_a_data, STEP_A, header=True)
# Verify and add "retracted" field to each citing document using/querying RetractionWatch database (http://retractiondatabase.org/) as source

#### Prepare data for next steps

In [161]:
# Prepare pandas dataframe with the fields needed:
# one row per citing DOI (the dictionary keys become the index after the transpose)
coci_cits_df = (
    pd.DataFrame.from_dict(coci_cits_meta)
    .T
    .loc[:, ["doi", "title", "year", "source_id", "source_title"]]
)

## B) Classifying into subjects and areas of study
#### Input: All documents ISSN/ISBN values on the dataset
#### Output: Adds these attributes: (B.1) source title, (B.2) subject; (B.3) area

In [162]:
# ISSNs: citations having an issn value in the source id
# na=False: a row without a source_id yields a missing value in the mask, which
# boolean indexing rejects; such rows are treated as "no match" instead.
coci_cits_df_issn = coci_cits_df[coci_cits_df["source_id"].str.contains('^issn', na=False)]
# Keep one row per source (the first occurrence)
coci_cits_df_issn = coci_cits_df_issn[["source_id","source_title"]].drop_duplicates(subset ="source_id", keep = 'first')
step_b_1_data = df_to_dict_list(coci_cits_df_issn,{"scimago_subject":"TODO","scimago_area":"TODO"},["source_id","source_title"])
write_list(step_b_1_data, STEP_B_1, header= True)
# Verify and add the "scimago_subject", and the "scimago_area" fields for each ISSN value using/querying the SCIMAGO service (https://www.scimagojr.com/)

# ISBNs: citations having an isbn value in the source id
coci_cits_df_isbn = coci_cits_df[coci_cits_df["source_id"].str.contains('^isbn', na=False)]
# Keep one row per source (the first occurrence)
coci_cits_df_isbn = coci_cits_df_isbn[["source_id","source_title"]].drop_duplicates(subset ="source_id", keep = 'first')
step_b_2_data = df_to_dict_list(coci_cits_df_isbn,{"scimago_subject":"TODO","scimago_area":"TODO"},["source_id","source_title"])
write_list(step_b_2_data, STEP_B_2, header= True)
# Verify and add the LCC code of each ISBN value using/querying the ISBNDB service (https://isbndb.com/)
# Consider only the first alphabetic characters of the LCC code and map them to a SCIMAGO subject and area 
# If some LCC Alphabetic characters don't have a corresponding SCIMAGO match:
#   -> Take in consideration the entire LCC code and find a suitable SCIMAGO subject and area

In [163]:
# *** RUN ONLY ONCE THE PREVIOUS STEP IS DONE

# Combine all the results in one csv
# write_list leaves a zero-byte file when a source list is empty, and
# pd.read_csv cannot parse such a file, so only the non-empty sheets are read.
# Checking the files (not the in-memory lists) also lets this cell run after a
# kernel restart, once the sheets have been completed.
source_frames = [
    pd.read_csv(sheet_path)
    for sheet_path in (STEP_B_1, STEP_B_2)
    if os.path.getsize(sheet_path) > 0
]

if len(source_frames) > 0:
    coci_cits_df_sources = pd.concat(source_frames, sort=False)
else:
    # Neither ISSN nor ISBN sources: keep an empty frame with the expected columns
    coci_cits_df_sources = pd.DataFrame(columns=["source_id","source_title","scimago_subject","scimago_area"])

step_b_data = df_to_dict_list(coci_cits_df_sources,{},["source_id","source_title","scimago_subject","scimago_area"])
write_list(step_b_data, STEP_B, header= True)

## C) Extracting textual values

#### Input: All documents DOI values on the dataset
#### Output: Adds these attributes: (C.1) abstract; (C.2) in-text reference section; (C.3) in-text reference context

In [164]:
# One row per citing DOI, with placeholder values for the textual fields
step_c_data = df_to_dict_list(
    coci_cits_df,
    extra_keys={"abstract": "TODO", "section": "TODO", "context": "TODO"},
    fields=["doi"],
)
write_list(step_c_data, STEP_C, header=True)
# Verify and add the "abstract", "in-text reference section", and "in-text reference context"

## D) Annotating citing entity characteristics

#### Input: All the in-text context values
#### Output: Adds these attributes: (D.1) whether it mentions the retraction

In [165]:
# Start from the step C file (doi + context) and add a placeholder "mention_retraction" column
step_d_data = df_to_dict_list(
    pd.read_csv(STEP_C),
    extra_keys={"mention_retraction": "TODO"},
    fields=["doi", "context"],
)
write_list(step_d_data, STEP_D, header=True)

## E) Annotating in-text references characteristics

#### Input: All the in-text context values
#### Output: Adds these attributes: (E.1) citation intent, (E.2) sentiment

In [166]:
# Start from the step C file (doi + context) and add placeholder "intent" and "sentiment" columns
step_e_data = df_to_dict_list(
    pd.read_csv(STEP_C),
    extra_keys={"intent": "TODO", "sentiment": "TODO"},
    fields=["doi", "context"],
)
write_list(step_e_data, STEP_E, header=True)

## MERGE ALL RESULTS

#### Input: All the results obtained 
#### Output: One dataset which includes all the results

In [None]:
# Join the step files: C, D, E and A on the DOI, then B on the source id.
# Each merge uses pandas' default join type.
merge_data = (
    pd.read_csv(STEP_C)
    .merge(pd.read_csv(STEP_D)[["doi", "mention_retraction"]], on="doi")
    .merge(pd.read_csv(STEP_E)[["doi", "intent", "sentiment"]], on="doi")
    .merge(pd.read_csv(STEP_A), on="doi")
    .merge(
        pd.read_csv(STEP_B)[["source_id", "scimago_subject", "scimago_area"]],
        on="source_id",
    )
)

In [26]:
# Write the merged dataset, keeping only the listed columns in this order
write_list(
    df_to_dict_list(
        merge_data,
        extra_keys={},
        fields=[
            "doi", "year", "title", "abstract", "context", "section",
            "mention_retraction", "intent", "sentiment",
            "source_id", "source_title", "scimago_subject", "scimago_area",
        ],
    ),
    CITS,
    header=True,
)