In [69]:
import csv
import re
import requests
import os
from datetime import datetime
import pandas as pd 
import numpy as np
from collections import defaultdict
import pprint

In [70]:
## Useful functions

def write_list(l, file_path, header=True):
    """Write a list of dicts to ``file_path`` as comma-separated text.

    Every value is converted with ``str`` and wrapped in double quotes;
    double quotes inside a value are doubled so the row stays parseable.
    Header cells are written unquoted.

    Args:
        l: list of dicts, one per output row. Values are emitted in each
            dict's own key order.
        file_path: destination path; an existing file is overwritten.
        header: when true, first write a header line built from the keys
            of ``l[0]``.

    An empty ``l`` produces an empty file.
    """
    # The context manager guarantees the handle is flushed and closed,
    # even if a row fails to serialise.
    with open(file_path, "w+") as f:
        if not l:
            return

        if header:
            f.write(",".join(str(k_header) for k_header in l[0].keys()) + "\n")

        # Start from the first record: the header comes from its keys,
        # but its values are still data and must be written too.
        for record in l:
            cells = [
                '"' + str(record[k_att]).replace('"', '""') + '"'
                for k_att in record
            ]
            f.write(",".join(cells) + "\n")
        

def coci_call(operation, list_dois, fields, timeout=None):
    """Query the OpenCitations COCI REST API once per DOI.

    Args:
        operation: API operation name, appended to the base URL.
        list_dois: iterable of DOIs; each one is appended to the URL.
        fields: ``"*"`` to keep the whole first record of each response,
            or an iterable of field names to keep (fields missing from
            the record are stored as ``None``).
        timeout: optional timeout forwarded to ``requests.get``. The
            default ``None`` keeps the previous behaviour of waiting
            indefinitely.

    Returns:
        dict mapping each DOI that produced a non-empty response to the
        selected data from the first record of that response. DOIs with
        an empty response are left out.
    """
    items_dict = {}
    for doi in list_dois:
        url = 'https://opencitations.net/index/coci/api/v1/' + str(operation) + "/" + str(doi)
        r = requests.get(url, timeout=timeout)

        # Decode the body a single time instead of on every access.
        payload = r.json()
        if len(payload) > 0:
            first_record = payload[0]
            if fields == "*":
                items_dict[doi] = first_record
            else:
                items_dict[doi] = {
                    field: (first_record[field] if field in first_record else None)
                    for field in fields
                }
    return items_dict

In [71]:
# Normalization

def norm_isbn_code(x):
    """Return the capital letters that start a line and precede a digit.

    Only the first such occurrence in ``x`` is considered (each line of a
    multi-line string can start a match). Returns ``None`` when there is
    no occurrence.
    """
    found = re.search(r"^([A-Z]{1,})\d", x, re.MULTILINE)
    if found is None:
        return None
    return found.group(1)
        
def norm_pdftext(t):
    """Re-join words split by a hyphen followed by one whitespace character.

    Every ``<word char>-<whitespace><word char>`` sequence is collapsed to
    the two word characters, e.g. ``"exam- ple"`` becomes ``"example"``.
    """
    return re.sub(
        r"(\w{1})\-\s(\w{1})",
        lambda hit: hit.group(1) + hit.group(2),
        t,
    )

def norm_data(x):
    """Extract the first run of four digits from ``x``.

    Returns the four digits as a string, or the literal ``"none"`` when
    ``x`` contains no such run.
    """
    four_digits = re.search(r"(\d{4})", x.strip(), re.MULTILINE)
    return four_digits.group() if four_digits else "none"

def norm_source(x):
    """Map a raw source label onto ``"doi"``, ``"other"`` or ``"none"``.

    The label is trimmed and lower-cased first; anything other than
    ``"doi.org"`` or ``"other"`` maps to ``"none"``.
    """
    known_sources = {"doi.org": "doi", "other": "other"}
    return known_sources.get(x.strip().lower(), "none")
    
def norm_title(x):
    """Trim and lower-case a title, then pass it through ``norm_pdftext``."""
    return norm_pdftext(x.strip().lower())

def norm_abstract(x):
    """Trim and lower-case an abstract, then pass it through ``norm_pdftext``."""
    return norm_pdftext(x.strip().lower())

def norm_section(x, intext_cits=None):
    """Parse a section annotation string into a list of part lists.

    Annotations are separated by ``";;"`` and the parts of one annotation
    by ``";"``. Each part is trimmed and lower-cased; empty annotations
    are dropped.

    Args:
        x: raw annotation string.
        intext_cits: number of placeholder entries to return when any
            part equals ``"none"``. When omitted, a single placeholder
            is returned.

    Returns:
        A list with one list of parts per annotation, or
        ``[["none"], ...]`` when any part is ``"none"``.
    """
    sections = [
        [part.strip().lower() for part in item.split(";")]
        for item in x.strip().split(";;")
        if item
    ]

    if any(part == "none" for item in sections for part in item):
        # range(0, None) would raise TypeError, so fall back to a single
        # placeholder when the caller did not give a count.
        placeholders = 1 if intext_cits is None else intext_cits
        return [["none"] for _ in range(placeholders)]
    return sections

def norm_cits_text(x):
    """Split a ``";;"``-separated string of citation texts.

    Each piece is trimmed, lower-cased and passed through
    ``norm_pdftext``; pieces that end up empty are discarded.
    """
    normalised = []
    for fragment in x.strip().split(";;"):
        cleaned = norm_pdftext(fragment.strip().lower())
        if cleaned:
            normalised.append(cleaned)
    return normalised

def norm_cit_intent(x):
    """Split a ``";;"``-separated string of citation intents.

    Returns the trimmed, lower-cased pieces, without the empty ones.
    """
    pieces = (chunk.strip().lower() for chunk in x.strip().split(";;"))
    return [piece for piece in pieces if piece]

def norm_sentiment(x):
    """Split a ``";;"``-separated string of sentiment labels.

    Returns the trimmed, lower-cased labels, without the empty ones.
    """
    labels = []
    for raw_label in x.strip().split(";;"):
        label = raw_label.strip().lower()
        if label:
            labels.append(label)
    return labels

def norm_retraction_men(x):
    """Trim and lower-case ``x``, then remove every ``";;"`` separator."""
    return x.strip().lower().replace(";;", "")

def norm_note(x):
    """Split a ``";;"``-separated string of notes.

    Returns the trimmed, lower-cased notes, without the empty ones.
    """
    trimmed = map(str.strip, x.strip().split(";;"))
    return [entry.lower() for entry in trimmed if entry]

## Normalize sources

def norm_subject(x):
    """Split a ``";;"``-separated string of subjects.

    Returns the trimmed, lower-cased subjects, without the empty ones.
    """
    subjects = (token.strip().lower() for token in x.strip().split(";;"))
    return [subject for subject in subjects if subject != ""]

def norm_area(x, intext_cits = None):
    """Parse an area string into a list of part lists.

    Groups are separated by ``";;"`` and the parts of a group by ``";"``.
    Empty groups are dropped; every part is trimmed and lower-cased.
    ``intext_cits`` is accepted but not used.
    """
    return [
        [part.strip().lower() for part in group.split(";")]
        for group in x.strip().split(";;")
        if group
    ]

def norm_source_id(x):
    """Parse a ``";"``-separated list of ``key:value`` identifiers.

    Each entry is trimmed, lower-cased and split on ``":"`` into a tuple.
    Tuples whose first element is the empty string are dropped.
    """
    identifiers = []
    for entry in x.strip().split(";"):
        pieces = tuple(entry.strip().lower().split(":"))
        if pieces[0] != "":
            identifiers.append(pieces)
    return identifiers

def norm_dois(x):
    """Split a ``"[[;;]]"``-separated string of DOIs.

    Returns the trimmed, lower-cased DOIs, without the empty ones.
    """
    dois = []
    for candidate in x.strip().split("[[;;]]"):
        doi = candidate.strip().lower()
        if doi:
            dois.append(doi)
    return dois


# 1) Convert the Wakefield retraction in-text references dataset ("coci_intext_ref.csv") into the DASPLAB algorithm input dataset format

| DASPLAB algorithm input dataset format| Meaning | Corresponding field from the in-text ref dataset |
|--------------|-------------------------------------------------------------|---|
| citfunc      | citation function                                           | cito_fun  |
| sectitle     | the exact title of the section                              | section (part of it)  |
| refentry     | the reference entry used in the citing article              | NONE  |
| art          | the article identifier (e.g. DOI)                           | doi  |
| sectype      | the section type (e.g. results, related work, introduction) | section (part of it)  |
| sectype2     | _                                                           | NONE  |
| refid        | reference URL                                               | NONE  |
| sec          | section URL                                                 | NONE  |
| ctx          | in-text ref URL                                             | NONE  |
| pointerlist  | _                                                           | NONE  |
| nbcontexts   | _                                                           | NONE  |
| nbsections   | _                                                           | NONE  |
| itrp         | URL of SOMETHING                                            | NONE  |
| anchorsent   | the in-text reference anchor sentence                       | intext_ref  |
| partext      | _                                                           | NONE  |
| potential_cf | a possible Object-property of Cito                          | cito_fun  |
| annotator    | _                                                           | NONE  |
| dataset      | dataset name                                                | NONE  |

In [72]:
# Data structures
#---
# Normalised rows that have at least one in-text citation, keyed by DOI.
valid_docs = dict()
# Rows without any in-text citation text: DOI -> raw value of column 10.
# Defined here so the cell runs on a fresh kernel.
err_docs = dict()

# newline="" lets the csv module handle line breaks inside quoted fields.
with open("../wakefield_retraction/data/coci_intext_ref.csv", newline="") as a_file:
    csv_reader = csv.reader(a_file, delimiter=',')
    # Column layout of the file:
    # 0.Date,
    # 1.DOI
    # 2.Source
    # 3.Title
    # 4.Abstract
    # 5.Section
    # 6.Citations to retracted article
    # 7.Citing reasons,
    # 8.Sentiment (negative/neutral/positive)
    # 9.Mentions the article retraction,
    # 10.Is a retracted article
    # 11.Notes

    # skip the header
    next(csv_reader, None)

    # iterate all the csv rows
    for row in csv_reader:

        # Classify documents into Valid and Error
        #---
        cits_text = norm_cits_text(row[6])
        if len(cits_text) == 0:
            err_docs[row[1]] = row[10]
        else:
            doi = row[1]
            valid_docs[doi] = dict()
            valid_docs[doi]["year"] = norm_data(row[0])
            valid_docs[doi]["source"] = norm_source(row[2])
            valid_docs[doi]["title"] = norm_title(row[3])
            valid_docs[doi]["abstract"] = norm_abstract(row[4])
            valid_docs[doi]["cits_text"] = cits_text
            # The citation count sizes the placeholder list used when the
            # section is annotated as "none".
            valid_docs[doi]["section"] = norm_section(row[5], len(cits_text))
            valid_docs[doi]["cit_intent"] = norm_cit_intent(row[7])
            valid_docs[doi]["sentiment"] = norm_sentiment(row[8])
            valid_docs[doi]["retraction_mention"] = norm_retraction_men(row[9])
            valid_docs[doi]["retracted"] = norm_retraction_men(row[10])
            # Notes live in column 11 (column 10 is the "retracted" flag).
            valid_docs[doi]["note"] = norm_note(row[11])

In [73]:
# One row per DOI, one column per normalised field.
df = pd.DataFrame.from_dict(valid_docs).T

# Bundle the per-citation lists into a single tuple column:
# (cits_text, cit_intent, sentiment, section).
intext_columns = ["cits_text", "cit_intent", "sentiment", "section"]
df["intext_cit"] = list(zip(*(df[column] for column in intext_columns)))

In [84]:
def _section_parts(section_annotations):
    """Derive the ``sectitle``/``sectype`` pair from section annotations.

    ``section_annotations`` is a list of part lists. A part containing
    one of the known keywords sets ``sectype``; a part starting with a
    left curly quote sets ``sectitle`` to the part without its first and
    last character. Later parts override earlier ones.
    """
    sec_parts = {"sectitle": "", "sectype": ""}
    for s_ann in section_annotations:
        for s_p in s_ann:
            if "introduction" in s_p:
                sec_parts["sectype"] = "introduction"
            elif "background" in s_p:
                sec_parts["sectype"] = "related work"
            elif "method" in s_p:
                sec_parts["sectype"] = "methods"
            elif "result" in s_p:
                sec_parts["sectype"] = "results"
            elif "conclusion" in s_p:
                sec_parts["sectype"] = "conclusions"
            if s_p != "" and s_p[0] == '“':
                sec_parts["sectitle"] = s_p[1:-1]
    return sec_parts


dasplab_alg_input = []
for doi, item in df.iterrows():
    cits_text, cit_intent, _sentiment, sections = item["intext_cit"]

    # When there is exactly one section annotation per in-text citation,
    # pair them by position. Otherwise the annotations cannot be aligned,
    # so every citation of the document is described by all of them.
    sections_aligned = len(sections) == len(cits_text)

    for i, anchor_sentence in enumerate(cits_text):

        # define section parts
        sec_parts = _section_parts([sections[i]] if sections_aligned else sections)

        rep_item = {
            "citfunc" : cit_intent[i],
            "sectitle" : sec_parts["sectitle"],
            "refentry" : "",
            "art" : doi,
            "sectype" : sec_parts["sectype"],
            "sectype2" : "",
            "refid" : "",
            "sec" : "",
            "ctx" : "",
            "pointerlist" : "",
            "nbcontexts" : "",
            "nbsections" : "",
            "itrp" : "",
            "anchorsent" : anchor_sentence,
            "partext" : "",
            # Per the field-mapping table in this notebook, the citation
            # function feeds potential_cf; annotator has no source field.
            "potential_cf" : cit_intent[i],
            "annotator" : "",
            "dataset" : "coci_wakefield_ret_intext_ref.csv"
        }
        dasplab_alg_input.append(rep_item)

In [85]:
# Tabular view of the converted records.
df_new_format = pd.DataFrame(dasplab_alg_input)

# Persist the converted records as CSV, header line included.
write_list(
    dasplab_alg_input,
    "wakefield_intext_ref.csv",
    header=True,
)