In [210]:
import csv
import re
import requests
import os
from datetime import datetime
import pandas as pd 
import numpy as np
from collections import defaultdict
import pprint

In [211]:
## Useful functions

def write_list(l,file_path, header= True):
    """Write a list of dicts to ``file_path`` as comma-separated text.

    Parameters
    ----------
    l : list of dict
        Rows to write. Every value is converted with ``str`` and wrapped in
        double quotes; fields are emitted in each dict's own key order.
    file_path : str
        Destination file. It is created, or truncated if it already exists.
    header : bool, default True
        When true, the keys of ``l[0]`` are written (unquoted) as the first
        line and writing of data rows starts from ``l[1]``.
        NOTE(review): this means the values of ``l[0]`` are never written
        when ``header`` is true. That behaviour is preserved from the
        original implementation -- confirm it is intended.

    Notes
    -----
    An empty ``l`` produces an empty file instead of raising ``IndexError``.
    Double quotes inside a value are doubled so that the quoted field stays
    a single valid CSV value.
    """
    def _quote(value):
        # Escape embedded quotes by doubling them, then wrap the field.
        return '"' + str(value).replace('"', '""') + '"'

    # The context manager guarantees the handle is flushed and closed,
    # even if a row fails to serialise half-way through.
    with open(file_path, "w+") as f:
        initial_pos = 0

        #header
        if header and len(l) > 0:
            initial_pos = 1
            f.write(",".join(str(k_header) for k_header in l[0].keys()) + "\n")

        #content
        for row in l[initial_pos:]:
            f.write(",".join(_quote(row[k_att]) for k_att in row) + "\n")
        

def coci_call(operation, list_dois, fields, timeout=60):
    """Query the OpenCitations COCI REST API once per DOI.

    Parameters
    ----------
    operation : str
        API operation name, appended to the base URL before the DOI.
    list_dois : iterable
        DOIs to look up; each one triggers one GET request.
    fields : "*" or iterable of str
        ``"*"`` keeps the whole first record of the response. Otherwise only
        the listed keys are kept, with ``None`` for keys the record lacks.
    timeout : float, default 60
        Seconds passed to ``requests.get`` so that a stalled connection
        raises instead of blocking forever.

    Returns
    -------
    dict
        Maps each DOI whose response was non-empty to the selected data from
        the first element of the JSON response. DOIs with an empty response
        are omitted.
    """
    items_dict = {}
    base_url = 'https://opencitations.net/index/coci/api/v1/'+str(operation)+"/"
    for doi in list_dois:
        r = requests.get(base_url+str(doi), timeout=timeout)
        # Decode the body once; every later access reuses the parsed payload.
        payload = r.json()
        if len(payload) == 0:
            continue
        first_record = payload[0]
        if fields == "*":
            items_dict[doi] = first_record
        else:
            # .get() yields None for a missing key
            items_dict[doi] = {f: first_record.get(f) for f in fields}
    return items_dict
        

## Paths

In [212]:
# Relative paths (resolved against the notebook's working directory) of the
# CSV inputs. The three period files are named after the year range in their
# file name; only csv_2011_2017 is read in the cells visible here.
csv_1998_2004 = "csv/1998_2004.csv"
csv_2005_2010 = "csv/2005_2010.csv"
csv_2011_2017 = "csv/2011_2017.csv"
# CSV read with csv.DictReader in the analysis cell (uses the "citing",
# "creation", "source_title" and "source_id" columns).
coci_cits = "csv/coci_cits.csv"

## Normalizations

In [213]:
def norm_pdftext(t):
    """Re-join words broken by a hyphen plus one whitespace character.

    Wherever a word character is followed by "-", exactly one whitespace
    character and another word character, the hyphen and the whitespace are
    dropped. Everything else in ``t`` is returned unchanged.
    """
    hyphen_break = re.compile(r"(\w{1})\-\s(\w{1})")
    return hyphen_break.sub(r"\1\2", t)

def norm_data(x):
    """Return the first run of four consecutive digits found in ``x``.

    Leading and trailing whitespace is stripped before searching. When no
    four-digit run exists the string "none" is returned.
    """
    first_hit = re.search(r"(\d{4})", x.strip(), re.MULTILINE)
    if first_hit is None:
        return "none"
    return first_hit.group()

def norm_source(x):
    """Map a raw source cell to one of the labels "doi", "other" or "none".

    The value is stripped and lower-cased first. "doi.org" becomes "doi",
    "other" stays "other", and any other value becomes "none".
    """
    known_sources = {"doi.org": "doi", "other": "other"}
    return known_sources.get(x.strip().lower(), "none")
    
def norm_title(x):
    """Normalize a title: strip it, lower-case it, then pass it through norm_pdftext."""
    return norm_pdftext(x.strip().lower())

def norm_abstract(x):
    """Normalize an abstract: strip it, lower-case it, then pass it through norm_pdftext."""
    lowered = x.strip().lower()
    return norm_pdftext(lowered)

def norm_section(x, intext_cits = None):
    """Parse a raw "section" cell into one list of section labels per entry.

    Entries are separated by ";;" and the labels inside an entry by ";".
    Each label is stripped and lower-cased. Empty entries (for example the
    one produced by a trailing ";;") are dropped.

    Parameters
    ----------
    x : str
        Raw cell content.
    intext_cits : int, optional
        Number of placeholders to return when any label equals "none".
        When omitted, the number of parsed entries is used. Previously the
        ``None`` default reached ``range()`` and raised ``TypeError``.

    Returns
    -------
    list
        A list of lists of labels, or, if any label is "none", a flat list
        of ``intext_cits`` "none" strings.
    """
    entries = [item for item in x.strip().split(";;") if item]
    sections = [
        [part.strip().lower() for part in entry.split(";")]
        for entry in entries
    ]
    # One "none" label anywhere marks the whole cell as having no section data.
    if any(part == "none" for parts in sections for part in parts):
        placeholder_count = len(sections) if intext_cits is None else intext_cits
        return ["none" for _ in range(0, placeholder_count)]
    return sections

def norm_cits_text(x):
    """Split a raw citation-text cell on ";;" into normalized snippets.

    Each piece is stripped, lower-cased and passed through norm_pdftext.
    Pieces that end up empty are discarded.
    """
    snippets = []
    for chunk in x.strip().split(";;"):
        cleaned = norm_pdftext(chunk.strip().lower())
        if cleaned:
            snippets.append(cleaned)
    return snippets

def norm_cit_intent(x):
    """Split a raw citation-intent cell on ";;" into stripped, lower-cased labels.

    Labels that are empty after stripping are discarded.
    """
    intents = []
    for raw_label in x.strip().split(";;"):
        label = raw_label.strip().lower()
        if label:
            intents.append(label)
    return intents

def norm_sentiment(x):
    """Split a raw sentiment cell on ";;" into stripped, lower-cased values.

    Values that are empty after stripping are discarded.
    """
    pieces = x.strip().split(";;")
    lowered = (piece.strip().lower() for piece in pieces)
    return list(filter(None, lowered))

def norm_retraction_men(x):
    """Normalize the "mentions the article retraction" cell to a bare label.

    Every ";;" separator is removed first, then the result is stripped and
    lower-cased. Removing the separators before stripping matters: with the
    previous order a cell such as "yes ;;" became "yes " (trailing space),
    which would not match a lookup on the plain label "yes".
    """
    return x.replace(";;", "").strip().lower()

def norm_note(x):
    """Split a raw notes cell on ";;" into stripped, lower-cased notes.

    Notes that are empty after stripping are discarded.
    """
    candidates = (fragment.strip().lower() for fragment in x.strip().split(";;"))
    return [note_text for note_text in candidates if note_text]
        

## General stats

### 1) In-text citations

In [214]:
# Data structures
#---
# DOI -> dict of normalized fields, for rows with at least one citation text
valid_docs = dict()
# DOI -> raw notes cell, for rows whose citation-text cell normalizes to nothing
err_docs = dict()
#--- Testing
# Each maps a normalized value to the DOIs that carry it, so unexpected
# values can be spotted by inspecting the keys.
test_dates = defaultdict(list)
test_titles = defaultdict(list)
test_sources = defaultdict(list)
test_ret_men = defaultdict(list)

# newline="" is what the csv module documentation asks for: it lets the
# reader handle line endings itself, including newlines inside quoted fields.
with open(csv_2011_2017, newline="") as a_file:
    csv_reader = csv.reader(a_file, delimiter=',')
    # Column layout of each row:
    # 0.Date,
    # 1.DOI
    # 2.Source
    # 3.Title
    # 4.Abstract
    # 5.Section
    # 6.Citations to retracted article
    # 7.Citing reasons,
    # 8.Sentiment (negative/neutral/positive)
    # 9.Mentions the article retraction,
    # 10.Notes

    #skip the header
    next(csv_reader, None)

    #iterate all the csv rows
    for row in csv_reader:

        #Classify documents into Valid and Error
        #---
        cits_text = norm_cits_text(row[6])
        if len(cits_text) == 0:
            err_docs[row[1]] = row[10]
            continue

        doi = row[1]
        # Normalize every field once; the same values feed both the stored
        # record and the sanity-check indexes below.
        doc = {
            "year": norm_data(row[0]),
            "source": norm_source(row[2]),
            "title": norm_title(row[3]),
            "abstract": norm_abstract(row[4]),
            "cits_text": cits_text,
            "section": norm_section(row[5], len(cits_text)),
            "cit_intent": norm_cit_intent(row[7]),
            "sentiment": norm_sentiment(row[8]),
            "retraction_mention": norm_retraction_men(row[9]),
            "note": norm_note(row[10]),
        }
        # A repeated DOI replaces the record stored for the earlier row.
        valid_docs[doi] = doc

        #Testing the csv values
        #---
        test_dates[doc["year"]].append(doi)
        test_sources[doc["source"]].append(doi)
        test_titles[doc["title"]].append(doi)
        test_ret_men[doc["retraction_mention"]].append(doi)
        # The four per-citation lists must have one entry per in-text
        # citation; print the DOI of any row where they disagree.
        is_valid = (len(doc["cits_text"]) == len(doc["section"]) == len(doc["cit_intent"]) == len(doc["sentiment"]))
        if not is_valid:
            print(doi)

## Analysis with pandas 

In [215]:
# One row per DOI, one column per normalized field.
# from_dict puts the DOIs on the columns, so the frame is flipped with .T
df = pd.DataFrame.from_dict(valid_docs).T
# Bundle the four per-citation lists of each document into one tuple:
# (cits_text, cit_intent, sentiment, section)
df["intext_cit"] = list(zip(
    df["cits_text"],
    df["cit_intent"],
    df["sentiment"],
    df["section"],
))
# Document-level view: scalar fields plus the bundled in-text citation data
sub_df = df[[
    'year',
    'source',
    'title',
    'abstract',
    'retraction_mention',
    'note',
    'intext_cit',
]]

In [269]:
# Analyse the DataFrame
# In-text citations per document, measured as the length of each "section" list
df_cts_x_doc = df["section"].apply(len)

MEAN_CITxDOC = df_cts_x_doc.mean()
TOT_INTEXT_CIT = df_cts_x_doc.sum()
TOT_DOC = df_cts_x_doc.count()
# Number of documents per normalized source label
DOI_DOCs = df[df["source"] == "doi"]["source"].count()
OTHER_DOCs = df[df["source"] == "other"]["source"].count()

#COCI CITS
# Keyed by the "citing" column; a later row with the same value replaces the earlier one
COCI_CITS_DICT = {}
# newline='' is what the csv module documentation asks for, so the reader
# handles line endings (including those inside quoted fields) itself.
with open(coci_cits, mode='r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        COCI_CITS_DICT[row["citing"]] = row
COCI_CITS_DF = pd.DataFrame.from_dict(COCI_CITS_DICT).transpose()
# Keep the first four characters of "creation"
# (presumably the year of a date string -- TODO confirm the column format)
COCI_CITS_DF['creation'] = COCI_CITS_DF['creation'].apply(lambda x: x[0:4])
CITS_SOURCES =  COCI_CITS_DF[['creation','source_title','source_id']]
# creation value -> set of (source_title, source_id) pairs
SOURCES_BY_YEAR = defaultdict(set)
for row in CITS_SOURCES.itertuples():
    # row[0] is the index label; positions 1-3 follow the column order selected above
    SOURCES_BY_YEAR[row[1]].add((row[2],row[3]))

# retraction_mention value -> number of documents
RET_MEN = defaultdict(int)
for item in df["retraction_mention"]:
    RET_MEN[item] += 1

SENTIMENT_COUNT = defaultdict(int)
CIT_INTENT_COUNT = defaultdict(int)
# (intent, sentiment, retraction_mention) -> number of in-text citations
PATTERN = defaultdict(int)
for doi, item in df.iterrows():
    # intext_cit is the tuple (cits_text, cit_intent, sentiment, section)
    cits_text_list, cit_intent_list, sentiment_list = item["intext_cit"][:3]
    # The citation texts drive the loop. Indexing is kept deliberately, so a
    # shorter intent or sentiment list raises IndexError rather than being
    # silently truncated.
    for cit_index in range(0, len(cits_text_list)):
        cit_intent_val = cit_intent_list[cit_index]
        cit_sentiment_val = sentiment_list[cit_index]

        CIT_INTENT_COUNT[cit_intent_val] += 1
        SENTIMENT_COUNT[cit_sentiment_val] += 1
        PATTERN[(cit_intent_val,cit_sentiment_val,item["retraction_mention"])] += 1

In [273]:
# Flatten SOURCES_BY_YEAR into one record per (year, source) pair and write
# the records to coci_sources.csv without a header line.
all_sources = [
    {"year": year, "source_title": source[0], "source_id": source[1]}
    for year, year_sources in SOURCES_BY_YEAR.items()
    for source in year_sources
]
write_list(all_sources, "coci_sources.csv", header= False)

In [110]:
# Plain-text summary of the statistics computed in the analysis cell.
print("+ Total number of documents: ",TOT_DOC)
print("\n+ Source: ")
print(f"      from the editor page: {DOI_DOCs}")
print(f"      from other sources: {OTHER_DOCs}")
print("\n+ Total number of in-text reference pointers: ",TOT_INTEXT_CIT)
print("\n+ Average number of in-text reference pointers per document: ",MEAN_CITxDOC)
print("\n+ Documents which mention the retraction: ", RET_MEN['yes'])
print("\n+ In-text reference pointers: ")
print("\n      sentiment count:")

# Each counter is rebound to a plain dict ordered by descending count
SENTIMENT_COUNT = dict(sorted(SENTIMENT_COUNT.items(), key=lambda pair: pair[1], reverse=True))
for label, count in SENTIMENT_COUNT.items():
    print("      ",label," : ",count )
print("      -----")
print("\n      intent count:")
CIT_INTENT_COUNT = dict(sorted(CIT_INTENT_COUNT.items(), key=lambda pair: pair[1], reverse=True))
for label, count in CIT_INTENT_COUNT.items():
    print("      ",label," : ",count )

print("      -----")
print("\n      common patters (intent, sentiment, mentions retraction):")
PATTERN = dict(sorted(PATTERN.items(), key=lambda pair: pair[1], reverse=True))
for label, count in PATTERN.items():
    print("      ",label," : ",count )



+ Total number of documents:  337

+ Source: 
      from the editor page: 225
      from other sources: 112

+ Total number of in-text reference pointers:  442

+ Average number of in-text reference pointers per document:  1.311572700296736

+ Documents which mention the retraction:  128

+ In-text reference pointers: 

      sentiment count:
       neutral  :  231
       negative  :  209
       positive  :  2
      -----

      intent count:
       discusses  :  122
       cites for information  :  45
       cites as evidence  :  45
       qualifies  :  44
       credits  :  43
       disputes  :  41
       critiques  :  39
       obtains background from  :  34
       describes  :  23
       includes excerpt from  :  4
       uses data from  :  1
       refutes  :  1
      -----

      common patters (intent, sentiment, mentions retraction):
       ('discusses', 'negative', 'yes')  :  38
       ('discusses', 'neutral', 'yes')  :  36
       ('discusses', 'negative', 'no')  :  26
      