In [76]:
import csv
import re
import requests
import os
from datetime import datetime
import pandas as pd 
import numpy as np
from collections import defaultdict
import pprint

In [77]:
## Useful functions

def write_list(l,file_path, header= True):
    """Write a list of dicts to ``file_path`` as comma-separated text.

    Parameters
    ----------
    l : list of dict
        Rows to write. The values of each dict are written in the dict's
        iteration order, each wrapped in double quotes.
    file_path : str
        Destination file; it is created or truncated.
    header : bool
        When true, the keys of ``l[0]`` are written (unquoted) as the first
        line.

    Notes
    -----
    * The file is opened in a ``with`` block so it is always flushed and
      closed, even when a row raises while being formatted.
    * An empty ``l`` produces an empty file instead of raising IndexError.
    * Double quotes inside values are not escaped.
    * NOTE(review): with ``header=True`` the values of ``l[0]`` are not
      written, only its keys are used. This is the historical behaviour and
      is kept unchanged; confirm that it is intended.
    """
    # With a header the first element only provides the column names.
    initial_pos = 1 if header else 0

    with open(file_path, "w") as f:
        #header
        if header and len(l) > 0:
            f.write(",".join(str(k_header) for k_header in l[0].keys()) + "\n")

        #content
        for row in l[initial_pos:]:
            f.write(",".join('"' + str(row[k_att]) + '"' for k_att in row) + "\n")
        

def coci_call(operation, list_dois, fields, timeout=None):
    """Query the OpenCitations COCI REST API once per DOI.

    Parameters
    ----------
    operation : str
        Name of the API operation, appended to the base URL.
    list_dois : iterable
        DOIs to look up; each one is appended to the URL after the operation.
    fields : "*" or iterable of str
        ``"*"`` keeps the whole first item of the response. Otherwise only
        the listed keys are kept, with ``None`` for keys missing from the
        response.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. The default ``None`` waits
        indefinitely, which is the previous behaviour.

    Returns
    -------
    dict
        Maps each DOI with a non-empty response to the selected data. DOIs
        whose response is empty are left out.
    """
    base_url = 'https://opencitations.net/index/coci/api/v1/'
    items_dict = {}
    for doi in list_dois:
        r = requests.get(base_url+str(operation)+"/"+str(doi), timeout=timeout)
        # Decode the body once; every call to r.json() parses it again.
        payload = r.json()
        if len(payload) == 0:
            continue

        first_item = payload[0]
        if fields == "*":
            items_dict[doi] = first_item
        else:
            items_dict[doi] = {
                f: (first_item[f] if f in first_item else None) for f in fields
            }
    return items_dict
        

## Paths

In [78]:
# Annotated citing documents, one file per period.
csv_1998_2004, csv_2005_2010, csv_2011_2017 = (
    "csv/1998_2004.csv",
    "csv/2005_2010.csv",
    "csv/2011_2017.csv",
)
# COCI citation data and the sources of the citing entities.
coci_cits, coci_sources = "csv/coci_cits.csv", "csv/coci_sources.csv"
# ISBN -> LCC code table, and LCC code -> Scimago subject table.
coci_sources_isbn, lcc_scimago_map = (
    "csv/isbn/isbn_cat_lcc.csv",
    "csv/lcc_scimago_subject.csv",
)

## Normalizations

In [79]:
def norm_isbn_code(x):
    """Return the capital letters that open a line of ``x`` right before a digit.

    Only the letters are returned, e.g. ``"RC553.A88"`` gives ``"RC"``.
    Returns ``None`` when no line of ``x`` starts with capital letters
    followed by a digit.
    """
    first_hit = re.search(r"^([A-Z]{1,})\d", x, re.MULTILINE)
    if first_hit is None:
        return None
    return first_hit.group(1)
        
def norm_pdftext(t):
    """Join words split by a hyphen followed by one whitespace character.

    ``"exam- ple"`` becomes ``"example"``: the hyphen and the whitespace
    between two word characters are removed.
    """
    broken_word = re.compile(r"(\w{1})\-\s(\w{1})")
    return broken_word.sub(r"\1\2", t)

def norm_data(x):
    """Return the first run of four digits found in ``x``.

    Returns the string ``"none"`` when ``x`` contains no such run.
    """
    year_hit = re.search(r"(\d{4})", x.strip(), re.MULTILINE)
    return year_hit.group() if year_hit else "none"

def norm_source(x):
    """Map the raw source label to ``"doi"``, ``"other"`` or ``"none"``.

    The comparison ignores surrounding whitespace and letter case.
    """
    known_labels = {"doi.org": "doi", "other": "other"}
    return known_labels.get(x.strip().lower(), "none")
    
def norm_title(x):
    """Trim and lowercase a title, then clean it with ``norm_pdftext``."""
    return norm_pdftext(x.strip().lower())

def norm_abstract(x):
    """Trim and lowercase an abstract, then clean it with ``norm_pdftext``."""
    lowered = x.strip().lower()
    return norm_pdftext(lowered)

def norm_section(x, intext_cits = None):
    """Parse the section field into one list of section labels per citation.

    Entries are separated by ``;;`` and the labels inside an entry by ``;``.
    Labels are trimmed and lowercased. Empty entries are dropped.

    Parameters
    ----------
    x : str
        Raw section field.
    intext_cits : int, optional
        Number of in-text citations of the document. It is only used when a
        label equals ``"none"``.

    Returns
    -------
    list
        A list of lists of labels. If any label is ``"none"``, the result is
        instead a flat list with ``intext_cits`` copies of ``"none"``. When
        ``intext_cits`` is not given that list is empty; previously this
        case raised TypeError from ``range(0, None)``.
    """
    placeholders = 0 if intext_cits is None else intext_cits

    sections = []
    for entry in x.strip().split(";;"):
        if not entry:
            continue
        labels = [label.strip().lower() for label in entry.split(";")]
        if "none" in labels:
            return ["none"] * placeholders
        sections.append(labels)
    return sections

def norm_cits_text(x):
    """Split the citation contexts on ``;;`` and clean each one.

    Every piece is trimmed, lowercased and passed through ``norm_pdftext``.
    Pieces that end up empty are discarded.
    """
    cleaned = (norm_pdftext(piece.strip().lower()) for piece in x.strip().split(";;"))
    return [text for text in cleaned if text]

def norm_cit_intent(x):
    """Return the ``;;``-separated citation intents, trimmed and lowercased.

    Empty values are left out.
    """
    intents = (piece.strip().lower() for piece in x.strip().split(";;"))
    return [intent for intent in intents if intent]

def norm_sentiment(x):
    """Return the ``;;``-separated sentiment labels, trimmed and lowercased.

    Empty values are left out.
    """
    labels = []
    for piece in x.strip().split(";;"):
        label = piece.strip().lower()
        if label:
            labels.append(label)
    return labels

def norm_retraction_men(x):
    """Trim and lowercase the retraction-mention flag and drop every ``;;``."""
    return x.strip().lower().replace(";;", "")

def norm_note(x):
    """Return the ``;;``-separated notes, trimmed and lowercased.

    Empty values are left out.
    """
    tidy = map(lambda piece: piece.strip().lower(), x.strip().split(";;"))
    return list(filter(None, tidy))

## Normalize sources

def norm_subject(x):
    """Return the ``;;``-separated subjects, trimmed and lowercased.

    Empty values are left out.
    """
    subjects = []
    for piece in x.strip().split(";;"):
        subject = piece.strip().lower()
        if not subject:
            continue
        subjects.append(subject)
    return subjects

def norm_area(x, intext_cits = None):
    """Parse the area field into one list of areas per ``;;``-separated entry.

    The areas inside an entry are separated by ``;`` and are trimmed and
    lowercased. Empty entries are dropped. ``intext_cits`` is accepted but
    not used.
    """
    grouped_areas = []
    for entry in x.strip().split(";;"):
        if entry:
            grouped_areas.append([area.strip().lower() for area in entry.split(";")])
    return grouped_areas

def norm_source_id(x):
    """Parse ``scheme:value`` identifiers separated by ``;`` into tuples.

    Each identifier is trimmed, lowercased and split on ``:``. Identifiers
    whose first part is empty are discarded.
    """
    identifiers = []
    for chunk in x.strip().split(";"):
        parts = tuple(chunk.strip().lower().split(":"))
        if parts[0] != "":
            identifiers.append(parts)
    return identifiers

def norm_dois(x):
    """Return the DOIs separated by ``[[;;]]``, trimmed and lowercased.

    Empty values are left out.
    """
    dois = (piece.strip().lower() for piece in x.strip().split("[[;;]]"))
    return [doi for doi in dois if doi]


## General stats

### 1) In-text citations

In [80]:
# Data structures
#---
# valid_docs: DOI -> normalized fields of a document with at least one in-text citation
# err_docs:   DOI -> raw notes of a document without any in-text citation text
valid_docs = dict()
err_docs = dict()
#--- Testing
# Each test_* dict groups the DOIs by one normalized value, to inspect the csv content
test_dates = defaultdict(list)
test_titles = defaultdict(list)
test_sources = defaultdict(list)
test_ret_men = defaultdict(list)

# newline='' is the way the csv module expects files to be opened, so that
# line breaks inside quoted fields are handled by the csv reader itself
with open(csv_2011_2017, newline='') as a_file:
    csv_reader = csv.reader(a_file, delimiter=',')
    # Columns of the csv file:
    # 0.Date,
    # 1.DOI
    # 2.Source
    # 3.Title
    # 4.Abstract
    # 5.Section
    # 6.Citations to retracted article
    # 7.Citing reasons,
    # 8.Sentiment (negative/neutral/positive)
    # 9.Mentions the article retraction,
    # 10.Notes

    #skip the header
    next(csv_reader, None)

    #iterate all the csv rows
    for row in csv_reader:

        #Classify documents into Valid and Error
        #---
        cits_text = norm_cits_text(row[6])
        if len(cits_text) == 0:
            err_docs[row[1]] = row[10]
            continue

        doi = row[1]
        # Normalize every field once and reuse the values for the tests below
        doc = {
            "year": norm_data(row[0]),
            "source": norm_source(row[2]),
            "title": norm_title(row[3]),
            "abstract": norm_abstract(row[4]),
            "cits_text": cits_text,
            "section": norm_section(row[5], len(cits_text)),
            "cit_intent": norm_cit_intent(row[7]),
            "sentiment": norm_sentiment(row[8]),
            "retraction_mention": norm_retraction_men(row[9]),
            "note": norm_note(row[10]),
        }
        valid_docs[doi] = doc

        #Testing the csv values
        #---
        test_dates[doc["year"]].append(doi)
        test_sources[doc["source"]].append(doi)
        test_titles[doc["title"]].append(doi)
        test_ret_men[doc["retraction_mention"]].append(doi)
        # Every in-text citation needs its own section, intent and sentiment;
        # the DOIs of the documents breaking this rule are printed
        is_valid = (len(doc["cits_text"]) == len(doc["section"]) == len(doc["cit_intent"]) == len(doc["sentiment"]))
        if not is_valid:
            print(doi)

### 2) All Citing sources

### 2.1) ISBNs Citations (update the sources dataset)

In [81]:
# SUBJECTS_ISBN:   ISBN -> code (first two columns of the ISBN file)
# LCC_SCIMAGO_MAP: LCC code -> Scimago subject
SUBJECTS_ISBN = defaultdict(str)
LCC_SCIMAGO_MAP = defaultdict(str)

# Both files are opened with newline='' as required by the csv module
with open(coci_sources_isbn, newline='') as a_file:
    #0. ISBN
    #1. CODE
    csv_reader = csv.reader(a_file, delimiter=',')
    for row in csv_reader:
        SUBJECTS_ISBN[row[0]] = row[1]

with open(lcc_scimago_map, newline='') as a_file:
    #0. LCC Code
    #1. SCIMAGO Subject
    csv_reader = csv.reader(a_file, delimiter=',')
    for row in csv_reader:
        LCC_SCIMAGO_MAP[row[0]] = row[1]

# The Sources ISBN: every entry holds the two ISBNs of the same book
MY_ISBN_LIST = [
    ["9780230282889","9780230369078"],
    ["9780333922637","9780230213999"],
    ["9780470710470","9780470745328"],
    ["9780470918449","9780470381120"],
    ["9780470939345","9780471716969"],
    ["9780470939390","9780471237372"],
    ["9780470939406","9780471237389"],
    ["9780470976739","9780470694671"],
    ["9780824705107","9781420002515"],
    ["9780824707156","9780824755164"],
    ["9780824750619","9780203026229"],
    ["9781118404898","9780470029718"],
    ["9781118517000","9780470745915"],
    ["9781118543504","9780470654675"],
    ["9781118683484","9781119940418"],
    ["9781118688489","9780813806143"],
    ["9781118753378","9781118845479"],
    ["9781118858080","9781118128336"],
    ["9781118898345","9781118898390"],
    ["9781119164746","9781119164777"],
    ["9781119426981","9781118586624"],
    ["9781119943280","9780470667347"],
    ["9781119959946","9780470654750"],
    ["9781349318230","9781137023001"],
    ["9781420060737","9781420060744"],
    ["9781420068818","9781420068870"],
    ["9781439804797","9781439804827"],
    ["9781439813430","9781439813447"],
    ["9781439838839","9781439838846"],
    ["9781444325461","9781405186544"],
    ["9781444355666","9781444337082"],
    ["9781466567207","9781466567238"],
    ["9781841845203","9780203007648"],
    ["9781848550803","9781848550810"],
    ["9783319159485","9783319159492"],
    ["9783319180953","9783319180960"],
    ["9783319283241","9783319283265"],
    ["9783319309231","9783319309255"],
    ["9783319311432","9783319311432"],
    ["9783319418988","9783319418995"],
    ["9783319432663","9783319432687"],
    ["9783319599502","9783319599526"],
    ["9783319625416","9783319625430"],
    ["9783319638225","9783319638232"],
    ["9783319652641","9783319652665"],
    ["9783319669380","9783319669397"],
    ["9783319693491","9783319693507"],
    ["9783662495032","9783662495049"],
    ["9783662547984","9783662547991"],
]

# Pair of ISBNs -> code of the first ISBN of the pair found in SUBJECTS_ISBN,
# or "none" when neither of the two is known
MY_ISBN_LCC = defaultdict(str)
for book_isbns in MY_ISBN_LIST:
    found_code = "none"
    for an_isbn in (book_isbns[0], book_isbns[1]):
        if an_isbn in SUBJECTS_ISBN:
            found_code = SUBJECTS_ISBN[an_isbn]
            break
    MY_ISBN_LCC[tuple(book_isbns)] = found_code

# Others manually retrieved from https://catalog.loc.gov
MY_ISBN_LCC[("9781420068818","9781420068870")] = "RC553.A88"

# Results of this cell. They are created here so that the cell also runs on
# a fresh kernel and does not pile up duplicates when it is executed again.
# MY_ISBN_SUBJECT_DICT:    pair of ISBNs -> subject
# CSV_READY_ISBN_SUBJECTS: one dict per book, ready to be written as csv
MY_ISBN_SUBJECT_DICT = dict()
CSV_READY_ISBN_SUBJECTS = []

for isbn,code in MY_ISBN_LCC.items():
    if code == "none":
        continue

    if code in LCC_SCIMAGO_MAP:
        sub_val = LCC_SCIMAGO_MAP[code]
    else:
        # Fall back to the leading letters of the code, then drop their last
        # letter until a mapped prefix is found or no letter is left.
        # norm_isbn_code returns None when the code does not start with
        # capital letters followed by a digit: in that case nothing is tried.
        # .get() is used so the lookups do not add empty entries to the map.
        sub_val = ""
        father_code = norm_isbn_code(code) or ""
        while sub_val == "" and len(father_code)>0:
            sub_val = LCC_SCIMAGO_MAP.get(father_code, "")
            father_code = father_code[0:-1]

    MY_ISBN_SUBJECT_DICT[isbn] = sub_val
    CSV_READY_ISBN_SUBJECTS.append({"isbn":";".join(isbn),"lcc":code,"subject":sub_val})

In [55]:
# Rows of the sources file, with the subject replaced for the known ISBN books
UPDATED_COCI_SOURCES = []

# newline='' is the way the csv module expects files to be opened
with open(coci_sources, newline='') as a_file:
    csv_reader = csv.reader(a_file, delimiter=',')
    #0. date
    #1. subject
    #2. area
    #3. source_title
    #4. source_id
    #5. notes
    #6. doi

    #skip the header
    next(csv_reader, None)
    #iterate all the csv rows
    for row in csv_reader:
        # The source id is turned into the same tuple of ISBNs used as key
        # of MY_ISBN_SUBJECT_DICT
        k = tuple(row[4].replace("isbn:","").split("; "))
        if k in MY_ISBN_SUBJECT_DICT:
            row[1] = MY_ISBN_SUBJECT_DICT[k]
        # write_list expects dicts: the column position is used as key
        UPDATED_COCI_SOURCES.append(dict(enumerate(row)))

# The header line skipped above is not written to the output file
write_list(UPDATED_COCI_SOURCES, "coci_sources_updated.csv", header= False)

### 2.2) All Citing Sources

In [82]:
# One dict per source having at least one subject and a recognizable year
valid_sources = []
#--- Testing
test_dates = defaultdict(list)
test_titles = defaultdict(list)
test_sources = defaultdict(list)
test_ret_men = defaultdict(list)
# newline='' is the way the csv module expects files to be opened
with open(coci_sources, newline='') as a_file:
    csv_reader = csv.reader(a_file, delimiter=',')
    #0. date
    #1. subject
    #2. area
    #3. source_title
    #4. source_id
    #5. notes
    #6. doi

    #iterate all the csv rows
    for row in csv_reader:
        subject = norm_subject(row[1])
        if len(subject) == 0:
            continue

        # norm_data returns "none" when the date holds no 4-digit year, and
        # int("none") would stop the whole cell. This also covers a header
        # line, which is not skipped in this cell.
        year = norm_data(row[0])
        if year == "none":
            continue

        elem = {
            "year" : int(year),
            "subject" : subject,
            "area" : norm_area(row[2]),
            "source_title" : norm_title(row[3]),
            "source_id" : norm_source_id(row[4]),
            "doi" : norm_dois(row[6])
        }
        valid_sources.append(elem)

## Analysis with pandas 

In [89]:
#Documents: one row per DOI, one column per normalized field
df = pd.DataFrame.from_dict(valid_docs).transpose()
#Each document gets a tuple (citation texts, intents, sentiments, sections)
df["intext_cit"] = list(
    zip(
        df["cits_text"],
        df["cit_intent"],
        df["sentiment"],
        df["section"],
    )
)
sub_df = df.loc[:, ['year', 'source', 'title', 'abstract', 'retraction_mention', 'note', 'intext_cit']]

#Sources: the list of DOIs is replaced by its size, then only the years
#from 2011 to 2017 (both included) are kept
df_sources = pd.DataFrame(valid_sources)
df_sources["doi"] = df_sources["doi"].apply(len)
df_sources = df_sources[df_sources["year"].between(2011, 2017)]

In [90]:
# Number of entries of the "section" list of every document
df_cts_x_doc = df["section"].apply(len)

TOT_DOC = df_cts_x_doc.count()
TOT_INTEXT_CIT = df_cts_x_doc.sum()
MEAN_CITxDOC = df_cts_x_doc.mean()

# Documents grouped by the normalized source label
DOI_DOCs = df.loc[df["source"] == "doi", "source"].count()
OTHER_DOCs = df.loc[df["source"] == "other", "source"].count()

#COCI CITS
# One entry per citing entity: a later row with the same "citing" value
# replaces the earlier one
COCI_CITS_DICT = {}
# newline='' is the way the csv module expects files to be opened
with open(coci_cits, mode='r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        COCI_CITS_DICT[row["citing"]] = row
COCI_CITS_DF = pd.DataFrame.from_dict(COCI_CITS_DICT).transpose()
# Only the first four characters of the creation date are kept
COCI_CITS_DF['creation'] = COCI_CITS_DF['creation'].apply(lambda x: x[0:4])
CITS_SOURCES =  COCI_CITS_DF[['creation','source_title','source_id']]
# creation -> set of (source_title, source_id)
SOURCES_BY_YEAR = defaultdict(set)
for row in CITS_SOURCES.itertuples():
    # position 0 of the tuple is the index of the DataFrame
    SOURCES_BY_YEAR[row[1]].add((row[2],row[3]))

# Documents counted by the value of "retraction_mention"
RET_MEN = defaultdict(int)
for mention in df["retraction_mention"].tolist():
    RET_MEN[mention] += 1

# In-text citations counted by sentiment, by intent, and by the combination
# (intent, sentiment, retraction mention of the document)
SENTIMENT_COUNT = defaultdict(int)
CIT_INTENT_COUNT = defaultdict(int)
PATTERN = defaultdict(int)
for doi, item in df.iterrows():
    doc_texts, doc_intents, doc_sentiments = item["intext_cit"][0], item["intext_cit"][1], item["intext_cit"][2]
    doc_mention = item["retraction_mention"]
    # The citation texts decide how many in-text citations are read
    for cit_index in range(len(doc_texts)):
        cit_intent_val = doc_intents[cit_index]
        cit_sentiment_val = doc_sentiments[cit_index]

        CIT_INTENT_COUNT[cit_intent_val] += 1
        SENTIMENT_COUNT[cit_sentiment_val] += 1
        PATTERN[(cit_intent_val,cit_sentiment_val,doc_mention)] += 1
        
        
# The Sources ISSN
# SUBJECTS: subject -> area -> sum of the "doi" column of df_sources
SUBJECTS = defaultdict(dict)
sub_area = list(zip(df_sources["subject"], list(df_sources["area"]), list(df_sources["doi"])))
for subjects, areas, n_dois in sub_area:
    if len(areas) == 0:
        #in case of an ISBN: counted under the first subject only
        isbn_counts = SUBJECTS[subjects[0]]
        isbn_counts["ISBN"] = isbn_counts.get("ISBN", 0) + n_dois
        continue

    # The areas at position i belong to the subject at position i
    for sub_index, subject in enumerate(subjects):
        for area in areas[sub_index]:
            area_counts = SUBJECTS[subject]
            area_counts[area] = area_counts.get(area, 0) + n_dois

# Flatten the nested counts into one row per (subject, area)
READY_TO_CSV = [
    {"subject":subject,"area":area_name,"count":counts[area_name]}
    for subject, counts in SUBJECTS.items()
    for area_name in counts
]
write_list(READY_TO_CSV, "sources_test.csv", header= False)

In [None]:
# Summary of the documents
print("+ Total number of documents: ",TOT_DOC)
print("\n+ Source: ")
print("      from the editor page: "+str(DOI_DOCs))
print("      from other sources: "+str(OTHER_DOCs))
print("\n+ Total number of in-text reference pointers: ",TOT_INTEXT_CIT)
print("\n+ Average number of in-text reference pointers per document: ",MEAN_CITxDOC)
print("\n+ Documents which mention the retraction: ", RET_MEN['yes'])
print("\n+ In-text reference pointers: ")

# Each counter is replaced by a plain dict ordered from the highest count
print("\n      sentiment count:")
SENTIMENT_COUNT = dict(sorted(SENTIMENT_COUNT.items(), key=lambda pair: pair[1], reverse=True))
for label, total in SENTIMENT_COUNT.items():
    print("      ",label," : ",total )
print("      -----")

print("\n      intent count:")
CIT_INTENT_COUNT = dict(sorted(CIT_INTENT_COUNT.items(), key=lambda pair: pair[1], reverse=True))
for label, total in CIT_INTENT_COUNT.items():
    print("      ",label," : ",total )
print("      -----")

print("\n      common patters (intent, sentiment, mentions retraction):")
PATTERN = dict(sorted(PATTERN.items(), key=lambda pair: pair[1], reverse=True))
for label, total in PATTERN.items():
    print("      ",label," : ",total )

