In [1]:
import csv
import re
import requests
import os
from datetime import datetime
import pandas as pd 
import numpy as np
from collections import defaultdict
import pprint
import norm 

## Read the in-text citations dataset

In [4]:
# Data structures
# ---
# valid_docs: DOI -> dict of normalized fields, for rows with at least one in-text citation.
# err_docs:   DOI -> raw value of the "retracted" column, for rows whose in-text
#             citations normalize to an empty collection.
valid_docs = dict()
err_docs = dict()
# --- Testing: normalized value -> list of DOIs that carry it
test_dates = defaultdict(list)
test_titles = defaultdict(list)
test_sources = defaultdict(list)
test_ret_men = defaultdict(list)

# newline="" is what the csv module asks for, so that line breaks embedded in
# quoted fields are interpreted by the reader rather than by the file object.
with open("../data/coci_intext_ref.csv", newline="") as a_file:
    csv_reader = csv.reader(a_file, delimiter=',')

    # Column layout of the CSV:
    #  0) date
    #  1) doi
    #  2) full_text_source
    #  3) title
    #  4) abstract
    #  5) section
    #  6) intext_ref
    #  7) cito_fun
    #  8) sentiment
    #  9) ret_mention
    # 10) retracted
    # 11) note
    # 12) source_title
    # 13) source_id
    # 14) area
    # 15) category

    # Skip the header row
    next(csv_reader, None)

    for row in csv_reader:

        # Classify documents into valid and error
        # ---
        cits_text = norm.norm_cits_text(row[6])
        if len(cits_text) == 0:
            err_docs[row[1]] = row[10]
            continue

        doi = row[1]
        # Key order is kept stable because it becomes the field order of the
        # DataFrame built from valid_docs later on.
        doc = {
            "year": norm.norm_data(row[0]),
            "source": norm.norm_source(row[2]),
            "title": norm.norm_title(row[3]),
            "abstract": norm.norm_abstract(row[4]),
            "cits_text": cits_text,
            "section": norm.norm_section(row[5], len(cits_text)),
            "cit_intent": norm.norm_cit_intent(row[7]),
            "sentiment": norm.norm_sentiment(row[8]),
            "retraction_mention": norm.norm_retraction_men(row[9]),
            # NOTE(review): "retracted" goes through the retraction-mention
            # normalizer, exactly as before - confirm this is intended.
            "retracted": norm.norm_retraction_men(row[10]),
            "note": norm.norm_note(row[11]),
            "source_title": norm.norm_title(row[12]),
            "source_id": norm.norm_source_id(row[13]),
            "area": norm.norm_area(row[14]),
            "category": norm.norm_category(row[15]),
        }
        # A DOI that appears on several rows keeps only its last row here.
        valid_docs[doi] = doc

        # Testing the csv values
        # ---
        # Reuse the values normalized above instead of normalizing each column
        # a second time (assumes the norm helpers are deterministic).
        test_dates[doc["year"]].append(doi)
        test_sources[doc["source"]].append(doi)
        test_titles[doc["title"]].append(doi)
        test_ret_men[doc["retraction_mention"]].append(doi)

        # The four per-citation fields must all have the same length;
        # print the DOI of any document where they do not.
        per_citation_sizes = {
            len(doc[field])
            for field in ("cits_text", "section", "cit_intent", "sentiment")
        }
        is_valid = len(per_citation_sizes) == 1
        if not is_valid:
            print(doi)

## Results analysis

In [218]:
# from_dict puts one DOI per column; transposing yields one row per DOI
# and one column per normalized field.
base_df = pd.DataFrame.from_dict(valid_docs).T
# Bundle each document's parallel per-citation fields into one tuple:
# (cits_text, cit_intent, sentiment, section).
base_df["intext_cit"] = list(
    zip(*(base_df[field] for field in ("cits_text", "cit_intent", "sentiment", "section")))
)

# Analysis windows: each entry carries a display label, the inclusive
# (first_year, last_year) bounds, and its own dict that later receives the results.
periods = [
    {
        "label": "%d-%d" % (first_year, last_year),
        "years": (first_year, last_year),
        "results": {},
    }
    for first_year, last_year in ((1998, 2004), (2005, 2010), (2011, 2017))
]

In [219]:
## Separate results and calculate basic stats
## ---------------

# Inclusive (first_year, last_year) window used by filter_fn when it is called
# without an explicit `years` argument (e.g. through DataFrame.apply).
p_years = None


def filter_fn(row, years=None):
    """Tell whether a document's year falls inside an inclusive year window.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by "year"; the value is converted with int().
    years : sequence of two ints, optional
        (first_year, last_year), both inclusive. When omitted, the
        module-level ``p_years`` is used, so existing one-argument callers
        keep working.

    Returns
    -------
    bool
        True when first_year <= year <= last_year.

    Raises
    ------
    TypeError
        If no window is available (``years`` is None and ``p_years`` is None).
    """
    bounds = p_years if years is None else years
    return bounds[0] <= int(row["year"]) <= bounds[1]

def intext_cits_map(df):
    """Count in-text citations in ``df`` and tally them by several keys.

    For every row:
      * the number of entries in "section" is added to the overall count;
      * each value in "sentiment" and in "cit_intent" bumps its own tally;
      * for each position in the first member of "intext_cit", the triple
        (intent, sentiment, retraction_mention) is tallied, where intent and
        sentiment come from the second and third members of "intext_cit" and
        retraction_mention is the row-level value.

    Returns a dict with "count" (int) and "groups", which maps "sentiment",
    "cit_intent" and "intext_cit" to a defaultdict(int) of counts.
    """
    tallies = {
        "sentiment": defaultdict(int),
        "cit_intent": defaultdict(int),
        "intext_cit": defaultdict(int),
    }
    total = 0

    for _, doc in df.iterrows():
        total += len(doc["section"])

        for field, tally in tallies.items():
            if field != "intext_cit":
                # Plain per-citation labels: count each occurrence.
                for label in doc[field]:
                    tally[label] += 1
                continue

            # Combined pattern: intent + sentiment + whether the document
            # mentions the retraction.
            bundle = doc[field]
            for pos in range(len(bundle[0])):
                pattern = (bundle[1][pos], bundle[2][pos], doc["retraction_mention"])
                tally[pattern] += 1

    return {"count": total, "groups": tallies}

def sources_map(df):
    """Tally subject information of the documents in ``df``.

    Two tallies are produced, both as defaultdict(int):
      * "subject": one count per (area, category) pair, where the categories
        of the i-th area are read from the i-th entry of "category";
      * "intext_cits": for every area of a document and every position in the
        first member of its "intext_cit", one count for
        (area, intent, sentiment, retraction_mention), with intent and
        sentiment taken from the second and third members of "intext_cit".

    Returns {"groups": {"subject": ..., "intext_cits": ...}}.
    """
    by_subject = defaultdict(int)
    by_area_citation = defaultdict(int)

    for _, doc in df.iterrows():
        # (area, category) pairs
        for pos, area in enumerate(doc["area"]):
            for category in doc["category"][pos]:
                by_subject[(area, category)] += 1

        # every in-text citation is attributed to each area of the document
        for area in doc["area"]:
            for pos in range(len(doc["intext_cit"][0])):
                pattern = (
                    area,
                    doc["intext_cit"][1][pos],
                    doc["intext_cit"][2][pos],
                    doc["retraction_mention"],
                )
                by_area_citation[pattern] += 1

    return {"groups": {"subject": by_subject, "intext_cits": by_area_citation}}

In [220]:
# Compute, for every analysis period, the filtered frame and its summary stats,
# and store them under p["results"].
for p in periods:
    # filter_fn reads the module-level p_years, so it has to be rebound
    # before each filtering pass.
    p_years = p["years"]
    p_df = base_df.copy()
    m = p_df.apply(filter_fn, axis=1)
    p_df = p_df[m]

    intext_cits = intext_cits_map(p_df)
    sources = sources_map(p_df)

    # A period with no matching documents would otherwise divide by zero;
    # report an average of 0.0 for it instead.
    n_docs = len(p_df)
    avg_intext_cits = round(intext_cits["count"] / n_docs, 2) if n_docs else 0.0

    p["results"] = {

        ## Data frame
        "df": p_df,

        # (year, counts) records: number of documents per year
        "citations": p_df.groupby(['year']).size().reset_index(name='counts').to_records(index=False),

        ## documents
        "docs": n_docs,
        # documents whose normalized source is "other" / "doi" respectively
        "paywall": p_df[p_df["source"] == "other"]["source"].count(),
        "open": p_df[p_df["source"] == "doi"]["source"].count(),

        ## in-text citations
        "total_intext_cits": intext_cits["count"],
        "avg_intext_cits": avg_intext_cits,
        "mentions_retraction": p_df[p_df["retraction_mention"] == "yes"]["retraction_mention"].count(),
        "intext_cits_groups": intext_cits["groups"],

        ## sources
        "sources_groups": sources["groups"],
    }

In [232]:
## Print on file the results
# Four CSV files are written per period, named "<period label><suffix>".
results_dir = "../results/data/"
# Create the target directory up front so the first open() cannot fail
# just because the folder is missing.
os.makedirs(results_dir, exist_ok=True)

for p in periods:
    res = p["results"]

    # file-name suffix -> (header row, data rows), in the order the files are written
    tables = {
        # documents per year
        "_citations.csv": (
            ["year", "count"],
            [[val[0], val[1]] for val in res["citations"]],
        ),
        # in-text citations by intent / sentiment / retraction mention
        "_intext_cits_gen.csv": (
            ["intent", "sentiment", "mentions_retraction", "count"],
            [[k[0], k[1], k[2], val]
             for k, val in res["intext_cits_groups"]["intext_cit"].items()],
        ),
        # (area, category) pairs
        "_sources_gen.csv": (
            ["area", "category", "count"],
            [[k[0], k[1], val]
             for k, val in res["sources_groups"]["subject"].items()],
        ),
        # in-text citation patterns per area
        "_sources_intext_cits.csv": (
            ["area", "intent", "sentiment", "mentions_retraction", "count"],
            [[k[0], k[1], k[2], k[3], val]
             for k, val in res["sources_groups"]["intext_cits"].items()],
        ),
    }

    for suffix, (header, data_rows) in tables.items():
        f_name = p["label"] + suffix
        # newline="" lets the csv writer control line endings itself; without
        # it, platforms that translate "\n" end up with blank lines between rows.
        with open(results_dir + f_name, "w", newline="") as outfile:
            w = csv.writer(outfile)
            w.writerow(header)
            w.writerows(data_rows)
            