# COCI Data Analysis 

An analysis of COCI that checks the percentage of citations in COCI compared with the other main open citation repositories.

### An example with a small sample

In [1]:
# Input CSV and the column positions of the citing / cited values in each row.
DATA_PATH = "../../../data/coci-dataset/coci_sample.csv"
CITING_CSV_INDEX, CITED_CSV_INDEX = 0, 1

# COCI inner variables, expressed as {<VAR-NAME>: <CSV-INDEX>}.
COCI_KEY = CITING_CSV_INDEX
VAR_INDEX = {'CITING_DOI': CITING_CSV_INDEX}

# Datasets to compare COCI with, as {<DATASET-NAME>: <API-CALL>}.
# An API call may embed any VAR_INDEX name wrapped as [[VAR-NAME]].
DATASETS_TO_COMPARE = {
    'wikidata': "https://opencitations.net/wikidata/api/v1/metadata/[[CITING_DOI]]",
}

# Processing buffer, counted in citations
# (for scale: COCI holds 445,826,118 citations).
BUFFER = 1000000
PROCESS_DIR = "../data"

In [2]:
import csv
import os, os.path
import re
import requests
import json

# Walk the COCI CSV one citation (row) at a time and, for every citing value,
# record how many COCI citations it has and how many references each dataset
# in DATASETS_TO_COMPARE reports for it. Every BUFFER citations the collected
# rows are dumped to a new numbered CSV file inside PROCESS_DIR.

# Matches each [[VAR-NAME]] placeholder of an API template. Non-greedy, so two
# placeholders in the same template are captured separately.
PLACEHOLDER_RE = re.compile(r"\[\[(.*?)\]\]")

# Seconds to wait for a remote API before the request is abandoned;
# without a timeout requests.get() may block forever.
REQUEST_TIMEOUT = 30


def _expand_api_call(template, var_values):
    """Return *template* with every known ``[[VAR-NAME]]`` replaced by its value.

    Placeholders whose name is not a key of *var_values* are left untouched.
    """
    api_call = template
    for v_k in PLACEHOLDER_RE.findall(template):
        if v_k in var_values:
            api_call = api_call.replace("[[" + v_k + "]]", var_values[v_k])
    return api_call


def _count_references(ds_key, get_data):
    """Return the number of references found in the decoded answer *get_data*.

    Only the "wikidata" dataset type is understood; any other *ds_key*, and
    any empty answer, counts as 0.
    """
    if not get_data:
        return 0
    if ds_key == "wikidata":
        reference = get_data[0].get('reference') or ""
        # "".split(";") yields [''], so only non-blank items are counted.
        return sum(1 for ref in reference.split(";") if ref.strip())
    return 0


def _dump_results(results, process_dir, dataset_keys):
    """Write *results* to the next numbered CSV file inside *process_dir*.

    *results* maps a citing value to ``[coci_refs, <count per dataset>...]``,
    with the dataset counts in the order of *dataset_keys*. The file is named
    after the number of files already present in *process_dir*.
    """
    os.makedirs(process_dir, exist_ok=True)
    # isfile() needs the full path: a bare name is resolved against the
    # current working directory, not against process_dir.
    files_processed = len([
        name for name in os.listdir(process_dir)
        if os.path.isfile(os.path.join(process_dir, name))
    ])
    f_name = str(files_processed) + ".csv"
    with open(os.path.join(process_dir, f_name), 'w',
              newline='', encoding='utf-8') as d_file:
        writer = csv.writer(d_file, lineterminator="\n")
        writer.writerow(["citing", "coci_refs"] + list(dataset_keys))
        for citing, counts in results.items():
            writer.writerow([citing] + counts)


with open(DATA_PATH, newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    buffer_count = BUFFER
    # {<citing value>: [coci_refs, <count for each dataset to compare>...]}
    results = {}
    for row in csv_reader:
        if not row:
            # Blank line in the CSV: nothing to index.
            continue

        # for each citation of COCI
        # -> select and define the vars needed
        var_values = {var_k: row[VAR_INDEX[var_k]] for var_k in VAR_INDEX}
        citing = row[COCI_KEY]

        if citing in results:
            # Already looked up within this buffer: one more COCI citation,
            # and no need to query the remote datasets again.
            results[citing][0] += 1
        else:
            counts = [1]
            # -> check the other resources
            for ds_compare_k, template in DATASETS_TO_COMPARE.items():
                api_call = _expand_api_call(template, var_values)
                res = requests.get(api_call, timeout=REQUEST_TIMEOUT)
                # Parse the body as-is: rewriting quotes would corrupt any
                # valid JSON string that contains an apostrophe.
                get_data = json.loads(res.content.decode('utf8'))
                counts.append(_count_references(ds_compare_k, get_data))
            results[citing] = counts

        # Dump the buffer on a file once BUFFER citations have been processed.
        # NOTE: a citing value whose citations straddle two buffers gets one
        # row in each of the two output files.
        buffer_count -= 1
        if buffer_count <= 0:
            _dump_results(results, PROCESS_DIR, DATASETS_TO_COMPARE)
            # -> reset vars
            buffer_count = BUFFER
            results = {}

    # Write whatever is left after the last full buffer.
    if results:
        _dump_results(results, PROCESS_DIR, DATASETS_TO_COMPARE)