### Use the 100 GB RAM instance

In [1]:
import pandas as pd
from tables import *
import csv
import numpy as np
import pickle
import time
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
# Base directory (with trailing slash) that holds the input files read below
# and the arrays/ sub-folder the result matrices are written to.
path = '/home/User1/data/self_citations/'

In [3]:
# article.txt is tab-separated with one header row. Column 0 is the article ID
# and column 2 is the integer later used as the article's publication year.
with open(path + 'article.txt', 'r') as article_file:
    article_rows = csv.reader(article_file, delimiter='\t')
    next(article_rows)  # skip the header row
    articles = {int(row[0]): int(row[2]) for row in article_rows}

In [4]:
# dict_citant: citing article ID -> collection of the article IDs it references
# (the loops below iterate its keys and take len()/set() of its values).
with open(path + 'dict_citant.p', 'rb') as pickled:
    dict_citant = pickle.load(pickled)

In [5]:
# dict_cluster_art: article ID -> collection of author (cluster) IDs.
# Articles missing from this mapping are skipped by the loops below.
with open(path + 'dict_cluster_art.p', 'rb') as pickled:
    dict_cluster_art = pickle.load(pickled)

# Run on first time only: builds, for every discipline, the mapping
# Cluster_ID -> row position used by the per-discipline matrices, and caches
# it in authors_disc.p so later sessions can simply reload it.
authors_info = pd.read_csv(path+'authors_info.csv',sep='\t')
by_discipline = authors_info.groupby('EDiscipline')
nb_authors_disc = by_discipline.agg('count')
list_disciplines = nb_authors_disc.index
# One pass over the groups instead of a full-frame boolean mask plus
# iterrows() per discipline. Groups come out in sorted key order (the same
# order as nb_authors_disc.index) and keep the original row order, so the
# positions are identical to a row-by-row enumeration.
authors_disc_idx = {
    disc: {cluster_id: pos for pos, cluster_id in enumerate(group['Cluster_ID'])}
    for disc, group in by_discipline
}
with open(path+'authors_disc.p','wb') as f:
    pickle.dump(authors_disc_idx,f)

In [6]:
# Reload the cached {discipline: {Cluster_ID: row position}} mapping.
with open(path + 'authors_disc.p', 'rb') as pickled:
    authors_disc_idx = pickle.load(pickled)
# Discipline labels in the mapping's own key order, and how many there are.
list_disciplines = [*authors_disc_idx.keys()]
nb_disciplines = len(list_disciplines)

In [7]:
# Citing articles are kept when min_year <= year < max_year
# (max_year itself is excluded by the loops below).
min_year = 1980
max_year = 2019
# Number of columns of every per-author matrix and length of the per-year arrays.
nb_years = max_year-min_year

In [8]:
# authors_info.csv is tab-separated with one header row.
# Result: author ID (column 0) -> [discipline label (column 1), year (column 2)].
# The year is subtracted from the citing year to obtain the "author age".
with open(path + 'authors_info.csv', 'r') as authors_file:
    author_rows = csv.reader(authors_file, delimiter='\t')
    next(author_rows)  # skip the header row
    authors_info = {int(row[0]): [row[1], int(row[2])] for row in author_rows}

In [9]:
# Citing articles are the keys of dict_citant.
citants = [*dict_citant.keys()]
nb_citants = len(citants)
# Random (unseeded) visiting order over the citing articles.
idx = np.arange(nb_citants)
np.random.shuffle(idx)
# Citation categories; the complete set is ['self','co','others'].
types_cit = ['others']
nb_types = len(types_cit)
# Denominator of the progress / time-left estimates in the long loops below.
total_iterations = nb_types*nb_citants

### Compute raw citations

In [None]:
def _add_author_count(matrices, author_id, year, amount=1):
    """Add `amount` to one author's cell of the per-discipline matrices.

    matrices  -- dict: discipline -> lil_matrix of shape (authors, nb_years)
    author_id -- key of authors_info and of authors_disc_idx[discipline]
    year      -- year of the citing article
    amount    -- value added to the cell (1 for raw counts)

    The column is the "author age", year - authors_info[author_id][1].
    Ages >= nb_years fall outside the matrix and are ignored. Negative ages
    are clamped to 0: left as-is they would pass the `< nb_years` test and be
    interpreted as negative column indices, silently landing in the last
    columns of the matrix.
    """
    disc, start_year = authors_info[author_id]
    author_age = max(year - start_year, 0)
    if author_age < nb_years:
        author_idx = authors_disc_idx[disc][author_id]
        matrices[disc][author_idx, author_age] += amount


# Raw (unweighted) counts. For every citing article inside the year window,
# each (citing article, referenced article) pair is classified by the overlap
# between the two author sets:
#   no shared author   -> 'others'
#   shared authors     -> 'self' for the shared authors themselves,
#                         'co' for the remaining authors on either side.
# `citations` is incremented for authors of the referenced article and
# `references` for authors of the citing article.
start_time = time.time()
count = 0
# The context manager guarantees the progress log is closed even if the
# (very long) loop is interrupted.
with open(path+'progress.txt','w') as f:
    for type_cit in ['co']:# types_cit:
        references = {d:sparse.lil_matrix((len(authors_disc_idx[d]),nb_years)) for d in list_disciplines}
        citations = {d:sparse.lil_matrix((len(authors_disc_idx[d]),nb_years)) for d in list_disciplines}
        for i in range(nb_citants):
            count += 1
            art_citant = citants[idx[i]]
            year_citant = articles[art_citant]
            if min_year <= year_citant < max_year and art_citant in dict_cluster_art:
                authors_citant = set(dict_cluster_art[art_citant])
                for ref in set(dict_citant[art_citant]):
                    if ref not in dict_cluster_art:
                        continue
                    authors_cite = set(dict_cluster_art[ref])
                    int_authors = authors_cite & authors_citant
                    if not int_authors:
                        if type_cit == 'others':
                            for ID_author_cite in authors_cite:
                                _add_author_count(citations, ID_author_cite, year_citant)
                            for ID_author_citant in authors_citant:
                                _add_author_count(references, ID_author_citant, year_citant)
                    elif type_cit == 'self':
                        # A shared author both gives and receives the citation.
                        for ID_author in int_authors:
                            _add_author_count(citations, ID_author, year_citant)
                            _add_author_count(references, ID_author, year_citant)
                    elif type_cit == 'co':
                        # Only the authors who are NOT shared count as co-author links.
                        for ID_author_cite in authors_cite - int_authors:
                            _add_author_count(citations, ID_author_cite, year_citant)
                        for ID_author_citant in authors_citant - int_authors:
                            _add_author_count(references, ID_author_citant, year_citant)
            if count % 1000 == 1 :
                elapsed_time = time.time() - start_time
                elapsed_time_h = np.round(elapsed_time/3600,2)
                time_per_art = elapsed_time/count
                time_left_h = np.round(time_per_art*(total_iterations-count)/3600,2)
                perc = np.round(count/total_iterations*100,3)
                f.write('Progress: {} articles, {}%, Time since start: {}, Time left: {}\n'.format(count,perc,elapsed_time_h,time_left_h))
                f.flush()
        for disc in list_disciplines:
            # save_npz cannot serialise the LIL format, so convert to CSR first
            # (same conversion as used for the *_norm arrays).
            with open('{}/arrays/citations_{}_{}_array.npz'.format(path,type_cit,disc),'wb') as f2:
                sparse.save_npz(f2,sparse.csr_matrix(citations[disc]))
            with open('{}/arrays/references_{}_{}_array.npz'.format(path,type_cit,disc),'wb') as f2:
                sparse.save_npz(f2,sparse.csr_matrix(references[disc]))

### Compute year-normalized citations

In [10]:
# Per calendar year (offset from min_year): number of citing articles and the
# total number of references they make.
citant_per_year = np.zeros(nb_years)
ref_per_year = np.zeros(nb_years)
for i in tqdm(range(nb_citants)):
    art_citant = citants[idx[i]]
    year_citant = articles[art_citant]
    if not (min_year <= year_citant < max_year):
        continue  # outside the studied window
    rel_year_citant = year_citant - min_year
    citant_per_year[rel_year_citant] += 1
    ref_per_year[rel_year_citant] += len(dict_citant[art_citant])
# Mean number of references per citing article, year by year.
cit_per_art_year = ref_per_year / citant_per_year

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


A Jupyter Widget




In [None]:
# Year weights: the year with the highest mean number of references per
# citing article gets weight 1, every other year a proportionally larger one.
# np.nanmax is used instead of the builtin max because a year without any
# citing article yields 0/0 = NaN in cit_per_art_year; the builtin max is
# order-dependent in the presence of NaN and can return NaN, which would turn
# every weight into NaN.
peak_refs_per_article = np.nanmax(cit_per_art_year)
weights_by_offset = 1/(cit_per_art_year/peak_refs_per_article)
# Final lookup table: calendar year -> weight.
weight_citant = dict(zip(range(min_year,max_year),weights_by_offset))

In [None]:
def _add_author_count(matrices, author_id, year, amount=1):
    """Add `amount` to one author's cell of the per-discipline matrices.

    matrices  -- dict: discipline -> lil_matrix of shape (authors, nb_years)
    author_id -- key of authors_info and of authors_disc_idx[discipline]
    year      -- year of the citing article
    amount    -- value added to the cell (the year weight here)

    The column is the "author age", year - authors_info[author_id][1].
    Ages >= nb_years fall outside the matrix and are ignored. Negative ages
    are clamped to 0: left as-is they would pass the `< nb_years` test and be
    interpreted as negative column indices, silently landing in the last
    columns of the matrix.
    """
    disc, start_year = authors_info[author_id]
    author_age = max(year - start_year, 0)
    if author_age < nb_years:
        author_idx = authors_disc_idx[disc][author_id]
        matrices[disc][author_idx, author_age] += amount


# Year-normalized counts: same classification as the raw computation, but each
# citation adds weight_citant[year of the citing article] instead of 1.
#
# Only the cited-side matrices (`citations`) are rebuilt by default. Set
# compute_references to True to also rebuild and save the citing-side
# matrices. When it is False, `references` is never touched, so matrices left
# over from another cell cannot be modified by accident.
compute_references = False

# Started once for the whole run, because total_iterations spans all types.
start_time = time.time()
count = 0
# The context manager guarantees the progress log is closed even if the
# (very long) loop is interrupted.
with open(path+'progress.txt','w') as f:
    for type_cit in ['co']:#types_cit:
        citations = {d:sparse.lil_matrix((len(authors_disc_idx[d]),nb_years)) for d in list_disciplines}
        if compute_references:
            references = {d:sparse.lil_matrix((len(authors_disc_idx[d]),nb_years)) for d in list_disciplines}
        for i in range(nb_citants):
            count += 1
            art_citant = citants[idx[i]]
            year_citant = articles[art_citant]
            if min_year <= year_citant < max_year:
                weight = weight_citant[year_citant]
                if art_citant in dict_cluster_art:
                    authors_citant = set(dict_cluster_art[art_citant])
                    for ref in set(dict_citant[art_citant]):
                        if ref not in dict_cluster_art:
                            continue
                        authors_cite = set(dict_cluster_art[ref])
                        int_authors = authors_cite & authors_citant
                        if not int_authors:
                            if type_cit == 'others':
                                for ID_author_cite in authors_cite:
                                    _add_author_count(citations, ID_author_cite, year_citant, weight)
                                if compute_references:
                                    for ID_author_citant in authors_citant:
                                        _add_author_count(references, ID_author_citant, year_citant, weight)
                        elif type_cit == 'self':
                            # A shared author both gives and receives the citation.
                            for ID_author in int_authors:
                                _add_author_count(citations, ID_author, year_citant, weight)
                                if compute_references:
                                    _add_author_count(references, ID_author, year_citant, weight)
                        elif type_cit == 'co':
                            # Only the authors who are NOT shared count as co-author links.
                            for ID_author_cite in authors_cite - int_authors:
                                _add_author_count(citations, ID_author_cite, year_citant, weight)
                            if compute_references:
                                for ID_author_citant in authors_citant - int_authors:
                                    _add_author_count(references, ID_author_citant, year_citant, weight)
            if count % 1000 == 1 :
                elapsed_time = time.time() - start_time
                elapsed_time_h = np.round(elapsed_time/3600,2)
                time_per_art = elapsed_time/count
                time_left_h = np.round(time_per_art*(total_iterations-count)/3600,2)
                perc = np.round(count/total_iterations*100,3)
                f.write('Progress: {} articles, {}%, Time since start: {}, Time left: {}\n'.format(count,perc,elapsed_time_h,time_left_h))
                f.flush()
        for disc in list_disciplines:
            # save_npz cannot serialise the LIL format, hence the CSR conversion.
            with open('{}/arrays/citations_{}_{}_array_norm.npz'.format(path,type_cit,disc),'wb') as f2:
                sparse.save_npz(f2,sparse.csr_matrix(citations[disc]))
            if compute_references:
                with open('{}/arrays/references_{}_{}_array_norm.npz'.format(path,type_cit,disc),'wb') as f2:
                    sparse.save_npz(f2,sparse.csr_matrix(references[disc]))

In [47]:
# Write the current `citations` and `references` matrices of every discipline
# to the *_norm.npz files, using the `type_cit` left over from the last loop.
# NOTE(review): both dicts must hold year-normalized values when this runs;
# nothing here checks that - confirm before executing.
for disc in list_disciplines:
    citations_target = '{}/arrays/citations_{}_{}_array_norm.npz'.format(path,type_cit,disc)
    references_target = '{}/arrays/references_{}_{}_array_norm.npz'.format(path,type_cit,disc)
    # LIL matrices are converted to CSR before being serialised.
    with open(citations_target,'wb') as out:
        sparse.save_npz(out,sparse.csr_matrix(citations[disc]))
    with open(references_target,'wb') as out:
        sparse.save_npz(out,sparse.csr_matrix(references[disc]))

In [None]:
# Allocate empty matrices nested as discipline -> citation type -> lil_matrix
# of shape (authors in the discipline, nb_years).
# (An earlier variant inserted an extra 'age'/'year' level between the
# discipline and the citation type.)
references = {}
citations = {}
for d in list_disciplines:
    nb_authors = len(authors_disc_idx[d])
    references[d] = {y:sparse.lil_matrix((nb_authors,nb_years)) for y in ['self','co','others']}
    citations[d] = {y:sparse.lil_matrix((nb_authors,nb_years)) for y in ['self','co','others']}

In [None]:
# IPython magic: load the line_profiler extension (needed for %lprun below).
%load_ext line_profiler

In [None]:
# IPython magic: run prof_lines() and report per-line timings for that function.
# NOTE(review): prof_lines is not defined in the visible cells - it has to be
# defined before this cell is executed.
%lprun -f prof_lines prof_lines()