In [1]:
import pandas as pd
from tables import *
import csv
import numpy as np
import pickle
import time
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
# Root directory for every input file read below and for the output arrays
# written later. The trailing slash matters: file names are appended with
# plain string concatenation (path+'...').
path = '/home/User1/data/self_citations/'

In [3]:
# Article lookup built from the tab-separated article file:
# key = int(column 0), value = int(column 2). The value is read as the
# article's year by the loops further down.
with open(path+'article.txt','r') as f:
    reader = csv.reader(f,delimiter='\t')
    next(reader)  # discard the header row
    articles = {int(row[0]): int(row[2]) for row in reader}

In [4]:
# Load the ordered author IDs, then read the arrays that were saved
# back-to-back into a single .npy file: the k-th array read is stored under
# IDs_network[k].
# NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
with open(path+'IDs_network.p','rb') as f:
    IDs_network = pickle.load(f)
authors_network = {}
count = 0
with open(path+'author_network.npy','rb') as f:
    while True:
        try:
            author_array = np.load(f)
        except (EOFError, ValueError):
            # End of the stream. Current NumPy raises EOFError once every
            # array has been read from the handle; ValueError is kept because
            # the original loop relied on it as the end-of-file signal.
            break
        # Kept outside the try block so a missing ID surfaces as its own error
        # instead of being mistaken for end-of-file.
        authors_network[IDs_network[count]] = author_array
        count += 1
del IDs_network  # only needed to key the arrays; free the memory

In [5]:
# dict_cite: article -> articles citing it (iterated as arts_citant below).
with open(path+'dict_cite.p','rb') as cite_file:
    dict_cite = pickle.load(cite_file)
# dict_citant: citing article -> articles it references (iterated as arts_cite below).
with open(path+'dict_citant.p','rb') as citant_file:
    dict_citant = pickle.load(citant_file)

In [6]:
# dict_cluster_art: article -> collection of author IDs on that article.
with open(path+'dict_cluster_art.p','rb') as cluster_art_file:
    dict_cluster_art = pickle.load(cluster_art_file)
# dict_cluster_ID: author ID -> articles attributed to that author.
with open(path+'dict_cluster_ID.p','rb') as cluster_id_file:
    dict_cluster_ID = pickle.load(cluster_id_file)

In [7]:
# authors_disc_idx: discipline -> {author ID -> row index}; the number of
# authors in each discipline sizes the per-discipline matrices built later.
with open(path+'authors_disc.p','rb') as disc_file:
    authors_disc_idx = pickle.load(disc_file)
list_disciplines = list(authors_disc_idx)
nb_disciplines = len(list_disciplines)

In [8]:
# Analysis window: years min_year .. max_year-1 (the loops test year < max_year).
min_year, max_year = 1980, 2019
nb_years = max_year - min_year  # number of yearly bins

In [9]:
# Author metadata from the tab-separated authors_info file:
# key = int(column 0), value = [column 1, int(column 2)].
# Downstream code reads value[0] as the discipline and subtracts value[1]
# from a year to get the author's "age".
with open(path+'authors_info.csv','r') as f:
    reader = csv.reader(f,delimiter='\t')
    next(reader)  # discard the header row
    authors_info = {int(row[0]): [row[1], int(row[2])] for row in reader}

In [10]:
# Per-year average number of references made by a citing article.
citants = list(dict_citant)
nb_citants = len(citants)
idx = np.arange(nb_citants)  # identity ordering; rebound by a later cell
citant_per_year = np.zeros(nb_years)
ref_per_year = np.zeros(nb_years)
for art_citant in tqdm(citants):
    year_citant = articles[art_citant]
    # Ignore citing articles published outside [min_year, max_year).
    if year_citant < min_year or year_citant >= max_year:
        continue
    rel_year_citant = year_citant - min_year
    citant_per_year[rel_year_citant] += 1
    ref_per_year[rel_year_citant] += len(dict_citant[art_citant])
# Element-wise ratio; a year with no citing article yields nan.
cit_per_art_year = ref_per_year / citant_per_year

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


A Jupyter Widget




In [11]:
# Weight per citing year: inverse of that year's references-per-article rate
# relative to the maximum rate, so the year with the highest rate gets weight 1.
relative_rate = cit_per_art_year/max(cit_per_art_year)
weight_citant = dict(zip(range(min_year,max_year), 1/relative_rate))

In [12]:
# Authors to process, plus a random visiting order over them.
authors = list(dict_cluster_ID)
nb_authors = len(authors)
total_iterations = nb_authors
# idx is a permutation of author positions; the shuffle is unseeded, so the
# visiting order differs between runs.
idx = np.arange(nb_authors)
np.random.shuffle(idx)

### Citations

In [13]:
# Accumulate, per discipline, the weighted citations each author receives
# from their network, binned by the author's "age" (citing year - info[1]).
#
# A citation of art_cite by art_citant is counted for author_ID when:
#   * both articles fall inside [min_year, max_year),
#   * the two articles share no author (not a direct self-citation), and
#   * at least one citing author belongs to the author's network as of the
#     citing year.
# Each counted citation adds weight_citant[citing year] to
# citations[discipline][author row, author_age].
#
# network is indexed as a 2-row array: row 0 holds the members and row 1 the
# year attached to each member (presumably co-author IDs and the year of
# first collaboration -- TODO confirm against the code that wrote the file).
count = 0
citations = {d:sparse.lil_matrix((len(authors_disc_idx[d]),nb_years)) for d in list_disciplines}
start_time = time.time()
# The context manager guarantees the progress log is closed, even on error.
with open(path+'progress.txt','w') as f:
    for i in range(nb_authors):
        count += 1
        author_ID = authors[idx[i]]
        arts_author = dict_cluster_ID[author_ID]
        if author_ID in authors_network:
            network = authors_network[author_ID]
            years_net = np.unique(network[1,:])  # sorted distinct years
            # Cumulative snapshots: members whose year is <= each distinct year.
            dict_network = {}
            for year in years_net:
                dict_network[year] = set(network[0,np.where(network[1,:]<=year)[0]])

            for art_cite in arts_author:
                year_cite = articles[art_cite]
                if year_cite < min_year or year_cite >= max_year:
                    continue
                if art_cite not in dict_cite:
                    continue  # never cited
                authors_cite = set(dict_cluster_art[art_cite])
                for art_citant in dict_cite[art_cite]:
                    year_citant = articles[art_citant]
                    if year_citant < min_year or year_citant >= max_year:
                        continue
                    weight = weight_citant[year_citant]
                    if art_citant not in dict_cluster_art:
                        continue
                    authors_citant = set(dict_cluster_art[art_citant])
                    if authors_cite.intersection(authors_citant):
                        continue  # shared author: direct self-citation, skip
                    # Most recent network snapshot not later than the citing year.
                    id_years = years_net[years_net<=year_citant]
                    if len(id_years) == 0:
                        continue  # no network yet at that time
                    network_members = dict_network[np.max(id_years)]
                    if not authors_citant.intersection(network_members):
                        continue  # no citing author is in the network
                    info = authors_info[author_ID]
                    author_age = year_citant - info[1]
                    # Reject negative ages as well: a negative column index
                    # would wrap around and be added to the last columns.
                    if 0 <= author_age < nb_years:
                        disc = info[0]
                        author_idx = authors_disc_idx[disc][author_ID]
                        citations[disc][author_idx,author_age] += weight

        # Progress line every 1000 iterations (count is a number of authors,
        # even though the message says "articles").
        if count % 1000 == 1 :
            elapsed_time = time.time() - start_time
            elapsed_time_h = np.round(elapsed_time/3600,2)
            time_per_art = elapsed_time/count
            time_left_h = np.round(time_per_art*(total_iterations-count)/3600,2)
            perc = np.round(count/total_iterations*100,3)
            f.write('Progress: {} articles, {}%, Time since start: {}, Time left: {}\n'.format(count,perc,elapsed_time_h,time_left_h))
            f.flush()


            
           

In [14]:
# Write each discipline's citation matrix to <path>/arrays/ as a CSR .npz file.
for disc in list_disciplines:
    out_file = '{}/arrays/citations_{}_{}_network_array_norm.npz'.format(path,'self',disc)
    csr_citations = sparse.csr_matrix(citations[disc])  # LIL -> CSR for saving
    with open(out_file,'wb') as f2:
        sparse.save_npz(f2,csr_citations)

### References

In [None]:
# Accumulate, per discipline, the weighted references each author makes to
# their network, binned by the author's "age" (citing year - info[1]).
#
# A reference from art_citant (an article of author_ID) to art_cite is
# counted when:
#   * the citing article falls inside [min_year, max_year),
#   * the two articles share no author (not a direct self-citation), and
#   * at least one cited author belongs to the author's network as of the
#     citing year.
# Each counted reference adds weight_citant[citing year] to
# references[discipline][author row, author_age].
#
# network is indexed as a 2-row array: row 0 holds the members and row 1 the
# year attached to each member (presumably co-author IDs and the year of
# first collaboration -- TODO confirm against the code that wrote the file).
count = 0
references = {d:sparse.lil_matrix((len(authors_disc_idx[d]),nb_years)) for d in list_disciplines}
start_time = time.time()
# The context manager guarantees the progress log is closed, even on error.
with open(path+'progress.txt','w') as f:
    for i in range(nb_authors):
        count += 1
        author_ID = authors[idx[i]]
        arts_author = dict_cluster_ID[author_ID]
        if author_ID in authors_network:
            network = authors_network[author_ID]
            years_net = np.unique(network[1,:])  # sorted distinct years
            # Cumulative snapshots: members whose year is <= each distinct year.
            dict_network = {}
            for year in years_net:
                dict_network[year] = set(network[0,np.where(network[1,:]<=year)[0]])

            for art_citant in arts_author:
                year_citant = articles[art_citant]
                if year_citant < min_year or year_citant >= max_year:
                    continue
                weight = weight_citant[year_citant]
                if art_citant not in dict_citant:
                    continue  # article makes no references
                arts_cite = dict_citant[art_citant]
                authors_citant = set(dict_cluster_art[art_citant])
                # The snapshot depends only on the citing year, so it is
                # resolved once per citing article instead of once per reference.
                id_years = years_net[years_net<=year_citant]
                if len(id_years) == 0:
                    continue  # no network yet at that time
                network_members = dict_network[np.max(id_years)]
                for art_cite in arts_cite:
                    if art_cite not in dict_cluster_art:
                        continue
                    authors_cite = set(dict_cluster_art[art_cite])
                    if authors_cite.intersection(authors_citant):
                        continue  # shared author: direct self-citation, skip
                    if not authors_cite.intersection(network_members):
                        continue  # no cited author is in the network
                    info = authors_info[author_ID]
                    author_age = year_citant - info[1]
                    # Reject negative ages as well: a negative column index
                    # would wrap around and be added to the last columns.
                    if 0 <= author_age < nb_years:
                        disc = info[0]
                        author_idx = authors_disc_idx[disc][author_ID]
                        references[disc][author_idx,author_age] += weight

        # Progress line every 1000 iterations (count is a number of authors,
        # even though the message says "articles").
        if count % 1000 == 1 :
            elapsed_time = time.time() - start_time
            elapsed_time_h = np.round(elapsed_time/3600,2)
            time_per_art = elapsed_time/count
            time_left_h = np.round(time_per_art*(total_iterations-count)/3600,2)
            perc = np.round(count/total_iterations*100,3)
            f.write('Progress: {} articles, {}%, Time since start: {}, Time left: {}\n'.format(count,perc,elapsed_time_h,time_left_h))
            f.flush()


In [None]:
# Write each discipline's reference matrix to <path>/arrays/ as a CSR .npz file.
for disc in list_disciplines:
    out_file = '{}/arrays/references_{}_{}_network_array_norm.npz'.format(path,'self',disc)
    csr_references = sparse.csr_matrix(references[disc])  # LIL -> CSR for saving
    with open(out_file,'wb') as f2:
        sparse.save_npz(f2,csr_references)

### Find the average number of authors per article, by year

In [71]:
# Per-year totals over citing articles that have an author list:
# how many such articles there are, and how many authors they have in all.
citants = list(dict_citant)
nb_citants = len(citants)
# NOTE: this rebinds idx, replacing the shuffled author ordering set up earlier.
idx = np.arange(nb_citants)
citant_per_year = np.zeros(nb_years)
aut_per_year = np.zeros(nb_years)
for art_citant in tqdm(citants):
    year_citant = articles[art_citant]
    # Ignore citing articles published outside [min_year, max_year).
    if year_citant < min_year or year_citant >= max_year:
        continue
    rel_year_citant = year_citant - min_year
    if art_citant not in dict_cluster_art:
        continue  # no author list for this article
    citant_per_year[rel_year_citant] += 1
    aut_per_year[rel_year_citant] += len(dict_cluster_art[art_citant])


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


A Jupyter Widget




In [75]:
# Average number of authors per article for each year (element-wise ratio).
aut_per_art_year = aut_per_year / citant_per_year

In [76]:
# Display the per-year ratio aut_per_year / citant_per_year, one value per
# year bin starting at min_year.
aut_per_art_year

array([2.58639224, 2.63546928, 2.6720403 , 2.7174379 , 2.76292371,
       2.81186071, 2.87538792, 2.939422  , 3.00449287, 3.06479835,
       3.13572934, 3.16195584, 3.30481557, 3.39004444, 3.44409977,
       3.54330382, 3.58947494, 3.66406739, 3.69684414, 3.74507228,
       3.80855104, 3.86737572, 3.91751835, 3.98328911, 4.11292973,
       4.18540465, 4.23553708, 4.29401113, 4.32872316, 4.39928687,
       4.56883694, 4.81484474, 5.09681504, 5.08702321, 5.15607434,
       5.30105618, 5.44910048, 5.52269407, 5.60170041])

In [77]:
# Per year: number of citing articles, and how many of them have an entry in
# dict_cluster_art.
citant_per_year = np.zeros(nb_years)
has_clus_per_year = np.zeros(nb_years)
for art_citant in tqdm(citants):
    year_citant = articles[art_citant]
    # Ignore citing articles published outside [min_year, max_year).
    if year_citant < min_year or year_citant >= max_year:
        continue
    rel_year_citant = year_citant - min_year
    citant_per_year[rel_year_citant] += 1
    has_clus_per_year[rel_year_citant] += art_citant in dict_cluster_art

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


A Jupyter Widget




In [78]:
# Share of citing articles per year that have an entry in dict_cluster_art.
# Despite the "perc" name this is a fraction in [0, 1], not a percentage.
perc_clus_year = has_clus_per_year / citant_per_year
perc_clus_year

array([0.99710706, 0.9975832 , 0.9980145 , 0.99787865, 0.99814447,
       0.99774353, 0.99812773, 0.99670111, 0.99830335, 0.99846729,
       0.99850549, 0.99803272, 0.99790516, 0.99891002, 0.99899947,
       0.99912394, 0.99900148, 0.9988984 , 0.99858032, 0.99857196,
       0.99821485, 0.99791223, 0.99862986, 0.99846309, 0.99801614,
       0.99825666, 0.99648094, 0.9291154 , 0.99687118, 0.932903  ,
       0.99343935, 0.99339224, 0.97513723, 0.99734763, 0.95275385,
       0.99451635, 0.86741259, 0.98746563, 0.16694203])