In [1]:
import pickle
import csv
from tqdm import tqdm_notebook as tqdm
import numpy as np

In [2]:
# Root directory of the input files read by this notebook.
# NOTE(review): absolute, machine-specific path -- change it before running elsewhere.
path = '/home/User1/data/self_citations/'
# Output directory (a sub-directory of `path`); the CSV exports are opened under it.
out_path = path + 'model/'

### Get nb of articles

In [3]:
# Deserialize the author lookup table. Later cells index dict_cluster_ID by
# author ID and iterate over the value as a collection of article IDs.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(path + 'dict_cluster_ID.p', mode='rb') as handle:
    dict_cluster_ID = pickle.load(handle)

In [4]:
# Build the {column 0 -> column 2} lookup from the tab-separated article table,
# both parsed as integers. Later cells read the value as the article's year
# (presumably the publication year -- confirm against the file's header).
with open(path+'article.txt','r') as f:
    rows = csv.reader(f,delimiter='\t')
    next(rows)  # skip the header line
    articles = {int(row[0]): int(row[2]) for row in rows}

In [5]:
# Snapshot of the author keys of dict_cluster_ID (in the dict's iteration order)
# and how many there are.
authors = [*dict_cluster_ID]
nb_authors = len(authors)

In [6]:
max_age = 20
years = np.arange(max_age)
# Write one CSV row per author: Cluster_ID followed by the cumulative number of
# articles at each career age 0..max_age-1, where age 0 is the year of the
# author's earliest article (column "nb_art_k" holds age k-1).
with open(out_path+"nb_articles.csv",'w') as f:
    header = ["Cluster_ID"] + ["nb_art_"+str(x+1) for x in years]
    f.write(",".join(header) + '\n')
    # Iterate over `authors` itself rather than range(nb_authors), so a stale
    # `nb_authors` rebound by another cell cannot desynchronise the loop.
    for author_key in tqdm(authors):
        author = int(author_key)
        # Tally this author's articles per year.
        arts_per_year = {}
        for art in dict_cluster_ID[author]:
            year_art = articles[art]
            arts_per_year[year_art] = arts_per_year.get(year_art, 0) + 1
        # Bin the tallies by career age and accumulate. Compared with the
        # previous slice-filling logic this:
        #   * keeps gap years at the total reached so far (they used to receive
        #     the total that already included the *next* publication year);
        #   * no longer leaves trailing ages at 0 when the author also has
        #     articles at an age >= max_age (those are simply not counted);
        #   * writes an all-zero row for an author without articles instead of
        #     raising IndexError.
        # `dtype=int` replaces the `np.int` alias, which NumPy has removed.
        arts_per_age = np.zeros(max_age, dtype=int)
        if arts_per_year:
            min_year = min(arts_per_year)
            for year, nb_year in arts_per_year.items():
                age = year - min_year
                if age < max_age:
                    arts_per_age[age] += nb_year
        nb_arts = np.cumsum(arts_per_age)
        f.write(str(author) + ',' + ",".join(map(str,nb_arts)) + '\n')
    
    
    
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


A Jupyter Widget




### Get nb citations

In [18]:
from scipy import sparse

In [4]:
# Load the lookup tables used to map to author IDs. The unpickled object is
# keyed by discipline (its keys become `list_disciplines` in the next cell).
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(path + 'authors_disc.p', mode='rb') as handle:
    authors_disc_idx = pickle.load(handle)

In [10]:
# Swap keys and values of every per-discipline mapping, in place.
# NOTE(review): this cell is not idempotent -- running it a second time swaps
# the mappings back. Re-run the loading cell first if it must be re-executed.
list_disciplines = list(authors_disc_idx)
for disc in list_disciplines:
    forward = authors_disc_idx[disc]
    authors_disc_idx[disc] = {value: key for key, value in forward.items()}

# Peek at the first ten keys of the first discipline's swapped mapping.
temp = authors_disc_idx[list_disciplines[0]]
temp_keys = [*temp]
temp_keys[:10]

In [15]:
# Number of entries in list_disciplines; used as the loop bound when exporting citations.
nb_disciplines = len(list_disciplines)

In [16]:
# max_age: number of leading columns kept per author in the citation export.
max_age = 20
# 0-based indices 0..max_age-1; the CSV header labels them 1..max_age.
years = np.arange(max_age)

In [None]:
types_cit = ['self','co','others']
# Only index 2 ('others') is exported; the sweep over every citation type
# (tqdm(range(len(types_cit)))) is currently disabled.
for type_i in [2]:
    type_cit = types_cit[type_i]
    with open(out_path + 'nb_citations_' + type_cit + '.csv','w') as f:
        column_names = ["Cluster_ID"]
        column_names.extend("nb_cits_"+type_cit +'_'+str(age+1) for age in years)
        f.write(",".join(column_names) + '\n')
        for disc_pos in tqdm(range(nb_disciplines)):
            disc = list_disciplines[disc_pos]
            npz_path = '{}/arrays/citations_{}_{}_array_norm.npz'.format(path,type_cit,disc)
            arrays_cits = sparse.load_npz(npz_path).todense()
            # NOTE: this rebinds the notebook-level `nb_authors` to the row
            # count of the current discipline's array.
            nb_authors = arrays_cits.shape[0]
            # Running total along each row, rounded to two decimals.
            cum_cits = np.round(np.array(np.cumsum(arrays_cits,axis=1)),2)
            # Row position -> author ID for this discipline.
            id_lookup = authors_disc_idx[disc]
            for row in tqdm(range(nb_authors)):
                author = str(int(id_lookup[row]))
                values = ','.join(map(str,cum_cits[row,:max_age]))
                f.write(author + ',' + values + '\n')
           

### Get nb coauthors

In [3]:
# Load the co-author network: 'author_network.npy' holds several arrays saved
# back-to-back in one file, and the k-th array belongs to IDs_network[k].
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(path+'IDs_network.p','rb') as f:
    IDs_network = pickle.load(f)
authors_network = {}
count = 0
with open(path+'author_network.npy','rb') as f:
    while True:
        try:
            network_array = np.load(f)
        except (EOFError, ValueError):
            # End of the stream. Current NumPy raises EOFError once all data
            # has been read from the handle; older releases surfaced this as
            # ValueError, which is still accepted for backward compatibility.
            break
        authors_network[IDs_network[count]] = network_array
        count += 1
# The ID list is only needed to key the dict; free it.
del IDs_network

In [4]:
# Snapshot of the author keys of authors_network (in the dict's iteration
# order) and how many there are.
authors = [*authors_network]
nb_authors = len(authors)

In [8]:
max_age = 20
years = np.arange(max_age)
# Write one CSV row per author: Cluster_ID followed by the cumulative number of
# co-author entries at each career age 0..max_age-1, where age 0 is the
# earliest year found in the author's network array.
with open(out_path+"nb_coauthors.csv",'w') as f:
    # NOTE(review): the columns are labelled "nb_art_*" although they hold
    # co-author counts (looks copied from the article export). The label is
    # kept unchanged because downstream readers may depend on it -- confirm.
    header = ["Cluster_ID"] + ["nb_art_"+str(x+1) for x in years]
    f.write(",".join(header) + '\n')
    # Iterate over `authors` itself rather than range(nb_authors), so a stale
    # `nb_authors` rebound by another cell cannot desynchronise the loop.
    for author_key in tqdm(authors):
        author = int(author_key)
        # Row 1 of the stored array is read as one year per co-author entry.
        # `int` replaces the `np.int` alias, which NumPy has removed.
        co_years = authors_network[author][1,:].astype(int)
        # Bin the entries by career age and accumulate. Compared with the
        # previous slice-filling logic this:
        #   * keeps gap years at the total reached so far (they used to receive
        #     the total that already included the *next* active year);
        #   * no longer leaves trailing ages at 0 when some entries fall at an
        #     age >= max_age (those are simply not counted);
        #   * writes an all-zero row for an empty array instead of raising
        #     IndexError.
        co_per_age = np.zeros(max_age, dtype=int)
        if co_years.size:
            min_year = co_years.min()
            for year_co in co_years:
                age = year_co - min_year
                if age < max_age:
                    co_per_age[age] += 1
        cum_coauthors = np.cumsum(co_per_age)
        f.write(str(author) + ',' + ",".join(map(str,cum_coauthors)) + '\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


A Jupyter Widget




### Get country

In [3]:
# Build the {column 0 -> column 2} lookup from the tab-separated article table,
# both parsed as integers. The next cell reads the value as the year attached
# to an article (presumably the publication year -- confirm against the file).
with open(path+'article.txt','r') as f:
    rows = csv.reader(f,delimiter='\t')
    next(rows)  # skip the header line
    articles = {int(row[0]): int(row[2]) for row in rows}

In [4]:
# For every author (Cluster_ID, column 0) collect two parallel arrays:
#   [0] the ERegroupement value (column 6) of each of the author's rows,
#   [1] the year looked up in `articles` for that row's ID_Art (column 2).
# Rows whose Cluster_ID is empty are skipped.
country_authors = {}
with open(path+'clusters_locations.txt','r',encoding="latin-1") as f:
    rows = csv.reader(f,delimiter='\t')
    print(next(rows))  # show the header line
    for row in rows:
        if row[0] == '':
            continue
        cluster_id = int(row[0])
        region = row[6]
        art_year = articles[int(row[2])]
        previous = country_authors.get(cluster_id)
        if previous is None:
            country_authors[cluster_id] = [np.array([region]),np.array([art_year])]
        else:
            country_authors[cluster_id] = [np.append(previous[0],region),np.append(previous[1],art_year)]

['Cluster_ID', 'Full_Name', 'ID_Art', 'Institution', 'Ville', 'EPays', 'ERegroupement']


In [5]:
# Snapshot of the author keys of country_authors (in the dict's iteration
# order); the count is displayed as the cell output.
authors = [*country_authors]
nb_authors = len(authors)
nb_authors

22166707

In [69]:
max_age = 20
years = np.arange(max_age)
# Write one CSV row per author: Cluster_ID followed by, for each career age
# 0..max_age-1 (age 0 = the author's earliest year), the country that occurs
# most often among all of the author's rows up to that year.
# NOTE(review): fields are joined with ',' without quoting; a country value
# containing a comma would shift the columns -- confirm none does.
with open(out_path+"country_authors.csv",'w') as f:
    header = ["Cluster_ID"] + ["country_"+str(x+1) for x in years]
    f.write(",".join(header) + '\n')
    # Iterate over `authors` itself rather than range(nb_authors), so a stale
    # `nb_authors` rebound by another cell cannot desynchronise the loop.
    for author_key in tqdm(authors):
        author = int(author_key)
        countries_author = country_authors[author]
        countries = countries_author[0].astype(object)
        # Named `pub_years` so the notebook-level `years` (used for the header)
        # is no longer overwritten; `int` replaces the removed `np.int` alias.
        pub_years = countries_author[1].astype(int)
        unique_years = np.unique(pub_years)  # already sorted ascending
        min_year = unique_years[0]

        # Every cell starts as '' so ages after the author's last year stay
        # empty, and no cell can be written out as the string "None".
        cum_country = np.empty(max_age, dtype=object)
        cum_country[:] = ''
        prev_age = -1
        for year in unique_years:
            age = year - min_year
            # Most frequent country among the rows dated up to `year`; ties go
            # to the first value in sorted order.
            unique,pos = np.unique(countries[pub_years<=year],return_inverse=True)
            counts = np.bincount(pos)
            maxpos = np.argwhere(counts == np.amax(counts))
            country = unique[maxpos[0][0]]
            # Fill this age and any gap since the previous active age. The
            # slice is clipped to max_age, so a year at age >= max_age now
            # back-fills the remaining gap like any other gap instead of
            # leaving those cells unset.
            cum_country[prev_age+1:age+1] = country
            prev_age = age
            if age >= max_age - 1:
                break  # every exported age is filled; later years cannot matter
        f.write(str(author) + ',' + ",".join(map(str,cum_country)) + '\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


A Jupyter Widget




In [7]:
# Load the line_profiler IPython extension (it provides the %lprun magic).
%load_ext line_profiler

In [17]:
# Line-by-line profile of a call to line_profiler().
# NOTE(review): the only `def line_profiler():` in the visible cells is
# commented out, so this cell presumably fails on a fresh kernel -- confirm
# whether it should be removed or the function restored.
%lprun -f line_profiler line_profiler()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


A Jupyter Widget

### Years active

In [3]:
# Build {int(column 0): [column 1, int(column 2)]} from authors_info.csv,
# skipping the header line. The file is parsed as tab-separated despite its
# .csv extension.
with open(path+'authors_info.csv','r') as f:
    rows = csv.reader(f,delimiter='\t')
    next(rows)
    authors_info = {int(row[0]): [row[1],int(row[2])] for row in rows}

KeyboardInterrupt: 