In [1]:
from gensim.corpora import MmCorpus, Dictionary
from gensim import similarities
import numpy as np
from tables import *
import time
import multiprocessing as mp
import csv

In [2]:
class SimPairs(IsDescription):
    """PyTables row description for one article pair and its similarity."""
    # Identifiers of the two articles of the pair (unsigned 32-bit columns).
    Id_Art1 = UInt32Col()
    Id_Art2 = UInt32Col()
    # Similarity value of the pair, stored in a 16-bit float column.
    Sim = Float16Col()

In [3]:
# --- Inputs (all read from the sdb data disk) ---
# TF-IDF corpus in Matrix Market format and the dictionary it was built with.
tfidf_path = "/mnt/disks/sdb/data/pub_full_tfidf.mm"
dict_path = "/mnt/disks/sdb/data/dct.p"
# Text file of article ids, one per line.
ids_path = "/mnt/disks/sdb/data/abs_cits.txt"
# HDF5 file holding the citation table.
cits_path = "/mnt/disks/sdb/data/DictCits.h5"
# Tab-separated publication info file.
info_path = "/mnt/disks/sdb/data/pub_exp_info.txt"

# --- Output ---
# Directory prefix for generated files; file names are appended directly,
# so the trailing slash is required.
out_path = "/mnt/disks/sdc/data/"

In [4]:
# Open the citation table read-only. The handle is left open on purpose:
# a later cell closes `h5file_cits` and deletes `table_cits`.
h5file_cits = open_file(cits_path, mode="r", title="DictCits")
table_cits = h5file_cits.root.pub_exp.DictCits
nb_l_cits = len(table_cits)

# Years of the citing publication whose rows are selected into `idx_incl`.
years_incl = set([2000,2001])

# Not needed by the vectorised selection below; kept so that the `tqdm`
# name remains defined for any other cell that relies on it.
from tqdm import tqdm_notebook as tqdm

# Read the whole `year_citant` column in a single call instead of fetching
# the table row by row, then keep the row numbers whose year is selected.
# `np.isin` needs a sequence, not a set, hence the `sorted(...)`.
years_citant = table_cits.col('year_citant')
idx_incl = np.flatnonzero(np.isin(years_citant, sorted(years_incl)))

In [5]:
# Denominator used by the progress report in `handle_output`. It is the full
# row count of the citation table because the driver cell queues every row
# (range(nb_l_cits)), not only the rows listed in `idx_incl`.
nb_citants_incl = nb_l_cits

In [6]:
# Done with the citation table in this process: close the HDF5 handle first,
# then drop the table reference.
h5file_cits.close()
del table_cits

# Original note: this list may be usable for the fastest year-by-id lookup.
# (A set of distinct years was the commented-out alternative.)

# Collect the third field (index 2) of every row after the first one of the
# tab-separated info file, converted to int.
years = []
with open(info_path, newline='') as f:
    reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    next(reader)  # skip the first row (presumably a header -- confirm)
    years.extend(int(fields[2]) for fields in reader)

In [7]:
# Only the vocabulary size is needed afterwards, so the loaded dictionary is
# measured and immediately discarded without binding it to a name.
num_features = len(Dictionary.load(dict_path))

In [8]:
# TF-IDF corpus in Matrix Market format; `compute_sim` indexes it by document
# position (mm_tfidf[abs_dict[article_id]]).
mm_tfidf = MmCorpus(tfidf_path)

In [9]:
# Load the id file as a list of raw text lines (line endings included);
# the next cell converts them to integers.
with open(ids_path, 'r') as ids_file:
    abs_cits = list(ids_file)

In [10]:
# Convert the raw text lines to an integer array: surrounding whitespace
# (including the newline) is stripped from each line before int() parsing.
# Position in this array is later used as the document index in the corpus.
abs_cits = np.array([int(x.strip()) for x in abs_cits])

In [11]:
# Map each article id to its position in the id list; if an id occurs more
# than once, the last position wins. The array is dropped afterwards since
# only the lookup table is needed.
abs_dict = dict(zip(abs_cits, range(len(abs_cits))))
del abs_cits

In [12]:
# Years for which output groups are created: 1898 through 2019 (the upper
# bound of np.arange is exclusive). This rebinds `years`, replacing the list
# read from the info file in an earlier cell.
years = np.arange(1898,2020)

In [17]:
# Create the output file from scratch (mode="w" overwrites an existing one):
# one group per year, each holding ten empty SimPairs tables named "0".."9".
# `compute_sim` selects the table by the first character of the citing
# article id. The context manager closes the file even if a create call
# fails, so the cell can be re-run without a stale open handle.
with open_file(out_path + 'similarity.h5', mode="w", title="similarity") as h5file_sim:
    for year in years:
        group = h5file_sim.create_group("/", str(year))
        for i in range(10):
            h5file_sim.create_table(group, str(i), SimPairs)









In [18]:
def arr_from_bytes(arr,add):
    """Decode a byte string into an array of unsigned 32-bit integers.

    Parameters
    ----------
    arr : bytes-like
        Raw bytes to decode.
    add : int
        Number of zero bytes appended to ``arr`` before decoding.

    Returns
    -------
    numpy.ndarray
        New, writable 1-D ``uint32`` array (native byte order).

    Raises
    ------
    ValueError
        If the padded length is not a multiple of 4 bytes, or ``add`` is
        negative.
    """
    padded = arr + bytes(add)  # bytes(n) is n zero bytes
    if len(padded) == 0:
        # Nothing to decode; return an empty array explicitly rather than
        # handing an empty buffer to frombuffer.
        return np.empty(0, dtype=np.uint32)
    # frombuffer replaces the deprecated binary mode of np.fromstring. It
    # returns a read-only view, so copy to keep the result independent and
    # writable like before.
    return np.frombuffer(padded, dtype=np.uint32).copy()

In [19]:
def compute_sim(inqueue, output):
    """Worker loop: compute pairwise similarities for queued table rows.

    Row numbers of the citation table are read from ``inqueue`` until the
    module-level ``sentinel`` is received. For each row, the citing article
    and its cited articles that have an entry in ``abs_dict`` are compared
    with each other, and one message is put on ``output``::

        [pairs, year_citant, first_character_of_citant_id]

    where ``pairs`` is a list of ``[id_a, id_b, similarity]`` covering every
    unordered pair once. ``pairs`` is empty when fewer than two documents
    are available.

    Parameters
    ----------
    inqueue : queue-like
        Source of row numbers, terminated by ``sentinel``.
    output : queue-like
        Destination for the result messages.
    """
    itemsize = np.dtype(np.uint32).itemsize
    h5file_cits = open_file(cits_path, mode="r", title="DictCits")
    try:
        table_cits = h5file_cits.root.pub_exp.DictCits
        for i in iter(inqueue.get, sentinel):
            line_cits = table_cits[i]
            citant = line_cits['citant']
            raw_cites = line_cits['cites']
            # The stored bytes may be shorter than a whole number of uint32
            # values (presumably trailing zero bytes are dropped when the
            # field is read -- confirm). Pad with exactly the number of zero
            # bytes needed instead of retrying with growing padding.
            pad = (-len(raw_cites)) % itemsize
            try:
                cites = arr_from_bytes(raw_cites, pad)
            except ValueError:
                # Row cannot be decoded: skip it without emitting a message.
                continue

            # Keep only articles that have a TF-IDF document; the citing
            # article, when available, goes last.
            comps = [cite for cite in cites if cite in abs_dict]
            if citant in abs_dict:
                comps.append(citant)

            pairs_sim = []
            if len(comps) > 1:
                # With 0 or 1 documents there is no pair to report, so the
                # index is only built when it can produce something.
                tfidf = [mm_tfidf[abs_dict[art_id]] for art_id in comps]
                index = similarities.SparseMatrixSimilarity(tfidf,num_features=num_features) #check if num ft has impact on time
                sim = index[tfidf]
                # Strict lower triangle: each unordered pair exactly once,
                # self-similarities excluded.
                rows, cols = np.tril_indices(sim.shape[0], -1)
                pairs_sim = [
                    [comps[r], comps[c], value]
                    for r, c, value in zip(rows.tolist(), cols.tolist(), sim[rows, cols])
                ]
            output.put([pairs_sim,line_cits['year_citant'],str(citant)[0]])
    finally:
        # Release the HDF5 handle even if a row raises.
        h5file_cits.close()

In [20]:
def handle_output(output):
    """Writer loop: store similarity pairs and log progress.

    Messages of the form ``[pairs, year, table_name]`` are read from
    ``output`` until a falsy message (the driver sends ``None``) arrives.
    Each pair is appended to the table ``/<year>/<table_name>`` of
    ``similarity.h5`` with the smaller id in ``Id_Art1`` and the larger in
    ``Id_Art2``. After every 1000 messages a progress line is written to
    ``progress.txt``. Both files live under ``out_path`` and are closed even
    if writing fails.

    Parameters
    ----------
    output : queue-like
        Source of result messages produced by ``compute_sim``.
    """
    start_time = time.time()
    counter = 0
    with open(out_path+'progress.txt','w') as f, \
            open_file(out_path + 'similarity.h5', mode="a", title="similarity") as h5file_sim:
        while True:
            args = output.get()
            if not args:
                break
            table_sim = h5file_sim.get_node('/{}/{}'.format(args[1],args[2]))
            sim = table_sim.row
            for line in args[0]:
                # Store each pair in ascending id order.
                first_id, second_id = line[0], line[1]
                if second_id < first_id:
                    first_id, second_id = second_id, first_id
                sim['Id_Art1'] = first_id
                sim['Id_Art2'] = second_id
                sim['Sim'] = line[2]
                sim.append()
            counter += 1
            table_sim.flush()
            if counter % 1000 == 0 :
                elapsed_time = time.time() - start_time
                elapsed_time_h = np.round(elapsed_time/3600,2)
                time_per_art = elapsed_time/counter
                # Linear extrapolation over the messages still expected.
                time_left_h = np.round(time_per_art*(nb_citants_incl-counter)/3600,2)
                perc = np.round(counter/nb_citants_incl*100,3)
                f.write('Progress: {} manuscripts, {}%, Time since start: {}, Time left: {}\n'.format(counter,perc,elapsed_time_h,time_left_h))
                f.flush()

In [None]:
# Leave two cores free, but always start at least one worker: on machines
# with two cores or fewer, cpu_count()-2 would start no worker at all and
# the run would finish without computing anything.
num_processes = max(1, mp.cpu_count()-2)
# Value that tells a worker to stop; `compute_sim` reads it as a global.
sentinel = None
# NOTE(review): `manager` is not used by any visible cell; kept in case
# other code relies on the name.
manager = mp.Manager()
output = mp.Queue()
inqueue = mp.Queue()
jobs = []

# Single writer process that drains `output` into the HDF5 file.
proc = mp.Process(target=handle_output, args=(output, ))
proc.start()

# Worker processes computing the similarities.
for i in range(num_processes):
    p = mp.Process(target=compute_sim, args=(inqueue, output))
    jobs.append(p)
    p.start()

# Queue every row of the citation table.
# (Restricting to the selected years would be: for i in idx_incl:)
for i in range(nb_l_cits):
    inqueue.put(i)
for i in range(num_processes):
    # Send one sentinel per worker to tell it to stop
    inqueue.put(sentinel)
for p in jobs:
    p.join()

# All workers are done: tell the writer to stop, then wait for it.
output.put(None)
proc.join()