In [20]:
from gensim.corpora import MmCorpus, Dictionary
from gensim import similarities
import numpy as np
from tables import *
import time
import multiprocessing as mp
import csv
from tqdm import tqdm_notebook as tqdm


In [2]:
class SimPairs(IsDescription):
    # Row layout of the output similarity table: one record per
    # (citing article, cited article) pair written by the loops below.
    Citant = UInt32Col()  # ID of the citing article (unsigned 32-bit)
    Cite = UInt32Col()    # ID of the cited article (unsigned 32-bit)
    Sim = Float16Col()    # similarity score, stored as a 16-bit float

In [3]:
# Input / output locations used by the rest of the notebook.
# TF-IDF corpus, opened with gensim's MmCorpus.
tfidf_path = "/mnt/disks/sdb/data/pub_full_tfidf.mm"
# Text file with one integer article ID per line; the line position is used
# as the document index into the TF-IDF corpus.
ids_path = "/mnt/disks/sdb/data/abs_cits.txt"
# HDF5 file holding the /pub_exp/DictCits table (fields 'citant' and 'cites').
cits_path = "/mnt/disks/sdb/data/DictCits.h5"
# Directory prefix (note the trailing slash) for similarity_refs_only.h5 and
# progress.txt.
out_path = "/mnt/disks/sdc/data/"
# Saved gensim Dictionary; only its length is used (as num_features).
dict_path = "/mnt/disks/sdb/data/dct.p"
# NOTE(review): info_path is not referenced by any cell visible here.
info_path = "/mnt/disks/sdb/data/pub_exp_info.txt"

In [4]:
# Count the rows of the citation table once, up front. The count sizes the
# index array that is shuffled later and is the denominator of the
# progress / ETA figures (nb_citants_incl).
# The `with` block guarantees the HDF5 handle is released even when the node
# lookup fails (for example a wrong group or table name).
with open_file(cits_path, mode="r", title="DictCits") as h5file_cits:
    nb_l_cits = len(h5file_cits.root.pub_exp.DictCits)
nb_citants_incl = nb_l_cits

In [5]:
# Only the vocabulary size is needed (it is passed as `num_features` when the
# similarity index is built), so the loaded dictionary is never kept around.
num_features = len(Dictionary.load(dict_path))


In [6]:
# Open the TF-IDF corpus; later cells fetch single documents from it by
# integer position (mm_tfidf[k]).
mm_tfidf = MmCorpus(tfidf_path)

In [7]:
# Read the article IDs (one integer per line) into a NumPy array. The position
# of an ID in this array is what the next cell turns into a lookup table.
with open(ids_path, 'r') as f:
    abs_cits = np.array([int(line.strip()) for line in f])


In [8]:
# Reverse lookup: article ID -> position of that ID in the list read above.
# Later cells use the position to index mm_tfidf. The array itself is dropped
# once the mapping exists.
abs_dict = dict(zip(abs_cits, range(len(abs_cits))))
del abs_cits


In [9]:
# Integers 1898..2019 inclusive (the upper bound of arange is exclusive);
# presumably the range of publication years - TODO confirm.
# NOTE(review): `years` is not referenced by any other cell visible here.
years = np.arange(1898,2020)

In [11]:
# Create the output file with an empty /citations/sim table laid out as
# SimPairs. mode="w" overwrites any existing file of the same name, so
# re-running this cell discards previously stored similarities.
# The `with` block closes the file even if creating the group or table fails.
with open_file(out_path + 'similarity_refs_only.h5', mode="w", title="similarity") as h5file_sim:
    group = h5file_sim.create_group("/", "citations")
    h5file_sim.create_table(group, "sim", SimPairs)

In [12]:
def arr_from_bytes(arr, add):
    """Decode a raw byte string into an array of unsigned 32-bit integers.

    Parameters
    ----------
    arr : bytes
        Buffer holding packed ``uint32`` values in native byte order.
    add : int
        Number of zero bytes appended to ``arr`` before decoding. Callers
        retry with increasing ``add`` when the buffer length is not a
        multiple of 4 (presumably because trailing zero bytes were stripped
        when the value was stored - TODO confirm).

    Returns
    -------
    numpy.ndarray
        A new, writable 1-D ``uint32`` array.

    Raises
    ------
    ValueError
        If ``len(arr) + add`` is not a multiple of 4.
    """
    # The binary mode of np.fromstring is deprecated; np.frombuffer is the
    # documented replacement. frombuffer returns a read-only view of the
    # bytes object, so copy to hand back an independent, writable array as
    # fromstring did.
    return np.frombuffer(arr + bytes(add), dtype=np.uint32).copy()

In [17]:
# Set-up for the single-process run. Every handle opened here is left open on
# purpose: the processing cell below uses them and closes them when it is done.

# Visit the citation rows in random order (no seed is set, so the order
# differs between runs).
idx_citant = np.arange(nb_l_cits)
np.random.shuffle(idx_citant)
# Citation table, read-only.
h5file_cits = open_file(cits_path, mode="r", title="DictCits")
table_cits = h5file_cits.root.pub_exp.DictCits
# Progress log; mode 'w' truncates any previous log.
f = open(out_path+'progress.txt','w')
# Output table, opened in append mode so existing rows are kept.
h5file_sim = open_file(out_path + 'similarity_refs_only.h5', mode="a", title="similarity")
table_sim = h5file_sim.root.citations.sim
# Row object used to fill in and append one record at a time.
row = table_sim.row


In [25]:
# Number of similarity rows currently stored in the output table.
len(table_sim)

31708

In [24]:
# Single-process pass over every citation row, in the shuffled order of
# idx_citant. For each citing article that has a TF-IDF vector, compute the
# similarity between it and each of its references that also has a vector,
# and append one (Citant, Cite, Sim) row per pair to the output table.
#
# The try/finally makes sure the three handles opened by the set-up cell are
# closed even when the run is interrupted. After an interruption the set-up
# cell must be re-run before this one, because the handles are closed here.
counter = 0
start_time = time.time()
try:
    for i in tqdm(idx_citant):
        line_cits = table_cits[i]
        citant = line_cits['citant']

        # The packed uint32 reference list can be a few bytes short of a
        # multiple of 4; retry with 0..5 zero bytes of padding.
        cites = None
        for add in range(6):
            try:
                cites = arr_from_bytes(line_cits['cites'], add)
                break
            except ValueError:
                continue

        # Skip rows that cannot be decoded or whose citing article has no
        # TF-IDF vector; these are not counted as processed.
        if cites is None or citant not in abs_dict:
            continue

        # Keep only the references that have a TF-IDF vector.
        comps = [cite for cite in cites if cite in abs_dict]
        if comps:
            tfidf_base = mm_tfidf[abs_dict[citant]]
            tfidf_comps = [mm_tfidf[abs_dict[art_id]] for art_id in comps]
            # One-document index: querying it with the references yields one
            # result row per reference, whose single entry is the similarity.
            index = similarities.SparseMatrixSimilarity([tfidf_base], num_features=num_features)
            sim = index[tfidf_comps]
            for art_id, sim_row in zip(comps, sim):
                row['Citant'] = citant
                row['Cite'] = art_id
                row['Sim'] = sim_row[0]
                row.append()

        counter += 1
        table_sim.flush()
        if counter % 100 == 0:
            # Progress / ETA line, written every 100 processed articles.
            elapsed_time = time.time() - start_time
            elapsed_time_h = np.round(elapsed_time/3600, 2)
            time_per_art = elapsed_time/counter
            time_left_h = np.round(time_per_art*(nb_citants_incl-counter)/3600, 2)
            perc = np.round(counter/nb_citants_incl*100, 3)
            f.write('Progress: {} manuscripts, {}%, Time since start: {}, Time left: {}\n'.format(counter,perc,elapsed_time_h,time_left_h))
            f.flush()
finally:
    h5file_cits.close()
    f.close()
    h5file_sim.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


A Jupyter Widget

KeyboardInterrupt: 

In [12]:
def compute_sim(inqueue, output):
    """Worker: turn citation-row indices into lists of similarity triples.

    Reads row indices from ``inqueue`` until the module-level ``sentinel``
    value is received. For each row, the 'cites' field is decoded into
    reference IDs; if the citing article and at least one reference have a
    TF-IDF vector, the similarity of each such reference to the citing
    article is computed and the list ``[[citant, cite, sim], ...]`` is put on
    ``output``.

    Parameters
    ----------
    inqueue : queue-like
        Source of row indices into the DictCits table, terminated by
        ``sentinel``.
    output : queue-like
        Receives one non-empty list of ``[citant, cite, sim]`` per processed
        citing article.

    Notes
    -----
    The worker opens its own read-only handle on the citation file and closes
    it when it finishes or fails. Empty result lists are not enqueued: they
    carry nothing to write, and a consumer that tests the truthiness of a
    batch would mistake one for the end-of-stream marker.
    """
    h5file_cits = open_file(cits_path, mode="r", title="DictCits")
    try:
        table_cits = h5file_cits.root.pub_exp.DictCits
        for i in iter(inqueue.get, sentinel):
            line_cits = table_cits[i]
            citant = line_cits['citant']

            # The packed uint32 reference list can be a few bytes short of a
            # multiple of 4; retry with 0..5 zero bytes of padding.
            cites = None
            for add in range(6):
                try:
                    cites = arr_from_bytes(line_cits['cites'], add)
                    break
                except ValueError:
                    continue
            if cites is None or citant not in abs_dict:
                continue

            # Keep only the references that have a TF-IDF vector.
            comps = [cite for cite in cites if cite in abs_dict]
            if not comps:
                continue

            tfidf_base = mm_tfidf[abs_dict[citant]]
            tfidf_comps = [mm_tfidf[abs_dict[art_id]] for art_id in comps]
            # One-document index: querying it with the references yields one
            # result row per reference, whose single entry is the similarity.
            index = similarities.SparseMatrixSimilarity([tfidf_base], num_features=num_features)
            sim = index[tfidf_comps]
            pairs_sim = [[citant, art_id, sim_row[0]] for art_id, sim_row in zip(comps, sim)]
            output.put(pairs_sim)
    finally:
        h5file_cits.close()

In [13]:
def handle_output(output):
    """Writer: drain ``output`` and append the batches to the similarity table.

    Each item taken from ``output`` is a list of ``[citant, cite, sim]``
    triples for one citing article; every triple becomes one row of
    /citations/sim in similarity_refs_only.h5. A progress line is written to
    progress.txt every 1000 batches.

    Parameters
    ----------
    output : queue-like
        Source of batches. ``None`` is the end-of-stream marker.

    Notes
    -----
    Only ``None`` stops the writer. An empty batch is counted and otherwise
    ignored; treating it as a stop signal would shut the writer down while
    producers are still running and leave their results unconsumed.
    Both files are closed when the function returns or raises.
    """
    start_time = time.time()
    counter = 0
    with open(out_path+'progress.txt','w') as f, \
            open_file(out_path + 'similarity_refs_only.h5', mode="a", title="similarity") as h5file_sim:
        table_sim = h5file_sim.root.citations.sim
        sim = table_sim.row
        while True:
            args = output.get()
            if args is None:
                break
            for line in args:
                sim['Citant'] = line[0]
                sim['Cite'] = line[1]
                sim['Sim'] = line[2]
                sim.append()
            counter += 1
            # Flush after every batch so rows reach the file promptly.
            table_sim.flush()
            if counter % 1000 == 0:
                elapsed_time = time.time() - start_time
                elapsed_time_h = np.round(elapsed_time/3600, 2)
                time_per_art = elapsed_time/counter
                time_left_h = np.round(time_per_art*(nb_citants_incl-counter)/3600, 2)
                perc = np.round(counter/nb_citants_incl*100, 3)
                f.write('Progress: {} manuscripts, {}%, Time since start: {}, Time left: {}\n'.format(counter,perc,elapsed_time_h,time_left_h))
                f.flush()

In [14]:
# Parallel run: one writer process (handle_output) plus a pool of workers
# (compute_sim) fed through `inqueue`; results flow back through `output`.

# Leave ten cores free, but always start at least one worker: cpu_count() - 10
# alone is zero or negative on machines with ten or fewer cores, in which case
# no row would ever be processed.
num_processes = max(1, mp.cpu_count() - 10)
# End-of-work marker read by compute_sim; must be defined before the workers
# are started.
sentinel = None
manager = mp.Manager()  # NOTE(review): not used by any cell visible here
output = mp.Queue()
inqueue = mp.Queue()
jobs = []

# Start the writer first so results are consumed as soon as they appear.
proc = mp.Process(target=handle_output, args=(output, ))
proc.start()

for worker_no in range(num_processes):
    p = mp.Process(target=compute_sim, args=(inqueue, output))
    jobs.append(p)
    p.start()

# Queue every citation row index, in random order.
idx_citant = np.arange(nb_l_cits)
np.random.shuffle(idx_citant)
for i in idx_citant:
    inqueue.put(i)

# One sentinel per worker tells each of them to stop.
for worker_no in range(num_processes):
    inqueue.put(sentinel)
for p in jobs:
    p.join()

# All workers are done: tell the writer to finish and wait for it.
output.put(None)
proc.join()

KeyboardInterrupt: 

In [16]:
# Reopen the output file to inspect the stored rows.
# NOTE(review): mode="a" allows writes; the visible cells below only read, so
# mode="r" may be enough - confirm no later cell appends through this handle.
# The handle is not closed in the cells visible here.
h5file_sim = open_file(out_path + 'similarity_refs_only.h5', mode="a", title="similarity")
table_sim = h5file_sim.root.citations.sim

In [18]:
# First stored record, shown as a (Citant, Cite, Sim) tuple.
table_sim[0]

(11060947, 9549685,  0.15246582)