In [1]:
import pandas as pd
import numpy as np
from common import *
from glob import glob
from os import rename, path
from gensim import corpora
from os.path import basename

import gensim
from gensim import corpora, models, similarities

DATA_FOLDER = '../data/'

In [2]:
%%time
# Load the previously built token dictionary and the bag-of-words corpus
# (Matrix Market format) from DATA_FOLDER.
dict_path = join(DATA_FOLDER, 'old.dict')
corpus_path = join(DATA_FOLDER, 'corpus.mm')

dictionary = corpora.Dictionary.load(dict_path)
corpus = corpora.MmCorpus(corpus_path)

CPU times: user 2.13 s, sys: 268 ms, total: 2.4 s
Wall time: 2.41 s


In [3]:
# Print the corpus summary (document, feature and non-zero entry counts).
print(corpus)

MmCorpus(1194664 documents, 3496816 features, 192260021 non-zero entries)


In [4]:
%%time
# Reuse the TF-IDF model cached on disk when there is one; otherwise fit it
# on the corpus and cache it so the next run can skip the fitting step.
fmodel = join(DATA_FOLDER, 'tfidf.model')
if path.exists(fmodel):
    tfidf = models.TfidfModel.load(fmodel)
else:
    tfidf = models.TfidfModel(corpus)
    tfidf.save(fmodel)

CPU times: user 532 ms, sys: 108 ms, total: 640 ms
Wall time: 657 ms


In [5]:
# %%time
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary), num_best=200)

In [6]:
%%time
# Similarity index over the TF-IDF-weighted corpus, cached on disk.
# `fname` serves both as the prefix handed to Similarity() and as the path
# the index object is saved to / loaded from.
# num_best=200 matches the deepest cutoff (top 200) used in the evaluation.
fname = join(DATA_FOLDER, 'sim_index/sim')
if exists(fname):
    index = similarities.Similarity.load(fname)
else:
    index = similarities.Similarity(
        fname,
        tfidf[corpus],
        num_features=len(dictionary),
        num_best=200,
        chunksize=4*256,
        shardsize=5*32768,
    )
    index.save(fname)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 342 µs


### Test

In [7]:
# Load the gold standard: one query per line, whitespace separated, where the
# first field is the query document id and the remaining fields are the ids
# of the documents that should be retrieved for it.
with open(join(DATA_FOLDER, 'gold.txt'), 'r') as f:
    gold_txt = f.read()
lines = gold_txt.splitlines()

gold = {}
for line in lines:
    fields = line.split()
    if not fields:
        # Blank / whitespace-only line: there is no id to index on, and
        # fields[0] would raise IndexError.
        continue
    gold[fields[0]] = fields[1:]

In [8]:
# DATA_FOLDER = '../data/'
# test_docs = []
# for root, directories, filenames in os.walk(join(DATA_FOLDER, 'FIPS/test/')):
#     test_docs += [join(root,f) for f in filenames]

In [9]:
# for f in test_docs:
#     if not '.txt' in f:
#         rename(f, f + '.txt')

In [10]:
# Every .txt file directly under DATA_FOLDER/docs is a query document.
docs_pattern = join(DATA_FOLDER, 'docs/*.txt')
test_docs = glob(docs_pattern)

In [11]:
# Query the similarity index with every test document.
# `preds` maps a query id (file name without its extension) to whatever the
# index returns for that document's TF-IDF vector.
#
# The loop variable is deliberately NOT called `fname`: that name already
# holds the similarity-index path, and reusing it here would leave it
# pointing at the last test document once the loop finishes.
preds = {}
for doc_path in test_docs:
    with open(doc_path, 'r') as doc_file:
        doc_text = doc_file.read()
    doc_id = path.splitext(basename(doc_path))[0]
    vec_bow = dictionary.doc2bow(tokenize(doc_text))
    vec_model = tfidf[vec_bow]  # convert the query to model space
    preds[doc_id] = index[vec_model]

In [12]:
%%time
# get_all_docs is not defined in this notebook (presumably it comes from
# `common` -- TODO confirm). Its result is indexed below with the integer
# document ids returned by the similarity index to recover file paths.
all_docs = get_all_docs(DATA_FOLDER)

CPU times: user 136 ms, sys: 76 ms, total: 212 ms
Wall time: 209 ms


In [13]:
# Spot check: raw hits for one query, as (document index, similarity) pairs.
preds['04132544-ab51-06be-00f7-a94de65a3b84']

[(1000608, 0.74110907316207886),
 (833813, 0.3702671229839325),
 (31217, 0.36403593420982361),
 (440266, 0.36107760667800903),
 (539626, 0.35619732737541199),
 (711530, 0.35407131910324097),
 (1133233, 0.35396155714988708),
 (1059928, 0.349295973777771),
 (760314, 0.34820276498794556),
 (1137139, 0.34278708696365356),
 (612342, 0.33962130546569824),
 (1094016, 0.33824729919433594),
 (1064277, 0.33652445673942566),
 (280519, 0.33607524633407593),
 (292811, 0.33546587824821472),
 (1132889, 0.33351278305053711),
 (1164896, 0.32887572050094604),
 (744893, 0.32854816317558289),
 (485076, 0.32790091633796692),
 (29401, 0.32746857404708862),
 (1188308, 0.32523852586746216),
 (559662, 0.32256323099136353),
 (955228, 0.31950882077217102),
 (133498, 0.31864705681800842),
 (1070247, 0.31745350360870361),
 (1146866, 0.31743741035461426),
 (874035, 0.31590449810028076),
 (740867, 0.31577873229980469),
 (20971, 0.31522560119628906),
 (975114, 0.31316456198692322),
 (363714, 0.31148520112037659),
 (2

In [14]:
# Spot check: resolve one document index from the hits to its file path.
all_docs[31217]

'../data/FIPS/Inventions patents_txt_output/cd/5f/0002338697C220081120RU.txt'

In [16]:
# Translate every hit from a document index into a document id: the base
# name of the corresponding path in all_docs, without its extension.
# The similarity score is dropped; the order of the hits is preserved.
new_pred = {
    query_id: [
        path.splitext(basename(all_docs[doc_idx]))[0]
        for doc_idx, _score in hits
    ]
    for query_id, hits in preds.items()
}

In [19]:
# Spot check: the same query's hits after mapping indices to document ids.
new_pred['04132544-ab51-06be-00f7-a94de65a3b84']

['2010129585A20120127RU',
 '2006137334A20080427RU',
 '0002338697C220081120RU',
 '0002181344C220020420RU',
 '0002565063C220151020RU',
 '0093054505A19970310RU',
 '0000105619U120110620RU',
 '0000037089U120040410RU',
 '0094046342A119961020RU',
 '0000114681U120120410RU',
 '2011142344A20130427RU',
 '0000023437U120020620RU',
 '0000045379U120050510RU',
 '0002155721C220000910RU',
 '0002327506C120080627RU',
 '0000048969U120051110RU',
 '0000098997U120101110RU',
 '0099100047A20001027RU',
 '0002131500C119990610RU',
 '0002557571C120150727RU',
 '0000149112U120141220RU',
 'abe81efc-9887-ece3-9055-48e31161c2ea',
 '0098121516A20000920RU',
 '0002390503C120100527RU',
 '0000032774U120030927RU',
 '0000009178U119990216RU',
 '2002102112A20030727RU',
 '0094016521A119951210RU',
 '0002472716C220130120RU',
 '2010152710A20120627RU',
 '0002374400C120091127RU',
 '0002152907C120000720RU',
 '0000025888U120021027RU',
 '2006128933A20080220RU',
 '0002381184C120100210RU',
 '0002080301C119970527RU',
 '0096113326A19981010RU

In [21]:
# Score the predictions against the gold standard.
# For each query and each cutoff k, the score is
#     (number of distinct top-k predictions that are gold references)
#     / (number of gold references for the query).
# NOTE: the columns keep their historical 'acc*' names, but the quantity is
# the share of gold references recovered in the top k, not a classification
# accuracy.
cutoffs = (10, 20, 200)

result = []
skipped = []  # query ids that could not be scored
for key, val in new_pred.items():
    true_val = gold.get(key)
    if not true_val:
        # No gold entry for this query (would be a KeyError) or an entry
        # with no references (would be a ZeroDivisionError): the score is
        # undefined, so leave the query out instead of crashing.
        skipped.append(key)
        continue
    gold_len = len(true_val)
    result.append([
        len(set(val[:k]).intersection(true_val)) / gold_len
        for k in cutoffs
    ])

if skipped:
    print('skipped %d queries without gold references' % len(skipped))

result = pd.DataFrame(result, columns=['acc%d' % k for k in cutoffs])
print(result.median(axis=0))

acc10     0.333333
acc20     0.500000
acc200    1.000000
dtype: float64


In [154]:
# Count the entries of all_docs that contain a '-'.
# NOTE(review): presumably these are the UUID-named query documents that are
# also part of the indexed collection -- confirm.
sum(1 for doc_path in all_docs if '-' in doc_path)

241