In [55]:
import pandas as pd
import numpy as np
from common import *
from glob import glob
from os import rename, path
from gensim import corpora
from os.path import basename

import gensim
from gensim import corpora, models, similarities

DATA_FOLDER = '../data/'

In [56]:
%%time
dictionary = corpora.Dictionary.load(join(DATA_FOLDER, 'old.dict'))
corpus = corpora.MmCorpus(join(DATA_FOLDER, 'corpus.mm'))

CPU times: user 2.37 s, sys: 300 ms, total: 2.67 s
Wall time: 2.67 s


In [57]:
print(corpus)

MmCorpus(1194481 documents, 3496772 features, 192186674 non-zero entries)


In [58]:
%%time
fmodel = join(DATA_FOLDER, 'tfidf.model')
if not path.exists(fmodel):    
    tfidf = models.TfidfModel(corpus)
    tfidf.save(fmodel)
else:
    tfidf = models.TfidfModel.load(fmodel)

CPU times: user 704 ms, sys: 268 ms, total: 972 ms
Wall time: 1.01 s


In [5]:
# %%time
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary), num_best=200)

In [59]:
%%time
fname = join(DATA_FOLDER, 'sim_index/sim')
if not exists(fname):
    index = similarities.Similarity(fname, tfidf[corpus], 
                                    num_features=len(dictionary), num_best=200, 
                                    chunksize=4*256, shardsize=5*32768)
    index.save(fname)
else:
    index = similarities.Similarity.load(fname)

CPU times: user 25min 52s, sys: 3.3 s, total: 25min 55s
Wall time: 25min 55s


### Test

In [60]:
with open(join(DATA_FOLDER, 'gold.txt'), 'r') as f:
    gold_txt = f.read()
lines = gold_txt.splitlines()
gold = {line.split()[0]:line.split()[1:] for line in lines}

In [61]:
# DATA_FOLDER = '../data/'
# test_docs = []
# for root, directories, filenames in os.walk(join(DATA_FOLDER, 'FIPS/test/')):
#     test_docs += [join(root,f) for f in filenames]

In [62]:
# for f in test_docs:
#     if not '.txt' in f:
#         rename(f, f + '.txt')

In [63]:
test_docs = glob(join(DATA_FOLDER, 'docs/*.txt'))

In [64]:
preds = {}
for fname in test_docs:
    with open(fname, 'r') as f:
        doc_text = f.read()
    vec_bow = dictionary.doc2bow(tokenize(doc_text))
    vec_model = tfidf[vec_bow] # convert the query to model space
    preds[path.splitext(basename(fname))[0]] = index[vec_model]

In [66]:
%%time
all_docs = get_all_docs(DATA_FOLDER)

CPU times: user 160 ms, sys: 124 ms, total: 284 ms
Wall time: 411 ms


In [77]:
p1 = preds['04132544-ab51-06be-00f7-a94de65a3b84']

In [90]:
new_pred = {}
for k, v in preds.items():
    new_pred[k] = [path.splitext(basename(all_docs[key]))[0] for key, sim in p1]

In [93]:
new_pred

{'04132544-ab51-06be-00f7-a94de65a3b84': ['2010129585A20120127RU',
  '2006137334A20080427RU',
  '0002338697C220081120RU',
  '0002181344C220020420RU',
  '0002565063C220151020RU',
  '0093054505A19970310RU',
  '0000105619U120110620RU',
  '0000037089U120040410RU',
  '0094046342A119961020RU',
  '0000114681U120120410RU',
  '2011142344A20130427RU',
  '0000023437U120020620RU',
  '0000045379U120050510RU',
  '0002155721C220000910RU',
  '0002327506C120080627RU',
  '0000048969U120051110RU',
  '0000098997U120101110RU',
  '0099100047A20001027RU',
  '0002131500C119990610RU',
  '0002557571C120150727RU',
  '0000149112U120141220RU',
  '0098121516A20000920RU',
  '0002390503C120100527RU',
  '0000032774U120030927RU',
  '0000009178U119990216RU',
  '2002102112A20030727RU',
  '0094016521A119951210RU',
  '0002472716C220130120RU',
  '2010152710A20120627RU',
  '0002374400C120091127RU',
  '0002152907C120000720RU',
  '0000025888U120021027RU',
  '2006128933A20080220RU',
  '0002381184C120100210RU',
  '0002080301C119

In [108]:
result = []
for key, val in new_pred.items():
    true_val = gold[key]
    
    inter10 = set(val[0:10]).intersection(true_val)
    inter20 = set(val[0:20]).intersection(true_val)
    inter200 = set(val[0:200]).intersection(true_val)
    
    acc10 = len(inter10)/10
    acc20 = len(inter20)/20
    acc200 = len(inter200)/200
    
    result.append([acc10, acc20, acc200])

result = pd.DataFrame(result, columns=['acc10', 'acc20', 'acc200'])
print(result.median(axis=0))

acc10     0.0
acc20     0.0
acc200    0.0
dtype: float64


In [121]:
[v for v in all_docs if '-' in v]

['../data/FIPS/test/sims/5bc379e2-2879-7308-7b42-81734a6a2525.txt',
 '../data/FIPS/test/sims/b7019c93-0c23-9ea8-59d7-9d996ae4219f.txt',
 '../data/FIPS/test/sims/558f8a6c-5aa9-2d06-157c-1b9de4f52880.txt',
 '../data/FIPS/test/sims/d6ca12d4-2984-cc47-970e-35f6dd959a34.txt',
 '../data/FIPS/test/sims/efb0fff8-e02d-cf41-ddc3-2521ffade5e9.txt',
 '../data/FIPS/test/sims/d9cc0be2-d088-21cc-f759-aca749c18e75.txt',
 '../data/FIPS/test/sims/7f8e1894-2bc9-3547-4125-ca087f8ad60f.txt',
 '../data/FIPS/test/sims/dc4e1ee9-e673-84fd-59de-c05d00f6846b.txt',
 '../data/FIPS/test/sims/02b48283-4b33-d70b-787b-7d20f8efa285.txt',
 '../data/FIPS/test/sims/8546a129-cee8-4846-1537-45ab2a033e8c.txt',
 '../data/FIPS/test/sims/65aeacff-41b4-dcdd-73b0-dbeaffaab24f.txt',
 '../data/FIPS/test/sims/5f4fcb8e-45da-3bd6-574d-84115855c55f.txt',
 '../data/FIPS/test/sims/220da016-cdb6-328b-6afc-cff5fd041dc6.txt',
 '../data/FIPS/test/sims/bd8e7975-dd4e-609f-51a4-291b0f58b102.txt',
 '../data/FIPS/test/sims/94dd3e13-6073-a848-8ec8