In [1]:
import pandas as pd
import numpy as np
from common import *
from glob import glob
from os import rename, path
from gensim import corpora
from os.path import basename

import gensim
from gensim import corpora, models, similarities

# Root directory (relative to the notebook) for every file this notebook reads
# or writes: dictionary, corpus, TF-IDF model, similarity index, gold file and
# test documents.
DATA_FOLDER = '../data/'

In [7]:
%%time
# Load the token dictionary and the Matrix Market corpus that were serialized
# earlier under DATA_FOLDER.
dict_path = join(DATA_FOLDER, 'dict_all.dict')
corpus_path = join(DATA_FOLDER, 'corpus_all.mm')
dictionary = corpora.Dictionary.load(dict_path)
corpus = corpora.MmCorpus(corpus_path)

CPU times: user 2.09 s, sys: 256 ms, total: 2.34 s
Wall time: 2.34 s


In [8]:
# Sanity check: print the corpus summary (document, feature and non-zero counts).
print(corpus)

MmCorpus(1194423 documents, 3496597 features, 192168333 non-zero entries)


In [9]:
%%time
# Reuse the TF-IDF model cached on disk when present; otherwise fit it on the
# full corpus and persist it so later runs can skip the fitting step.
fmodel = join(DATA_FOLDER, 'tfidf.model')
if path.exists(fmodel):
    tfidf = models.TfidfModel.load(fmodel)
else:
    tfidf = models.TfidfModel(corpus)
    tfidf.save(fmodel)

CPU times: user 556 ms, sys: 140 ms, total: 696 ms
Wall time: 694 ms


In [5]:
# Disabled: builds the similarity index with SparseMatrixSimilarity instead of
# the sharded similarities.Similarity class.
# NOTE(review): presumably left commented out because of memory use on a corpus
# of this size - confirm, or delete this cell.
# %%time
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary), num_best=200)

In [10]:
%%time
# Build the similarity index over the TF-IDF-weighted corpus, or load it if it
# was already built. `fname` is used both as the shard output prefix and as the
# path the index object itself is saved to.
# NOTE(review): the cache is keyed on the file's existence only; delete the
# files under sim_index/ after rebuilding the corpus, dictionary or TF-IDF model.
fname = join(DATA_FOLDER, 'sim_index/sim.index')
# Use path.exists (explicitly imported, same as the TF-IDF cell) instead of a
# bare `exists` that is only in scope through `from common import *`.
if path.exists(fname):
    index = similarities.Similarity.load(fname)
else:
    index = similarities.Similarity(
        fname,
        tfidf[corpus],
        num_features=len(dictionary),
        num_best=200,         # keep the 200 best hits per query
        chunksize=4 * 256,    # tuning value passed through to gensim
        shardsize=5 * 32768,  # tuning value passed through to gensim
    )
    index.save(fname)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 341 µs


### Test: evaluate retrieval against the gold standard

In [11]:
# Load the gold standard. Each non-blank line of gold.txt is whitespace
# separated: the first field is the key, the remaining fields are its values.
with open(join(DATA_FOLDER, 'gold.txt'), 'r') as f:
    gold_txt = f.read()
lines = gold_txt.splitlines()

gold = {}
for line in lines:
    fields = line.split()  # split once instead of twice per line
    if not fields:
        # A blank or whitespace-only line has no key; indexing [0] on it
        # would raise IndexError.
        continue
    gold[fields[0]] = fields[1:]

In [8]:
# Disabled: collects the path of every file under DATA_FOLDER/FIPS/test/ by
# walking the directory tree. The glob over docs/*.txt further down is what
# currently fills `test_docs`.
# DATA_FOLDER = '../data/'
# test_docs = []
# for root, directories, filenames in os.walk(join(DATA_FOLDER, 'FIPS/test/')):
#     test_docs += [join(root,f) for f in filenames]

In [9]:
# Disabled: appends '.txt' to every collected path that does not already
# contain '.txt'. This renames files on disk, so it is not safe to re-run blindly.
# for f in test_docs:
#     if not '.txt' in f:
#         rename(f, f + '.txt')

In [12]:
# Every .txt file directly under DATA_FOLDER/docs/ is a query document.
test_pattern = join(DATA_FOLDER, 'docs/*.txt')
test_docs = glob(test_pattern)

In [13]:
# Query the similarity index with each test document. Results are keyed by the
# file name without its extension.
preds = {}
for doc_path in test_docs:
    doc_id = path.splitext(basename(doc_path))[0]
    with open(doc_path, 'r') as fh:
        bow = dictionary.doc2bow(tokenize(fh.read()))
    # Convert the bag-of-words query to TF-IDF space before hitting the index.
    preds[doc_id] = index[tfidf[bow]]

In [12]:
# Spot-check one query: its result is a list of (corpus position, similarity)
# pairs, highest similarity first.
preds['04132544-ab51-06be-00f7-a94de65a3b84']

[(1000367, 0.78170257806777954),
 (833572, 0.39055183529853821),
 (31217, 0.3839816153049469),
 (440266, 0.38086932897567749),
 (539626, 0.37571424245834351),
 (711289, 0.37349048256874084),
 (1132992, 0.37336385250091553),
 (1059687, 0.36844420433044434),
 (760073, 0.36726880073547363),
 (1136898, 0.36158469319343567),
 (612101, 0.35824453830718994),
 (1093775, 0.35679537057876587),
 (1064036, 0.35498917102813721),
 (280519, 0.35450446605682373),
 (292811, 0.35384908318519592),
 (1132648, 0.3517797589302063),
 (1164655, 0.34690374135971069),
 (744652, 0.34654209017753601),
 (485076, 0.34585636854171753),
 (29401, 0.34539955854415894),
 (1188067, 0.34304732084274292),
 (954987, 0.3370242714881897),
 (133498, 0.33612453937530518),
 (1070006, 0.33486449718475342),
 (1146625, 0.33482185006141663),
 (873794, 0.33323609828948975),
 (740626, 0.33308267593383789),
 (20971, 0.33248347043991089),
 (974873, 0.33031225204467773),
 (363714, 0.3285389244556427),
 (267710, 0.32780534029006958),
 (11

In [6]:
%%time
# Collect the corpus documents; the cell below indexes the result by corpus
# position to get a file path.
# NOTE(review): get_all_docs is not defined in this notebook - presumably it
# comes from `from common import *`; confirm its ordering matches the corpus.
all_docs = get_all_docs(DATA_FOLDER)

CPU times: user 176 ms, sys: 72 ms, total: 248 ms
Wall time: 244 ms


In [30]:
# Resolve corpus position 1000367 (the top hit of the spot-checked query) to
# its source file.
all_docs[1000367]

'../data/FIPS/Inventions applications_txt_output/11/c2/0092005055A19950220RU.txt'

In [14]:
# Show the same document's sparse vector in the corpus as (feature id, value) pairs.
corpus[1000367]

[(12, 2.0),
 (20, 6.0),
 (30, 1.0),
 (99, 2.0),
 (134, 1.0),
 (155, 2.0),
 (163, 1.0),
 (380, 1.0),
 (396, 1.0),
 (405, 1.0),
 (410, 3.0),
 (507, 1.0),
 (788, 1.0),
 (846, 2.0),
 (1280, 1.0),
 (1296, 1.0),
 (1816, 1.0),
 (1817, 5.0),
 (2592, 3.0),
 (2682, 2.0),
 (2997, 1.0),
 (3606, 1.0),
 (3759, 2.0),
 (4125, 1.0),
 (4728, 5.0),
 (4827, 5.0),
 (5545, 2.0),
 (5549, 2.0),
 (8453, 1.0),
 (8642, 1.0),
 (11030, 1.0),
 (11040, 1.0),
 (11784, 1.0),
 (12079, 4.0),
 (13430, 2.0),
 (24235, 1.0),
 (25702, 1.0),
 (41891, 3.0),
 (721979, 1.0)]

In [16]:
# Map feature id 20 back to its token.
dictionary[20]

'зона'

In [None]:
# Score every query's ranking against the gold standard: for each cut-off k,
# the fraction of the top-k retrieved ids that appear in the gold list.
# The denominator is always k, even if fewer than k results were returned.
result = []
for key, val in preds.items():  # .items() exists on both Python 2 and 3
    # The index returns integer corpus positions while the gold values are
    # strings (they come from str.split()). An int never equals a str, so
    # comparing them directly always gives an empty intersection.
    # NOTE(review): this assumes gold.txt lists corpus positions; if it lists
    # document identifiers instead, map positions through all_docs first.
    seq = [str(doc_num) for doc_num, sim in val]
    true_seq = set(gold[key])

    row = []
    for k in (10, 20, 200):
        hits = true_seq.intersection(seq[:k])
        # float() forces true division; on Python 2 `len(hits) / k` would
        # truncate to 0 or 1.
        row.append(len(hits) / float(k))
    result.append(row)

result = pd.DataFrame(result, columns=['acc10', 'acc20', 'acc200'])
print(result)
    