In [10]:
from common import *

# Load data

In [11]:
# Load the saved Word2Vec model from DATA_FOLDER and keep a handle on its word
# vectors (`model.wv`); `wv` is what the rest of the notebook uses for lookups
# and vocabulary-membership tests.
model = gensim.models.Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))
wv = model.wv

2017-07-31 10:42:40,916 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-07-31 10:42:42,776 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-07-31 10:42:42,777 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-07-31 10:42:42,938 : INFO : setting ignored attribute syn0norm to None
2017-07-31 10:42:42,939 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-07-31 10:42:43,089 : INFO : setting ignored attribute cum_table to None
2017-07-31 10:42:43,090 : INFO : loaded ../data/vectors/w2v_model_300_w10


In [12]:
# Load the saved Dictionary (token <-> id mapping) and TfidfModel from DATA_FOLDER.
dictionary = corpora.Dictionary.load(join(DATA_FOLDER, 'pure.dict'))
tfidf = models.TfidfModel.load(join(DATA_FOLDER, 'tfidf_pure.model'))
# Important: the `normalize` flag is switched off after loading, so it affects
# every `tfidf[...]` lookup made below (those scores are used as query weights).
# NOTE(review): presumably this yields raw, un-normalized tf-idf weights --
# confirm against the TfidfModel documentation.
tfidf.normalize= False

2017-07-31 10:42:44,954 : INFO : loading Dictionary object from ../data/pure.dict
2017-07-31 10:42:46,195 : INFO : loaded ../data/pure.dict
2017-07-31 10:42:46,309 : INFO : loading TfidfModel object from ../data/tfidf_pure.model
2017-07-31 10:42:46,689 : INFO : loaded ../data/tfidf_pure.model


In [13]:
# all_docs: whatever get_all_docs returns for DATA_FOLDER (helper not defined in this notebook).
all_docs = get_all_docs(DATA_FOLDER)
# val_docs: sorted paths of the validation *.txt files; each one is treated as a query below.
val_docs = sorted(glob(join(DATA_FOLDER, 'validate/*.txt')))

# Main funcs

In [14]:
def sim_matrix(word_set1, word_set2, wv):
    """Build a labelled cosine-similarity table between two word collections.

    Parameters
    ----------
    word_set1, word_set2 : iterable of str
        Words to compare; every word must be retrievable through ``wv[...]``.
    wv : indexable word-vector store
        Indexed with a list of words to obtain the matching vectors.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled with the words of ``word_set1``, columns with the
        words of ``word_set2``; each cell is the value ``cosine_similarity``
        returns for that pair of vectors.
    """
    # Freeze the iteration order once so labels and vectors stay aligned.
    row_words, col_words = list(word_set1), list(word_set2)
    similarities = cosine_similarity(wv[row_words], wv[col_words])
    return pd.DataFrame(similarities, index=row_words, columns=col_words)
    

In [15]:
def wmd(smat, tfidf_weights, agg=np.amin):
    """Weighted sum of the per-column reductions of ``smat``.

    Every column of ``smat`` is collapsed along axis 0 with ``agg`` and the
    resulting vector is dotted with ``tfidf_weights``.

    Parameters
    ----------
    smat : 2-D array-like or DataFrame
        In this notebook: rows are document words, columns are query words.
    tfidf_weights : 1-D array-like
        One weight per column of ``smat``, in the same order.
    agg : callable, optional
        Reduction invoked as ``agg(smat, axis=0)``. Defaults to ``np.amin``,
        which is the original behaviour.
        NOTE(review): the matrices built by ``sim_matrix`` hold cosine
        *similarities*, for which the closest document word is the column
        maximum, not the minimum. Pass ``agg=np.amax`` for nearest-word
        matching -- confirm which one is intended before changing the default.

    Returns
    -------
    scalar
        ``np.dot(agg(smat, axis=0), tfidf_weights)``.
    """
    per_query_word = agg(smat, axis=0)
    return np.dot(per_query_word, tfidf_weights)

In [16]:
def get_test_tfidf_weights(val_docs, wv):
    """Compute a tf-idf weight table for every validation document.

    Parameters
    ----------
    val_docs : iterable of str
        Paths of the text files to process.
    wv : container supporting ``in``
        Tokens that are not contained in ``wv`` are discarded.

    Returns
    -------
    dict
        Maps the file name without directory and extension to a DataFrame
        with columns ``word`` and ``score``, sorted by descending score.

    Notes
    -----
    Relies on the module-level ``tokenize``, ``dictionary`` and ``tfidf``.
    """
    weights_by_name = {}
    for doc_path in tqdm(val_docs):
        with open(doc_path, 'r') as handle:
            raw_text = handle.read()

        # Only tokens that have an embedding can be compared later on.
        known_tokens = [tok for tok in tokenize(raw_text) if tok in wv]
        ranked = sorted(tfidf[dictionary.doc2bow(known_tokens)],
                        key=itemgetter(1), reverse=True)
        # Translate token ids back to their surface form.
        rows = [(dictionary[token_id], weight) for token_id, weight in ranked]

        doc_name, _extension = path.splitext(basename(doc_path))
        weights_by_name[doc_name] = pd.DataFrame(rows, columns=['word', 'score'])

    return weights_by_name

In [17]:
# Per-query tf-idf tables, keyed by validation document name.
tweights = get_test_tfidf_weights(val_docs, wv)
# Union of the words of all queries; used as the column set of the similarity tables.
val_words = set(pd.concat([v.word for v in tweights.values()]))
# Document archives, ordered with natural_keys. The pattern is built with join()
# like every other path in this notebook; plain string concatenation produced a
# doubled separator ('../data//documents/...').
ziped_files = sorted(glob(join(DATA_FOLDER, 'documents/*.gz')), key=natural_keys)

100%|██████████| 130/130 [00:00<00:00, 193.23it/s]


In [18]:
# tweights['_01_98_2010120931A10010101RU']

In [104]:
def doc_queries_distances(d_smat, tweights):
    """Score one document against every query.

    Parameters
    ----------
    d_smat : pandas.DataFrame
        Table for a single document whose columns are labelled with query words.
    tweights : dict
        Maps a query name to a table exposing ``word`` and ``score``.

    Returns
    -------
    list
        One entry per query, in the iteration order of ``tweights``: the value
        of ``wmd`` for that query, or ``None`` when the query has no words.
    """
    return [
        wmd(d_smat.loc[:, query.word], query.score) if len(query) else None
        for query in tweights.values()
    ]


def _calc(docs):
    """Score every document of one block against all validation queries.

    Parameters
    ----------
    docs : sequence
        Documents; each document is an iterable of sentences and each sentence
        an iterable of words.

    Returns
    -------
    list of list
        For each document (in input order) the list returned by
        ``doc_queries_distances``.

    Notes
    -----
    Reads the module-level ``wv``, ``val_words`` and ``tweights``.
    """
    # Vocabulary of the whole block, restricted to words that have an embedding.
    words = {w for doc in docs for sent in doc for w in sent if w in wv}
    # One similarity table per block; per-document tables are row slices of it.
    smat = sim_matrix(words, val_words, wv)

    doc_dists = []
    for doc in tqdm(docs):
        # Keep only words that are actually rows of `smat`: out-of-vocabulary
        # words have no row and make `.loc` raise a KeyError on current pandas
        # (older versions silently produced NaN rows). The labels are passed as
        # a list because sets are not accepted as indexers.
        doc_words = list({w for sent in doc for w in sent} & words)
        d_smat = smat.loc[doc_words, :]
        doc_dists.append(doc_queries_distances(d_smat, tweights))
    return doc_dists


def calc_dists(fnames, wv):
    """Compute the query scores for every document stored in the given archives.

    Each file is read as gzip-compressed JSON holding a list of documents. The
    list is cut into blocks with ``grouper`` and the blocks are scored in
    parallel by ``_calc``.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the archives to process.
    wv : object
        Not used by this function (``_calc`` reads the module-level ``wv``);
        kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One row per document, in the order the archives and their documents
        were read; each row is what ``_calc`` produced for that document.
    """
    block_results = []
    for fn in fnames:
        print(fn)
        # Append the file name to the progress log in plain Python; this replaces
        # the IPython-only `! echo {fn} >> ../data/out`, which passed the
        # unquoted name through a shell.
        with open(join(DATA_FOLDER, 'out'), 'a') as progress_log:
            progress_log.write('{}\n'.format(fn))

        with GzipFile(fn, 'r') as myzip:
            docs = json.loads(myzip.read())

        # len(docs) // cpu_count is 0 when an archive holds fewer documents than
        # there are workers; never ask grouper for empty blocks.
        block_size = max(1, len(docs) // cpu_count)
        # Lazily yields one delayed _calc call per block of documents.
        tasks = (delayed(_calc)(docs_block)
                 for docs_block in grouper(block_size, docs))
        block_results += Parallel(n_jobs=cpu_count)(tasks)

    # Flatten the per-block lists into one list of per-document rows
    # (linear, unlike sum(list_of_lists, [])).
    return np.array([doc_row for block in block_results for doc_row in block])

In [130]:
# Score only the first archive (ziped_files[:1]) against all validation queries.
val_dists = calc_dists(ziped_files[:1], wv)

../data//documents/0.json.gz


100%|██████████| 298/298 [01:41<00:00,  3.32it/s]
100%|██████████| 2/2 [00:00<00:00,  4.38it/s]t/s]
100%|██████████| 298/298 [01:44<00:00,  3.37it/s]
100%|██████████| 298/298 [01:46<00:00,  3.87it/s]
100%|██████████| 298/298 [01:48<00:00,  4.27it/s]


In [131]:
# Persist the scores so the slow computation above does not have to be repeated.
np.save(join(DATA_FOLDER, 'val_dists.npy'), val_dists)

In [153]:
# Reload the scores previously written to val_dists.npy.
val_dists = np.load(join(DATA_FOLDER, 'val_dists.npy'))

In [155]:
# Sort indices along axis 0, keep the last 200 (the largest values) and reverse
# them so the largest comes first.
# NOTE(review): NaN sorts to the end in numpy, so NaN scores would appear at the
# top of this ranking -- confirm none are present.
sorted_args = val_dists.argsort(axis=0)[-200:][::-1]

TODO:
* set `normalize = False` on the tf-idf model
* build each query from only its first 2-3 sentences

# First n

In [19]:
# fn = join(DATA_FOLDER, 'first2.json.gz')

# with GzipFile(fn, 'r') as myzip:
#     text = myzip.read()
# docs = json.loads(text)
# del text

In [26]:
# Read the gzip-compressed JSON file first3_val.json.gz into val_documents.
# NOTE(review): presumably the first three sentences of each validation document
# (going by the file name) -- confirm.
fn = join(DATA_FOLDER, 'first3_val.json.gz')

with GzipFile(fn, 'r') as myzip:
    text = myzip.read()
val_documents = json.loads(text)
# The raw payload is no longer needed once parsed.
del text

In [27]:
# Number of loaded validation documents; they are paired one-to-one with val_docs below.
len(val_documents)

130

In [47]:
# Rebuild the query weights from the pre-tokenized validation documents. This
# overwrites the module-level `tweights` and `val_words` that `_calc` reads, so
# every later calc_dists call uses these shortened queries.
# NOTE(review): assumes val_documents is stored in the same order as the sorted
# val_docs paths -- confirm.

# zip() silently truncates on a length mismatch, which would drop queries or
# pair names with the wrong documents; fail loudly instead.
assert len(val_docs) == len(val_documents), 'val_docs and val_documents differ in length'

tweights = {}
# total= gives tqdm a real progress bar; zip() has no length of its own.
for fname, doc in tqdm(zip(val_docs, val_documents), total=len(val_docs)):
    # Only tokens that have an embedding can be compared later on.
    tokenized_filtered = [w for w in doc if w in wv]
    doc_bow = dictionary.doc2bow(tokenized_filtered)
    sorted_tfidf = sorted(tfidf[doc_bow], key=itemgetter(1), reverse=True)
    sorted_tfidf = pd.DataFrame([(dictionary[k], v) for k, v in sorted_tfidf],
                                columns=['word', 'score'])

    name = path.splitext(basename(fname))[0]
    tweights[name] = sorted_tfidf

# Union of the words of all queries; column set of the similarity tables.
val_words = set(pd.concat([v.word for v in tweights.values()]))

130it [00:00, 1610.34it/s]


In [48]:
# Re-score the first archive, now against the `tweights` / `val_words` rebuilt in the previous cell.
val_dists = calc_dists(ziped_files[:1], wv)

../data//documents/0.json.gz


100%|██████████| 298/298 [01:54<00:00,  2.55it/s]
100%|██████████| 2/2 [00:00<00:00,  2.82it/s]t/s]
100%|██████████| 298/298 [01:55<00:00,  2.77it/s]
100%|██████████| 298/298 [01:56<00:00,  3.09it/s]
100%|██████████| 298/298 [01:56<00:00,  4.08it/s]


# Draft

In [19]:
# Draft: load the first archive by hand to experiment with individual documents.
with GzipFile(ziped_files[0], 'r') as myzip:
    text = myzip.read()
docs = json.loads(text)

In [20]:
# Draft: score a single document against all queries.
doc = docs[0]
doc_words = set([w for sent in doc for w in sent if w in wv])
# doc_queries_distances takes (d_smat, tweights): build the document's
# similarity table first (rows = document words, columns = query words), the
# same layout _calc slices out of its block-level table.
d_smat = sim_matrix(doc_words, val_words, wv)
dists = doc_queries_distances(d_smat, tweights)

# Position of the query with the largest score for this document.
np.argmax(dists)

7

In [21]:
# Draft: similarity table with query words as rows and document words as
# columns, i.e. transposed relative to the table _calc builds.
smat = sim_matrix(val_words, doc_words, wv)

In [None]:
# Draft: concatenate the document's sentences into one flat word list.
sum(doc, [])

In [None]:
# Draft: peek at the first entry of all_docs.
all_docs[0]

In [None]:
# Draft: name of the query at position 114 in tweights' iteration order.
list(tweights.keys())[114]