In [1]:
from common import *

Using TensorFlow backend.


# Load data

In [2]:
# Load the saved Word2Vec model from disk and keep a handle on its word
# vectors; `wv` is what the rest of the notebook uses for lookups and
# for `word in wv` membership tests.
model = gensim.models.Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))
wv = model.wv

2017-07-29 19:59:00,928 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-07-29 19:59:02,906 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-07-29 19:59:02,907 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-07-29 19:59:03,059 : INFO : setting ignored attribute syn0norm to None
2017-07-29 19:59:03,059 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-07-29 19:59:03,207 : INFO : setting ignored attribute cum_table to None
2017-07-29 19:59:03,208 : INFO : loaded ../data/vectors/w2v_model_300_w10


In [3]:
# Load the saved gensim Dictionary (token id <-> token) and TF-IDF model.
dictionary = corpora.Dictionary.load(join(DATA_FOLDER, 'pure.dict'))
tfidf = models.TfidfModel.load(join(DATA_FOLDER, 'tfidf_pure.model'))
# IMPORTANT: switch off the model's `normalize` flag so the tf-idf scores
# used as query weights below are the raw, un-normalised values.
# NOTE(review): presumably this disables gensim's unit-length scaling of
# each document vector -- confirm against the installed gensim version.
tfidf.normalize= False

2017-07-29 19:59:04,913 : INFO : loading Dictionary object from ../data/pure.dict
2017-07-29 19:59:06,232 : INFO : loaded ../data/pure.dict
2017-07-29 19:59:06,233 : INFO : loading TfidfModel object from ../data/tfidf_pure.model
2017-07-29 19:59:06,641 : INFO : loaded ../data/tfidf_pure.model


In [4]:
# Corpus documents as returned by the project helper `get_all_docs`.
all_docs = get_all_docs(DATA_FOLDER)
# Paths of the validation (query) text files.
# NOTE(review): glob() does not guarantee any particular order, yet later
# cells refer to queries by position (zip with `val_documents`, index
# lookups into `tweights`) -- confirm the order is stable where it matters.
val_docs = glob(join(DATA_FOLDER, 'validate/*.txt'))

# Main funcs

In [5]:
def sim_matrix(word_set1, word_set2, wv):
    """Build a labelled cosine-similarity table between two groups of words.

    Parameters
    ----------
    word_set1, word_set2 : iterable of str
        Words to compare. Each collection is materialised into a list once,
        and that list order defines the row / column order of the result.
        Callers in this notebook pre-filter words with ``w in wv``.
    wv : object indexable with a list of words
        Word-vector store; ``wv[list_of_words]`` must return one vector per
        word.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled with the words of ``word_set1``, columns with the
        words of ``word_set2``; cells hold the output of
        ``cosine_similarity`` for the corresponding pair of vectors.
    """
    rows, cols = list(word_set1), list(word_set2)
    similarities = cosine_similarity(wv[rows], wv[cols])
    return pd.DataFrame(similarities, index=rows, columns=cols)
    

In [6]:
def wmd(smat, tfidf_weights):
    """Weighted sum of the column-wise minima of ``smat``.

    Parameters
    ----------
    smat : 2-D array-like or pandas.DataFrame
        Matrix whose columns are reduced with a minimum (``axis=0``).
    tfidf_weights : 1-D array-like or pandas.Series
        One weight per column of ``smat``; combined positionally, labels are
        ignored.

    Returns
    -------
    The dot product of the per-column minima with ``tfidf_weights``.

    NOTE(review): ``doc_queries_distances`` passes a cosine *similarity*
    matrix here and ``_calc`` then takes ``argmax`` of the scores, so the
    minimum selects the least similar document word for each query word.
    Confirm whether a maximum was intended.
    """
    column_minima = np.asarray(np.min(smat, axis=0))
    weights = np.asarray(tfidf_weights)
    return column_minima.dot(weights)

In [7]:
def get_test_tfidf_weights(val_docs, wv):
    """Compute a ranked tf-idf table for every validation text file.

    Each file is read, tokenised with ``tokenize``, restricted to tokens
    present in ``wv``, converted to a bag of words with the module-level
    ``dictionary`` and scored with the module-level ``tfidf`` model.

    Parameters
    ----------
    val_docs : iterable of str
        Paths of the validation text files.
    wv : container of known words
        Only tokens for which ``token in wv`` holds are kept.

    Returns
    -------
    dict
        Maps the file name without directory and extension to a DataFrame
        with columns ``word`` and ``score``, sorted by descending score.
    """
    weights_by_name = {}
    for doc_path in tqdm(val_docs):
        with open(doc_path, 'r') as handle:
            raw_text = handle.read()

        known_tokens = [tok for tok in tokenize(raw_text) if tok in wv]
        bag_of_words = dictionary.doc2bow(known_tokens)
        # Highest-scoring terms first.
        ranked = sorted(tfidf[bag_of_words], key=lambda pair: pair[1], reverse=True)
        rows = [(dictionary[term_id], weight) for term_id, weight in ranked]

        stem = path.splitext(basename(doc_path))[0]
        weights_by_name[stem] = pd.DataFrame(rows, columns=['word', 'score'])

    return weights_by_name

In [8]:
def doc_queries_distances(doc_words, val_words, tweights):
    """Score one document against every query in ``tweights``.

    A single similarity matrix between all query words (``val_words``) and
    the document's words is computed once with ``sim_matrix`` and then
    sliced per query. Word vectors come from the module-level ``wv``.

    Parameters
    ----------
    doc_words : iterable of str
        Words of the document being scored.
    val_words : iterable of str
        Union of the words of all queries; must cover every ``word`` listed
        in the tables of ``tweights``.
    tweights : dict
        Maps a query name to a DataFrame with ``word`` and ``score`` columns.

    Returns
    -------
    list
        One score per query, in the iteration order of ``tweights``. Queries
        with an empty table get the sentinel ``-99999999999``.
    """
    similarities = sim_matrix(val_words, doc_words, wv)

    def score(query):
        # Empty queries get a very low sentinel score.
        if len(query) == 0:
            return -99999999999
        per_query = similarities.loc[query.word, :]
        return wmd(per_query.T, query.score)

    return [score(query) for query in tweights.values()]

In [9]:
# Ranked tf-idf tables for every validation file, keyed by file name stem.
tweights = get_test_tfidf_weights(val_docs, wv)
# Union of all words that occur in any query table; used as the row labels
# of the similarity matrix built for each document.
val_words = set(pd.concat([v.word for v in tweights.values()]))

100%|██████████| 130/130 [00:00<00:00, 195.10it/s]


In [10]:
# tweights['_01_98_2010120931A10010101RU']

In [11]:
# Gzipped document shards, ordered with the project's `natural_keys` sort key.
# The pattern is built with join(), like every other path in this notebook,
# so a trailing separator on DATA_FOLDER no longer yields '//' in the paths.
ziped_files = sorted(glob(join(DATA_FOLDER, 'documents/*.gz')), key=natural_keys)

def _calc(docs):
    """Find the best-scoring query for each document in ``docs``.

    Relies on the module-level ``wv``, ``val_words`` and ``tweights``.

    Parameters
    ----------
    docs : iterable
        Documents, each an iterable of sentences, each sentence an iterable
        of words.

    Returns
    -------
    list of tuple
        One ``(query_position, score)`` pair per document, where
        ``query_position`` is the ``np.argmax`` of the scores returned by
        ``doc_queries_distances``.
    """
    best_matches = []
    for document in tqdm(docs):
        # Only words that have a vector can be compared.
        known_words = {word for sentence in document
                       for word in sentence if word in wv}
        scores = doc_queries_distances(known_words, val_words, tweights)

        best = np.argmax(scores)
        best_matches.append((best, scores[best]))
    return best_matches


def calc_dists(fnames, wv):
    """Score every document in the given gzipped JSON shards in parallel.

    Each shard is decompressed, parsed as JSON, split into blocks and handed
    to ``_calc`` through joblib workers. The per-block results are flattened
    so the output has exactly one row per document, in shard order followed
    by document order.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the ``.gz`` shards to process.
    wv : object
        Not used by this function itself (``_calc`` reads the module-level
        ``wv``); kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One ``(best_query_position, score)`` row per document; an empty
        array when there are no documents.
    """
    doc_dists = []
    for fn in fnames:
        print(fn)
        with GzipFile(fn, 'r') as myzip:
            docs = json.loads(myzip.read())

        if not docs:
            # Nothing to score in this shard.
            continue

        # Keep at least one document per block; with fewer documents than
        # workers the integer division would otherwise give a block size of 0.
        block_size = max(1, len(docs) // cpu_count)

        parallelizer = Parallel(n_jobs=cpu_count)
        # One delayed _calc call per block of documents.
        tasks_iterator = (delayed(_calc)(docs_block)
                          for docs_block in grouper(block_size, docs))

        # Parallel returns one list per block. Extend block by block so that
        # doc_dists stays a flat list of per-document tuples; appending the
        # block lists themselves gives np.array a ragged nested list because
        # the last block is usually shorter than the others.
        for block_result in parallelizer(tasks_iterator):
            doc_dists.extend(block_result)

    return np.array(doc_dists)

In [15]:
# Score the documents of the last ten shards against all validation queries.
val_dists = calc_dists(ziped_files[-10:], wv)

../data//documents/991.json.gz


100%|██████████| 298/298 [02:24<00:00,  2.04it/s]
100%|██████████| 298/298 [02:24<00:00,  2.01it/s]
100%|██████████| 2/2 [00:00<00:00,  3.03it/s]t/s]
 99%|█████████▉| 296/298 [02:25<00:00,  2.38it/s]
100%|██████████| 298/298 [02:26<00:00,  3.08it/s]


../data//documents/992.json.gz


 99%|█████████▊| 294/298 [02:23<00:01,  2.05it/s]
100%|██████████| 2/2 [00:00<00:00,  2.34it/s]t/s]
100%|██████████| 298/298 [02:24<00:00,  2.19it/s]
100%|██████████| 298/298 [02:25<00:00,  2.45it/s]
100%|██████████| 298/298 [02:25<00:00,  3.05it/s]


../data//documents/993.json.gz


 21%|██        | 62/298 [00:29<02:11,  1.80it/s]

KeyboardInterrupt: 

In [13]:
# Persist the computed scores next to the other data files.
np.save(join(DATA_FOLDER, 'val_dists.npy'), val_dists)

TODO:
* set `normalize = False` in the tfidf model
* use only the first 2-3 sentences of each query

# First n

In [19]:
# fn = join(DATA_FOLDER, 'first2.json.gz')

# with GzipFile(fn, 'r') as myzip:
#     text = myzip.read()
# docs = json.loads(text)
# del text

In [26]:
# Read the gzipped JSON file holding the validation documents.
# NOTE(review): judging by the file name these are presumably truncated to
# their first three sentences -- confirm with whatever produced the file.
fn = join(DATA_FOLDER, 'first3_val.json.gz')

with GzipFile(fn, 'r') as myzip:
    text = myzip.read()
val_documents = json.loads(text)
# Drop the raw payload; only the parsed documents are needed from here on.
del text

In [27]:
# Sanity check: this should equal the number of validation files in `val_docs`.
len(val_documents)

130

In [47]:
# Rebuild the per-query tf-idf tables from the pre-tokenised validation
# documents (`val_documents`) instead of the raw text files. File names are
# still taken from `val_docs`, so both sequences must line up one-to-one.
# zip() silently stops at the shorter input, which would pair names with the
# wrong documents' worth of data; fail loudly instead.
if len(val_docs) != len(val_documents):
    raise ValueError(
        'val_docs and val_documents differ in length: %d vs %d'
        % (len(val_docs), len(val_documents))
    )

tweights = {}
# `total` lets tqdm draw a real progress bar; zip() itself has no length.
for fname, doc in tqdm(zip(val_docs, val_documents), total=len(val_documents)):
    # Keep only tokens that have a word vector.
    tokenized_filtered = [w for w in doc if w in wv]
    doc_bow = dictionary.doc2bow(tokenized_filtered)
    # Highest-scoring terms first.
    sorted_tfidf = sorted(tfidf[doc_bow], key=itemgetter(1), reverse=True)
    sorted_tfidf = pd.DataFrame([(dictionary[k], v) for k, v in sorted_tfidf],
                                columns=['word', 'score'])

    name = path.splitext(basename(fname))[0]
    tweights[name] = sorted_tfidf

# Union of all query words, used as row labels of the similarity matrix.
val_words = set(pd.concat([v.word for v in tweights.values()]))

130it [00:00, 1610.34it/s]


In [48]:
# Score only the first shard, using the query tables rebuilt in the previous cell.
val_dists = calc_dists(ziped_files[:1], wv)

../data//documents/0.json.gz


100%|██████████| 298/298 [01:54<00:00,  2.55it/s]
100%|██████████| 2/2 [00:00<00:00,  2.82it/s]t/s]
100%|██████████| 298/298 [01:55<00:00,  2.77it/s]
100%|██████████| 298/298 [01:56<00:00,  3.09it/s]
100%|██████████| 298/298 [01:56<00:00,  4.08it/s]


# Draft

In [19]:
# Scratch: load the first shard into memory to inspect single documents.
with GzipFile(ziped_files[0], 'r') as myzip:
    text = myzip.read()
docs = json.loads(text)

In [20]:
# Scratch: run the scoring pipeline by hand for the first document of the shard.
doc = docs[0]
# Words of the document that have a vector.
doc_words = {w for sent in doc for w in sent if w in wv}
dists = doc_queries_distances(doc_words, val_words, tweights)

# Position (in tweights order) of the best-scoring query.
np.argmax(dists)

7

In [21]:
# Scratch: similarity table with query words as rows and the document's words as columns.
smat = sim_matrix(val_words, doc_words, wv)

In [None]:
# Flatten the document's sentences into one list of words. A single-pass
# comprehension avoids the repeated list copying of sum(doc, []), which is
# quadratic in the number of words.
[w for sent in doc for w in sent]

In [None]:
# Scratch: peek at the first entry returned by get_all_docs.
all_docs[0]

In [None]:
# Scratch: name of the query at position 114 in tweights' iteration order,
# i.e. the order in which doc_queries_distances emits its scores.
list(tweights)[114]