In [1]:
from common import *

Using TensorFlow backend.


# Load data

In [2]:
# Load the saved Word2Vec model from DATA_FOLDER and keep a handle on its
# word vectors; `wv` is what the functions below use for vector lookups and
# membership tests.
# NOTE(review): the file name suggests 300-dimensional vectors trained with
# window 10 -- confirm against the training notebook.
model = gensim.models.Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))
wv = model.wv

2017-07-29 00:07:34,232 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-07-29 00:07:36,236 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-07-29 00:07:36,237 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-07-29 00:07:36,393 : INFO : setting ignored attribute syn0norm to None
2017-07-29 00:07:36,394 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-07-29 00:07:36,548 : INFO : setting ignored attribute cum_table to None
2017-07-29 00:07:36,548 : INFO : loaded ../data/vectors/w2v_model_300_w10


In [3]:
# Load the saved gensim dictionary (token id <-> word) and the TF-IDF model.
dictionary = corpora.Dictionary.load(join(DATA_FOLDER, 'pure.dict'))
tfidf = models.TfidfModel.load(join(DATA_FOLDER, 'tfidf_pure.model'))
# IMPORTANT: normalization is deliberately switched off on the loaded model,
# so every score produced by `tfidf[...]` below is an un-normalized weight.
# (This marker used to be a bare `!!!!!` line, which IPython executes as a
# shell escape rather than treating it as a comment.)
tfidf.normalize = False

2017-07-29 00:07:38,300 : INFO : loading Dictionary object from ../data/pure.dict
2017-07-29 00:07:39,638 : INFO : loaded ../data/pure.dict
2017-07-29 00:07:39,639 : INFO : loading TfidfModel object from ../data/tfidf_pure.model
2017-07-29 00:07:40,054 : INFO : loaded ../data/tfidf_pure.model


In [4]:
# `get_all_docs` comes from `common`; presumably it returns the document
# collection stored under DATA_FOLDER -- TODO confirm its return type.
all_docs = get_all_docs(DATA_FOLDER)
# Paths of the validation query files (one `.txt` file per query).
val_docs = glob(join(DATA_FOLDER, 'validate/*.txt'))

# Main functions

In [5]:
def sim_matrix(word_set1, word_set2, wv):
    """Build a labelled cosine-similarity table between two word collections.

    Parameters
    ----------
    word_set1 : iterable of str
        Words that become the row labels of the result.
    word_set2 : iterable of str
        Words that become the column labels of the result.
    wv : object indexable by a list of words
        Returns one vector per requested word (the notebook passes the
        Word2Vec word vectors). Every word is looked up, so all of them are
        expected to be present in `wv`.

    Returns
    -------
    pandas.DataFrame
        Entry ``[a, b]`` is ``cosine_similarity`` between the vectors of
        word ``a`` (from `word_set1`) and word ``b`` (from `word_set2`).
    """
    # Freeze the iteration order once so labels and vectors stay aligned.
    row_words, col_words = list(word_set1), list(word_set2)
    similarities = cosine_similarity(wv[row_words], wv[col_words])
    return pd.DataFrame(similarities, index=row_words, columns=col_words)
    

In [6]:
def wmd(smat, tfidf_weights):
    """Weighted sum of the column-wise minima of `smat`.

    Parameters
    ----------
    smat : 2-D array-like
        One column per weighted item; the minimum is taken down each column
        (``axis=0``).
    tfidf_weights : 1-D array-like
        One weight per column of `smat`, combined positionally.

    Returns
    -------
    scalar
        ``dot(min over rows of smat, tfidf_weights)``.

    NOTE(review): the caller in this notebook passes cosine *similarities*,
    so the minimum selects the least similar word for each column. Confirm
    that a minimum (rather than a maximum) is really intended here.
    """
    return np.dot(np.amin(smat, axis=0), tfidf_weights)

In [7]:
def get_test_tfidf_weights(val_docs, wv):
    """Compute per-word TF-IDF scores for every query file.

    Relies on the notebook-level `dictionary`, `tfidf` and `tokenize`.

    Parameters
    ----------
    val_docs : iterable of str
        Paths of the query text files.
    wv : container of words
        Only tokens for which ``token in wv`` holds are kept.

    Returns
    -------
    dict
        Maps the file name without directory and extension to a DataFrame
        with columns ``word`` and ``score``, sorted by score in descending
        order. A file with no usable tokens yields an empty DataFrame.
    """
    weights_by_query = {}
    for query_path in tqdm(val_docs):
        # NOTE(review): no explicit encoding is given, so the platform
        # default is used -- confirm it matches the query files.
        with open(query_path, 'r') as handle:
            raw_text = handle.read()

        known_tokens = [tok for tok in tokenize(raw_text) if tok in wv]
        scored_ids = tfidf[dictionary.doc2bow(known_tokens)]
        scored_ids = sorted(scored_ids, key=itemgetter(1), reverse=True)
        # Translate token ids back to words for label-based lookups later on.
        rows = [(dictionary[token_id], score) for token_id, score in scored_ids]

        query_name, _extension = path.splitext(basename(query_path))
        weights_by_query[query_name] = pd.DataFrame(rows, columns=['word', 'score'])

    return weights_by_query

In [8]:
def doc_queries_distances(doc_words, val_words, tweights):
    """Score one document against every query.

    Uses the notebook-level `wv` for vector lookups.

    Parameters
    ----------
    doc_words : collection of str
        Distinct words of the document (already restricted to `wv`).
    val_words : collection of str
        Union of all query words; must cover every ``query.word`` entry.
    tweights : dict
        Query name -> DataFrame with ``word`` and ``score`` columns.

    Returns
    -------
    list
        One score per query, in the iteration order of `tweights`. Callers
        pick the best query with ``argmax``, so a query (or document) that
        cannot be scored gets a very large negative sentinel.
    """
    # Sentinel that can never win an argmax against a real score.
    no_match_score = -99999999999

    # A document with no known words cannot be vectorised: looking up an
    # empty word list fails inside sim_matrix. Report "no match" for every
    # query instead of aborting the whole batch.
    if len(doc_words) == 0:
        return [no_match_score] * len(tweights)

    # One similarity table for all queries; each query slices its own rows.
    smat = sim_matrix(val_words, doc_words, wv)

    dists = []
    for query in tweights.values():
        if len(query) == 0:
            dists.append(no_match_score)
            continue

        query_rows = smat.loc[query.word, :]
        # Transposed so that the columns line up with the query's weights.
        dists.append(wmd(query_rows.T, query.score))
    return dists

In [9]:
# TF-IDF word/score table for every validation query, keyed by file name.
tweights = get_test_tfidf_weights(val_docs, wv)
# Union of all query words: the row labels of the similarity matrix built
# for each document.
val_words = set(pd.concat([v.word for v in tweights.values()]))

100%|██████████| 130/130 [00:00<00:00, 175.55it/s]


In [10]:
# tweights['_01_98_2010120931A10010101RU']

In [11]:
# Gzipped document archives, ordered with `natural_keys`. The pattern is built
# with join() like the other data paths, which avoids the doubled separator
# ('../data//documents/...') that plain string concatenation produced.
ziped_files = sorted(glob(join(DATA_FOLDER, 'documents/*.gz')), key=natural_keys)

def _calc(docs):        
    doc_dists = []
    for doc in tqdm(docs):
        doc_words = set([w for sent in doc for w in sent if w in wv])
        dists = doc_queries_distances(doc_words, val_words, tweights)

        arg = np.argmax(dists)
        doc_dists.append((arg, dists[arg]))
    return doc_dists


def calc_dists(fnames, wv):
    """Score every document of the given archives against all queries.

    Each archive is a gzipped JSON list of documents. Its documents are
    split into blocks that are processed in parallel by `_calc`.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the gzipped JSON archives.
    wv : object
        Kept for interface compatibility; the workers read the
        notebook-level `wv` instead.

    Returns
    -------
    numpy.ndarray
        One row per document, in archive and document order, holding
        ``(best_query_position, score)``. Empty when there are no documents.
    """
    doc_dists = []
    for fn in fnames:
        print(fn)
        with GzipFile(fn, 'r') as myzip:
            docs = json.loads(myzip.read())

        # With fewer documents than workers the integer division yields 0,
        # which is not a usable block size -- fall back to one per block.
        block_size = max(1, len(docs) // cpu_count)

        # Lazily yields one delayed `_calc` call per block of documents.
        tasks_iterator = (delayed(_calc)(docs_block)
                          for docs_block in grouper(block_size, docs))

        # Parallel returns one list per block. Flatten them so the result
        # has one row per document; appending the block lists themselves
        # gives a ragged nested list that NumPy cannot turn into a regular
        # array whenever the last block is shorter than the others.
        for block_result in Parallel(n_jobs=cpu_count)(tasks_iterator):
            doc_dists.extend(block_result)

    return np.array(doc_dists)

In [12]:
# Only the first archive is processed here (`[:1]`); the progress output
# below shows this run taking roughly three minutes.
val_dists = calc_dists(ziped_files[:1], wv)

../data//documents/0.json.gz


100%|██████████| 298/298 [02:52<00:00,  1.66it/s]
100%|██████████| 2/2 [00:01<00:00,  1.92it/s]t/s]
100%|██████████| 298/298 [02:54<00:00,  2.04it/s]
100%|██████████| 298/298 [02:55<00:00,  2.63it/s]
100%|██████████| 298/298 [02:58<00:00,  3.59it/s]


In [13]:
# Persist the scores so later work can reuse them without recomputing.
np.save(join(DATA_FOLDER, 'val_dists.npy'), val_dists)

TODO:
* set `normalize = False` in the tf-idf model (already applied in cell In [3] when the model is loaded)
* use only the first 2-3 sentences of each query

# Draft

In [19]:
# Scratch: load the documents of the first archive directly, the same way
# calc_dists does, so that a single document can be inspected below.
with GzipFile(ziped_files[0], 'r') as myzip:
    text = myzip.read()
docs = json.loads(text)

In [20]:
# Scratch: repeat the per-document steps of _calc for the first document.
doc = docs[0]
doc_words = {token for sentence in doc for token in sentence if token in wv}
dists = doc_queries_distances(doc_words, val_words, tweights)

# Position of the best-scoring query for this document.
np.argmax(dists)

7

In [21]:
# Scratch: similarity table for the document above
# (rows: query words, columns: document words).
smat = sim_matrix(val_words, doc_words, wv)

In [None]:
# Flatten the document's sentences into one word list. A comprehension is
# linear, whereas sum(doc, []) re-copies the growing list for every sentence.
[w for sent in doc for w in sent]

In [None]:
# Scratch: peek at the first entry of the document collection.
all_docs[0]

In [None]:
# Scratch: translate a query position (as produced by np.argmax over the
# scores) back to the query's file name; 114 is an ad-hoc example index.
list(tweights.keys())[114]