In [22]:
from pyemd import emd
import numpy as np

In [4]:
from pkgs.FastText import FastVector
from src import processing

In [9]:
from gensim.corpora.dictionary import Dictionary

In [5]:
vecs = {}
vecs['en'] = FastVector('dump/en/wordvecs.txt')
vecs['zh'] = FastVector('dump/zh/wordvecs.txt')

Reading word vectors from dump/en/wordvecs.txt
Reading word vectors from dump/zh/wordvecs.txt


In [45]:
def wmdsimilarity(doc1, doc2, lang1, lang2, vecs):
    tok1 = list(processing.tokenize(lang1, doc1))
    tok2 = list(processing.tokenize(lang2, doc2))
    
    print(tok1, tok2)
    
    dictionary = Dictionary(documents=[tok1, tok2])
    vocab_len = len(dictionary)

    if vocab_len == 1:
        # Both documents are composed by a single unique token
        return 0.0

    # Sets for faster look-up.
    docset1 = set(tok1)
    docset2 = set(tok2)
    
    print(dictionary, docset1, docset2)

    # Compute distance matrix.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            # Compute Euclidean distance between word vectors.
            distance_matrix[i, j] = np.sqrt(np.sum((vecs[lang1][t1] - vecs[lang2][t2])**2))

    if np.sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        d = np.zeros(vocab_len, dtype=np.double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(tok1)
    d2 = nbow(tok2)

    # Compute WMD.
    return emd(d1, d2, distance_matrix)

In [26]:
import pkgs.WMD
import imp
imp.reload(pkgs.WMD)

<module 'pkgs.WMD' from 'C:\\Users\\HP\\Documents\\root\\MulLing\\pkgs\\WMD.py'>

In [27]:
doc1 = 'Hello world'
doc2 = '你好世界'
wmdsim = pkgs.WMD.wmdsimilarity(doc1, doc2, 'en', 'zh', vecs, with_flow=True)
print(wmdsim)

['hello', 'world'] ['你好', '世界']
Dictionary(4 unique tokens: ['hello', 'world', '世界', '你好']) {'hello', 'world'} {'世界', '你好'}
{'tokens': ['hello', 'world', '世界', '你好'], 'pdf1': [0.5, 0.5, 0.0, 0.0], 'pdf2': [0.0, 0.0, 0.5, 0.5], 'wmd': 0.5170738394221079, 'flow': [[0.0, 0.0, 0.0, 0.5], [0.0, 0.0, 0.5, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]}


In [7]:
from flask import jsonify
print(jsonify(wmdsim))

RuntimeError: Working outside of application context.

This typically means that you attempted to use functionality that needed
to interface with the current application object in some way. To solve
this, set up an application context with app.app_context().  See the
documentation for more information.

In [10]:
import pkgs.FastText
import imp
imp.reload(pkgs.FastText)

<module 'pkgs.FastText' from 'C:\\Users\\HP\\Documents\\root\\MulLing\\pkgs\\FastText.py'>

In [11]:
lang = 'rup'
r = pkgs.FastText.FastVectorExport(lang, outpath='./dump/%s/new_wordvecs.txt'%lang , vector_file='./dump/numberbatch-19.08.txt.gz')

Reading word vectors from ./dump/numberbatch-19.08.txt.gz
Expected Vocab Size: 5107
Vocab Size: 5107
