In [None]:
import nltk
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
import math
import numpy as np
import os
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')

!pip install bert-embedding
from bert_embedding import BertEmbedding
bert_E = BertEmbedding()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting bert-embedding
  Downloading https://files.pythonhosted.org/packages/62/85/e0d56e29a055d8b3ba6da6e52afe404f209453057de95b90c01475c3ff75/bert_embedding-1.0.1-py3-none-any.whl
Collecting numpy==1.14.6
[?25l  Downloading https://files.pythonhosted.org/packages/e5/c4/395ebb218053ba44d64935b3729bc88241ec279915e72100c5979db10945/numpy-1.14.6-cp36-cp36m-manylinux1_x86_64.whl (13.8MB)
[K     |████████████████████████████████| 13.8MB 266kB/s 
[?25hCollecting mxnet==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/c0/e9/241aadccc4522f99adee5b6043f730d58adb7c001e0a68865a3728c3b4ae/mxnet-1.4.0-py2.py3-none-manylinux1_x86_64.whl (29.6MB)
[K     |████████████████████████████████| 29.6MB 118kB/s 
[?25hCollecting typing==3.6.6
  Downloading https://files.pythonhosted.org/packages/4a/bd/eee1157fc2d8514970b345d69cb9975dcd1e42cd7e61146ed841f6e68309/typing-3.6.6-py3-no

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...


In [None]:
# Fetch the spaCy pipeline package through a shell command so that the import
# below can resolve it.
!python -m spacy download pt_core_news_sm

import pt_core_news_sm
import nltk
# Download NLTK's stopword data.
nltk.download('stopwords')
# Pipeline object used to build the t0..t4 documents for the spaCy similarity scores.
# NOTE(review): the "pt" prefix presumably denotes a Portuguese pipeline, whereas
# the sample sentences s0..s4 are English -- confirm this model is the intended one.
nlp = pt_core_news_sm.load()

[38;5;3m⚠ Skipping model package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the model dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the model package has
other dependencies, you'll have to install them manually.[0m
Collecting pt_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2MB)
[K     |████████████████████████████████| 21.2MB 1.2MB/s 
[?25hBuilding wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.2.5-cp36-none-any.whl size=21186282 sha256=f0862570dc6b6d4175ced2854f60c5dfb8a02c3123c3aa737f875720e5b6cb74
  Stored in directory: /tmp/pip-ephem-wheel-cache-eo5a2j86/wheels/ea/94/74/ec9be8418e9231b471be

In [None]:
def token_list(embeddings, no_sep=False):
    """
    Extract the token sequence of the first sentence in a BertEmbedding result.

    Params:
        embeddings: The value returned by calling the BertEmbedding object;
            only ``embeddings[0][0]`` is read.
        no_sep: If True, the first and the last token are dropped.
    Return:
        tokens: the token sequence (sliced when ``no_sep`` is True)

    NOTE(review): the trimming presumes the sequence is wrapped in separator
    tokens. The GLEU denominators in this notebook's recorded output imply 10
    tokens per trimmed sentence, which suggests real tokens may be getting
    discarded -- confirm against the installed bert-embedding version.
    """
    tokens = embeddings[0][0]
    return tokens[1:-1] if no_sep else tokens


def sentence_embs(embeddings):
    """
    Pick the vector this notebook uses as the sentence embedding.

    Reads ``embeddings[0][1][0]``: the first entry of the result, that entry's
    second item, and the first element of that item.

    NOTE(review): presumably the second item holds one vector per token, which
    would make this the first token's vector rather than a pooled sentence
    vector -- confirm against the bert-embedding output format.
    """
    first_sentence = embeddings[0]
    vectors = first_sentence[1]
    return vectors[0]

def prep(sentence):
    """
    Embed a single sentence and return ``(tokens, vector)``.

    The sentence is wrapped in a one-element list and passed to the
    module-level ``bert_E`` embedder. ``tokens`` comes from ``token_list``
    with ``no_sep=True`` (first and last token dropped) and ``vector`` from
    ``sentence_embs``.
    """
    embedded = bert_E([sentence])
    return token_list(embedded, no_sep=True), sentence_embs(embedded)


def mapfunct(x, type='exp', n=0.2):
    """
    Squash a non-negative value into a score that equals 1 at x = 0 and
    decreases towards 0 as x grows (for positive n).

    Params:
        x: The value to map, e.g. a distance.
        type: Name of the mapping to apply:
            'inverse': 1/(1+n*x)
            'arctan':  1-2/pi*arctan(n*x)
            'exp':     (1/(1+n))^x
        n: Steepness parameter used by every mapping.
    Return:
        The mapped value.
    Raises:
        NotImplementedError: if ``type`` is not one of the names above.
    """
    if type == 'exp':
        decay_base = 1/(1+n)
        return decay_base**x
    if type == 'arctan':
        return 1-2/math.pi*math.atan(n*x)
    if type == 'inverse':
        return 1/(1+n*x)
    raise NotImplementedError("Function not implemented")


def square_rooted(x):
    """
    Return the Euclidean (L2) norm of ``x``: the square root of the sum of
    the squares of its elements.
    """
    return math.sqrt(sum(a*a for a in x))


def cosine_similarity(x,y):
    """
    Return the cosine of the angle between the vectors ``x`` and ``y``.

    Params:
        x, y: Sized, re-iterable sequences of numbers with the same length
            (each one is iterated more than once).
    Return:
        dot(x, y) / (norm(x) * norm(y))
    Raises:
        ValueError: if the vectors differ in length. Without this check
            ``zip`` would silently truncate the dot product while the norms
            still cover the full vectors, yielding a meaningless score.
        ZeroDivisionError: if either vector has zero norm (this includes
            empty vectors); the similarity is undefined in that case.
    """
    if len(x) != len(y):
        raise ValueError(
            "cosine_similarity needs vectors of equal length, got {} and {}".format(len(x), len(y)))
    denominator = square_rooted(x)*square_rooted(y)
    # Raise explicitly so that plain floats and numpy scalars behave the same
    # (numpy scalar division would otherwise produce nan with a warning).
    if denominator == 0:
        raise ZeroDivisionError("cosine similarity is undefined for a zero-norm vector")
    numerator = sum(a*b for a,b in zip(x,y))
    return numerator/denominator

In [None]:
# Sample sentences; s0 is the one every other sentence is scored against.
s0, s1, s2, s3, s4 = (
    # s0: reference sentence
    "James Cook was a very good man and a loving husband.",
    # s1: s0 with a single word swapped ("good" -> "nice")
    "James Cook was a very nice man and a loving husband.",
    # s2: same structure as s0 with the adjectives replaced by negative ones
    "James Cook was a bad man and a terrible husband.",
    # s3: looser rewording of s0
    "James Cook was a nice person and a good husband.",
    # s4: different topic altogether
    "The sky is blue today and learning history is important.",
)

In [None]:
# Token list (r*) and embedding vector (e*) for each sample sentence, computed
# in the order s0..s4.
(r0, e0), (r1, e1), (r2, e2), (r3, e3), (r4, e4) = map(prep, (s0, s1, s2, s3, s4))

In [None]:
# Run each sample sentence through the spaCy pipeline, in the order s0..s4;
# the resulting objects are compared with .similarity() further down.
t0, t1, t2, t3, t4 = map(nlp, (s0, s1, s2, s3, s4))

In [None]:
# One shared NLTK smoothing object; its method0 / method2 attributes are handed
# to sentence_bleu in the comparison cell. `bleu` is the alias under which
# nltk.translate.bleu_score is imported at the top of the notebook.
SmoothingFunction = bleu.SmoothingFunction()

In [None]:
def _print_scores(prefix, label, items, score):
    """
    Print one line per entry of ``items``, scored against ``items[0]``.

    Params:
        prefix: Letter used to build the pair tag (e.g. "r" gives "r0-r1").
        label: Text printed right after the pair tag, passed through verbatim.
        items: Objects to compare; ``items[0]`` is the reference.
        score: Callable ``score(reference, other)`` returning the value to print.
    """
    reference = items[0]
    for idx, other in enumerate(items):
        print("{0}0-{0}{1} {2}".format(prefix, idx, label), score(reference, other))


# Everything below compares sentence 0 against sentences 0..4, one metric per group.
token_lists = [r0, r1, r2, r3, r4]
sentence_vecs = [e0, e1, e2, e3, e4]
spacy_docs = [t0, t1, t2, t3, t4]

# BLEU with method0; the recorded output shows NLTK warning about n-gram orders
# with zero overlap for this group.
_print_scores("r", "bleu score: ", token_lists,
              lambda ref, other: bleu.sentence_bleu(
                  [other], ref, smoothing_function=SmoothingFunction.method0))
print("")
# BLEU with method2 smoothing.
_print_scores("r", "bleu score: ", token_lists,
              lambda ref, other: bleu.sentence_bleu(
                  [other], ref, smoothing_function=SmoothingFunction.method2))
print("")
_print_scores("r", "gleu score: ", token_lists,
              lambda ref, other: gleu.sentence_gleu([other], ref))
print("")
# Raw Euclidean distance between the embedding vectors (0 means identical).
_print_scores("e", "Euclid distance:", sentence_vecs,
              lambda ref, other: np.linalg.norm(other-ref))
print("")
# The same distances passed through mapfunct with its defaults, so that 1 means
# identical. Labelled separately from the raw distances above, which previously
# shared the exact same label.
_print_scores("e", "mapped Euclid distance:", sentence_vecs,
              lambda ref, other: mapfunct(np.linalg.norm(other-ref)))
print("")
_print_scores("e", "cosine-similarity:", sentence_vecs,
              lambda ref, other: cosine_similarity(other, ref))
print("")
_print_scores("t", "spacy similarity", spacy_docs,
              lambda ref, other: other.similarity(ref))

r0-r0 bleu score:  1.0
r0-r1 bleu score:  0.6580370064762462
r0-r2 bleu score:  0.5280972216470737
r0-r3 bleu score:  0.4132584091896901
r0-r4 bleu score:  0.5623413251903491

r0-r0 bleu score:  1.0
r0-r1 bleu score:  0.7016879391277372
r0-r2 bleu score:  0.3508439695638686
r0-r3 bleu score:  0.2998221389342337
r0-r4 bleu score:  0.12605968092174913

r0-r0 gleu score:  1.0
r0-r1 gleu score:  0.7058823529411765
r0-r2 gleu score:  0.38235294117647056
r0-r3 gleu score:  0.3235294117647059
r0-r4 gleu score:  0.029411764705882353

e0-e0 Euclid distance: 0.0
e0-e1 Euclid distance: 1.9738714
e0-e2 Euclid distance: 3.6317627
e0-e3 Euclid distance: 3.0969253
e0-e4 Euclid distance: 17.017267

e0-e0 Euclid distance: 1.0
e0-e1 Euclid distance: 0.6977605424333337
e0-e2 Euclid distance: 0.5157420006874345
e0-e3 Euclid distance: 0.568566934219589
e0-e4 Euclid distance: 0.04493156820402338

e0-e0 cosine-similarity: 1.0
e0-e1 cosine-similarity: 0.9900622593588156
e0-e2 cosine-similarity: 0.965961241983

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
