In [2]:
import os
import numpy as np
from numpy.linalg import norm

## utilities

In [6]:
def head(li, n=10):
    for i, content in enumerate(li):
        if i > n: break
        print(content)

In [8]:
def cosine_similarity(a, b):
    return np.inner(a, b) / (norm(a) * norm(b))

## load all embeddings

In [2]:
emb_dir = 'bundle_vec'

In [3]:
bundles_emb = {}
for filename in os.listdir(emb_dir):
    if filename in ['.', '..']: continue
    bundle = os.path.splitext(filename)[0].replace('_', ' ')
    emb = np.load(os.path.join(emb_dir, filename))
    bundles_emb[bundle] = emb

check content

In [4]:
for k, v in bundles_emb.items():
    print(k, v)
    break

r u mi [ 0.00746889  0.00400403  0.05927827 ... -0.01658441 -0.02914623
 -0.01108123]


In [7]:
head(bundles_emb.keys(), 10)

r u mi
look forward to
see -PRON- Facebook page
Bus route 862 from
sweeping view of
likely to be
bird - eye view of the
hwy 11 km
view from the
serve some of
house in a


## test similarity

In [9]:
def most_similar(target, bundles, n=5):
    similarities = []
    target_emb = bundles[target]
    for bundle, bundle_emb in bundles.items():
        if bundle == target: continue
        similarities.append((target, bundle, cosine_similarity(target_emb, bundle_emb)))
    similarities.sort(key=lambda emb:-emb[2])
    return similarities[:n]

In [24]:
def print_similarity(tuples):
    head = True
    for t in tuples:
        if head:
            print(f'{t[0]}')
            head = False
        print(f'  > {t[1]}\t{t[2]}')

## slide show

### better examples

In [41]:
print_similarity(most_similar('look out for', bundles_emb))

look out for
  > look for the	0.9493948817253113
  > ask for	0.8713613152503967
  > build in the	0.8521924018859863
  > view over the	0.848524272441864
  > close on the	0.8435164093971252


### worse examples

It seems only to find phrases that contains more the same words

In [37]:
print_similarity(most_similar('rest of the', bundles_emb))

rest of the
  > much of the	0.9916155338287354
  > part of the	0.9897516369819641
  > many of the	0.9894800186157227
  > example of the	0.9874657988548279
  > use of the	0.9869388341903687


In [39]:
print_similarity(most_similar('look out for', bundles_emb))

look out for
  > look for the	0.9493948817253113
  > ask for	0.8713613152503967
  > build in the	0.8521924018859863
  > view over the	0.848524272441864
  > close on the	0.8435164093971252


In [43]:
print_similarity(most_similar('set in a', bundles_emb))

set in a
  > house in a	0.98302161693573
  > such as a	0.9812556505203247
  > person for a	0.9805894494056702
  > area be a	0.9783334732055664
  > hotel be a	0.9744232892990112


... or totally no relations

In [45]:
print_similarity(most_similar('base on', bundles_emb))

base on
  > ask for	0.8358050584793091
  > head up the	0.777601957321167
  > check the website	0.7459233403205872
  > craft beers on	0.677878737449646
  > Night Market p	0.6553135514259338


### playground

In [44]:
print_similarity(most_similar('ask for', bundles_emb))

ask for
  > look for the	0.9020828604698181
  > base on	0.8358050584793091
  > build in the	0.8281060457229614
  > close on the	0.8229482769966125
  > check the website	0.8090062141418457


In [26]:
print_similarity(most_similar('pros and cons', bundles_emb))

pros and cons
  > philippine and eurasian	0.9594093561172485
  > local and international	0.9533032774925232
  > much of the	0.9510403275489807
  > rest of the	0.9498149156570435
  > many of the	0.9483734369277954


In [27]:
print_similarity(most_similar('look forward to', bundles_emb))

look forward to
  > serve some of	0.9575366973876953
  > person for a	0.9568845629692078
  > beach be a	0.9548571109771729
  > house in a	0.9541386365890503
  > sweeping view of	0.9484091997146606


In [29]:
print_similarity(most_similar('come for the', bundles_emb))

come for the
  > look for the	0.9471341371536255
  > build in the	0.9103793501853943
  > ask for	0.9102256298065186
  > close on the	0.892799973487854
  > view over the	0.8551517128944397


In [31]:
print_similarity(most_similar('hope for', bundles_emb))

hope for
  > ask for	0.9324443936347961
  > look for the	0.8960824012756348
  > build in the	0.8452931642532349
  > base on	0.8345198631286621
  > close on the	0.8268554210662842


In [32]:
print_similarity(most_similar('according to', bundles_emb))

according to
  > apply to	0.9652746319770813
  > person for a	0.9581030011177063
  > date back to	0.9562320113182068
  > house in a	0.9539720416069031
  > area be a	0.9484989643096924


In [34]:
print_similarity(most_similar('wall of the', bundles_emb))

wall of the
  > rest of the	0.979120135307312
  > example of the	0.9783733487129211
  > part of the	0.9783085584640503
  > use of the	0.9782791137695312
  > many of the	0.978050172328949


In [35]:
print_similarity(most_similar('likely to be', bundles_emb))

likely to be
  > step to the	0.9713488221168518
  > much of the	0.9707946181297302
  > many of the	0.968322217464447
  > way to get here be	0.9675189852714539
  > use of the	0.9660195708274841


In [46]:
print_similarity(most_similar('shop in the', bundles_emb))

shop in the
  > build in the	0.8964031338691711
  > ask for	0.8503081202507019
  > head up the	0.836192786693573
  > check the website	0.8255414962768555
  > craft beers on	0.6668857932090759


In [47]:
print_similarity(most_similar('want to know', bundles_emb))

want to know
  > likely to be	0.9710200428962708
  > step to the	0.9704611301422119
  > use of the	0.965178370475769
  > much of the	0.9650048017501831
  > many of the	0.9639893770217896


In [48]:
print_similarity(most_similar('apply to', bundles_emb))

apply to
  > date back to	0.9482353329658508
  > person for a	0.9474472999572754
  > area be a	0.9461380243301392
  > serve some of	0.9446873664855957
  > beach be a	0.9442049860954285


In [50]:
print_similarity(most_similar('date back to', bundles_emb))

date back to
  > look forward to	0.9680248498916626
  > person for a	0.9631557464599609
  > area be a	0.962228536605835
  > house in a	0.9607532024383545
  > floor be a	0.9602826833724976


In [1]:
while True:
    query = input('input: ')
    if query in ['quit', 'q']: breakn
    print_similarity(most_similar(query, bundles_emb))

input: ask for a


NameError: name 'print_similarity' is not defined