In [34]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

In [None]:
#download the model to local so it can be used again and again
# !mkdir ./sentence_wise_email
# Download the module, and uncompress it to the destination folder. 

# DO NOT DOWNLOAD IT EACH TIME, IF YOU HAD DOWNLOADED IT ONCE, ITS ENOUGH

# !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed" | tar -zxvC ./sentence_wise_email

In [None]:
embed = hub.Module("./sentence_wise_email")

# Compute a representation for each message, showing various lengths supported.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embed(messages))

    for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
        print("Message: {}".format(messages[i]))
        print("Embedding size: {}".format(len(message_embedding)))
        message_embedding_snippet = ", ".join((str(x) for x in message_embedding[:3]))
        print("Embedding: [{}, ...]\n".format(message_embedding_snippet))


In [35]:
df = pd.read_csv("./goodbooks-10k/books.csv", usecols=['id', 'title'])
titleList = df['title'].tolist()
idList = df['id'].tolist()
print (titleList[0])
titleIdDict = dict(zip(titleList, idList))
print(titleIdDict.get(titleList[3]))


The Hunger Games (The Hunger Games, #1)
4


In [36]:
#Function so that one session can be called multiple times. 
#Useful while multiple calls need to be done for embedding.
import tensorflow as tf
import tensorflow_hub as hub

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

def embed_useT(module):
    with tf.Graph().as_default():
        sentences = tf.placeholder(tf.string)
        embed = hub.Module(module)
        embeddings = embed(sentences)
        session = tf.train.MonitoredSession()
    return lambda x: session.run(embeddings, {sentences: x})

embed_fn = embed_useT('./sentence_wise_email')

In [37]:
# messages = [
#     "we are sorry for the inconvenience",
#     "we are sorry for the delay",
#     "we regret for your inconvenience",
#     "we don't deliver to baner region in pune",
#     "we will get you the best possible rate"
# ]

# encoding_matrix = embed_fn(messages)
encoding_matrix = embed_fn(titleList)


In [38]:
np.inner(encoding_matrix, encoding_matrix)

array([[1.0000002 , 0.6433921 , 0.6784331 , ..., 0.21079348, 0.3800934 ,
        0.42424315],
       [0.6433921 , 0.9999998 , 0.72538793, ..., 0.14122126, 0.30932796,
        0.2677622 ],
       [0.6784331 , 0.72538793, 1.0000001 , ..., 0.14197785, 0.359599  ,
        0.29009897],
       ...,
       [0.21079348, 0.14122126, 0.14197785, ..., 1.0000001 , 0.22863138,
        0.47335958],
       [0.3800934 , 0.30932796, 0.359599  , ..., 0.22863138, 0.9999999 ,
        0.3064736 ],
       [0.42424315, 0.2677622 , 0.29009897, ..., 0.47335958, 0.3064736 ,
        0.9999999 ]], dtype=float32)

In [39]:
from math import*
from decimal import Decimal

class Similarity():
    def euclidean_distance(self,x,y):

        """ return euclidean distance between two lists """

        return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))

    def manhattan_distance(self,x,y):

        """ return manhattan distance between two lists """

        return sum(abs(a-b) for a,b in zip(x,y))

    def minkowski_distance(self,x,y,p_value):

        """ return minkowski distance between two lists """

        return self.nth_root(sum(pow(abs(a-b),p_value) for a,b in zip(x, y)),
           p_value)

    def nth_root(self,value, n_root):

        """ returns the n_root of an value """

        root_value = 1/float(n_root)
        return round (Decimal(value) ** Decimal(root_value),3)

    def cosine_similarity(self,x,y):

        """ return cosine similarity between two lists """

        numerator = sum(a*b for a,b in zip(x,y))
        denominator = self.square_rooted(x)*self.square_rooted(y)
        return round(numerator/float(denominator),3)

    def square_rooted(self,x):

        """ return 3 rounded square rooted value """

        return round(sqrt(sum([a*a for a in x])),3)

In [40]:
def recommendListOfTopBooks(bookTitle):
    similarities = []
    measures = Similarity()
    bookIndex = titleIdDict.get(titleList[3])
    for index, title in enumerate(titleList):
        if(index != bookIndex):
            similarities.append(measures.cosine_similarity(encoding_matrix[bookIndex], encoding_matrix[index]))
    return similarities


In [41]:
def recommendTopBooks(bookTitle):
    similarities = []
    measures = Similarity()
    bookIndex = titleIdDict.get(titleList[3])
    for index, title in enumerate(titleList):
        if(index != bookIndex):
            similarities.append({'score': measures.cosine_similarity(encoding_matrix[bookIndex], encoding_matrix[index]), 'title': titleList[index]})
    return similarities


In [42]:
measures = Similarity()
measures.cosine_similarity(encoding_matrix[0],encoding_matrix[1])

0.643

In [43]:
bookTitle = titleList[0]
print('Book title to recommend: ', bookTitle)
similarities = recommendTopBooks(bookTitle)
booksRecommended = sorted(similarities, key = lambda i: i['score'], reverse=True)
print(booksRecommended[:10])

Book title to recommend:  The Hunger Games (The Hunger Games, #1)
[{'score': 0.836, 'title': 'The Catcher in the Rye'}, {'score': 0.805, 'title': 'The Pursuit of Happyness'}, {'score': 0.797, 'title': 'The Silver Linings Playbook'}, {'score': 0.792, 'title': 'The Phantom Tollbooth'}, {'score': 0.787, 'title': 'Gone with the Wind'}, {'score': 0.784, 'title': 'To Kill a Mockingbird'}, {'score': 0.783, 'title': 'The Dispossessed'}, {'score': 0.78, 'title': 'Wuthering Heights'}, {'score': 0.776, 'title': 'The Perks of Being a Wallflower'}, {'score': 0.773, 'title': 'Pride and Prejudice'}]
