In [1]:
import numpy as np
import pandas as pd
import re
import nltk

In [2]:
text = '''Today the embedded systems are ubiquitous in occurrence, most significant in function and project an absolutely promising picture of developments in the near future.
The Merchant Of Venice is one of Shakespeare's best known plays.
How To Be A Domestic Goddess: Baking and the Art of Comfort Cooking is a bestselling cookbook by the famous chef Nigella Lawson who aims to introduce the art of baking through text with an emphasis.
Lose A Kilo A Week is a detailed diet and weight loss plan, and also shows how to maintain the ideal weight after reaching it.
Few Things Left Unsaid is a story of love, romance, and heartbreak.'''

In [42]:
sentences = nltk.sent_tokenize(text)

In [43]:
sentences
sent_df = pd.DataFrame(sentences,columns=['desc'])
sent_df

Unnamed: 0,desc
0,Today the embedded systems are ubiquitous in o...
1,The Merchant Of Venice is one of Shakespeare's...
2,How To Be A Domestic Goddess: Baking and the A...
3,Lose A Kilo A Week is a detailed diet and weig...
4,"Few Things Left Unsaid is a story of love, rom..."


In [44]:
import string
stopwords = nltk.corpus.stopwords.words('english')
def preprocess(text):
    '''Combination of all the necassary text preprocessing'''
    #text = ''.join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    tokens = [word.lower() for word in tokens]
    #text = " ".join([word for word in tokens if word not in stopwords])
    
    return text
sent_df['cleaned'] = sent_df['desc'].apply(lambda x: preprocess(x))
sent_df

Unnamed: 0,desc,cleaned
0,Today the embedded systems are ubiquitous in o...,Today the embedded systems are ubiquitous in o...
1,The Merchant Of Venice is one of Shakespeare's...,The Merchant Of Venice is one of Shakespeare's...
2,How To Be A Domestic Goddess: Baking and the A...,How To Be A Domestic Goddess: Baking and the A...
3,Lose A Kilo A Week is a detailed diet and weig...,Lose A Kilo A Week is a detailed diet and weig...
4,"Few Things Left Unsaid is a story of love, rom...","Few Things Left Unsaid is a story of love, rom..."


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords)
features = vectorizer.fit_transform(sent_df['cleaned'])
print(features.shape)
print(vectorizer.get_feature_names())

(5, 57)
['absolutely', 'aims', 'also', 'art', 'baking', 'best', 'bestselling', 'chef', 'comfort', 'cookbook', 'cooking', 'detailed', 'developments', 'diet', 'domestic', 'embedded', 'emphasis', 'famous', 'function', 'future', 'goddess', 'heartbreak', 'ideal', 'introduce', 'kilo', 'known', 'lawson', 'left', 'lose', 'loss', 'love', 'maintain', 'merchant', 'near', 'nigella', 'occurrence', 'one', 'picture', 'plan', 'plays', 'project', 'promising', 'reaching', 'romance', 'shakespeare', 'shows', 'significant', 'story', 'systems', 'text', 'things', 'today', 'ubiquitous', 'unsaid', 'venice', 'week', 'weight']


In [46]:
features_df = pd.DataFrame(features.toarray())

In [47]:
features_df.columns = vectorizer.get_feature_names()
features_df

Unnamed: 0,absolutely,aims,also,art,baking,best,bestselling,chef,comfort,cookbook,...,story,systems,text,things,today,ubiquitous,unsaid,venice,week,weight
0,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.267261,0.0,0.0,0.267261,0.267261,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.377964,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377964,0.0,0.0
2,0.0,0.213201,0.0,0.426401,0.426401,0.0,0.213201,0.213201,0.213201,0.213201,...,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.377964,0.0,0.0,0.377964,0.0,0.0,0.377964,0.0,0.0,0.0


In [48]:
book_names = '''How to Be a Domestic Goddess : Baking and the Art of Comfort Cooking (Paperback)
Embedded / Real-Time Systems 1st Edition (Paperback)
The Merchant of Venice (Paperback)
Lose a Kilo a Week (Paperback)
Few Things Left Unsaid (Paperback)'''

In [49]:
book_names = book_names.split('\n')

In [50]:
book_names_df = pd.DataFrame(book_names,columns=['book names'])

In [56]:
book_names_df
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [52]:
name_features = vectorizer.transform(book_names_df['book names'])

In [53]:
name_features_df = pd.DataFrame(name_features.toarray())
name_features_df.columns = vectorizer.get_feature_names()
name_features_df

Unnamed: 0,absolutely,aims,also,art,baking,best,bestselling,chef,comfort,cookbook,...,story,systems,text,things,today,ubiquitous,unsaid,venice,week,weight
0,0.0,0.0,0.0,0.408248,0.408248,0.0,0.0,0.0,0.408248,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0


In [54]:
desc_vectors = features.toarray()
book_vectors = name_features.toarray()

x = np.dot(book_vectors,desc_vectors.T)
list(np.argmax(x,axis=0))

[1, 2, 0, 3, 4]

In [55]:
answers = []
for desc in desc_vectors:
    max_sim = 0.0
    for i,book in enumerate(book_vectors):
        similarity = np.dot(book,desc)
        if similarity>max_sim:
            max_sim = similarity
            answers.append(i+1)
    
        
answers        

[2, 3, 1, 4, 5]