In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = 'keywords3.csv'

model_data = pd.read_csv(data)
model_data.head()

Unnamed: 0,book_authors,book_title,image_url,book_authors_init,book_title_init,keywords
0,suzanne_collins,the_hunger_games,https://images.gr-assets.com/books/1447303603l...,Suzanne Collins,The Hunger Games,suzanne_collins representative katniss katniss...
1,j.k._rowling mary_grandpré,harry_potter_and_the_order_of_the_phoenix,https://images.gr-assets.com/books/1255614970l...,J.K. Rowling|Mary GrandPré,Harry Potter and the Order of the Phoenix,j.k._rowling mary_grandpré haunting harry terr...
2,harper_lee,to_kill_a_mockingbird,https://images.gr-assets.com/books/1361975680l...,Harper Lee,To Kill a Mockingbird,harper_lee kill mockingbird mockingbird mockin...
3,stephenie_meyer,twilight,https://images.gr-assets.com/books/1361039443l...,Stephenie Meyer,Twilight,stephenie_meyer edward vampire edward cullen c...
4,markus_zusak,the_book_thief,https://images.gr-assets.com/books/1522157426l...,Markus Zusak,The Book Thief,markus_zusak german girl story liesel girl boo...


In [3]:
model_data[model_data["book_title"]=="champion"]

Unnamed: 0,book_authors,book_title,image_url,book_authors_init,book_title_init,keywords
606,marie_lu,champion,https://images.gr-assets.com/books/1382652310l...,Marie Lu,Champion,marie_lu june knows champion june suspense mar...


In [4]:
tfidf = TfidfVectorizer(analyzer = 'word',
                        min_df=1,
                        max_df = 0.99,
                        stop_words="english",
                        encoding = 'utf-8', 
                        token_pattern=r"(?u)\S\S+")
tfidf_encoding = tfidf.fit_transform(model_data["keywords"])
print(tfidf.get_feature_names_out()[1:100])
print(tfidf_encoding.shape)

['12' '12ashton' '1327' '1351' '1666' '1726' '1767' '1771' '1815' '1819'
 '1832' '1847' '1850s' '1854' '1859' '1870s' '1878' '1879' '1890' '1895'
 '1903' '1904' '1905' '1914' '1920s' '1921' '1922' '1936' '1939' '1940s'
 '1941' '1942' '1944' '1945' '1946' '1949' '1950s' '1953' '1956' '1957'
 '1960' '1963' '1970' '1980s' '1981' '1984' '1986' '1988' '1991' '2019'
 '2045' '28' '34' '451' '5th' '747' 'a.a._milne' 'a.s._byatt' 'a.w._wheen'
 'aarons' 'ababa' 'abandoned' 'abbey' 'abby' 'abducted' 'abducts' 'abel'
 'abernathy' 'abhorsen' 'abigail' 'abilities' 'ability' 'able'
 'abnormality' 'abortion' 'abraham_verghese' 'abridged' 'absalom' 'absurd'
 'abyss' 'accident' 'acclaimed' 'according' 'account' 'accounts' 'ace'
 'acheron' 'achilles' 'achingly' 'ackroyd' 'activism' 'actually' 'adam'
 'adam_long' 'adams' 'addiction' 'addie' 'addresses' 'ado']
(993, 4360)


In [12]:
## Find most books with similar words in a given text query

def comp_description(query, results_number=3):
        results=[]
        q_vector = tfidf.transform([query])
        print("Comparable Description: ", query)
        results.append(cosine_similarity(q_vector, tfidf_encoding.toarray()))
        elem_list=[]
        for i in results[:10]:
            for elem in i[0]:
                    #print("Review",f, "Similarity: ", elem)
                    elem_list.append(elem)
                    
        for i in range(results_number):
            print("Most relevant to the query is Book #" ,elem_list.index(max(elem_list)))
            print("Similarity: ", max(elem_list))
            if sum(elem_list) / len(elem_list)==0.0:
                print("No similar descriptions")
            else:
                print(model_data['book_title'].loc[elem_list.index(max(elem_list)):elem_list.index(max(elem_list))], '\n')
                elem_list.pop(elem_list.index(max(elem_list)))

In [13]:
query = "I want to read a Jane Austen book that have Greek mythology components."
comp_description(query)

Comparable Description:  I want to read a Jane Austen book that have Greek mythology components.
Most relevant to the query is Book # 441
Similarity:  0.41706009882167416
441    northanger_abbey
Name: book_title, dtype: object 

Most relevant to the query is Book # 559
Similarity:  0.336002491326702
559    city_of_glass
Name: book_title, dtype: object 

Most relevant to the query is Book # 605
Similarity:  0.3261794723240366
605    lover_unbound
Name: book_title, dtype: object 



In [8]:
import pickle

In [10]:
# save model
with open('model.pickle', 'wb') as f:
    pickle.dump(tfidf, f)