In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = 'keywords1.csv'

model_data = pd.read_csv(data)
model_data.head()

Unnamed: 0,book_title,keywords
0,the_hunger_games,suzanne_collins katniss winning hunger lottery...
1,harry_potter_and_the_order_of_the_phoenix,j.k._rowling mary_grandpré harry hogwarts terr...
2,to_kill_a_mockingbird,harper_lee mockingbird literature pulitzer nov...
3,twilight,stephenie_meyer cullen vampire vampires bella ...
4,the_book_thief,markus_zusak liesel jewish german book war


In [3]:
model_data[model_data["book_title"]=="champion"]

Unnamed: 0,book_title,keywords
606,champion,marie_lu june marie trilogy plague republic


In [4]:
tfidf = TfidfVectorizer(analyzer = 'word',
                        min_df=1,
                        max_df = 0.99,
                        stop_words="english",
                        encoding = 'utf-8', 
                        token_pattern=r"(?u)\S\S+")
tfidf_encoding = tfidf.fit_transform(model_data["keywords"])
print(tfidf.get_feature_names_out()[1:100])
print(tfidf_encoding.shape)

['1666' '1726' '1776' '1815' '1818' '1832' '1847' '1850s' '1854' '1859'
 '1879' '1895' '1903' '1914' '1920s' '1921' '1922' '1925' '1939' '1940s'
 '1941' '1942' '1943' '1944' '1954' '1956' '1957' '1959' '1960' '1981'
 '1984' '1988' '2045' '451' '747' 'a.a._milne' 'a.s._byatt' 'a.w._wheen'
 'aarons' 'abandoned' 'abbey' 'abby' 'abducted' 'abel' 'abernathy'
 'abhorsen' 'abigail' 'abilities' 'abortion' 'abraham_verghese' 'abridged'
 'absalom' 'absurd' 'abuse' 'ace' 'acheron' 'achieve' 'achilles' 'ackroyd'
 'actor' 'adam' 'adam_long' 'adams' 'addiction' 'adolescence' 'adolescent'
 'adored' 'adrienne' 'adventure' 'adventurenarnia' 'adventurer'
 'adventures' 'advice' 'aelin' 'aeneas' 'aeneid' 'affair' 'affairs'
 'afghan' 'afghanistan' 'africa' 'african' 'afterlife' 'afterworld'
 'agatha' 'agatha_christie' 'ages' 'agrarian' 'aiden' 'airman' 'airplanes'
 'aislinn' 'aladdin' 'alagaësia' 'alan_moore' 'alan_myers' 'alan_paton'
 'alan_r._clarke' 'alan_shelston']
(993, 3661)


In [7]:
## Find most books with similar words in a given text query

def comp_description(query, results_number=3):
        results=[]
        q_vector = tfidf.transform([query])
        print("Comparable Description: ", query)
        results.append(cosine_similarity(q_vector, tfidf_encoding.toarray()))
        elem_list=[]
        for i in results[:10]:
            for elem in i[0]:
                    #print("Review",f, "Similarity: ", elem)
                    elem_list.append(elem)
                    
        for i in range(results_number):
            print("Most relevant to the query is Book #" ,elem_list.index(max(elem_list)))
            print("Similarity: ", max(elem_list))
            if sum(elem_list) / len(elem_list)==0.0:
                print("No similar descriptions")
            else:
                print(model_data['book_title'].loc[elem_list.index(max(elem_list)):elem_list.index(max(elem_list))], '\n')
                elem_list.pop(elem_list.index(max(elem_list)))

In [8]:
query = "I want to read a Jane Austen book that have Greek mythology components."
comp_description(query)

Comparable Description:  I want to read a Jane Austen book that have Greek mythology components.
Most relevant to the query is Book # 353
Similarity:  0.33206672352509137
353    mansfield_park
Name: book_title, dtype: object 

Most relevant to the query is Book # 606
Similarity:  0.30158239497264594
606    champion
Name: book_title, dtype: object 

Most relevant to the query is Book # 341
Similarity:  0.1964231255004696
341    the_mark_of_athena
Name: book_title, dtype: object 



In [8]:
import pickle

In [10]:
# save model
with open('model.pickle', 'wb') as f:
    pickle.dump(tfidf, f)