In [2]:
# Import Pandas
import pandas as pd

# Load the books data.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
books_df = pd.read_csv('data/books/books.csv', low_memory=False)

In [3]:
# Collapse the raw rows into one row per distinct
# (title, summary, author, year, category) combination; `counts` records how
# many raw rows shared that combination.
smaller_selection = (
    books_df
    .groupby(['book_title', 'Summary', 'book_author', 'year_of_publication', 'Category'])
    .size()
    .reset_index(name='counts')
    # Keep combinations that occur more than 8 times and whose Summary is not
    # the literal string '9' (presumably a placeholder for a missing summary
    # -- TODO confirm against the data source).
    .loc[lambda grouped: (grouped['counts'] > 8) & (grouped['Summary'] != '9')]
    # No drop=True here: the pre-filter row labels are kept as an extra
    # `index` column, and the frame gets a fresh 0..n-1 index.
    .reset_index()
)
smaller_selection.shape

(13132, 7)

In [5]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure every summary is a string: missing values become '' so the
# vectorizer never receives NaN.
smaller_selection['Summary'] = smaller_selection['Summary'].fillna('')

# Turn the summaries into TF-IDF vectors (one row per book, one column per
# vocabulary term), ignoring scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(smaller_selection['Summary'])

tfidf_matrix.shape


(13132, 28789)

In [6]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity between every pair of book summaries.
# TfidfVectorizer L2-normalises each row by default (norm='l2'), so the plain
# dot product computed by linear_kernel is already the cosine similarity.
# NOTE: the result is a dense n_books x n_books array; for the 13132 rows
# seen here that is roughly 1.4 GB if the values are float64.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
# Inspect the similarity matrix: entry [i, j] compares summary i with
# summary j, so the matrix is symmetric.
cosine_sim

array([[1.        , 0.05862086, 0.        , ..., 0.        , 0.02489951,
        0.        ],
       [0.05862086, 1.        , 0.        , ..., 0.        , 0.04009198,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.02489951, 0.04009198, 0.        , ..., 0.        , 1.        ,
        0.01775478],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01775478,
        1.        ]])

In [18]:
# Reverse lookup: book title -> row label in smaller_selection.
indices = pd.Series(smaller_selection.index, index=smaller_selection['book_title'])

# Series.drop_duplicates() compares *values*, and the values here are the
# already-unique row labels, so it would not remove anything. Titles that occur
# in more than one row must be de-duplicated on the index instead (the first
# occurrence wins) so that indices[title] always yields a single scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [19]:
# Inspect the title -> row-label lookup built above.
indices

book_title
 Good Wives: Image and Reality in the Lives of Women in Northern New England, 1650-1750        0
 This Place Has No Atmosphere (Laurel-Leaf Books)                                              1
'Salem's Lot                                                                                   2
01-01-00: The Novel of the Millennium                                                          3
1,000 Places to See Before You Die                                                             4
                                                                                           ...  
my three girls  (count on a cop)                                                           13127
one hundred years of solitude                                                              13128
together by christmas                                                                      13129
why I'm like this : True Stories                                                           13130
Â¿QuÃ© me quieres, 

In [20]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the books whose summaries are most similar to that of ``title``.

    Parameters
    ----------
    title : str
        Exact ``book_title`` to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``smaller_selection``. Defaults to the module-level ``cosine_sim``
        as it was when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``smaller_selection``, most similar first.
        The queried row itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # The same title may appear in more than one row, in which case the label
    # lookup yields a Series rather than a scalar; fall back to the first match.
    match = indices[title]
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Similarity of the queried book to every book (itself included).
    scores = cosine_sim[idx]

    # Rank all *other* rows by similarity, highest first. The queried row is
    # excluded by position instead of assuming it sorts first: a different row
    # with an identical summary also scores 1.0 and can be ordered ahead of it,
    # which would make the book recommend itself.
    candidates = (i for i in range(len(scores)) if i != idx)
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    # Positional lookup of the best-scoring rows.
    return smaller_selection.iloc[ranked[:top_n]]

In [24]:
# Example query. The title has to match a `book_title` value exactly because
# the function looks it up as an index label.
get_recommendations('The Da Vinci Code')

Unnamed: 0,index,book_title,Summary,book_author,year_of_publication,Category,counts
9683,204392,The Da Vinci Code (Random House Large Print),Harvard symbologist Robert Langdon and French ...,DAN BROWN,2003.0,['Fiction'],9
9684,204396,The Da Vinci Legacy,"The Da Vinci Legacy First published in 1983, T...",Lewis Perdue,2004.0,['Fiction'],27
1072,21655,Baggage,"Inseparable throughout high school, Sophie and...",Emily Barr,2003.0,['Fiction'],12
2223,45135,Cracking the Da Vinci Code : The Unauthorized ...,Uses an A to Z format to answer questions abou...,Simon Cox,2004.0,['Religion'],11
6213,136088,Moon Women,Ruth Ann&#39;s life is turned upside down when...,Pamela Duncan,2001.0,['Fiction'],17
3384,71641,Family Secrets: Pyramid of Lies,Cryptologist Gretchen Wagner embarks on a desp...,Anne Marie Winston,2003.0,['Fiction'],9
2766,57743,Digital Fortress : A Thriller,"Before the multi-million, runaway bestseller T...",Dan Brown,2000.0,['Fiction'],85
11433,232200,The Solitaire Mystery,The author of the best-selling Sophie&#39;s Wo...,Jostein Gaarder,1997.0,['Fiction'],23
3634,76495,Fool's Puzzle,Benni Harper starts over as curator of a folk-...,Earlene Fowler,1995.0,['Fiction'],34
6953,150464,Others,"Hired to find a baby stolen at birth, Nicholas...",James Herbert,2000.0,['Fiction'],11
