In [88]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import pickle
import json

In [43]:
def parse_fields(line):
    return {
        "book_id": line[0],
        "title": line[1],
        "cover_image": line[7]
}

In [44]:
books_titles = []
with open('Books.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    row_count = 0
    for row in csv_reader:
        row_count += 1
        if row_count == 1:
            continue
        fields = parse_fields(row)
        books_titles.append(fields)
        
        

In [45]:
books_titles

[{'book_id': '0195153448',
  'title': 'Classical Mythology',
  'cover_image': 'http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg'},
 {'book_id': '0002005018',
  'title': 'Clara Callan',
  'cover_image': 'http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg'},
 {'book_id': '0060973129',
  'title': 'Decision in Normandy',
  'cover_image': 'http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg'},
 {'book_id': '0374157065',
  'title': 'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
  'cover_image': 'http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg'},
 {'book_id': '0393045218',
  'title': 'The Mummies of Urumchi',
  'cover_image': 'http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg'},
 {'book_id': '0399135782',
  'title': "The Kitchen God's Wife",
  'cover_image': 'http://images.amazon.com/images/P/0399135782.01.LZZZZZZZ.jpg'},
 {'book_id': '0425176428',
  'title': "What If?: The World's Fo

In [46]:
titles = pd.DataFrame.from_dict(books_titles)

In [47]:
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [48]:
titles["mod_title"] = titles["mod_title"].str.lower()

In [49]:
titles["mod_title"] = titles["mod_title"].str.replace("\\s+", " ", regex=True)

In [50]:
titles = titles[titles["mod_title"].str.len() > 0]

In [51]:
titles.to_json("books_titles.json")

In [52]:
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])

In [53]:
def show_image(url):
    return '<img src="{}" width=50></img>'.format(url)

In [54]:
def search(query, vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]
    return results.head(10).style.format({'cover_image': show_image})

In [55]:
def search_books(query, vectorizer, titles_df, n_results=10):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -n_results*3)[-n_results*3:]
    
    results = titles_df.iloc[indices].copy()
    results['similarity'] = similarity[indices]
    results = results.sort_values('similarity', ascending=False)
    
    results = results.drop_duplicates(subset=['title'], keep='first')
    
    results = results[results['title'].isin(available_books_set)]
    
    return results.head(n_results)

In [56]:
search("Perks of Being a Wallflower", vectorizer)

Unnamed: 0,book_id,title,cover_image,mod_title
196926,0312262140,"One Mykonos: Being Ancient, Being Islands, Being Giants, Being Gay",,one mykonos being ancient being islands being giants being gay
255106,0395300673,A Way of Being,,a way of being
73455,0451204409,The Rake and the Wallflower (Signet Regency Romance),,the rake and the wallflower signet regency romance
31442,0373707908,"Wallflower (Harlequin Superromance , No 790)",,wallflower harlequin superromance no 790
76041,051510843X,Wallflower: A Janek Novel,,wallflower a janek novel
223777,0736649360,The Perks of Being a Wallflower,,the perks of being a wallflower
203107,0843926430,Being,,being
239520,0553131192,Wallflower at the Orgy,,wallflower at the orgy
81914,0679400478,Wallflower,,wallflower
1150,0671027344,The Perks of Being a Wallflower,,the perks of being a wallflower


In [57]:
liked_books = ["0446675504", "0517542099", "0671027344"]

In [58]:
try:
    with open('book_recommender_model.pkl', 'rb') as f:
        data = pickle.load(f)
    model = data['model']
    df = data['df']
    df_books = data['df_books']
    df_ratings_rm = data['df_ratings_rm']
except FileNotFoundError:
    print("Model not found!")

available_books = sorted(df.index.unique().tolist())

def get_recommends(title = ""):
    try:
        book_data = df.loc[title]
    except KeyError as e:
        print(f'The given book "{title}" does not exist')
        return None

    if book_data.ndim == 2:
        book_values = book_data.iloc[0].values.reshape(1, -1)
    else:
        book_values = book_data.values.reshape(1, -1)
    
    if book_values.ndim != 2:
        book_values = book_values.reshape(1, -1)
    
    distance, indice = model.kneighbors(book_values, n_neighbors=6)
    
    recommended_books = pd.DataFrame({
        'title': df.iloc[indice[0]].index.values,
        'distance': distance[0]
    }).sort_values(by='distance', ascending=True)
    
    recommended_books = recommended_books[recommended_books['distance'] > 0.01].head(5).values
    
    return [title, recommended_books]

available_books_set = set(df.index.unique())

In [98]:
test_queries = {
    "Perks of Wallflower": "The Perks of Being a Wallflower",
    "Hitchhikers guide": "The Hitchhiker's Guide to the Galaxy",
    "mockingbird kill": "To Kill a Mockingbird",
}

def evaluate_search_performance(test_queries, k=5):
    results = {}
    for query, expected_book in test_queries.items():
        search_results = search_books(query, vectorizer, titles, n_results=k)
        retrieved_books = search_results['title'].tolist()
        
        found = expected_book in retrieved_books
        
        results[query] = {
            'found': found,
            'retrieved': retrieved_books[:3],
            'expected': expected_book
        }
    return results

In [99]:
print (json.dumps(evaluate_search_performance(test_queries), indent=4))

{
    "Perks of Wallflower": {
        "found": true,
        "retrieved": [
            "The Perks of Being a Wallflower"
        ],
        "expected": "The Perks of Being a Wallflower"
    },
    "Hitchhikers guide": {
        "found": true,
        "retrieved": [
            "The Hitchhiker's Guide to the Galaxy"
        ],
        "expected": "The Hitchhiker's Guide to the Galaxy"
    },
    "mockingbird kill": {
        "found": true,
        "retrieved": [
            "To Kill a Mockingbird"
        ],
        "expected": "To Kill a Mockingbird"
    }
}
