#### Data 612 - Project 2 : Content-Based and Collaborative Filtering<br>Date: June 18, 2019<br>Team Info: 
+ Christina Valore
+ Juliann McEachern 
+ Rajwant Mishra

<h1 align="center">Good Books Recommender System</h1>

## Dataset Selection

Data was obtain from [goodbooks2017](#cite-goodbooks2017).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data from local csv  into pandas dataframe
book_tags = pd.read_csv('https://raw.githubusercontent.com/jemceach/612-group/master/project-2/data/book_tags.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/jemceach/612-group/master/project-2/data/tags.csv')
books = pd.read_csv('https://raw.githubusercontent.com/jemceach/612-group/master/project-2/data/books.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/jemceach/612-group/master/project-2/data/ratings.tar.gz', 
                      compression='gzip')

In [3]:
books[books.book_id == 1].head()


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...


In [4]:
ratings[ratings.book_id== 1].head()

Unnamed: 0,ratings.csv,user_id,book_id,rating
217482,217482.0,2886.0,1.0,5.0
292053,292053.0,6158.0,1.0,5.0
292380,292380.0,3991.0,1.0,4.0
293265,293265.0,5281.0,1.0,5.0
296200,296200.0,5721.0,1.0,5.0


In [5]:
book_tags[book_tags.goodreads_book_id==2767052].head()

Unnamed: 0,goodreads_book_id,tag_id,count
619294,2767052,11557,50755
619295,2767052,8717,35418
619296,2767052,33114,25968
619297,2767052,11743,13819
619298,2767052,10064,12985


In [6]:
tagc = [11557,8717,3314,11743,10064]
tags[tags.tag_id.isin(tagc)]

Unnamed: 0,tag_id,tag_name
3314,3314,atheism-secularism
8717,8717,currently-reading
10064,10064,dystopian
11557,11557,favorites
11743,11743,fiction


#### Data Cleaning

In [7]:
# Clean ratings data
ratings = ratings.drop('ratings.csv', axis=1)
ratings = ratings[:-1].astype(int)

In [8]:
# Clean books data
## select only books writen in english 
filter_list = ['eng', 'en-US', 'en-GB', 'en-CA', 'en']
books_df = books[books.language_code.isin(filter_list)]

## subset columns
books_df = books_df[['book_id', 'goodreads_book_id', 'isbn', 'authors', 'title', 'original_publication_year', 'average_rating']]

## drop 15 occurances of no publication year
books_df = books_df.dropna(axis=0, subset=['original_publication_year'])

## change publication year data type to int
books_df['original_publication_year'] = books_df['original_publication_year'].astype(int)

## join book_tags, tags, and books dataframes
merge_tags = pd.merge(book_tags, tags, on='tag_id')
group_tags = pd.DataFrame(merge_tags.groupby('goodreads_book_id')['tag_name'].apply(lambda x: "%s" % ', '.join(x)))
reindex_tags = group_tags.reset_index().rename({'tag_name':'tags'}, axis=1)
tagged_books = pd.merge(books_df, reindex_tags, on='goodreads_book_id')

## view tagged_books
tagged_books


Unnamed: 0,book_id,goodreads_book_id,isbn,authors,title,original_publication_year,average_rating,tags
0,1,2767052,439023483,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2008,4.34,"to-read, fantasy, favorites, currently-reading..."
1,2,3,439554934,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997,4.44,"to-read, fantasy, favorites, currently-reading..."
2,3,41865,316015849,Stephenie Meyer,"Twilight (Twilight, #1)",2005,3.57,"to-read, fantasy, favorites, currently-reading..."
3,4,2657,61120081,Harper Lee,To Kill a Mockingbird,1960,4.25,"to-read, favorites, currently-reading, young-a..."
4,5,4671,743273567,F. Scott Fitzgerald,The Great Gatsby,1925,3.89,"to-read, favorites, currently-reading, young-a..."
5,6,11870085,525478817,John Green,The Fault in Our Stars,2012,4.26,"to-read, favorites, currently-reading, young-a..."
6,7,5907,618260307,J.R.R. Tolkien,The Hobbit,1937,4.25,"to-read, fantasy, favorites, currently-reading..."
7,8,5107,316769177,J.D. Salinger,The Catcher in the Rye,1951,3.79,"to-read, favorites, currently-reading, young-a..."
8,9,960,1416524797,Dan Brown,"Angels & Demons (Robert Langdon, #1)",2000,3.85,"to-read, fantasy, favorites, currently-reading..."
9,10,1885,679783261,Jane Austen,Pride and Prejudice,1813,4.24,"to-read, favorites, young-adult, fiction, book..."


## Content-Based Filtering 

Individually filter recommendations based on books with similar features.

#### Item profile

We start by creating an item profile for each book which contains features such as authors or tags.

TF-IDF: Term Frequency times Inverse document frequency. 

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Create TF-IDF features matrix and cosine similarity matrix for tags 
TF = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
TFIDF_tag_matrix = TF.fit_transform(tagged_books['tags'])
tag_csm = linear_kernel(TFIDF_tag_matrix, TFIDF_tag_matrix)

# Create TF-IDF features matrix and cosine similarity matrix for authors 
TFIDF_author_matrix = TF.fit_transform(tagged_books['authors'])
author_csm = linear_kernel(TFIDF_author_matrix, TFIDF_author_matrix)

# Create array and indices series for recommender functions
titles = tagged_books['title']
authors = tagged_books['authors']
indices = pd.Series(tagged_books.index, index=tagged_books['title'])

# Recommend books from cosine similarity score of book tags
def tag_recommender(title):
    # Set indices to titles
    idx = indices[title]
    
    # list and sort similarity scores 
    score = list(enumerate(tag_csm[idx]))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    
    # recommend top 5 books 
    top_five = score[1:6]
    book_indices = [i[0] for i in top_five]
    return titles.iloc[book_indices]

# Recommend books from cosine similarity score of authors
def author_recommender(title):
    # Set indices to titles
    idx = indices[title]
    
    # list and sort similarity scores 
    score = list(enumerate(author_csm[idx]))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    
    # recommend top 5 books 
    top_five = score[1:6]
    book_indices = [i[0] for i in top_five]
    return titles.iloc[book_indices]


In [10]:
# Test functions

book_test = ['The Great Gatsby', 'Gone Girl','Twilight (Twilight, #1)','100 Selected Poems']

for i in book_test:
    print('Top 5 Recommendations for:', i, '\n', tag_recommender(i),' \n \n')
    
for i in book_test:
    print('Top 5 Recommendations for:', i, '\n', author_recommender(i),' \n \n')

Top 5 Recommendations for: The Great Gatsby 
 7       The Catcher in the Rye
31             Of Mice and Men
3        To Kill a Mockingbird
27           Lord of the Flies
1492               Ethan Frome
Name: title, dtype: object  
 

Top 5 Recommendations for: Gone Girl 
 219                Dark Places
236              Sharp Objects
2444    The Kind Worth Killing
1288              Pretty Girls
58       The Girl on the Train
Name: title, dtype: object  
 

Top 5 Recommendations for: Twilight (Twilight, #1) 
 49                                Eclipse (Twilight, #3)
46                               New Moon (Twilight, #2)
806                        Midnight Sun (Twilight, #1.5)
713    The Short Second Life of Bree Tanner: An Eclip...
407                  Blood Promise (Vampire Academy, #4)
Name: title, dtype: object  
 

Top 5 Recommendations for: 100 Selected Poems 
 2755    The Poetry of Robert Frost (Collected Poems, C...
1425                The Complete Poems of Emily Dickinson
2308   

In [11]:
tagged_books[tagged_books.book_id.isin([3,2029,12187,1030,1456])]

Unnamed: 0,book_id,goodreads_book_id,isbn,authors,title,original_publication_year,average_rating,tags
2,3,41865,316015849,Stephenie Meyer,"Twilight (Twilight, #1)",2005,3.57,"to-read, fantasy, favorites, currently-reading..."
990,1030,10996342,316126691,Chad Harbach,The Art of Fielding,2011,3.99,"to-read, favorites, currently-reading, young-a..."
1391,1456,2146957,1590388984,Brandon Mull,"Grip of the Shadow Plague (Fablehaven, #3)",2008,4.26,"to-read, fantasy, favorites, currently-reading..."
1924,2029,815309,224081187,Ian McEwan,On Chesil Beach,2007,3.54,"to-read, favorites, currently-reading, fiction..."


## User-User Collaborative Filtering 

## Item-Item Collaborative Filtering 

In [12]:
# Branch ITEM-ITEM
# Change 2

1. Data Sparsity: In case of large number of items, number of items a user has rated reduces to a tiny percentage making the correlation coefficient less reliable
2. User profiles change quickly and the entire system model had to be recomputed which is both time and computationally expensive
To cater to these issues, we will use ITEM-ITEM collaborative filtering.

<b>ITEM-ITEM collaborative filtering</b>
ITEM-ITEM collaborative filtering look for items that are similar to the articles that user has already rated and recommend most similar articles. But what does that mean when we say item-item similarity? In this case we don’t mean whether two items are the same by attribute like Fountain pen and pilot pen are similar because both are pen. Instead, what similarity means is how people treat two items the same in terms of like and dislike.
This method is quite stable in itself as compared to User based collaborative filtering because the average item has a lot more ratings than the average user. So an individual rating doesn’t impact as much.

To calculate similarity between two items, we looks into the set of items the target user has rated and computes how similar they are to the target item i and then selects k most similar items. Similarity between two items is calculated by taking the ratings of the users who have rated both the items and thereafter using the cosine similarity function mentioned below:

Once we have the similarity between the items, the prediction is then computed by taking a weighted average of the target user’s ratings on these similar items. The formula to calculate rating is very similar to the user based collaborative filtering except the weights are between items instead of between users. And we use the current users rating for the item or for other items, instead of other users rating for the current items.

In [13]:
ratings.head(20)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
5,2,26,4
6,2,315,3
7,2,33,4
8,2,301,5
9,2,2686,5


In [14]:
M = pd.pivot_table(ratings,columns='user_id',index='book_id',values='rating',fill_value=0)
M.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,53415,53416,53417,53418,53419,53420,53421,53422,53423,53424
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,4,0,...,0,0,4,5,4,4,4,4,4,4
2,0,5,0,5,0,0,0,0,4,0,...,0,0,0,0,5,5,5,5,5,5
3,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,3,3,0,0,0,4
4,5,0,3,4,0,0,0,3,0,5,...,0,0,0,0,3,0,5,0,5,5
5,0,5,0,4,0,0,3,3,5,5,...,0,0,0,0,3,2,4,0,0,0


In [15]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances

#get cosine similarities for ratings matrix M; pairwise_distances returns the distances between ratings and hence
#similarities are obtained by subtracting distances from 1
cosine_sim = 1-pairwise_distances(M, metric="cosine")
cosine_sim

array([[1.        , 0.5898403 , 0.5611419 , ..., 0.01774628, 0.03473761,
        0.0139493 ],
       [0.5898403 , 1.        , 0.52380869, ..., 0.02297971, 0.0259986 ,
        0.01615337],
       [0.5611419 , 0.52380869, 1.        , ..., 0.00431772, 0.01706002,
        0.00260247],
       ...,
       [0.01774628, 0.02297971, 0.00431772, ..., 1.        , 0.        ,
        0.        ],
       [0.03473761, 0.0259986 , 0.01706002, ..., 0.        , 1.        ,
        0.        ],
       [0.0139493 , 0.01615337, 0.00260247, ..., 0.        , 0.        ,
        1.        ]])

In [16]:
#Cosine similarity matrix
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.000000,0.589840,0.561142,0.448696,0.396913,0.426246,0.400916,0.338818,0.401081,0.369558,...,0.017498,0.017972,0.019893,0.046258,0.021739,0.023686,0.009895,0.017746,0.034738,0.013949
1,0.589840,1.000000,0.523809,0.493046,0.441769,0.322296,0.525491,0.399516,0.440514,0.429215,...,0.020618,0.018916,0.019696,0.041959,0.025115,0.022102,0.010512,0.022980,0.025999,0.016153
2,0.561142,0.523809,1.000000,0.362559,0.309284,0.313530,0.310542,0.265028,0.364219,0.364369,...,0.010293,0.009201,0.008138,0.034482,0.011529,0.029365,0.008328,0.004318,0.017060,0.002602
3,0.448696,0.493046,0.362559,1.000000,0.587586,0.282096,0.425565,0.551808,0.370716,0.479950,...,0.038342,0.010529,0.024342,0.025542,0.038902,0.010172,0.022494,0.019006,0.032705,0.025103
4,0.396913,0.441769,0.309284,0.587586,1.000000,0.257952,0.396766,0.577414,0.331094,0.466008,...,0.040399,0.007803,0.020729,0.017585,0.044162,0.004830,0.028338,0.020741,0.025513,0.029433
5,0.426246,0.322296,0.313530,0.282096,0.257952,1.000000,0.170626,0.205245,0.187138,0.221719,...,0.010348,0.001618,0.010204,0.029168,0.013830,0.005803,0.004595,0.004220,0.027064,0.005820
6,0.400916,0.525491,0.310542,0.425565,0.396766,0.170626,1.000000,0.394625,0.338549,0.372836,...,0.027935,0.031301,0.032459,0.029757,0.029899,0.014056,0.017376,0.039016,0.017203,0.028317
7,0.338818,0.399516,0.265028,0.551808,0.577414,0.205245,0.394625,1.000000,0.309951,0.389588,...,0.043557,0.008383,0.025391,0.012553,0.038535,0.002456,0.019693,0.019412,0.028998,0.029317
8,0.401081,0.440514,0.364219,0.370716,0.331094,0.187138,0.338549,0.309951,1.000000,0.303735,...,0.015303,0.016581,0.014974,0.005684,0.018348,0.009599,0.015609,0.016692,0.013717,0.016419
9,0.369558,0.429215,0.364369,0.479950,0.466008,0.221719,0.372836,0.389588,0.303735,1.000000,...,0.031779,0.003460,0.021111,0.018104,0.029938,0.019453,0.018978,0.018570,0.023723,0.011875


In [19]:

#make necesarry imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import sys, os
from contextlib import contextmanager
global k,metric
k=4
metric='cosine'
#This function finds k similar items given the item_id and ratings matrix M

def findksimilaritems(book_id, ratingsMatrix, metric=metric, k=k):
    similarities=[]
    indices=[]    
    ratings=ratingsMatrix.T
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[book_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print ('{0} most similar items for item {1}:\n'.format(k,book_id))
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == book_id:
            continue;

        else:
            print ('{0}: Item {1} :, with similarity of {2}'.format(i,indices.flatten()[i]+1, similarities.flatten()[i]))


    return similarities,indices

In [20]:
similarities,indices=findksimilaritems(3,M)

4 most similar items for item 3:

1: Item 2029 :, with similarity of 0.23915115445632507
2: Item 12187 :, with similarity of 0.23873427001768688
3: Item 1030 :, with similarity of 0.2236168282056039
4: Item 1456 :, with similarity of 0.21573034819507708


In [21]:
books_df[books_df.book_id.isin([3,2029,12187,1030,1456])]

Unnamed: 0,book_id,goodreads_book_id,isbn,authors,title,original_publication_year,average_rating
2,3,41865,316015849,Stephenie Meyer,"Twilight (Twilight, #1)",2005,3.57
1029,1030,10996342,316126691,Chad Harbach,The Art of Fielding,2011,3.99
1455,1456,2146957,1590388984,Brandon Mull,"Grip of the Shadow Plague (Fablehaven, #3)",2008,4.26
2028,2029,815309,224081187,Ian McEwan,On Chesil Beach,2007,3.54


In [22]:
tagged_books[tagged_books.book_id.isin([3,2029,12187,1030,1456,4081])]

Unnamed: 0,book_id,goodreads_book_id,isbn,authors,title,original_publication_year,average_rating,tags
2,3,41865,316015849,Stephenie Meyer,"Twilight (Twilight, #1)",2005,3.57,"to-read, fantasy, favorites, currently-reading..."
990,1030,10996342,316126691,Chad Harbach,The Art of Fielding,2011,3.99,"to-read, favorites, currently-reading, young-a..."
1391,1456,2146957,1590388984,Brandon Mull,"Grip of the Shadow Plague (Fablehaven, #3)",2008,4.26,"to-read, fantasy, favorites, currently-reading..."
1924,2029,815309,224081187,Ian McEwan,On Chesil Beach,2007,3.54,"to-read, favorites, currently-reading, fiction..."
3747,4081,231,312424442,Tom Wolfe,I am Charlotte Simmons,2004,3.4,"to-read, favorites, currently-reading, young-a..."


In [24]:

## starting new Test with 10K data
test_rating = ratings[1:10000]
test_rating.head(),len(test_rating)

(   user_id  book_id  rating
 1        2     4081       4
 2        2      260       5
 3        2     9296       5
 4        2     2318       3
 5        2       26       4, 9999)

In [25]:
test_rating_item = pd.merge(test_rating,books_df)
test_rating_item.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,isbn,authors,title,original_publication_year,average_rating
0,2,4081,4,231,312424442.0,Tom Wolfe,I am Charlotte Simmons,2004,3.4
1,258,4081,5,231,312424442.0,Tom Wolfe,I am Charlotte Simmons,2004,3.4
2,364,4081,4,231,312424442.0,Tom Wolfe,I am Charlotte Simmons,2004,3.4
3,316,4081,2,231,312424442.0,Tom Wolfe,I am Charlotte Simmons,2004,3.4
4,2,260,5,4865,,Dale Carnegie,How to Win Friends and Influence People,1936,4.13


In [27]:

test_rating_item = test_rating_item[['book_id','title','user_id','rating']]
test_rating_item.head()

Unnamed: 0,book_id,title,user_id,rating
0,4081,I am Charlotte Simmons,2,4
1,4081,I am Charlotte Simmons,258,5
2,4081,I am Charlotte Simmons,364,4
3,4081,I am Charlotte Simmons,316,2
4,260,How to Win Friends and Influence People,2,5


In [28]:
test_M = pd.pivot_table(test_rating_item,index='user_id',columns='title',values='rating',fill_value=0)
test_M.head(10)

title,"Angels (Walsh Family, #3)",'Salem's Lot,"'Tis (Frank McCourt, #2)",100 Selected Poems,1776,1984,"2001: A Space Odyssey (Space Odyssey, #1)","2010: Odyssey Two (Space Odyssey, #2)","2061: Odyssey Three (Space Odyssey, #3)",300,...,"Y: The Last Man, Vol. 8: Kimono Dragons (Y: The Last Man, #8)",Year of Wonders,Yertle the Turtle and Other Stories,You Can Heal Your Life,You Shall Know Our Velocity!,Zami: A New Spelling of My Name,Zen and the Art of Motorcycle Maintenance: An Inquiry Into Values,Zodiac,Zorba the Greek,Zorro
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,3,0,0,0


In [29]:
# find the corealtion in book
# Only for Boook wher atleat 10 Users had read it

test_M_corr = test_M.corr(method='pearson',min_periods=10)

In [30]:
test_M_corr.head()

title,"Angels (Walsh Family, #3)",'Salem's Lot,"'Tis (Frank McCourt, #2)",100 Selected Poems,1776,1984,"2001: A Space Odyssey (Space Odyssey, #1)","2010: Odyssey Two (Space Odyssey, #2)","2061: Odyssey Three (Space Odyssey, #3)",300,...,"Y: The Last Man, Vol. 8: Kimono Dragons (Y: The Last Man, #8)",Year of Wonders,Yertle the Turtle and Other Stories,You Can Heal Your Life,You Shall Know Our Velocity!,Zami: A New Spelling of My Name,Zen and the Art of Motorcycle Maintenance: An Inquiry Into Values,Zodiac,Zorba the Greek,Zorro
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",1.0,-0.009584,-0.008679,-0.003937,-0.005411,-0.02999,-0.007798,-0.006846,-0.005544,-0.007731,...,-0.003937,-0.005579,-0.005522,-0.003937,-0.010845,-0.003937,-0.009511,-0.005411,-0.008733,-0.005411
'Salem's Lot,-0.009584,1.0,-0.021127,-0.009584,0.205876,0.038636,-0.018982,-0.016665,-0.013496,0.259384,...,-0.009584,-0.01358,0.178175,-0.009584,-0.026398,-0.009584,-0.023152,-0.013171,-0.021259,-0.013171
"'Tis (Frank McCourt, #2)",-0.008679,-0.021127,1.0,-0.008679,-0.011928,-0.066112,-0.017191,-0.015092,-0.012222,-0.017043,...,-0.008679,-0.012298,-0.012174,-0.008679,0.150269,-0.008679,-0.020966,-0.011928,-0.019252,-0.011928
100 Selected Poems,-0.003937,-0.009584,-0.008679,1.0,-0.005411,-0.02999,-0.007798,-0.006846,-0.005544,-0.007731,...,-0.003937,-0.005579,-0.005522,-0.003937,-0.010845,-0.003937,-0.009511,-0.005411,-0.008733,-0.005411
1776,-0.005411,0.205876,-0.011928,-0.005411,1.0,-0.041218,-0.010718,-0.009409,-0.00762,0.328068,...,-0.005411,-0.007667,-0.00759,-0.005411,-0.014904,-0.005411,-0.013072,-0.007437,-0.012003,-0.007437


In [31]:
test_M1 =pd.pivot_table(test_rating_item,index='user_id',columns='title',values='rating',fill_value=0)

test_M1.T.head()

user_id,1,2,4,6,8,9,10,11,15,18,...,429,439,440,444,446,447,449,452,453,454
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Angels (Walsh Family, #3)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"'Tis (Frank McCourt, #2)",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100 Selected Poems,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
test_M_corr1 = 1-pairwise_distances(test_M1.T, metric="correlation" )

#Cosine similarity matrix
pd.DataFrame(test_M_corr1).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2203,2204,2205,2206,2207,2208,2209,2210,2211,2212
0,1.0,-0.009584,-0.008679,-0.003937,-0.005411,-0.02999,-0.007798,-0.006846,-0.005544,-0.007731,...,-0.003937,-0.005579,-0.005522,-0.003937,-0.010845,-0.003937,-0.009511,-0.005411,-0.008733,-0.005411
1,-0.009584,1.0,-0.021127,-0.009584,0.205876,0.038636,-0.018982,-0.016665,-0.013496,0.259384,...,-0.009584,-0.01358,0.178175,-0.009584,-0.026398,-0.009584,-0.023152,-0.013171,-0.021259,-0.013171
2,-0.008679,-0.021127,1.0,-0.008679,-0.011928,-0.066112,-0.017191,-0.015092,-0.012222,-0.017043,...,-0.008679,-0.012298,-0.012174,-0.008679,0.150269,-0.008679,-0.020966,-0.011928,-0.019252,-0.011928
3,-0.003937,-0.009584,-0.008679,1.0,-0.005411,-0.02999,-0.007798,-0.006846,-0.005544,-0.007731,...,-0.003937,-0.005579,-0.005522,-0.003937,-0.010845,-0.003937,-0.009511,-0.005411,-0.008733,-0.005411
4,-0.005411,0.205876,-0.011928,-0.005411,1.0,-0.041218,-0.010718,-0.009409,-0.00762,0.328068,...,-0.005411,-0.007667,-0.00759,-0.005411,-0.014904,-0.005411,-0.013072,-0.007437,-0.012003,-0.007437


In [33]:
# Get 5 similar items from the nearest neighbour 
n = 5 
cosine_nn = NearestNeighbors(n_neighbors=n, algorithm='brute',metric='cosine')
item_cosine_nn_fit = cosine_nn.fit(test_M1.T.values)
item_distance, item_indices = item_cosine_nn_fit.kneighbors(test_M1.T.values )

In [45]:
# Cosine distance and Item 
item_distance,item_indices

type(item_distance)
item_distance[1:3],item_indices[1:3]

(array([[0.        , 0.17798027, 0.22217421, 0.2602069 , 0.28073273],
        [0.        , 0.40911613, 0.41654003, 0.41654003, 0.41654003]]),
 array([[   1,  986, 1081, 1496, 1233],
        [   2,  143,  191, 1606, 2002]], dtype=int64))

In [46]:
# Here we are listing all the book and corresponding 5 recommendation 
items_dic = {}

for i in range (len(test_M1.T.index)):
    item_idx = item_indices[i]
    col_names = test_M1.T.index[item_idx].tolist()
    items_dic[test_M1.T.index[i]] = col_names

In [53]:
# List of the Book reccomnded by item title 
items_dic

{' Angels (Walsh Family, #3)': ['The Druid of Shannara (Heritage of Shannara, #2)',
  'The Elfstones of Shannara  (The Original Shannara Trilogy, #2)',
  'Intensity',
  'Rose Madder',
  'The Black Unicorn (Magic Kingdom of Landover, #2)'],
 "'Salem's Lot": ["'Salem's Lot",
  'Night Shift',
  'Pet Sematary',
  'The Dead Zone',
  'Skeleton Crew'],
 "'Tis (Frank McCourt, #2)": ["'Tis (Frank McCourt, #2)",
  "Angela's Ashes (Frank McCourt, #1)",
  'Babyville',
  'The Guy Not Taken: Stories',
  'The Wedding (The Notebook, #2)'],
 '100 Selected Poems': ['A Bargain for Frances',
  '100 Selected Poems',
  'Bread and Jam for Frances',
  'A River Runs Through It and Other Stories',
  'The Neverending Story'],
 '1776': ['1776',
  'Survival in Auschwitz',
  'Mayflower: A Story of Courage, Community, and War',
  'The Rape of Nanking',
  'The Cobra Event'],
 '1984': ['1984',
  'Animal Farm',
  'Brave New World',
  'To Kill a Mockingbird',
  'The Catcher in the Rye'],
 '2001: A Space Odyssey (Space O

In [54]:
# Lets find list of Book read by user

has_read= {}
row_indexes = {}
for i,row in test_M1.iterrows():
    rows = [x for x in range(0,len(test_M1.columns))]
    combine = list(zip(row.index,row.values,rows))
    read = [(x,z) for x,y,z in combine if y!=0]
    index = [i[1] for i in read]
    row_names = [i[0] for i in read]
    row_indexes[i] = index
    has_read[i] = row_names

# https://www.youtube.com/watch?v=dEwl5_QFYNI


In [125]:
has_read.items()

dict_items([(1, ['Balzac and the Little Chinese Seamstress', 'Gilead (Gilead, #1)', 'Housekeeping', 'Never Let Me Go', 'The Book Thief', 'The History of Love', 'The Sea']), (2, ['Harry Potter Collection (Harry Potter, #1-6)', 'Heart of Darkness', 'How to Win Friends and Influence People', 'I am Charlotte Simmons', 'Memoirs of a Geisha', 'The Da Vinci Code (Robert Langdon, #2)', 'The Drama of the Gifted Child: The Search for the True Self', 'The House of God', 'The Millionaire Next Door: The Surprising Secrets of Americas Wealthy', 'Who Moved My Cheese?']), (4, ['1984', 'A Child Called "It" (Dave Pelzer #1)', 'A Confederacy of Dunces', 'A Farewell to Arms', 'A Heartbreaking Work of Staggering Genius', 'A Prayer for Owen Meany', 'A Time to Kill', 'A Wrinkle in Time (A Wrinkle in Time Quintet, #1)', 'Alas, Babylon', 'Alexander and the Terrible, Horrible, No Good, Very Bad Day', 'And Then There Were None', 'Anne of Avonlea (Anne of Green Gables, #2)', 'Atlas Shrugged', 'Black Beauty', 'Ble

In [97]:
type(row_indexes),row_indexes.items()

(dict,
 dict_items([(1, [197, 581, 688, 977, 1405, 1616, 1875]), (2, [633, 647, 692, 705, 899, 1486, 1518, 1629, 1737, 2162]), (4, [5, 21, 25, 36, 44, 68, 84, 93, 107, 108, 142, 154, 185, 224, 233, 255, 265, 294, 297, 306, 337, 355, 389, 393, 468, 492, 507, 510, 635, 637, 638, 639, 640, 641, 642, 647, 756, 761, 834, 849, 888, 895, 899, 984, 1015, 1111, 1236, 1261, 1269, 1271, 1279, 1337, 1342, 1364, 1366, 1412, 1419, 1436, 1445, 1464, 1480, 1482, 1486, 1556, 1563, 1577, 1578, 1580, 1587, 1593, 1596, 1671, 1709, 1730, 1797, 1801, 1810, 1829, 1836, 1891, 1928, 1935, 1936, 1973, 1976, 2025, 2032, 2056, 2063, 2138, 2153, 2154]), (6, [672]), (8, [42, 132, 150, 255, 349, 372, 423, 554, 618, 896, 905, 925, 1003, 1051, 1462, 1472, 1636, 1734, 1820, 1848, 1874, 1966, 2108]), (9, [5, 68, 97, 144, 150, 186, 200, 212, 243, 255, 258, 266, 267, 334, 436, 463, 484, 663, 669, 703, 722, 849, 895, 899, 916, 940, 963, 1075, 1109, 1111, 1173, 1200, 1212, 1218, 1219, 1256, 1381, 1419, 1451, 1478, 1485, 148

In [63]:
item_indices[1:5]

array([[   1,  986, 1081, 1496, 1233],
       [   2,  143,  191, 1606, 2002],
       [  11,    3,  256,   69, 1764],
       [   4, 1303,  894, 1837, 1453]], dtype=int64)

In [102]:
# Sample of data 
# item_indexs = (454, [5, 83, 215, 302, 437, 1031, 1707, 2063, 2098])  i.e. User and Book ID of the read books
# item_distance = (array([[0.        , 0.17798027, 0.22217421, 0.2602069 , 0.28073273],
#         [0.        , 0.40911613, 0.41654003, 0.41654003, 0.41654003]]),
# item_indices =  array([[   1,  986, 1081, 1496, 1233],
#         [   2,  143,  191, 1606, 2002]], dtype=int64))
#-----------------------------------------------------------------------------------------------------------------
# We will read all Book read by User 454 , and find the similar book from item_indices and item_distance
# Then we will remove all the books already read by user 454 and sort the result to store in final recommendation 

top_rec = {}
# Find the Item close to the Item already read 
# Get the read item and find the distance from the Item_disatnce for the book already read 
for k, v in row_indexes.items():
    item_idx = [j for i in item_indices[v] for j in i]
    item_dist = [j for i in item_distance[v] for j in i]
    # Put this info in one list
    combine = list(zip(item_dist,item_idx))
    # Keep out the already read Book
    diction = {i:d for d,i in combine  if i not in v}
    zipped = list(zip(diction.keys(),diction.values()))
    #sorting our result so that we have most similar item on the top 
    sort = sorted(zipped,key=lambda x: x[1])  
    # to get the actaul movie name , pass the actual user-item matrim 
    # test_M.columns[2116] 'Vernon God Little'
    recommendations = [(test_M.columns[i],d) for i,d in sort]
    idp = [(i,d) for i,d in sort ] 
#     print(item_idx)
    top_rec[k]= recommendations
#     print('Combine:',item_idx)
#     print('Only Note Read:',recommendations)
#     print('Item dis:',idp)
          


In [126]:
top_rec.items()
len(test_M.index)

['Harry Potter Collection (Harry Potter, #1-6)',
 'Heart of Darkness',
 'How to Win Friends and Influence People',
 'I am Charlotte Simmons',
 'Memoirs of a Geisha',
 'The Da Vinci Code (Robert Langdon, #2)',
 'The Drama of the Gifted Child: The Search for the True Self',
 'The House of God',
 'The Millionaire Next Door: The Surprising Secrets of Americas Wealthy',
 'Who Moved My Cheese?']

In [127]:
2 in test_M.index # check if User exist in Record

True

In [143]:
has_read[2] # list all the Book Rread
print("Boork Read so far: \n{}".format('\n'.join(has_read[2])))


format('\n'.join(has_read[2]))

Boork Read so far: 
Harry Potter Collection (Harry Potter, #1-6)
Heart of Darkness
How to Win Friends and Influence People
I am Charlotte Simmons
Memoirs of a Geisha
The Da Vinci Code (Robert Langdon, #2)
The Drama of the Gifted Child: The Search for the True Self
The House of God
The Millionaire Next Door: The Surprising Secrets of Americas Wealthy
Who Moved My Cheese?


'Harry Potter Collection (Harry Potter, #1-6)\nHeart of Darkness\nHow to Win Friends and Influence People\nI am Charlotte Simmons\nMemoirs of a Geisha\nThe Da Vinci Code (Robert Langdon, #2)\nThe Drama of the Gifted Child: The Search for the True Self\nThe House of God\nThe Millionaire Next Door: The Surprising Secrets of Americas Wealthy\nWho Moved My Cheese?'

In [164]:
# build Final Recommendation 
def get_book_recommendation(user,number_of_rec=5):
    if user in test_M.index :
        print("Boork Read so far: \n\n{}".format('\n'.join(has_read[user])))
        print()
        print("\n\nTOP 5 RECOMMENDATION:")
        # Get the Move name along with similarity score 
        for k,v in top_rec.items():
            if user == k:
                for i in v[:number_of_rec]:
                    print('{} with similarlity: {:.4f}'.format(i[0],1-i[1]))
    else:
        print(" Sorry user is not found")
    

In [165]:
get_book_recommendation(2)


Boork Read so far: 

Harry Potter Collection (Harry Potter, #1-6)
Heart of Darkness
How to Win Friends and Influence People
I am Charlotte Simmons
Memoirs of a Geisha
The Da Vinci Code (Robert Langdon, #2)
The Drama of the Gifted Child: The Search for the True Self
The House of God
The Millionaire Next Door: The Surprising Secrets of Americas Wealthy
Who Moved My Cheese?



TOP 5 RECOMMENDATION:
Plainsong (Plainsong, #1) with similarlity: 0.6402
Random Family: Love, Drugs, Trouble, and Coming of Age in the Bronx with similarlity: 0.6402
Tender at the Bone: Growing Up at the Table with similarlity: 0.6402
One Good Turn (Jackson Brodie, #2) with similarlity: 0.6402
A Supposedly Fun Thing I'll Never Do Again:  Essays and Arguments with similarlity: 0.6247


In [178]:
# Predict Rating of books
item_distance_p = 1 - item_distance  # actaul similarlity 
test_predction = item_distance_p.T.dot(test_M.T.values)/np.array([np.abs(item_distance_p.T).sum(axis = 1)]).T
real_rating = test_M.T.values[item_distance_p.argsort()[0]]
test_predction
item_distance_p.T.dot(test_M.T.values)


array([[ 23.        ,  43.        , 364.        , ...,  13.        ,
        244.        ,  42.        ],
       [ 12.84371217,  30.60439691, 222.17943689, ...,   7.15978527,
        149.09282941,  22.07803942],
       [ 12.77437879,  28.78289484, 207.62542351, ...,   6.49645683,
        141.98679436,  20.30402605],
       [ 12.74387464,  25.70639651, 195.08457672, ...,   6.43217442,
        135.55939244,  19.68470576],
       [ 12.15220253,  24.04089531, 187.81210418, ...,   6.40233125,
        129.96588445,  18.14283567]])

In [179]:
np.array([np.abs(item_distance_p.T).sum(axis = 1)]).T

array([[2213.        ],
       [1846.79331475],
       [1773.92030933],
       [1711.06318192],
       [1655.84608644]])

### RMSE Calculation

In [193]:
def rmse(prediction,real):
    prediction = prediction[real.nonzero()].flatten()
    real= real[real.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction,real))
    

In [199]:
error_rate = rmse(test_predction,real_rating)
print("Accuracy:{:.3f}".format(100-error_rate))
print("RMSE: {:.3f}".format(error_rate))
error_rate

Accuracy:96.353
RMSE: 3.647


3.6470512670697888

## Summary
Please provide at least one graph, and a textual summary of your findings and recommendations. 

## Sources

**To do: figure out jupyter nbconvert citations**

http://fastml.com/goodbooks-10k-a-new-dataset-for-book-recommendations/

@article{goodbooks2017,
    author = {Zajac, Zygmunt},
    title = {Goodbooks-10k: a new dataset for book recommendations},
    year = {2017},
    publisher = {FastML},
    journal = {FastML},
    howpublished = {\url{http://fastml.com/goodbooks-10k}},
},
