In [None]:
# BOOK RECOMMENDATION SYSTEM
# Collaborative filtering - o filtro basado en popularidad
#1. Preparar datos
#2. Pensar en modelos 

In [28]:
import pandas as pd
import numpy as np

users = pd.read_csv('Users.csv')
books = pd.read_csv('Books.csv', low_memory=False, dtype={
'Image-URL-S': str, 'Image-URL-M':str, 'Image-URL-L':str
})
ratings = pd.read_csv('Ratings.csv')


In [29]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [30]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [31]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [35]:
book_ratings = books.merge(ratings, on = 'ISBN')
user_rating = users.merge(ratings, on = 'User-ID')

In [36]:
user_rating.head()

Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating
0,2,"stockton, california, usa",18.0,195153448,0
1,7,"washington, dc, usa",,34542252,0
2,8,"timmins, ontario, canada",,2005018,5
3,8,"timmins, ontario, canada",,60973129,0
4,8,"timmins, ontario, canada",,374157065,0


In [37]:
book_rating.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [38]:
# Se realizan 2 datasets nuevos con el número de evaluaciones
# y la media de evaluaciones de cada libro

book_num_ratings = book_ratings.groupby('Book-Title')['Book-Rating'].count().reset_index().rename(columns = {'Book-Rating':'Num-Ratings'})

In [39]:
book_avg_ratings = book_ratings.groupby('Book-Title')['Book-Rating'].mean().reset_index().rename(columns = {'Book-Rating':'Avg-Ratings'})

In [40]:
# Se unen los dataframes anteriores con la media y el número de evaluaciones
final_rating = book_num_ratings.merge(book_avg_ratings, on = 'Book-Title')
final_rating.head()

Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


In [42]:
# Se filtra por aquellos libros que tienen más de 250 evaluaciones
# o, por así decirlo, los más populares en orden descendente teniendo 
#en cuenta la media

popular_books = final_rating[final_rating['Num-Ratings'] > 250].sort_values(by = 'Avg-Ratings', ascending=False).reset_index(drop=True).head(50)

# Hemos sacado los 50 mejores libros de nuestro dataframe
popular_books.head()


Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
1,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
2,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
3,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
4,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


In [43]:
popular_books.head(15)

Unnamed: 0,Book-Title,Num-Ratings,Avg-Ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
1,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
2,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
3,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
4,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
5,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
6,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
7,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
8,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
9,To Kill a Mockingbird,510,4.7


In [45]:
# Para hacer un filtro colaborativo se consideran solo aquellos 
# usuarios que han evaluado mas de 200 libros y aquellos libros con 
# al menos 50 evaluaciones

x = book_ratings.groupby('User-ID').count()['Book-Rating'] > 200
nuestros_usuarios = x[x].index

book_ratings = book_ratings[book_ratings['User-ID'].isin(nuestros_usuarios)]

y = book_ratings.groupby('Book-Title')['Book-Rating'].count() >= 50
nuestros_libros = y[y].index

final = book_ratings[book_ratings['Book-Title'].isin(nuestros_libros)]

# Finalmente tenemos un dataframe con los usuarios que han votado
# a al menos 200 libros y con los libros que tienen al menos 50 votos

final.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
31,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,11676,9
33,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,36836,0
34,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,46398,9
38,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,113270,0
39,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,113519,0


In [46]:
# Se crea Pivot Table: Index = "Book-Title"
# columnas= User-ID, Valores= "Book-Rating" o calificaciones de los libros

pt =  final.pivot_table(index = "Book-Title", columns ="User-ID",
                       values="Book-Rating").fillna(0)

pt.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Se calcula la similitud del resultado entre cada libro utilizando
# la función "cosine_similarity". Se recomendarán 5 libros con los resultados más similares

from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(pt)

def recommend(book_name):
    index = np.where(pt.index ==book_name)[0][0]
    similar_books = sorted(enumerate(similarity_scores[index]), key=lambda x: x[1], reverse=True)[1:6]
    
    for i in similar_books:
        print(pt.index[i[0]])

In [48]:
recommend("The House of the Spirits")

You Belong To Me
Last Chance Saloon
The Lost World
The Cradle Will Fall
I'll Be Seeing You
