In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [3]:
# Load the books metadata table and preview its first rows.
books = pd.read_csv('../datasets/books.csv')
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [4]:
# (rows, columns) of the freshly loaded books table.
books.shape

(10000, 23)

In [5]:
# Load the per-user book ratings and preview the first rows.
ratings = pd.read_csv('../datasets/ratings.csv')
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [6]:
# Distinct rating values present in the ratings table.
ratings['rating'].unique()

array([5, 3, 4, 1, 2], dtype=int64)

In [9]:
# Load the (goodreads_book_id, tag_id, count) table and preview the first rows.
tags = pd.read_csv('../datasets/book_tags.csv')
tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [11]:
# Load the tag_id -> tag_name lookup table and inspect its last rows.
btags = pd.read_csv('../datasets/tags.csv')
btags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [14]:
# Data preprocessing

# Order the ratings by user_id and show the table size before de-duplication.
ratings = ratings.sort_values('user_id')
ratings.shape

(981756, 3)

In [15]:
# Remove every (book_id, user_id) pair that occurs more than once.
# keep=False discards all copies of a repeated pair instead of retaining one.
ratings = ratings.drop_duplicates(subset=['book_id', 'user_id'], keep=False)
ratings.shape

(977269, 3)

In [16]:
# Show the size before, then drop every book whose original_title is not unique.
# NOTE(review): keep=False removes all rows sharing a title, and missing titles
# presumably count as equal to each other — confirm this loss is intended.
print(books.shape)
books = books.drop_duplicates(subset=['original_title'], keep=False)
books.shape

(10000, 23)


(9151, 23)

In [17]:
# Show the size before, then drop any tag_id that appears more than once
# (keep=False removes all copies of a repeated id).
print(btags.shape)
btags = btags.drop_duplicates(subset=['tag_id'], keep=False)
btags.shape

(34252, 2)


(34252, 2)

In [19]:
# Show the size before, then drop every (tag_id, goodreads_book_id) pair that
# occurs more than once (keep=False removes all copies of a repeated pair).
print(tags.shape)
tags = tags.drop_duplicates(subset=['tag_id', 'goodreads_book_id'], keep=False)
tags.shape

(999912, 3)


(999896, 3)

In [21]:
# Data visualization
# Attach the readable tag_name to every (book, tag) row via an inner join on tag_id.
# NOTE(review): 'jonit_tags' looks like a typo for 'joint_tags'; the name is kept
# unchanged in case other cells refer to it.
jonit_tags = tags.merge(btags, left_on='tag_id', right_on='tag_id', how='inner')


In [24]:
# Top 10 rated books
# Rank by average_rating (highest first) and keep the ten best rows.
top_rated = books.sort_values('average_rating', ascending=False)
top10 = top_rated.head(10)
# Title-indexed table of cover thumbnails for the HTML rendering cell.
# NOTE(review): the name 'display' shadows IPython's display() helper in this
# kernel; it is kept because the rendering cell reads it.
display = top10[['title', 'small_image_url']].set_index('title')

In [30]:
from IPython.display import Image , HTML

def path_to_image_html(path):
    """Render an image location as an HTML ``<img>`` tag.

    Intended as a column formatter for ``DataFrame.to_html(escape=False)``.

    Args:
        path: Image URL or file path; converted to text with ``str()``.

    Returns:
        A string of the form ``<img src="..." />``. The attribute value is
        HTML-escaped so quotes, ``&``, ``<`` and ``>`` in the value cannot
        break out of the ``src`` attribute.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    return f'<img src="{escape(str(path), quote=True)}" />'
# Render the table as raw HTML; escape=False lets the <img> tags produced by
# the small_image_url formatter show up as images instead of literal text.
HTML(display.to_html(escape=False , formatters={'small_image_url':path_to_image_html}))

Unnamed: 0_level_0,small_image_url
title,Unnamed: 1_level_1
The Complete Calvin and Hobbes,
"Words of Radiance (The Stormlight Archive, #2)",
Mark of the Lion Trilogy,
It's a Magical World: A Calvin and Hobbes Collection,
There's Treasure Everywhere: A Calvin and Hobbes Collection,
"Harry Potter Boxset (Harry Potter, #1-7)",
"Harry Potter Collection (Harry Potter, #1-6)",
The Indispensable Calvin and Hobbes,
The Authoritative Calvin and Hobbes: A Calvin and Hobbes Treasury,
Attack of the Deranged Mutant Killer Monster Snow Goons,


#### Content-Based Filtering Recommender System

In [32]:
# Replace every missing value in books with an empty string.
fillnabooks = books.fillna('')

In [34]:
# Keep only the three columns that feed the text features, then preview them.
fillednabooks=fillnabooks[['original_title', 'authors', 'average_rating']]
fillednabooks.head()

Unnamed: 0,original_title,authors,average_rating
0,The Hunger Games,Suzanne Collins,4.34
1,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré",4.44
3,To Kill a Mockingbird,Harper Lee,4.25
4,The Great Gatsby,F. Scott Fitzgerald,3.89
5,The Fault in Our Stars,John Green,4.26


In [35]:
def clean_data(x):
    """Normalize a text value: remove every space character, then lowercase it."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [36]:
# Check dtypes and non-null counts before converting the columns to text.
fillednabooks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9151 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   original_title  9151 non-null   object 
 1   authors         9151 non-null   object 
 2   average_rating  9151 non-null   float64
dtypes: float64(1), object(2)
memory usage: 286.0+ KB


In [37]:
# Cast every column (including the numeric average_rating) to str so that
# clean_data can be applied to all of them, then re-check the dtypes.
fillednabooks = fillednabooks.astype(str)
fillednabooks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9151 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_title  9151 non-null   object
 1   authors         9151 non-null   object
 2   average_rating  9151 non-null   object
dtypes: object(3)
memory usage: 286.0+ KB


In [40]:
# Normalize each text feature column (spaces removed, lowercased).
for column in ('original_title', 'authors', 'average_rating'):
    fillednabooks[column] = fillednabooks[column].apply(clean_data)

In [41]:
# Preview the cleaned columns.
fillednabooks.head()

Unnamed: 0,original_title,authors,average_rating
0,thehungergames,suzannecollins,4.34
1,harrypotterandthephilosopher'sstone,"j.k.rowling,marygrandpré",4.44
3,tokillamockingbird,harperlee,4.25
4,thegreatgatsby,f.scottfitzgerald,3.89
5,thefaultinourstars,johngreen,4.26


In [42]:
def create_soup(x):
    """Combine title, authors and rating into one space-separated string.

    `x` is anything indexable by the three column names whose values support
    `+` with str (a row of fillednabooks, for example).
    """
    title = x['original_title']
    authors = x['authors']
    rating = x['average_rating']
    return title + ' ' + authors + ' ' + rating

In [44]:
# Build the 'soup' text for every book with column-wise string concatenation.
# create_soup only does column lookups and '+', so it accepts the whole frame;
# this avoids one Python-level call per row from apply(axis=1).
fillednabooks['soup'] = create_soup(fillednabooks)

In [45]:
# Preview the frame with the new 'soup' column.
fillednabooks.head()

Unnamed: 0,original_title,authors,average_rating,soup
0,thehungergames,suzannecollins,4.34,thehungergames suzannecollins 4.34
1,harrypotterandthephilosopher'sstone,"j.k.rowling,marygrandpré",4.44,harrypotterandthephilosopher'sstone j.k.rowlin...
3,tokillamockingbird,harperlee,4.25,tokillamockingbird harperlee 4.25
4,thegreatgatsby,f.scottfitzgerald,3.89,thegreatgatsby f.scottfitzgerald 3.89
5,thefaultinourstars,johngreen,4.26,thefaultinourstars johngreen 4.26


In [46]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words counts over the 'soup' strings, with English stop words removed.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of two
# or more word characters and splits on punctuation, so a rating such as "4.34"
# is presumably indexed as just "34" — confirm this is the intended feature.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(fillednabooks['soup'])
count_matrix

<9151x17302 sparse matrix of type '<class 'numpy.int64'>'
	with 33809 stored elements in Compressed Sparse Row format>

In [47]:
# Pairwise cosine similarity between the count vectors of all books.
cosine_sim  = cosine_similarity(count_matrix)

In [59]:
# Preview the similarity matrix.
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [51]:
# Distinct similarity scores between the first book and every book.
set(cosine_sim[0])

{0.0,
 0.16012815380508716,
 0.1666666666666667,
 0.2041241452319315,
 0.2357022603955159,
 0.25819888974716115,
 0.2886751345948129,
 0.3333333333333334,
 0.408248290463863,
 1.0000000000000002}

In [56]:
# Renumber the rows from 0 so the index matches row positions; the previous
# index labels are kept as a regular column by reset_index().
fillnabooks=fillnabooks.reset_index()


In [57]:
# Lookup Series: original_title -> row number in fillnabooks.
indices = pd.Series(fillnabooks.index , index=fillnabooks.original_title)

In [58]:
# Preview the title -> row number lookup.
indices.head()

original_title
The Hunger Games                            0
Harry Potter and the Philosopher's Stone    1
To Kill a Mockingbird                       2
The Great Gatsby                            3
The Fault in Our Stars                      4
dtype: int64

In [72]:
def get_recommandations(title , cosine_sim = cosine_sim, top_n = 10):
    """Return the original titles of the books most similar to `title`.

    Relies on the notebook-level `indices` (title -> row number) and `books`
    (rows in the same order as `cosine_sim`).

    Args:
        title: Exact original_title to look up in `indices`.
        cosine_sim: Square similarity matrix. The default is bound when this
            cell is executed, so re-run the cell after recomputing the matrix.
        top_n: Number of recommendations to return (default 10).

    Returns:
        Series of original_title values, most similar first.

    Raises:
        KeyError: If `title` is not present in `indices`.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that maps to several rows yields a Series; use the first row.
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: equal scores stay in ascending row order,
    # the same order a stable sorted(..., reverse=True) produces.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried book by row number instead of assuming it sorts first,
    # which is not guaranteed when another book ties with it.
    book_indicies = order[order != idx][:top_n]
    return books['original_title'].iloc[book_indicies]

In [73]:
# Example query: books most similar to 'The Fault in Our Stars'.
result = get_recommandations('The Fault in Our Stars')
print(result)

10                                  The Kite Runner 
73                                Looking for Alaska
87                                       Paper Towns
274                       An Abundance of Katherines
408    Fried Green Tomatoes at the Whistle Stop Cafe
439                                   Fall of Giants
672                                       Americanah
722                                        Shantaram
748                                  The Storyteller
857                                       Red Rising
Name: original_title, dtype: object


In [75]:
# Summary statistics of the numeric columns of books.
books.describe()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn13,original_publication_year,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
count,9151.0,9151.0,9151.0,9151.0,9151.0,8711.0,9145.0,9151.0,9151.0,9151.0,9151.0,9151.0,9151.0,9151.0,9151.0,9151.0
mean,4909.648891,4869336.0,5078243.0,7977119.0,78.362474,9756300000000.0,1981.138218,3.999263,55374.63,61220.16,2986.376024,1337.893017,3168.733909,11784.247077,20559.87,24369.41
std,2884.017088,7339193.0,7607401.0,11282900.0,171.877505,433108600000.0,154.608318,0.252293,157680.0,168612.1,6194.711899,4978.452293,8954.819986,28259.079754,52331.5,81393.72
min,1.0,1.0,1.0,87.0,1.0,195170300.0,-1750.0,2.47,2716.0,5510.0,3.0,11.0,30.0,323.0,750.0,754.0
25%,2401.5,43290.0,43911.5,979527.5,25.0,9780316000000.0,1989.0,3.85,13691.0,15561.0,698.0,197.0,667.0,3165.5,5480.0,5357.5
50%,4864.0,330760.0,349929.0,2516519.0,42.0,9780451000000.0,2004.0,4.02,21662.0,24324.0,1413.0,397.0,1192.0,5043.0,8534.0,8939.0
75%,7396.5,8069682.0,8448476.0,13143620.0,69.0,9780813000000.0,2010.0,4.17,42511.0,47775.0,2852.5,915.5,2454.0,9639.0,16802.5,17783.5
max,10000.0,33288640.0,35534230.0,56399600.0,3455.0,9790008000000.0,2017.0,4.82,4780653.0,4942365.0,155254.0,165455.0,197621.0,606158.0,1481305.0,3011543.0


In [76]:
# C: mean of average_rating across books; the baseline term in weighted_vote_average.
C = books['average_rating'].mean()

In [78]:
# m: the 60th-percentile value of ratings_count across books.
# For example, the value of m differs greatly between a total vote count of 100 and of 1.
m = books['ratings_count'].quantile(0.6)

def weighted_vote_average(record):
    """Blend a record's own average rating with the overall mean rating.

    Reads the notebook-level `m` and `C`. The record's rating is weighted by
    v / (v + m) and `C` by m / (v + m), where v is the record's ratings_count.
    """
    votes = record['ratings_count']
    rating = record['average_rating']
    total = votes + m

    own_part = (votes / total) * rating
    baseline_part = (m / total) * C
    return own_part + baseline_part
    
# Compute the weighted rating for all books with column arithmetic.
# weighted_vote_average only does column lookups and arithmetic, so it accepts
# the whole frame; this avoids one Python-level call per row from apply(axis=1).
books['weighted_vote'] = weighted_vote_average(books)


In [79]:
# Preview books with the new weighted_vote column.
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,weighted_vote
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,4.338089
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,4.437432
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,4.247904
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,3.891087
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...,4.257037


In [88]:
def get_recommandations(title , cosine_sim = cosine_sim, top_n = 10):
    """Return the best-rated titles among the books most similar to `title`.

    Takes the 2 * top_n most similar books, re-ranks them by the
    'weighted_vote' column of `books`, and returns the top `top_n` titles.
    Relies on the notebook-level `indices` (title -> row number) and `books`
    (rows in the same order as `cosine_sim`).

    Args:
        title: Exact original_title to look up in `indices`.
        cosine_sim: Square similarity matrix. The default is bound when this
            cell is executed, so re-run the cell after recomputing the matrix.
        top_n: Number of recommendations to return (default 10).

    Returns:
        Series of original_title values, highest weighted_vote first.

    Raises:
        KeyError: If `title` is not present in `indices`.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that maps to several rows yields a Series; use the first row.
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: equal scores stay in ascending row order,
    # the same order a stable sorted(..., reverse=True) produces.
    order = np.argsort(-scores, kind='stable')
    # Drop the queried book by row number instead of assuming it sorts first,
    # then keep twice as many candidates as will be returned.
    book_indicies = order[order != idx][:top_n * 2]

    ranked = books.iloc[book_indicies].sort_values(by=['weighted_vote'], ascending=False)
    return ranked[:top_n]['original_title']

In [89]:
# Same example query, now re-ranked by weighted_vote.
result = get_recommandations('The Fault in Our Stars')
print(result)

10                                   The Kite Runner 
408     Fried Green Tomatoes at the Whistle Stop Cafe
439                                    Fall of Giants
937                                 The Edge of Never
672                                        Americanah
748                                   The Storyteller
857                                        Red Rising
863                                   The Fiery Cross
722                                         Shantaram
1504                                The Darkest Minds
Name: original_title, dtype: object


In [None]:
# NOTE(review): movies_df is not defined anywhere in the visible notebook and
# this cell has no execution count. Running it would also overwrite the C and m
# computed for books — confirm whether this cell should be kept.
C = movies_df['vote_average'].mean()

# m: the 60th-percentile value of vote_count.
# For example, the value of m differs greatly between a total vote count of 100 and of 1.
m = movies_df['vote_count'].quantile(0.6)

def weighted_vote_average(record):
    """Blend a record's own vote average with the overall mean vote average.

    Reads the notebook-level `m` and `C`. The record's vote_average is weighted
    by v / (v + m) and `C` by m / (v + m), where v is the record's vote_count.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    total = votes + m

    own_part = (votes / total) * rating
    baseline_part = (m / total) * C
    return own_part + baseline_part
    
# Add the weighted rating to every row of movies_df (computed row by row).
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)
 
# Redefined similar-movie lookup that ranks the candidates by the weighted rating
def find_sim_movie(df, sorted_idx, title_name, top_n=10):
    """Return up to `top_n` rows of `df` similar to `title_name`, ranked by weighted_vote.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns.
        sorted_idx: 2-D integer array indexed by row position of `df`; each row
            holds row positions of `df` — presumably ordered most similar
            first (TODO confirm against how it is built).
        title_name: Exact title to look up.
        top_n: Number of rows to return; the first 2 * top_n entries of the
            matching row(s) of `sorted_idx` are used as candidates.

    Returns:
        Rows of `df` sorted by 'weighted_vote' descending; an empty frame when
        `title_name` is not present.
    """
    # Work with row positions, not index labels: sorted_idx and df.iloc are
    # both positional, so labels are only correct for a default 0..n-1 index.
    title_idx = np.flatnonzero((df['title'] == title_name).to_numpy())

    similar_idx = sorted_idx[title_idx, :(top_n*2)]
    similar_idx = similar_idx.reshape(-1,)

    # Exclude the queried movie itself; isin also copes with a title matching
    # several rows, where an elementwise != cannot broadcast.
    similar_idx = similar_idx[~np.isin(similar_idx, title_idx)]
    return df.iloc[similar_idx].sort_values(by=['weighted_vote'], ascending=False)[:top_n]

# Example query for 'The Avengers'.
# NOTE(review): movies_df and genre_sim_idx are not defined in the visible
# notebook — confirm where they are expected to come from.
similar_movies = find_sim_movie(movies_df, genre_sim_idx,
                               'The Avengers')
print(similar_movies[['title','vote_average','weighted_vote']])