The dataset can be Downloaded at - [github](https://github.com/zygmuntz/goodbooks-10k)

Ten thousand books, six million ratings.

# Data ETL

In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
books = pd.read_csv('data\books.csv', encoding = "ISO-8859-1")
books.head(-10)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9.780440e+12,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,9986,183092,183092,176939,16,310257689,9.780310e+12,Terri Blackstock,2006.0,Night Light: A Restoration Novel,...,8471,8862,218,96,172,1115,2658,4821,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9986,9987,8087038,8087038,12823536,29,312651198,9.780313e+12,Iris Johansen,2010.0,Chasing The Night,...,10129,10964,411,113,331,2127,3957,4436,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9987,9988,129237,129237,1383130,44,674017722,9.780674e+12,John Rawls,1971.0,A Theory of Justice,...,8472,9108,168,234,607,2001,3171,3095,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9988,9989,13489518,13489518,19028216,9,,2.940015e+12,Quinn Loftis,2012.0,Out of the Dark,...,11994,13614,595,72,229,1263,3280,8770,https://images.gr-assets.com/books/1334785691m...,https://images.gr-assets.com/books/1334785691s...


In [3]:
books.columns


Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')

In [5]:
ratings = pd.read_csv('data\ratings.csv', encoding = "ISO-8859-1")
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [6]:
book_tags = pd.read_csv('data\book_tags.csv', encoding = "ISO-8859-1")
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [7]:
tags = pd.read_csv('data\tags.csv')
tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


## Joining Tags and book tags

In [8]:
tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [9]:
to_read = pd.read_csv('data\to_read.csv')
to_read.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


# Training

TfidfVectorizer transforms text to feature vectors that can be used as input.

Cosine Similarity to calculate a numeric value that denotes the similarity between two books.



In [10]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])
# compute the cosine similarity for the data
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of book authors
def authors_recommendations(title):
    idx = indices[title] # gets the index number for the given title
    sim_scores = list(enumerate(cosine_sim[idx])) # gets the cosine sim values for given book using the index
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sort in descending order based on similarity
    sim_scores = sim_scores[1:21] # return the top 20 book index's
    book_indices = [i[0] for i in sim_scores]  
    return titles.iloc[book_indices] # get the book titles and return the top 20 books

# Testing

In [12]:
authors_recommendations('Lord of the Flies').head(20)

153                                Macbeth
352                                Othello
713                              King Lear
772                The Taming of the Shrew
803                            The Tempest
854                          Twelfth Night
1884                        As You Like It
2208                    The Complete Works
6416                     The Winter's Tale
6529                      Titus Andronicus
6691                  The Comedy of Errors
399                            Neuromancer
2615    Pattern Recognition (Blue Ant, #1)
3005               Count Zero (Sprawl, #2)
3547                    The Miracle Worker
3883      Mona Lisa Overdrive (Sprawl, #3)
5590            Virtual Light (Bridge, #1)
5629                     Idoru (Bridge #2)
6164          Spook Country (Blue Ant, #2)
7364                        The Peripheral
Name: title, dtype: object

In [13]:
authors_recommendations('Romeo and Juliet').head(20)

838                              The Merchant of Venice
153                                             Macbeth
352                                             Othello
713                                           King Lear
772                             The Taming of the Shrew
803                                         The Tempest
854                                       Twelfth Night
1884                                     As You Like It
2208                                 The Complete Works
6416                                  The Winter's Tale
6529                                   Titus Andronicus
6691                               The Comedy of Errors
8895                                         Richard II
8028             City of Stairs (The Divine Cities, #1)
529                              Much Ado About Nothing
3699                                        Richard III
769                                       Julius Caesar
554     Hamlet: Screenplay, Introduction And Fil

### The above recommendation was done based on Book Authors which is not very Accurate
Let us do it based on the similarity between the book tags.

In [14]:
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')
tf1 = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
cosine_sim_tags = linear_kernel(tfidf_matrix1, tfidf_matrix1)


In [15]:
titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of books tags
def tags_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_tags[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]


In [16]:
tags_recommendations('Romeo and Juliet').head(20)

118                                   The Handmaid's Tale
229                                            Persuasion
328                                      The Last Lecture
423                                  The Name of the Rose
626                   The True Story of the 3 Little Pigs
721                 The Shadow Rising (Wheel of Time, #4)
913                 The Forever War (The Forever War, #1)
1014                                          Steppenwolf
1109                            Maybe Someday (Maybe, #1)
1212              Children of the Mind (Ender's Saga, #4)
1312    Band of Brothers: E Company, 506th Regiment, 1...
1413     True Believer (Jeremy Marsh & Lexie Darnell, #1)
1508                                   The House of Mirth
1621          A Connecticut Yankee in King Arthur's Court
1823                   Flawless (Pretty Little Liars, #2)
2020             The Twilight Collection (Twilight, #1-3)
2112                         Memory Man (Amos Decker, #1)
2221          

In [17]:
tags_recommendations('Lord of the Flies').head(20)

1     Harry Potter and the Sorcerer's Stone (Harry P...
2                               Twilight (Twilight, #1)
3                                 To Kill a Mockingbird
4                                      The Great Gatsby
5                                The Fault in Our Stars
6                                            The Hobbit
7                                The Catcher in the Rye
8                 Angels & Demons  (Robert Langdon, #1)
9                                   Pride and Prejudice
10                                      The Kite Runner
11                            Divergent (Divergent, #1)
12                                                 1984
13                                          Animal Farm
14                            The Diary of a Young Girl
15     The Girl with the Dragon Tattoo (Millennium, #1)
16                 Catching Fire (The Hunger Games, #2)
17    Harry Potter and the Prisoner of Azkaban (Harr...
18    The Fellowship of the Ring (The Lord of th