In [28]:
#Library Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import string
import warnings


#Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


#Tags Clusters
from sklearn.cluster import KMeans

#Stats
from scipy import stats

warnings.filterwarnings('ignore')

In [29]:
#Data Imports

#Book Info
books = pd.read_csv("../data/raw_data/books.csv")
print(books.shape)
print(books.columns)

#User ratings
print()
ratings = pd.read_csv("../data/raw_data/ratings.csv")
print(ratings.shape)
print(ratings.columns)

#User Book viewing history
print()
toRead = pd.read_csv("../data/raw_data/to_read.csv")
print(toRead.shape)
print(toRead.columns)

#Tag ID mapped to the actual tag 
print()
tags = pd.read_csv( '../data/raw_data/tags.csv' )
print(tags.shape)
print(tags.columns)

#Mappings of books and tags that are associated to book

print()
book_tags = pd.read_csv( '../data/raw_data/book_tags.csv')
print(book_tags.shape)
print(book_tags.columns)

(10000, 23)
Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')

(5976479, 3)
Index(['user_id', 'book_id', 'rating'], dtype='object')

(912705, 2)
Index(['user_id', 'book_id'], dtype='object')

(34252, 2)
Index(['tag_id', 'tag_name'], dtype='object')

(999912, 3)
Index(['goodreads_book_id', 'tag_id', 'count'], dtype='object')


In [30]:
tags[tags["tag_id"] == 21303]

Unnamed: 0,tag_id,tag_name
21303,21303,neighbors


In [31]:
book_tags

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716
...,...,...,...
999907,33288638,21303,7
999908,33288638,17271,7
999909,33288638,1126,7
999910,33288638,11478,7


### Books Cleaning

In [32]:
#From Books table we drop some of the columns we don't see direct use for
drop_cols =['best_book_id', 'work_id','isbn', 'isbn13']
books.drop(drop_cols, axis=1, inplace=True)
print(books.columns)

Index(['book_id', 'goodreads_book_id', 'books_count', 'authors',
       'original_publication_year', 'original_title', 'title', 'language_code',
       'average_rating', 'ratings_count', 'work_ratings_count',
       'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3',
       'ratings_4', 'ratings_5', 'image_url', 'small_image_url'],
      dtype='object')


In [33]:
#Drop all NaN rows
books.dropna(subset=['language_code'])

#Check what type of languages are covered in the books
lang_codes = np.array(list(books["language_code"]))
u_langs = np.unique(lang_codes)

print(len(lang_codes))
print(len(u_langs))
u_langs

10000
26


array(['ara', 'dan', 'en', 'en-CA', 'en-GB', 'en-US', 'eng', 'fil', 'fre',
       'ger', 'ind', 'ita', 'jpn', 'mul', 'nan', 'nl', 'nor', 'per',
       'pol', 'por', 'rum', 'rus', 'spa', 'swe', 'tur', 'vie'],
      dtype='<U5')

In [34]:
lang_mask = [False if math.isnan(b) else b for b in books['language_code'].str.contains("en")]    
np.unique(lang_mask)

array([False,  True])

In [35]:
#Apply boolean mask 
books = books[lang_mask]

In [36]:
books.shape

(8730, 19)

In [37]:
# Books data without some columns, and taking out non-english content
# books.to_csv(r'/Users/karthikrameshbabu/midsScratch/MIDS/w207/BooksFinalProject/MIDS207-Final-Project/data/parsed_data/english_books_cf.csv', index = False)


### Tags Cleaning

In [38]:
t_copy = tags.copy()
t_copy['tag_name'] = t_copy['tag_name'].apply(lambda x: x.lower())
t_copy = t_copy[19:]
t_copy = t_copy[(True == t_copy['tag_name'].str.match(r"^[a-zA-Z0-9$@$!%*?&#':;^\-—_,'\\/\". +()]+$"))]
t_copy.head()

Unnamed: 0,tag_id,tag_name
19,19,--available-at-raspberrys--
20,20,-2001--
21,21,-calif--
22,22,-d-c--
23,23,-dean


In [39]:
t_joined = pd.merge(book_tags, t_copy, left_on='tag_id', right_on='tag_id', how='inner')
t_joined.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [40]:
#Get rid of punctuations from the tags
t_joined["tag_name"] = [tag.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))) for tag in t_joined["tag_name"]]
print(t_joined.shape)
t_joined.head()

(994023, 4)


Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to read
1,2,30574,24549,to read
2,3,30574,496107,to read
3,5,30574,11909,to read
4,6,30574,298,to read


In [41]:
#Somewhat rudementary but this works! To get rid of a solid chunk of custom tags.
user_tags1 = 'to read+|reading+|my books+|wish list+|novel+|series+|^[[:digit:]]*$|i own+|currently+|own+|have+|[^[:alnum:] ]|favorite+|favourite+|'
user_tags2 = 'club+|buy+|library+|read+|borrowed+|abandoned+|audio+|ya|e book+|ebook+|kindle+|default+|finish+|maybe+|gave up+|'
user_tags3 = 'dnf+|stars+|^(15|16|17|18|19|20)\d{2}[a-zA-Z]*|century+|grade+'

user_tags = user_tags1+user_tags2 + user_tags3
del_filter = t_joined["tag_name"].str.contains(user_tags)

print(t_joined[del_filter].shape)
# Iterate by most popular tags going down
# for index, row in t_joined[del_filter].iterrows():
#     print(str(row["tag_id"]) + " : " + str(row["tag_name"]) + " : " + str(row["count"]))

#Update value of our tags
t_joined = t_joined[~del_filter]

print(t_joined.shape)
t_joined.head()

(451161, 4)
(542862, 4)


Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
9983,1,11305,37174,fantasy
9984,2,11305,3441,fantasy
9985,3,11305,47478,fantasy
9986,5,11305,39330,fantasy
9987,6,11305,38378,fantasy


### Similarity By Authors

#### Kernel Method:

The main characteristic of Kernel Methods, however, is their distinct approach to this problem. Kernel methods map the data into higher dimensional spaces in the hope that in this higher-dimensional space the data could become more easily separated or better structured. There are also no constraints on the form of this mapping, which could even lead to infinite-dimensional spaces.

https://www.youtube.com/watch?v=3liCbRZPrZA&feature=youtu.be


#### Dot Products:
Algebraically, the dot product is the sum of the products of the corresponding entries of the two sequences of numbers. Geometrically, it is the product of the Euclidean magnitudes of the two vectors and the cosine of the angle between them.


In this case, the dot product is used for defining lengths (the length of a vector is the square root of the dot product of the vector by itself) and **angles** (the cosine of the angle of two vectors is the quotient of their dot product by the product of their lengths).



#### Kernel Trick:
The Kernel trick is a very interesting and powerful tool. It is powerful because it provides a bridge from linearity to non-linearity to any algorithm that can expressed solely on terms of dot products between two vectors. It comes from the fact that, if we first map our input data into a higher-dimensional space, a linear algorithm operating in this space will behave non-linearly in the original input space.



### Cosine Similarity:
https://www.machinelearningplus.com/nlp/cosine-similarity/

Cosine similarity measures the degree to which two vectors point in the same direction, regardless of magnitude.

When vectors point in the same direction, cosine similarity is 1; when vectors are perpendicular, cosine similarity is 0; and when vectors point in opposite directions, cosine similarity is -1. In positive space, cosine similarity is the complement to cosine distance: cosine_similarity = 1 - cosine_distance.

For example, the cosine similarity between [1, 2, 3] and [3, 2, 1] is 0.7143.


In [42]:
books.shape

(8730, 19)

In [43]:
tf_author = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tf_matrix_authors = tf_author.fit_transform(books['authors'])

print(tf_matrix_authors.shape)
cosine_sim_authors = cosine_similarity(tf_matrix_authors, tf_matrix_authors)
print(cosine_sim_authors.shape)

(8730, 13185)
(8730, 8730)


In [44]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [45]:
pd.Series(books.index, index=books['title'])

title
The Hunger Games (The Hunger Games, #1)                                                         0
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)                                        1
Twilight (Twilight, #1)                                                                         2
To Kill a Mockingbird                                                                           3
The Great Gatsby                                                                                4
                                                                                             ... 
Billy Budd, Sailor                                                                           9994
Bayou Moon (The Edge, #2)                                                                    9995
Means of Ascent (The Years of Lyndon Johnson, #2)                                            9996
The Mauritius Command                                                                        9997
Cinderella Ate

In [47]:
# All the books 
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Book Recs based on the cosine similarity score of authors
def similarity_by_authors(book, num_recs=10):
    try:
        
        #Find the index of the book
        index = indices[book]
#         print(index)
        print()
        
        #Using that index get the list of cosine similarities (list of 8730)
        # Each entry looks like >> (1135, 1.0)
        #                          (index in dataframe, similarity score)
        scores = list(enumerate(cosine_sim_authors[index]))
 
        #Order that list by cosine similaritie and get the indexes of the rows that are similar
        scores = sorted(scores, key=lambda x: x[1], reverse=True)

        # If you are asking for more recs than we even have books for that author
        # then give you all the books for that author
        if num_recs > len(scores):
            scores = scores[1:len(scores)]  
        else:
            #Give you number of recs you want
            scores = scores[1:num_recs]            

        print(scores)
        print()

        #Iterate and get all the indexes that we want to recommend
        book_indices = [book[0] for book in scores]

        
        #Pull list of books using those indexes!
        return books.iloc[book_indices]
    
    except:
        print("We don't have that book!")

In [48]:
similarity_by_authors('The Great Gatsby')


[(1135, 1.0), (2167, 1.0), (3358, 1.0), (6605, 0.5831356951529844), (7662, 0.43851346650878076), (1225, 0.2605102485970193), (7808, 0.224485040535046), (6135, 0.22089287902236984), (893, 0.22065958215380083)]



Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
1183,1184,46164,406,F. Scott Fitzgerald,1933.0,Tender Is the Night,Tender Is the Night,en-US,3.83,79008,86768,3615,1719,5896,22117,32814,24222,https://images.gr-assets.com/books/1438797669m...,https://images.gr-assets.com/books/1438797669s...
2303,2304,46165,607,F. Scott Fitzgerald,1920.0,This Side of Paradise,This Side of Paradise,eng,3.68,40074,47633,2777,1137,4298,14047,17247,10904,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
3640,3641,747746,31,F. Scott Fitzgerald,1922.0,The Curious Case of Benjamin Button,The Curious Case of Benjamin Button,en-GB,3.53,30313,34715,2887,591,3578,13074,11805,5667,https://images.gr-assets.com/books/1331235364m...,https://images.gr-assets.com/books/1331235364s...
7408,7409,4662,62,"F. Scott Fitzgerald, Matthew J. Bruccoli",1920.0,The Short Stories of F. Scott Fitzgerald,The Short Stories,eng,4.23,11525,12955,282,95,307,1895,4846,5812,https://images.gr-assets.com/books/1400891315m...,https://images.gr-assets.com/books/1400891315s...
8683,8684,17557143,13,J.S. Scott,2013.0,The Billionaire's Obsession ~ Simon,The Billionaire's Obsession ~ Simon (The Billi...,eng,4.0,7492,16989,856,452,935,3307,5796,6499,https://images.gr-assets.com/books/1362454880m...,https://images.gr-assets.com/books/1362454880s...
1279,1280,12914,830,"Virgil, Robert Fitzgerald",-17.0,Æneis,The Aeneid,eng,3.81,75051,85351,1804,1531,6436,23683,28909,24792,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
8869,8870,119247,29,"John D. Fitzgerald, Mercer Mayer",1967.0,The Great Brain,The Great Brain (Great Brain #1),eng,4.16,13422,13870,776,242,389,2297,4876,6066,https://images.gr-assets.com/books/1311781413m...,https://images.gr-assets.com/books/1311781413s...
6862,6863,218659,37,"Flannery O'Connor, Robert Fitzgerald",1965.0,Everything that Rises Must Converge,Everything That Rises Must Converge: Stories,eng,4.27,14534,15025,873,163,433,1876,5312,7241,https://images.gr-assets.com/books/1412859621m...,https://images.gr-assets.com/books/1412859621s...
922,923,17402605,85,Michael Scott,2007.0,The Alchemyst: The Secrets of The Immortal Nic...,The Alchemyst (The Secrets of the Immortal Nic...,eng,3.84,58396,115864,7165,3737,8839,27510,38469,37309,https://images.gr-assets.com/books/1361310229m...,https://images.gr-assets.com/books/1361310229s...


### Similarity By Genre (Tags)

In [49]:
print(t_joined.shape)
t_joined.head()

(542862, 4)


Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
9983,1,11305,37174,fantasy
9984,2,11305,3441,fantasy
9985,3,11305,47478,fantasy
9986,5,11305,39330,fantasy
9987,6,11305,38378,fantasy


In [50]:
print(books.shape)
books.head()

(8730, 19)


Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [51]:
books_with_tags = pd.merge(books, t_joined, left_on='goodreads_book_id', right_on='goodreads_book_id', how='inner')

# Though this doesnt seem right it brings our matrix space into computational rangeee
# books_with_tags = pd.merge(books, t_joined, left_on='book_id', right_on='goodreads_book_id', how='inner')

print(books_with_tags.shape)
books_with_tags.head()

(471294, 22)


Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_id,count,tag_name
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,11305,10836,fantasy
1,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,33114,25968,young adult
2,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,11743,13819,fiction
3,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,1691,3190,adventure
4,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,26785,1119,sci fi fantasy


## STOP THIS NEXT CELL WILL KILL YOUR KERNEL

In [52]:

# tf_tags = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
# tf_tags_matrix = tf_tags.fit_transform(books_with_tags['tag_name'])

# print(tf_tags_matrix.shape)
# cosine_sim_tags = cosine_similarity(tf_tags_matrix, tf_tags_matrix)
# print(cosine_sim_tags.shape)

In [53]:
# # # Book Recs based on the cosine similarity score of book tags
# def similarity_by_tags(book, num_recs =10):
#     try:
#         index = indices[book]
#         scores = list(enumerate(cosine_sim_tags[index]))
#         scores = sorted(scores, key=lambda x: x[1], reverse=True)
        
#         if num_recs > len(scores):
#             scores = scores[1:len(scores)]  
#         else:
#             #Give you number of recs you want
#             scores = scores[1:num_recs]            

#         print(scores)
#         print()        
        
#         book_indices = [book[0] for book in scores]
#         return titles.iloc[book_indices]
#     except:
#         print("Book doesn't exist")

In [54]:
# similarity_by_tags('The Great Gatsby')

### Overall Similarity

In [55]:
#Combine all the tags into single string by book
b_w_t_collapsed = books_with_tags.groupby('goodreads_book_id')['tag_name'].apply(','.join).reset_index()
print(b_w_t_collapsed.shape)
b_w_t_collapsed.head()

(8730, 2)


Unnamed: 0,goodreads_book_id,tag_name
0,1,"fantasy,young adult,fiction,harry potter,magic..."
1,2,"fantasy,young adult,fiction,harry potter,magic..."
2,3,"fantasy,young adult,fiction,harry potter,magic..."
3,5,"fantasy,young adult,fiction,harry potter,magic..."
4,6,"fantasy,young adult,fiction,harry potter,child..."


In [56]:
# Merge the two tables
books_sq = pd.merge(books, b_w_t_collapsed, left_on='goodreads_book_id', right_on='goodreads_book_id', how='inner')
books_sq.head()

Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"fantasy,young adult,fiction,adventure,sci fi f..."
1,2,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,"fantasy,young adult,fiction,harry potter,magic..."
2,3,41865,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,"fantasy,young adult,fiction,sci fi fantasy,fan..."
3,4,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,"young adult,fiction,childhood,classics,english..."
4,5,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,"young adult,fiction,classics,english,books,fav..."


In [57]:
print(books.shape)

(8730, 19)


In [58]:
# books['class_features'] = (pd.Series(books[['authors', 'tag_name']]
#                 .fillna('')
#                 .values.tolist()
#                 ).str.join(' '))


books_sq['class_features'] = (pd.Series(books_sq[['authors', 'tag_name']]
                .fillna('')
                .values.tolist()
                ).str.join(','))

In [59]:
print(books_sq.shape)
books.head()

(8730, 21)


Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [60]:
tf_overall = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tf_matrix_overall = tf_overall.fit_transform(books_sq['class_features'])
# tf_matrix_overall = tf_overall.fit_transform(books['tag_name'])

print(tf_matrix_overall.shape)
cosine_sim_overall = cosine_similarity(tf_matrix_overall, tf_matrix_overall)
print(cosine_sim_overall.shape)


(8730, 113645)
(8730, 8730)


In [61]:
# Function that get book recommendations based on the cosine similarity score of books tags
def overall_recommendations(book, num_recs = 10):
    try:
        index = indices[book]
        scores = list(enumerate(cosine_sim_overall[index]))
        scores = sorted(scores, key=lambda x: x[1], reverse=True)

        if num_recs > len(scores):
            scores = scores[1:len(scores)]  
        else:
            #Give you number of recs you want
            scores = scores[1:num_recs]            
        print(scores)
        print()    
        
        book_indices = [book[0] for book in scores]
        return titles.iloc[book_indices]

    except:
        print("Book doesn't exist")

In [62]:
overall_recommendations("Twilight (Twilight, #1)", 10)

[(49, 0.8494731169084515), (46, 0.8343327380451668), (807, 0.6548859449183038), (714, 0.6365497132803984), (2857, 0.56102561783688), (3755, 0.5496744559558595), (70, 0.49482789519479603), (4760, 0.4837389525701959), (1541, 0.4805597006319514)]



51                                 Eclipse (Twilight, #3)
48                                New Moon (Twilight, #2)
833                         Midnight Sun (Twilight, #1.5)
731     The Short Second Life of Bree Tanner: An Eclip...
3074    Twilight: The Graphic Novel, Vol. 1 (Twilight:...
4087    The Twilight Saga: The Official Illustrated Gu...
72                                The Host (The Host, #1)
5244    Twilight: The Graphic Novel, Vol. 2  (Twilight...
1618    The Twilight Saga Complete Collection  (Twilig...
Name: title, dtype: object

In [63]:
overall_recommendations("Romeo and Juliet", 10)

[(344, 0.5976910890069966), (121, 0.5976643282708686), (749, 0.5800623533942593), (149, 0.5673108823577537), (827, 0.5633087744328512), (812, 0.5461867362854811), (752, 0.530579627962391), (3628, 0.5262192599403432), (241, 0.5245269871454257)]



352                       Othello
124                        Hamlet
769                 Julius Caesar
153                       Macbeth
854                 Twelfth Night
838        The Merchant of Venice
772       The Taming of the Shrew
3947                      Henry V
247     A Midsummer Night's Dream
Name: title, dtype: object

In [36]:
# similarity_by_tags("Romeo and Juliet", 30)

## Clustering By Book Tags

In [71]:
tags_stripped = books_sq.filter(items=['book_id', 'goodreads_book_id', 'title', 'tag_name'])
tags_stripped.head()


Unnamed: 0,book_id,goodreads_book_id,title,tag_name
0,1,2767052,"The Hunger Games (The Hunger Games, #1)","fantasy,young adult,fiction,adventure,sci fi f..."
1,2,3,Harry Potter and the Sorcerer's Stone (Harry P...,"fantasy,young adult,fiction,harry potter,magic..."
2,3,41865,"Twilight (Twilight, #1)","fantasy,young adult,fiction,sci fi fantasy,fan..."
3,4,2657,To Kill a Mockingbird,"young adult,fiction,childhood,classics,english..."
4,5,4671,The Great Gatsby,"young adult,fiction,classics,english,books,fav..."


In [72]:
#Get and fit our vectorizer with thte tag_names
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),min_df=0,stop_words='english')
list_of_tags = tags_stripped["tag_name"]
X = vectorizer.fit_transform(list_of_tags)

### K MEANS CLUSTERING

In [73]:
true_k = 100
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(max_iter=100, n_clusters=100, n_init=1)

In [74]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [75]:
order_centroids

array([[92993, 64571, 24085, ..., 67711, 67712,     0],
       [ 1646, 18857, 20809, ..., 67782, 67783,     0],
       [33757, 98608, 44299, ..., 67819, 67820,     0],
       ...,
       [39501, 64571, 92993, ..., 67783, 67784,     0],
       [47998, 47921,  7465, ..., 67563, 67564,     0],
       [ 6560, 50341,  6561, ..., 67759, 67760,     0]])

In [None]:
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

In [76]:
print("\n")
print("Prediction")
# X = vectorizer.transform(["Nothing is easy in cricket. Maybe when you watch it on TV, it looks easy. But it is not. You have to use your brain and time the ball."])
predicted = model.predict(X)
print(predicted)



Prediction
[37 20 29 ...  6 78 28]


In [77]:
books_sq["cluster"] = predicted

In [78]:
books_sq[books_sq["cluster"] == 37] 

Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name,class_features,cluster
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"fantasy,young adult,fiction,adventure,sci fi f...","Suzanne Collins,fantasy,young adult,fiction,ad...",37
11,12,13335037,210,Veronica Roth,2011.0,Divergent,"Divergent (Divergent, #1)",eng,4.24,1903563,...,36315,82870,310297,673028,1114304,https://images.gr-assets.com/books/1328559506m...,https://images.gr-assets.com/books/1328559506s...,"fantasy,young adult,fiction,adventure,sci fi f...","Veronica Roth,fantasy,young adult,fiction,adve...",37
16,17,6148028,201,Suzanne Collins,2009.0,Catching Fire,"Catching Fire (The Hunger Games, #2)",eng,4.30,1831039,...,10492,48030,262010,687238,980309,https://images.gr-assets.com/books/1358273780m...,https://images.gr-assets.com/books/1358273780s...,"fantasy,young adult,fiction,adventure,sci fi f...","Suzanne Collins,fantasy,young adult,fiction,ad...",37
19,20,7260188,239,Suzanne Collins,2010.0,Mockingjay,"Mockingjay (The Hunger Games, #3)",eng,4.03,1719760,...,30144,110498,373060,618271,738775,https://images.gr-assets.com/books/1358275419m...,https://images.gr-assets.com/books/1358275419s...,"fantasy,young adult,fiction,adventure,sci fi f...","Suzanne Collins,fantasy,young adult,fiction,ad...",37
66,69,11735983,164,Veronica Roth,2012.0,Insurgent,"Insurgent (Divergent, #2)",eng,4.07,836362,...,10641,44620,182131,339977,369969,https://images.gr-assets.com/books/1325667729m...,https://images.gr-assets.com/books/1325667729s...,"fantasy,young adult,fiction,adventure,sci fi f...","Veronica Roth,fantasy,young adult,fiction,adve...",37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8553,9777,8428984,10,Hugh Howey,2010.0,,Half Way Home,en-GB,3.70,5012,...,161,799,3139,3942,2190,https://images.gr-assets.com/books/1374845830m...,https://images.gr-assets.com/books/1374845830s...,"fantasy,young adult,fiction,adventure,sci fi f...","Hugh Howey,fantasy,young adult,fiction,adventu...",37
8558,9782,13431827,22,D.J. Molles,2012.0,,"The Remaining (The Remaining, #1)",eng,4.08,8306,...,134,423,1655,3426,3654,https://images.gr-assets.com/books/1343344146m...,https://images.gr-assets.com/books/1343344146s...,"fantasy,fiction,adventure,sci fi fantasy,paran...","D.J. Molles,fantasy,fiction,adventure,sci fi f...",37
8607,9844,17235347,13,Marie Lu,2012.0,Life Before Legend,Life Before Legend: Stories of the Criminal an...,eng,4.02,13933,...,234,619,3205,4786,5412,https://images.gr-assets.com/books/1358952957m...,https://images.gr-assets.com/books/1358952957s...,"fantasy,young adult,fiction,adventure,teen,eng...","Marie Lu,fantasy,young adult,fiction,adventure...",37
8680,9937,13010211,22,Caragh M. O'Brien,2012.0,Promised,"Promised (Birthmarked, #3)",en-US,3.77,11766,...,256,1098,3565,4392,3573,https://images.gr-assets.com/books/1330546217m...,https://images.gr-assets.com/books/1330546217s...,"fantasy,young adult,fiction,adventure,sci fi f...","Caragh M. O'Brien,fantasy,young adult,fiction,...",37


In [46]:
# books.to_csv(r'/Users/karthikrameshbabu/midsScratch/MIDS/w207/BooksFinalProject/MIDS207-Final-Project/data/parsed_data/clustered_books_with_tags.csv', index = False)

## DB SCAN CLUSTERING

However this doesn't work very well because our vectors are so sparse

In [47]:
# from sklearn.cluster import OPTICS
# optics_clusters = OPTICS(min_samples=15, max_eps=np.inf).fit(X.toarray())
# optics_clusters.labels_

## OPTICS CLUSTERING

## TSNE CLUSTERING

In [52]:
from sklearn.manifold import TSNE

In [58]:
X.toarray()[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
tsne = TSNE(n_components=2, init='pca', random_state=0)

baby_x = X.toarray()[:500]
x_tsne = tsne.fit_transform(baby_x)

array([[ 51.200268 ,  49.538544 ],
       [ 22.335249 ,  42.073887 ],
       [-25.912683 ,  59.53444  ],
       [-32.30589  ,  27.964792 ],
       [ 16.933353 ,   9.77677  ],
       [-11.422636 ,   3.445465 ],
       [ 30.313354 ,  74.05439  ],
       [  1.9629364,  67.71346  ],
       [ -3.4364357,  35.418396 ],
       [ 44.808636 ,  17.967396 ]], dtype=float32)

In [65]:
#Basic imports
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [71]:
#Range of PCA components for experimenting
k_vals = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 150, 200, 250, 300, 350, 400, 500]
k_vals = np.arange(100, 12000, 500)
var_explained = []

#Try all, to see what gives us best explained variance
for k_val in k_vals:
    pca = PCA(n_components=k_val)

    #Apply the dimensionality reduction
    pca.fit(X.toarray())

    #Get the explained variance ratio
    var_explained.append(sum(pca.explained_variance_ratio_))    

for i in range(len(k_vals)):
    print("K_val: {0}  , Variance Explained: {1}".format(k_vals[i], var_explained[i]))

plt.plot(k_vals,var_explained, 'go--')
plt.title('Percent of Total Variance vs. Number of Principal Components')
plt.xlabel('Number of Principal Componenents')
plt.ylabel('% of Variance Explained')
plt.show()        


KeyboardInterrupt: 

In [None]:
reduced = PCA(n_components=250)


### Parsing Experiments

In [69]:
author_recs = similarity_by_authors('The Great Gatsby')

def process_recs(recs):
    
    result = []
    for row in recs:
        result.append(make_book_object(row))
    return result


def make_book_object(row):
    
    book_link = "https://www.goodreads.com/book/show/"+str(row["goodreads_book_id"])
    
    authors = []
    for author in row["authors"].split(","):
        authors.append({ "text": author, "action": '' })
        
    if row["book_id"] is None:
        image_url = 'https://c.saavncdn.com/873/6-Weeks-Wiggle--English-2018-20181112191820-150x150.jpg'
    else:
        image_url = row["image_url"]
        
    title = row["title"]
    
    return {"title": { "text": title, "action": book_link },
    "author": authors, 
    "book_id": row["book_id"], 
    "type": "book",
    "image": [image_url]
           }



recs_list = author_recs.to_dict(orient='records')
make_book_object(recs_list[0])


[(1135, 1.0), (2167, 1.0), (3358, 1.0), (6605, 0.5831356951529844), (7662, 0.43851346650878076), (1225, 0.2605102485970193), (7808, 0.224485040535046), (6135, 0.22089287902236984), (893, 0.22065958215380083)]



{'title': {'text': 'Tender Is the Night',
  'action': 'https://www.goodreads.com/book/show/46164'},
 'author': [{'text': 'F. Scott Fitzgerald', 'action': ''}],
 'book_id': 1184,
 'type': 'book',
 'image': ['https://images.gr-assets.com/books/1438797669m/46164.jpg']}

In [79]:
books_sq.head()

Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name,class_features,cluster
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"fantasy,young adult,fiction,adventure,sci fi f...","Suzanne Collins,fantasy,young adult,fiction,ad...",37
1,2,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,...,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,"fantasy,young adult,fiction,harry potter,magic...","J.K. Rowling, Mary GrandPré,fantasy,young adul...",20
2,3,41865,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,...,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,"fantasy,young adult,fiction,sci fi fantasy,fan...","Stephenie Meyer,fantasy,young adult,fiction,sc...",29
3,4,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,...,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,"young adult,fiction,childhood,classics,english...","Harper Lee,young adult,fiction,childhood,class...",8
4,5,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,...,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,"young adult,fiction,classics,english,books,fav...","F. Scott Fitzgerald,young adult,fiction,classi...",8


In [95]:
books_sq["normalized"] = books_sq["original_title"].str.replace(' ', '_').str.lower()
books_sq



Unnamed: 0,book_id,goodreads_book_id,books_count,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name,class_features,cluster,normalized
0,1,2767052,272,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"fantasy,young adult,fiction,adventure,sci fi f...","Suzanne Collins,fantasy,young adult,fiction,ad...",37,the_hunger_games
1,2,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,...,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,"fantasy,young adult,fiction,harry potter,magic...","J.K. Rowling, Mary GrandPré,fantasy,young adul...",20,harry_potter_and_the_philosopher's_stone
2,3,41865,226,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,...,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,"fantasy,young adult,fiction,sci fi fantasy,fan...","Stephenie Meyer,fantasy,young adult,fiction,sc...",29,twilight
3,4,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,...,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,"young adult,fiction,childhood,classics,english...","Harper Lee,young adult,fiction,childhood,class...",8,to_kill_a_mockingbird
4,5,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,...,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,"young adult,fiction,classics,english,books,fav...","F. Scott Fitzgerald,young adult,fiction,classi...",8,the_great_gatsby
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8725,9995,15613,199,Herman Melville,1924.0,"Billy Budd, Sailor","Billy Budd, Sailor",eng,3.09,10866,...,2225,3805,2985,1617,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,"fiction,adventure,classics,english,classic,adu...","Herman Melville,fiction,adventure,classics,eng...",8,"billy_budd,_sailor"
8726,9996,7130616,19,Ilona Andrews,2010.0,Bayou Moon,"Bayou Moon (The Edge, #2)",eng,4.09,17204,...,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...,"fantasy,fiction,magic,adventure,sci fi fantasy...","Ilona Andrews,fantasy,fiction,magic,adventure,...",41,bayou_moon
8727,9997,208324,19,Robert A. Caro,1990.0,Means of Ascent,"Means of Ascent (The Years of Lyndon Johnson, #2)",eng,4.25,12582,...,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,"classics,bookshelf,shelfari wishlist,audible,c...","Robert A. Caro,classics,bookshelf,shelfari wis...",6,means_of_ascent_
8728,9998,77431,60,Patrick O'Brian,1977.0,The Mauritius Command,The Mauritius Command,eng,4.35,9421,...,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,"fiction,adventure,classics,british,england,act...","Patrick O'Brian,fiction,adventure,classics,bri...",78,the_mauritius_command


In [96]:
books_sq.to_csv(r'/Users/karthikrameshbabu/midsScratch/MIDS/w207/BooksFinalProject/MIDS207-Final-Project/data/parsed_data/ultimate_books.csv', index = False)


