# Imports:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### Read in the data:

In [4]:
book_tags = pd.read_csv('../ignore/book_tags.csv')
books = pd.read_csv('../ignore/books.csv')
ratings = pd.read_csv('../ignore/ratings.csv')
tags = pd.read_csv('../ignore/tags.csv')
to_read = pd.read_csv('../ignore/to_read.csv')

In [78]:
print(book_tags.shape)
print(books.shape)
print(ratings.shape)
print(tags.shape)
print(to_read.shape)

(999912, 3)
(10000, 23)
(5976479, 3)
(34252, 2)
(912705, 2)


### Cleaning for Recommender System:

While more data is better... for the purpose of memory usage for this project I am going to randomly pick 50,000 data points from the over 5 million data points in the ratings.csv

In [79]:
ratings.isna().sum()

user_id    0
book_id    0
rating     0
dtype: int64

> There are no missing values, so I cannot just delete null items

In [80]:
ratings.user_id.value_counts().count()

53424

> 53,424 unique users

In [81]:
ratings.book_id.value_counts().count()

10000

> 10,000 unique books

In [82]:
ratings_sample = ratings.sample(n = 50000, replace = False)

In [83]:
ratings_sample.head()

Unnamed: 0,user_id,book_id,rating
3607532,42385,337,3
3897650,21470,235,4
1912390,26283,736,5
5926101,5836,6947,4
3952553,21830,601,3


In [84]:
ratings_sample.isna().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [85]:
ratings_sample.user_id.value_counts().count()

32065

32,009 unique users in this sample, with some repeats

In [86]:
ratings_sample.book_id.value_counts().count()

8435

> 8,393 unique books... I want all 10,000 books included so I may re-sample

In [87]:
ratings_sample.rating.value_counts().count()

5

> All 5 different ratings are represented

Let's try a resample with 100,000 instead...

In [88]:
ratings_sample = ratings.sample(n = 100000, replace = False, random_state=1002)

In [89]:
ratings_sample.head()

Unnamed: 0,user_id,book_id,rating
2186601,29403,102,5
2858642,36128,4663,4
4280792,37112,25,5
1026306,16114,693,4
5672837,50684,1731,4


In [90]:
ratings_sample.isna().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [91]:
ratings_sample.user_id.value_counts().count()

44364

> 44,364 unique users

In [92]:
ratings_sample.book_id.value_counts().count()

9523

> 9523 unique books... this is not the full 10,000 but close enough for my liking

In [93]:
# Checking to make sure that some of my favorite books are included in this sample... because what would a good recommender system be if it doesn't include my favorites
# ratings_sample.loc[ratings_sample['book_id'] == 30]

In [94]:
ratings_sample.shape

(100000, 3)

Now, let's merge the book df with it

In [95]:
ratings_and_books = pd.merge(ratings_sample, books, how='left', on='book_id')
ratings_and_books.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,29403,102,5,19543,19543,3020535,110,99408392,9780099000000.0,Maurice Sendak,1963.0,Where the Wild Things Are,Where the Wild Things Are,eng,4.22,620618,636061,9102,15392,27532,93700,167043,332394,https://images.gr-assets.com/books/1384434560m...,https://images.gr-assets.com/books/1384434560s...
1,36128,4663,4,64081,64081,2827103,19,310266300,9780310000000.0,Shane Claiborne,2006.0,The Irresistible Revolution: Living as an Ordi...,The Irresistible Revolution: Living as an Ordi...,,4.07,18566,19186,953,409,977,3435,6486,7879,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
2,37112,25,5,136251,136251,2963218,263,545010225,9780545000000.0,"J.K. Rowling, Mary GrandPré",2007.0,Harry Potter and the Deathly Hallows,Harry Potter and the Deathly Hallows (Harry Po...,eng,4.61,1746574,1847395,51942,9363,22245,113646,383914,1318227,https://images.gr-assets.com/books/1474171184m...,https://images.gr-assets.com/books/1474171184s...
3,16114,693,4,455373,455373,2651694,122,345418972,9780345000000.0,Michael Crichton,1987.0,Sphere,Sphere,eng,3.77,128244,135005,2313,1851,9728,40085,49513,33828,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
4,50684,1731,4,6736971,6736971,6527274,37,345503813,9780346000000.0,Peter V. Brett,2010.0,The Desert Spear,"The Desert Spear (Demon Cycle, #2)",eng,4.24,53143,58608,1927,555,1691,7805,21853,26704,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


Make sure the counts are all the same...

In [96]:
ratings_and_books.shape

(100000, 25)

In [97]:
ratings_and_books.user_id.value_counts().count()

44364

In [98]:
ratings_and_books.book_id.value_counts().count()

9523

I only want books with an english language code

In [99]:
ratings_and_books.language_code.value_counts()

eng      70983
en-US    19678
en-GB     1911
en-CA      771
spa        391
ara        182
fre        178
ger         57
ind         38
en          26
jpn         25
pol         23
nor         18
por         15
nl          14
per         10
ita          8
dan          7
vie          5
mul          3
rum          3
fil          2
swe          1
tur          1
Name: language_code, dtype: int64

In [100]:
en_us = ratings_and_books.loc[(ratings_and_books['language_code'] == 'en-US') | (ratings_and_books['language_code'] == 'eng')]
en_us.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,29403,102,5,19543,19543,3020535,110,99408392,9780099000000.0,Maurice Sendak,1963.0,Where the Wild Things Are,Where the Wild Things Are,eng,4.22,620618,636061,9102,15392,27532,93700,167043,332394,https://images.gr-assets.com/books/1384434560m...,https://images.gr-assets.com/books/1384434560s...
2,37112,25,5,136251,136251,2963218,263,545010225,9780545000000.0,"J.K. Rowling, Mary GrandPré",2007.0,Harry Potter and the Deathly Hallows,Harry Potter and the Deathly Hallows (Harry Po...,eng,4.61,1746574,1847395,51942,9363,22245,113646,383914,1318227,https://images.gr-assets.com/books/1474171184m...,https://images.gr-assets.com/books/1474171184s...
3,16114,693,4,455373,455373,2651694,122,345418972,9780345000000.0,Michael Crichton,1987.0,Sphere,Sphere,eng,3.77,128244,135005,2313,1851,9728,40085,49513,33828,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
4,50684,1731,4,6736971,6736971,6527274,37,345503813,9780346000000.0,Peter V. Brett,2010.0,The Desert Spear,"The Desert Spear (Demon Cycle, #2)",eng,4.24,53143,58608,1927,555,1691,7805,21853,26704,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
6,43944,1199,3,156534,156534,151061,19,1563899809,9781564000000.0,"Brian K. Vaughan, Pia Guerra, José Marzán Jr.",2003.0,"Y: The Last Man, Vol. 1: Unmanned","Y: The Last Man, Vol. 1: Unmanned",eng,4.12,81326,82315,2434,2462,3478,12272,27613,36490,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


In [101]:
en_us.shape

(90661, 25)

Let's drop unwanted columns:
> For recommendation system I only need user_id, rating, and title

In [102]:
recommend = en_us[['user_id', 'rating', 'title']]
recommend.head()

Unnamed: 0,user_id,rating,title
0,29403,5,Where the Wild Things Are
2,37112,5,Harry Potter and the Deathly Hallows (Harry Po...
3,16114,4,Sphere
4,50684,4,"The Desert Spear (Demon Cycle, #2)"
6,43944,3,"Y: The Last Man, Vol. 1: Unmanned"


In [103]:
recommend.shape

(90661, 3)

In [104]:
# recommend.to_csv('recommend.csv', index=False)

Actually... let's try to use all ratings!

In [105]:
ratings_and_books = pd.merge(ratings, books, how='left', on='book_id')
ratings_and_books.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,258,5,1232,1232,3209783,279,143034901.0,9780143000000.0,"Carlos Ruiz Zafón, Lucia Graves",2001.0,La sombra del viento,The Shadow of the Wind (The Cemetery of Forgot...,eng,4.24,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
1,2,4081,4,231,231,1005335,53,312424442.0,9780312000000.0,Tom Wolfe,2004.0,,I am Charlotte Simmons,en-US,3.4,19293,21580,1891,1481,2886,6557,6890,3766,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
2,2,260,5,4865,4865,2370171,353,,9780672000000.0,Dale Carnegie,1936.0,How to Win Friends and Influence People,How to Win Friends and Influence People,eng,4.13,282623,305437,7909,6734,13880,50529,94743,139551,https://images.gr-assets.com/books/1442726934m...,https://images.gr-assets.com/books/1442726934s...
3,2,9296,5,4887,4887,821815,75,465016901.0,9780465000000.0,"Alice Miller, Ruth Ward",1979.0,Das Drama des begabten Kindes und die Suche na...,The Drama of the Gifted Child: The Search for ...,en-GB,4.09,9563,10830,537,189,490,2025,3548,4578,https://images.gr-assets.com/books/1339395245m...,https://images.gr-assets.com/books/1339395245s...
4,2,2318,3,998,998,5210,43,671015206.0,9780671000000.0,"Thomas J. Stanley, William D. Danko",1995.0,The Millionaire Next Door: The Surprising Secr...,The Millionaire Next Door: The Surprising Secr...,eng,4.0,43937,46748,2049,645,2318,10382,16665,16738,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


Make sure the counts are all the same...

In [106]:
ratings_and_books.shape

(5976479, 25)

In [107]:
ratings_and_books.user_id.value_counts().count()

53424

In [108]:
ratings_and_books.book_id.value_counts().count()

10000

I only want books with an english language code

In [109]:
ratings_and_books.language_code.value_counts()

eng      4239947
en-US    1179179
en-GB     111743
en-CA      47106
spa        25033
fre        10815
ara        10543
ger         3837
ind         2649
jpn         1190
en          1171
por         1043
nl          1017
nor          963
pol          961
per          700
dan          479
ita          307
mul          195
vie          179
swe          129
fil           75
tur           67
rus           67
rum           64
Name: language_code, dtype: int64

In [110]:
en_us = ratings_and_books.loc[(ratings_and_books['language_code'] == 'en-US') | (ratings_and_books['language_code'] == 'eng')]
en_us.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,258,5,1232,1232,3209783,279,143034901.0,9780143000000.0,"Carlos Ruiz Zafón, Lucia Graves",2001.0,La sombra del viento,The Shadow of the Wind (The Cemetery of Forgot...,eng,4.24,263685,317554,24652,4789,11769,42214,101612,157170,https://images.gr-assets.com/books/1344545047m...,https://images.gr-assets.com/books/1344545047s...
1,2,4081,4,231,231,1005335,53,312424442.0,9780312000000.0,Tom Wolfe,2004.0,,I am Charlotte Simmons,en-US,3.4,19293,21580,1891,1481,2886,6557,6890,3766,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
2,2,260,5,4865,4865,2370171,353,,9780672000000.0,Dale Carnegie,1936.0,How to Win Friends and Influence People,How to Win Friends and Influence People,eng,4.13,282623,305437,7909,6734,13880,50529,94743,139551,https://images.gr-assets.com/books/1442726934m...,https://images.gr-assets.com/books/1442726934s...
4,2,2318,3,998,998,5210,43,671015206.0,9780671000000.0,"Thomas J. Stanley, William D. Danko",1995.0,The Millionaire Next Door: The Surprising Secr...,The Millionaire Next Door: The Surprising Secr...,eng,4.0,43937,46748,2049,645,2318,10382,16665,16738,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
5,2,26,4,968,968,2982101,350,307277674.0,9780307000000.0,Dan Brown,2003.0,The Da Vinci Code,"The Da Vinci Code (Robert Langdon, #2)",eng,3.79,1447148,1557292,41560,71345,126493,340790,539277,479387,https://images.gr-assets.com/books/1303252999m...,https://images.gr-assets.com/books/1303252999s...


In [111]:
en_us.shape

(5419126, 25)

Let's drop unwanted columns:
> For recommendation system I only need user_id, rating, and title

In [112]:
recommend = en_us[['user_id', 'rating', 'title']]
recommend.head()

Unnamed: 0,user_id,rating,title
0,1,5,The Shadow of the Wind (The Cemetery of Forgot...
1,2,4,I am Charlotte Simmons
2,2,5,How to Win Friends and Influence People
4,2,3,The Millionaire Next Door: The Surprising Secr...
5,2,4,"The Da Vinci Code (Robert Langdon, #2)"


In [113]:
recommend.shape

(5419126, 3)

In [114]:
# recommend.to_csv('ratings_and_books.csv', index=False)

### Cleaning for Book Rating Predictions:

In [115]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [116]:
books.shape

(10000, 23)

let's clean out all of the books that are not in english:

In [117]:
eng_books = books.loc[(books['language_code'] == 'en-US') | (books['language_code'] == 'eng')]
eng_books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [118]:
eng_books.shape

(8411, 23)

I want to build a model that I can use on my To Be Read list. The only information that I would have is the Author and Title of the book. I will need a df with just these features:

In [119]:
author_and_title = eng_books[['authors', 'title', 'average_rating']]
author_and_title.head()

Unnamed: 0,authors,title,average_rating
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44
2,Stephenie Meyer,"Twilight (Twilight, #1)",3.57
3,Harper Lee,To Kill a Mockingbird,4.25
4,F. Scott Fitzgerald,The Great Gatsby,3.89


In [120]:
author_and_title.authors.value_counts().count()

3976

> 3976 unique authors

In [121]:
author_and_title.title.value_counts().count()

8390

> 8390 unique titles

> When one hot encoded or Count Vectorized, this will equate to MANY features

In [122]:
author_and_title.average_rating = author_and_title.average_rating.round()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  author_and_title.average_rating = author_and_title.average_rating.round()


In [123]:
author_and_title.head()

Unnamed: 0,authors,title,average_rating
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.0
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.0
2,Stephenie Meyer,"Twilight (Twilight, #1)",4.0
3,Harper Lee,To Kill a Mockingbird,4.0
4,F. Scott Fitzgerald,The Great Gatsby,4.0


In [124]:
author_and_title.average_rating.value_counts()

4.0    8031
3.0     273
5.0     106
2.0       1
Name: average_rating, dtype: int64

In [126]:
# author_and_title.to_csv('authors_and_titles_all.csv', index=False)

I am first going to make a predictive model off of the top 1,000 most popular books and then increasingly add more data

In [52]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [54]:
top_1000 = books.loc[books['book_id'] <= 1000]
top_1000.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [56]:
top_1000.shape

(1000, 23)

In [57]:
top_1000 = top_1000[(top_1000['language_code'] == 'en-US') | (top_1000['language_code'] == 'eng')]
top_1000.shape

(937, 23)

In [128]:
top_1000 = top_1000[['authors', 'title', 'average_rating']]
top_1000.head()

Unnamed: 0,authors,title,average_rating
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.34
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.44
2,Stephenie Meyer,"Twilight (Twilight, #1)",3.57
3,Harper Lee,To Kill a Mockingbird,4.25
4,F. Scott Fitzgerald,The Great Gatsby,3.89


In [129]:
top_1000['average_rating'] = top_1000.average_rating.round()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_1000['average_rating'] = top_1000.average_rating.round()


In [130]:
top_1000.head()

Unnamed: 0,authors,title,average_rating
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.0
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.0
2,Stephenie Meyer,"Twilight (Twilight, #1)",4.0
3,Harper Lee,To Kill a Mockingbird,4.0
4,F. Scott Fitzgerald,The Great Gatsby,4.0


In [131]:
top_1000.average_rating.value_counts()

4.0    903
5.0     17
3.0     17
Name: average_rating, dtype: int64

After seeing the distirubtion of ratings, I have realized that including the top 1000 books is not going to equate to a fair model because it is not a random sample of the data

Instead, I will randomly sample 1000 books to start:

In [173]:
books.head(1)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...


In [174]:
book_sample_1000 = books.sample(n = 1195, replace = False, random_state=1002)

In [175]:
book_sample_1000.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
7545,7546,22192,22192,1588093,48,316912123,9780317000000.0,Cecily von Ziegesar,2003.0,All I Want Is Everything,"All I Want is Everything (Gossip Girl, #3)",eng,3.56,15467,16076,422,433,1879,5728,4270,3766,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
8923,8924,833550,833550,819179,29,067088278X,9780671000000.0,"Janet Ahlberg, Allan Ahlberg",1978.0,Each Peach Pear Plum,Each Peach Pear Plum,eng,4.24,12747,13148,431,161,479,2196,3564,6748,https://images.gr-assets.com/books/1367225641m...,https://images.gr-assets.com/books/1367225641s...
3926,3927,18108281,18108281,24749971,39,,,J.R. Ward,2014.0,The King,"The King (Black Dagger Brotherhood, #12)",eng,4.37,17398,38218,4240,265,957,4500,11290,21206,https://images.gr-assets.com/books/1371919011m...,https://images.gr-assets.com/books/1371919011s...
9923,9924,24769,24769,25576,33,60519592,9780061000000.0,Scott Westerfeld,2006.0,"Blue Noon (Midnighters, #3)","Blue Noon (Midnighters, #3)",eng,3.85,14053,15120,830,204,958,4140,5370,4448,https://images.gr-assets.com/books/1315537301m...,https://images.gr-assets.com/books/1315537301s...
4763,4764,17314410,17314410,23981204,10,,,Georgia Cates,2013.0,Beauty from Surrender,"Beauty from Surrender (Beauty, #2)",eng,4.13,31623,34393,2028,401,1331,6108,12239,14314,https://images.gr-assets.com/books/1367294723m...,https://images.gr-assets.com/books/1367294723s...


In [176]:
book_sample_1000.shape

(1195, 23)

In [177]:
book_sample_1000 = book_sample_1000[(book_sample_1000['language_code'] == 'en-US') | (book_sample_1000['language_code'] == 'eng')]
book_sample_1000.shape

(1001, 23)

In [178]:
book_sample_1000 = book_sample_1000[['authors', 'title', 'average_rating']]
book_sample_1000.head()

Unnamed: 0,authors,title,average_rating
7545,Cecily von Ziegesar,"All I Want is Everything (Gossip Girl, #3)",3.56
8923,"Janet Ahlberg, Allan Ahlberg",Each Peach Pear Plum,4.24
3926,J.R. Ward,"The King (Black Dagger Brotherhood, #12)",4.37
9923,Scott Westerfeld,"Blue Noon (Midnighters, #3)",3.85
4763,Georgia Cates,"Beauty from Surrender (Beauty, #2)",4.13


In [181]:
# book_sample_1000.average_rating.value_counts().sort_index()

> No ratings below 3

In [182]:
book_sample_1000['average_rating'] = book_sample_1000['average_rating'].round()

In [183]:
book_sample_1000.average_rating.value_counts()

4.0    972
3.0     20
5.0      9
Name: average_rating, dtype: int64

In [184]:
book_sample_1000.head()

Unnamed: 0,authors,title,average_rating
7545,Cecily von Ziegesar,"All I Want is Everything (Gossip Girl, #3)",4.0
8923,"Janet Ahlberg, Allan Ahlberg",Each Peach Pear Plum,4.0
3926,J.R. Ward,"The King (Black Dagger Brotherhood, #12)",4.0
9923,Scott Westerfeld,"Blue Noon (Midnighters, #3)",4.0
4763,Georgia Cates,"Beauty from Surrender (Beauty, #2)",4.0


In [188]:
book_sample_1000.authors.value_counts().count()

787

In [190]:
book_sample_1000.title.value_counts().count()

1001

In [186]:
# book_sample_1000.to_csv('book_sample_1000.csv', index=False)

#### Creating a df for additional models:

In [195]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [197]:
books_by_isbn = books[['isbn', 'original_publication_year', 'language_code', 'average_rating', 'ratings_count', 'ratings_1' ,'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5']]
books_by_isbn.head()

Unnamed: 0,isbn,original_publication_year,language_code,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,439023483,2008.0,eng,4.34,4780653,66715,127936,560092,1481305,2706317
1,439554934,1997.0,eng,4.44,4602479,75504,101676,455024,1156318,3011543
2,316015849,2005.0,en-US,3.57,3866839,456191,436802,793319,875073,1355439
3,61120081,1960.0,eng,4.25,3198671,60427,117415,446835,1001952,1714267
4,743273567,1925.0,eng,3.89,2683664,86236,197621,606158,936012,947718


In [198]:
books_by_isbn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       9300 non-null   object 
 1   original_publication_year  9979 non-null   float64
 2   language_code              8916 non-null   object 
 3   average_rating             10000 non-null  float64
 4   ratings_count              10000 non-null  int64  
 5   ratings_1                  10000 non-null  int64  
 6   ratings_2                  10000 non-null  int64  
 7   ratings_3                  10000 non-null  int64  
 8   ratings_4                  10000 non-null  int64  
 9   ratings_5                  10000 non-null  int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 781.4+ KB


In [199]:
books_by_isbn = books_by_isbn.loc[(books_by_isbn['language_code'] == 'en-US') | (books_by_isbn['language_code'] == 'eng')]
books_by_isbn.head()

Unnamed: 0,isbn,original_publication_year,language_code,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,439023483,2008.0,eng,4.34,4780653,66715,127936,560092,1481305,2706317
1,439554934,1997.0,eng,4.44,4602479,75504,101676,455024,1156318,3011543
2,316015849,2005.0,en-US,3.57,3866839,456191,436802,793319,875073,1355439
3,61120081,1960.0,eng,4.25,3198671,60427,117415,446835,1001952,1714267
4,743273567,1925.0,eng,3.89,2683664,86236,197621,606158,936012,947718


In [200]:
books_by_isbn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8411 entries, 0 to 9998
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7864 non-null   object 
 1   original_publication_year  8396 non-null   float64
 2   language_code              8411 non-null   object 
 3   average_rating             8411 non-null   float64
 4   ratings_count              8411 non-null   int64  
 5   ratings_1                  8411 non-null   int64  
 6   ratings_2                  8411 non-null   int64  
 7   ratings_3                  8411 non-null   int64  
 8   ratings_4                  8411 non-null   int64  
 9   ratings_5                  8411 non-null   int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 722.8+ KB


In [204]:
books_by_isbn = books_by_isbn.dropna()

In [205]:
books_by_isbn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7851 entries, 0 to 9998
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7851 non-null   object 
 1   original_publication_year  7851 non-null   float64
 2   language_code              7851 non-null   object 
 3   average_rating             7851 non-null   float64
 4   ratings_count              7851 non-null   int64  
 5   ratings_1                  7851 non-null   int64  
 6   ratings_2                  7851 non-null   int64  
 7   ratings_3                  7851 non-null   int64  
 8   ratings_4                  7851 non-null   int64  
 9   ratings_5                  7851 non-null   int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 674.7+ KB


In [206]:
books_by_isbn.average_rating = books_by_isbn.average_rating.round()

In [207]:
books_by_isbn.head()

Unnamed: 0,isbn,original_publication_year,language_code,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,439023483,2008.0,eng,4.0,4780653,66715,127936,560092,1481305,2706317
1,439554934,1997.0,eng,4.0,4602479,75504,101676,455024,1156318,3011543
2,316015849,2005.0,en-US,4.0,3866839,456191,436802,793319,875073,1355439
3,61120081,1960.0,eng,4.0,3198671,60427,117415,446835,1001952,1714267
4,743273567,1925.0,eng,4.0,2683664,86236,197621,606158,936012,947718


In [208]:
books_by_isbn.average_rating.value_counts()

4.0    7493
3.0     264
5.0      93
2.0       1
Name: average_rating, dtype: int64

In [209]:
books_by_isbn.shape

(7851, 10)

In [211]:
# books_by_isbn.to_csv('books_by_isbn.csv', index=False)

# Cleaning for Binary Classification:

In [5]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   book_id                    10000 non-null  int64  
 1   goodreads_book_id          10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [7]:
books.language_code.fillna('unknown', inplace=True)

In [8]:
books = books.loc[(books['language_code'] == 'en-US') | (books['language_code'] == 'eng')]

In [9]:
books.shape

(8411, 23)

In [11]:
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8411 entries, 0 to 9998
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   book_id                    8411 non-null   int64  
 1   goodreads_book_id          8411 non-null   int64  
 2   best_book_id               8411 non-null   int64  
 3   work_id                    8411 non-null   int64  
 4   books_count                8411 non-null   int64  
 5   isbn                       7864 non-null   object 
 6   isbn13                     7940 non-null   float64
 7   authors                    8411 non-null   object 
 8   original_publication_year  8396 non-null   float64
 9   original_title             7948 non-null   object 
 10  title                      8411 non-null   object 
 11  language_code              8411 non-null   object 
 12  average_rating             8411 non-null   float64
 13  ratings_count              8411 non-null   int64

In [12]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [13]:
books = books[['isbn', 'authors', 'original_publication_year', 'title', 'average_rating', 'ratings_count', 'ratings_1', 'ratings_2' , 'ratings_3', 'ratings_4', 'ratings_5']]

In [14]:
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8411 entries, 0 to 9998
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7864 non-null   object 
 1   authors                    8411 non-null   object 
 2   original_publication_year  8396 non-null   float64
 3   title                      8411 non-null   object 
 4   average_rating             8411 non-null   float64
 5   ratings_count              8411 non-null   int64  
 6   ratings_1                  8411 non-null   int64  
 7   ratings_2                  8411 non-null   int64  
 8   ratings_3                  8411 non-null   int64  
 9   ratings_4                  8411 non-null   int64  
 10  ratings_5                  8411 non-null   int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 788.5+ KB


In [24]:
def map_avg(avg):
    if avg >= 4:
        return 1 #Yes
    else:
        return 0 #No
    
books['should_i_read'] = books['average_rating'].apply(map_avg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['should_i_read'] = books['average_rating'].apply(map_avg)


In [25]:
books.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0


In [26]:
books.should_i_read.value_counts()

1    4497
0    3914
Name: should_i_read, dtype: int64

In [30]:
books.isna().sum()

isbn                         547
authors                        0
original_publication_year     15
title                          0
average_rating                 0
ratings_count                  0
ratings_1                      0
ratings_2                      0
ratings_3                      0
ratings_4                      0
ratings_5                      0
should_i_read                  0
dtype: int64

In [31]:
books.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books.dropna(inplace=True)


In [32]:
books.isna().sum()

isbn                         0
authors                      0
original_publication_year    0
title                        0
average_rating               0
ratings_count                0
ratings_1                    0
ratings_2                    0
ratings_3                    0
ratings_4                    0
ratings_5                    0
should_i_read                0
dtype: int64

In [33]:
books.shape

(7851, 12)

In [35]:
# books.to_csv('books_for_binary_classification.csv', index=False)