# Imports:

In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [131]:
# Remove pandas' display caps so every column and every row is rendered
# (None means "no limit" for both options).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

#### Read in the data:

In [132]:
# Folder that holds the raw CSV files; change it here if the data lives elsewhere.
DATA_DIR = 'ignore'

# Load each raw table into its own DataFrame.
book_tags = pd.read_csv(DATA_DIR + '/book_tags.csv')
books = pd.read_csv(DATA_DIR + '/books.csv')
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')
tags = pd.read_csv(DATA_DIR + '/tags.csv')
to_read = pd.read_csv(DATA_DIR + '/to_read.csv')

In [133]:
# Print (rows, columns) for each raw table, in the order they were loaded.
for raw_table in (book_tags, books, ratings, tags, to_read):
    print(raw_table.shape)

(999912, 3)
(10000, 23)
(5976479, 3)
(34252, 2)
(912705, 2)


### Cleaning for Recommender System:

More data is generally better, but to keep memory usage manageable for this project I am going to randomly sample 50,000 of the more than 5 million ratings in ratings.csv.

In [134]:
# Missing values per column of the full ratings table (isnull is an alias of isna).
ratings.isnull().sum()

user_id    0
book_id    0
rating     0
dtype: int64

> There are no missing values, so I cannot shrink the dataset simply by dropping rows with nulls

In [135]:
# Number of distinct users in the full ratings table.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings['user_id'].nunique()

53424

> 53,424 unique users

In [136]:
# Number of distinct books in the full ratings table.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings['book_id'].nunique()

10000

> 10,000 unique books

In [137]:
# Draw 50,000 ratings without replacement. The fixed random_state makes the
# draw, and every count derived from it, repeatable across kernel restarts;
# without it each run produces a different sample.
ratings_sample = ratings.sample(n=50000, replace=False, random_state=1002)

In [138]:
# Preview the first five sampled ratings.
ratings_sample.head(n=5)

Unnamed: 0,user_id,book_id,rating
4843805,30551,3673,5
5474624,46330,1022,4
5555146,47003,6588,3
74080,2321,1797,4
3670960,42923,99,2


In [139]:
# Missing values per column of the sample (isnull is an alias of isna).
ratings_sample.isnull().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [140]:
# Number of distinct users in the sample.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_sample['user_id'].nunique()

31967

> 31,967 unique users in this sample, so some users appear more than once

In [141]:
# Number of distinct books in the sample.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_sample['book_id'].nunique()

8447

> 8,447 unique books... I want all 10,000 books included so I may re-sample

In [142]:
# Number of distinct rating values present in the sample.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_sample['rating'].nunique()

5

> All 5 different ratings are represented

Let's try a resample with 100,000 instead...

In [143]:
# Redraw with 100,000 rows (seeded, without replacement); this overwrites the
# earlier, smaller sample held in ratings_sample.
ratings_sample = ratings.sample(
    n=100000,
    replace=False,
    random_state=1002,
)

In [144]:
# Preview the first five rows of the 100,000-row sample.
ratings_sample.head(n=5)

Unnamed: 0,user_id,book_id,rating
2186601,29403,102,5
2858642,36128,4663,4
4280792,37112,25,5
1026306,16114,693,4
5672837,50684,1731,4


In [145]:
# Missing values per column of the 100,000-row sample (isnull is an alias of isna).
ratings_sample.isnull().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [146]:
# Number of distinct users in the 100,000-row sample.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_sample['user_id'].nunique()

44364

> 44,364 unique users

In [147]:
# Number of distinct books in the 100,000-row sample.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_sample['book_id'].nunique()

9523

> 9,523 unique books... this is not the full 10,000, but it is close enough for my liking

In [148]:
# Sanity check: confirm that some of my favorite books are in this sample, because a good recommender system should include my favorites.
# ratings_sample.loc[ratings_sample['book_id'] == 30]

In [179]:
# (rows, columns) of the current ratings sample.
ratings_sample.shape

(100000, 3)

Now, let's merge the `books` DataFrame into the ratings sample

In [157]:
# Attach the book metadata to every sampled rating.
# how='left' keeps each rating row even when no book matches.
# validate='many_to_one' makes pandas raise a MergeError if book_id is ever
# duplicated in `books`, which would otherwise silently multiply rating rows.
ratings_and_books = ratings_sample.merge(
    books,
    how='left',
    on='book_id',
    validate='many_to_one',
)
ratings_and_books.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,29403,102,5,19543,19543,3020535,110,99408392,9780099000000.0,Maurice Sendak,1963.0,Where the Wild Things Are,Where the Wild Things Are,eng,4.22,620618,636061,9102,15392,27532,93700,167043,332394,https://images.gr-assets.com/books/1384434560m...,https://images.gr-assets.com/books/1384434560s...
1,36128,4663,4,64081,64081,2827103,19,310266300,9780310000000.0,Shane Claiborne,2006.0,The Irresistible Revolution: Living as an Ordi...,The Irresistible Revolution: Living as an Ordi...,,4.07,18566,19186,953,409,977,3435,6486,7879,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
2,37112,25,5,136251,136251,2963218,263,545010225,9780545000000.0,"J.K. Rowling, Mary GrandPré",2007.0,Harry Potter and the Deathly Hallows,Harry Potter and the Deathly Hallows (Harry Po...,eng,4.61,1746574,1847395,51942,9363,22245,113646,383914,1318227,https://images.gr-assets.com/books/1474171184m...,https://images.gr-assets.com/books/1474171184s...
3,16114,693,4,455373,455373,2651694,122,345418972,9780345000000.0,Michael Crichton,1987.0,Sphere,Sphere,eng,3.77,128244,135005,2313,1851,9728,40085,49513,33828,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
4,50684,1731,4,6736971,6736971,6527274,37,345503813,9780346000000.0,Peter V. Brett,2010.0,The Desert Spear,"The Desert Spear (Demon Cycle, #2)",eng,4.24,53143,58608,1927,555,1691,7805,21853,26704,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


Make sure the counts are all the same...

In [158]:
# (rows, columns) after the merge; the row count should match the sample's.
ratings_and_books.shape

(100000, 25)

In [159]:
# Number of distinct users after the merge.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_and_books['user_id'].nunique()

44364

In [160]:
# Number of distinct books after the merge.
# nunique() counts distinct non-null values directly, without first building
# the whole frequency table that value_counts() produces.
ratings_and_books['book_id'].nunique()

9523

I only want books with an English language code

In [161]:
# Rating rows per language code, most frequent first.
# value_counts() leaves out rows whose language_code is missing by default.
ratings_and_books['language_code'].value_counts()

eng      70983
en-US    19678
en-GB     1911
en-CA      771
spa        391
ara        182
fre        178
ger         57
ind         38
en          26
jpn         25
pol         23
nor         18
por         15
nl          14
per         10
ita          8
dan          7
vie          5
mul          3
rum          3
fil          2
swe          1
tur          1
Name: language_code, dtype: int64

In [173]:
# Keep only the rows whose language code is 'en-US' or 'eng'; every other code,
# and any row with no code, is filtered out.
# NOTE(review): 'en-GB', 'en-CA' and 'en' are not kept here — confirm that is intended.
wanted_codes = ['en-US', 'eng']
is_wanted = ratings_and_books['language_code'].isin(wanted_codes)
en_us = ratings_and_books.loc[is_wanted]
en_us.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,29403,102,5,19543,19543,3020535,110,99408392,9780099000000.0,Maurice Sendak,1963.0,Where the Wild Things Are,Where the Wild Things Are,eng,4.22,620618,636061,9102,15392,27532,93700,167043,332394,https://images.gr-assets.com/books/1384434560m...,https://images.gr-assets.com/books/1384434560s...
2,37112,25,5,136251,136251,2963218,263,545010225,9780545000000.0,"J.K. Rowling, Mary GrandPré",2007.0,Harry Potter and the Deathly Hallows,Harry Potter and the Deathly Hallows (Harry Po...,eng,4.61,1746574,1847395,51942,9363,22245,113646,383914,1318227,https://images.gr-assets.com/books/1474171184m...,https://images.gr-assets.com/books/1474171184s...
3,16114,693,4,455373,455373,2651694,122,345418972,9780345000000.0,Michael Crichton,1987.0,Sphere,Sphere,eng,3.77,128244,135005,2313,1851,9728,40085,49513,33828,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
4,50684,1731,4,6736971,6736971,6527274,37,345503813,9780346000000.0,Peter V. Brett,2010.0,The Desert Spear,"The Desert Spear (Demon Cycle, #2)",eng,4.24,53143,58608,1927,555,1691,7805,21853,26704,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
6,43944,1199,3,156534,156534,151061,19,1563899809,9781564000000.0,"Brian K. Vaughan, Pia Guerra, José Marzán Jr.",2003.0,"Y: The Last Man, Vol. 1: Unmanned","Y: The Last Man, Vol. 1: Unmanned",eng,4.12,81326,82315,2434,2462,3478,12272,27613,36490,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


In [174]:
# (rows, columns) left after the language filter.
en_us.shape

(90661, 25)

Let's drop unwanted columns:
> For the recommendation system I only need user_id, rating, and title

In [175]:
# Narrow the filtered frame to the three columns kept for the recommender.
keep_columns = ['user_id', 'rating', 'title']
recommend = en_us.loc[:, keep_columns]
recommend.head()

Unnamed: 0,user_id,rating,title
0,29403,5,Where the Wild Things Are
2,37112,5,Harry Potter and the Deathly Hallows (Harry Po...
3,16114,4,Sphere
4,50684,4,"The Desert Spear (Demon Cycle, #2)"
6,43944,3,"Y: The Last Man, Vol. 1: Unmanned"


In [176]:
# (rows, columns) of the final recommender input.
recommend.shape

(90661, 3)

In [178]:
# Uncomment to write the cleaned frame to recommend.csv (without the index column):
# recommend.to_csv('recommend.csv', index=False)