# Book Ratings Data Collection & Cleaning

### Import libraries and datasets

In [1]:
import pandas as pd
import numpy as np

Dataset available at: 
    https://www.kaggle.com/bahramjannesarr/goodreads-book-datasets-10m?select=user_rating_0_to_1000.csv

In [2]:
#import raw data
books = pd.read_csv('./data/raw/user_rating_0_to_1000.csv')
books.head(2)

Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing


In [3]:
#set 1-5 values for user reviews
books['Rating'].value_counts()

really liked it                      20282
it was amazing                       14108
liked it                             13155
it was ok                             3349
did not like it                        748
This user doesn't have any rating      303
Name: Rating, dtype: int64

In [4]:
books['Rating'].replace('it was amazing', 5, inplace=True)
books['Rating'].replace('really liked it', 4, inplace=True)
books['Rating'].replace('liked it', 3, inplace=True)
books['Rating'].replace('it was ok', 2, inplace=True)
books['Rating'].replace('did not like it', 1, inplace=True)
books['Rating'].replace("This user doesn't have any rating", 0, inplace=True)

In [5]:
books['Rating'].value_counts()

4    20282
5    14108
3    13155
2     3349
1      748
0      303
Name: Rating, dtype: int64

In [6]:
#create book ratings dataframe for app
# https://stackoverflow.com/questions/42350029/assign-a-number-to-each-unique-value-in-a-list
book_names = books['Name']
dictionary = {i: indi for indi, i in enumerate(set(book_names))}
numbers = [dictionary[n] for n in book_names]
books['book_id'] = numbers
books.sort_values(by=['book_id'])

Unnamed: 0,ID,Name,Rating,book_id
11148,227,"Your Native Land, Your Life",5,0
48097,913,Píldoras azules,3,1
1213,5,The Mysterious Stranger and Other Stories,5,2
32388,597,The Real Frank Zappa Book,5,3
34842,666,The Real Frank Zappa Book,3,3
...,...,...,...,...
37784,696,Blankets,3,24092
34776,664,Blankets,5,24092
48271,913,Blankets,4,24092
4963,103,Schiffbruch mit Tiger,5,24093


In [8]:
#create recommender dataframe, reorder for streamlit app
recommender_df = books[['Name', 'book_id', 'ID', 'Rating']]
recommender_df.rename(columns={'ID': 'user_id', 'Name': 'book_name', 'Rating': 'rating'}, inplace=True)

In [9]:
recommender_df.head(2)

Unnamed: 0,book_name,book_id,user_id,rating
0,Agile Web Development with Rails: A Pragmatic ...,13083,1,5
1,The Restaurant at the End of the Universe (Hit...,12462,1,5


In [10]:
#export dataframe
recommender_df.to_csv('./data/clean/books_df.csv')