# Data preparation 
Import the necessary libraries and load the datasets.

In [3]:
# Libraries for data preparation & visualization
import numpy as np
import seaborn as sns
import pandas as pd
import plotly.io as pio
import matplotlib.pyplot as plt
pio.renderers.default = "png"

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")

In [4]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df

book   = loaddata("./../BX-Books")
user   = loaddata("../../BX-Users")
rating = loaddata("../../BX-Book-Ratings")

FileNotFoundError: [Errno 2] No such file or directory: './../BX-Books.csv'

# Data Description

In [None]:
print("shape of Users :",user.shape)
print("shape of books :",book.shape)
print("shape of ratings :",rating.shape)

In [None]:
book

In [None]:
user

In [None]:
rating

In [None]:
book.info()

In [None]:
rating.info()

In [None]:
user.info()

In [None]:
# Check for duplicate values
print(f'Duplicate entries book: {book.duplicated().sum()}')
print(f'Duplicate entries rating: {rating.duplicated().sum()}')
print(f'Duplicate entries user: {user.duplicated().sum()}')

In [None]:
# Missing value 
print(book.isnull().sum())
print("="*35)
print(rating.isnull().sum())
print("="*35)
print(user.isnull().sum())

# Data cleaning
Checking and cleaning book data

In [None]:
book['Year-Of-Publication'].unique()

In [None]:
# Extracting and fixing mismatch in feature 'year_of_publication', 'publisher', 'book_author', 'book_title'
book[book['Year-Of-Publication'] == 'DK Publishing Inc'] 

In [None]:
book[book['Year-Of-Publication'] == 'Gallimard']

In [None]:
# function to fix mismatch data in feature 'book_title', 'book_author', ' year_of_publication', 'publisher'
def replace_df_value(df, idx, col_name, val):
    df.loc[idx, col_name] = val
    return df
replace_df_value(book, 209538, 'Book-Title', 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)')
replace_df_value(book, 209538, 'Book-Author', 'Michael Teitelbaum')
replace_df_value(book, 209538, 'Year-Of-Publication', 2000)
replace_df_value(book, 209538, 'Publisher', 'DK Publishing Inc')

replace_df_value(book, 221678, 'Book-Title', 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)')
replace_df_value(book, 221678, 'Book-Author', 'James Buckley')
replace_df_value(book, 221678, 'Year-Of-Publication', 2000)
replace_df_value(book, 221678, 'Publisher', 'DK Publishing Inc')

replace_df_value(book, 220731,'Book-Title', "Peuple du ciel, suivi de 'Les Bergers")
replace_df_value(book, 220731, 'Book-Author', 'Jean-Marie Gustave Le ClÃ?Â©zio')
replace_df_value(book, 220731, 'Year-Of-Publication', 2003)
replace_df_value(book, 220731, 'Publisher', 'Gallimard')

In [None]:
#since year data has some object it it, we shall convert it into null data
book['Year-Of-Publication'] = pd.to_numeric(book['Year-Of-Publication'],errors='coerce')
book['Year-Of-Publication'].isna().sum()

In [None]:
#since year data has the year 0 and 2022 which is invalid, we shall convert it into null data
book.loc[(book['Year-Of-Publication'] > 2022) | (book['Year-Of-Publication'] == 0), 'year'] = np.NAN
#Replacing null data with median 
book['Year-Of-Publication'].fillna(book['year'].median() , inplace = True)
book['Year-Of-Publication'].isna().sum()


In [None]:
book['Year-Of-Publication'].unique()

In [None]:
#Finding and replacing null data from publisher
book.loc[book['Publisher'].isna()]


In [None]:
#Replacing null data from publisher
book['Publisher'].fillna('other' , inplace = True)
book['Publisher'].isna().sum()

In [None]:
#Finding and replacing null data from author
book.loc[book['Book-Author'].isna()]

In [None]:
#replacing null data from publisher
book['Book-Author'].fillna("Unknown" , inplace = True)
book['Book-Author'].isna().sum()

Checking and cleaning user data

In [None]:
print(sorted(user['Age'].unique()))


In [None]:
#removing age above 120 and below 4
user.loc[(user['Age'] > 120) | (user['Age'] < 4) , 'Age' ] = np.NAN
user['Age'].isna().sum()

In [None]:
#Filling the null values with mean
user['Age'].fillna(user['Age'].mean(), inplace = True)
user.duplicated().sum()

In [None]:
print(sorted(user['Age'].unique()))

# Data processing 

In [None]:
#Preprocessing Data
book = book[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
book.rename(columns = {'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)
user.rename(columns = {'User-ID':'user_id', 'Location':'location', 'Age':'age'}, inplace=True)
rating.rename(columns = {'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)

# Exploratory data analysist
Analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods


## Rating distribution

In [None]:
plt.figure(figsize=[8,5])
plt.rc('font', size=12)
plt.title('\nRating counts\n')
sns.countplot(x='rating',data=rating, palette = 'Set2')

<ul>
<li>This countplot shows users have rated 0 the most, which means they haven't rated books at all.
<li>Still we can see pattern to recognize in ratings from 1-10.
<li>
    Mostly the users have rated 8 ratings out of 10 as per books. It might happen that the feedback is positive but not extremely positive as 10 ratings (i.e best books ever).

In [None]:
#ratings should only exist for the books given in the dataset
ratings_new=rating[rating.ISBN.isin(book.ISBN)]
print(rating.shape)
print(ratings_new.shape)

In [None]:
# ratings with 0 rating and without 0 rating ( in rating data includes books given in the dataset )
ratings_explicit=ratings_new[ratings_new['rating']!=0]
ratings_implicit=ratings_new[ratings_new['rating']==0]
print(ratings_new.shape)
print(ratings_explicit.shape)
print(ratings_implicit.shape)

In [None]:
# most popular rating without 0 rating
# Visualising Explicit Rating Counts
plt.figure(figsize=[8,5])
plt.rc('font', size=12)
plt.title('\nMost popular ratings\n')
sns.countplot(data=ratings_explicit, x='rating', palette='Set2')

#### Now this countplot of bookRating indicates that higher ratings are more common amongst users and rating 8 has been rated highest number of times. There can be many assumptions based on ratings of users:
<ul>
<li>Let's take ratings group from 1-4. This can be negative impact for books been published if they have ratings from 1 to 4. It can be issues related to - 1. Language 2. Offend by any chapter's incident/paragraph/Author 3. They've read worst book ever.

<li>If we think analytical about rating 5, it might happen some same reason as above key points mention.

<li>For 5 ratings the users might not sure about book ratings whether it's positive or negative impact.

<li>Let's take ratings group from 6-10. This are positive feedback - 1. It can happen that not every book is perfect in all desire. So, the user's have decided to rate 8.
2. Since 6 ratings is very low among other ratings. 3. As we can aspect 7 and 8 are average and more ratings from users. 4. 9 and 10 ratings are top best ratings based on Author's, Publisher's and Books been published.



In [None]:
plt.figure(figsize=(8,6))
user.age.hist(bins=[10*i for i in range(1, 10)])     
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

  Looking towards the users age between $30-40$ prefer more and somewhat we can also view between $20-30$. Let's make some hypothesis analysis:
<ul>
<li>It is obvious that most of the user books are from Age $30$ to $40$.
<li>It might happen that the users are more interested on that subject what Authors are publishing in the market.
<li>The age group between $20-30$ are immensely attracted to read books published by Author.
<li>We can observe same pitch for Age group between $10-20$ and $50-60$. There are can be lot of different reasons.

In [None]:
#Merge ratings with books
rating_with_books = ratings_new.merge(book, on='ISBN')
rating_with_books.head()

In [None]:
rating_with_books.groupby('title')['rating'].count().reset_index().sort_values(by='rating', ascending=False)[:10]

In [None]:
# top 10 most popular books

popular = rating_with_books.groupby('title')['rating'].count().reset_index().sort_values(by='rating', ascending=False)[:10]
popular.columns = ['title', 'Count']

plt.figure(figsize=[8, 5])
plt.rc('font', size=12)
plt.title('\nMost popular books\n')
sns.barplot(data=popular, y='title', x='Count',palette='Set2')

In [None]:
# most popular book authors
author = rating_with_books.groupby('author')['rating'].count().reset_index().sort_values(by='rating', ascending=False)[:10]
plt.figure(figsize=[8, 5])
plt.rc('font', size=12)
plt.title('\nMost popular Authors\n')
sns.barplot(data=author, y='author', x='rating',palette='Set2')

In [None]:
# Authors with the most books
authors = book['author'].value_counts()
top_authors = authors.head(10).reset_index()
top_authors.columns = ['authors', 'count']
plt.figure(figsize=[8, 5])
plt.rc('font', size=12)
plt.title('\nTop 10 authors with the most books\n')
sns.barplot(x=top_authors['count'], y=top_authors['authors'],
            palette='Set2', )

Agatha Christie is leading at top with more than 600 counts, followed by William Shakespeare. We can plot some hypothesis point<br>
<ul>
<li>It can happen in some possible cases that Agatha Christie is not a best Author, though Agatha Christie has most number of books as compared to others.
<li>William Shakespeare is one of the popular Author in the world. Still he doesn't have highest number of books.
<li>Among all other Authors, it might happen that few of the Author might have some of the best seller books who have millions of copies been sold in world.
</ul>

In [None]:
# Publishers with the most books
publishers = book['publisher'].value_counts()
top_publishers = publishers.head(10).reset_index()
top_publishers.columns = ['publisher', 'count']
plt.figure(figsize=[8, 5])
plt.rc('font', size=12)
plt.title('\nTop 10 publishers with the most books\n')
sns.barplot(x=top_publishers['count'], y=top_publishers['publisher'],
            palette='Set2', )

In [None]:
book()