In [None]:
# --- Imports -----------------------------------------------------------------
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Keep the old
# location as a fallback so the notebook still runs on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE(review): `filterwarnings` inserts each new rule at the front of the filter
# list, so the 'once' rule registered last takes precedence over 'ignore'.
# Both calls are kept to preserve the existing behaviour; drop one of them once
# the intended policy (silence everything vs. show each warning once) is decided.
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='once')

In [None]:
def _read_semicolon_csv(path):
    """Load one of the ';'-separated, latin-1 encoded dataset files.

    Malformed rows are dropped with a warning instead of aborting the load.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the header row taken from the file.
    """
    shared_options = dict(sep=';', encoding='latin-1')
    try:
        # `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`
        # (with the default `warn_bad_lines=True`): skip the row, emit a warning.
        return pd.read_csv(path, on_bad_lines='warn', **shared_options)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
        return pd.read_csv(path, error_bad_lines=False, **shared_options)


books = _read_semicolon_csv('books.csv')
users = _read_semicolon_csv('users.csv')
rating = _read_semicolon_csv('rating.csv')

In [None]:
# Plain-text preview of the first rows of each raw table (books, ratings, users).
for raw_frame in (books, rating, users):
    print(raw_frame.head())

In [None]:
# Replace the raw CSV headers with the camelCase names used in the rest of the notebook.
book_columns = [
    'ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication',
    'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL',
]
rating_columns = ['userId', 'ISBN', 'bookRating']
user_columns = ['userId', 'location', 'age']

books.columns = book_columns
rating.columns = rating_columns
users.columns = user_columns

In [None]:
# Confirm the new column names on the first five rows (5 is the default for head()).
books.head()

In [None]:
# Check the column dtypes of `books` to spot columns holding malformed / inconsistent entries.
# The display option set right after this print widens columns so that full cell text is shown.
print(books.dtypes)
# Show the full text of every cell instead of truncating long strings.
# `None` means "no limit"; the historical spelling `-1` has been deprecated since
# pandas 1.0 and is rejected by newer releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts integers for this option; -1 was "no limit" there.
    pd.set_option('display.max_colwidth', -1)

In [None]:
# With the column-width limit lifted, each cell now shows its complete text.
books.head()

In [None]:
## Inspect the distinct yearOfPublication values.
## Two of them are not years at all: 'DK Publishing Inc' and 'Gallimard'
## (the two cells that follow look up the affected rows).
books.yearOfPublication.unique()

In [None]:
# Rows whose yearOfPublication cell contains the text 'DK Publishing Inc'.
year_is_dk = books.yearOfPublication == 'DK Publishing Inc'
books.loc[year_is_dk]

In [None]:
# Rows whose yearOfPublication cell contains the text 'Gallimard'.
year_is_gallimard = books.yearOfPublication == 'Gallimard'
books.loc[year_is_gallimard]

In [None]:
## Repair the rows found above by hand. For these ISBNs the year column held a
## publisher name, so year, author, publisher and title are all overwritten with
## the correct values.
## (isbn, yearOfPublication, bookAuthor, publisher, bookTitle)
manual_corrections = [
    ('078946697X', 2000, 'Michael Teitelbaum', 'DK Publishing Inc',
     'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'),
    ('0789466953', 2000, 'James Buckley', 'DK Publishing Inc',
     'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'),
    # The author used to be written as the double-encoded 'ClÃ?Â©zio'; store the
    # properly decoded name instead.
    ('2070426769', 2003, 'Jean-Marie Gustave Le Clézio', 'Gallimard',
     "Peuple du ciel, suivi de 'Les Bergers"),
]
corrected_columns = ('yearOfPublication', 'bookAuthor', 'publisher', 'bookTitle')

for isbn, *new_values in manual_corrections:
    row_mask = books.ISBN == isbn
    # Assign one column at a time so the scalar is broadcast to however many
    # rows carry this ISBN.
    for column, value in zip(corrected_columns, new_values):
        books.loc[row_mask, column] = value

In [None]:
# Re-list the distinct year values after the manual corrections.
books.yearOfPublication.unique()

In [None]:
# Column dtypes of `books` before yearOfPublication is converted to a numeric type.
books.dtypes

In [None]:
# yearOfPublication is still stored with object dtype; parse it into numbers
# with pandas' to_numeric.
books['yearOfPublication'] = pd.to_numeric(books['yearOfPublication'])

In [None]:
books.dtypes
# yearOfPublication should now be listed with an integer dtype.

In [None]:
# Column dtypes of the `users` table.
users.dtypes

In [None]:
# Column dtypes of the `rating` table.
rating.dtypes

In [None]:
# All dtypes (data types) are now correct, so this part of the cleaning process is done.

In [None]:
# Largest publication year present in the column.
distinct_years = books.yearOfPublication.unique()
distinct_years.max()

In [None]:
# The publication years run up to 2050. This analysis was written in 2018, so any
# later year (and the placeholder 0) is treated as invalid: such values are
# blanked out and replaced with the rounded mean of the remaining years.
MAX_VALID_YEAR = 2018

publication_year = books['yearOfPublication']
invalid_year = (publication_year > MAX_VALID_YEAR) | (publication_year == 0)

# `mask` returns a new Series with the invalid entries set to NaN. This avoids
# writing NaN into an integer column in place, and the `np.NaN` alias that
# NumPy 2.0 removed.
publication_year = publication_year.mask(invalid_year)

# Assign the result back to the frame rather than calling `fillna(inplace=True)`
# on `books.yearOfPublication`: an in-place call on a column accessed that way is
# not guaranteed to reach the parent DataFrame (copy-on-write).
books['yearOfPublication'] = (
    publication_year
    .fillna(round(publication_year.mean()))
    .astype(np.int32)
)
books.yearOfPublication.unique()

In [None]:
# List the distinct publisher values to look for missing entries.
publisher_names = books.publisher.unique()
print(publisher_names)

In [None]:
# Show every book whose publisher is missing.
publisher_missing = books.publisher.isnull()
print(books.loc[publisher_missing])

In [None]:
# Give these two ISBNs the placeholder publisher 'noDetailsFound'.
isbns_to_patch = ['193169656X', '1931696993']
books.loc[books.ISBN.isin(isbns_to_patch), 'publisher'] = 'noDetailsFound'

In [None]:
## Re-run the null check. An empty DataFrame here means every missing publisher
## has been replaced with 'noDetailsFound'.
still_missing = books.publisher.isnull()
print(books.loc[still_missing])

In [None]:
# Distinct values found in the users' age column.
distinct_ages = users.age.unique()
print(distinct_ages)

In [None]:
# Display the screenshot ageError.JPG (loaded by relative path, so the file must
# sit in the notebook's working directory).
# Some recorded ages go beyond 120, which is implausible in practice.
# There are also NaN values in the column.
# Any age below 1 or above 100 will be treated as invalid.
from IPython.display import Image
Image(filename='ageError.JPG')

In [None]:
# Ages above 100 or below 1 are treated as invalid: blank them out together with
# the values that were already missing, then impute the mean of the remaining ages.
# The mean is rounded so that imputed ages are whole years, matching how
# yearOfPublication is imputed. Filling with the raw mean would leave a single
# fractional "age" in the column, and a later rounded fill could never apply.
age = users['age']
implausible_age = (age > 100) | (age < 1)
age = age.mask(implausible_age)
users['age'] = age.fillna(round(age.mean()))


In [None]:
# Distinct ages after cleaning, in ascending order.
cleaned_ages = users.age.unique()
print(sorted(cleaned_ages))

In [None]:
# Fill any remaining missing ages with the rounded mean age, then list the distinct ages again.
# NOTE(review): the age column was already NaN-filled by the preceding cleaning cell,
# so this fillna is expected to find nothing left to fill.
rounded_mean_age = round(users.age.mean())
users.age = users.age.fillna(rounded_mean_age)
print(sorted(users.age.unique()))

In [None]:
# First rows of the ratings table (userId, ISBN, bookRating).
rating.head()

In [None]:
# Number of rating rows, followed by a per-row flag telling whether the rated
# ISBN is present in the books table.
print(rating.ISBN.shape)
isbn_in_books = rating.ISBN.isin(books.ISBN)
print(isbn_in_books)

In [None]:
# The output above shows that many ISBNs in the ratings table do not appear in books.csv.
# We need to drop those ratings, and likewise any rating whose user is missing from users.csv.

In [None]:
# Keep only ratings that refer to a known book AND a known user.
book_is_known = rating.ISBN.isin(books.ISBN)
user_is_known = rating.userId.isin(users.userId)
updRating = rating[book_is_known & user_is_known]

In [None]:
# Bar chart of how often each rating value (0 and above) occurs.
non_negative_ratings = updRating[updRating.bookRating >= 0]
sns.countplot(x='bookRating', data=non_negative_ratings)
plt.show()

In [None]:
# Rating 0 occurs more than 600000 times, which swamps every other value and
# prevents a sound judgement. Split the data into ratings equal to 0 and
# ratings above 0, then plot only the latter.
is_zero = updRating.bookRating == 0
is_positive = updRating.bookRating > 0

ratingCorr = updRating[is_positive]
ratingErr = updRating[is_zero]

sns.countplot(x='bookRating', data=ratingCorr)
plt.show()

In [None]:
# Now we come to the recommendation part, where we focus on an algorithm to suggest books to a user.


In [None]:
# There are several kinds of recommendation systems to choose from:
# 1.Popularity
# 2.Collaborative Filtering
# 3.User CF
# 4.Item CF
# We will be working with Collaborative Filtering
# Considering recommendations from users who have read over 99 books

In [None]:
# Step 1: keep only users who have given at least 100 ratings.
countExp = ratingCorr['userId'].value_counts()
experienced_users = countExp[countExp >= 100].index
ratingCorr = ratingCorr[ratingCorr['userId'].isin(experienced_users)]

# Step 2: keep only rows whose rating *value* occurs at least 100 times.
# NOTE(review): this counts occurrences of each rating value, not of each book.
# If the intent was "books with at least 100 ratings", the counts should be taken
# over 'ISBN' instead - confirm before changing.
counts = ratingCorr['bookRating'].value_counts()
frequent_values = counts[counts >= 100].index
ratingCorr = ratingCorr[ratingCorr['bookRating'].isin(frequent_values)]

# Step 3: pivot to a user x book matrix; pairs without a rating become NaN.
rating_matrix = ratingCorr.pivot(index='userId', columns='ISBN', values='bookRating')
userId = rating_matrix.index
ISBN = rating_matrix.columns

print(rating_matrix.shape)
print(rating_matrix)

In [None]:
# Most cells of the printed matrix show NaN, even though only ratings > 0 were kept;
# presumably a user/book pair without a rating simply has no value after the pivot.
# Some cells do hold non-zero ratings, as in the screenshot finalResult.JPG shown below
# (loaded by relative path, so the file must sit in the notebook's working directory).
from IPython.display import Image
Image(filename='finalResult.JPG')