In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the two input tables from the working directory: the ratings table
# (later cells read its user_id, book_id and rating columns) and the book
# metadata table.
ratings, finalbooks = (
    pd.read_csv(csv_path) for csv_path in ('ratings.csv', 'finalbooks.csv')
)

In [None]:
# Count the (non-null) ratings each book received and collect the ids of the
# books with at least 7500 ratings.
out = ratings.groupby(by='book_id', as_index=False).agg({'rating': 'count'})
outid = out.loc[out['rating'] >= 7500, 'book_id'].tolist()

# "Test" subsets: every book / rating that does NOT belong to one of those
# heavily rated books. `.copy()` makes them independent frames, so columns can
# be added to them later without pandas raising SettingWithCopyWarning.
testbooks = finalbooks[~finalbooks['book_id'].isin(outid)].copy()
testratings = ratings[~ratings['book_id'].isin(outid)].copy()

In [None]:
# Replace missing genre strings with the placeholder 'Unknown', then derive
# `tag_cloud` by turning the '|' separators of `genres` into '-'.
finalbooks = finalbooks.assign(
    genres=lambda books: books['genres'].fillna('Unknown'),
    tag_cloud=lambda books: books['genres'].str.replace('|', '-', regex=False),
)

In [None]:
# Rating count per user, keeping only the 15000 users with the most ratings.
useronly = (
    ratings
    .groupby('user_id', as_index=False)['rating']
    .count()
    .sort_values('rating', ascending=False)
    .head(15000)
)

In [None]:
# Keep only the ratings made by the selected most-active users.
finalratings = ratings[ratings['user_id'].isin(useronly['user_id'])]

# Among those ratings, find the 8000 books with the most ratings.
bookonly = (
    finalratings
    .groupby(by='book_id', as_index=False)
    .agg({'rating': 'count'})
    .sort_values('rating', ascending=False)
    .head(8000)
)

# Restrict the ratings to those books. The mask is built from `finalratings`
# itself: a mask built from the larger `ratings` frame has a different index,
# forcing pandas to reindex it and emit a "Boolean Series key will be
# reindexed" warning.
finalratings = finalratings[finalratings['book_id'].isin(bookonly['book_id'])]

In [None]:
# Keep only the selected books, renumber the rows from zero and expose a
# contiguous 1-based identifier in `newbookid`.
finalbooks = (
    finalbooks
    .loc[lambda books: books['book_id'].isin(bookonly['book_id'])]
    .reset_index(drop=True)
    .assign(newbookid=lambda books: books.index + 1)
)
finalbooks

In [None]:
# Attach each rating's `newbookid` via a left join on `book_id`; ratings whose
# book has no match in `finalbooks` come back with NaN and are discarded, after
# which the id column can be stored as integers again.
finalratings = (
    finalratings
    .merge(finalbooks[['book_id', 'newbookid']], on='book_id', how='left')
    .dropna(subset=['newbookid'])
    .astype({'newbookid': 'int64'})
)
finalratings

In [None]:
# Give every distinct `user_id` a contiguous 1-based id. `ngroup()` numbers the
# groups 0..n-1 in sorted key order, which is the same numbering the internal
# `grouper.group_info[0]` codes provided, but through the public pandas API.
finalratings['newuser_id'] = finalratings.groupby('user_id').ngroup() + 1
finalratings

In [None]:
# The original identifiers are superseded by `newuser_id` / `newbookid`, so
# remove them (plus `goodreads_book_id` on the book table).
# NOTE: not idempotent; re-running this cell raises KeyError because the
# columns are already gone.
finalratings = finalratings.drop(columns=['user_id', 'book_id'])
finalbooks = finalbooks.drop(columns=['book_id', 'goodreads_book_id'])

In [None]:
# Binary flag: 1 when the rating is 4 or higher, 0 otherwise.
finalratings['good'] = finalratings['rating'].ge(4).astype('int64')

In [None]:
# Display the prepared ratings table (pandas truncates long frames on display).
finalratings

In [None]:
# Histogram (20 bins) of the mean rating each book received.
agg = finalratings.groupby('newbookid', as_index=False)['rating'].mean()
n, bins, patches = plt.hist(
    agg['rating'],
    bins=20,
    facecolor='blue',
    alpha=0.5,
)
plt.title('Distribution of Average Ratings per Book in Dataset')
plt.show()

In [None]:
# Histogram of every individual rating value, split into 5 bins.
n, bins, patches = plt.hist(
    finalratings['rating'],
    bins=5,
    facecolor='blue',
    alpha=0.5,
)
plt.title('Distribution of Ratings from 1 to 5 by users')
plt.show()

In [None]:
# Bar chart of how many ratings were given for each rating value.
# Counting `newuser_id` per rating value yields the number of ratings (rows)
# with that value, so the y axis is labelled as a count instead of with the
# column name.
distr = finalratings.groupby(by='rating', as_index=False).agg({'newuser_id': 'count'})
sns.barplot(x='rating', y='newuser_id', data=distr)
plt.ylabel('number of ratings')
# The chart covers all ratings, not only the "good" ones, so the title says so.
plt.title('Distribution of ratings from 1 to 5 by users')
plt.show()

In [None]:
# Number of ratings recorded for each rating value (this counts rating rows,
# not distinct users); value_counts lists the most frequent value first.
finalratings['rating'].value_counts()

In [None]:
# Histogram (15 bins) of how many ratings each user contributed.
n, bins, patches = plt.hist(
    finalratings['newuser_id'].value_counts(),
    bins=15,
    facecolor='blue',
    alpha=0.5,
)
plt.title('Distribution of no. of reviews per user')
plt.show()

In [None]:
# Distribution of the no. of reviews per book, computed on `testratings`
# (the ratings subset that excludes the most heavily rated books).
facet, axes = plt.subplots(1, 1, figsize=(20, 10))
n, bins, patches = axes.hist(
    testratings['book_id'].value_counts(),
    bins=1000,
    facecolor='blue',
    alpha=0.5,
)
# The counts are grouped by book_id, so the title must say "per book".
axes.set_title('Distribution of no. of reviews per book')
plt.show()

In [None]:
# Print column names, non-null counts and dtypes of the book table.
finalbooks.info()

In [None]:
# Derive genre helper columns from the '|'-separated `genres` string:
#   firstgenre - the first listed genre
#   mosttagged - that first genre in lower case
#   tag_cloud  - all genres in lower case, joined with '-' instead of '|'
# Assumes `genres` holds no missing values at this point (they are filled with
# 'Unknown' in an earlier cell).
finalbooks = finalbooks.assign(
    firstgenre=lambda books: books['genres'].str.split('|').str[0],
    mosttagged=lambda books: books['firstgenre'].str.lower(),
    tag_cloud=lambda books: books['genres'].str.lower().str.replace('|', '-', regex=False),
)

In [None]:
# Build the genre helper columns on the test book subset.
# `assign` returns a new frame, so nothing is written into a slice of
# `finalbooks` (which is what triggers SettingWithCopyWarning when columns are
# set item-by-item on `testbooks`). Later keyword arguments can read the
# columns created by earlier ones.
#   genres     - missing values replaced by 'Unknown'
#   firstgenre - the first entry of the '|'-separated genre string
#   mosttagged - that first genre in lower case
#   tag_cloud  - all genres in lower case, joined with '-' instead of '|'
testbooks = testbooks.assign(
    genres=lambda books: books['genres'].fillna('Unknown'),
    firstgenre=lambda books: books['genres'].str.split('|').str[0],
    mosttagged=lambda books: books['firstgenre'].str.lower(),
    tag_cloud=lambda books: books['genres'].str.lower().str.replace('|', '-', regex=False),
)

In [None]:
# Two stacked bar charts per first-listed genre of the test books:
# top = `average_rating`, bottom = `ratings_count` (seaborn's barplot aggregates
# the rows of each genre with its default estimator).
facets, axes = plt.subplots(2, 1, figsize=(40, 20))
sns.despine(left=True)
sns.barplot(x='firstgenre', y='average_rating', data=testbooks, ax=axes[0])
sns.barplot(x='firstgenre', y='ratings_count', data=testbooks, ax=axes[1])
# Rotate the genre labels on BOTH panels. `plt.xticks` only affects the
# current axes (the last subplot created), so it would leave the top panel's
# labels horizontal and overlapping.
for ax in axes:
    ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Strip parenthesised text, e.g. "(Series, #1)", from both title columns.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave every title
# unchanged. The pattern is greedy, so everything from the first "(" to the
# last ")" is removed.
for title_col in ('title', 'original_title'):
    finalbooks[title_col] = finalbooks[title_col].str.replace(r"\(.*\)", "", regex=True)

In [None]:
# Replace missing page counts with the string '0'.
# NOTE(review): the fill value is a string; if `book_pages` is numeric this
# produces a mixed-type column. Confirm the intended dtype.
finalbooks = finalbooks.assign(
    book_pages=lambda books: books['book_pages'].fillna('0')
)


In [None]:
# Re-inspect column dtypes and non-null counts after the cleaning steps.
finalbooks.info()

In [None]:
# Preview the first 20 rows of the cleaned book table.
finalbooks.head(20)

In [None]:
# Write the cleaned book table to CSV without the index column.
# NOTE(review): the output name 'finalbook.csv' differs from the input file
# 'finalbooks.csv' read at the top of the notebook; confirm this is intended.
finalbooks.to_csv('finalbook.csv', index= False)

In [None]:
# Write the prepared ratings table to CSV without the index column.
finalratings.to_csv('finalratings.csv', index=False)