In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned bestseller list. The CSV carries an 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call), so drop it by name.
# `columns=` is used instead of a positional axis argument, which pandas 2.0
# no longer accepts.
df = pd.read_csv('cleaned_books.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,publisher,author,isbn,date,title,weeks_on_list,no_of_words_title
0,Riverhead,Paula Hawkins,1594634025,2017-02-19,THE GIRL ON THE TRAIN,102,5
1,Scribner,Anthony Doerr,1501173219,2017-05-07,ALL THE LIGHT WE CANNOT SEE,81,6
2,Vintage,E L James,525431888,2017-03-05,FIFTY SHADES DARKER,66,3
3,St. Martin's,Kristin Hannah,1466850604,2017-10-29,THE NIGHTINGALE,63,2
4,Penguin Group,Kathryn Stockett,1440697663,2012-04-08,THE HELP,58,2


In [3]:
# Summarise the column dtypes and non-null counts of the books dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2248 entries, 0 to 2247
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   publisher          2248 non-null   object
 1   author             2248 non-null   object
 2   isbn               2248 non-null   object
 3   date               2248 non-null   object
 4   title              2248 non-null   object
 5   weeks_on_list      2248 non-null   int64 
 6   no_of_words_title  2248 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 123.1+ KB


In [4]:
# Load the per-ISBN review statistics. As with the books file, the CSV carries
# an 'Unnamed: 0' column that is dropped by name.
# `columns=` is used instead of a positional axis argument, which pandas 2.0
# no longer accepts.
review = pd.read_csv('review_stats.csv').drop(columns='Unnamed: 0')
review.head()

Unnamed: 0,isbn,ratings_count,reviews_count,text_reviews_count,work_ratings_count,work_reviews_count,work_text_reviews_count,average_rating
0,1594634025,4832,8435,417,2082071,3313269,109917,3.92
1,1501173219,4375,10744,565,1005586,2142471,75056,4.33
2,525431888,71,155,7,737540,1064380,29077,3.84
3,1466850604,631,1480,150,680040,1319418,63407,4.58
4,1440697663,1491,1922,330,2141308,3031266,84485,4.47


In [5]:
# Summarise the column dtypes and non-null counts of the review statistics
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281 entries, 0 to 1280
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   isbn                     1281 non-null   object 
 1   ratings_count            1281 non-null   int64  
 2   reviews_count            1281 non-null   int64  
 3   text_reviews_count       1281 non-null   int64  
 4   work_ratings_count       1281 non-null   int64  
 5   work_reviews_count       1281 non-null   int64  
 6   work_text_reviews_count  1281 non-null   int64  
 7   average_rating           1281 non-null   float64
dtypes: float64(1), int64(6), object(1)
memory usage: 80.2+ KB


# Merging two dataframes

Merging the two dataframes `df` and `review` via an INNER JOIN on `isbn`

In [6]:
# Inner join on 'isbn': keep only the books whose ISBN is present in both frames
data = pd.merge(df, review, how='inner', on='isbn')
data.head()

Unnamed: 0,publisher,author,isbn,date,title,weeks_on_list,no_of_words_title,ratings_count,reviews_count,text_reviews_count,work_ratings_count,work_reviews_count,work_text_reviews_count,average_rating
0,Riverhead,Paula Hawkins,1594634025,2017-02-19,THE GIRL ON THE TRAIN,102,5,4832,8435,417,2082071,3313269,109917,3.92
1,Scribner,Anthony Doerr,1501173219,2017-05-07,ALL THE LIGHT WE CANNOT SEE,81,6,4375,10744,565,1005586,2142471,75056,4.33
2,Vintage,E L James,525431888,2017-03-05,FIFTY SHADES DARKER,66,3,71,155,7,737540,1064380,29077,3.84
3,St. Martin's,Kristin Hannah,1466850604,2017-10-29,THE NIGHTINGALE,63,2,631,1480,150,680040,1319418,63407,4.58
4,Penguin Group,Kathryn Stockett,1440697663,2012-04-08,THE HELP,58,2,1491,1922,330,2141308,3031266,84485,4.47


In [7]:
# Check the row count and dtypes of the merged dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1287 entries, 0 to 1286
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   publisher                1287 non-null   object 
 1   author                   1287 non-null   object 
 2   isbn                     1287 non-null   object 
 3   date                     1287 non-null   object 
 4   title                    1287 non-null   object 
 5   weeks_on_list            1287 non-null   int64  
 6   no_of_words_title        1287 non-null   int64  
 7   ratings_count            1287 non-null   int64  
 8   reviews_count            1287 non-null   int64  
 9   text_reviews_count       1287 non-null   int64  
 10  work_ratings_count       1287 non-null   int64  
 11  work_reviews_count       1287 non-null   int64  
 12  work_text_reviews_count  1287 non-null   int64  
 13  average_rating           1287 non-null   float64
dtypes: float64(1), int64(8), object(5)

##### Comment:
The merged dataframe has more rows (1287) than the `review` dataframe (1281). Since the join is on `isbn`, some ISBNs must appear more than once, so we need to find and remove these duplicates.

In [8]:
# Count the rows whose ISBN has already appeared earlier in the merged frame
data.duplicated(subset='isbn').sum()

6

In [9]:
# Show the repeated rows (every occurrence of an ISBN after its first)
data.loc[data['isbn'].duplicated()]

Unnamed: 0,publisher,author,isbn,date,title,weeks_on_list,no_of_words_title,ratings_count,reviews_count,text_reviews_count,work_ratings_count,work_reviews_count,work_text_reviews_count,average_rating
708,Penguin Group,Junot Diaz,1594487367,2012-09-30,THIS IS HOW YOU LOSE HER,1,6,81070,184021,6826,89177,197155,7548,3.75
712,Simon & Schuster,Philippa Gregory,1451626142,2012-09-02,THE KINGMAKER'S DAUGHTER,1,3,36,53,7,38213,86153,2849,3.97
715,Kensington Publishing,Lisa Jackson,758279590,2012-08-26,YOU DON'T WANT TO KNOW,1,5,13,24,5,7699,18878,908,3.79
735,HarperCollins Publishers,Christopher Moore,61779741,2012-04-22,SACRE BLEU,1,2,24173,49976,2949,30011,60835,3599,3.79
738,HarperCollins Publishers,Adriana Trigiani,62098063,2012-04-22,THE SHOEMAKER'S WIFE,1,3,74,131,28,75077,136629,8296,3.99
1176,Knopf Doubleday Publishing,Stieg Larsson,307593673,2012-05-20,THE GIRL WHO KICKED THE HORNET'S NEST,0,7,4544,5893,626,640541,961876,28953,4.22


In [10]:
# Keep the first row seen for each ISBN and discard the later repeats
data = data.drop_duplicates(subset='isbn', keep='first')

In [11]:
# Confirm the row count after removing the duplicate ISBNs
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1281 entries, 0 to 1286
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   publisher                1281 non-null   object 
 1   author                   1281 non-null   object 
 2   isbn                     1281 non-null   object 
 3   date                     1281 non-null   object 
 4   title                    1281 non-null   object 
 5   weeks_on_list            1281 non-null   int64  
 6   no_of_words_title        1281 non-null   int64  
 7   ratings_count            1281 non-null   int64  
 8   reviews_count            1281 non-null   int64  
 9   text_reviews_count       1281 non-null   int64  
 10  work_ratings_count       1281 non-null   int64  
 11  work_reviews_count       1281 non-null   int64  
 12  work_text_reviews_count  1281 non-null   int64  
 13  average_rating           1281 non-null   float64
dtypes: float64(1), int64(8), object(5)

# Save the cleaned merged dataframe into a new CSV file

In [12]:
# Write the de-duplicated merged frame to disk.
# to_csv also writes the index by default, so it will come back as an
# 'Unnamed: 0' column when this file is read with read_csv.
# NOTE(review): index=False would omit it, but first check whether downstream
# readers expect that column and drop it.
data.to_csv('merge_data.csv')