In [155]:
import numpy as np 
import pandas as pd

In [156]:
original_goodread_df = pd.read_csv('books.csv', error_bad_lines = False)
original_goodread_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/06,Scholastic Inc.,
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/04,Scholastic Inc.,
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/03,Scholastic,
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/04,Scholastic Inc.,
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/04,Scholastic,


In [157]:
#make a copy of the df so preserve original data
goodread_df = original_goodread_df.copy()

In [158]:
#creating column name list
col_names = list(goodread_df.columns)
print(col_names)

['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', '  num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12']


In [159]:
#updating column name to have no spaces
goodread_df = goodread_df.rename(columns={"  num_pages": "num_pages"})

In [160]:
#updating column name list
col_names = list(goodread_df.columns)
print(col_names)

['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12']


In [161]:
#check authors
authors_sum = goodread_df.authors.value_counts()
print(authors_sum)

P.G. Wodehouse                                  40
Stephen King                                    40
Rumiko Takahashi                                39
Orson Scott Card                                35
Agatha Christie                                 33
                                                ..
Marcel Proust/Sylvia Townsend Warner             1
Jess Walter                                      1
David  Allen/Tina Blythe/Gene Thompson-Grove     1
Roger Lancelyn Green/Alan Langford               1
Sun Tzu/Mark McNeilly                            1
Name: authors, Length: 6643, dtype: int64


In [162]:
#check publishers
publishers_sum = goodread_df.publisher.value_counts()
print(publishers_sum)

Vintage               318
Penguin Books         261
Penguin Classics      184
Mariner Books         150
Ballantine Books      144
                     ... 
Orion Paperbacks        1
Bloomsbury (NYC)        1
Praeger Publishers      1
Henry Holt and Co.      1
Egmont Books Ltd        1
Name: publisher, Length: 2294, dtype: int64


In [163]:
#check titles
titles_sum = goodread_df.title.value_counts()
print(titles_sum)

The Iliad                                                                      9
The Brothers Karamazov                                                         9
'Salem's Lot                                                                   8
Anna Karenina                                                                  8
Gulliver's Travels                                                             8
                                                                              ..
A Time to Embrace: Same-Gender Relationships in Religion  Law  and Politics    1
A New Hope: The Illustrated Screenplay (Star Wars  Episode IV)                 1
The Ruby Ring                                                                  1
Please Stop Laughing at Me... One Woman's Inspirational Story                  1
The Last Dance: Encountering Death and Dying                                   1
Name: title, Length: 10352, dtype: int64


In [164]:
#check ISBNs
isbn_sum = goodread_df.isbn.value_counts()
print(isbn_sum)

3.58          2
756403413     1
375701893     1
034073356X    1
671020315     1
             ..
425187713     1
042519938X    1
99462109      1
345442121     1
452284449     1
Name: isbn, Length: 11126, dtype: int64


In [165]:
#check unnamed column
extra_col_sum = goodread_df['Unnamed: 12'].value_counts()
print(extra_col_sum)

Brown Son & Ferguson Ltd.      1
Cold Spring Press              1
Harvard University Press       1
Huntington House Publishers    1
Name: Unnamed: 12, dtype: int64


In [166]:
# Update goodread_df rows that are out of alignment

In [167]:
#create slice of DF to check out the alignment of certain rows
outofalignment_df = goodread_df[(goodread_df.num_pages == 'eng') | (goodread_df.num_pages == 'en-US')]

outofalignment_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner,Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net,one of the founding members of this Tolkien w...,3.58,1593600119,9781590000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley,Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown,Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [168]:
#create a new df with rows out of alignment
df_misshapen = goodread_df.loc[[3348, 4702, 5877, 8979]]
df_misshapen.head()

#add the separated author columns together 
df_misshapen['authors_full'] = df_misshapen['authors'] + df_misshapen['average_rating']

#overwrite poorly formatted author column with correct author column 
df_misshapen['authors'] = df_misshapen['authors_full']

#drop unneeded columns
df_misshapen = df_misshapen.drop(['average_rating','authors_full'], axis=1)

df_misshapen.head()

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9781590000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [169]:
loop_count = 0

def update_col_names(df, col_list, loop_count):
    for col in df.columns:
        df = df.rename(columns={col: col_list[loop_count]})
        loop_count += 1
    return df

df_reshaped = update_col_names(df_misshapen, col_names, loop_count)

df_reshaped.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9781590000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [170]:
#overwrite poorly formatted rows with correct formatted rows 
goodread_df.iloc[3348] = df_reshaped.iloc[0]
goodread_df.iloc[4702] = df_reshaped.iloc[1]
goodread_df.iloc[5877] = df_reshaped.iloc[2]
goodread_df.iloc[8979] = df_reshaped.iloc[3]

In [171]:
#check unnamed column
extra_col_sum = goodread_df["Unnamed: 12"].value_counts()
print(extra_col_sum)

Series([], Name: Unnamed: 12, dtype: int64)


In [172]:
#drop empty column
goodread_df = goodread_df.drop(['Unnamed: 12'], axis=1)
col_names[2] = 'author'
col_names = col_names[:-1]
col_names

['bookID',
 'title',
 'author',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

In [173]:
#create dataframe with only books in English

In [174]:
#check language codes
language_sum = goodread_df.language_code.value_counts()
print(language_sum)

eng      8911
en-US    1409
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
por        10
en-CA       7
ita         5
enm         3
lat         3
swe         2
rus         2
ale         1
glg         1
gla         1
srp         1
nl          1
wel         1
nor         1
tur         1
msa         1
ara         1
Name: language_code, dtype: int64


In [175]:
#change en-US and en-GB to eng
goodread_df = goodread_df.replace(['en-US', 'en-GB', 'en-CA', 'enm'], 'eng')

#check language codes
language_sum = goodread_df.language_code.value_counts()
print(language_sum)

#create english book df slice
non_eng_goodread_slice = goodread_df[(goodread_df.language_code != 'eng')]

#create a list of indexes for english books
non_eng_book_index = list(non_eng_goodread_slice.index.values)

#drop all non-eng rows
eng_goodread_df = goodread_df.drop(non_eng_book_index, axis=0)

eng    10544
spa      218
fre      144
ger       99
jpn       46
mul       19
zho       14
grc       11
por       10
ita        5
lat        3
swe        2
rus        2
nor        1
ara        1
glg        1
ale        1
gla        1
srp        1
nl         1
wel        1
tur        1
msa        1
Name: language_code, dtype: int64


In [176]:
eng_goodread_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/04,Scholastic


In [177]:
#split out authors into primary author column and other authors column
author_split = eng_goodread_df.authors.str.split("/", n=1, expand=True) 

eng_goodread_df['author'] = author_split[0]

eng_goodread_df = eng_goodread_df.drop('authors', axis=1)
eng_goodread_df.head()

Unnamed: 0,bookID,title,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,author
0,1,Harry Potter and the Half-Blood Prince (Harry ...,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/06,Scholastic Inc.,J.K. Rowling
1,2,Harry Potter and the Order of the Phoenix (Har...,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/04,Scholastic Inc.,J.K. Rowling
2,4,Harry Potter and the Chamber of Secrets (Harry...,4.42,439554896,9780439554893,eng,352,6333,244,11/1/03,Scholastic,J.K. Rowling
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/04,Scholastic Inc.,J.K. Rowling
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/04,Scholastic,J.K. Rowling


In [178]:
#reorder goodread_df columns
eng_goodread_df = eng_goodread_df[col_names]
eng_goodread_df.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/04,Scholastic


In [179]:
#check authors
authors_sum = eng_goodread_df.author.value_counts()
print(authors_sum)

William Shakespeare    88
Stephen King           64
J.R.R. Tolkien         47
P.G. Wodehouse         46
Agatha Christie        45
                       ..
Robert Brent Toplin     1
Ian W. Toll             1
Luke Welling            1
Elizabeth Moon          1
Judith A. Lansdowne     1
Name: author, Length: 4169, dtype: int64


In [180]:
#check titles
titles_sum = eng_goodread_df.title.value_counts()
print(titles_sum)

The Iliad                                                                                                                             9
The Brothers Karamazov                                                                                                                9
'Salem's Lot                                                                                                                          8
Anna Karenina                                                                                                                         8
The Odyssey                                                                                                                           8
                                                                                                                                     ..
The Last Lion: Winston Spencer Churchill: Visions of Glory 1874-1932                                                                  1
Deep River                                      

In [181]:
karamazov_df = eng_goodread_df[(eng_goodread_df.title == 'The Brothers Karamazov')]

In [182]:
karamazov_df.head(9)

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
1406,4933,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,451527348,9780451527349,eng,736,983,91,6/1/99,Signet Classics
1407,4934,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,374528373,9780374528379,eng,796,191531,6795,6/14/02,Farrar Straus and Giroux
1408,4935,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,140449248,9780140449242,eng,1013,1673,184,2/27/03,Penguin Books Ltd
1409,4936,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,679729259,9780679729259,eng,796,617,80,9/3/91,Vintage Books USA
1410,4938,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,1596440791,9781596440791,eng,16,20,2,5/1/05,Hovel Audio
1411,4940,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,159308045X,9781593080457,eng,720,1089,202,7/25/04,Barnes Noble Classics
1634,5691,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,99922800,9780099922803,eng,796,443,55,1/16/92,Vintage Classics
1990,7135,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,553212168,9780553212167,eng,1072,1022,154,4/1/84,Bantam Classics
9356,37058,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,1596440783,9781596440784,eng,16,3,1,5/1/05,Hovel Audio


In [183]:
#deal with num_pages containing audio hours

In [184]:
#create Boolean series for publishers that contain the word Audio
audiobook_bool_series = eng_goodread_df['publisher'].str.contains('Audio')

#create audiobook df slice
audiobook_bool_slice = audiobook_bool_series[(audiobook_bool_series == True)]

#create a list of indexes for audiobooks
audiobook_bool_index = list(audiobook_bool_slice.index.values)

#create a new df with audiobook rows
df_audiobook = eng_goodread_df.loc[audiobook_bool_index]
df_audiobook.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
10,16,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,739322206,9780739322208,eng,6,1266,253,3/23/05,Random House Audio
40,68,The Known World,Edward P. Jones,3.83,006076273X,9780060762735,eng,14,55,12,6/15/04,HarperAudio
215,524,Lord of the Flies,William Golding,3.68,307281701,9780307281708,eng,6,408,96,10/11/05,Listening Library (Audio)
370,1281,Men Are from Mars Women Are from Venus,John Gray,3.55,006123205X,9780061232053,eng,2,43,3,4/3/07,HarperAudio
673,2166,The Old Man and the Sea,Ernest Hemingway,3.77,743564367,9780743564366,eng,3,393,77,5/1/06,Simon Schuster Audio


In [185]:
audiobook_bool_series.head()

0    False
1    False
2    False
3    False
4    False
Name: publisher, dtype: bool

In [186]:
#reorder df_audiobooks columns
col_names.append('audio_hrs')

col_names = [col_names[0], col_names[1], col_names[2], col_names[3], col_names[4], col_names[5], col_names[6], col_names[7], col_names[12], col_names[8], col_names[9], col_names[10], col_names[11]]
col_names

['bookID',
 'title',
 'author',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'audio_hrs',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

In [187]:
#rename df_audiobook columns and add in a new num_pages column
df_audiobook = df_audiobook.rename(columns={'num_pages':'audio_hrs'})
df_audiobook['num_pages'] = 0

df_audiobook = df_audiobook[col_names]
df_audiobook.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
10,16,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,739322206,9780739322208,eng,0,6,1266,253,3/23/05,Random House Audio
40,68,The Known World,Edward P. Jones,3.83,006076273X,9780060762735,eng,0,14,55,12,6/15/04,HarperAudio
215,524,Lord of the Flies,William Golding,3.68,307281701,9780307281708,eng,0,6,408,96,10/11/05,Listening Library (Audio)
370,1281,Men Are from Mars Women Are from Venus,John Gray,3.55,006123205X,9780061232053,eng,0,2,43,3,4/3/07,HarperAudio
673,2166,The Old Man and the Sea,Ernest Hemingway,3.77,743564367,9780743564366,eng,0,3,393,77,5/1/06,Simon Schuster Audio


In [188]:
#drop rows for data in df_audiobook
eng_goodread_text_audio_df = eng_goodread_df.drop(audiobook_bool_index, axis=0)

In [189]:
#append updated df_audiobook to goodread_audiobook_df and reorganize columns
eng_goodread_text_audio_df = eng_goodread_text_audio_df.append(df_audiobook)
eng_goodread_text_audio_df = eng_goodread_text_audio_df[col_names]
eng_goodread_text_audio_df['audio_hrs'] = eng_goodread_text_audio_df['audio_hrs'].fillna(0)
eng_goodread_text_audio_df.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,439785960,9780439785969,eng,652,0,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,439358078,9780439358071,eng,870,0,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,0,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.56,043965548X,9780439655484,eng,435,0,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling,4.78,439682584,9780439682589,eng,2690,0,41428,164,9/13/04,Scholastic


In [190]:
#create copy of df to fix datatypes
corrected_goodread_df = eng_goodread_text_audio_df.copy()
corrected_goodread_df.head()

corrected_goodread_df.dtypes

#update average_rating, num_pages, text_reviews_count to int64
corrected_goodread_df['average_rating'] = corrected_goodread_df['average_rating'].astype(float)
corrected_goodread_df['num_pages'] = corrected_goodread_df['num_pages'].astype(int)
corrected_goodread_df['text_reviews_count'] = corrected_goodread_df['text_reviews_count'].astype(int)

corrected_goodread_df['publication_date'].sort_values()

#update publication_date to datetime
corrected_goodread_df['publication_date'] = pd.to_datetime(corrected_goodread_df['publication_date'], errors='coerce')
  
corrected_goodread_df.dtypes

bookID                         int64
title                         object
author                        object
average_rating               float64
isbn                          object
isbn13                        object
language_code                 object
num_pages                      int64
audio_hrs                     object
ratings_count                  int64
text_reviews_count             int64
publication_date      datetime64[ns]
publisher                     object
dtype: object

In [191]:
#reset index
corrected_goodread_df = corrected_goodread_df.reset_index(drop=True)

In [192]:
#check publication column
pub_date_slice = corrected_goodread_df[corrected_goodread_df.publication_date.isnull()]
pub_date_slice

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
7600,31373,In Pursuit of the Proper Sinner (Inspector Lyn...,Elizabeth George,4.1,553575104,9780553575101,eng,718,0,10608,295,NaT,Bantam Books


In [193]:
corrected_goodread_df.iloc[7600]

bookID                                                            31373
title                 In Pursuit of the Proper Sinner (Inspector Lyn...
author                                                Elizabeth  George
average_rating                                                      4.1
isbn                                                          553575104
isbn13                                                    9780553575101
language_code                                                       eng
num_pages                                                           718
audio_hrs                                                             0
ratings_count                                                     10608
text_reviews_count                                                  295
publication_date                                                    NaT
publisher                                                  Bantam Books
Name: 7600, dtype: object

In [194]:
corrected_goodread_df['publication_date'][7600]

NaT

In [195]:
#replace incorrect date
date_correction = pd.to_datetime('2000-10-31')

corrected_goodread_df['publication_date'] = corrected_goodread_df['publication_date'].fillna(date_correction)

In [196]:
corrected_goodread_df['publication_date'][7600]

Timestamp('2000-10-31 00:00:00')

In [197]:
#check for missing values in the num_pages and audio_hrs

In [198]:
#create slice of DF to check out books with 0 num_pages and 0 audio_hrs
zero_bookpages_df = corrected_goodread_df[(corrected_goodread_df.num_pages == 0) & (corrected_goodread_df.audio_hrs == 0)]
zero_bookpages_df

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
291,955,The 5 Love Languages / The 5 Love Languages Jo...,Gary Chapman,4.7,802415318,9780802415318,eng,0,0,22,4,2005-01-01,Moody Publishers
801,2835,The Tragedy of Pudd'nhead Wilson,Mark Twain,3.79,140015068X,9781400150687,eng,0,0,3,0,2003-01-01,Tantor Media
1984,7598,A Study Guide to Gabriel Garcia Marquez' One H...,Gabriel García Márquez,4.11,1570421129,9781570421129,eng,0,0,75,2,2006-04-01,Warner Adult
2027,7737,Gulliver's Travels,Jonathan Swift,3.57,1400102723,9781400102723,eng,0,0,4,1,2006-10-01,Tantor Media
2300,8889,Return to the Planet of the Apes #2: Escape fr...,William Arrow,3.0,345251679,9780345251671,eng,0,0,10,2,1976-04-12,Ballantine Books
2308,8916,The Complete Science Fiction Treasury of H.G. ...,H.G. Wells,4.14,517052253,9780517052259,eng,0,0,45,1,1987-06-24,Random House Value Publishing
2527,9832,Blind Willow Sleeping Woman: 24 Stories,Haruki Murakami,3.84,1400102952,9781400102952,eng,0,0,28,3,2006-10-15,Tantor Media
2605,10215,The Far Pavilions,M.M. Kaye,4.21,517333414,9780517333419,eng,0,0,19,4,1988-12-12,Random House Value Publishing
3396,13106,The Celestine Prophecy,James Redfield,3.64,1594831955,9781594831959,eng,0,0,64,10,2006-02-16,Grand Central Publishing
3611,13842,The Botany of Desire: A Plant's-Eye View of th...,Michael Pollan,4.06,1596590939,9781596590939,eng,0,0,117,36,2007-05-21,Your Coach Digital


In [199]:
tantor_media_df = corrected_goodread_df[(corrected_goodread_df.publisher == 'Tantor Media')]
tantor_media_df

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
801,2835,The Tragedy of Pudd'nhead Wilson,Mark Twain,3.79,140015068X,9781400150687,eng,0,0,3,0,2003-01-01,Tantor Media
2027,7737,Gulliver's Travels,Jonathan Swift,3.57,1400102723,9781400102723,eng,0,0,4,1,2006-10-01,Tantor Media
2527,9832,Blind Willow Sleeping Woman: 24 Stories,Haruki Murakami,3.84,1400102952,9781400102952,eng,0,0,28,3,2006-10-15,Tantor Media
4926,19081,Madame Bovary,Gustave Flaubert,3.67,1400132746,9781400132744,eng,0,0,3,0,2006-10-01,Tantor Media
9353,40378,The Chessmen of Mars (Barsoom #5),Edgar Rice Burroughs,3.83,1400130212,9781400130214,eng,0,0,5147,157,2005-01-01,Tantor Media
9354,40379,The Warlord of Mars (Barsoom #3),Edgar Rice Burroughs,3.86,1400130220,9781400130221,eng,6,0,9350,345,2001-02-01,Tantor Media


"""#data collection to update the df
index | ISBN | num_pages | audio_hrs
10194 | 9780449210840 | 318 | 0
9513 | 9780517164358 | 241 | 0
9353 | 9781400130214 | 0 | 9.25
8892 | 9780449239698 | 221 | 0
8491 | 9780373612116 | 192 | 0
8366 | 9780517199961 | 398 | 0
8330 | 9780804104555 | 341 | 0
7055 | 9781401910440 |  0 | 3
6609 | 9780449231418 | 288 | 0
5601 | 9780613997638 | 241 | 0
5419 | 9781594839412 | 0 | 13
5418 | 9781594832284 | 0 | 15.25
5373 | 9780618497423 | 0 | 432
4926 | 9781400132744 | 0 | 11.5
4486 | 9780738209272 | 0 | 12
4485 | 9780738209258 | 0 | 12
4214 | 9781572703858 | 0 | 5.25
3611 | 9781596590939 | 271 | 0
3396 | 9781594831959 | 0 | 8.5
2605 | 9780517333419 | 1000 | 0
2527 | 9781400102952 | 0 | 12.75
2308 | 9780517052259 | 1015 | 0
2300 | 9780345251671 | 192 | 0
1984 | 9781400102723 | 0 | 11
drop --> 9781570421129, it's an audiobook but I'm unable to find run time AND I'm more interested in stories not study guides.
801 | 9781400150687 | 0 | 6.5
drop --> 9780802415318, I'm unable to find page number for this edition AND I'm more interested in stories not study guides."""

#create a list of isbn13s above and a list values to replace function where if i in isbn13, replace 0 with number


In [200]:
corrected_goodread_df.audio_hrs.iloc[801] = 6.5
corrected_goodread_df.audio_hrs.iloc[2027] = 11
corrected_goodread_df.num_pages.iloc[2300] = 192
corrected_goodread_df.num_pages.iloc[2308] = 1015
corrected_goodread_df.num_pages.iloc[10194] = 318
corrected_goodread_df.num_pages.iloc[9513] = 241
corrected_goodread_df.audio_hrs.iloc[9353] = 9.25
corrected_goodread_df.num_pages.iloc[8892] = 221
corrected_goodread_df.num_pages.iloc[8491] = 92
corrected_goodread_df.num_pages.iloc[8366] = 398
corrected_goodread_df.num_pages.iloc[8330] = 341
corrected_goodread_df.audio_hrs.iloc[7055] = 3
corrected_goodread_df.num_pages.iloc[6609] = 288
corrected_goodread_df.num_pages.iloc[5601] = 241
corrected_goodread_df.audio_hrs.iloc[5419] = 13
corrected_goodread_df.audio_hrs.iloc[5418] = 15.25
corrected_goodread_df.audio_hrs.iloc[5373] = 432
corrected_goodread_df.audio_hrs.iloc[4926] = 11.5
corrected_goodread_df.audio_hrs.iloc[4486] = 12
corrected_goodread_df.audio_hrs.iloc[4485] = 12
corrected_goodread_df.audio_hrs.iloc[4214] = 5.25
corrected_goodread_df.num_pages.iloc[3611] = 271
corrected_goodread_df.audio_hrs.iloc[3396] = 8.5
corrected_goodread_df.num_pages.iloc[2605] = 1000
corrected_goodread_df.audio_hrs.iloc[2527] = 12.75
corrected_goodread_df.audio_hrs.iloc[9354] = 6
corrected_goodread_df.num_pages.iloc[9354] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [201]:
corrected_goodread_df.iloc[9354]

bookID                                            40379
title                 The Warlord of Mars (Barsoom  #3)
author                             Edgar Rice Burroughs
average_rating                                     3.86
isbn                                         1400130220
isbn13                                    9781400130221
language_code                                       eng
num_pages                                             0
audio_hrs                                             6
ratings_count                                      9350
text_reviews_count                                  345
publication_date                    2001-02-01 00:00:00
publisher                                  Tantor Media
Name: 9354, dtype: object

In [202]:
print(len(corrected_goodread_df))

10544


In [203]:
#drop zero book length rows
corrected_goodread_df = corrected_goodread_df.drop([1984, 291], axis=0)

#reset index
corrected_goodread_df = corrected_goodread_df.reset_index(drop=True)

In [204]:
print(len(corrected_goodread_df))

10542


In [151]:
#add columns for
#series
#genre category (fiction/non-fiction)
#genre (memoir, play, science fiction, etc.)
#?

In [152]:
#re-sort by number of ratings, check head and tail
#re-sort by number of reviews, check head and tail
#re-sort by ratings, check head and tail

#number counts for:
#authors
#average_ratings
#publisher
#text_reviews_counts
#ratings_count