In [1]:
import numpy as np 
import pandas as pd

In [2]:
#read in data, skipping (with a warning) any malformed rows
try:
    original_goodread_df = pd.read_csv('books.csv', on_bad_lines='warn')
except TypeError:
    #pandas < 1.3 does not accept on_bad_lines; fall back to the legacy flag
    original_goodread_df = pd.read_csv('books.csv', error_bad_lines=False)

#work on a copy so the data as loaded stays available in original_goodread_df
goodread_df = original_goodread_df.copy()

### Set Up Data To Be Easily Accessed Later

In [3]:
#snapshot the current column labels as a plain list
col_names = goodread_df.columns.tolist()
print(col_names)

['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', '  num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12']


In [4]:
#strip stray leading/trailing whitespace from every column name ("  num_pages" -> "num_pages")
#a hard-coded key would silently do nothing if the amount of padding in the header differed
goodread_df = goodread_df.rename(columns=str.strip)

In [5]:
#refresh the column label list now that num_pages has been renamed
col_names = goodread_df.columns.tolist()
print(col_names)

['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12']


In [6]:
#count how many rows carry each distinct authors string
authors_sum = goodread_df['authors'].value_counts()
print(authors_sum)

P.G. Wodehouse                               40
Stephen King                                 40
Rumiko Takahashi                             39
Orson Scott Card                             35
Agatha Christie                              33
                                             ..
Robert Goddard                                1
Yasunari Kawabata/Edward G. Seidensticker     1
Jerome Kagan                                  1
Norman Sherry                                 1
D.H. Lawrence/Christa Jansohn                 1
Name: authors, Length: 6643, dtype: int64


In [7]:
#count how many rows carry each distinct publisher
publishers_sum = goodread_df['publisher'].value_counts()
print(publishers_sum)

Vintage                              318
Penguin Books                        261
Penguin Classics                     184
Mariner Books                        150
Ballantine Books                     144
                                    ... 
Peter Smith Publisher                  1
Doubleday & Company  Inc.              1
Titan Books Ltd                        1
W.W. Norton & Company (NY/London)      1
One Hour Entertainment                 1
Name: publisher, Length: 2294, dtype: int64


In [8]:
#count how many rows carry each distinct title
titles_sum = goodread_df['title'].value_counts()
print(titles_sum)

The Brothers Karamazov                                                                9
The Iliad                                                                             9
Gulliver's Travels                                                                    8
The Odyssey                                                                           8
Anna Karenina                                                                         8
                                                                                     ..
Beatrix Potter's Journal                                                              1
E=mc²: A Biography of the World's Most Famous Equation                                1
Shampoo Planet                                                                        1
Fermat's Enigma: The Epic Quest to Solve the World's Greatest Mathematical Problem    1
The Hannibal Lecter Trilogy                                                           1
Name: title, Length: 10352, dtyp

In [9]:
#count how many rows carry each distinct isbn value
isbn_sum = goodread_df['isbn'].value_counts()
print(isbn_sum)

3.58          2
440169127     1
312865767     1
374455031     1
671024205     1
             ..
23513209      1
7179812       1
039700804X    1
8838461139    1
1572705485    1
Name: isbn, Length: 11126, dtype: int64


In [10]:
#list whatever values ended up in the extra, unnamed column
extra_col_sum = goodread_df.loc[:, 'Unnamed: 12'].value_counts()
print(extra_col_sum)

Brown Son & Ferguson Ltd.      1
Cold Spring Press              1
Harvard University Press       1
Huntington House Publishers    1
Name: Unnamed: 12, dtype: int64


### Update goodread_df rows that are out of alignment

In [12]:
#rows whose num_pages holds a language code instead of a number
shifted_mask = goodread_df['num_pages'].isin(['eng', 'en-US'])
outofalignment_df = goodread_df[shifted_mask]

outofalignment_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner,Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net,one of the founding members of this Tolkien w...,3.58,1593600119,9781590000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley,Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown,Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [13]:
#pull the four out-of-alignment rows out by index label
misaligned_labels = [3348, 4702, 5877, 8979]
df_misshapen = goodread_df.loc[misaligned_labels]

#the author text was split across 'authors' and 'average_rating'; join the two pieces back together
df_misshapen['authors'] = df_misshapen['authors'] + df_misshapen['average_rating']

#'average_rating' only held the tail of the author text, so it can be dropped
df_misshapen = df_misshapen.drop(columns=['average_rating'])

df_misshapen.head()

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9781590000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [14]:
loop_count = 0

def update_col_names(df, col_list, loop_count):
    """Return a copy of ``df`` whose columns are relabelled by position.

    The i-th column of ``df`` receives the name ``col_list[loop_count + i]``.
    All labels are assigned in a single step, so a new name that equals
    another column's current name cannot make two columns collapse into the
    same label (which renaming one column at a time can do).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to relabel; it is not modified.
    col_list : list
        Replacement column names.
    loop_count : int
        Offset into ``col_list`` of the name given to the first column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` carrying the new column labels.

    Raises
    ------
    IndexError
        If ``col_list`` holds fewer than ``loop_count + len(df.columns)`` names.
    """
    stop = loop_count + len(df.columns)
    if stop > len(col_list):
        # keep the exception type the element-by-element lookup used to raise
        raise IndexError("list index out of range")
    renamed = df.copy()
    renamed.columns = list(col_list[loop_count:stop])
    return renamed

#relabel the shifted rows with the header names in col_names, starting from its first entry (loop_count is 0)
df_reshaped = update_col_names(df_misshapen, col_names, loop_count)

df_reshaped.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780670000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9781590000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9781560000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780850000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [15]:
#overwrite poorly formatted rows with correct formatted rows
#rows are matched by index label on both sides, and .loc aligns the assigned row on
#column labels; df_reshaped has no 'Unnamed: 12' column, so that cell ends up as NaN
for row_label in df_reshaped.index:
    goodread_df.loc[row_label] = df_reshaped.loc[row_label]

In [16]:
#confirm the extra, unnamed column no longer holds any values
extra_col_sum = goodread_df.loc[:, "Unnamed: 12"].value_counts()
print(extra_col_sum)

Series([], Name: Unnamed: 12, dtype: int64)


In [17]:
#remove the now-empty trailing column from the frame
goodread_df = goodread_df.drop(columns=['Unnamed: 12'])

#mirror that in the header list and switch its third entry to the singular 'author'
#(the frame's own column is still called 'authors' at this point)
col_names = col_names[:2] + ['author'] + col_names[3:-1]
col_names

['bookID',
 'title',
 'author',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

### Create dataframe with only books in English

In [19]:
#count how many rows carry each language code
language_sum = goodread_df['language_code'].value_counts()
print(language_sum)

eng      8911
en-US    1409
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
por        10
en-CA       7
ita         5
enm         3
lat         3
rus         2
swe         2
srp         1
wel         1
ara         1
ale         1
gla         1
nl          1
tur         1
nor         1
glg         1
msa         1
Name: language_code, dtype: int64


In [20]:
#fold the English variants (en-US, en-GB, en-CA, enm) into 'eng'
#only language_code is rewritten; a frame-wide replace would also alter any
#other column that happened to contain one of these strings
english_variants = ['en-US', 'en-GB', 'en-CA', 'enm']
goodread_df = goodread_df.assign(
    language_code=goodread_df['language_code'].replace(english_variants, 'eng')
)

#check language codes
language_sum = goodread_df.language_code.value_counts()
print(language_sum)

#slice holding the non-English rows
non_eng_goodread_slice = goodread_df[(goodread_df.language_code != 'eng')]

#index labels of the non-English rows
non_eng_book_index = list(non_eng_goodread_slice.index.values)

#drop all non-eng rows, then renumber the remaining rows from 0
eng_goodread_df = goodread_df.drop(non_eng_book_index, axis=0).reset_index(drop=True)

print(len(eng_goodread_df))

eng    10544
spa      218
fre      144
ger       99
jpn       46
mul       19
zho       14
grc       11
por       10
ita        5
lat        3
swe        2
rus        2
wel        1
srp        1
nor        1
ara        1
ale        1
gla        1
nl         1
tur        1
glg        1
msa        1
Name: language_code, dtype: int64


NameError: name 'corrected_goodread_df' is not defined

In [21]:
print(len(eng_goodread_df))

10544


In [22]:
#split authors on the first "/" and keep only the first name as 'author'; the remainder is not kept
author_split = eng_goodread_df['authors'].str.split("/", n=1, expand=True)

eng_goodread_df = (
    eng_goodread_df
    .assign(author=author_split[0])
    .drop(columns=['authors'])
)
eng_goodread_df.head()

Unnamed: 0,bookID,title,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,author
0,1,Harry Potter and the Half-Blood Prince (Harry ...,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/06,Scholastic Inc.,J.K. Rowling
1,2,Harry Potter and the Order of the Phoenix (Har...,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/04,Scholastic Inc.,J.K. Rowling
2,4,Harry Potter and the Chamber of Secrets (Harry...,4.42,439554896,9780439554893,eng,352,6333,244,11/1/03,Scholastic,J.K. Rowling
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/04,Scholastic Inc.,J.K. Rowling
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/04,Scholastic,J.K. Rowling


In [23]:
#put the columns in the order given by col_names
eng_goodread_df = eng_goodread_df.loc[:, col_names]
eng_goodread_df.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling,4.78,439682584,9780439682589,eng,2690,41428,164,9/13/04,Scholastic


In [24]:
#count how many rows each primary author has
authors_sum = eng_goodread_df['author'].value_counts()
print(authors_sum)

William Shakespeare    88
Stephen King           64
J.R.R. Tolkien         47
P.G. Wodehouse         46
Agatha Christie        45
                       ..
Rosetta James           1
Mark Kermode            1
Julie Garwood           1
Peter Hessler           1
James Pinocchio         1
Name: author, Length: 4169, dtype: int64


In [25]:
#count how many rows carry each distinct title among the English books
titles_sum = eng_goodread_df['title'].value_counts()
print(titles_sum)

The Brothers Karamazov             9
The Iliad                          9
'Salem's Lot                       8
The Odyssey                        8
Gulliver's Travels                 8
                                  ..
Floor Games (Sandplay Classics)    1
Blow Fly (Kay Scarpetta  #12)      1
The Shape of Things to Come        1
Beatrix Potter's Journal           1
The Hannibal Lecter Trilogy        1
Name: title, Length: 9808, dtype: int64


In [26]:
#inspect every row listed under one frequently repeated title
is_karamazov = eng_goodread_df['title'].eq('The Brothers Karamazov')
karamazov_df = eng_goodread_df.loc[is_karamazov]
karamazov_df.head(9)

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
1330,4933,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,451527348,9780451527349,eng,736,983,91,6/1/99,Signet Classics
1331,4934,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,374528373,9780374528379,eng,796,191531,6795,6/14/02,Farrar Straus and Giroux
1332,4935,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,140449248,9780140449242,eng,1013,1673,184,2/27/03,Penguin Books Ltd
1333,4936,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,679729259,9780679729259,eng,796,617,80,9/3/91,Vintage Books USA
1334,4938,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,1596440791,9781596440791,eng,16,20,2,5/1/05,Hovel Audio
1335,4940,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,159308045X,9781593080457,eng,720,1089,202,7/25/04,Barnes Noble Classics
1551,5691,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,99922800,9780099922803,eng,796,443,55,1/16/92,Vintage Classics
1897,7135,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,553212168,9780553212167,eng,1072,1022,154,4/1/84,Bantam Classics
8872,37058,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,1596440783,9781596440784,eng,16,3,1,5/1/05,Hovel Audio


### Deal with num_pages containing audio hours

In [27]:
#flag rows whose publisher name contains the word Audio
audiobook_bool_series = eng_goodread_df['publisher'].str.contains('Audio')

#keep only the flagged entries
audiobook_bool_slice = audiobook_bool_series[audiobook_bool_series.eq(True)]

#index labels of the flagged rows
audiobook_bool_index = list(audiobook_bool_slice.index.values)

#frame holding just the flagged rows
df_audiobook = eng_goodread_df.loc[audiobook_bool_index]
print(len(df_audiobook))

print(set(df_audiobook['publisher']))

181
{'Nova Audio Books', 'Hovel Audio', 'RH Audio', 'BBC Audiobooks America', 'The Audio Partners', 'Scholastic Audio Books', 'Harper Audio', 'Hodder Audio', 'Macmillan Audio', 'Blackstone Audiobooks', 'Brilliance Audio', 'Canadian Broadcasting Corporation (CBC Audio)', 'BBC Audiobooks Ltd', 'Simon  Schuster Audio', 'MacMillan Audio', 'RH Audio Price-less', 'Phoenix Audio', 'Audio Partners', 'Random House Audio', 'AudioGO', 'Audio Renaissance', 'Audiogo', 'BBC Audiobooks', 'Penguin Audio UK', 'Hachette Audio', 'Naxos Audiobooks', 'Audio Literature', 'BBC Physical Audio', 'Highbridge Audio', 'HarperAudio', 'Listening Library (Audio)', 'Simon & Schuster Audio', 'AudioText', 'Random House Audio Publishing Group', 'Puffin Audiobooks', 'Penguin Audio'}


In [28]:
#place audio_hrs directly after num_pages in the column order
#any earlier audio_hrs entry is removed first, so re-running this cell yields the same list
col_names = [name for name in col_names if name != 'audio_hrs']
col_names.insert(col_names.index('num_pages') + 1, 'audio_hrs')
col_names

['bookID',
 'title',
 'author',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'audio_hrs',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

In [29]:
#the num_pages values of these rows are relabelled as audio_hrs; num_pages itself is set to 0
df_audiobook = (
    df_audiobook
    .rename(columns={'num_pages': 'audio_hrs'})
    .assign(num_pages=0)
)

df_audiobook = df_audiobook[col_names]
df_audiobook.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
10,16,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,739322206,9780739322208,eng,0,6,1266,253,3/23/05,Random House Audio
40,68,The Known World,Edward P. Jones,3.83,006076273X,9780060762735,eng,0,14,55,12,6/15/04,HarperAudio
210,524,Lord of the Flies,William Golding,3.68,307281701,9780307281708,eng,0,6,408,96,10/11/05,Listening Library (Audio)
353,1281,Men Are from Mars Women Are from Venus,John Gray,3.55,006123205X,9780061232053,eng,0,2,43,3,4/3/07,HarperAudio
636,2166,The Old Man and the Sea,Ernest Hemingway,3.77,743564367,9780743564366,eng,0,3,393,77,5/1/06,Simon Schuster Audio


In [30]:
#remove the rows that were copied into df_audiobook
eng_goodread_text_audio_df = eng_goodread_df.drop(index=audiobook_bool_index)

In [32]:
#stack the reworked audiobook rows under the remaining rows and put the columns in col_names order
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
eng_goodread_text_audio_df = pd.concat([eng_goodread_text_audio_df, df_audiobook])
eng_goodread_text_audio_df = eng_goodread_text_audio_df[col_names]

#rows that did not come from df_audiobook have no audio_hrs value; record them as 0
eng_goodread_text_audio_df['audio_hrs'] = eng_goodread_text_audio_df['audio_hrs'].fillna(0)

#rows with a non-zero audio_hrs value, kept for inspection
audio_hrs_slice = eng_goodread_text_audio_df[(eng_goodread_text_audio_df['audio_hrs'] != 0)]

#reset index
eng_goodread_text_audio_df = eng_goodread_text_audio_df.reset_index(drop=True)

print(len(eng_goodread_text_audio_df))

10725


### Check for missing values in the num_pages and audio_hrs

In [33]:
#correct the author == not a book rows
notabook_slice = eng_goodread_text_audio_df[(eng_goodread_text_audio_df['author'] == 'NOT A BOOK')]

#replacement audio_hrs for the two rows being corrected, keyed by row position
notabook_audio_hrs = {10373: 6.5, 10374: 1}
audio_hrs_pos = eng_goodread_text_audio_df.columns.get_loc('audio_hrs')
author_pos = eng_goodread_text_audio_df.columns.get_loc('author')

for row_pos, hours in notabook_audio_hrs.items():
    #write each cell directly, and only while it still holds its placeholder value;
    #newer pandas raises ValueError for Series.replace with a dict to_replace and a scalar value
    if eng_goodread_text_audio_df.iat[row_pos, audio_hrs_pos] == '0':
        eng_goodread_text_audio_df.iat[row_pos, audio_hrs_pos] = hours
    if eng_goodread_text_audio_df.iat[row_pos, author_pos] == 'NOT A BOOK':
        eng_goodread_text_audio_df.iat[row_pos, author_pos] = 'Anthony Boucher'

eng_goodread_text_audio_df.iloc[10373]

bookID                                                             3593
title                 Murder by Moonlight & Other Mysteries (New Adv...
author                                                  Anthony Boucher
average_rating                                                        4
isbn                                                          743564677
isbn13                                                    9780743564670
language_code                                                       eng
num_pages                                                             0
audio_hrs                                                           6.5
ratings_count                                                         7
text_reviews_count                                                    2
publication_date                                                10/3/06
publisher                                         Simon  Schuster Audio
Name: 10373, dtype: object

In [34]:
#drop the Not A Book rows at index labels 10438-10440, then renumber from 0
eng_goodread_text_audio_df = (
    eng_goodread_text_audio_df
    .drop(index=[10438, 10439, 10440])
    .reset_index(drop=True)
)

### Fix datatypes

In [35]:
#work on a copy while the datatypes are fixed
corrected_goodread_df = eng_goodread_text_audio_df.copy()

#cast bookID to str, audio_hrs and average_rating to float, num_pages and text_reviews_count to int
corrected_goodread_df = corrected_goodread_df.astype({
    'bookID': str,
    'audio_hrs': float,
    'average_rating': float,
    'num_pages': int,
    'text_reviews_count': int,
})

corrected_goodread_df['publication_date'].sort_values()

#parse publication_date; values that cannot be parsed become NaT
corrected_goodread_df['publication_date'] = pd.to_datetime(corrected_goodread_df['publication_date'], errors='coerce')

corrected_goodread_df.dtypes

bookID                        object
title                         object
author                        object
average_rating               float64
isbn                          object
isbn13                        object
language_code                 object
num_pages                      int64
audio_hrs                    float64
ratings_count                  int64
text_reviews_count             int64
publication_date      datetime64[ns]
publisher                     object
dtype: object

### Update Tantor Media audiobooks

In [36]:
#rows published by Tantor Media
is_tantor = corrected_goodread_df['publisher'] == 'Tantor Media'
tantor_media_df = corrected_goodread_df[is_tantor]

#index labels of the Tantor Media rows
tantormedia_index_list = list(tantor_media_df.index.values)

#length to record for each Tantor Media row, keyed by row index
tantormedia_audiobooklength = {801: 6.5, 2027: 11, 2527: 12.75, 4926: 11.5, 9353: 9.25, 9354: 5.75}

In [37]:
def update_booklength(df, booklength_list, booklength_dict):
    """Fill in zero book lengths on ``df`` and return it.

    For every ``row: value`` pair in ``booklength_dict`` the row at integer
    position ``row`` is updated: rows listed in ``booklength_list`` get
    ``value`` written to ``audio_hrs``, all other rows get it written to
    ``num_pages``. A cell is only overwritten while it still holds 0, so
    lengths that are already populated are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``audio_hrs`` and/or ``num_pages`` columns; modified in place.
    booklength_list : iterable of int
        Row positions whose value belongs in ``audio_hrs``.
    booklength_dict : dict
        Maps row position to the length to record.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the updates.
    """
    audiobook_rows = set(booklength_list)
    for row_pos, length in booklength_dict.items():
        column = 'audio_hrs' if row_pos in audiobook_rows else 'num_pages'
        col_pos = df.columns.get_loc(column)
        # Direct cell access: Series.replace with a dict to_replace and a scalar
        # value is rejected by newer pandas, and the old 'num_pgs' key never
        # matched a column, so page counts were silently left at 0.
        if df.iat[row_pos, col_pos] == 0:
            df.iat[row_pos, col_pos] = length
    return df

In [38]:
corrected_goodread_df = update_booklength(corrected_goodread_df, tantormedia_index_list, tantormedia_audiobooklength)

In [39]:
corrected_goodread_df[(corrected_goodread_df.publisher == 'Tantor Media')]

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
801,2835,The Tragedy of Pudd'nhead Wilson,Mark Twain,3.79,140015068X,9781400150687,eng,0,6.5,3,0,2003-01-01,Tantor Media
2027,7737,Gulliver's Travels,Jonathan Swift,3.57,1400102723,9781400102723,eng,0,11.0,4,1,2006-10-01,Tantor Media
2527,9832,Blind Willow Sleeping Woman: 24 Stories,Haruki Murakami,3.84,1400102952,9781400102952,eng,0,12.75,28,3,2006-10-15,Tantor Media
4926,19081,Madame Bovary,Gustave Flaubert,3.67,1400132746,9781400132744,eng,0,11.5,3,0,2006-10-01,Tantor Media
9353,40378,The Chessmen of Mars (Barsoom #5),Edgar Rice Burroughs,3.83,1400130212,9781400130214,eng,0,9.25,5147,157,2005-01-01,Tantor Media
9354,40379,The Warlord of Mars (Barsoom #3),Edgar Rice Burroughs,3.86,1400130220,9781400130221,eng,6,5.75,9350,345,2001-02-01,Tantor Media


In [40]:
corrected_goodread_df.iloc[801]

bookID                                            2835
title                 The Tragedy of Pudd'nhead Wilson
author                                      Mark Twain
average_rating                                    3.79
isbn                                        140015068X
isbn13                                   9781400150687
language_code                                      eng
num_pages                                            0
audio_hrs                                          6.5
ratings_count                                        3
text_reviews_count                                   0
publication_date                   2003-01-01 00:00:00
publisher                                 Tantor Media
Name: 801, dtype: object

In [41]:
corrected_goodread_df.dtypes

bookID                        object
title                         object
author                        object
average_rating               float64
isbn                          object
isbn13                        object
language_code                 object
num_pages                      int64
audio_hrs                    float64
ratings_count                  int64
text_reviews_count             int64
publication_date      datetime64[ns]
publisher                     object
dtype: object

### Update rows with the zero book length

In [42]:
#rows that have both num_pages and audio_hrs equal to 0
no_pages = corrected_goodread_df['num_pages'].eq(0)
no_audio = corrected_goodread_df['audio_hrs'].eq(0)
zero_booklength_df = corrected_goodread_df[no_pages & no_audio]

#index labels of those rows
zero_booklength_index = list(zero_booklength_df.index.values)

In [43]:
zero_booklength_df[(zero_booklength_df.publisher == 'Tantor Media')]

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher


In [45]:
print(len(zero_booklength_index))

111


In [46]:
#create a new df with zero booklength books
df_zero_booklength = corrected_goodread_df.loc[zero_booklength_index]

df_zero_booklength.to_csv('zero_booklength_01.csv')

In [None]:
#length to fill in for each zero-length row, keyed by row index
#NOTE(review): values look like page counts for most rows and hours for the rows listed in audiobook_index - confirm against the lookup source
zero_booklength_dict = {2300:192, 2308:1015, 2605:1000, 3396:8.5, 3611:271, 4214:5.25, 4485:12, 4486:12, 5373:432, 5418:15.25,
                        5419:13, 5601:241, 6609:288, 7055:3, 8330:341, 8366:398, 8491:192, 8892:221, 9513:241, 10194:318,
                        10376:17, 10381:20.5, 10384:6, 10385:4.5, 10389:15.25, 10396:1.5, 10404:3.5, 10407:16, 10414:37.25,
                        10420:6.5, 10423:7, 10431:16.75, 10434:5.25, 10440:6.75, 10442:9, 10445:22, 10447:5.75, 10457:5.25,
                        10458:6.5, 10459:5.75, 10460:6, 10479:21.25, 10481:3, 10482:16.25, 10483:3, 10484:15.5, 10487:8.75,
                        10489:10, 10490:5.5, 10492:0.75, 10497:1, 10498:1.25, 10499:3.25, 10501:3, 10502:1.5, 10506:6.25,
                        10510:11.5, 10517:14.75, 10522:2.5, 10524:8.5, 10527:34.5, 10531:1.25, 10539:2}
#keys of zero_booklength_dict passed to update_booklength as the rows to treat as audiobooks
audiobook_index = [3396, 4214, 4485, 4486, 5418, 5419, 7055, 10376, 10381, 10384,10385, 10389, 10396, 10404, 10407, 
                   10414, 10420, 10423, 10431, 10434, 10440, 10442, 10445, 10447, 10457, 10458, 10459, 10460, 10479, 
                   10481, 10482, 10483, 10484, 10487, 10489, 10490, 10492, 10497, 10498, 10499, 10501, 10502, 10506, 
                   10510, 10517, 10522, 10524, 10527, 10531, 10539]

In [None]:
corrected_goodread_df = update_booklength(corrected_goodread_df, audiobook_index, zero_booklength_dict)

In [None]:
#create a new df with zero booklength books
booklength_df = corrected_goodread_df.loc[zero_booklength_index]
booklength_df

In [None]:
booklength_df = corrected_goodread_df[(corrected_goodread_df.num_pages == 0) & (corrected_goodread_df.audio_hrs == 0)]
booklength_df

print(len(corrected_goodread_df))

In [None]:
#drop the rows at index labels 1984 and 291, then renumber from 0
corrected_goodread_df = (
    corrected_goodread_df
    .drop(index=[1984, 291])
    .reset_index(drop=True)
)

print(len(corrected_goodread_df))

In [None]:
### Update null publication dates

In [None]:
#check publication date column
pub_date_slice = corrected_goodread_df[corrected_goodread_df.publication_date.isnull()]
pub_date_slice

In [None]:
corrected_goodread_df.iloc[7600]

In [None]:
#replace incorrect date
date_correction = pd.to_datetime('2000-10-31')

#apply the correction to row 7600 only, and only while its date is still missing;
#a frame-wide fillna would stamp this one date onto every row with a null publication_date
if pd.isnull(corrected_goodread_df.loc[7600, 'publication_date']):
    corrected_goodread_df.loc[7600, 'publication_date'] = date_correction

In [None]:
corrected_goodread_df['publication_date'][7600]

In [None]:
### Create a series column

In [None]:
#flag rows whose title contains the character #
bookseries_series = corrected_goodread_df['title'].str.contains('#')

#keep only the flagged entries
bookseries_bool_slice = bookseries_series[bookseries_series.eq(True)]

#index labels of the flagged rows
bookseries_bool_index = list(bookseries_bool_slice.index.values)

#frame holding just the rows whose title contains #
df_bookseries = corrected_goodread_df.loc[bookseries_bool_index]
df_bookseries['title'].head()

In [None]:
list_bookseries = df_bookseries['title'].to_list()
list_bookseries

In [None]:
#mark the extracted rows as series entries (on a new frame rather than in place)
df_bookseries = df_bookseries.assign(series=True)

#a row is a series entry exactly when its index label was selected into df_bookseries;
#flagging by label keeps the row count and order intact and can be re-run safely,
#unlike an outer merge keyed on every column
corrected_goodread_df['series'] = corrected_goodread_df.index.isin(df_bookseries.index)

In [None]:
corrected_goodread_df.head()

In [None]:
corrected_goodread_df.dtypes

In [None]:
print(len(corrected_goodread_df))

In [None]:
### Send information to CSV to help with adding more information to this dataset later

In [None]:
corrected_goodread_df['isbn'].to_csv('isbn.csv', index=False)

In [None]:
corrected_goodread_df['isbn13'].to_csv('isbn13.csv', index=False)

In [None]:
corrected_goodread_df.to_csv('corrected_goodread_df.csv', index=False)

In [None]:
#add columns for
#genre category (fiction/non-fiction)
#genre (memoir, play, science fiction, etc.)
#?

In [None]:
#re-sort by number of ratings, check head and tail
#re-sort by number of reviews, check head and tail
#re-sort by ratings, check head and tail

#number counts for:
#authors
#average_ratings
#publisher
#text_reviews_counts
#ratings_count