In [217]:
import numpy as np 
import pandas as pd

In [218]:
original_goodread_df = pd.read_csv('books.csv', error_bad_lines = False)
original_goodread_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/06,Scholastic Inc.,
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/04,Scholastic Inc.,
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/03,Scholastic,
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780000000000.0,eng,435,2339585,36325,5/1/04,Scholastic Inc.,
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/04,Scholastic,


In [219]:
#make a copy of the df so preserve original data
goodread_df = original_goodread_df.copy()

In [220]:
#creating column name list
col_names = list(goodread_df.columns)
print(col_names)

['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', '  num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12']


In [221]:
#updating column name to have no spaces
goodread_df = goodread_df.rename(columns={"  num_pages": "num_pages"})

In [222]:
#updating column name list
col_names = list(goodread_df.columns)
print(col_names)

['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'Unnamed: 12']


In [223]:
#check authors
authors_sum = goodread_df.authors.value_counts()
print(authors_sum)

Stephen King                         40
P.G. Wodehouse                       40
Rumiko Takahashi                     39
Orson Scott Card                     35
Agatha Christie                      33
                                     ..
Temple Grandin/Catherine  Johnson     1
W. Chan Kim/Renée Mauborgne           1
Arthur Bennett                        1
Alice Munro/Susanna Basso             1
Sherman Alexie                        1
Name: authors, Length: 6643, dtype: int64


In [224]:
#check publishers
publishers_sum = goodread_df.publisher.value_counts()
print(publishers_sum)

Vintage                                    318
Penguin Books                              261
Penguin Classics                           184
Mariner Books                              150
Ballantine Books                           144
                                          ... 
Simon & Schuster Adult Publishing Group      1
Bantam Books  Inc.                           1
Thomas Nelson Publishers                     1
Octopus Books                                1
Manesse Verlag                               1
Name: publisher, Length: 2294, dtype: int64


In [225]:
#check titles
titles_sum = goodread_df.title.value_counts()
print(titles_sum)

The Iliad                                  9
The Brothers Karamazov                     9
Anna Karenina                              8
Gulliver's Travels                         8
The Odyssey                                8
                                          ..
Influence: The Psychology of Persuasion    1
Lucy Sullivan Is Getting Married           1
Novels and Stories                         1
Ensayo sobre la lucidez                    1
The Satanic Verses                         1
Name: title, Length: 10352, dtype: int64


In [226]:
#check ISBNs
isbn_sum = goodread_df.isbn.value_counts()
print(isbn_sum)

3.58          2
068483183X    1
393002039     1
312426119     1
684854678     1
             ..
043930931X    1
393321630     1
1406503436    1
086516584X    1
898709199     1
Name: isbn, Length: 11126, dtype: int64


In [227]:
#check unnamed column
extra_col_sum = goodread_df['Unnamed: 12'].value_counts()
print(extra_col_sum)

Harvard University Press       1
Brown Son & Ferguson Ltd.      1
Cold Spring Press              1
Huntington House Publishers    1
Name: Unnamed: 12, dtype: int64


In [228]:
# Update goodread_df rows that are out of alignment

In [229]:
#create slice of DF to check out the alignment of certain rows
outofalignment_df = goodread_df[(goodread_df.num_pages == 'eng') | (goodread_df.num_pages == 'en-US')]

outofalignment_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner,Jr./Sam B. Warner,3.58,674842111,9780000000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net,one of the founding members of this Tolkien w...,3.58,1593600119,9780000000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley,Rawles,3.63,156384155X,9780000000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown,Son & Ferguson,0.0,851742718,9780000000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [230]:
#create a new df with rows out of alignment
df_misshapen = goodread_df.loc[[3348, 4702, 5877, 8979]]
df_misshapen.head()

#add the separated author columns together 
df_misshapen['authors_full'] = df_misshapen['authors'] + df_misshapen['average_rating']

#overwrite poorly formatted author column with correct author column 
df_misshapen['authors'] = df_misshapen['authors_full']

#drop unneeded columns
df_misshapen = df_misshapen.drop(['average_rating','authors_full'], axis=1)

df_misshapen.head()

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780000000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9780000000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9780000000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780000000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [231]:
loop_count = 0

def update_col_names(df, col_list, loop_count):
    for col in df.columns:
        df = df.rename(columns={col: col_list[loop_count]})
        loop_count += 1
    return df

df_reshaped = update_col_names(df_misshapen, col_names, loop_count)

df_reshaped.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
3348,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,674842111,9780000000000.0,en-US,236,61,6,4/20/04,Harvard University Press
4702,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9780000000000.0,eng,400,26,4,4/6/04,Cold Spring Press
5877,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9780000000000.0,eng,342,38,4,1/15/99,Huntington House Publishers
8979,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown Son & Ferguson,0.0,851742718,9780000000000.0,eng,49,0,0,5/1/77,Brown Son & Ferguson Ltd.


In [232]:
#overwrite poorly formatted rows with correct formatted rows 
goodread_df.iloc[3348] = df_reshaped.iloc[0]
goodread_df.iloc[4702] = df_reshaped.iloc[1]
goodread_df.iloc[5877] = df_reshaped.iloc[2]
goodread_df.iloc[8979] = df_reshaped.iloc[3]

In [233]:
#check unnamed column
extra_col_sum = goodread_df["Unnamed: 12"].value_counts()
print(extra_col_sum)

Series([], Name: Unnamed: 12, dtype: int64)


In [234]:
#drop empty column
goodread_df = goodread_df.drop(['Unnamed: 12'], axis=1)
col_names[2] = 'author'
col_names = col_names[:-1]
col_names

['bookID',
 'title',
 'author',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

In [235]:
#create dataframe with only books in English

In [236]:
#check language codes
language_sum = goodread_df.language_code.value_counts()
print(language_sum)

eng      8911
en-US    1409
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
por        10
en-CA       7
ita         5
lat         3
enm         3
swe         2
rus         2
nor         1
glg         1
nl          1
ara         1
wel         1
tur         1
gla         1
ale         1
srp         1
msa         1
Name: language_code, dtype: int64


In [237]:
#change en-US and en-GB to eng
goodread_df = goodread_df.replace(['en-US', 'en-GB', 'en-CA', 'enm'], 'eng')

#check language codes
language_sum = goodread_df.language_code.value_counts()
print(language_sum)

#create english book df slice
non_eng_goodread_slice = goodread_df[(goodread_df.language_code != 'eng')]

#create a list of indexes for english books
non_eng_book_index = list(non_eng_goodread_slice.index.values)

#drop all non-eng rows
eng_goodread_df = goodread_df.drop(non_eng_book_index, axis=0)

eng    10544
spa      218
fre      144
ger       99
jpn       46
mul       19
zho       14
grc       11
por       10
ita        5
lat        3
swe        2
rus        2
nor        1
nl         1
glg        1
ara        1
wel        1
tur        1
gla        1
ale        1
srp        1
msa        1
Name: language_code, dtype: int64


In [238]:
eng_goodread_df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780000000000.0,eng,435,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/04,Scholastic


In [239]:
#fix datatypes

In [240]:
#split out authors into primary author column and other authors column
author_split = eng_goodread_df.authors.str.split("/", n=1, expand=True) 

eng_goodread_df['author'] = author_split[0]

eng_goodread_df = eng_goodread_df.drop('authors', axis=1)
eng_goodread_df.head()

Unnamed: 0,bookID,title,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,author
0,1,Harry Potter and the Half-Blood Prince (Harry ...,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/06,Scholastic Inc.,J.K. Rowling
1,2,Harry Potter and the Order of the Phoenix (Har...,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/04,Scholastic Inc.,J.K. Rowling
2,4,Harry Potter and the Chamber of Secrets (Harry...,4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/03,Scholastic,J.K. Rowling
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,4.56,043965548X,9780000000000.0,eng,435,2339585,36325,5/1/04,Scholastic Inc.,J.K. Rowling
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/04,Scholastic,J.K. Rowling


In [241]:
#reorder goodread_df columns
eng_goodread_df = eng_goodread_df[col_names]
eng_goodread_df.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780000000000.0,eng,352,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,4.56,043965548X,9780000000000.0,eng,435,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling,4.78,439682584,9780000000000.0,eng,2690,41428,164,9/13/04,Scholastic


In [242]:
#check authors
authors_sum = eng_goodread_df.author.value_counts()
print(authors_sum)

William Shakespeare    88
Stephen King           64
J.R.R. Tolkien         47
P.G. Wodehouse         46
Agatha Christie        45
                       ..
Eireann Corrigan        1
Dave Luckett            1
Clement Greenberg       1
Geoff Emerick           1
Sherman Alexie          1
Name: author, Length: 4169, dtype: int64


In [243]:
#check titles
titles_sum = eng_goodread_df.title.value_counts()
print(titles_sum)

The Brothers Karamazov                                                                                                                             9
The Iliad                                                                                                                                          9
The Odyssey                                                                                                                                        8
'Salem's Lot                                                                                                                                       8
Gulliver's Travels                                                                                                                                 8
                                                                                                                                                  ..
The Hidden Smile of God: The Fruit of Affliction in the Lives of John Bunyan  William Cowper  and David Br

In [244]:
karamazov_df = eng_goodread_df[(eng_goodread_df.title == 'The Brothers Karamazov')]

In [245]:
karamazov_df.head(9)

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
1406,4933,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,451527348,9780000000000.0,eng,736,983,91,6/1/99,Signet Classics
1407,4934,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,374528373,9780000000000.0,eng,796,191531,6795,6/14/02,Farrar Straus and Giroux
1408,4935,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,140449248,9780000000000.0,eng,1013,1673,184,2/27/03,Penguin Books Ltd
1409,4936,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,679729259,9780000000000.0,eng,796,617,80,9/3/91,Vintage Books USA
1410,4938,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,1596440791,9780000000000.0,eng,16,20,2,5/1/05,Hovel Audio
1411,4940,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,159308045X,9780000000000.0,eng,720,1089,202,7/25/04,Barnes Noble Classics
1634,5691,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,99922800,9780000000000.0,eng,796,443,55,1/16/92,Vintage Classics
1990,7135,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,553212168,9780000000000.0,eng,1072,1022,154,4/1/84,Bantam Classics
9356,37058,The Brothers Karamazov,Fyodor Dostoyevsky,4.32,1596440783,9780000000000.0,eng,16,3,1,5/1/05,Hovel Audio


In [246]:
#deal with num_pages containing audio hours

In [247]:
audiobook_bool_series.head()

0    False
1    False
2    False
3    False
4    False
Name: publisher, dtype: bool

In [248]:
#create Boolean series for publishers that contain the word Audio
audiobook_bool_series = eng_goodread_df['publisher'].str.contains('Audio')

#create audiobook df slice
audiobook_bool_slice = audiobook_bool_series[(audiobook_bool_series == True)]

#create a list of indexes for audiobooks
audiobook_bool_index = list(audiobook_bool_slice.index.values)

#create a new df with audiobook rows
df_audiobook = eng_goodread_df.loc[audiobook_index_list]
df_audiobook.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
10,16,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,739322206,9780000000000.0,eng,6,1266,253,3/23/05,Random House Audio
40,68,The Known World,Edward P. Jones,3.83,006076273X,9780000000000.0,eng,14,55,12,6/15/04,HarperAudio
215,524,Lord of the Flies,William Golding,3.68,307281701,9780000000000.0,eng,6,408,96,10/11/05,Listening Library (Audio)
370,1281,Men Are from Mars Women Are from Venus,John Gray,3.55,006123205X,9780000000000.0,eng,2,43,3,4/3/07,HarperAudio
673,2166,The Old Man and the Sea,Ernest Hemingway,3.77,743564367,9780000000000.0,eng,3,393,77,5/1/06,Simon Schuster Audio


In [250]:
#reorder df_audiobooks columns
col_names.append('audio_hrs')

col_names = [col_names[0], col_names[1], col_names[2], col_names[3], col_names[4], col_names[5], col_names[6], col_names[7], col_names[12], col_names[8], col_names[9], col_names[10], col_names[11]]
col_names

['bookID',
 'title',
 'author',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'audio_hrs',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

In [251]:
#rename df_audiobook columns and add in a new num_pages column
df_audiobook = df_audiobook.rename(columns={'num_pages':'audio_hrs'})
df_audiobook['num_pages'] = 'NA'

df_audiobook = df_audiobook[col_names]
df_audiobook.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
10,16,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,739322206,9780000000000.0,eng,,6,1266,253,3/23/05,Random House Audio
40,68,The Known World,Edward P. Jones,3.83,006076273X,9780000000000.0,eng,,14,55,12,6/15/04,HarperAudio
215,524,Lord of the Flies,William Golding,3.68,307281701,9780000000000.0,eng,,6,408,96,10/11/05,Listening Library (Audio)
370,1281,Men Are from Mars Women Are from Venus,John Gray,3.55,006123205X,9780000000000.0,eng,,2,43,3,4/3/07,HarperAudio
673,2166,The Old Man and the Sea,Ernest Hemingway,3.77,743564367,9780000000000.0,eng,,3,393,77,5/1/06,Simon Schuster Audio


In [252]:
#drop rows for data in df_audiobook
goodread_audiobook_df = goodread_df.drop(audiobook_index_list, axis=0)

In [253]:
#append updated df_audiobook to goodread_audiobook_df and reorganize columns
goodread_audiobook_df = goodread_audiobook_df.append(df_audiobook)
goodread_audiobook_df = goodread_audiobook_df[col_names]
goodread_audiobook_df.head()

Unnamed: 0,bookID,title,author,average_rating,isbn,isbn13,language_code,num_pages,audio_hrs,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,,4.57,439785960,9780000000000.0,eng,652,,2095690,27591,9/16/06,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,,4.49,439358078,9780000000000.0,eng,870,,2153167,29221,9/1/04,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,,4.42,439554896,9780000000000.0,eng,352,,6333,244,11/1/03,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,,4.56,043965548X,9780000000000.0,eng,435,,2339585,36325,5/1/04,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,,4.78,439682584,9780000000000.0,eng,2690,,41428,164,9/13/04,Scholastic


In [254]:
#add columns for
#audiobooks
#series
#genre category (fiction/non-fiction)
#genre (memoir, play, science fiction, etc.)
#?

In [255]:
#re-sort by number of ratings, check head and tail
#re-sort by number of reviews, check head and tail
#re-sort by ratings, check head and tail

#number counts for:
#authors
#average_ratings
#publisher
#text_reviews_counts
#ratings_count