# Goodbooks Exercise
### Load the Data

In [1]:
import pandas as pd

# Location of books.csv in the goodbooks-10k GitHub repository.
path = 'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv'
book_df = pd.read_csv(filepath_or_buffer=path)


### Check the Data for import problems

In [2]:
import string

# Replace missing original titles with a placeholder so the .str accessor
# below never has to deal with NaN.
book_df['original_title'] = book_df['original_title'].fillna('No Title')

# Show a sample of titles that contain no ASCII letter at all (for example
# titles written in another script, or purely numeric ones). The character
# class matches exactly the same rows as an "a|b|...|Z" alternation built
# from string.ascii_letters, but is easier to read.
book_df[~book_df['original_title'].str.contains(r'[A-Za-z]', regex=True)].head()

# The non-Latin titles in this sample are displayed correctly, so nothing
# special is needed when importing the data.

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
78,79,1381,1381,3356006,1703,143039954,9780143000000.0,"Homer, Robert Fagles, E.V. Rieu, Frédéric Mugl...",-720.0,Ὀδύσσεια,...,670326,710757,8101,29703,65629,183082,224120,208223,https://images.gr-assets.com/books/1390173285m...,https://images.gr-assets.com/books/1390173285s...
171,172,15823480,15823480,2507928,1492,345803922,9780346000000.0,"Leo Tolstoy, Louise Maude, Leo Tolstoj, Aylmer...",1877.0,Анна Каренина,...,297472,472796,18064,11738,26945,88365,158179,187569,https://images.gr-assets.com/books/1352422904m...,https://images.gr-assets.com/books/1352422904s...
176,177,7144,7144,3393917,1714,143058142,9780143000000.0,"Fyodor Dostoyevsky, David McDuff",1866.0,Преступление и наказание,...,380903,444675,12605,9477,20078,64050,137104,213966,https://images.gr-assets.com/books/1382846449m...,https://images.gr-assets.com/books/1382846449s...
294,295,10644930,10644930,15553789,145,1451627289,9781452000000.0,Stephen King,2011.0,11/22/63,...,258464,303057,30656,2927,7649,35500,109392,147589,https://images.gr-assets.com/books/1327876792m...,https://images.gr-assets.com/books/1327876792s...
340,341,1371,1371,3293141,1726,140275363,9780140000000.0,"Homer, Robert Fagles, Frédéric Mugler, Bernard...",-750.0,Ἰλιάς,...,241088,273565,4763,7701,20845,68844,89384,86791,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


### Decide how to handle the empty values in columns
 - I will drop the rows with a missing ISBN-13 and replace missing language codes with 'Not Provided'.

In [3]:
# Drop the rows that have no ISBN-13. The explicit copy makes book_df an
# independent frame, so column assignments in later cells do not raise
# SettingWithCopyWarning.
book_df = book_df[book_df['isbn13'].notna()].copy()

# Distribution of the string lengths found in the isbn column.
book_df['isbn'].str.len().value_counts()

9.0     5570
10.0    2683
8.0      915
7.0      112
Name: isbn, dtype: int64

In [4]:
# Store isbn13 as text so that its length and individual characters can be inspected.
book_df['isbn13'] = book_df['isbn13'].astype(str)

In [5]:
# Count how many isbn13 strings there are of each length.
book_df['isbn13'].str.len().value_counts()

15    9407
13       4
14       3
11       1
Name: isbn13, dtype: int64

In [6]:
# Length distribution of the isbn13 strings before filtering.
book_df['isbn13'].str.len().value_counts()

15    9407
13       4
14       3
11       1
Name: isbn13, dtype: int64

In [7]:
# Keep only the isbn13 strings with the dominant length of 15 characters
# (13 digits followed by the ".0" of the float representation). The explicit
# copy keeps later column assignments from raising SettingWithCopyWarning.
book_df = book_df[book_df['isbn13'].str.len() == 15].copy()

In [8]:
# Confirm which isbn13 string lengths remain after the filter.
book_df['isbn13'].str.len().value_counts()

15    9407
Name: isbn13, dtype: int64

In [9]:
# Give rows without a language code an explicit placeholder value.
book_df['language_code'] = book_df['language_code'].fillna(value='Not Provided')

### Format ISBN 13 to match specifications

In [10]:
def format_isbn(old_isbn):
    """Rebuild a hyphenated ISBN-13 from a float-formatted ISBN string.

    The last three characters of ``old_isbn`` are discarded, a check digit is
    computed from the remaining digits, and the result is returned in the
    fixed layout ``3-1-5-3-1`` (digit counts per hyphen-separated group).

    Parameters
    ----------
    old_isbn : str
        ISBN text such as ``'9780439023480.0'``.

    Returns
    -------
    str
        Hyphenated ISBN, e.g. ``'978-0-43902-348-1'``.

    Raises
    ------
    ValueError
        If one of the retained characters is not a decimal digit.
    """
    # For input like '9780439023480.0' this removes the ".0" float suffix and
    # the final digit, which is replaced by the computed check digit.
    digits = old_isbn[:-3]

    # Weights alternate 1, 3, 1, 3, ... starting from the leftmost digit.
    weighted_sum = sum(
        int(ch) * (3 if pos % 2 else 1) for pos, ch in enumerate(digits)
    )

    # The check digit tops the weighted sum up to the next multiple of ten;
    # the outer modulo turns a result of 10 into 0. Pure integer arithmetic
    # avoids relying on float rounding.
    check_digit = (10 - weighted_sum % 10) % 10

    s = f'{digits}{check_digit}'
    return f'{s[:3]}-{s[3:4]}-{s[4:9]}-{s[9:12]}-{s[12:]}'

# Build the formatted ISBN column by passing every isbn13 string through format_isbn.
book_df['new_isbn13'] = book_df['isbn13'].apply(format_isbn)

### Add a column to show where original title and title differ.

In [11]:
# Label each row by whether title and original_title are identical. The
# comparison is done on whole columns instead of row by row, which yields the
# same labels without calling a Python function for every row.
book_df['title_mismatch'] = (
    book_df['title'] == book_df['original_title']
).map({True: 'match', False: 'difference'})
book_df.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,new_isbn13,title_mismatch
0,1,2767052,2767052,2792775,272,439023483,9780439023480.0,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,978-0-43902-348-1,difference
1,2,3,3,4640799,491,439554934,9780439554930.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,978-0-43955-493-0,difference
2,3,41865,41865,3212258,226,316015849,9780316015840.0,Stephenie Meyer,2005.0,Twilight,...,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,978-0-31601-584-4,difference
3,4,2657,2657,3275794,487,61120081,9780061120080.0,Harper Lee,1960.0,To Kill a Mockingbird,...,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,978-0-06112-008-4,match
4,5,4671,4671,245494,1356,743273567,9780743273560.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,978-0-74327-356-5,match


### Break up authors into separate columns

In [12]:
# Split the comma-separated authors string into one column per author,
# indexed by the unique book_id. Each name is stripped so that authors after
# the first do not keep the whitespace that follows the comma; cells with no
# author are filled with "-".
author_df = (
    book_df.set_index('book_id')['authors']
    .str.split(',', expand=True)
    .apply(lambda col: col.str.strip())
    .fillna('-')
)

# Name the columns "Author 1", "Author 2", ...
author_df.columns = [f'Author {i}' for i in range(1, len(author_df.columns) + 1)]

# Move book_id back into a regular column so it can be used as the merge key.
author_df = author_df.reset_index()

# Attach the per-author columns to the full book data.
author_separated_df = pd.merge(author_df, book_df, on='book_id')
author_separated_df.head()

Unnamed: 0,book_id,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,Author 7,Author 8,Author 9,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,new_isbn13,title_mismatch
0,1,Suzanne Collins,-,-,-,-,-,-,-,-,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,978-0-43902-348-1,difference
1,2,J.K. Rowling,Mary GrandPré,-,-,-,-,-,-,-,...,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,978-0-43955-493-0,difference
2,3,Stephenie Meyer,-,-,-,-,-,-,-,-,...,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,978-0-31601-584-4,difference
3,4,Harper Lee,-,-,-,-,-,-,-,-,...,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,978-0-06112-008-4,match
4,5,F. Scott Fitzgerald,-,-,-,-,-,-,-,-,...,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,978-0-74327-356-5,match


### Store the publications in one worksheet per century of original publication.

In [13]:
# Write one worksheet per century of original publication, followed by an
# "Other Years" sheet holding every row that falls in none of the ranges.
# Each (start, end) pair is inclusive of start and exclusive of end.
year_groups = [(1700, 1800), (1800, 1900), (1900, 2000), (2000, 2100)]

publication_year = author_separated_df['original_publication_year']

# Rows not yet written to a century sheet. Tracking them in a mask, instead
# of overwriting author_separated_df inside the loop, leaves the merged frame
# intact so this cell can be re-run and later cells still see every book.
unassigned = pd.Series(True, index=author_separated_df.index)

with pd.ExcelWriter('book_data_by_year.xlsx') as new_file:
    books_written = 0
    for start_year, end_year in year_groups:
        # The end year is exclusive, so the sheet is labelled e.g. "1700-1799".
        new_sheet_name = f'{start_year}-{end_year - 1}'

        in_century = publication_year.isin(range(start_year, end_year))
        subset = author_separated_df[in_century]
        subset.to_excel(new_file, sheet_name=new_sheet_name)

        # Report the size of this sheet and the running total.
        print(len(subset))
        books_written += subset.shape[0]
        print(books_written)

        unassigned &= ~in_century

    other_years = author_separated_df[unassigned]
    other_years.to_excel(new_file, sheet_name='Other Years')
    books_written += other_years.shape[0]

print(f'Books Written to file: {books_written}')

26
26
249
275
3365
3640
5654
9294
Books Written to file: 9407


In [14]:
# Number of rows in book_df, for comparison with the total written to the workbook.
len(book_df)

9407