# Goodbooks Exercise
### Load the Data

In [1]:
import pandas as pd

path = 'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv'
book_df = pd.read_csv(path)


### Check the Data for import problems

In [2]:
import string

# Here I am making sure that there are no null values in the column I will be checking
book_df['original_title'] = book_df.original_title.fillna('No Title')

# Here I am checking to see if a title has characters that are not in the english alphabet
# I am checking the string with a regex expression for the check
book_df[(~book_df.original_title.str.contains("|".join(string.ascii_letters)))]

# Based on the result of this check, I don't need to do anything special on data import

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
78,79,1381,1381,3356006,1703,143039954,9.780143e+12,"Homer, Robert Fagles, E.V. Rieu, Frédéric Mugl...",-720.0,Ὀδύσσεια,...,670326,710757,8101,29703,65629,183082,224120,208223,https://images.gr-assets.com/books/1390173285m...,https://images.gr-assets.com/books/1390173285s...
171,172,15823480,15823480,2507928,1492,345803922,9.780346e+12,"Leo Tolstoy, Louise Maude, Leo Tolstoj, Aylmer...",1877.0,Анна Каренина,...,297472,472796,18064,11738,26945,88365,158179,187569,https://images.gr-assets.com/books/1352422904m...,https://images.gr-assets.com/books/1352422904s...
176,177,7144,7144,3393917,1714,143058142,9.780143e+12,"Fyodor Dostoyevsky, David McDuff",1866.0,Преступление и наказание,...,380903,444675,12605,9477,20078,64050,137104,213966,https://images.gr-assets.com/books/1382846449m...,https://images.gr-assets.com/books/1382846449s...
294,295,10644930,10644930,15553789,145,1451627289,9.781452e+12,Stephen King,2011.0,11/22/63,...,258464,303057,30656,2927,7649,35500,109392,147589,https://images.gr-assets.com/books/1327876792m...,https://images.gr-assets.com/books/1327876792s...
340,341,1371,1371,3293141,1726,140275363,9.780140e+12,"Homer, Robert Fagles, Frédéric Mugler, Bernard...",-750.0,Ἰλιάς,...,241088,273565,4763,7701,20845,68844,89384,86791,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9883,9884,97390,97390,93879,32,5080039159,9.785080e+12,"Aleksandr Griboyedov, Александр Сергеевич Гриб...",1825.0,Горе от ума,...,9287,9690,36,159,553,1674,2791,4513,https://images.gr-assets.com/books/1342993233m...,https://images.gr-assets.com/books/1342993233s...
9905,9906,1087207,1087207,2902197,16,1421501929,9.781422e+12,Bisco Hatori,1999.0,桜蘭高校ホスト部 4,...,11280,11392,137,53,191,1264,2942,6942,https://images.gr-assets.com/books/1416398307m...,https://images.gr-assets.com/books/1416398307s...
9947,9948,263173,263173,255111,19,1421511304,9.781422e+12,"Matsuri Hino, Tomo Kimura",2005.0,ヴァンパイア騎士 2,...,13455,13704,300,178,489,2244,3606,7187,https://images.gr-assets.com/books/1329259095m...,https://images.gr-assets.com/books/1329259095s...
9966,9967,5295735,5295735,7087982,23,,,"Naguib Mahfouz, نجيب محفوظ",1977.0,ملحمة الحرافيش,...,8572,10079,1216,139,297,1174,2859,5610,https://images.gr-assets.com/books/1347915841m...,https://images.gr-assets.com/books/1347915841s...


### Decide how to handle the empty values in columns
 - I will be replacing these nulls with a "Not Provided" label because the books could still be useful without that data.

In [3]:
book_df['isbn'] = book_df.isbn.fillna('Not Provided')
book_df['isbn13'] = book_df.isbn13.fillna('Not Provided')
book_df['language_code'] = book_df.language_code.fillna('Not Provided')

### Format ISBN 13 to match specifications

In [4]:
def check_sum(num, end):
    sum_check = 0
    for i, char in enumerate(num[:-end]):
        if i % 2 == 0:
            sum_check += int(char) * 1
        else:
            sum_check += int(char) * 3
    check_digit = str(int(round(sum_check + 5.1, -1) - sum_check))
    if end == 2:
        new_isbn = num[:-2] + check_digit + '0'
    else:
        new_isbn = num[:-1] + check_digit
    return new_isbn

def format_isbn(isbn, isbn13):
    if isbn13 == 'Not Provided':
        return isbn13
    
    isbn = str(isbn)
    isbn13 = str(int(isbn13))
    
    new_isbn = ""
    if isbn[:-1] in isbn13:      
        new_isbn = check_sum(isbn13, 1)
    else:
        if isbn[:-2] in isbn13:
            new_isbn = check_sum(isbn13, 2)
            new_isbn = check_sum(new_isbn, 1)
            
    s = new_isbn
    new_num = f'{s[:3]}-{s[3:4]}-{s[4:9]}-{s[9:12]}-{s[12:]}'
    return new_num
    
book_df['new_isbn13'] = book_df.apply(lambda row: format_isbn(row.isbn, row.isbn13), axis = 1)
book_df[['isbn', 'isbn13', 'new_isbn13']]

Unnamed: 0,isbn,isbn13,new_isbn13
0,439023483,9780439023480.0,978-0-43902-348-1
1,439554934,9780439554930.0,978-0-43955-493-10
2,316015849,9780316015840.0,978-0-31601-584-4
3,61120081,9780061120080.0,978-0-06112-008-4
4,743273567,9780743273560.0,978-0-74327-356-5
...,...,...,...
9995,441019455,9780441019460.0,978-0-44101-943-4
9996,067973371X,9780679733710.0,978-0-67973-371-3
9997,039330762X,9780393307630.0,978-0-39330-763-4
9998,61711527,9780061711530.0,978-0-06171-153-4
