In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('GoodReads_100k_books.csv')

#print shape
print(df.shape)

# Display the head of the dataframe
display(df.head())

(100000, 13)


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [None]:
import requests

# on this dataset, we have the following columns
# auther, bookformat, desc, genre, img, isbn, isbn13, link, pages, rating, reviews, title, total_ratings

# now we will use the isbn13 to get the book details from the google books api
# we will use the google books api to get the following details
# price, publishedDate
api_key = 'AIzaSyDZffRwzFSo0HPtdu2BMbICboKH4Y6yY84'
# delcare a variable to count the number of missed requests, make public so that we can access it from the functions
missed_count = 0
google_missed_count = 0
open_library_missed_count = 0

def get_book_details_from_google(isbn13):
    global google_missed_count
    
    url = f'https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn13}&key={api_key}'
    
    response = requests.get(url)
    data = response.json()
    
    if 'items' in data and len(data['items']) > 0:
        book_info = data['items'][0]['volumeInfo']
        
        title = book_info.get('title', 'N/A')
        published_date = book_info.get('publishedDate', 'N/A')
        price = 'N/A'  # Price information is not directly available from Google Books API
        google_books_link = book_info.get('selfLink', 'N/A')

        print(title)
        return price, published_date, google_books_link
    else:
        print('No google data found for ISBN:' + str(isbn13) + " ; Trying Open Library API")
        google_missed_count += 1
        if (missed_count % 10) == 0 and missed_count > 0:
            print('Google Missed Count Now: ' + str(missed_count))
        return get_book_details(isbn13)

def get_book_details(isbn13):
    global missed_count
    global open_library_missed_count
    
    url = f'https://openlibrary.org/api/books?bibkeys=ISBN:{isbn13}&jscmd=data&format=json'
    response = requests.get(url)
    # Check if the response is successful, by checking if valid data is returned    
    data = response.json()
    if f'ISBN:{isbn13}' in data:
        book = data[f'ISBN:{isbn13}']
        if 'title' in book:
            print(book['title'])
        if 'publish_date' in book:
            publishedDate = book['publish_date']
        else:
            publishedDate = 'N/A'
        if 'price' in book:
            price = book['price']
        else:
            price = 'N/A'
        if 'url' in book:
            openBooksLink = book['url']
        else:
            openBooksLink = 'N/A'
        return price, publishedDate, openBooksLink
    else:
        print('No data found for ISBN:', isbn13)
        missed_count += 1
        open_library_missed_count += 1
        if (missed_count % 10) == 0:
            print('Missed Count Now: ' + str(missed_count))

    return 'N/A', 'N/A', 'N/A'

# Apply the function to the dataframe
df[['price', 'publishedDate', 'marketLink']] = df['isbn'].apply(lambda x: pd.Series(get_book_details_from_google(x)))

print('Google Missed Count: ' + str(google_missed_count))
print('Open Library Missed Count: ' + str(open_library_missed_count))
print('Total Missed Count: ' + str(missed_count))

display(df.head())

No google data found for ISBN:002914180X ; Trying Open Library API
Between two fires
Fashion Sourcebook 1920s
No google data found for ISBN:948984147 ; Trying Open Library API
No data found for ISBN: 948984147
No google data found for ISBN:814327079 ; Trying Open Library API
No data found for ISBN: 814327079
Les oiseaux gourmands
No google data found for ISBN:875848419 ; Trying Open Library API
No data found for ISBN: 875848419
Competitive Advantage Through People
No google data found for ISBN:393009076 ; Trying Open Library API
No data found for ISBN: 393009076
No google data found for ISBN:752456830 ; Trying Open Library API
No data found for ISBN: 752456830
Genuine Happiness
No google data found for ISBN:791407209 ; Trying Open Library API
No data found for ISBN: 791407209
Anthropological Studies of Religion
Anarchism and Ecology
No google data found for ISBN:143037013 ; Trying Open Library API
No data found for ISBN: 143037013
No google data found for ISBN:961328983 ; Trying Open L

In [None]:
# save the dataframe 'df' to a new csv file
# the new csv file should be at 'data/expanded_books.csv'
# the 'data' folder is one directory above the current directory
df.to_csv('../data/expanded_books.csv', index=False)