In [1]:
import requests
import openpyxl
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
import pandas as pd
import numpy as np
from pymarc import MARCReader

In [2]:
def modify_isbn(isbn):
    isbn = str(isbn)
    # Check if the first 3 characters are neither '978' nor '979', and the length is 10
    if not (isbn[:3] == '978' or isbn[:3] == '979'):
        if len(isbn) == 10:
            return '978' + isbn  # Prepend '978' if conditions are met
    return isbn[:13]  # Return the first 13 characters of ISBN as-is if conditions are not met

In [5]:
# /Users/rwebb/Desktop/ReMo/CRWReportJob148737.xlsx

In [2]:
# Handle excel (.xlsx) files, scraping ISBNs --> Learn to convert other files
# Use API to fill data points
# Fill gaps with excel

In [3]:
filename = input('Enter File Name: ')
print('Selected:', filename)

if filename.endswith('.mrc'):
    file = open(filename, 'r')
elif filename.endswith('.xml'):
    file = open(filename, 'r')
elif filename.endswith('.xlsx'):
    file = pd.read_excel(filename)
    # print(file.head())  # Uncomment if you want to inspect the first few rows
    isbn_values = []
    
    # Ensure 'ISBN' column exists
    if 'ISBN' in file.columns:
        for isbn in file['ISBN']:
            if pd.isna(isbn):
                continue  # Skip if ISBN is NaN
            isbn = str(isbn)  # Ensure it's a string
            isbn = isbn.replace('-', '')  # Remove dashes
            # Check if the ISBN starts with '978' or '979'
            if not (isbn[:3] == '978' or isbn[:3] == '979'):
                if len(isbn) == 10:
                    isbn = '978' + isbn
                else:    
                    isbn_values.append('BAD DATA')
                continue  # Skip the current invalid ISBN
            isbn = isbn[:13]  # Ensure it's the correct length (13 digits)

            isbn_values.append(isbn)
        print(isbn_values)
    else:
        print("No 'ISBN' column found in the Excel file.")
else:
    print("Unsupported file format.")
df = file

Selected: CRWReportJob148737.xlsx
['9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007439126', '9780007511242', '9780007511242', '9780007511242', '9780007511242', '9780007511242', '9780007511242', '9780007511242', '9780007511242', '9780008183684', '9780008183684', '9780008183684', '9780008183684', '9780008183684', '9780008183684', '9780008256944', '9780008256944', 'BAD DATA', 'BAD DATA', 'BAD DATA', '9780060012786', '9780060012786', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', 'BAD DATA', '97800605

In [4]:
nan_df = df[df['ISBN'].isna()]
isbn_df = df[~df['ISBN'].isna()]

In [5]:
isbn_df.loc[:, 'ISBN'] = isbn_df['ISBN'].str.replace('-', '', regex=False)
isbn_df.loc[:, 'ISBN'] = isbn_df['ISBN'].apply(modify_isbn)

In [6]:
broken_isbn_df = isbn_df[~isbn_df['ISBN'].str[:3].isin(['978', '979'])]
whole_isbn_df = isbn_df[isbn_df['ISBN'].str[:3].isin(['978', '979'])]

broken_isbn_df

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject
947,0020345542 (p,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.
952,0020518609 (p,Scribner classics,The short stories of Ernest Hemingway,,"Hemingway, Ernest, 1899-1961.",1986.0,"Collier Books,",Book,,Short stories.
980,0030705061 (p,,Paper flight : 48 models ready for take-off,,"Botermans, Jack.",1984.0,"Holt,",Book,,Paper airplanes.
1218,0060248742 (l,,Leaving home : stories,,selected by Hazel Rochman and Darlene Z. McCam...,1997.0,"HarperCollins Publishers,",Book,,Literature -- Collections.
1219,0060248742 (l,,Leaving home : stories,,selected by Hazel Rochman and Darlene Z. McCam...,1997.0,"HarperCollins Publishers,",Book,,Young adult literature.
...,...,...,...,...,...,...,...,...,...,...
76180,1887178228 (a,,A world lost,,"Berry, Wendell, 1934-",1996.0,Counterpoint ;,Book,,Kentucky -- Fiction.
76181,1887178228 (a,,A world lost,,"Berry, Wendell, 1934-",1996.0,Counterpoint ;,Book,,Uncles -- Fiction.
79642,3822829668 (p,Basic art series.,Futurism,,"Martin, Sylvia, 1964-",2005.0,"Taschen,",Book,,Futurism (Art)
79646,3822852961 (p,,Renaissance,,"Wundram, Manfred.",2006.0,"Taschen,",Book,,"Painting, Renaissance."


In [7]:
broken_isbn_df.loc[:,'ISBN'] = broken_isbn_df['ISBN'].apply(lambda x: '978' + str(x)[:10]) # Is 978 the right choice here??
broken_isbn_df

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject
947,9780020345542,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.
952,9780020518609,Scribner classics,The short stories of Ernest Hemingway,,"Hemingway, Ernest, 1899-1961.",1986.0,"Collier Books,",Book,,Short stories.
980,9780030705061,,Paper flight : 48 models ready for take-off,,"Botermans, Jack.",1984.0,"Holt,",Book,,Paper airplanes.
1218,9780060248742,,Leaving home : stories,,selected by Hazel Rochman and Darlene Z. McCam...,1997.0,"HarperCollins Publishers,",Book,,Literature -- Collections.
1219,9780060248742,,Leaving home : stories,,selected by Hazel Rochman and Darlene Z. McCam...,1997.0,"HarperCollins Publishers,",Book,,Young adult literature.
...,...,...,...,...,...,...,...,...,...,...
76180,9781887178228,,A world lost,,"Berry, Wendell, 1934-",1996.0,Counterpoint ;,Book,,Kentucky -- Fiction.
76181,9781887178228,,A world lost,,"Berry, Wendell, 1934-",1996.0,Counterpoint ;,Book,,Uncles -- Fiction.
79642,9783822829668,Basic art series.,Futurism,,"Martin, Sylvia, 1964-",2005.0,"Taschen,",Book,,Futurism (Art)
79646,9783822852961,,Renaissance,,"Wundram, Manfred.",2006.0,"Taschen,",Book,,"Painting, Renaissance."


In [152]:
isbn_df = pd.concat([broken_isbn_df, whole_isbn_df])
isbn_df

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject
947,9780020345542,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.
952,9780020518609,Scribner classics,The short stories of Ernest Hemingway,,"Hemingway, Ernest, 1899-1961.",1986.0,"Collier Books,",Book,,Short stories.
980,9780030705061,,Paper flight : 48 models ready for take-off,,"Botermans, Jack.",1984.0,"Holt,",Book,,Paper airplanes.
1218,9780060248742,,Leaving home : stories,,selected by Hazel Rochman and Darlene Z. McCam...,1997.0,"HarperCollins Publishers,",Book,,Literature -- Collections.
1219,9780060248742,,Leaving home : stories,,selected by Hazel Rochman and Darlene Z. McCam...,1997.0,"HarperCollins Publishers,",Book,,Young adult literature.
...,...,...,...,...,...,...,...,...,...,...
79802,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,Families -- Fiction.
79803,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,\tFeminists -- Fiction.
79804,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,Friendship -- Women -- Fiction.
79805,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,Thriller fiction.


In [None]:
# df = df.astype(str)
# df

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject
0,,,The 60s reader,,"Haskins, James.",1987.0,"Viking Kestrel,",Book,,United States -- History.
1,,,200 years in Union : a pictorial history of Un...,,"Nash, Chester",1974.0,"Union Historical Society,",Book,,Union (Me.) -- History -- Pictoral works.
2,,,"1929, the year of the great crash",,"Klingaman, William K.",1989.0,"Harper & Row,",Book,,Depressions -- 1929 -- United States.
3,,,"1929, the year of the great crash",,"Klingaman, William K.",1989.0,"Harper & Row,",Book,,Economic history -- 1918-1945.
4,,,"1929, the year of the great crash",,"Klingaman, William K.",1989.0,"Harper & Row,",Book,,United States -- Economic conditions -- 1918-1...
...,...,...,...,...,...,...,...,...,...,...
79802,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,Families -- Fiction.
79803,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,\tFeminists -- Fiction.
79804,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,Friendship -- Women -- Fiction.
79805,9798985282801,,Women are the fiercest creatures,,"Dunlop, Andrea.",2023.0,"Zibby Books,",Book,,Thriller fiction.


In [154]:
isbn_df = isbn_df.astype(str)

In [155]:
# Drop duplicates based on relevant columns while keeping the first occurrence
isbn_df_unique = isbn_df.drop_duplicates(subset=['ISBN', 'Series Title', 'Title/Subtitle', 'ISSN', 
                                                  'Author', 'Publication Year', 'Publisher', 
                                                  'Material Type', 'Lexile'], keep='first')

# Group by 'ISBN' and combine the 'Subject' values
isbn_df_unique = isbn_df_unique.groupby('ISBN').agg({
    'Series Title': 'first',  # Keep the first value (they are the same for each ISBN)
    'Title/Subtitle': 'first',  # Same as above
    'ISSN': 'first',  # Same as above
    'Author': 'first',  # Same as above
    'Publication Year': 'first',  # Same as above
    'Publisher': 'first',  # Same as above
    'Material Type': 'first',  # Same as above
    'Lexile': 'first',  # Same as above
    'Subject': ', '.join  # Combine all unique 'Subject' values
}).reset_index()

isbn_df_unique

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject
0,9780007439126,Martin Beck series,The Man Who Went Up in Smoke,,"Sjöwall, Maj.",2016.0,"Fourth Estate,",Book,,"Beck, Martin (Fictitious character)"
1,9780007511242,,It's about love,,"Camden, Steven.",2015.0,"HarperCollins Children's Books,",Book,,Families
2,9780008183684,,Scotland the best 100 places,,"Irvine, Peter, 1948-",2017.0,"Collins,",Book,,Nature photography.
3,9780008256944,,Lateral thinking puzzlers,,Paul Sloane.,2016.0,"Puzzlewright Press, an imprint of Sterling Pub...",Book,,Lateral thinking.
4,9780020198817,A Scribner classic,The great Gatsby,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",1992.0,"Collier Books,",Book,1070L,
...,...,...,...,...,...,...,...,...,...,...
11286,9789380741222,Campfire heroes,Gandhi : my life is my message,,"Quinn, Jason.",2013.0,"Campifre/Kalyani Navyug Media PVT Ltd.,",Book,,"Cartoons and comics., Cartoons and comics."
11287,9789629549992,,The great Gatsby,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",2012.0,"Findaway World,",Sound,,Audiobooks.
11288,9798639873775,,The fifth vital,,"Majlak, Mike, 1985-",2020.0,"Mike Majlak,",Book,,Autobiographies.
11289,9798663066952,,Schitt's Creek Fun Facts : 80 fun facts and ev...,,"Ortiz, Celestina.",2020.0,"Independently Published,",Book,,American culture.


In [157]:
def add_attributes(df):
    category_list = ['Title/Subtitle', 'Author', 'Copyright Date', 'Summary/Synopsis', 
                    'Series Name/Position', 'Genre', 'Form', 'Format', 'ISBN', 
                    'Page Count', 'Type']
    for category in category_list:
        if category not in df.columns:
            df.loc[category] = np.nan

add_attributes(isbn_df_unique)
add_attributes(nan_df)
isbn_df_unique

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject,Copyright Date,Summary/Synopsis,Series Name/Position,Genre,Form,Format,Page Count,Type
0,9780007439126,Martin Beck series,The Man Who Went Up in Smoke,,"Sjöwall, Maj.",2016.0,"Fourth Estate,",Book,,"Beck, Martin (Fictitious character)",,,,,,,,
1,9780007511242,,It's about love,,"Camden, Steven.",2015.0,"HarperCollins Children's Books,",Book,,Families,,,,,,,,
2,9780008183684,,Scotland the best 100 places,,"Irvine, Peter, 1948-",2017.0,"Collins,",Book,,Nature photography.,,,,,,,,
3,9780008256944,,Lateral thinking puzzlers,,Paul Sloane.,2016.0,"Puzzlewright Press, an imprint of Sterling Pub...",Book,,Lateral thinking.,,,,,,,,
4,9780020198817,A Scribner classic,The great Gatsby,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",1992.0,"Collier Books,",Book,1070L,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11286,9789380741222,Campfire heroes,Gandhi : my life is my message,,"Quinn, Jason.",2013.0,"Campifre/Kalyani Navyug Media PVT Ltd.,",Book,,"Cartoons and comics., Cartoons and comics.",,,,,,,,
11287,9789629549992,,The great Gatsby,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",2012.0,"Findaway World,",Sound,,Audiobooks.,,,,,,,,
11288,9798639873775,,The fifth vital,,"Majlak, Mike, 1985-",2020.0,"Mike Majlak,",Book,,Autobiographies.,,,,,,,,
11289,9798663066952,,Schitt's Creek Fun Facts : 80 fun facts and ev...,,"Ortiz, Celestina.",2020.0,"Independently Published,",Book,,American culture.,,,,,,,,


In [158]:
short_df = isbn_df_unique.head(10)
short_df

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject,Copyright Date,Summary/Synopsis,Series Name/Position,Genre,Form,Format,Page Count,Type
0,9780007439126,Martin Beck series,The Man Who Went Up in Smoke,,"Sjöwall, Maj.",2016.0,"Fourth Estate,",Book,,"Beck, Martin (Fictitious character)",,,,,,,,
1,9780007511242,,It's about love,,"Camden, Steven.",2015.0,"HarperCollins Children's Books,",Book,,Families,,,,,,,,
2,9780008183684,,Scotland the best 100 places,,"Irvine, Peter, 1948-",2017.0,"Collins,",Book,,Nature photography.,,,,,,,,
3,9780008256944,,Lateral thinking puzzlers,,Paul Sloane.,2016.0,"Puzzlewright Press, an imprint of Sterling Pub...",Book,,Lateral thinking.,,,,,,,,
4,9780020198817,A Scribner classic,The great Gatsby,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",1992.0,"Collier Books,",Book,1070L,,,,,,,,,
5,9780020199201,Collier books,This side of paradise,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",1986.0,"Macmillan,",Book,1070L,,,,,,,,,
6,9780020264801,,Ethan Frome,,"Wharton, Edith, 1862-1937.",1987.0,"Collier Books,",Book,1160L,New England -- Fiction,,,,,,,,
7,978002034550X,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.,,,,,,,,
8,9780020345542,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.,,,,,,,,
9,9780020449313,,The yearling,,"Rawlings, Marjorie Kinnan, 1896-1953.",1988.0,"Aladdin Paperbacks,",Book,750L,Deer -- Fiction.,,,,,,,,


In [160]:
for isbn in short_df['ISBN']:
    if pd.isna(isbn):
        continue  # Skip if ISBN is NaN
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    response = requests.get(url)
    if response.status_code == 200: # Confirm request worked/is available
        book_data = response.json()
        if 'items' in book_data:
        # Extract book details from the API response
            book_info = book_data['items'][0]['volumeInfo']
            # Missing:
            # Series name and position
            # Form
            # Format
            book_details = {
                'ISBN#': isbn,
                'Title': book_info.get('title', np.nan),
                'Subtitle': np.nan, # Not available
                'Authors': ', '.join(book_info.get('authors', ['N/A'])),
                'Publisher': book_info.get('publisher', np.nan),
                'PublishedDate': book_info.get('publishedDate', np.nan),
                'CopyrightDate': np.nan, # Not available
                'Summary': book_info.get('description', 'No description available'), # NEED
                'Genre': ', '.join(book_info.get('categories', ['N/A'])), # NEED
                'PageCount': book_info.get('pageCount', np.nan), # NEED
                'Type': book_info.get('printType', np.nan), # Need
                'Categories': book_info.get('Categories', np.nan)
                # Type of book - ficiton, nonfiction, blended
            }
            # Accessing the correct row using ISBN in the DataFrame
            row_index = short_df[short_df['ISBN'] == isbn].index
            
            if not row_index.empty:  # Check if a matching ISBN was found
                row_index = row_index[0]  # Get the first (and expected only) match

                # Only update if the current value is NaN
                if pd.isna(short_df.loc[row_index, 'Title/Subtitle']):
                    short_df.loc[row_index, 'Title/Subtitle'] = book_details['Title']
                if pd.isna(short_df.loc[row_index, 'Author']):
                    short_df.loc[row_index, 'Author'] = book_details['Authors']
                if pd.isna(short_df.loc[row_index, 'Publication Year']):
                    short_df.loc[row_index, 'Publication Year'] = book_details['PublishedDate']
                if pd.isna(short_df.loc[row_index, 'Publisher']):
                    short_df.loc[row_index, 'Publisher'] = book_details['Publisher']
                if pd.isna(short_df.loc[row_index, 'Material Type']):
                    short_df.loc[row_index, 'Material Type'] = book_details['Type']
                if pd.isna(short_df.loc[row_index, 'Subject']):
                    short_df.loc[row_index, 'Subject'] = book_details['Genre']
                if pd.isna(short_df.loc[row_index, 'Summary/Synopsis']):
                    short_df.loc[row_index, 'Summary/Synopsis'] = book_details['Summary']
                if pd.isna(short_df.loc[row_index, 'Page Count']):
                    short_df.loc[row_index, 'Page Count'] = book_details['PageCount']
                # # ISBN Can't be NaN for this data scraping
                # if short_df['Title/Subtitle'][isbn] == np.nan:
                #     short_df['Title/Subtitle'][isbn] = book_details['Title']
                # if short_df['Author'][isbn] == np.nan:
                #     short_df['Author'][isbn] = book_details['Authors']
                # if short_df['Publication Year'][isbn] == np.nan:
                #     short_df['Publication Year'][isbn] = book_details['PublishedDate']
                # if short_df['Publisher'][isbn] == np.nan:
                #     short_df['Publisher'][isbn] = book_details['Publisher']
                # if short_df['Material Type'][isbn] == np.nan:
                #     short_df['Material Type'][isbn] = book_details['Type']
                # if short_df['Subject'][isbn] == np.nan:
                #     short_df['Subject'][isbn] = book_details['Genre']
                # if short_df['Summary/Synopsis'][isbn] == np.nan:
                #     short_df['Summary/Synopsis'][isbn] = book_details['Summary']
                # if short_df['Page Count'][isbn] == np.nan:
                #     short_df['Page Count'][isbn] = book_details['PageCount']

  short_df.loc[row_index, 'Summary/Synopsis'] = book_details['Summary']


In [161]:
short_df

Unnamed: 0,ISBN,Series Title,Title/Subtitle,ISSN,Author,Publication Year,Publisher,Material Type,Lexile,Subject,Copyright Date,Summary/Synopsis,Series Name/Position,Genre,Form,Format,Page Count,Type
0,9780007439126,Martin Beck series,The Man Who Went Up in Smoke,,"Sjöwall, Maj.",2016.0,"Fourth Estate,",Book,,"Beck, Martin (Fictitious character)",,Detective Martin Beck has just begun his holid...,,,,,0.0,
1,9780007511242,,It's about love,,"Camden, Steven.",2015.0,"HarperCollins Children's Books,",Book,,Families,,"Despite coming from different backgrounds, asp...",,,,,0.0,
2,9780008183684,,Scotland the best 100 places,,"Irvine, Peter, 1948-",2017.0,"Collins,",Book,,Nature photography.,,"Peter Irvine, bestselling author of Scotland t...",,,,,0.0,
3,9780008256944,,Lateral thinking puzzlers,,Paul Sloane.,2016.0,"Puzzlewright Press, an imprint of Sterling Pub...",Book,,Lateral thinking.,,"The astonishing, beautiful new story for all r...",,,,,400.0,
4,9780020198817,A Scribner classic,The great Gatsby,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",1992.0,"Collier Books,",Book,1070L,,,"Jay Gatsby had once loved beautiful, spoiled D...",,,,,248.0,
5,9780020199201,Collier books,This side of paradise,,"Fitzgerald, F. Scott (Francis Scott), 1896-1940.",1986.0,"Macmillan,",Book,1070L,,,Here is the accomplished first novel that cata...,,,,,294.0,
6,9780020264801,,Ethan Frome,,"Wharton, Edith, 1862-1937.",1987.0,"Collier Books,",Book,1160L,New England -- Fiction,,"Ethan Frome, a poor, downtrodden New England f...",,,,,164.0,
7,978002034550X,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.,,,,,,,,
8,9780020345542,,Great battles of the Civil War,,"Macdonald, John, 1945-",1992.0,"Collier/Macmillan,",Book,,United States -- History -- 1861-1865.,,17 of the Greatest Battles of the Civil War Co...,,,,,0.0,
9,9780020449313,,The yearling,,"Rawlings, Marjorie Kinnan, 1896-1953.",1988.0,"Aladdin Paperbacks,",Book,750L,Deer -- Fiction.,,,,,,,,


In [None]:
isbn = isbn_values[0]
url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"

response = requests.get(url)
if response.status_code == 200: # Confirm request worked/is available
    book_data = response.json()
    if 'items' in book_data:
        # Extract book details from the API response
        book_info = book_data['items'][0]['volumeInfo']
        # Missing:
        # Series name and position
        # Form
        # Format
        book_details = {
            'ISBN#': isbn,
            'Title': book_info.get('title', np.nan),
            'Subtitle': np.nan,
            'Authors': ', '.join(book_info.get('authors', ['N/A'])),
            'Publisher': book_info.get('publisher', np.nan),
            'PublishedDate': book_info.get('publishedDate', np.nan),
            'CopyrightDate': np.nan,
            'Summary': book_info.get('description', 'No description available'), # NEED
            'Genre': ', '.join(book_info.get('categories', ['N/A'])), # NEED
            'PageCount': book_info.get('pageCount', np.nan) # NEED
            # Type of book - ficiton, nonfiction, blended
        }

# Book title
# title = book_data['items'][0]['volumeInfo']['title']
# print(f"Title: {title}")

In [6]:
book_data

{'kind': 'books#volumes',
 'totalItems': 1,
 'items': [{'kind': 'books#volume',
   'id': 'eZrztgAACAAJ',
   'etag': '6a8Dfo+UKmk',
   'selfLink': 'https://www.googleapis.com/books/v1/volumes/eZrztgAACAAJ',
   'volumeInfo': {'title': 'The Man who Went Up in Smoke',
    'authors': ['Maj Sjöwall', 'Per Wahlöö'],
    'publisher': 'A Martin Beck Novel',
    'publishedDate': '2011',
    'description': 'Detective Martin Beck has just begun his holiday on a small island off the coast of Sweden. But when a neighbour gets a phone call, Beck finds himself packed off to Budapest, where a boorish journalist has vanished without a trace.',
    'industryIdentifiers': [{'type': 'ISBN_10', 'identifier': '0007439121'},
     {'type': 'ISBN_13', 'identifier': '9780007439126'}],
    'readingModes': {'text': False, 'image': False},
    'pageCount': 0,
    'printType': 'BOOK',
    'categories': ['Beck, Martin (Fictitious character)'],
    'maturityRating': 'NOT_MATURE',
    'allowAnonLogging': False,
    'co

In [7]:
import requests
import pandas as pd
import time

unique_isbn_list = list(unique_isbn_values)

# List to store book data
book_data_list = []

# Loop over each ISBN and fetch data from Google Books API
for isbn in unique_isbn_list[:10]:
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    try:
        # Make the GET request
        response = requests.get(url)
        
        # Check if the response is successful (status code 200)
        if response.status_code == 200:
            book_data = response.json()

            # Check if 'items' is in the response (this indicates that data is available)
            if 'items' in book_data:
                # Extract book details from the API response
                book_info = book_data['items'][0]['volumeInfo']
                # Missing:
                    # Series name and position
                    # Form
                    # Format
                book_details = {
                    'ISBN#': isbn,
                    'Title': book_info.get('title', np.nan),
                    'Subtitle': np.nan,
                    'Authors': ', '.join(book_info.get('authors', ['N/A'])),
                    'Publisher': book_info.get('publisher', np.nan),
                    'PublishedDate': book_info.get('publishedDate', np.nan),
                    'CopyrightDate': np.nan,
                    'Summary': book_info.get('description', 'No description available'), # NEED
                    'Genre': ', '.join(book_info.get('categories', ['N/A'])), # NEED
                    'PageCount': book_info.get('pageCount', np.nan) # NEED
                    # Type of book - ficiton, nonfiction, blended
                }
                book_data_list.append(book_details)
            else:
                # Handle the case where no data is found for the ISBN
                print(f"No data found for ISBN {isbn}")
                book_details = {
                    'ISBN#': isbn,
                    'Title': np.nan,
                    'Subtitle': np.nan,
                    'Subtitle': np.nan,
                    'Authors': np.nan,
                    'Publisher': np.nan,
                    'PublishedDate': np.nan,
                    'CopyrightDate': np.nan,
                    'Summary': np.nan, # NEED
                    'Genre': np.nan, # NEED
                    'PageCount': np.nan # NEED
                    # Type of book - ficiton, nonfiction, blended
                }
                book_data_list.append(book_details)
        else:
            # If the response status code isn't 200, print an error message
            print(f"Error fetching data for ISBN {isbn}, Status Code: {response.status_code}")
    
    except requests.exceptions.RequestException as e:
        # Handle other exceptions such as network issues or timeout
        print(f"Error with request for ISBN {isbn}: {e}")
    
    # Sleep to prevent hitting API rate limits too quickly (optional but recommended for large datasets)
    time.sleep(1)

# Convert the list of book data to a DataFrame for easier manipulation/inspection
book_data_df = pd.DataFrame(book_data_list)

# Optionally, save the data to an Excel file
book_data_df.to_excel('book_data.xlsx', index=False)

# Print the DataFrame or inspect the first few rows
book_data_df

No data found for ISBN 9781421540764


Unnamed: 0,ISBN#,Title,Subtitle,Authors,Publisher,PublishedDate,CopyrightDate,Summary,Genre,PageCount
0,9780525564201,"My Sister, the Serial Killer",,Oyinkan Braithwaite,Anchor,2019-07-30,,ONE OF TIME MAGAZINE'S 100 BEST MYSTERY AND TH...,Fiction,241.0
1,9781534474529,How to Change Everything,,Naomi Klein,Atheneum Books for Young Readers,2021-02-23,,“[A] uniquely inclusive perspective that will ...,Juvenile Nonfiction,336.0
2,9781338670905,Forged by Reading,,"Kylene Beers, Robert Probst",Scholastic Professional,2020-12,,Bestselling authors Beers and Probst explore w...,,208.0
3,9781662515125,The Spy Coast,,Tess Gerritsen,Thomas & Mercer,2023-10-17,,A retired CIA operative in small-town Maine ta...,Fiction,0.0
4,9781421527734,Brave Story,,Miyuki Miyabe,Haikasoru,2009-11-17,,"Wataru Mitani has to grow up fast, and not jus...",Juvenile Fiction,0.0
5,9780062394415,She Rides Shotgun,,Jordan Harper,Ecco,2018-03-13,,*WINNER OF A 2018 EDGAR AWARD FOR BEST DEBUT N...,Fiction,272.0
6,9780593466322,Shrines of Gaiety,,Kate Atkinson,Anchor,2023-05-02,,NEW YORK TIMES BESTSELLER • The award-winning ...,Fiction,449.0
7,9781421540764,,,,,,,,,
8,9780399171611,The Wrath & the Dawn,,Renée Ahdieh,Penguin,2015-05-12,,#1 New York Times Bestseller A sumptuous and e...,Young Adult Fiction,418.0
9,9781250153296,Untitled Flatiron,,Untitled Flatiron,Flatiron Books,2019-11-05,,No description available,Biography & Autobiography,320.0


In [4]:
file

NameError: name 'file' is not defined

In [12]:
url = f"http://openlibrary.org/api/volumes/brief/isbn/0596156715.json"
# Make the GET request
response = requests.get(url)
response

#         # Check if 'items' is in the response (this indicates that data is available)
#         if 'items' in book_data:
#             # Extract book details from the API response
#             book_info = book_data['items'][0]['volumeInfo']
#             # Missing:
#                 # Series name and position
#                 # Form
#                 # Format
#             book_details = {
#                 'ISBN': isbn,
#                 'title': book_info.get('title', 'N/A'),
#                 'authors': ', '.join(book_info.get('authors', ['N/A'])),
#                 'publisher': book_info.get('publisher', 'N/A'),
#                 'publishedDate': book_info.get('publishedDate', 'N/A'),
#                 'Summary': book_info.get('description', 'No description available'), # NEED
#                 'Genre': ', '.join(book_info.get('categories', ['N/A'])), # NEED
#                 'pageCount': book_info.get('pageCount', 'N/A'),
#             }
#             book_data_list.append(book_details)
#         else:
#             # Handle the case where no data is found for the ISBN
#             print(f"No data found for ISBN {isbn}")
#     else:
#         # If the response status code isn't 200, print an error message
#     print(f"Error fetching data for ISBN {isbn}, Status Code: {response.status_code}")
    
# except requests.exceptions.RequestException as e:
#     # Handle other exceptions such as network issues or timeout
#     print(f"Error with request for ISBN {isbn}: {e}")
    
#     # Sleep to prevent hitting API rate limits too quickly (optional but recommended for large datasets)
#     time.sleep(1)

# # Convert the list of book data to a DataFrame for easier manipulation/inspection
# book_data_df = pd.DataFrame(book_data_list)

# # Optionally, save the data to an Excel file
# book_data_df.to_excel('book_data.xlsx', index=False)

# # Print the DataFrame or inspect the first few rows
# book_data_df.head()

<Response [200]>