In [4]:
import pandas as pd
import requests
import time
import json
import gzip
import csv
#from concurrent.futures import ThreadPoolExecutor

## Analizing the books at the LFPL libraries

In [15]:
# Read the csv file
# LFPL's collection inventory. Updated on a monthly basis.
lfpl_books = pd.read_csv('https://www.arcgis.com/sharing/rest/content/items/372216992aea4b2cb5b02837d7a48eaf/data')
lfpl_books

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9.780072e+12,2009,Book,Adult Non-Fiction,Main,25.95,06/01/2023 00:00:00
1,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
3,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
1674146,2654219,Emma of 83rd Street,"Harding, Emily",9.781797e+12,2023,Eaudiobook,Electronic,Main,0.00,06/01/2023 00:00:00
1674147,2654219,Emma of 83rd Street,"Harding, Emily",9.781797e+12,2023,Eaudiobook,Electronic,Main,0.00,06/01/2023 00:00:00
1674148,2654219,Emma of 83rd Street,"Harding, Emily",9.781797e+12,2023,Eaudiobook,Electronic,Main,0.00,06/01/2023 00:00:00
1674149,2654220,The Sea Witch,"Robert, Katee",9.781951e+12,2020,Eaudiobook,Electronic,Main,0.00,06/01/2023 00:00:00


In [16]:
# Leaving only Books in the dataframe
lfpl_books = lfpl_books[lfpl_books['ItemType'] == 'Book']
lfpl_books

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9.780072e+12,2009,Book,Adult Non-Fiction,Main,25.95,06/01/2023 00:00:00
1,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
3,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9.780307e+12,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
1674140,2654213,The Saracen's mark,"Perry, SW",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674141,2654214,The hedge fund edge: maximum profit/minimum ri...,"Boucher, Mark",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674142,2654215,"The maid, the man, and the fans: Elivis is the...","Rooks, Nancy; Gutter, Mae",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674143,2654216,"The monstrous-feminine: film, feminism, psycho...","Creed, Barbara",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00


## The plan is to enrich this dataset using the Google API. The ISBN will be used to pull the required information from the API. In order to achieve that, we need to check the non valid values in the ISBN column. let's do some cleaning first

In [17]:
# Check the types
lfpl_books.dtypes

BibNum               int64
Title               object
Author              object
ISBN               float64
PublicationYear      int64
ItemType            object
ItemCollection      object
ItemLocation        object
ItemPrice          float64
ReportDate          object
dtype: object

In [18]:
shape = lfpl_books.shape
print(f"Rows: {shape[0]:,}, Columns: {shape[1]}")

Rows: 1,184,052, Columns: 10


In [5]:
# Finding how many NaN values per column
lfpl_books.isna().sum()

BibNum                 0
Title                  1
Author             65644
ISBN               36527
PublicationYear        0
ItemType               0
ItemCollection       202
ItemLocation           0
ItemPrice              0
ReportDate             0
ObjectId               0
dtype: int64

In [19]:

# Drop rows with missing values in the 'Title' column
lfpl_books = lfpl_books.dropna(subset=['Title'])
lfpl_books.loc[lfpl_books['Title'].isna() ]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate


In [20]:
# Looking for missing author and ISBN
lfpl_books.loc[(lfpl_books['Author'].isna()) & (lfpl_books['ISBN'].isna())]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
17100,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17101,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17102,1375916,Laptop,,,0,Book,Laptop,Main,900.31,06/01/2023 00:00:00
17103,1375916,Laptop,,,0,Book,Laptop,Main,900.31,06/01/2023 00:00:00
17104,1375916,Laptop,,,0,Book,Laptop,Main,900.31,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
1669175,2647142,Little elephant's big heart.,,,2017,Book,Children's Board Book,Jeffersontown,5.99,06/01/2023 00:00:00
1671387,2647603,Why the Dutch are Different,,,0,Book,,Southwest,0.00,06/01/2023 00:00:00
1672187,2646832,The Saga of Gosta Berling,,,2006,Book,Adult DVD,Main,74.19,06/01/2023 00:00:00
1673882,2650013,highlights into the future,,,0,Book,,Crescent Hill,0.00,06/01/2023 00:00:00


Apparently the Laptops are included as books in this dataset.

In [21]:
lfpl_books[lfpl_books['Title'] == 'Laptop']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
17100,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17101,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17102,1375916,Laptop,,,0,Book,Laptop,Main,900.31,06/01/2023 00:00:00
17103,1375916,Laptop,,,0,Book,Laptop,Main,900.31,06/01/2023 00:00:00
17104,1375916,Laptop,,,0,Book,Laptop,Main,900.31,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
17197,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17198,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17199,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00
17200,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,06/01/2023 00:00:00


In [22]:
# Dropping all the laptops from the dataset
lfpl_books = lfpl_books.loc[lfpl_books['Title'] != 'Laptop']
lfpl_books[lfpl_books['Title'] == 'Laptop']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate


The ISBN column has several values NaN. We need this value populated to use the Google API in order to obtain more information about the book.
The easiest solution is to drop all these rows. Before doing that, we will attempt to obtain the ISBN number using the book's title.
The main challenge is that API calls are pretty slow. 
For the records: I tried several methods to speed up the process using multithreading, but that made things worse: the API were returning errors.

## First we will change the data type from float to string on the ISBN

In [23]:
lfpl_books['ISBN'] = lfpl_books['ISBN'].astype(str)
lfpl_books

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878.0,2009,Book,Adult Non-Fiction,Main,25.95,06/01/2023 00:00:00
1,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408.0,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408.0,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
3,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408.0,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408.0,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
1674140,2654213,The Saracen's mark,"Perry, SW",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674141,2654214,The hedge fund edge: maximum profit/minimum ri...,"Boucher, Mark",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674142,2654215,"The maid, the man, and the fans: Elivis is the...","Rooks, Nancy; Gutter, Mae",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674143,2654216,"The monstrous-feminine: film, feminism, psycho...","Creed, Barbara",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00


## There is an extra '.0' on the string. Let's remove it

In [24]:
lfpl_books['ISBN'] = lfpl_books['ISBN'].astype(str).str.rstrip('.0')
lfpl_books

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878,2009,Book,Adult Non-Fiction,Main,25.95,06/01/2023 00:00:00
1,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,06/01/2023 00:00:00
3,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Remote Shelving - Main,19.99,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
1674140,2654213,The Saracen's mark,"Perry, SW",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674141,2654214,The hedge fund edge: maximum profit/minimum ri...,"Boucher, Mark",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674142,2654215,"The maid, the man, and the fans: Elivis is the...","Rooks, Nancy; Gutter, Mae",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00
1674143,2654216,"The monstrous-feminine: film, feminism, psycho...","Creed, Barbara",,0,Book,Interlibrary Loan,Main,0.00,06/01/2023 00:00:00


In [25]:
# ISBN is an string now
lfpl_books.dtypes

BibNum               int64
Title               object
Author              object
ISBN                object
PublicationYear      int64
ItemType            object
ItemCollection      object
ItemLocation        object
ItemPrice          float64
ReportDate          object
dtype: object

In [26]:
# Nothing is NaN after the conversion, it's the string 'nan' instead
lfpl_books[lfpl_books['ISBN'] == 'nan']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate
4968,1348353,Stolen legacy: the Egyptian origins of western...,"James, George G. M",,0,Book,Interlibrary Loan,Main,0.0,06/01/2023 00:00:00
7938,1353427,A symposium in public librarianship; three add...,"University of California, Berkeley. School of ...",,1952,Book,Kentucky History,Main,25.0,06/01/2023 00:00:00
8150,1353740,"WWI : the Great War, 1914-1918","Brown, Hilary (Author of Two-liners for kids)",,2014,Book,Adult Non-Fiction,Main,15.0,06/01/2023 00:00:00
8165,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,06/01/2023 00:00:00
10449,1362079,Act of War,"Brown, Dale",,0,Book,,Bon Air,0.0,06/01/2023 00:00:00
...,...,...,...,...,...,...,...,...,...,...
1674140,2654213,The Saracen's mark,"Perry, SW",,0,Book,Interlibrary Loan,Main,0.0,06/01/2023 00:00:00
1674141,2654214,The hedge fund edge: maximum profit/minimum ri...,"Boucher, Mark",,0,Book,Interlibrary Loan,Main,0.0,06/01/2023 00:00:00
1674142,2654215,"The maid, the man, and the fans: Elivis is the...","Rooks, Nancy; Gutter, Mae",,0,Book,Interlibrary Loan,Main,0.0,06/01/2023 00:00:00
1674143,2654216,"The monstrous-feminine: film, feminism, psycho...","Creed, Barbara",,0,Book,Interlibrary Loan,Main,0.0,06/01/2023 00:00:00


In [39]:
# Changing 'nan' to an empty string
lfpl_books['ISBN'] = lfpl_books['ISBN'].replace('nan', '')

In [40]:
isbn_length = lfpl_books['ISBN'].str.len().value_counts()
isbn_length

ISBN
13    1034288
12     102386
0       35899
11      10083
10       1146
9         142
8           4
7           1
Name: count, dtype: int64

In [77]:
# Open Books API test. Trying to use compression, if possible, to reduce the size of the data transfered over the network
# Exploring all the information available on the API
import requests
import gzip
from io import BytesIO

def get_book_info(book_name):
    base_url = "https://openlibrary.org/search.json"

    response = requests.get(base_url, params={'q': book_name})

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Check if the response is gzip compressed
        if 'Content-Encoding' in response.headers and response.headers['Content-Encoding'] == 'gzip':
            # Parse the JSON response from a gzip-compressed content
            with gzip.GzipFile(fileobj=BytesIO(response.content), mode='rb') as gz:
                data = gz.read().decode('iso-8859-1')
        else:
            # If not compressed, decode as utf-8
            data = response.content.decode('utf-8')

        # Display information from the JSON response
        data = response.json()

        if 'docs' in data and data['docs']:
            first_book = data['docs'][0]

            # Traverse and display information
            for key, value in first_book.items():
                print(f"{key}: {value}")
        else:
            print(f"No information found for the book: {book_name}")
    else:
        print(f"Error: Unable to fetch information for the book {book_name}")

# Example: Get information for the book "The Great Gatsby"
#get_book_info("The Great Gatsby")
get_book_info("Old Creole days")



key: /works/OL2178936W
type: work
seed: ['/books/OL13555446M', '/books/OL18604806M', '/books/OL6971233M', '/books/OL7091166M', '/books/OL6742373M', '/books/OL6979450M', '/books/OL7056325M', '/books/OL6941435M', '/books/OL22333180M', '/books/OL472097M', '/books/OL6971091M', '/books/OL24178541M', '/books/OL13537061M', '/books/OL23747649M', '/books/OL7113470M', '/books/OL7125200M', '/books/OL7036468M', '/books/OL24178424M', '/books/OL22347396M', '/books/OL15038444M', '/books/OL23765223M', '/books/OL6456144M', '/books/OL16249535M', '/books/OL16207376M', '/books/OL4433761M', '/books/OL5444026M', '/books/OL5302549M', '/books/OL3666101M', '/books/OL2204587M', '/books/OL9769056M', '/books/OL9818132M', '/books/OL9817453M', '/books/OL11976116M', '/works/OL2178936W', '/subjects/fiction', '/subjects/creoles_in_fiction', '/subjects/creoles', '/subjects/social_life_and_customs', '/subjects/louisiana_in_fiction', '/subjects/city_and_town_life', '/subjects/history', '/subjects/fiction_general', '/subj

In [48]:
# Modifying the previous script to only return the ISBN number. Handling some exceptions.
import requests
import gzip
from io import BytesIO
import time

def get_book_isbn(book_title, max_retries=3):
    base_url = "https://openlibrary.org/search.json"
    
    for attempt in range(max_retries + 1):
        try:
            # Sending a request to the Open Books API with the book title
            response = requests.get(base_url, params={'q': book_title})

            # Check if the request was successful (status code 200)
            response.raise_for_status()

            # Check if the response is gzip compressed
            if 'Content-Encoding' in response.headers and response.headers['Content-Encoding'] == 'gzip':
                # Parse the JSON response from a gzip-compressed content
                with gzip.GzipFile(fileobj=BytesIO(response.content), mode='rb') as gz:
                    data = gz.read().decode('iso-8859-1')
            else:
                # If not compressed, decode as utf-8
                data = response.content.decode('utf-8')

            # Parse the JSON response
            data = response.json()

            if 'docs' in data and data['docs']:
                first_book = data['docs'][0]

                # Check if ISBN information is available
                if 'isbn' in first_book:
                    # Return the ISBN
                    return first_book['isbn'][0] if first_book['isbn'] else ""
                else:
                    print(f"ISBN information not found for the book: {book_title}")
                    return ""
            else:
                print(f"No information found for the book: {book_title}")
                return ""

        except requests.exceptions.HTTPError as e:
            if attempt < max_retries and e.response.status_code == 429:
                # Retry if the status code is 429 (Too Many Requests)
                retry_after = int(e.response.headers.get('Retry-After', 5))  # Default to 5 seconds if Retry-After is not present
                print(f"Retrying after {retry_after} seconds due to status code 429 (Too Many Requests)...")
                time.sleep(retry_after)
            else:
                print(f"Error: {e}")
                return ""

isbn = get_book_isbn("The Great Gatsby")
print(f"ISBN: {isbn}")



ISBN: 9788484504382


In [49]:
# Function to apply to the DataFrame
def fill_missing_isbn(row):
    if row["ISBN"] == '':
        return get_book_isbn(row["Title"])
    else:
        return row["ISBN"]

In [50]:
# Example DataFrame
data = {
    "Title": ["Cognitive Mapping : Past, Present, and Future","LANKIE MANTITA", "WWI : the Great War, 1914-1918", "the girl in the road", "Sharp Edges", "My First Pet"],
    "ISBN": ["", "", "", "", "",""]
}

df = pd.DataFrame(data)

# Apply the function using lambda only to rows with missing ISBN
df["ISBN"] = df.apply(lambda row: fill_missing_isbn(row), axis=1)

print(df)

No information found for the book: LANKIE MANTITA
                                           Title           ISBN
0  Cognitive Mapping : Past, Present, and Future     1315812282
1                                 LANKIE MANTITA               
2                 WWI : the Great War, 1914-1918     1869792637
3                           the girl in the road  9798791410146
4                                    Sharp Edges  9780671524098
5                                   My First Pet  9781405309035


In [51]:
# Testing with the Open Library Books API - v2

def get_isbn_from_openlibrary(title):
    base_url = "http://openlibrary.org/search.json"
    params = {"title": title}
    response = requests.get(base_url, params, headers={"Accept-Encoding": "gzip"})
    if response.status_code == 429:
        retry_after = int(response.headers.get("Retry-After"))
        print(f"API rate limit exceeded. Retrying after {retry_after} seconds.")
        time.sleep(retry_after)
        return get_isbn_from_openlibrary(title)
    if response.status_code != 200:
        print(f"Error retrieving ISBN for title '{title}': {response.status_code}")
        return None
    content_type = response.headers.get("Content-Type")
    if content_type == "application/gzip":
        with gzip.decompress(response.content) as decompressed_data:
            data = json.loads(decompressed_data.decode("utf-8"))
    else:
        data = json.loads(response.content.decode("utf-8"))
    isbn = data.get("docs")[0].get("isbn")[0] if data.get("docs") and data.get("docs")[0].get("isbn") else None
    return isbn

isbn = get_isbn_from_openlibrary("The Great Gatsby")
print(f"ISBN: {isbn}")

ISBN: 9788484504382


In [70]:
import pandas as pd

# read the original CSV file
df = pd.read_csv('data\openlib_isbn.csv')

# drop rows with missing ISBN values
df = df.dropna(subset=['ISBN'])
df = df.drop_duplicates()

# write the updated DataFrame to a new CSV file
df.to_csv('data\openlib_isbn_1.csv', index=False)


In [79]:
# Using the Open Library API. Saving to a csv because this will require multiple runs: API will block too many calls.
# Check if the CSV file exists
import os

# Check if the CSV file exists
csv_file = 'data\openlib_isbn.csv'
csv_exists = os.path.exists(csv_file)

# If the CSV file doesn't exist, create an empty DataFrame with headers
if not csv_exists:
    isbn_openlib = pd.DataFrame(columns=['Title', 'ISBN'], dtype='str')
    isbn_openlib.to_csv(csv_file, index=False)

# Identify the remaining rows that need processing
remaining_rows = lfpl_books.loc[lfpl_books['ISBN'] == '', 'Title'].drop_duplicates()

# Iterate over remaining rows
for title in remaining_rows:
    # Check if the title is already in isbn_openlib DataFrame
    if title not in isbn_openlib['Title'].values:
        try:
            # Obtain ISBN using the get_book_isbn function
            isbn = get_book_isbn(title)

            # Concatenate the result to the DataFrame
            #if isbn:
                # Concatenate the result to the DataFrame
            new_row = pd.DataFrame({'Title': [title], 'ISBN': [isbn]})
            isbn_openlib = pd.concat([isbn_openlib, new_row], ignore_index=True)
                
            # Append the new row to the CSV file
            new_row.to_csv(csv_file, index=False, header=not csv_exists, mode='a')

            # Update csv_exists flag
            csv_exists = True

        except Exception as e:
            print(f"Error processing title '{title}': {e}")
    # Remove rows with blank ISBNs from the CSV file
#isbn_openlib = pd.read_csv(csv_file)
#isbn_openlib = isbn_openlib[isbn_openlib['ISBN'] != '']
#isbn_openlib.to_csv(csv_file, index=False)        
      #isbn_openlib.drop_duplicates(subset='Title', inplace=True) # Just in case there are some duplicates(it shouldn't)

No information found for the book: 1850 U. S. census, Amelia County, Virginia
No information found for the book: 1850 U. S. census, Oldham County, Kentucky
No information found for the book: 1850 U. S. census, Owen County, Kentucky
No information found for the book: 1850 U. S. census, Owsley County, Kentucky
No information found for the book: 1850 U. S. census, Pendleton County, Kentucky
No information found for the book: 1850 U. S. census, Scott County, Kentucky
No information found for the book: 1850 U. S. census, Simpson County, Kentucky
No information found for the book: 1850 U. S. census, Spencer County, Kentucky
No information found for the book: 1850 U.S. census, Trigg County, Kentucky
ISBN information not found for the book: Three Gloucester plays
No information found for the book: 1850 U. S. census, Boyle County, Kentucky
ISBN information not found for the book: Nelson County Kentucky records
ISBN information not found for the book: The floods of 1913 in the rivers of the Ohio

In [63]:
isbn_openlib

Unnamed: 0,Title,ISBN
0,Stolen legacy: the Egyptian origins of western...,1494861992
1,A symposium in public librarianship; three add...,
2,"WWI : the Great War, 1914-1918",1869792637
3,LANKIE MANTITA,
4,Act of War,0659021676
...,...,...
3205,First season.,9780333118702
3206,A mirror for England; British movies from aust...,0571095038
3207,"The annals and scandals of Henderson County, K...",
3208,Jerusalem,1379313074


In [57]:
# Option 1
# Step 1: Create a new DataFrame with unique book titles and empty ISBNs
unique_titles = lfpl_books.loc[lfpl_books['ISBN'] == '', 'Title'].drop_duplicates().reset_index(drop=True)
isbn_openlib = pd.DataFrame({'Title': unique_titles})
isbn_openlib

Unnamed: 0,Title
0,Stolen legacy: the Egyptian origins of western...
1,A symposium in public librarianship; three add...
2,"WWI : the Great War, 1914-1918"
3,LANKIE MANTITA
4,Act of War
...,...
27980,KOREA the land the people the culture
27981,The Saracen's mark
27982,The hedge fund edge: maximum profit/minimum ri...
27983,"The monstrous-feminine: film, feminism, psycho..."


In [58]:
# Step 2 and 3: Use the get_book_isbn function to obtain ISBNs
isbn_openlib['ISBN'] = isbn_openlib['Title'].apply(get_book_isbn)
isbn_openlib

ISBN information not found for the book: A symposium in public librarianship; three addresses
No information found for the book: LANKIE MANTITA
ISBN information not found for the book: Pieces from the past
ISBN information not found for the book: Double strand deception
ISBN information not found for the book: Murder simply played
ISBN information not found for the book: Secrets of the Amish diary
ISBN information not found for the book: Proper tension
ISBN information not found for the book: A dangerous tide
ISBN information not found for the book: Plain deception
ISBN information not found for the book: Celtic chains
ISBN information not found for the book: Simple lies
ISBN information not found for the book: Double cross crochet
ISBN information not found for the book: Where hope dwells
ISBN information not found for the book: A ring of deception
ISBN information not found for the book: Humble pies and white lies
ISBN information not found for the book: Law & old order
ISBN informat

ConnectTimeout: HTTPSConnectionPool(host='openlibrary.org', port=443): Max retries exceeded with url: /search.json?q=Parish+registers+of+Horton+Kirbie%2C+Co.+Kent (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001AAF520A750>, 'Connection to openlibrary.org timed out. (connect timeout=None)'))

In [59]:
isbn_openlib

Unnamed: 0,Title
0,Stolen legacy: the Egyptian origins of western...
1,A symposium in public librarianship; three add...
2,"WWI : the Great War, 1914-1918"
3,LANKIE MANTITA
4,Act of War
...,...
27980,KOREA the land the people the culture
27981,The Saracen's mark
27982,The hedge fund edge: maximum profit/minimum ri...
27983,"The monstrous-feminine: film, feminism, psycho..."


In [None]:



# Step 4: Remove duplicated book titles
#isbn_openlib = isbn_openlib.drop_duplicates(subset='Title').reset_index(drop=True)

# Step 5: Save the resulting DataFrame to a CSV file
isbn_openlib.to_csv('data\openlib_isbn.csv', index=False)

# Display the resulting DataFrame
isbn_openlib

In [52]:
# Creating a copy of the dataframe with the title columns
# Removing duplicates titles to reduce the number of API calls
isbn_openlib = lfpl_books[lfpl_books["ISBN"]==""][["Title"]].drop_duplicates()
#isbn_openlib = lfpl_books.loc[lfpl_books['ISBN'] == '', 'Title'].drop_duplicates().reset_index(drop=True)

shape_c = isbn_openlib.shape
print(f"Rows: {shape_c[0]:,}, Columns: {shape_c[1]}")
isbn_openlib["ISBN"] = isbn_openlib.apply(lambda row: fill_missing_isbn(row), axis=1)

Rows: 27,985, Columns: 1


In [71]:
temp_df['ISBN'].isna().value_counts()

ISBN
True     21245
False     7082
Name: count, dtype: int64

In [64]:
temp_df[temp_df['ISBN'].notna()]

Unnamed: 0,Title,ISBN
5931,My First Pet,9354401066
6383,O little town of Sugarcreek,1961125498
6405,Act of War,9780451466198
6417,The Christmas riddle,9798586294586
6434,Castle of whispers,9781609451929
...,...,...
1181089,The New Mencken Letters,9780803713796
1181093,Eye of the Beholder,0671523066
1181101,Sharp Edges,9780671524098
1181291,the girl in the road,0804138842


In [81]:
# We found 7082 ISBN's using this API
# Merge the result with the original dataframe using the title column
# Merge the two dataframes using the title column
merged_df = lfpl_books.merge(temp_df, on='Title', how='left')
merged_df

Unnamed: 0,BibNum,Title,Author,ISBN_x,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId,ISBN_y
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878,2009,Book,Adult Non-Fiction,Main,25.95,2022/07/01 04:00:00+00,1,
1,1341361,McGraw-Hill's American idioms dictionary,"Spears, Richard A.",9780071478939,2007,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,2,
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,3,
3,1341362,McGraw-Hill's essential ESL grammar : a handbo...,"Lester, Mark.",9780071496421,2008,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,4,
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,5,
...,...,...,...,...,...,...,...,...,...,...,...,...
1181239,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Main,10.02,2022/07/01 04:00:00+00,1657551,
1181240,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Newburg,10.02,2022/07/01 04:00:00+00,1657552,
1181241,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,South Central,10.02,2022/07/01 04:00:00+00,1657553,
1181242,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Western,10.02,2022/07/01 04:00:00+00,1657554,


In [84]:
lfpl_books[lfpl_books['ISBN']=='nan']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
4429,1348353,Stolen legacy: the Egyptian origins of western...,"James, George G. M",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,5055
5336,1353427,A symposium in public librarianship; three add...,"University of California, Berkeley. School of ...",,1952,Book,Kentucky History,Main,25.0,2022/07/01 04:00:00+00,7761
5510,1353740,"WWI : the Great War, 1914-1918","Brown, Hilary (Author of Two-liners for kids)",,2014,Book,Adult Non-Fiction,Main,15.0,2022/07/01 04:00:00+00,8010
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
...,...,...,...,...,...,...,...,...,...,...,...
1181091,2528085,"Cognitive Mapping : Past, Present, and Future","Kitchin, Rob",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657144
1181093,2528087,Eye of the Beholder,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657147
1181101,2528088,Sharp Edges,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657176
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [96]:
lfpl_books_1 = lfpl_books.merge(temp_df, on='Title', how='left', suffixes=('', '_temp'))


In [97]:
lfpl_books_1[lfpl_books_1['ISBN']=='nan'].value_counts()

BibNum   Title                                             Author                                         ISBN  PublicationYear  ItemType  ItemCollection     ItemLocation            ItemPrice  ReportDate              ObjectId  ISBN_temp    
2313     Roughing it                                       Twain, Mark, 1835-1910.                        nan   1992             Book      Adult Non-Fiction  Jeffersontown           22.95      2022/07/01 04:00:00+00  82652     9798465049788    1
476060   The Orangeburg massacre                           Nelson, Jack, 1929 Oct. 11-                    nan   1970             Book      Adult Reference    Western                 55.00      2022/07/01 04:00:00+00  344282    9780865541207    1
476294   The Negro in the American Revolution              Aptheker, Herbert, 1915-                       nan   1940             Book      Adult Non-Fiction  Remote Shelving - Main  25.00      2022/07/01 04:00:00+00  344549    9780807838334    1
                     

In [98]:

# Update the missing values in the isbn column of lfpl_books using the values from temp_df
lfpl_books_1['ISBN'] = lfpl_books_1.apply(lambda row: row['ISBN_temp'] if row['ISBN']=='nan' else row['ISBN'], axis=1)
lfpl_books_1.drop('ISBN_temp', axis=1, inplace=True)


In [104]:
# 'nan' are now None
lfpl_books_1['ISBN'].isna().value_counts()


ISBN
False    1154047
True       27197
Name: count, dtype: int64

In [106]:
# Still 27197 'nan' values
lfpl_books_1.to_csv('data\lfpl_books_1.csv',index=False)

In [74]:
lfpl_books_1

Unnamed: 0,BibNum,Title,Author,ISBN_x,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId,ISBN
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878,2009,Book,Adult Non-Fiction,Main,25.95,2022/07/01 04:00:00+00,1,9780071605878
1,1341361,McGraw-Hill's American idioms dictionary,"Spears, Richard A.",9780071478939,2007,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,2,9780071478939
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,3,9780307451408
3,1341362,McGraw-Hill's essential ESL grammar : a handbo...,"Lester, Mark.",9780071496421,2008,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,4,9780071496421
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,5,9780307451408
...,...,...,...,...,...,...,...,...,...,...,...,...
1181239,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Main,10.02,2022/07/01 04:00:00+00,1657551,9780399544354
1181240,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Newburg,10.02,2022/07/01 04:00:00+00,1657552,9781787737372
1181241,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,South Central,10.02,2022/07/01 04:00:00+00,1657553,9780063055254
1181242,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Western,10.02,2022/07/01 04:00:00+00,1657554,9781974712533


In [107]:
lfpl_books_1['ISBN'].isna().value_counts()

ISBN
False    1154047
True       27197
Name: count, dtype: int64

## Google API. It should be noted that the API respond with errors in too many calls are generated from the same IP address

In [29]:
# I used this method on the first phase but it was returning the response 429(too many requests). It had to be modified.
import requests

def find_isbn_by_book_name(book_name):
    # Define the API endpoint to search for ISBN using book names
    api_url = f"https://www.googleapis.com/books/v1/volumes?q={book_name}"
    # Using "intitle" find less ISBN numbers on the test
    #api_url = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{book_name}"
    try:
        response = requests.get(api_url)
        if response.status_code == 200:
            data = response.json()
            book_response = data.get('totalItems', '')
            if book_response in (0, ""):
                return ""
            else:
                isbn = data.get('items','')[0].get('volumeInfo','').get('industryIdentifiers','')[0].get('identifier','')
                #return isbn if len(isbn) in {10, 13} else ""
                if len(isbn) in (10, 13): # Only acceptable ISBN length
                    return isbn
                else:
                    return ""
        
    except Exception as e:
        return ""
        #print(f"API request error: {str(e)}")

    return ''



In [21]:
# Function to apply to the DataFrame
def fill_missing_isbn(row):
    if row["ISBN"] == "nan":
        return find_isbn_by_book_name(row["Title"])
    else:
        return row["ISBN"]

In [28]:
# Example DataFrame
data = {
    "Title": ["Cognitive Mapping : Past, Present, and Future","LANKIE MANTITA", "WWI : the Great War, 1914-1918", "the girl in the road", "Sharp Edges", "My First Pet"],
    "ISBN": ["nan", "nan", "nan", "nan", "nan","nan"]
}

df = pd.DataFrame(data)



# Apply the function using lambda only to rows with missing ISBN
df["ISBN"] = df.apply(lambda row: fill_missing_isbn(row), axis=1)

print(df)

                                           Title           ISBN
0  Cognitive Mapping : Past, Present, and Future  9781317798071
1                                 LANKIE MANTITA               
2                 WWI : the Great War, 1914-1918               
3                           the girl in the road  9780804138864
4                                    Sharp Edges  9781439120118
5                                   My First Pet  9781465452955


The Google API appears to have more information than the previous one. 
The returned will be accepted if it has 10(ISBN-10) or 13(ISBN-13) characters.


In [23]:
# Experimenting with timing

print(f"My First Pet - ISBN: {find_isbn_by_book_name('My First Pet')}")

My First Pet - ISBN: 1620315491


In [None]:
# The following line of code might take a lot to process

#lfpl_books["ISBN"] = lfpl_books.apply(lambda row: fill_missing_isbn(row), axis=1)

## Trying to use concurrent features

In [207]:
isbn_length

ISBN
13    1031252
12     102249
3       36430
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

In [120]:
# Creating a copy of the dataframe with only the isbn and title columns
missing_isbn_df = lfpl_books_1[lfpl_books_1["ISBN"].isna()][["Title"]].drop_duplicates()
shape_c = missing_isbn_df.shape
print(f"Rows: {shape_c[0]:,}, Columns: {shape_c[1]}")

Rows: 21,245, Columns: 1


In [119]:
missing_isbn_df

Unnamed: 0,Title
4429,Stolen legacy: the Egyptian origins of western...
5336,A symposium in public librarianship; three add...
5510,"WWI : the Great War, 1914-1918"
5511,LANKIE MANTITA
6414,Pieces from the past
...,...
1180968,Never Surrender : A Soldier's Journey to the C...
1180971,The Secret of the Universe = Mysterium Cosmogr...
1180976,Mathematics : a Very Short Introduction
1180988,ILL KDLA BOOK KIT - Hidden Places


We saved a few thousands API calls removing the duplicates

In [110]:
# Trying to select the appropiate number of cores, depending on the CPU
import multiprocessing

def get_cpu_count():
  return multiprocessing.cpu_count()

def select_max_workers(cpu_count):
  if cpu_count <= 4:
    return cpu_count
  else:
    return int(cpu_count * 0.75)

In [111]:
print(select_max_workers(get_cpu_count()))

18


In [131]:
import time

def get_isbn_from_api_with_retry(book_title, max_retries=3):
    
    #base_url = "https://www.googleapis.com/books/v1/volumes"
    #params = {"q": f"intitle:{book_title}", "maxResults": 1}

    api_url = f"https://www.googleapis.com/books/v1/volumes?q={book_title}"

    for retry in range(max_retries):
        try:
            #response = requests.get(api_url, params=params)
            response = requests.get(api_url)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            data = response.json()

            # Check if any items were found
            if "items" in data and data["items"]:
                volume_info = data["items"][0]["volumeInfo"]
                isbn = volume_info.get("industryIdentifiers", [])

                # Validate ISBN length
                for identifier in isbn:
                    if identifier["type"] == "ISBN_10" and len(identifier["identifier"]) == 10:
                        return identifier["identifier"]
                    elif identifier["type"] == "ISBN_13" and len(identifier["identifier"]) == 13:
                        return identifier["identifier"]

            return ""
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                # Extract the 'Retry-After' header value
                retry_after = int(e.response.headers.get("Retry-After", 1))
                print(f"Rate limit exceeded. Retrying in {retry_after} seconds...")
                time.sleep(retry_after)
                return ""
            else:
                print(f"Error in API request: {e}")
                return ""
        except requests.exceptions.RequestException as e:
            print(f"Error in API request: {e}")
            return ""

    print(f"Exceeded maximum retries for {book_title}. Skipping...")
    return ""


In [121]:
def get_isbn(book_title, csvfile):
    isbn = get_isbn_from_api_with_retry(book_title)

    if isbn:
        with open(csvfile, "a", encoding="utf-8", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([isbn, book_title])
            print(f'Saving: {book_title}')
    return isbn

In [132]:
from concurrent.futures import ThreadPoolExecutor

cpu_count = get_cpu_count()
max_workers = select_max_workers(cpu_count)

with open('data/lfpl_books_2', "a", encoding="utf-8", newline="") as csvfile:
            
    with ThreadPoolExecutor( max_workers ) as executor:
        #missing_isbn_df["ISBN"] = list(executor.map(fill_missing_isbn, missing_isbn_df.itertuples(index=False)))
        missing_isbn_df["ISBN"] = list(executor.map(get_isbn_from_api_with_retry, missing_isbn_df['Title']))
        writer = csv.writer(csvfile)
        writer.writerow(missing_isbn_df['ISBN'], missing_isbn_df['Title'])
        print(f"Saving: {missing_isbn_df['Title']}")


Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1

KeyboardInterrupt: 

In [128]:
missing_isbn_df

Unnamed: 0,Title
4429,Stolen legacy: the Egyptian origins of western...
5336,A symposium in public librarianship; three add...
5510,"WWI : the Great War, 1914-1918"
5511,LANKIE MANTITA
6414,Pieces from the past
...,...
1180968,Never Surrender : A Soldier's Journey to the C...
1180971,The Secret of the Universe = Mysterium Cosmogr...
1180976,Mathematics : a Very Short Introduction
1180988,ILL KDLA BOOK KIT - Hidden Places


In [171]:
# Merge the result with the original dataframe using the title column
lfpl_books = pd.merge(lfpl_books, missing_isbn_df, on="Title", how="left", suffixes=("_old", ""))

#lfpl_books["ISBN"] = lfpl_books["ISBN_new"].combine_first(lfpl_books["ISBN"]).astype(str)  # Fill missing ISBN in the original dataframe

# Drop temporary columns
#lfpl_books = lfpl_books.drop(columns=["ISBN_new"])
lfpl_books = lfpl_books.drop(columns=["ISBN_old"])

In [1]:

lfpl_books['ISBN'].str.len().value_counts()

NameError: name 'lfpl_books' is not defined

47 minutes and 55 seconds in a cpu with 16 cores, 24 logical procesors

In [50]:
lfpl_books['ISBN'].str.len().value_counts()

ISBN
13    1031252
12     102249
3       36527
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

In [51]:
def summ_length(strings):
  """
  Summarizes a list of strings by length.

  Args:
    strings: A list of strings.

  Returns:
    A dictionary that maps string lengths to the number of strings of that length.
  """

  summary = {}

  for string in strings:
    length = len(string)

    if length not in summary:
      summary[length] = 0

    summary[length] += 1

  return summary

strings = ['hello', 'world', 'this', 'is', 'a', 'test']
summary = summ_length(results)

for length, count in summary.items():
  print(f"There are {count} strings with length {length}")

There are 1032881 strings with length 13
There are 102384 strings with length 12
There are 10036 strings with length 11
There are 1864 strings with length 10
There are 228 strings with length 15
There are 31565 strings with length 0
There are 89 strings with length 22
There are 863 strings with length 14
There are 152 strings with length 16
There are 633 strings with length 18
There are 172 strings with length 19
There are 266 strings with length 23
There are 46 strings with length 20
There are 17 strings with length 17
There are 140 strings with length 9
There are 4 strings with length 8
There are 1 strings with length 7


In [54]:
new_isbn = pd.DataFrame(results, columns=['ISBN'])
new_isbn['ISBN'].str.len().value_counts()

ISBN
13    1032881
12     102384
0       31565
11      10036
10       1864
14        863
18        633
23        266
15        228
19        172
16        152
9         140
22         89
20         46
17         17
8           4
7           1
Name: count, dtype: int64

In [55]:
lfpl_books['ISBN'].str.len().value_counts()

ISBN
13    1031252
12     102249
3       36527
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

In [58]:
new_isbn[new_isbn['ISBN'].str.len() == 17]

Unnamed: 0,ISBN
60695,EAN:8596547062158
96189,EAN:4064066431020
102906,EAN:4064066397913
245348,SRLF:AX0000165175
252219,EAN:8596547026808
254818,EAN:8596547408710
259217,EAN:8596547378266
261460,EAN:8596547192480
262910,EAN:8596547403005
264185,SRLF:AA0002591691


In [56]:
new_isbn.shape

(1181341, 1)

In [57]:
lfpl_books.shape

(1181341, 12)

In [49]:
type(results)

list

## Let's check the results

In [62]:
lfpl_books['ISBN'].isna().value_counts()

ISBN
False    1181341
Name: count, dtype: int64

In [72]:
lfpl_books[lfpl_books['ISBN'] == 'nan']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
4429,1348353,Stolen legacy: the Egyptian origins of western...,"James, George G. M",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,5055
5336,1353427,A symposium in public librarianship; three add...,"University of California, Berkeley. School of ...",,1952,Book,Kentucky History,Main,25.0,2022/07/01 04:00:00+00,7761
5510,1353740,"WWI : the Great War, 1914-1918","Brown, Hilary (Author of Two-liners for kids)",,2014,Book,Adult Non-Fiction,Main,15.0,2022/07/01 04:00:00+00,8010
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
...,...,...,...,...,...,...,...,...,...,...,...
1181091,2528085,"Cognitive Mapping : Past, Present, and Future","Kitchin, Rob",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657144
1181093,2528087,Eye of the Beholder,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657147
1181101,2528088,Sharp Edges,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657176
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [64]:
# Finding books with no title
lfpl_books[lfpl_books['Title'].isna()]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
198351,2239375,,,,0,Book,,Crescent Hill,0.0,2022/07/01 04:00:00+00,243966


In [67]:
lfpl_books[lfpl_books['ItemCollection'].isna()]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
6405,1362079,Act of War,"Brown, Dale",,0,Book,,Bon Air,0.0,2022/07/01 04:00:00+00,10502
9061,2410700,FOUR,VERONICA ROTH,,0,Book,,Crescent Hill,0.0,2022/07/01 04:00:00+00,16849
12678,2481658,designing your work life,bill burnett,,0,Book,,St Matthews,0.0,2022/07/01 04:00:00+00,22166
...,...,...,...,...,...,...,...,...,...,...,...
1180968,2528221,The Khipu and the Final Key,Dee Garretson,,0,Book,,Northeast,0.0,2022/07/01 04:00:00+00,1656928
1180972,2528222,MOMMY'S DISEASE,CAROLYN HANNAN,,0,Book,,Southwest,0.0,2022/07/01 04:00:00+00,1656934
1181047,2527377,use,,,0,Book,,Northeast,0.0,2022/07/01 04:00:00+00,1657039
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [6]:
lfpl_books['ItemCollection'].unique()

array(['Adult Non-Fiction', 'ELL Collection', 'Adult Fiction', 'Mystery',
       "Children's Picture Book", 'Science Fiction', 'Older Teen Fiction',
       'Younger Teen  Fiction', 'Adult Paperback', "Children's Fiction",
       'Western', "Children's Picture Paperback", "Children's Paperback",
       'International Collection', 'Teen Non-Fiction',
       "Children's Non-Fiction", 'Kentucky History', 'Natural Resources',
       'Oversize', 'Holiday', 'Urban Fiction', 'Bestsellers',
       "Children's Board Book", 'Storytime Collection',
       'Preschool  Picture Book', "Children's Easy Reader",
       'Adult Reference', 'Interlibrary Loan', nan,
       'Adult Paperbacks Tall', "Children's Easy Reader Paperback",
       'Caldecott/Newbery', 'Laptop', 'Younger Teen  Paperback',
       'Government Documents', 'Large Print', 'Telereference',
       "Children's Non-Fiction Paperback", 'Big Book',
       "Children's Reference", 'Older Teen Paperback', 'Teen Reference',
       'College Shop'

In [7]:
lfpl_books['ItemCollection'].value_counts()

ItemCollection
Adult Non-Fiction                   370372
Adult Fiction                       173472
Children's Non-Fiction               86723
Mystery                              60152
Children's Picture Book              58609
Preschool  Picture Book              50564
Children's Fiction                   47829
Adult Paperback                      45733
Children's Paperback                 44649
Teen Non-Fiction                     24045
Children's Easy Reader               24037
Older Teen Fiction                   23476
Children's Board Book                20180
Younger Teen  Fiction                17227
Kentucky History                     16766
Science Fiction                      15714
Children's Easy Reader Paperback     15566
Holiday                              15482
International Collection             15439
Adult Reference                      11233
Children's Picture Paperback          9638
Urban Fiction                         7556
Caldecott/Newbery                     6