In [13]:
import pandas as pd
import requests
import time
import json
import gzip
import csv
from concurrent.futures import ThreadPoolExecutor

## Analizing the books at the LFPL libraries

In [15]:
# Read the csv file
lfpl_books = pd.read_csv('data\LFPL_Books.csv')

## The plan is to enrich this dataset using the Google API. The ISBN will be used to pull the required information from the API. In order to achieve that, we need to check the non valid values in the ISBN column. let's do some cleaning first

In [3]:
# Check the types
lfpl_books.dtypes

BibNum               int64
Title               object
Author              object
ISBN               float64
PublicationYear      int64
ItemType            object
ItemCollection      object
ItemLocation        object
ItemPrice          float64
ReportDate          object
ObjectId             int64
dtype: object

In [4]:
shape = lfpl_books.shape
print(f"Rows: {shape[0]:,}, Columns: {shape[1]}")

Rows: 1,181,341, Columns: 11


In [5]:
# Finding how many NaN values per column
lfpl_books.isna().sum()

BibNum                 0
Title                  1
Author             65644
ISBN               36527
PublicationYear        0
ItemType               0
ItemCollection       202
ItemLocation           0
ItemPrice              0
ReportDate             0
ObjectId               0
dtype: int64

In [6]:

lfpl_books.loc[lfpl_books['Title'].isna() ]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
198351,2239375,,,,0,Book,,Crescent Hill,0.0,2022/07/01 04:00:00+00,243966


There is no Title, or ISBN in this row. Dropping it.

In [16]:
lfpl_books.drop(198351, inplace=True)
lfpl_books.loc[lfpl_books['Title'].isna() ]


Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId


The row with the empty title is gone now. Looking at the ones with missing author and ISBN

In [17]:
lfpl_books.loc[(lfpl_books['Author'].isna()) & (lfpl_books['ISBN'].isna())]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
5931,2511351,My First Pet,,,0,Book,,Newburg,0.00,2022/07/01 04:00:00+00,8588
9402,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17216
9403,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,2022/07/01 04:00:00+00,17217
9446,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17260
9451,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17265
...,...,...,...,...,...,...,...,...,...,...,...
1180912,2514077,Hidden history of the Kovno Ghetto : teacher g...,,,1998,Book,Government Documents,Main,0.00,2022/07/01 04:00:00+00,1656785
1180913,2514078,A Necessary fence-- : the Senate's first century.,,,1989,Book,Government Documents,Main,0.00,2022/07/01 04:00:00+00,1656786
1181047,2527377,use,,,0,Book,,Northeast,0.00,2022/07/01 04:00:00+00,1657039
1181089,2528084,The New Mencken Letters,,,0,Book,Interlibrary Loan,Main,0.00,2022/07/01 04:00:00+00,1657141


Apparently the Laptops are included as books in this dataset.

In [18]:
lfpl_books[lfpl_books['Title'] == 'Laptop']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
9402,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17216
9403,1375916,Laptop,,,0,Book,Laptop,South Central,1077.00,2022/07/01 04:00:00+00,17217
9446,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17260
9451,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17265
9457,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17271
...,...,...,...,...,...,...,...,...,...,...,...
9684,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17501
9685,1375916,Laptop,,,0,Book,Laptop,St Matthews,900.31,2022/07/01 04:00:00+00,17502
9686,1375916,Laptop,,,0,Book,Laptop,Northeast,900.31,2022/07/01 04:00:00+00,17503
9687,1375916,Laptop,,,0,Book,Laptop,Northeast,900.31,2022/07/01 04:00:00+00,17504


In [19]:
lfpl_books.drop(lfpl_books[lfpl_books['Title'] == 'Laptop'].index, inplace=True)
lfpl_books[lfpl_books['Title'] == 'Laptop']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId


Laptops were dropped.

The ISBN column has 36527 values NaN. We need this value populated to use the Google API in order to obtain more information about the book.
The easiest solution is to drop all these rows. Before doing that, we will attempt to obtain the ISBN number using the book's title.
The main challenge is that API calls are pretty slow.

## First we will change the data type from float to string on the ISBN

In [20]:
lfpl_books['ISBN'] = lfpl_books['ISBN'].astype(str)
lfpl_books

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878.0,2009,Book,Adult Non-Fiction,Main,25.95,2022/07/01 04:00:00+00,1
1,1341361,McGraw-Hill's American idioms dictionary,"Spears, Richard A.",9780071478939.0,2007,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,2
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408.0,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,3
3,1341362,McGraw-Hill's essential ESL grammar : a handbo...,"Lester, Mark.",9780071496421.0,2008,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,4
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408.0,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,5
...,...,...,...,...,...,...,...,...,...,...,...
1181336,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056.0,2022,Book,Adult Non-Fiction,Main,10.02,2022/07/01 04:00:00+00,1657551
1181337,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056.0,2022,Book,Adult Non-Fiction,Newburg,10.02,2022/07/01 04:00:00+00,1657552
1181338,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056.0,2022,Book,Adult Non-Fiction,South Central,10.02,2022/07/01 04:00:00+00,1657553
1181339,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056.0,2022,Book,Adult Non-Fiction,Western,10.02,2022/07/01 04:00:00+00,1657554


## There is an extra '.0' on the string. Let's remove it

In [21]:
lfpl_books['ISBN'] = lfpl_books['ISBN'].astype(str).str.rstrip('.0')
lfpl_books

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878,2009,Book,Adult Non-Fiction,Main,25.95,2022/07/01 04:00:00+00,1
1,1341361,McGraw-Hill's American idioms dictionary,"Spears, Richard A.",9780071478939,2007,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,2
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,3
3,1341362,McGraw-Hill's essential ESL grammar : a handbo...,"Lester, Mark.",9780071496421,2008,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,4
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,5
...,...,...,...,...,...,...,...,...,...,...,...
1181336,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Main,10.02,2022/07/01 04:00:00+00,1657551
1181337,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Newburg,10.02,2022/07/01 04:00:00+00,1657552
1181338,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,South Central,10.02,2022/07/01 04:00:00+00,1657553
1181339,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Western,10.02,2022/07/01 04:00:00+00,1657554


In [13]:
# ISBN is an string now
lfpl_books.dtypes

BibNum               int64
Title               object
Author              object
ISBN                object
PublicationYear      int64
ItemType            object
ItemCollection      object
ItemLocation        object
ItemPrice          float64
ReportDate          object
ObjectId             int64
dtype: object

In [22]:
# Nothing is NaN after the conversion, it's the string 'nan' instead
lfpl_books[lfpl_books['ISBN'] == 'nan']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
4429,1348353,Stolen legacy: the Egyptian origins of western...,"James, George G. M",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,5055
5336,1353427,A symposium in public librarianship; three add...,"University of California, Berkeley. School of ...",,1952,Book,Kentucky History,Main,25.0,2022/07/01 04:00:00+00,7761
5510,1353740,"WWI : the Great War, 1914-1918","Brown, Hilary (Author of Two-liners for kids)",,2014,Book,Adult Non-Fiction,Main,15.0,2022/07/01 04:00:00+00,8010
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
...,...,...,...,...,...,...,...,...,...,...,...
1181091,2528085,"Cognitive Mapping : Past, Present, and Future","Kitchin, Rob",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657144
1181093,2528087,Eye of the Beholder,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657147
1181101,2528088,Sharp Edges,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657176
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [23]:
isbn_length = lfpl_books['ISBN'].str.len().value_counts()
isbn_length

ISBN
13    1031252
12     102249
3       36430
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

## Starting with len == 3

## We'll try to fill the missing ISBN information using the open library API

In [7]:
# Testing with the Open Library Books API - v1
import pandas as pd
import requests

def get_isbn_from_api(book_title):
    base_url = "http://openlibrary.org/search.json"
    params = {"title": book_title}
    
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        data = response.json()
        # Check if any documents were found
        if data.get("docs"):
            # Assuming the first result is the most relevant
            isbn = data["docs"][0].get("isbn")[3]            
            return str(isbn)
        else:
            return ""
    except requests.exceptions.RequestException as e:
        #print(f"Error in API request: {e}")
        return ""


In [9]:
# Function to apply to the DataFrame
def fill_missing_isbn(row):
    if row["ISBN"] == "nan":
        return get_isbn_from_api(row["Title"])
    else:
        return row["ISBN"]

In [10]:
# Example DataFrame
data = {
    "Title": ["Cognitive Mapping : Past, Present, and Future","LANKIE MANTITA", "WWI : the Great War, 1914-1918", "the girl in the road", "Sharp Edges", "My First Pet"],
    "ISBN": ["nan", "nan", "nan", "nan", "nan","nan"]
}

df = pd.DataFrame(data)

# Apply the function using lambda only to rows with missing ISBN
df["ISBN"] = df.apply(lambda row: fill_missing_isbn(row), axis=1)

print(df)

                                           Title           ISBN
0  Cognitive Mapping : Past, Present, and Future               
1                                 LANKIE MANTITA               
2                 WWI : the Great War, 1914-1918               
3                           the girl in the road  9780349004372
4                                    Sharp Edges     1568955499
5                                   My First Pet  9789354400391


In [24]:
# Testing with the Open Library Books API - v2

def get_isbn_from_openlibrary(title):
    base_url = "http://openlibrary.org/search.json"
    params = {"title": title}
    response = requests.get(base_url, params, headers={"Accept-Encoding": "gzip"})
    if response.status_code == 429:
        retry_after = int(response.headers.get("Retry-After"))
        print(f"API rate limit exceeded. Retrying after {retry_after} seconds.")
        time.sleep(retry_after)
        return get_isbn_from_openlibrary(title)
    if response.status_code != 200:
        print(f"Error retrieving ISBN for title '{title}': {response.status_code}")
        return None
    content_type = response.headers.get("Content-Type")
    if content_type == "application/gzip":
        with gzip.decompress(response.content) as decompressed_data:
            data = json.loads(decompressed_data.decode("utf-8"))
    else:
        data = json.loads(response.content.decode("utf-8"))
    isbn = data.get("docs")[0].get("isbn")[0] if data.get("docs") and data.get("docs")[0].get("isbn") else None
    return isbn


In [114]:
def populate_isbn(book_title, csvfile):
    isbn = get_isbn_from_openlibrary(book_title)

    if isbn:
        with open(csvfile, "a", encoding="utf-8", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([isbn, book_title])
            print(f'Saving: {book_title}')
    return isbn

In [57]:
# Creating a copy of the dataframe with the title columns
temp_df = lfpl_books[lfpl_books["ISBN"]=="nan"][["Title"]].drop_duplicates()
shape_c = temp_df.shape
print(f"Rows: {shape_c[0]:,}, Columns: {shape_c[1]}")

Rows: 28,327, Columns: 1


In [58]:

# Optimize the code for speed by using ThreadPoolExecutor to increase the overall speed of the process
with ThreadPoolExecutor(max_workers=10) as executor:
  temp_df["ISBN"] = list(executor.map(populate_isbn, [temp_df['Title'], "data\lfpl_books_isbn.csv"]))

# Update the lfpl_books dataframe with the populated ISBNs
#lfpl_books.update(pd.DataFrame(results, columns=temp_df.columns))
temp_df

Saving: O little town of Sugarcreek
Saving: The Christmas riddle
Saving: Castle of whispers
Saving: My First Pet
Saving: Centaur aisle
Saving: Simple lies
Saving: A season of secrets
Saving: The tenant of Wildfell Hall
Saving: Deceptive hearts
Saving: Stocks on the Move
Saving: Act of War
Saving: The Book of Liz
Saving: designing your work life
Saving: The secret letter
Saving: FUNNY FARM
Saving: Bing ya
Saving: Wagon Wheels
Saving: Baby Mickey's nap : a book about touch.
Saving: Farm friends : a book about animal sounds.
Saving: Watch and learn : a book about shapes.
Saving: Confessions of a gym-class dropout
Saving: McSweeney's no. 40
Saving: Military Government in the Ryukyu Islands, 1945-1950
Saving: Olivia and the Billionaire cattle king
Saving: Migrating from the Kentucky Holy Lands to Owensboro, Kentucky in 2017
Saving: The Yakima`
Saving: Hunger
Saving: The Powhatan Indians
Saving: Hideaway
Saving: ain't no valley
Saving: The Corps of Engineers : the war against Japan
Saving: H

Unnamed: 0,Title,ISBN
4429,Stolen legacy: the Egyptian origins of western...,
5336,A symposium in public librarianship; three add...,
5510,"WWI : the Great War, 1914-1918",
5511,LANKIE MANTITA,
5931,My First Pet,9354401066
...,...,...
1181091,"Cognitive Mapping : Past, Present, and Future",
1181093,Eye of the Beholder,0671523066
1181101,Sharp Edges,9780671524098
1181291,the girl in the road,0804138842


In [71]:
temp_df['ISBN'].isna().value_counts()

ISBN
True     21245
False     7082
Name: count, dtype: int64

In [64]:
temp_df[temp_df['ISBN'].notna()]

Unnamed: 0,Title,ISBN
5931,My First Pet,9354401066
6383,O little town of Sugarcreek,1961125498
6405,Act of War,9780451466198
6417,The Christmas riddle,9798586294586
6434,Castle of whispers,9781609451929
...,...,...
1181089,The New Mencken Letters,9780803713796
1181093,Eye of the Beholder,0671523066
1181101,Sharp Edges,9780671524098
1181291,the girl in the road,0804138842


In [81]:
# We found 7082 ISBN's using this API
# Merge the result with the original dataframe using the title column
# Merge the two dataframes using the title column
merged_df = lfpl_books.merge(temp_df, on='Title', how='left')
merged_df

Unnamed: 0,BibNum,Title,Author,ISBN_x,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId,ISBN_y
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878,2009,Book,Adult Non-Fiction,Main,25.95,2022/07/01 04:00:00+00,1,
1,1341361,McGraw-Hill's American idioms dictionary,"Spears, Richard A.",9780071478939,2007,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,2,
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,3,
3,1341362,McGraw-Hill's essential ESL grammar : a handbo...,"Lester, Mark.",9780071496421,2008,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,4,
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,5,
...,...,...,...,...,...,...,...,...,...,...,...,...
1181239,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Main,10.02,2022/07/01 04:00:00+00,1657551,
1181240,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Newburg,10.02,2022/07/01 04:00:00+00,1657552,
1181241,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,South Central,10.02,2022/07/01 04:00:00+00,1657553,
1181242,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Western,10.02,2022/07/01 04:00:00+00,1657554,


In [84]:
lfpl_books[lfpl_books['ISBN']=='nan']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
4429,1348353,Stolen legacy: the Egyptian origins of western...,"James, George G. M",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,5055
5336,1353427,A symposium in public librarianship; three add...,"University of California, Berkeley. School of ...",,1952,Book,Kentucky History,Main,25.0,2022/07/01 04:00:00+00,7761
5510,1353740,"WWI : the Great War, 1914-1918","Brown, Hilary (Author of Two-liners for kids)",,2014,Book,Adult Non-Fiction,Main,15.0,2022/07/01 04:00:00+00,8010
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
...,...,...,...,...,...,...,...,...,...,...,...
1181091,2528085,"Cognitive Mapping : Past, Present, and Future","Kitchin, Rob",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657144
1181093,2528087,Eye of the Beholder,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657147
1181101,2528088,Sharp Edges,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657176
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [96]:
lfpl_books_1 = lfpl_books.merge(temp_df, on='Title', how='left', suffixes=('', '_temp'))


In [97]:
lfpl_books_1[lfpl_books_1['ISBN']=='nan'].value_counts()

BibNum   Title                                             Author                                         ISBN  PublicationYear  ItemType  ItemCollection     ItemLocation            ItemPrice  ReportDate              ObjectId  ISBN_temp    
2313     Roughing it                                       Twain, Mark, 1835-1910.                        nan   1992             Book      Adult Non-Fiction  Jeffersontown           22.95      2022/07/01 04:00:00+00  82652     9798465049788    1
476060   The Orangeburg massacre                           Nelson, Jack, 1929 Oct. 11-                    nan   1970             Book      Adult Reference    Western                 55.00      2022/07/01 04:00:00+00  344282    9780865541207    1
476294   The Negro in the American Revolution              Aptheker, Herbert, 1915-                       nan   1940             Book      Adult Non-Fiction  Remote Shelving - Main  25.00      2022/07/01 04:00:00+00  344549    9780807838334    1
                     

In [98]:

# Update the missing values in the isbn column of lfpl_books using the values from temp_df
lfpl_books_1['ISBN'] = lfpl_books_1.apply(lambda row: row['ISBN_temp'] if row['ISBN']=='nan' else row['ISBN'], axis=1)
lfpl_books_1.drop('ISBN_temp', axis=1, inplace=True)


In [104]:
# 'nan' are now None
lfpl_books_1['ISBN'].isna().value_counts()


ISBN
False    1154047
True       27197
Name: count, dtype: int64

In [106]:
# Still 27197 'nan' values
lfpl_books_1.to_csv('data\lfpl_books_1.csv',index=False)

In [74]:
lfpl_books_1

Unnamed: 0,BibNum,Title,Author,ISBN_x,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId,ISBN
0,707409,"Jeff Immelt and the new GE way : innovation, t...","Magee, David, 1965-",9780071605878,2009,Book,Adult Non-Fiction,Main,25.95,2022/07/01 04:00:00+00,1,9780071605878
1,1341361,McGraw-Hill's American idioms dictionary,"Spears, Richard A.",9780071478939,2007,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,2,9780071478939
2,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,3,9780307451408
3,1341362,McGraw-Hill's essential ESL grammar : a handbo...,"Lester, Mark.",9780071496421,2008,Book,ELL Collection,South Central,20.00,2022/07/01 04:00:00+00,4,9780071496421
4,707411,Robin rescues dinner : 52 weeks of quick-fix m...,"Miller, Robin, 1964-",9780307451408,2009,Book,Adult Non-Fiction,Southwest,19.99,2022/07/01 04:00:00+00,5,9780307451408
...,...,...,...,...,...,...,...,...,...,...,...,...
1181239,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Main,10.02,2022/07/01 04:00:00+00,1657551,9780399544354
1181240,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Newburg,10.02,2022/07/01 04:00:00+00,1657552,9781787737372
1181241,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,South Central,10.02,2022/07/01 04:00:00+00,1657553,9780063055254
1181242,2527332,The afrominimalist's guide to living with less,"Platt, Christine",9781982168056,2022,Book,Adult Non-Fiction,Western,10.02,2022/07/01 04:00:00+00,1657554,9781974712533


In [107]:
lfpl_books_1['ISBN'].isna().value_counts()

ISBN
False    1154047
True       27197
Name: count, dtype: int64

## Google API. It should be noted that the API respond with errors in too many calls are generated from the same IP address

In [29]:
# I used this method on the first phase but it was returning the response 429(too many requests). It had to be modified.
import requests

def find_isbn_by_book_name(book_name):
    # Define the API endpoint to search for ISBN using book names
    api_url = f"https://www.googleapis.com/books/v1/volumes?q={book_name}"
    # Using "intitle" find less ISBN numbers on the test
    #api_url = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{book_name}"
    try:
        response = requests.get(api_url)
        if response.status_code == 200:
            data = response.json()
            book_response = data.get('totalItems', '')
            if book_response in (0, ""):
                return ""
            else:
                isbn = data.get('items','')[0].get('volumeInfo','').get('industryIdentifiers','')[0].get('identifier','')
                #return isbn if len(isbn) in {10, 13} else ""
                if len(isbn) in (10, 13): # Only acceptable ISBN length
                    return isbn
                else:
                    return ""
        
    except Exception as e:
        return ""
        #print(f"API request error: {str(e)}")

    return ''



In [21]:
# Function to apply to the DataFrame
def fill_missing_isbn(row):
    if row["ISBN"] == "nan":
        return find_isbn_by_book_name(row["Title"])
    else:
        return row["ISBN"]

In [28]:
# Example DataFrame
data = {
    "Title": ["Cognitive Mapping : Past, Present, and Future","LANKIE MANTITA", "WWI : the Great War, 1914-1918", "the girl in the road", "Sharp Edges", "My First Pet"],
    "ISBN": ["nan", "nan", "nan", "nan", "nan","nan"]
}

df = pd.DataFrame(data)



# Apply the function using lambda only to rows with missing ISBN
df["ISBN"] = df.apply(lambda row: fill_missing_isbn(row), axis=1)

print(df)

                                           Title           ISBN
0  Cognitive Mapping : Past, Present, and Future  9781317798071
1                                 LANKIE MANTITA               
2                 WWI : the Great War, 1914-1918               
3                           the girl in the road  9780804138864
4                                    Sharp Edges  9781439120118
5                                   My First Pet  9781465452955


The Google API appears to have more information than the previous one. 
The returned will be accepted if it has 10(ISBN-10) or 13(ISBN-13) characters.


In [23]:
# Experimenting with timing

print(f"My First Pet - ISBN: {find_isbn_by_book_name('My First Pet')}")

My First Pet - ISBN: 1620315491


In [None]:
# The following line of code might take a lot to process

#lfpl_books["ISBN"] = lfpl_books.apply(lambda row: fill_missing_isbn(row), axis=1)

## Trying to use concurrent features

In [207]:
isbn_length

ISBN
13    1031252
12     102249
3       36430
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

In [120]:
# Creating a copy of the dataframe with only the isbn and title columns
missing_isbn_df = lfpl_books_1[lfpl_books_1["ISBN"].isna()][["Title"]].drop_duplicates()
shape_c = missing_isbn_df.shape
print(f"Rows: {shape_c[0]:,}, Columns: {shape_c[1]}")

Rows: 21,245, Columns: 1


In [119]:
missing_isbn_df

Unnamed: 0,Title
4429,Stolen legacy: the Egyptian origins of western...
5336,A symposium in public librarianship; three add...
5510,"WWI : the Great War, 1914-1918"
5511,LANKIE MANTITA
6414,Pieces from the past
...,...
1180968,Never Surrender : A Soldier's Journey to the C...
1180971,The Secret of the Universe = Mysterium Cosmogr...
1180976,Mathematics : a Very Short Introduction
1180988,ILL KDLA BOOK KIT - Hidden Places


We saved a few thousands API calls removing the duplicates

In [110]:
# Trying to select the appropiate number of cores, depending on the CPU
import multiprocessing

def get_cpu_count():
  return multiprocessing.cpu_count()

def select_max_workers(cpu_count):
  if cpu_count <= 4:
    return cpu_count
  else:
    return int(cpu_count * 0.75)

In [111]:
print(select_max_workers(get_cpu_count()))

18


In [131]:
import time

def get_isbn_from_api_with_retry(book_title, max_retries=3):
    
    #base_url = "https://www.googleapis.com/books/v1/volumes"
    #params = {"q": f"intitle:{book_title}", "maxResults": 1}

    api_url = f"https://www.googleapis.com/books/v1/volumes?q={book_title}"

    for retry in range(max_retries):
        try:
            #response = requests.get(api_url, params=params)
            response = requests.get(api_url)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            data = response.json()

            # Check if any items were found
            if "items" in data and data["items"]:
                volume_info = data["items"][0]["volumeInfo"]
                isbn = volume_info.get("industryIdentifiers", [])

                # Validate ISBN length
                for identifier in isbn:
                    if identifier["type"] == "ISBN_10" and len(identifier["identifier"]) == 10:
                        return identifier["identifier"]
                    elif identifier["type"] == "ISBN_13" and len(identifier["identifier"]) == 13:
                        return identifier["identifier"]

            return ""
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                # Extract the 'Retry-After' header value
                retry_after = int(e.response.headers.get("Retry-After", 1))
                print(f"Rate limit exceeded. Retrying in {retry_after} seconds...")
                time.sleep(retry_after)
                return ""
            else:
                print(f"Error in API request: {e}")
                return ""
        except requests.exceptions.RequestException as e:
            print(f"Error in API request: {e}")
            return ""

    print(f"Exceeded maximum retries for {book_title}. Skipping...")
    return ""


In [121]:
def get_isbn(book_title, csvfile):
    isbn = get_isbn_from_api_with_retry(book_title)

    if isbn:
        with open(csvfile, "a", encoding="utf-8", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([isbn, book_title])
            print(f'Saving: {book_title}')
    return isbn

In [132]:
from concurrent.futures import ThreadPoolExecutor

cpu_count = get_cpu_count()
max_workers = select_max_workers(cpu_count)

with open('data/lfpl_books_2', "a", encoding="utf-8", newline="") as csvfile:
            
    with ThreadPoolExecutor( max_workers ) as executor:
        #missing_isbn_df["ISBN"] = list(executor.map(fill_missing_isbn, missing_isbn_df.itertuples(index=False)))
        missing_isbn_df["ISBN"] = list(executor.map(get_isbn_from_api_with_retry, missing_isbn_df['Title']))
        writer = csv.writer(csvfile)
        writer.writerow(missing_isbn_df['ISBN'], missing_isbn_df['Title'])
        print(f"Saving: {missing_isbn_df['Title']}")


Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1 seconds...
Rate limit exceeded. Retrying in 1

KeyboardInterrupt: 

In [128]:
missing_isbn_df

Unnamed: 0,Title
4429,Stolen legacy: the Egyptian origins of western...
5336,A symposium in public librarianship; three add...
5510,"WWI : the Great War, 1914-1918"
5511,LANKIE MANTITA
6414,Pieces from the past
...,...
1180968,Never Surrender : A Soldier's Journey to the C...
1180971,The Secret of the Universe = Mysterium Cosmogr...
1180976,Mathematics : a Very Short Introduction
1180988,ILL KDLA BOOK KIT - Hidden Places


In [171]:
# Merge the result with the original dataframe using the title column
lfpl_books = pd.merge(lfpl_books, missing_isbn_df, on="Title", how="left", suffixes=("_old", ""))

#lfpl_books["ISBN"] = lfpl_books["ISBN_new"].combine_first(lfpl_books["ISBN"]).astype(str)  # Fill missing ISBN in the original dataframe

# Drop temporary columns
#lfpl_books = lfpl_books.drop(columns=["ISBN_new"])
lfpl_books = lfpl_books.drop(columns=["ISBN_old"])

In [1]:

lfpl_books['ISBN'].str.len().value_counts()

NameError: name 'lfpl_books' is not defined

47 minutes and 55 seconds in a cpu with 16 cores, 24 logical procesors

In [50]:
lfpl_books['ISBN'].str.len().value_counts()

ISBN
13    1031252
12     102249
3       36527
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

In [51]:
def summ_length(strings):
  """
  Summarizes a list of strings by length.

  Args:
    strings: A list of strings.

  Returns:
    A dictionary that maps string lengths to the number of strings of that length.
  """

  summary = {}

  for string in strings:
    length = len(string)

    if length not in summary:
      summary[length] = 0

    summary[length] += 1

  return summary

strings = ['hello', 'world', 'this', 'is', 'a', 'test']
summary = summ_length(results)

for length, count in summary.items():
  print(f"There are {count} strings with length {length}")

There are 1032881 strings with length 13
There are 102384 strings with length 12
There are 10036 strings with length 11
There are 1864 strings with length 10
There are 228 strings with length 15
There are 31565 strings with length 0
There are 89 strings with length 22
There are 863 strings with length 14
There are 152 strings with length 16
There are 633 strings with length 18
There are 172 strings with length 19
There are 266 strings with length 23
There are 46 strings with length 20
There are 17 strings with length 17
There are 140 strings with length 9
There are 4 strings with length 8
There are 1 strings with length 7


In [54]:
new_isbn = pd.DataFrame(results, columns=['ISBN'])
new_isbn['ISBN'].str.len().value_counts()

ISBN
13    1032881
12     102384
0       31565
11      10036
10       1864
14        863
18        633
23        266
15        228
19        172
16        152
9         140
22         89
20         46
17         17
8           4
7           1
Name: count, dtype: int64

In [55]:
lfpl_books['ISBN'].str.len().value_counts()

ISBN
13    1031252
12     102249
3       36527
11      10031
10       1137
9         140
8           4
7           1
Name: count, dtype: int64

In [58]:
new_isbn[new_isbn['ISBN'].str.len() == 17]

Unnamed: 0,ISBN
60695,EAN:8596547062158
96189,EAN:4064066431020
102906,EAN:4064066397913
245348,SRLF:AX0000165175
252219,EAN:8596547026808
254818,EAN:8596547408710
259217,EAN:8596547378266
261460,EAN:8596547192480
262910,EAN:8596547403005
264185,SRLF:AA0002591691


In [56]:
new_isbn.shape

(1181341, 1)

In [57]:
lfpl_books.shape

(1181341, 12)

In [49]:
type(results)

list

## Let's check the results

In [62]:
lfpl_books['ISBN'].isna().value_counts()

ISBN
False    1181341
Name: count, dtype: int64

In [72]:
lfpl_books[lfpl_books['ISBN'] == 'nan']

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
4429,1348353,Stolen legacy: the Egyptian origins of western...,"James, George G. M",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,5055
5336,1353427,A symposium in public librarianship; three add...,"University of California, Berkeley. School of ...",,1952,Book,Kentucky History,Main,25.0,2022/07/01 04:00:00+00,7761
5510,1353740,"WWI : the Great War, 1914-1918","Brown, Hilary (Author of Two-liners for kids)",,2014,Book,Adult Non-Fiction,Main,15.0,2022/07/01 04:00:00+00,8010
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
...,...,...,...,...,...,...,...,...,...,...,...
1181091,2528085,"Cognitive Mapping : Past, Present, and Future","Kitchin, Rob",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657144
1181093,2528087,Eye of the Beholder,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657147
1181101,2528088,Sharp Edges,"Krentz, Jayne",,0,Book,Interlibrary Loan,Main,0.0,2022/07/01 04:00:00+00,1657176
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [64]:
# Finding books with no title
lfpl_books[lfpl_books['Title'].isna()]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
198351,2239375,,,,0,Book,,Crescent Hill,0.0,2022/07/01 04:00:00+00,243966


In [67]:
lfpl_books[lfpl_books['ItemCollection'].isna()]

Unnamed: 0,BibNum,Title,Author,ISBN,PublicationYear,ItemType,ItemCollection,ItemLocation,ItemPrice,ReportDate,ObjectId
5511,1353780,LANKIE MANTITA,LESLIE PATRICELLI,,0,Book,,Shively,0.0,2022/07/01 04:00:00+00,8027
5931,2511351,My First Pet,,,0,Book,,Newburg,0.0,2022/07/01 04:00:00+00,8588
6405,1362079,Act of War,"Brown, Dale",,0,Book,,Bon Air,0.0,2022/07/01 04:00:00+00,10502
9061,2410700,FOUR,VERONICA ROTH,,0,Book,,Crescent Hill,0.0,2022/07/01 04:00:00+00,16849
12678,2481658,designing your work life,bill burnett,,0,Book,,St Matthews,0.0,2022/07/01 04:00:00+00,22166
...,...,...,...,...,...,...,...,...,...,...,...
1180968,2528221,The Khipu and the Final Key,Dee Garretson,,0,Book,,Northeast,0.0,2022/07/01 04:00:00+00,1656928
1180972,2528222,MOMMY'S DISEASE,CAROLYN HANNAN,,0,Book,,Southwest,0.0,2022/07/01 04:00:00+00,1656934
1181047,2527377,use,,,0,Book,,Northeast,0.0,2022/07/01 04:00:00+00,1657039
1181291,2527399,the girl in the road,MONICA BYRNE,,0,Book,,Main,0.0,2022/07/01 04:00:00+00,1657506


In [6]:
lfpl_books['ItemCollection'].unique()

array(['Adult Non-Fiction', 'ELL Collection', 'Adult Fiction', 'Mystery',
       "Children's Picture Book", 'Science Fiction', 'Older Teen Fiction',
       'Younger Teen  Fiction', 'Adult Paperback', "Children's Fiction",
       'Western', "Children's Picture Paperback", "Children's Paperback",
       'International Collection', 'Teen Non-Fiction',
       "Children's Non-Fiction", 'Kentucky History', 'Natural Resources',
       'Oversize', 'Holiday', 'Urban Fiction', 'Bestsellers',
       "Children's Board Book", 'Storytime Collection',
       'Preschool  Picture Book', "Children's Easy Reader",
       'Adult Reference', 'Interlibrary Loan', nan,
       'Adult Paperbacks Tall', "Children's Easy Reader Paperback",
       'Caldecott/Newbery', 'Laptop', 'Younger Teen  Paperback',
       'Government Documents', 'Large Print', 'Telereference',
       "Children's Non-Fiction Paperback", 'Big Book',
       "Children's Reference", 'Older Teen Paperback', 'Teen Reference',
       'College Shop'

In [7]:
lfpl_books['ItemCollection'].value_counts()

ItemCollection
Adult Non-Fiction                   370372
Adult Fiction                       173472
Children's Non-Fiction               86723
Mystery                              60152
Children's Picture Book              58609
Preschool  Picture Book              50564
Children's Fiction                   47829
Adult Paperback                      45733
Children's Paperback                 44649
Teen Non-Fiction                     24045
Children's Easy Reader               24037
Older Teen Fiction                   23476
Children's Board Book                20180
Younger Teen  Fiction                17227
Kentucky History                     16766
Science Fiction                      15714
Children's Easy Reader Paperback     15566
Holiday                              15482
International Collection             15439
Adult Reference                      11233
Children's Picture Paperback          9638
Urban Fiction                         7556
Caldecott/Newbery                     6