# Book info supplementation with API call
#### Book info does not contain any data about book description or book categories (genre). We used openli

In [169]:
import pandas as pd
import requests

In [171]:
book_rating = pd.read_csv("book_ratings.dat", sep = "\t", header = 0, index_col=None)

In [173]:
book_rating.head()

Unnamed: 0,user,item,rating
0,1,6264,7.0
1,1,4350,7.0
2,1,6252,5.0
3,1,202,9.0
4,1,6266,6.0


In [175]:
# Open the file and remove trailing tabs
with open("items_info.dat", "r") as file:
    lines = file.readlines()

# Strip trailing tabs and save to a new file
cleaned_lines = [line.rstrip('\t').strip() for line in lines]

# Write the cleaned lines to a new file
with open("cleaned_items_info.txt", "w") as new_file:
    new_file.writelines([line + '\n' for line in cleaned_lines])

In [179]:
book_info = pd.read_csv("cleaned_items_info.txt", sep = "\t", header = 0, index_col = None, usecols = range(5))

In [181]:
book_info.head()

Unnamed: 0,Book_ID,ISBN,Book-Title,Book-Author,Year-Of-Publication
0,1,60973129,Decision in Normandy,Carlo D'Este,1991
1,2,393045218,The Mummies of Urumchi,E. J. W. Barber,1999
2,3,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000
3,4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994
4,5,609804618,Our Dumb Century: The Onion Presents 100 Years...,The Onion,1999


In [183]:
isbn_list = book_info['ISBN'].tolist()

In [185]:
len(isbn_list)

17384

In [195]:
# there are 17384 books, we can split the list into 3 parts of about 6000 books each
isbn_list1 = isbn_list[:6000]
isbn_list2 = isbn_list[6000:12000]
isbn_list3 = isbn_list[12000:]

In [197]:
print(len(isbn_list1))
print(len(isbn_list2))
print(len(isbn_list3))

6000
6000
5384


In [187]:
import time

In [287]:
# Batch the ISBNs into groups of 100
batch_size = 100
batches1 = [isbn_list1[i:i + batch_size] for i in range(0, len(isbn_list1), batch_size)]
batches2 = [isbn_list2[i:i + batch_size] for i in range(0, len(isbn_list2), batch_size)]
batches3 = [isbn_list3[i:i + batch_size] for i in range(0, len(isbn_list3), batch_size)]

# Initialize a list to store the results
book_data1 = []

# Loop through each batch
for batch in batches1:
    # Prepare the list of ISBNs for the batch request
    isbn_batch = ",".join([f"ISBN:{isbn}" for isbn in batch])
    
    url = f"https://openlibrary.org/api/books?bibkeys={isbn_batch}&format=json&jscmd=details"
    
    # Make the batch request
    response = requests.get(url)
    
    # Check if the response is valid
    if response.status_code == 200:
        book_info = response.json()
        
        # Extract data for each ISBN in the batch
        for isbn in batch:
            book = book_info.get(f"ISBN:{isbn}", {}).get("details")
            if book is None:
                continue
            title = book.get("title", "N/A")
            desc = book.get("description", "No description available")
            if isinstance(desc, dict):
                description = desc.get("value", "No description available")
            else:
                description = desc
            category = list(set([item.strip() for subject in book.get("subjects", []) for item in subject.split("--")])) if "subjects" in book else "N/A"
            
            book_data1.append({
                "ISBN": isbn,
                "Title": title,
                "Description": description,
                "Category": category
            })
    
    # Sleep to avoid hitting rate limits
    time.sleep(1)  # 1 second delay between batches

# Convert the results into a pandas DataFrame
result_df1 = pd.DataFrame(book_data1)

# Save the results to a CSV file
result_df1.to_csv("books_info1.csv", index=False)

print("Batch API call complete and saved to CSV.")

Batch API call complete and saved to CSV.


In [295]:
(result_df1["Description"] == "No description available").sum()
len(batches3)

54

In [299]:
# Initialize a list to store the results
book_data2 = []

# Loop through each batch
for batch in batches2:
    # Prepare the list of ISBNs for the batch request
    isbn_batch = ",".join([f"ISBN:{isbn}" for isbn in batch])
    
    url = f"https://openlibrary.org/api/books?bibkeys={isbn_batch}&format=json&jscmd=details"
    
    # Make the batch request
    response = requests.get(url)
    
    # Check if the response is valid
    if response.status_code == 200:
        book_info = response.json()
        
        # Extract data for each ISBN in the batch
        for isbn in batch:
            book = book_info.get(f"ISBN:{isbn}", {}).get("details")
            if book is None:
                continue
            title = book.get("title", "N/A")
            desc = book.get("description", "No description available")
            if isinstance(desc, dict):
                description = desc.get("value", "No description available")
            else:
                description = desc
            category = list(set([item.strip() for subject in book.get("subjects", []) for item in subject.split("--")])) if "subjects" in book else "N/A"
            
            book_data2.append({
                "ISBN": isbn,
                "Title": title,
                "Description": description,
                "Category": category
            })
    
    # Sleep to avoid hitting rate limits
    time.sleep(1)  # 1 second delay between batches

# Convert the results into a pandas DataFrame
result_df2 = pd.DataFrame(book_data2)

# Save the results to a CSV file
result_df2.to_csv("books_info2.csv", index=False)

print("Batch API call complete and saved to CSV.")

Batch API call complete and saved to CSV.


In [314]:
result_df2.head()

Unnamed: 0,ISBN,Title,Description,Category
0,553280643,The gate to women's country,No description available,
1,385480210,The Pocket Mirror of Heroes,The Art of Worldly Wisdom by Baltasar Gracian ...,
2,1570612676,Portland,No description available,"[Guidebooks., Portland (Or.)]"
3,3293202047,Nana Plaza.,No description available,
4,805065415,Blue Latitudes,"""James Cook's three epic journeys in the eight...","[Voyages around the world, Cook, James,, 1958-..."


In [318]:
# Initialize a list to store the results
book_data3 = []

# Loop through each batch
for batch in batches3:
    # Prepare the list of ISBNs for the batch request
    isbn_batch = ",".join([f"ISBN:{isbn}" for isbn in batch])
    
    url = f"https://openlibrary.org/api/books?bibkeys={isbn_batch}&format=json&jscmd=details"
    
    # Make the batch request
    response = requests.get(url)
    
    # Check if the response is valid
    if response.status_code == 200:
        book_info = response.json()
        # Extract data for each ISBN in the batch
        for isbn in batch:
            book = book_info.get(f"ISBN:{isbn}", {}).get("details")
            if book is None:
                continue
            title = book.get("title", "N/A")
            desc = book.get("description", "No description available")
            if isinstance(desc, dict):
                description = desc.get("value", "No description available")
            else:
                description = desc
            category = list(set([item.strip() for subject in book.get("subjects", []) for item in subject.split("--")])) if "subjects" in book else "N/A"
            
            book_data3.append({
                "ISBN": isbn,
                "Title": title,
                "Description": description,
                "Category": category
            })
    
    # Sleep to avoid hitting rate limits
    time.sleep(1)  # 1 second delay between batches

# Convert the results into a pandas DataFrame
result_df3 = pd.DataFrame(book_data3)

# Save the results to a CSV file
result_df3.to_csv("books_info3.csv", index=False)

print("Batch API call complete and saved to CSV.")

Batch API call complete and saved to CSV.


In [326]:
(result_df3["Category"] == "N/A").sum()

1828

In [342]:
books_info = pd.concat([result_df1, result_df2, result_df3], ignore_index = True)
print("Missing Description: ", (books_info["Description"] == "No description available").sum())
print("Missing Category: ", (books_info["Category"] == "N/A").sum())
print("Total books: ", books_info["ISBN"].count())

Missing Description:  14374
Missing Category:  6199
Total books:  17347


In [346]:
missing_category = books_info[books_info["Category"] == "N/A"]
missing_category.head()

Unnamed: 0,ISBN,Title,Description,Category
3,452264464,Beloved,No description available,
7,345402871,Airframe,Three passengers are dead. Fifty-six are injur...,
8,345417623,Timeline,"In an Arizona desert, a man wanders in a daze,...",
12,446310786,To Kill a Mockingbird,No description available,
14,60168013,Pigs in heaven,No description available,


In [392]:
# check if calling the api differently will give another format with categories
missing_cat_isbn = missing_category["ISBN"].tolist()

batch_size = 100
batch_missing_cat = [missing_cat_isbn[i:i + batch_size] for i in range(0, len(missing_cat_isbn), batch_size)]
len(batch_missing_cat)

62

In [394]:
book_data_missing_cat = []

# Loop through each batch
for batch in batch_missing_cat:
    # Prepare the list of ISBNs for the batch request
    isbn_batch = ",".join([f"ISBN:{isbn}" for isbn in batch])
    
    url = f"https://openlibrary.org/api/books?bibkeys={isbn_batch}&format=json&jscmd=data"
    
    # Make the batch request
    response = requests.get(url)
    
    # Check if the response is valid
    if response.status_code == 200:
        book_info = response.json()
        # Extract data for each ISBN in the batch
        for isbn in batch:
            book = book_info.get(f"ISBN:{isbn}", {})
            if book is None:
                continue
            category = [subject["name"] for subject in book.get("subjects", [])] if "subjects" in book else "N/A"
            
            book_data_missing_cat.append({
                "ISBN": isbn,
                "Category": category
            })
    
# Sleep to avoid hitting rate limits (Optional, adjust time accordingly)
time.sleep(1)  # 1 second delay between requests

# Convert the results into a pandas DataFrame
result_df4 = pd.DataFrame(book_data_missing_cat)

# Save the results to a CSV file
result_df4.to_csv("books_info_missing_cat.csv", index=False)

print("Batch API call complete and saved to CSV.")

Batch API call complete and saved to CSV.


In [388]:
result_df4 = pd.DataFrame(book_data_missing_cat)

In [396]:
result_df4

Unnamed: 0,ISBN,Category
0,0452264464,"[African American History, Ohio, History, 19th..."
1,0345402871,"[air safety, media relations, investigative jo..."
2,0345417623,"[Fiction, Hundertja hriger Krieg, Wissenschaft..."
3,0446310786,"[fiction, fiction classics, contemporary ficti..."
4,0060168013,"[Literature, Cherokee Indians, Orphans, Humoro..."
...,...,...
6194,082173900X,"[Fiction, romance, historical, general]"
6195,3404143035,
6196,0449209202,"[Pastoral psychology, Psychology, applied, App..."
6197,2209014026,


In [398]:
isbn_cat = dict(zip(result_df4["ISBN"], result_df4["Category"]))

books_info["Category"] = books_info["ISBN"].map(isbn_cat).fillna(books_info["Category"])

In [400]:
print("Missing Description: ", (books_info["Description"] == "No description available").sum())
print("Missing Category: ", (books_info["Category"] == "N/A").sum())
print("Total books: ", books_info["ISBN"].count())

Missing Description:  14374
Missing Category:  730
Total books:  17347


In [402]:
books_info.to_csv("books_info_updated.csv", index = False)

In [404]:
item_info = pd.read_csv("books_info_updated.csv")
item_info.head()

Unnamed: 0,ISBN,Title,Description,Category
0,60973129,Decision in Normandy,No description available,"['Normandy (France)', 'France', 'Campaigns', '..."
1,393045218,The mummies of Ürümchi,"In the museums of Urumchi, the wind-swept regi...","['China', 'Antiquities', 'Mummies', 'Bronze ag..."
2,425176428,What if?,No description available,"['Imaginary wars and battles.', 'Imaginary his..."
3,452264464,Beloved,No description available,"['African American History', 'Ohio', 'History'..."
4,609804618,Our dumb century,No description available,"['American wit and humor', 'Headlines', 'Humor..."
