In [45]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # For progress bar
import numpy as np

In [46]:
# Small hard-coded sample of ISBNs for quick manual checks.
# The cell that reads ../data/ratings.csv further down replaces this list.
isbn_list = [
    "9780143126560",
    "9780062316097",
    "9781451648539",
]

### Import the list of ISBNs

In [47]:
# Read the ISBN column as text. With dtype inference, all-digit ISBNs can be
# parsed as integers, which silently drops leading zeros.
df_ratings = pd.read_csv('../data/ratings.csv', dtype={'ISBN': str})

# Keep only unique, non-missing ISBNs. drop_duplicates() preserves first-seen
# order, so the list (and any slice taken from it) is the same on every run;
# list(set(...)) order changes between kernel sessions because string hashing
# is randomised per process.
isbn_list = df_ratings['ISBN'].dropna().drop_duplicates().to_list()

print(f"Number of ISBNs: {len(isbn_list)}")

Number of ISBNs: 340556


In [48]:
# Number of ISBNs to process while testing. Set to None to keep the full list
# (a slice bound of None means "no limit"). If the list was already truncated
# by an earlier run of this cell, re-run the loading cell first.
SAMPLE_SIZE = 50
isbn_list = isbn_list[:SAMPLE_SIZE]

#### Function to fetch book genre

In [49]:
def fetch_genre(isbn):
    """Look up a book's title and subjects on Open Library by ISBN.

    Queries the Open Library Books API (``jscmd=data``) and flattens the
    record's ``subjects`` into one comma-separated string.

    Args:
        isbn: ISBN to look up. It is sent as the ``ISBN:<isbn>`` bibkey and
            echoed back unchanged in the result.

    Returns:
        dict: Always has the keys ``"ISBN"``, ``"Title"`` and ``"Genres"``.

        * Record found: the title (``"Unknown Title"`` if absent) and the
          subject names joined with ``", "`` (``"Unknown Genre"`` if none).
        * ISBN missing from the response: both fields are ``"Not Found"``.
        * Request failed or the body was not valid JSON: both fields are
          ``"Error"`` and the error is printed.
    """
    book_key = f"ISBN:{isbn}"
    try:
        # Pass the query through ``params`` so requests URL-encodes it; an
        # ISBN containing spaces, '&' or '#' can then no longer corrupt the
        # query string.
        response = requests.get(
            "https://openlibrary.org/api/books",
            params={"bibkeys": book_key, "format": "json", "jscmd": "data"},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers malformed JSON on requests versions whose
        # JSONDecodeError is not a RequestException subclass. Without it the
        # exception would escape and abort a whole batch run.
        print(f"Error fetching ISBN {isbn}: {e}")
        return {"ISBN": isbn, "Title": "Error", "Genres": "Error"}

    book_info = data.get(book_key) if isinstance(data, dict) else None
    if not isinstance(book_info, dict):
        return {"ISBN": isbn, "Title": "Not Found", "Genres": "Not Found"}

    title = book_info.get("title", "Unknown Title")

    genres = []
    # ``or []`` also covers an explicit null "subjects" value.
    for sub in book_info.get("subjects") or []:
        # Subjects are expected to be dicts with a "name" key; tolerate plain
        # strings and skip entries without a usable name instead of raising.
        name = sub.get("name") if isinstance(sub, dict) else sub
        if isinstance(name, str) and name:
            genres.append(name)

    return {
        "ISBN": isbn,
        "Title": title,
        "Genres": ", ".join(genres or ["Unknown Genre"]),
    }

In [50]:
# Sanity check: look up a single ISBN and display the returned dict as the cell output.
fetch_genre("9780143126560")

{'ISBN': '9780143126560',
 'Title': 'Getting Things Done',
 'Genres': "Time management, Self-management (Psychology), Gestion de soi, Budgets temps, Zelfpsychologie, Zeiteinteilung, Tijdmanagement, Selbstmanagement, nyt:paperback-advice=2008-06-08, New York Times bestseller, Administracio n del tiempo, Administración del tiempo, Efficiency, Efficacité, Organisation du travail, Formation, Développement personnel, Développement d'aptitudes, Shi jian, Guan li, Zi wo(xin li xue), Business, handbooks, manuals, etc., Industrial management, Creative ability in business, Psychological Stress, Prevention & control"}

In [51]:
# Process ISBNs in parallel
def fetch_all_genres(isbn_list, max_workers=10):
    """Fetch genre records for many ISBNs concurrently.

    Each ISBN is passed to ``fetch_genre`` on a thread pool while a tqdm
    progress bar tracks completed lookups.

    Args:
        isbn_list: Iterable of ISBNs. It is materialised once, so generators
            are accepted as well as lists.
        max_workers: Size of the thread pool.

    Returns:
        list[dict]: One record per input ISBN, in the same order as the
        input regardless of which request finished first. If a lookup raises
        unexpectedly, its slot holds an ``"Error"`` record instead of the
        exception aborting the whole batch.
    """
    isbns = list(isbn_list)
    results = [None] * len(isbns)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to its input position so results can be stored in
        # input order even though they complete in arbitrary order.
        futures = {
            executor.submit(fetch_genre, isbn): position
            for position, isbn in enumerate(isbns)
        }
        for future in tqdm(as_completed(futures), total=len(isbns), desc="Fetching Genres"):
            position = futures[future]
            try:
                results[position] = future.result()
            except Exception as e:
                # One bad record must not discard every result already
                # collected; report it and store an error row in its place.
                isbn = isbns[position]
                print(f"Error fetching ISBN {isbn}: {e}")
                results[position] = {"ISBN": isbn, "Title": "Error", "Genres": "Error"}

    return results

In [52]:
# Fetch the title/genre record for every ISBN in isbn_list using 10 worker threads.
results = fetch_all_genres(isbn_list, max_workers=10)

Fetching Genres:   0%|          | 0/50 [00:00<?, ?it/s]

Fetching Genres: 100%|██████████| 50/50 [00:05<00:00,  8.83it/s]


#### Save results to CSV

In [53]:
# Turn the list of per-ISBN dicts into a table, one row per record.
df = pd.DataFrame(data=results)

# Write the table to the current working directory without the row index.
df.to_csv(path_or_buf="book_genres.csv", index=False)

print("✅ Data saved to book_genres.csv")

✅ Data saved to book_genres.csv
