In [7]:
import json
import pandas as pd
import re
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer


nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
import json

with open("fantasy_books.json", "r", encoding="utf-8") as f:
    books = json.load(f)

print(len(books))

4000


In [9]:
pd.DataFrame(books[:5])

Unnamed: 0,title,author,description,rating,image_url,publish_date,publisher,language,pages,subjects
0,Harry Potter and the Sorcerer's Stone,J. K. Rowling,Harry Potter #1,4.2 (863 ratings),https://covers.openlibrary.org/b/id/15093275-M...,2016,Arthur A. Levine Books,English,246.0,"[Ghosts, Monsters, Vampires, Witches, Challeng..."
1,A Game of Thrones: Book One of A Song of Ice a...,George R. R. Martin,A Game of Thrones is the inaugural novel in A ...,4.2 (686 ratings),https://covers.openlibrary.org/b/id/15093534-M...,2012,Harper Voyager,English,801.0,"[Adult, Action & Adventure, Fantasy, High Fant..."
2,Haunting Adeline,H. D. Carlton,The Manipulator,3.6 (241 ratings),https://covers.openlibrary.org/b/id/13846269-M...,2021,H. D. Carlton,English,,"[collectionID:YDarkromance, Fiction, romance, ..."
3,A Court of Mist and Fury,Sarah J. Maas,Feyre has undergone more trials than one human...,4.0 (410 ratings),https://covers.openlibrary.org/b/id/14315089-M...,2016,Bloomsbury,English,624.0,"[Fantasy, Fiction, Fairies, Blessing and cursi..."
4,The Alchemist,Paulo Coelho,"Combining magic, mysticism, wisdom and wonder ...",3.9 (326 ratings),https://covers.openlibrary.org/b/id/15095844-M...,2006,HarperSanFrancisco,English,208.0,"[Translations into Indonesian, Voyages and tra..."


In [10]:
# Step 3: Clean each book dictionary

def clean_book(book):
    # --- 1. Clean Rating ---
    raw_rating = book.get("rating", "N/A")
    rating_score = None
    rating_count = None

    if isinstance(raw_rating, str):
        raw_rating = raw_rating.replace('\xa0', ' ').strip()
        match = re.match(r"([\d.]+)\s*\((\d+)\s+ratings?\)", raw_rating)
        if match:
            try:
                rating_score = float(match.group(1))
                rating_count = int(match.group(2))
            except ValueError:
                pass  # fallback to None if parsing fails

    book["rating_score"] = rating_score
    book["rating_count"] = rating_count
    book.pop("rating", None)

    # --- 2. Convert pages to int ---
    try:
        pages_str = re.sub(r"[^\d]", "", str(book.get("pages", "")))
        book["pages"] = int(pages_str) if pages_str.isdigit() else None
    except:
        book["pages"] = None

    # --- 3. Clean publish date ---
    try:
        year = re.search(r"\d{4}", str(book.get("publish_date", "")))
        book["publish_date"] = int(year.group()) if year else None
    except:
        book["publish_date"] = None

    # --- 4. Replace "N/A" with None ---
    for key, value in book.items():
        if isinstance(value, str) and value.strip().upper() == "N/A":
            book[key] = None

    # --- 5. Deduplicate & normalize subjects ---
    raw_subjects = book.get("subjects", [])
    clean_subjects = set()
    for subj in raw_subjects:
        if isinstance(subj, str):
            clean = subj.lower().replace("–", "-")  
            clean = re.sub(r"[^\w\s-]", "", clean)  
            clean = " ".join([lemmatizer.lemmatize(w) for w in clean.split()])
            clean_subjects.add(clean.strip())

    book["subjects"] = list(clean_subjects)

    return book


In [11]:
# Step 5: Apply cleaning to all books
cleaned_data = [clean_book(book) for book in tqdm(books, desc="Cleaning books")]

# Show cleaned sample
pd.DataFrame(cleaned_data[:5])

Cleaning books: 100%|██████████| 4000/4000 [00:00<00:00, 7056.64it/s]


Unnamed: 0,title,author,description,image_url,publish_date,publisher,language,pages,subjects,rating_score,rating_count
0,Harry Potter and the Sorcerer's Stone,J. K. Rowling,Harry Potter #1,https://covers.openlibrary.org/b/id/15093275-M...,2016,Arthur A. Levine Books,English,246.0,"[roman, 4 privet drive, england fiction, magia...",4.2,863
1,A Game of Thrones: Book One of A Song of Ice a...,George R. R. Martin,A Game of Thrones is the inaugural novel in A ...,https://covers.openlibrary.org/b/id/15093534-M...,2012,Harper Voyager,English,801.0,"[war and conflict, knight, trial by combat, im...",4.2,686
2,Haunting Adeline,H. D. Carlton,The Manipulator,https://covers.openlibrary.org/b/id/13846269-M...,2021,H. D. Carlton,English,,"[collectionidydarkromance, young adult fiction...",3.6,241
3,A Court of Mist and Fury,Sarah J. Maas,Feyre has undergone more trials than one human...,https://covers.openlibrary.org/b/id/14315089-M...,2016,Bloomsbury,English,624.0,"[love romance, seriesa_court_of_thorns_and_ros...",4.0,410
4,The Alchemist,Paulo Coelho,"Combining magic, mysticism, wisdom and wonder ...",https://covers.openlibrary.org/b/id/15095844-M...,2006,HarperSanFrancisco,English,208.0,"[nyttrade_fiction_paperback2007-12-02, nyttrad...",3.9,326


In [None]:
# Step 6: Save cleaned data

with open("cleaned_fantasy_books.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)