# Feature Exploration and Building

This notebook offiline explores the books from the aplication Data Base and creates the RL embeddings file with:
- Top Categories
- Top Authors
- Top publishers

## 1. Setup and Imports

In [1]:
import os
import sys
from collections import Counter
from pathlib import Path


import json
import math

import numpy as np
import pandas as pd


# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from sqlalchemy.orm import Session
from app.db.database import SessionLocal
from app.db import models, crud
from app.utils.config import RECOMMENDER_CONFIG


print(f"Project root: {project_root}")


Project root: c:\Users\pedra\OneDrive\Documentos\pfp\recommender_mvp


## 2. Load and Inpect Data

In [2]:
db: Session = SessionLocal()

books = crud.get_all_books(db)

len(books)


100

## Inspect rating and rating_count

In [3]:

data = [
    {
        "id": b.id,
        "title": b.title,
        "avg_rating": b.avg_rating,
        "ratings_count": b.ratings_count,
    }
    for b in books
]

df_books = pd.DataFrame(data)
df_books.describe()


Unnamed: 0,id,avg_rating,ratings_count
count,100.0,100.0,100.0
mean,50.5,4.1979,4.86
std,29.011492,0.864267,8.349705
min,1.0,1.0,1.0
25%,25.75,3.95,1.0
50%,50.5,4.5,2.0
75%,75.25,5.0,4.25
max,100.0,5.0,55.0


## Explore Categories

### Top Categories

In [4]:
# Célula 4: contar frequência global de categorias usando a relação many-to-many

def print_tuple(categories):
    names = []
    ids = []
    for cat in categories:
        print(f"{cat[0].name}: {cat[1]}")
        if cat[0] != None:
            names.append(cat[0].name)
            ids.append(cat[0].id)
    return names, ids

category_frequency = crud.get_categories_frequency(db) 
print(len(category_frequency))
print_tuple(category_frequency[:10])


46
Religion: 9
Fiction: 9
History: 6
Biography & Autobiography: 5
Sports & Recreation: 4
Juvenile Nonfiction: 3
Juvenile Fiction: 3
Social Science: 2
Reference: 2
Law: 2


(['Religion',
  'Fiction',
  'History',
  'Biography & Autobiography',
  'Sports & Recreation',
  'Juvenile Nonfiction',
  'Juvenile Fiction',
  'Social Science',
  'Reference',
  'Law'],
 [3, 4, 9, 2, 17, 7, 24, 5, 6, 11])

In [5]:
# Célula 5: escolher TOP_K categorias mais frequentes

TOP_K = 6  # ajuste se quiser mais ou menos categorias

top_categories = category_frequency[:TOP_K]
top_cat_names, top_cat_ids = print_tuple(top_categories)


Religion: 9
Fiction: 9
History: 6
Biography & Autobiography: 5
Sports & Recreation: 4
Juvenile Nonfiction: 3


In [6]:
# Check multi-hot in some books

cat_index = {name: i for i, name in enumerate(top_cat_names)}

def book_cat_vector(book):
    vec = np.zeros(len(top_categories), dtype=int)
    names = book.get_categories_list
    for n in names:
        print(n)
        if n in cat_index:
            vec[cat_index[n]] = 1
    return vec, names

for b in books[:10]:
    vec, names = book_cat_vector(b)
    print(f"ID={b.id} | title={b.title[:50]!r}")
    print("Categorias do livro:", names)
    print("multi-hot:", dict(zip(top_cat_names, vec)))
    print("-" * 60)


Comics & Graphic Novels
ID=1 | title='Its Only Art If Its Well Hung!'
Categorias do livro: ['Comics & Graphic Novels']
multi-hot: {'Religion': np.int64(0), 'Fiction': np.int64(0), 'History': np.int64(0), 'Biography & Autobiography': np.int64(0), 'Sports & Recreation': np.int64(0), 'Juvenile Nonfiction': np.int64(0)}
------------------------------------------------------------
Biography & Autobiography
ID=2 | title='Dr. Seuss: American Icon'
Categorias do livro: ['Biography & Autobiography']
multi-hot: {'Religion': np.int64(0), 'Fiction': np.int64(0), 'History': np.int64(0), 'Biography & Autobiography': np.int64(1), 'Sports & Recreation': np.int64(0), 'Juvenile Nonfiction': np.int64(0)}
------------------------------------------------------------
Religion
ID=3 | title='Wonderful Worship in Smaller Churches'
Categorias do livro: ['Religion']
multi-hot: {'Religion': np.int64(1), 'Fiction': np.int64(0), 'History': np.int64(0), 'Biography & Autobiography': np.int64(0), 'Sports & Recreation'

In [7]:
author_frequency = crud.get_authors_frequency(db)
print(len(author_frequency))
print_tuple(author_frequency[:10])


top_authors = author_frequency[:TOP_K]
top_aut_names, top_aut_ids = print_tuple(top_authors)

# Check multi-hot in some books

aut_index = {name: i for i, name in enumerate(top_aut_names)}

def book_aut_vector(book):
    vec = np.zeros(len(top_authors), dtype=int)
    names = book.get_authors_list
    for n in names:
        print(n)
        if n in aut_index:
            vec[aut_index[n]] = 1
    return vec, names

for b in books[:10]:
    vec, names = book_aut_vector(b)
    print(f"ID={b.id} | title={b.title[:50]!r}")
    print("Categorias do livro:", names)
    print("multi-hot:", dict(zip(top_aut_names, vec)))
    print("-" * 60)



100
Julie Strain: 1
Philip Nel: 1
David R. Ray: 1
Veronica Haddon: 1
Edward Long: 1
Everett Ferguson: 1
Miriam Allen De Ford: 1
Lee Blessing: 1
Mary Fabyan Windeatt: 1
Steven Wardell: 1
Julie Strain: 1
Philip Nel: 1
David R. Ray: 1
Veronica Haddon: 1
Edward Long: 1
Everett Ferguson: 1
Julie Strain
ID=1 | title='Its Only Art If Its Well Hung!'
Categorias do livro: ['Julie Strain']
multi-hot: {'Julie Strain': np.int64(1), 'Philip Nel': np.int64(0), 'David R. Ray': np.int64(0), 'Veronica Haddon': np.int64(0), 'Edward Long': np.int64(0), 'Everett Ferguson': np.int64(0)}
------------------------------------------------------------
Philip Nel
ID=2 | title='Dr. Seuss: American Icon'
Categorias do livro: ['Philip Nel']
multi-hot: {'Julie Strain': np.int64(0), 'Philip Nel': np.int64(1), 'David R. Ray': np.int64(0), 'Veronica Haddon': np.int64(0), 'Edward Long': np.int64(0), 'Everett Ferguson': np.int64(0)}
------------------------------------------------------------
David R. Ray
ID=3 | title='W

In [8]:
publisher_frequency = crud.get_publisher_frequency(db)
print(len(publisher_frequency))
print(publisher_frequency[:10])

for pub in publisher_frequency:
    if pub[0] == 'None':
        publisher_frequency.remove(pub)
top_publishers = publisher_frequency[:TOP_K]

# Check multi-hot in some books
top_pub_names = [pub[0] for pub in top_publishers]
top_pub_ids = []
pub_index = {name[0]: i for i, name in enumerate(top_pub_names)}
def book_pub_vector(book):
    vec = np.zeros(len(top_publishers), dtype=int)
    
    name = book.publisher
    for pub in top_publishers:
        print(pub)
        if pub in pub_index:
            vec[pub_index[pub]] = 1
    return vec, name

for b in books[:10]:
    vec, names = book_pub_vector(b)
    print(f"ID={b.id} | title={b.title[:50]!r}")
    print("Categorias do livro:", names)
    print("multi-hot:", dict(zip(top_pub_names, vec)))
    print("-" * 60)



63
[('None', 33), ('Simon and Schuster', 3), ('Bloomsbury Publishing', 3), ('John Wiley & Sons', 2), ('iUniverse', 1), ('Yale University Press', 1), ('Wm. B. Eerdmans Publishing', 1), ('Wiley', 1), ('Vintage', 1), ('Veloce Publishing Ltd', 1)]
('Simon and Schuster', 3)
('Bloomsbury Publishing', 3)
('John Wiley & Sons', 2)
('iUniverse', 1)
('Yale University Press', 1)
('Wm. B. Eerdmans Publishing', 1)
ID=1 | title='Its Only Art If Its Well Hung!'
Categorias do livro: None
multi-hot: {'Simon and Schuster': np.int64(0), 'Bloomsbury Publishing': np.int64(0), 'John Wiley & Sons': np.int64(0), 'iUniverse': np.int64(0), 'Yale University Press': np.int64(0), 'Wm. B. Eerdmans Publishing': np.int64(0)}
------------------------------------------------------------
('Simon and Schuster', 3)
('Bloomsbury Publishing', 3)
('John Wiley & Sons', 2)
('iUniverse', 1)
('Yale University Press', 1)
('Wm. B. Eerdmans Publishing', 1)
ID=2 | title='Dr. Seuss: American Icon'
Categorias do livro: A&C Black
multi-

#### Save top categories

In [11]:
# Célula 7: salvar top_categories em JSON para ser usado pelo ContextFeatures

output_dir = os.path.join(project_root, "data", "embeddings")
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "item_config.json")

payload = {
    "top_categories": top_cat_ids,
    "top_authors": top_aut_ids,
    "top_publishers": top_pub_names,
}

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

output_path


'c:\\Users\\pedra\\OneDrive\\Documentos\\pfp\\recommender_mvp\\data\\embeddings\\item_config.json'

## Explore 

## Check Output

In [10]:
# Célula 8: testar leitura do JSON como o ContextFeatures fará

with open(output_path, "r", encoding="utf-8") as f:
    data_json = json.load(f)

loaded_cats = data_json.get("top_categories", [])
loaded_cats


[3, 4, 9, 2, 17, 7]