# Feature Exploration and Building

This notebook explores, offline, the books stored in the application database and creates the RL embeddings file with:
- Top Categories
- Top Authors
- Top Publishers

## 1. Setup and Imports

In [None]:
import os
import sys
from pathlib import Path


import json

import numpy as np
import pandas as pd


# Make the project importable: take the parent of the current working
# directory as the project root and put it first on sys.path.
# NOTE(review): assumes the kernel's working directory is exactly one level
# below the project root — confirm how the notebook is launched.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# The `app.*` imports below presumably rely on the sys.path entry added above.
from sqlalchemy.orm import Session
from app.db.database import SessionLocal
from app.db import crud
from app.utils.config import RECOMMENDER_CONFIG


# Echo the resolved root so a wrong working directory is easy to spot.
print(f"Project root: {project_root}")

Project root: c:\Users\pedra\OneDrive\Documentos\pfp\recommender_mvp


## 2. Load and Inspect Data

In [None]:
# Open a database session from the project's session factory.
# NOTE(review): the session is not closed in any of the visible cells.
db: Session = SessionLocal()

# Fetch the books through the CRUD layer. The meaning of the second argument
# (0) is not visible here — presumably an offset or a "no limit" value; confirm
# against crud.get_all_books.
books = crud.get_all_books(db, 0)

# Last expression of the cell: displays how many books were loaded.
len(books)

10000

## Inspect avg_rating and ratings_count

In [None]:
# One flat record per book, restricted to the columns inspected below.
data = [
    {
        field: getattr(book, field)
        for field in ("id", "title", "avg_rating", "ratings_count")
    }
    for book in books
]

# Summary statistics of the numeric columns are shown as the cell output.
df_books = pd.DataFrame(data)
df_books.describe()

Unnamed: 0,id,avg_rating,ratings_count
count,10000.0,10000.0,10000.0
mean,5000.5,4.262464,13.8723
std,2886.89568,0.824711,137.640452
min,1.0,0.0,1.0
25%,2500.75,4.0,1.0
50%,5000.5,4.5,2.0
75%,7500.25,5.0,5.0
max,10000.0,5.0,4610.0


## Explore Categories

In [None]:

def print_tuple(categories):
    """Print each ``(entity, count)`` pair and collect the entities' names and ids.

    Args:
        categories: Iterable of indexable pairs. Item 0 is an object exposing
            ``name`` and ``id`` attributes (or ``None``); item 1 is the value
            printed after the name.

    Returns:
        A ``(names, ids)`` tuple of two parallel lists, in input order.
        Pairs whose entity is ``None`` are skipped: nothing is printed and
        nothing is collected for them.
    """
    names = []
    ids = []
    for pair in categories:
        entity, count = pair[0], pair[1]
        # Check for a missing entity *before* touching its attributes;
        # reading ``.name`` on None would raise AttributeError.
        if entity is None:
            continue
        print(f"{entity.name}: {count}")
        names.append(entity.name)
        ids.append(entity.id)
    return names, ids


# Query the category/count pairs through the CRUD layer and show how many there are.
category_frequency = crud.get_categories_frequency(db)
print(len(category_frequency))
# Print the first ten pairs; as the cell's last expression, the returned
# (names, ids) tuple is displayed as well.
print_tuple(category_frequency[:10])

1438
Fiction: 1212
Religion: 505
History: 491
Business & Economics: 342
Biography & Autobiography: 329
Computers: 315
Juvenile Fiction: 306
Social Science: 196
Juvenile Nonfiction: 166
Education: 138


(['Fiction',
  'Religion',
  'History',
  'Business & Economics',
  'Biography & Autobiography',
  'Computers',
  'Juvenile Fiction',
  'Social Science',
  'Juvenile Nonfiction',
  'Education'],
 [4, 3, 9, 40, 2, 41, 24, 5, 7, 67])

### Choose the TOP_K most frequent categories

In [None]:
# Number of top entries to keep, read from the "feature_dim" entry of the recommender configuration.
TOP_K = RECOMMENDER_CONFIG["feature_dim"]
# Keep the first TOP_K pairs. The printed counts suggest the list is ordered
# from most to least frequent — confirm in crud.get_categories_frequency.
top_categories = category_frequency[:TOP_K]
# Print them and keep the parallel name/id lists for the following cells.
top_cat_names, top_cat_ids = print_tuple(top_categories)

Fiction: 1212
Religion: 505
History: 491
Business & Economics: 342
Biography & Autobiography: 329
Computers: 315
Juvenile Fiction: 306
Social Science: 196
Juvenile Nonfiction: 166
Education: 138


### Check multi-hot Categories in some books

In [None]:
# Map each top category name to its position in the multi-hot vector.
cat_index = {name: i for i, name in enumerate(top_cat_names)}


def book_cat_vector(book):
    """Build the multi-hot category vector for one book.

    The vector has one integer slot per entry of ``top_categories``; a slot is
    set to 1 when the book carries the category mapped to it in ``cat_index``.
    Every category name of the book is printed while it is processed.

    Returns:
        ``(vector, names)`` where ``names`` is ``book.get_categories_list``.
    """
    category_names = book.get_categories_list
    multi_hot = np.zeros(len(top_categories), dtype=int)
    for category in category_names:
        print(category)
        slot = cat_index.get(category)
        if slot is not None:
            multi_hot[slot] = 1
    return multi_hot, category_names


# Sanity check on the first ten books: show each book's categories next to
# the multi-hot vector, labelled with the top category names.
for b in books[:10]:
    vec, names = book_cat_vector(b)
    # Titles are truncated to 50 characters to keep the output readable.
    print(f"ID={b.id} | title={b.title[:50]!r}")
    print("Categorias do livro:", names)
    print("multi-hot:", dict(zip(top_cat_names, vec)))
    print("-" * 60)

Comics & Graphic Novels
ID=1 | title='Its Only Art If Its Well Hung!'
Categorias do livro: ['Comics & Graphic Novels']
multi-hot: {'Fiction': np.int64(0), 'Religion': np.int64(0), 'History': np.int64(0), 'Business & Economics': np.int64(0), 'Biography & Autobiography': np.int64(0), 'Computers': np.int64(0), 'Juvenile Fiction': np.int64(0), 'Social Science': np.int64(0), 'Juvenile Nonfiction': np.int64(0), 'Education': np.int64(0)}
------------------------------------------------------------
Biography & Autobiography
ID=2 | title='Dr. Seuss: American Icon'
Categorias do livro: ['Biography & Autobiography']
multi-hot: {'Fiction': np.int64(0), 'Religion': np.int64(0), 'History': np.int64(0), 'Business & Economics': np.int64(0), 'Biography & Autobiography': np.int64(1), 'Computers': np.int64(0), 'Juvenile Fiction': np.int64(0), 'Social Science': np.int64(0), 'Juvenile Nonfiction': np.int64(0), 'Education': np.int64(0)}
------------------------------------------------------------
Religion
I

### Check multi-hot Authors in some books

In [None]:
# Query the author/count pairs through the CRUD layer, show how many there
# are, and print the first ten.
author_frequency = crud.get_authors_frequency(db)
print(len(author_frequency))
print_tuple(author_frequency[:10])


# Keep the first TOP_K authors; print_tuple prints them again and returns the
# parallel name/id lists.
top_authors = author_frequency[:TOP_K]
top_aut_names, top_aut_ids = print_tuple(top_authors)

# Map each top author name to its position in the multi-hot vector.
aut_index = {name: i for i, name in enumerate(top_aut_names)}


def book_aut_vector(book):
    """Build the multi-hot author vector for one book.

    The vector has one integer slot per entry of ``top_authors``; a slot is
    set to 1 when the book lists the author mapped to it in ``aut_index``.
    Every author name of the book is printed while it is processed.

    Returns:
        ``(vector, names)`` where ``names`` is ``book.get_authors_list``.
    """
    author_names = book.get_authors_list
    multi_hot = np.zeros(len(top_authors), dtype=int)
    for author in author_names:
        print(author)
        slot = aut_index.get(author)
        if slot is not None:
            multi_hot[slot] = 1
    return multi_hot, author_names


# Sanity check on the first ten books: show each book's authors next to the
# multi-hot vector, labelled with the top author names.
for b in books[:10]:
    vec, names = book_aut_vector(b)
    # Titles are truncated to 50 characters to keep the output readable.
    print(f"ID={b.id} | title={b.title[:50]!r}")
    print("Autores do livro:", names)
    print("multi-hot:", dict(zip(top_aut_names, vec)))
    print("-" * 60)

10880
Rose Arny: 15
Thomas Hardy: 10
William Shakespeare: 9
Microsoft Corporation: 9
Library of Congress. Copyright Office: 8
Agatha Christie: 8
Isaac Asimov: 8
DK: 7
Erle Stanley Gardner: 7
Georges Simenon: 7
Rose Arny: 15
Thomas Hardy: 10
William Shakespeare: 9
Microsoft Corporation: 9
Library of Congress. Copyright Office: 8
Agatha Christie: 8
Isaac Asimov: 8
DK: 7
Erle Stanley Gardner: 7
Georges Simenon: 7
Julie Strain
ID=1 | title='Its Only Art If Its Well Hung!'
Categorias do livro: ['Julie Strain']
multi-hot: {'Rose Arny': np.int64(0), 'Thomas Hardy': np.int64(0), 'William Shakespeare': np.int64(0), 'Microsoft Corporation': np.int64(0), 'Library of Congress. Copyright Office': np.int64(0), 'Agatha Christie': np.int64(0), 'Isaac Asimov': np.int64(0), 'DK': np.int64(0), 'Erle Stanley Gardner': np.int64(0), 'Georges Simenon': np.int64(0)}
------------------------------------------------------------
Philip Nel
ID=2 | title='Dr. Seuss: American Icon'
Categorias do livro: ['Philip Nel

### Check multi-hot publishers in some books

In [None]:
# Query the (publisher name, count) pairs through the CRUD layer and preview them.
publisher_frequency = crud.get_publisher_frequency(db)
print(len(publisher_frequency))
print(publisher_frequency[:10])

# Drop the placeholder entry for books without a publisher (it shows up as the
# string "None" in the preview above). A new list is built instead of calling
# .remove() inside the loop: removing while iterating skips the element that
# follows each removed one.
publisher_frequency = [
    pub for pub in publisher_frequency if pub[0] not in (None, "None")
]
top_publishers = publisher_frequency[:TOP_K]

top_pub_names = [pub[0] for pub in top_publishers]
top_pub_ids = []
# Key the index on the full publisher name. top_pub_names already holds plain
# strings, so indexing an element with [0] would yield only its first character.
pub_index = {name: i for i, name in enumerate(top_pub_names)}


def book_pub_vector(book, index=None):
    """Build the one-hot publisher vector for one book.

    Args:
        book: Object exposing a ``publisher`` attribute (a name, or ``None``).
        index: Optional mapping from publisher name to vector position. When
            omitted, the notebook-level ``pub_index`` is used and the vector
            has one slot per entry of ``top_publishers``.

    Returns:
        ``(vector, name)`` where ``vector`` is an integer array with a 1 in
        the slot of the book's publisher (all zeros when the publisher is not
        in the index) and ``name`` is ``book.publisher`` unchanged.
    """
    if index is None:
        index = pub_index
        size = len(top_publishers)
    else:
        size = len(index)
    vec = np.zeros(size, dtype=int)

    # Look up the book's own publisher; iterating over the top publishers
    # themselves would never consult the book at all.
    name = book.publisher
    slot = index.get(name)
    if slot is not None:
        vec[slot] = 1
    return vec, name


# Sanity check on the first ten books: show each book's publisher next to the
# one-hot vector, labelled with the top publisher names.
for b in books[:10]:
    vec, names = book_pub_vector(b)
    # Titles are truncated to 50 characters to keep the output readable.
    print(f"ID={b.id} | title={b.title[:50]!r}")
    # The value shown here is the book's publisher, so label it as such.
    print("Editora do livro:", names)
    print("multi-hot:", dict(zip(top_pub_names, vec)))
    print("-" * 60)

2444
[('None', 2983), ('Simon and Schuster', 152), ('Routledge', 137), ('John Wiley & Sons', 126), ('Penguin', 102), ('Hachette UK', 88), ('Cambridge University Press', 88), ('Courier Corporation', 80), ('Harper Collins', 76), ('Random House', 67)]
('Simon and Schuster', 152)
('Routledge', 137)
('John Wiley & Sons', 126)
('Penguin', 102)
('Hachette UK', 88)
('Cambridge University Press', 88)
('Courier Corporation', 80)
('Harper Collins', 76)
('Random House', 67)
('iUniverse', 57)
ID=1 | title='Its Only Art If Its Well Hung!'
Categorias do livro: None
multi-hot: {'Simon and Schuster': np.int64(0), 'Routledge': np.int64(0), 'John Wiley & Sons': np.int64(0), 'Penguin': np.int64(0), 'Hachette UK': np.int64(0), 'Cambridge University Press': np.int64(0), 'Courier Corporation': np.int64(0), 'Harper Collins': np.int64(0), 'Random House': np.int64(0), 'iUniverse': np.int64(0)}
------------------------------------------------------------
('Simon and Schuster', 152)
('Routledge', 137)
('John Wile

## Save top categories, authors, and publishers

In [None]:
# Destination: <project_root>/data/embeddings/item_config.json; the directory
# is created if it does not exist yet.
output_dir = os.path.join(project_root, "data", "embeddings")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "item_config.json")

# Top category ids, top author ids and top publisher names selected above.
payload = dict(
    top_categories_ids=top_cat_ids,
    top_authors_ids=top_aut_ids,
    top_publishers=top_pub_names,
)

# Serialise first, then write the text in one go (UTF-8, non-ASCII kept as-is).
with open(output_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(payload, ensure_ascii=False, indent=2))

# Last expression of the cell: displays where the file was written.
output_path

'c:\\Users\\pedra\\OneDrive\\Documentos\\pfp\\recommender_mvp\\data\\embeddings\\item_config.json'