## Books database generator.

##### Packages needed:

In [1]:
import requests
import json
import pandas as pd
from itables import show
import numpy as np

##### Funció per obtenir els llibres:

In [135]:
# Función para obtener libros de ficción
def obtener_libros(num_paginas=150, genero = 'social'):
    libros = []
    for i in range(1, num_paginas + 1):  # Paginando a través de los resultados
        url = f'https://openlibrary.org/subjects/{genero}.json?page={i}'
        respuesta = requests.get(url)
        if respuesta.status_code == 200:
            datos = respuesta.json()
            libros.extend(datos['works'])  # Añadiendo libros a la lista
        else:
            print(f'Error: {respuesta.status_code}')
    return libros

##### Subgèneres i subsubgèneres que es busca:

In [136]:
# Create a dictionary of subgenres and subsubgenres
literary_genres = {
    "Realistic": ["social", "psychological", "manners"],
    "Historical": ["medieval", "ancient civilization", "world war"],
    "Romantic": ["contemporary romance", "historical romance", "paranormal romance"],
    "Science Fiction": ["space opera", "dystopian fiction", "cyberpunk"],
    "Fantasy": ["high fantasy", "low fantasy", "urban fantasy"],
    "Adventure": ["pirate", "travel", "exploration"]
}

# Print the dictionary
for subgenre, subsubgenres in literary_genres.items():
    print(f"{subgenre}: {', '.join(subsubgenres)}")


Realistic: social, psychological, manners
Historical: medieval, ancient civilization, world war
Romantic: contemporary romance, historical romance, paranormal romance
Science Fiction: space opera, dystopian fiction, cyberpunk
Fantasy: high fantasy, low fantasy, urban fantasy
Adventure: pirate, travel, exploration


In [137]:
libros = {}
for key in literary_genres.keys():
    for value in literary_genres[key]:
        print(key,value)
        libros[(key,value)] = obtener_libros(20,genero = value)
       


Realistic social
Realistic psychological
Realistic manners
Historical medieval
Historical ancient civilization
Historical world war
Romantic contemporary romance
Romantic historical romance
Romantic paranormal romance
Science Fiction space opera
Science Fiction dystopian fiction
Science Fiction cyberpunk
Fantasy high fantasy
Fantasy low fantasy
Fantasy urban fantasy
Adventure pirate
Adventure travel
Adventure exploration


##### Construcció del dataframe:

In [139]:
# Assuming books_dict is your dictionary
rows = []  # This list will hold all the rows

# Iterate through the dictionary
for genre_subgenre, books in libros.items():
    genre, subgenre = genre_subgenre  # Unpack the tuple into genre and subgenre

    # Iterate through each book in the list
    for book in books:
        # Create a row for each book
        row = {
            'genre': genre,
            'subgenre': subgenre,
            'key': book['key'],
            'title': book['title'],
            'edition_count': book['edition_count'],
            'cover_id': book['cover_id'],
            'cover_edition_key': book['cover_edition_key'],
            'subject': book['subject'],
            'ia_collection': book['ia_collection'],
            'lendinglibrary': book['lendinglibrary'],
            'printdisabled': book['printdisabled'],
            'lending_edition': book['lending_edition'],
            'lending_identifier': book['lending_identifier'],
            'authors': book['authors'],
            'first_publish_year': book['first_publish_year'],
            'ia': book['ia'],
            'public_scan': book['public_scan'],
            'has_fulltext': book['has_fulltext'],
        }
        rows.append(row)  # Append the row to the rows list


In [156]:
# Assuming rows is the list of dictionaries we created earlier
df = pd.DataFrame(rows)

##### Preprocessament del dataframe:

##### eliminar columnes que no aporten informació rellevant:

In [157]:
columns_to_drop = ['key','edition_count', 'cover_id', 'cover_edition_key', 'ia_collection','lendinglibrary',
                   'printdisabled','lending_edition', 'lending_identifier', 'ia', 'public_scan', 'subject']
df.drop(columns =columns_to_drop, inplace = True)

##### canviar format de la columna author:

In [158]:
# Assuming df is your DataFrame
def extract_author_name(authors_list):
    # Check if the list is not empty
    if authors_list:
        return authors_list[0].get('name', None)
    return None

# Apply the function to the 'authors' column
df['authors'] = df['authors'].apply(extract_author_name)

##### comprovar si hi ha llibres repetits:

In [159]:
# Checking and printing repeated titles
title_counts = df['title'].value_counts()
repeated_titles = title_counts[title_counts > 1]
print("Repeated Titles:")
print(repeated_titles)

# Retaining only one instance of each repeated title
df = df.drop_duplicates(subset='title', keep='first')

# Optionally, if you want to reset the index after dropping duplicates
df.reset_index(drop=True, inplace=True)

Repeated Titles:
The Magic School Bus                  40
Pride and Prejudice                   40
Zur Genealogie der Moral              20
The Lost Thorn                        20
Among the Barons (Shadow Children)    20
                                      ..
Sea Swept                             20
Chesapeake Blue                       20
Four Blondes                          20
Inner Harbor                          20
Universe                              20
Name: title, Length: 203, dtype: int64


##### Eliminar llibres amb títols que no segeueixen el format.

In [160]:
rows_to_drop = df[
    (df['title'] == 'Преступление и наказание') |
    (df['title'] == "Works (Awakening / Beyond the Bayou / Desiree's Baby / Kiss / Locket / Ma'ame Pelagie / Pair of Silk Stockings / Reflection / Respectable Woman)") |
    (df['title'] == "Novels (Emma / Mansfield Park / Northanger Abbey / Persuasion / Pride and Prejudice / Sense and Sensibility)") |
    (df['title'] == "Novels (Emma / Lady Susan / Mansfield Park / Northanger Abbey / Persuasion / Pride and Prejudice / Sense and Sensibility)") |
    (df['title'] == "Novels (Emma / Pride and Prejudice / Sense and Sensibility)") |
    (df['title'] == "Novels (Mansfield Park / Persuasion / Pride and Prejudice)")
].index
df = df.drop(rows_to_drop)

# Optionally, reset the index after dropping rows if needed
df.reset_index(drop=True, inplace=True)


##### comprovar si hi ha NA:

In [164]:
# Revisando si hay valores NA y eliminando las filas que los contengan
if df.isna().any().any():
    print("Hay valores NA en el DataFrame. Se eliminarán las filas correspondientes.")
    df = df.dropna()
else:
    print("No hay valores NA en el DataFrame.")

# Restablecer el índice después de eliminar las filas, si es necesario
df.reset_index(drop=True, inplace=True)



No hay valores NA en el DataFrame.


##### llibre que té un ; que no va bé per clips:

In [None]:
# Define a custom function to remove semicolons
def remove_semicolons(title):
    if title.startswith('Frankenstein'):
        return title.replace(';', '')
    else:
        return title

# Apply the custom function to the 'title' column
df['title'] = df['title'].apply(remove_semicolons)


##### es geneneren noves variables que segueixen una distribució normal per cada subgènere:

In [180]:
# Definir semillas para reproducibilidad
np.random.seed(23122003)

def generate_values(sub_df):
    # Aumenta la escala para obtener más variabilidad en 'Pages'. Ajusta loc y scale según tus necesidades.
    sub_df['Pages'] = np.random.normal(loc=350, scale=150, size=len(sub_df)).astype(int)
    
    # Asegurarte de que 'Pages' no tenga valores negativos
    sub_df['Pages'] = sub_df['Pages'].apply(lambda x: max(1, x)) # Asegurando un mínimo de 1 página
    
    sub_df['Rating (0-5)'] = np.random.normal(loc=3.5, scale=0.5, size=len(sub_df)).round(1)
    sub_df['Rating (0-5)'] = sub_df['Rating (0-5)'].clip(lower=0, upper=5)
    
    sub_df['Number of Characters'] = np.random.normal(loc=57.5, scale=20, size=len(sub_df)).astype(int)
    sub_df['Number of Characters'] = sub_df['Number of Characters'].clip(lower=15, upper=100)
    
    # Suponiendo un rango de 1 a 52 semanas.
    sub_df['Weeks on Best-Seller List'] = np.random.normal(loc=26, scale=15, size=len(sub_df)).astype(int)
    sub_df['Weeks on Best-Seller List'] = sub_df['Weeks on Best-Seller List'].clip(lower=1, upper=52) # Asegurando un rango de 1 a 52 semanas
    
    return sub_df

# Agrupar por subgénero y aplicar la función de generación de valores
df = df.groupby('subgenre').apply(generate_values).reset_index(drop=True)

##### S'afageix una columna del públic al qual està destinat el llibre.

In [5]:
publicos = ['Infantes', 'Adolescentes', 'Adulto-Joven', 'Adulto', 'Ancianos']

# Añadir una columna de estilo al DataFrame
df['Publico'] = np.random.choice(publicos, size=len(df))


##### Dataframe de autors:

In [3]:
df = pd.read_csv('llibres.csv')

In [6]:
autores_dict = df['authors'].value_counts().to_dict()

In [7]:
# Calculando métricas promedio por autor
author_group = df.groupby('authors')

# Calcula el promedio de las columnas numéricas
average_metrics = author_group[['Pages', 'Rating (0-5)', 'Number of Characters', 'minutes_to_read', 'Weeks on Best-Seller List', 'first_publish_year']].mean().round(2)

# Encuentra el género, subgénero y estilo de escritura más común por autor
common_genre = author_group['genre'].agg(lambda x:x.value_counts().index[0])
common_subgenre = author_group['subgenre'].agg(lambda x:x.value_counts().index[0])
common_writting_style = author_group['Estilo de Escritura'].agg(lambda x:x.value_counts().index[0])

# Crear un DataFrame de autores con la información calculada
authors_df = pd.concat([average_metrics, common_genre, common_subgenre, common_writting_style], axis=1)
authors_df.columns = ['Average Pages', 'Average Rating (0-5)', 'Average Number of Characters', 'Average Minutes to Read', 'Average Weeks on Best-Seller List','Publishing_year_average', 'Most Common Genre', 'Most Common Subgenre', 'Most Common Writting Style']

# Resetear el índice para tener 'authors' como una columna
authors_df.reset_index(inplace=True)


In [8]:
authors_df.to_csv('autors.csv')
df.to_csv('llibres.csv')