## Individual Challenge - Rich Data Visualizations & Insights of Books Sales and Reviews
#### By Mariana Tomás, nr 60421

The packages required for this analysis are imported below, followed by the cleaned datasets:

In [24]:
import pandas as pd
import numpy as np
import dash
from dash import html, dcc, Input, Output
import plotly.express as px

In [25]:
# Clean datasets, listed in the order df1..df7 as (path, extra read_csv options).
# low_memory=False is passed only for the Goodreads metadata and BookCrossing files.
_clean_csv_sources = [
    ("Datasets/1_amazon/amazon_meta_clean.csv", {}),
    ("Datasets/2_3_goodreads/goodreads_meta_books_clean.csv", {"low_memory": False}),
    ("Datasets/2_3_goodreads/goodReads_2019_2020_fixed_clean.csv", {}),
    ("Datasets/4_bookcrossing/book_crossing_Books_clean.csv", {"low_memory": False}),
    ("Datasets/5_sales_N_ratings/Books_Data_Clean_Clean.csv", {}),
    ("Datasets/6_kindle/kindle_data_clean.csv", {}),
    ("Datasets/7_wonderbk/wonderbooks_clean.csv", {}),
]
df1, df2, df3, df4, df5, df6, df7 = (
    pd.read_csv(csv_path, **read_options)
    for csv_path, read_options in _clean_csv_sources
)

I needed to pass `low_memory=False` when reading datasets 2 and 4 because two of the columns in these datasets have data type issues, i.e. these columns contain multiple data types instead of just one. With this option pandas reads the whole file before inferring each column's type, which avoids mixed-type errors later on. The data types are always corrected before any dataset is used.

### Dataset 1 - Amazon Reviews 2023 


In [26]:
# Trim surrounding whitespace from the column names
df1.columns = df1.columns.str.strip()

# Convert data types: values that cannot be parsed become NaN / NaT
for _numeric_col in ('avg_rating', 'num_ratings', 'price'):
    df1[_numeric_col] = pd.to_numeric(df1[_numeric_col], errors='coerce')
df1['publication_date'] = pd.to_datetime(df1['publication_date'], errors='coerce')
# Publication year taken from the parsed date
df1['year'] = df1['publication_date'].dt.year

df1.columns

Index(['main_category', 'title', 'subtitle', 'avg_rating', 'num_ratings',
       'features', 'description', 'price', 'images', 'videos', 'store',
       'categories', 'parent_asin', 'author_name', 'author_bio',
       'author_avatar', 'product_details', 'language', 'dimensions_raw',
       'publisher_raw', 'item_weight_raw', 'isbn_13', 'isbn_10',
       'publication_date', 'length', 'width', 'height', 'weight_oz',
       'lang_iso', 'publisher', 'edition', 'year'],
      dtype='object')

#### Interactive Dashboard

In [27]:
# Initialise the Dash app with an inline stylesheet and suppress_callback_exceptions.
# The .sidebar hover rule is marked !important because the sidebar sets
# 'opacity' as an inline style (see sidebar_style below), and an inline style
# takes precedence over a normal stylesheet rule; without !important the hover
# rule has no visible effect.
app1 = dash.Dash(
    __name__,
    external_stylesheets=['data:text/css;charset=utf-8,.sidebar:hover { opacity: 1 !important; } .button:hover { opacity: 0.8; }'],
    suppress_callback_exceptions=True
)

# Base page style
style = {
    'fontFamily': 'Arial',
    'padding': '10px',
    'backgroundColor': '#f8f9fa',
    'color': '#333'
}

# Sidebar style: semi-transparent panel fixed at the left of the page
sidebar_style = {
    'width': '20%',
    'backgroundColor': '#e9ecef',
    'padding': '10px',
    'borderRadius': '5px',
    'opacity': '0.7',
    'transition': 'opacity 0.3s',
    'position': 'fixed',
    'left': '10px',
    'top': '150px'
}

# Button style
button_style = {
    'backgroundColor': 'darkgreen',
    'color': 'white',
    'border': 'none',
    'padding': '10px 20px',
    'margin': '5px',
    'borderRadius': '5px',
    'cursor': 'pointer',
    'fontSize': '14px'
}

# App layout: title, a row of view buttons, and a container for the selected view
app1.layout = html.Div([
    html.H1("Analysis of Amazon Reviews’ 2023 Dataset", style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'}),

    # Buttons at the top
    html.Div([
        html.Button("Books Distribution", id='button-1', style=button_style, className='button'),
        html.Button("Reviews Comparations", id='button-2', style=button_style, className='button'),
        html.Button("Rating Correlation Heatmap", id='button-3', style=button_style, className='button'),
        html.Button("Books per Year", id='button-4', style=button_style, className='button'),
        html.Button("Format per Year", id='button-5', style=button_style, className='button')
    ], style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'}),

    # Dynamic content, filled in by the update_content callback
    html.Div(id='content')
], style=style)

# Callback that swaps the dashboard content when one of the top buttons is clicked
@app1.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-2', 'n_clicks'), Input('button-3', 'n_clicks'), 
     Input('button-4', 'n_clicks'), Input('button-5', 'n_clicks')]
)
def update_content(btn1, btn2, btn3, btn4, btn5):
    """Return the sidebar controls and the graph placeholder for the selected view.

    The click counters are not read; the clicked button is taken from
    dash.callback_context. When nothing has triggered the callback,
    'button-1' is used.

    Returns:
        A two-element list: the sidebar Div and the dcc.Graph of the view.
    """
    plain_label = {'fontSize': '14px'}
    spaced_label = {'fontSize': '14px', 'marginTop': '10px'}

    def y_axis_controls(dropdown_id, options, default):
        # Label plus dropdown used to pick the plotted variable
        return [
            html.Label("Y-Axis:", style=plain_label),
            dcc.Dropdown(
                id=dropdown_id,
                options=options,
                value=default,
                style={'width': '100%'}
            ),
        ]

    def duplicate_controls(dropdown_id, label_style):
        # Label plus dropdown used to pick how duplicates are handled
        return [
            html.Label("Count Duplicates:", style=label_style),
            dcc.Dropdown(
                id=dropdown_id,
                options=[
                    {'label': 'With Duplicates', 'value': 'with'},
                    {'label': 'Without Duplicates (Title)', 'value': 'title'},
                    {'label': 'Without Duplicates (ISBN-10)', 'value': 'isbn_10'},
                    {'label': 'Without Duplicates (ISBN-13)', 'value': 'isbn_13'}
                ],
                value='with',
                style={'width': '100%'}
            ),
        ]

    fired = dash.callback_context.triggered
    button_id = fired[0]['prop_id'].split('.')[0] if fired else 'button-1'

    if button_id == 'button-1':
        chart_id = 'chart-1'
        controls = y_axis_controls(
            'y-axis-filter-1',
            [
                {'label': 'Author', 'value': 'author_name'},
                {'label': 'Publisher', 'value': 'publisher'},
                {'label': 'Format', 'value': 'main_category'},
                {'label': 'Language', 'value': 'lang_iso'}
            ],
            'author_name'
        ) + duplicate_controls('duplicate-filter-1', spaced_label)
    elif button_id == 'button-2':
        chart_id = 'chart-2'
        controls = y_axis_controls(
            'y-axis-filter-2',
            [
                {'label': 'Number of Books', 'value': 'count_books'},
                {'label': 'Number of Ratings', 'value': 'num_ratings'},
                {'label': 'Price', 'value': 'price'},
                {'label': 'Year', 'value': 'year'}
            ],
            'count_books'
        ) + duplicate_controls('duplicate-filter-2', spaced_label)
    elif button_id == 'button-3':  # Rating Correlation Heatmap: empty sidebar
        chart_id = 'chart-3'
        controls = []
    elif button_id == 'button-4':
        chart_id = 'chart-4'
        controls = duplicate_controls('duplicate-filter-4', plain_label)
    else:  # button-5 (Format per Year)
        chart_id = 'chart-5'
        controls = duplicate_controls('duplicate-filter-5', plain_label)

    return [
        html.Div(controls, style=sidebar_style, className='sidebar'),
        dcc.Graph(id=chart_id, style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})
    ]

# Callbacks that update the charts
# Chart 1
@app1.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Build a bar chart of the most frequent values of one df1 column.

    Args:
        y_axis: Name of the df1 column whose values are counted (the value of
            'y-axis-filter-1').
        duplicate_option: 'title', 'isbn_10' or 'isbn_13' to drop rows that are
            duplicated on that column before counting; any other value keeps
            every row.

    Returns:
        A Plotly bar figure with the counts on the x-axis and the column
        values on the y-axis, or dash.no_update when no column is selected.
    """
    # A cleared dropdown sends None; keep the current figure instead of
    # raising KeyError on df1[None].
    if y_axis is None:
        return dash.no_update

    # drop_duplicates returns a new frame and nothing below writes to the
    # data, so df1 is used directly without copying it on every callback.
    filtered_df = df1
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        # NOTE(review): rows with a missing value in this column are treated
        # as duplicates of each other by drop_duplicates - confirm this is wanted.
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Count occurrences; value_counts sorts by descending frequency
    data = filtered_df[y_axis].value_counts().reset_index()
    data.columns = [y_axis, 'count']

    # Keep only the ten most frequent categories
    if y_axis in ['main_category', 'author_name', 'lang_iso', 'publisher']:
        data = data.head(10)

    # Readable names, matching the dropdown labels
    y_axis_labels = {
        'author_name': 'Author',
        'publisher': 'Publisher',
        'main_category': 'Format',
        'lang_iso': 'Language'
    }

    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    fig = px.bar(
        data,
        x='count',
        y=y_axis,
        title=f'{y_label} by Number of Books',
        color_discrete_sequence=['darkgreen'],
        labels={'count': 'Number of Books', y_axis: y_label}
    )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 2
@app1.callback(
    Output('chart-2', 'figure'),
    [Input('y-axis-filter-2', 'value'), Input('duplicate-filter-2', 'value')]
)
def update_chart_2(y_axis, duplicate_option):
    """Plot a book metric against the average rating.

    Rows without an average rating, and rows whose year is missing or later
    than 2024, are excluded for every metric.

    Args:
        y_axis: 'count_books' (number of books per rating, line chart),
            'num_ratings' (mean number of ratings per rating, line chart) or
            'price' (scatter); any other value plots the year (scatter).
        duplicate_option: 'title', 'isbn_10' or 'isbn_13' to drop rows that are
            duplicated on that column first; any other value keeps every row.

    Returns:
        A Plotly figure, a placeholder figure carrying a message when no rows
        remain, or dash.no_update when no metric is selected.
    """
    def message_figure(title, message):
        # Empty figure with hidden axes and a centred annotation
        placeholder = px.scatter(title=title)
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # A cleared dropdown sends None; without this guard the label lookup
    # below fails because its default, y_axis.replace(...), is always evaluated.
    if y_axis is None:
        return dash.no_update

    # Every step below returns a new frame, so df1 itself is never modified
    filtered_df = df1
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # A missing year compares False against 2024, so those rows are dropped too
    filtered_df = filtered_df[filtered_df['avg_rating'].notna() & (filtered_df['year'] <= 2024)]

    if filtered_df.empty:
        return message_figure('No data available for the selected filters', "No data available after filtering.")

    y_axis_labels = {
        'count_books': 'Number of Books',
        'num_ratings': 'Number of Ratings',
        'price': 'Price',
        'year': 'Year'
    }

    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    if y_axis == 'count_books':
        # filtered_df is non-empty and has no missing ratings here, so the
        # grouped result always has at least one row.
        data = filtered_df.groupby('avg_rating').size().reset_index(name='count')
        fig = px.line(
            data,
            x='avg_rating',
            y='count',
            title=f'{y_label} by Average Rating',
            markers=True,
            color_discrete_sequence=['darkgreen'],
            labels={'avg_rating': 'Average Rating', 'count': y_label}
        )
    elif y_axis == 'num_ratings':
        data = filtered_df.groupby('avg_rating')[y_axis].mean().reset_index()
        fig = px.line(
            data,
            x='avg_rating',
            y=y_axis,
            title=f'{y_label} by Average Rating',
            markers=True,
            color_discrete_sequence=['darkgreen'],
            labels={'avg_rating': 'Average Rating', y_axis: y_label}
        )
    else:
        if y_axis == 'price':
            if 'price' not in filtered_df.columns:
                return message_figure(
                    'Error: Price column not found in the dataset',
                    "Price data not available. Please check the dataset columns."
                )
            column = 'price'
            empty_message = "No valid prices available after filtering."
        else:
            column = 'year'
            empty_message = "No valid years available after filtering."

        # One point per book that has both values
        data = filtered_df[['avg_rating', column]].dropna()
        if data.empty:
            return message_figure('No data available for the selected filters', empty_message)
        fig = px.scatter(
            data,
            x='avg_rating',
            y=column,
            title=f'{y_label} by Average Rating',
            color_discrete_sequence=['darkgreen'],
            labels={'avg_rating': 'Average Rating', column: y_label},
            opacity=0.6
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 3
@app1.callback(
    Output('chart-3', 'figure'),
    [Input('button-3', 'n_clicks')]
)
def update_chart_3(n_clicks):
    """Build the correlation heatmap of the numeric df1 columns.

    For each variable (average rating, number of ratings, year, price) the
    first candidate column name present in df1 is used. Rows with a missing
    value in any selected column are dropped before the correlation is
    computed. n_clicks is not read.

    Returns:
        A Plotly heatmap figure, or a placeholder figure carrying an error
        message when no column or no complete row is available.
    """
    def error_figure(title, message):
        # Empty figure with hidden axes and a centred annotation
        placeholder = px.scatter(title=title)
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # Candidate column names per variable, in order of preference
    candidate_names = (
        ('avg_rating', 'average_rating'),
        ('num_ratings', 'ratings_count'),
        ('year', 'publication_year'),
        ('price',),
    )
    present = df1.columns
    numeric_cols = []
    for names in candidate_names:
        found = [name for name in names if name in present]
        if found:
            numeric_cols.append(found[0])

    if not numeric_cols:
        return error_figure(
            'Error: No numeric columns available for correlation.',
            "Please check the dataset columns (e.g., avg_rating, num_ratings, year, price)."
        )

    df1_numeric = df1[numeric_cols].dropna()

    if df1_numeric.empty:
        return error_figure(
            'Error: No valid data after removing NaN values.',
            "All rows contain NaN values for the selected columns."
        )

    heatmap = px.imshow(
        df1_numeric.corr(),
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title='Correlation of Average Rating with Other Variables'
    )

    heatmap.update_layout(
        margin={'l': 40, 'r': 40, 't': 60, 'b': 40},
        font={'size': 12},
        coloraxis_colorbar_title="Correlation"
    )

    return heatmap

# Chart 4
@app1.callback(
    Output('chart-4', 'figure'),
    [Input('duplicate-filter-4', 'value')]
)
def update_chart_4(duplicate_option):
    """Build the line chart of the number of books per year (2000 to 2024).

    Args:
        duplicate_option: 'title', 'isbn_10' or 'isbn_13' to drop rows that are
            duplicated on that column (after the year filter); any other
            value keeps every row.

    Returns:
        A Plotly line figure, or a placeholder figure carrying a message when
        no rows remain.
    """
    books = df1.copy()

    # Keep rows with a known year between 2000 and 2024
    has_year = books['year'].notna()
    in_range = (books['year'] >= 2000) & (books['year'] <= 2024)
    books = books[has_year & in_range]

    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        books = books.drop_duplicates(subset=[duplicate_option])

    per_year = books.groupby('year').size().reset_index(name='count')
    if per_year.empty:
        placeholder = px.scatter(title='No data available for the selected filters')
        placeholder.update_layout(
            annotations=[
                dict(text="No data available after filtering.", x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    x_label, y_label = 'Year', 'Number of Books'

    line_chart = px.line(
        per_year,
        x='year',
        y='count',
        title=f'{y_label} by {x_label}',
        markers=True,
        color_discrete_sequence=['darkgreen'],
        labels={'year': x_label, 'count': y_label}
    )

    line_chart.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return line_chart

# Chart 5
@app1.callback(
    Output('chart-5', 'figure'),
    [Input('duplicate-filter-5', 'value')]
)
def update_chart_5(duplicate_option):
    """Build the bar chart of book counts per year, coloured by format.

    Only rows with a format (main_category) and a year between 2000 and 2024
    are used. Format names are lower-cased and stripped before counting.

    Args:
        duplicate_option: 'title', 'isbn_10' or 'isbn_13' to drop rows that are
            duplicated on that column (after the filters above); any other
            value keeps every row.

    Returns:
        A Plotly bar figure, or a placeholder figure carrying a message when
        no rows remain.
    """
    # Exclude rows with a null year or main_category and restrict the years
    # to 2000-2024 (a missing year fails both comparisons).
    in_scope = (
        df1['main_category'].notna()
        & (df1['year'] >= 2000)
        & (df1['year'] <= 2024)
    )
    # Take an explicit copy of the selection so the column assignment below
    # acts on an independent frame; assigning on a filtered selection is the
    # pattern that triggers pandas' SettingWithCopyWarning.
    filtered_df = df1[in_scope].copy()

    # Standardise main_category to lowercase without surrounding spaces.
    # Done before de-duplication; the result is the same because
    # main_category is never a de-duplication key.
    filtered_df['main_category'] = filtered_df['main_category'].str.lower().str.strip()

    # Handle duplicates
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Prepare data: count formats per year
    data = filtered_df.groupby(['year', 'main_category']).size().reset_index(name='count')

    if data.empty:
        fig = px.scatter(title='No data available for Format per Year')
        fig.update_layout(
            annotations=[
                dict(text="No data available after filtering.", x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return fig

    # Diagnostic: report when the 'audiobooks' format is missing from the data
    if 'audiobooks' not in data['main_category'].values:
        print("Aviso: 'audiobooks' não encontrado nos dados após filtragem. Verifique o dataset.")

    # Create the bar chart
    fig = px.bar(
        data,
        x='year',
        y='count',
        color='main_category',
        title='Type of Format (counts) by Year',
        labels={'year': 'Year', 'count': 'Number of Books', 'main_category': 'Format'},
        color_discrete_sequence=px.colors.qualitative.Set1
    )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    return fig

# Run the app (debug mode, port 8050) when this code is executed as the main module
if __name__ == '__main__':
    app1.run(debug=True, port=8050)

### Dataset 2 - Goodreads 2017

In [28]:
# Trim surrounding whitespace from the column names
df2.columns = df2.columns.str.strip()

# Convert types: values that cannot be parsed become NaN / NaT
for _numeric_col in ('avg_rating', 'num_pages', 'ratings_count', 'text_reviews_count'):
    df2[_numeric_col] = pd.to_numeric(df2[_numeric_col], errors='coerce')
df2['publication_date'] = pd.to_datetime(df2['publication_date'], errors='coerce')
df2['year'] = df2['publication_date'].dt.year


def _parse_ebook_flag(value):
    """Return True only when *value* explicitly marks an e-book.

    A plain astype(bool) turns every non-empty string (including "false")
    and every missing value into True. Here text is compared against the
    usual true-like spellings, missing values count as False, and any other
    value keeps its normal truthiness.
    """
    if isinstance(value, str):
        return value.strip().lower() in ('true', 't', 'yes', 'y', '1')
    if pd.isna(value):
        return False
    return bool(value)


df2['is_ebook'] = df2['is_ebook'].map(_parse_ebook_flag).astype(bool)

df2.columns


Index(['title_without_series', 'title', 'work_id', 'book_id',
       'publication_year', 'num_pages', 'ratings_count', 'kindle_asin',
       'publisher_raw', 'authors', 'format_raw', 'country_code_raw', 'series',
       'avg_rating', 'similar_books', 'image_url', 'isbn_13', 'is_ebook',
       'text_reviews_count', 'language_raw', 'description', 'link', 'url',
       'child_asin', 'popular_shelves', 'edition', 'isbn_10',
       'publication_day', 'publication_month', 'publication_date',
       'publication_estimate', 'publisher', 'lang_iso', 'standard_format',
       'year'],
      dtype='object')

#### Interactive Dashboard

In [29]:
# Initialise the Dash app with an inline stylesheet and suppress_callback_exceptions.
# The .sidebar hover rule is marked !important because the sidebar sets
# 'opacity' as an inline style (see sidebar_style below), and an inline style
# takes precedence over a normal stylesheet rule; without !important the hover
# rule has no visible effect.
app2 = dash.Dash(
    __name__,
    external_stylesheets=['data:text/css;charset=utf-8,.sidebar:hover { opacity: 1 !important; } .button:hover { opacity: 0.8; }'],
    suppress_callback_exceptions=True
)

# Base page style
style = {
    'fontFamily': 'Arial',
    'padding': '10px',
    'backgroundColor': '#f8f9fa',
    'color': '#333'
}

# Sidebar style: semi-transparent panel fixed at the left of the page
sidebar_style = {
    'width': '20%',
    'backgroundColor': '#e9ecef',
    'padding': '10px',
    'borderRadius': '5px',
    'opacity': '0.7',
    'transition': 'opacity 0.3s',
    'position': 'fixed',
    'left': '10px',
    'top': '150px'  # Adjusted from 100px to 150px
}

# Button style
button_style = {
    'backgroundColor': 'teal',
    'color': 'white',
    'border': 'none',
    'padding': '10px 20px',
    'margin': '5px',
    'borderRadius': '5px',
    'cursor': 'pointer',
    'fontSize': '14px'
}

# App layout: title, a row of view buttons, and a container for the selected view
app2.layout = html.Div([
    html.H1("Analysis of Goodreads - 2017 Dataset", style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'}),

    # Buttons at the top
    html.Div([
        html.Button("Books Distribution", id='button-1', style=button_style, className='button'),
        html.Button("Reviews Comparations", id='button-2', style=button_style, className='button'),
        html.Button("Rating Correlation Heatmap", id='button-3', style=button_style, className='button'),  # Swapped with button 4
        html.Button("Books per Year", id='button-4', style=button_style, className='button'),  # Swapped with button 3
        html.Button("Format per Year", id='button-5', style=button_style, className='button')
    ], style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'}),

    # Dynamic content, filled in by the update_content callback
    html.Div(id='content')
], style=style)

# Callback that swaps the dashboard content when one of the top buttons is clicked
@app2.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-2', 'n_clicks'), Input('button-3', 'n_clicks'), 
     Input('button-4', 'n_clicks'), Input('button-5', 'n_clicks')]
)
def update_content(btn1, btn2, btn3, btn4, btn5):
    """Return the sidebar controls and the graph placeholder for the selected view.

    The click counters are not read; the clicked button is taken from
    dash.callback_context. When nothing has triggered the callback,
    'button-1' is used.

    Returns:
        A two-element list: the sidebar Div and the dcc.Graph of the view.
    """
    plain_label = {'fontSize': '14px'}
    spaced_label = {'fontSize': '14px', 'marginTop': '10px'}

    def y_axis_controls(dropdown_id, options, default):
        # Label plus dropdown used to pick the plotted variable
        return [
            html.Label("Y-Axis:", style=plain_label),
            dcc.Dropdown(
                id=dropdown_id,
                options=options,
                value=default,
                style={'width': '100%'}
            ),
        ]

    def duplicate_controls(dropdown_id, label_style):
        # Label plus dropdown used to pick how duplicates are handled
        return [
            html.Label("Count Duplicates:", style=label_style),
            dcc.Dropdown(
                id=dropdown_id,
                options=[
                    {'label': 'With Duplicates', 'value': 'with'},
                    {'label': 'Without Duplicates (Title)', 'value': 'title'},
                    {'label': 'Without Duplicates (ISBN-10)', 'value': 'isbn_10'},
                    {'label': 'Without Duplicates (ISBN-13)', 'value': 'isbn_13'}
                ],
                value='with',
                style={'width': '100%'}
            ),
        ]

    fired = dash.callback_context.triggered
    button_id = fired[0]['prop_id'].split('.')[0] if fired else 'button-1'

    if button_id == 'button-1':
        chart_id = 'chart-1'
        controls = y_axis_controls(
            'y-axis-filter-1',
            [
                {'label': 'Publisher', 'value': 'publisher'},
                {'label': 'Format', 'value': 'standard_format'},
                {'label': 'Language', 'value': 'lang_iso'},
                {'label': 'Is eBook', 'value': 'is_ebook'}
            ],
            'publisher'
        ) + duplicate_controls('duplicate-filter-1', spaced_label)
    elif button_id == 'button-2':
        chart_id = 'chart-2'
        controls = y_axis_controls(
            'y-axis-filter-2',
            [
                {'label': 'Number of Books', 'value': 'count_books'},
                {'label': 'Number of Ratings', 'value': 'ratings_count'},
                {'label': 'Number of Pages', 'value': 'num_pages'},
                {'label': 'Year', 'value': 'year'}
            ],
            'count_books'
        ) + duplicate_controls('duplicate-filter-2', spaced_label)
    elif button_id == 'button-3':  # Rating Correlation Heatmap: empty sidebar
        chart_id = 'chart-3'
        controls = []
    elif button_id == 'button-4':  # Books per Year
        chart_id = 'chart-4'
        controls = duplicate_controls('duplicate-filter-4', plain_label)
    else:  # button-5 (Format per Year)
        chart_id = 'chart-5'
        controls = duplicate_controls('duplicate-filter-5', plain_label)

    return [
        html.Div(controls, style=sidebar_style, className='sidebar'),
        dcc.Graph(id=chart_id, style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})
    ]

# Callbacks that update the charts
# Chart 1
@app2.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Build a bar chart of the most frequent values of one df2 column.

    Args:
        y_axis: Name of the df2 column whose values are counted (the value of
            'y-axis-filter-1').
        duplicate_option: 'title', 'isbn_10' or 'isbn_13' to drop rows that are
            duplicated on that column before counting; any other value keeps
            every row.

    Returns:
        A Plotly bar figure with the counts on the x-axis and the column
        values on the y-axis, or dash.no_update when no column is selected.
    """
    # A cleared dropdown sends None; keep the current figure instead of
    # raising KeyError on df2[None].
    if y_axis is None:
        return dash.no_update

    # drop_duplicates returns a new frame and nothing below writes to the
    # data, so df2 is used directly without copying it on every callback.
    filtered_df = df2

    # Handle duplicates
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        # NOTE(review): rows with a missing value in this column are treated
        # as duplicates of each other by drop_duplicates - confirm this is wanted.
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Prepare data: value_counts sorts by descending frequency
    data = filtered_df[y_axis].value_counts().reset_index()
    data.columns = [y_axis, 'count']

    # Limit publisher, lang_iso and standard_format to ten categories
    if y_axis in ['publisher', 'lang_iso', 'standard_format']:
        data = data.head(10)

    # Show the is_ebook booleans as the strings 'True' / 'False'
    if y_axis == 'is_ebook':
        data[y_axis] = data[y_axis].map({True: 'True', False: 'False'})

    # Readable names, matching the dropdown labels
    y_axis_labels = {
        'publisher': 'Publisher',
        'standard_format': 'Format',
        'lang_iso': 'Language',
        'is_ebook': 'Is eBook'
    }

    # Readable label for the title and the y-axis
    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    fig = px.bar(
        data,
        x='count',
        y=y_axis,
        title=f'{y_label} by Number of Books',
        color_discrete_sequence=['teal'],
        labels={'count': 'Number of Books', y_axis: y_label}
    )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 2
@app2.callback(
    Output('chart-2', 'figure'),
    [Input('y-axis-filter-2', 'value'), Input('duplicate-filter-2', 'value')]
)
def update_chart_2(y_axis, duplicate_option):
    """Plot the selected measure of ``df2`` against the average rating.

    Args:
        y_axis: ``'count_books'`` (rows per rating), ``'ratings_count'``
            (mean ratings count per rating), ``'num_pages'`` or ``'year'``
            (scatter plots). It is ``None`` when the dropdown is cleared.
        duplicate_option: ``'title'``, ``'isbn_10'`` or ``'isbn_13'`` to drop
            rows repeating that column; any other value keeps every row.

    Returns:
        A Plotly figure, a placeholder figure when nothing is left to plot,
        or ``dash.no_update`` when no measure is selected.
    """
    def no_data_figure(message):
        """Return an axis-less placeholder figure showing ``message``."""
        placeholder = px.scatter(title='No data available for the selected filters')
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # A cleared dropdown sends None; keep the current chart instead of
    # failing on None.replace(...).
    if y_axis is None:
        return dash.no_update

    # No copy is needed: nothing below modifies df2 in place.
    filtered_df = df2

    # The option value is also the name of the column to de-duplicate on.
    # NOTE(review): drop_duplicates treats missing keys as equal, so rows
    # lacking the chosen key collapse into a single row - confirm intended.
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Keep only rated books
    filtered_df = filtered_df[filtered_df['avg_rating'].notna()]

    # Restrict to years up to 2017 (this also drops rows with no year)
    filtered_df = filtered_df[filtered_df['year'] <= 2017]

    if filtered_df.empty:
        return no_data_figure("No data available after filtering.")

    # Readable names for the title and the Y axis (same as the dropdown)
    y_axis_labels = {
        'count_books': 'Number of Books',
        'ratings_count': 'Number of Ratings',
        'num_pages': 'Number of Pages',
        'year': 'Year'
    }
    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())
    title = f'{y_label} by Average Rating'

    # The frame is non-empty and every row has a rating here, so the two
    # grouped branches always yield at least one group.
    if y_axis == 'count_books':
        data = filtered_df.groupby('avg_rating').size().reset_index(name='count')
        fig = px.line(
            data,
            x='avg_rating',
            y='count',
            title=title,
            markers=True,
            color_discrete_sequence=['teal'],
            labels={'avg_rating': 'Average Rating', 'count': y_label}
        )
    elif y_axis == 'ratings_count':
        data = filtered_df.groupby('avg_rating')[y_axis].mean().reset_index()
        fig = px.line(
            data,
            x='avg_rating',
            y=y_axis,
            title=title,
            markers=True,
            color_discrete_sequence=['teal'],
            labels={'avg_rating': 'Average Rating', y_axis: y_label}
        )
    elif y_axis == 'num_pages':
        # Without the column there is nothing to plot; do not fall through
        # to the year scatter under a "Number of Pages" title.
        if 'num_pages' not in filtered_df.columns:
            return no_data_figure("No valid pages available after filtering.")
        data = filtered_df[['avg_rating', 'num_pages']].dropna()
        if data.empty:
            return no_data_figure("No valid pages available after filtering.")
        fig = px.scatter(
            data,
            x='avg_rating',
            y='num_pages',
            title=title,
            color_discrete_sequence=['teal'],
            labels={'avg_rating': 'Average Rating', 'num_pages': y_label},
            opacity=0.6
        )
    else:  # y_axis == 'year'
        data = filtered_df[['avg_rating', 'year']].dropna()
        if data.empty:
            return no_data_figure("No valid years available after filtering.")
        fig = px.scatter(
            data,
            x='avg_rating',
            y='year',
            title=title,
            color_discrete_sequence=['teal'],
            labels={'avg_rating': 'Average Rating', 'year': y_label},
            opacity=0.6
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 3 (Rating Correlation Heatmap)
@app2.callback(
    Output('chart-3', 'figure'),
    [Input('button-3', 'n_clicks')]
)
def update_chart_3(n_clicks):
    """Render the correlation heatmap of the numeric ``df2`` columns.

    Args:
        n_clicks: Click count of ``button-3``; only used as the trigger.

    Returns:
        A Plotly heatmap of the pairwise correlations, or a placeholder
        figure when no column or no complete row is available.
    """
    def error_figure(title, message):
        """Return an axis-less figure with ``title`` and a centred note."""
        blank = px.scatter(title=title)
        blank.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return blank

    # For each measure, the accepted column names in order of preference
    alias_groups = (
        ('avg_rating', 'average_rating'),
        ('ratings_count', 'num_ratings'),
        ('year', 'publication_year'),
        ('num_pages',),
    )

    # Keep the first alias of each group that exists in df2
    numeric_cols = []
    for aliases in alias_groups:
        present = [name for name in aliases if name in df2.columns]
        if present:
            numeric_cols.append(present[0])

    if not numeric_cols:
        return error_figure(
            'Error: No numeric columns available for correlation.',
            "Please check the dataset columns (e.g., avg_rating, ratings_count, year, num_pages)."
        )

    # Only rows with a value in every selected column are correlated
    complete_rows = df2[numeric_cols].dropna()
    if complete_rows.empty:
        return error_figure(
            'Error: No valid data after removing NaN values.',
            "All rows contain NaN values for the selected columns."
        )

    heatmap = px.imshow(
        complete_rows.corr(),
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title='Correlation of Average Rating with Other Variables'
    )
    heatmap.update_layout(
        margin={'l': 40, 'r': 40, 't': 60, 'b': 40},
        font={'size': 12},
        coloraxis_colorbar_title="Correlation"
    )
    return heatmap

# Chart 4 (Books per Year)
@app2.callback(
    Output('chart-4', 'figure'),
    [Input('duplicate-filter-4', 'value')]
)
def update_chart_4(duplicate_option):
    """Draw the line chart of the number of ``df2`` books per year.

    Args:
        duplicate_option: ``'title'``, ``'isbn_10'`` or ``'isbn_13'`` to drop
            rows repeating that column; any other value keeps every row.

    Returns:
        A Plotly line figure, or a placeholder figure when no row is left.
    """
    # Rows without a year are removed, then years after 2017 are cut off
    books = df2.copy().dropna(subset=['year'])
    books = books.loc[books['year'] <= 2017]

    # The option value is also the name of the column to de-duplicate on.
    # NOTE(review): drop_duplicates treats missing keys as equal, so rows
    # lacking the chosen key collapse into a single row - confirm intended.
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        books = books.drop_duplicates(subset=[duplicate_option])

    # One row per year with the number of books
    per_year = books.groupby('year').size().rename('count').reset_index()

    if per_year.empty:
        placeholder = px.scatter(title='No data available for the selected filters')
        placeholder.update_layout(
            annotations=[
                dict(text="No data available after filtering.", x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    axis_names = {'year': 'Year', 'count': 'Number of Books'}

    chart = px.line(
        per_year,
        x='year',
        y='count',
        title=f"{axis_names['count']} by {axis_names['year']}",
        markers=True,
        color_discrete_sequence=['teal'],
        labels=axis_names
    )
    chart.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )
    return chart

# Chart 5 (Format per Year)
@app2.callback(
    Output('chart-5', 'figure'),
    [Input('duplicate-filter-5', 'value')]
)
def update_chart_5(duplicate_option):
    """Draw the bar chart of ``df2`` book counts per year, coloured by format.

    Args:
        duplicate_option: ``'title'``, ``'isbn_10'`` or ``'isbn_13'`` to drop
            rows repeating that column; any other value keeps every row.

    Returns:
        A Plotly bar figure, or a placeholder figure when no row is left.
    """
    # Rows missing the year or the format are removed, then years after
    # 2017 are cut off
    books = df2.copy().dropna(subset=['year', 'standard_format'])
    books = books.loc[books['year'] <= 2017]

    # The option value is also the name of the column to de-duplicate on.
    # NOTE(review): drop_duplicates treats missing keys as equal, so rows
    # lacking the chosen key collapse into a single row - confirm intended.
    if duplicate_option in ('title', 'isbn_10', 'isbn_13'):
        books = books.drop_duplicates(subset=[duplicate_option])

    # One row per (year, format) pair with the number of books
    per_year_format = (
        books.groupby(['year', 'standard_format']).size().rename('count').reset_index()
    )

    if per_year_format.empty:
        placeholder = px.scatter(title='No data available for Format per Year')
        placeholder.update_layout(
            annotations=[
                dict(text="No data available after filtering.", x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    chart = px.bar(
        per_year_format,
        x='year',
        y='count',
        color='standard_format',
        title='Type of Format(counts) by Year',
        labels={'year': 'Year', 'count': 'Number of Books', 'standard_format': 'Format'},
        color_discrete_sequence=px.colors.qualitative.Set1
    )
    # Horizontal legend anchored above the plot, right-aligned
    chart.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )
    return chart

# Start the app2 server on port 8051 with debug mode enabled; this only runs
# when the code is executed as the main module.
if __name__ == '__main__':
    app2.run(debug=True, port=8051)

### Dataset 3 - Goodreads 2019-2020

In [30]:
# Remove leading/trailing whitespace from the column names
df3.columns = df3.columns.str.strip()

# Convert the measure columns to numbers; unparseable values become NaN
for numeric_col in ('average_rating', 'num_pages', 'ratings_count', 'text_reviews_count'):
    df3[numeric_col] = pd.to_numeric(df3[numeric_col], errors='coerce')

# Parse the publication date (invalid dates become NaT) and derive the year
df3['publication_date'] = pd.to_datetime(df3['publication_date'], errors='coerce')
df3['year'] = df3['publication_date'].dt.year

df3.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_raw', 'num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher', 'date_parsed_iso', 'author_standard',
       'lang_iso', 'publisher_standard', 'year'],
      dtype='object')

#### Interactive dashboard

In [31]:

# Create the Dash app; the inline stylesheet adds the hover effects used by
# the 'sidebar' and 'button' CSS classes
app3 = dash.Dash(
    __name__,
    external_stylesheets=['data:text/css;charset=utf-8,.sidebar:hover { opacity: 1; } .button:hover { opacity: 0.8; }'],
    suppress_callback_exceptions=True
)

# Page-wide style
style = dict(
    fontFamily='Arial',
    padding='10px',
    backgroundColor='#f8f9fa',
    color='#333'
)

# Sidebar style (fixed on the left, 200px from the top)
sidebar_style = dict(
    width='20%',
    backgroundColor='#e9ecef',
    padding='10px',
    borderRadius='5px',
    opacity='0.7',
    transition='opacity 0.3s',
    position='fixed',
    left='10px',
    top='200px'
)

# Button style
button_style = dict(
    backgroundColor='orangered',
    color='white',
    border='none',
    padding='10px 20px',
    margin='5px',
    borderRadius='5px',
    cursor='pointer',
    fontSize='14px'
)

# App layout: title, one button per chart, and the container that the
# content callback fills in
app3.layout = html.Div(
    [
        html.H1(
            "Analysis of Goodreads - 2019 & 2020 Dataset",
            style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'}
        ),
        html.Div(
            [
                html.Button(caption, id=button_id, style=button_style, className='button')
                for caption, button_id in (
                    ("Books Distribution", 'button-1'),
                    ("Reviews Comparations", 'button-2'),
                    ("Rating Correlation Heatmap", 'button-3'),
                    ("Books per Year", 'button-4'),
                )
            ],
            style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'}
        ),
        html.Div(id='content')
    ],
    style=style
)

# Callback that swaps the page content
@app3.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-2', 'n_clicks'), Input('button-3', 'n_clicks'), Input('button-4', 'n_clicks')]
)
def update_content(btn1, btn2, btn3, btn4):
    """Return the sidebar and graph for the button that was clicked.

    Args:
        btn1, btn2, btn3, btn4: Click counts of the four top buttons; only
            used as triggers, the clicked button is read from the callback
            context.

    Returns:
        A two-element list: the sidebar ``html.Div`` and the ``dcc.Graph``.
        The ``button-1`` view is returned when nothing triggered the call.
    """
    def axis_controls(dropdown_id, choices):
        """Build the "Y-Axis" label and dropdown; the first choice is the default."""
        return [
            html.Label("Y-Axis:", style={'fontSize': '14px'}),
            dcc.Dropdown(
                id=dropdown_id,
                options=[{'label': text, 'value': column} for text, column in choices],
                value=choices[0][1],
                style={'width': '100%'}
            )
        ]

    def duplicate_controls(dropdown_id, spaced):
        """Build the "Count Duplicates" label and dropdown.

        ``spaced`` adds the top margin used when another control sits above.
        """
        label_style = {'fontSize': '14px'}
        if spaced:
            label_style['marginTop'] = '10px'
        return [
            html.Label("Count Duplicates:", style=label_style),
            dcc.Dropdown(
                id=dropdown_id,
                options=[
                    {'label': 'With Duplicates', 'value': 'with'},
                    {'label': 'Without Duplicates (ISBN)', 'value': 'isbn'},
                    {'label': 'Without Duplicates (ISBN13)', 'value': 'isbn13'},
                    {'label': 'Without Duplicates (Title)', 'value': 'title'}
                ],
                value='with',
                style={'width': '100%'}
            )
        ]

    def view(controls, chart_id):
        """Wrap ``controls`` in the sidebar and pair it with the graph."""
        return [
            html.Div(controls, style=sidebar_style, className='sidebar'),
            dcc.Graph(id=chart_id, style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})
        ]

    # The component id is the part of prop_id before the first dot
    ctx = dash.callback_context
    button_id = ctx.triggered[0]['prop_id'].split('.')[0] if ctx.triggered else 'button-1'

    if button_id == 'button-1':
        controls = axis_controls('y-axis-filter-1', [
            ('Author', 'author_standard'),
            ('Publisher', 'publisher_standard'),
            ('Language', 'lang_iso'),
        ])
        return view(controls + duplicate_controls('duplicate-filter-1', True), 'chart-1')

    if button_id == 'button-2':
        controls = axis_controls('y-axis-filter-2', [
            ('Number of Books', 'count_books'),
            ('Number of Ratings', 'ratings_count'),
            ('Number of Pages', 'num_pages'),
            ('Year', 'year'),
        ])
        return view(controls + duplicate_controls('duplicate-filter-2', True), 'chart-2')

    if button_id == 'button-3':
        # Rating Correlation Heatmap: the sidebar is left empty
        return view([], 'chart-3')

    # button-4 (Books per Year)
    return view(duplicate_controls('duplicate-filter-4', False), 'chart-4')

# Callbacks that refresh the charts
# Chart 1
@app3.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Build the bar chart of book counts per value of one ``df3`` column.

    Args:
        y_axis: Name of the ``df3`` column whose values are counted. It is
            ``None`` when the user clears the dropdown.
        duplicate_option: ``'isbn'``, ``'isbn13'`` or ``'title'`` to drop
            rows repeating that column; any other value keeps every row.

    Returns:
        A Plotly figure, or ``dash.no_update`` when no column is selected.
    """
    # A cleared dropdown sends None; keep the current chart instead of
    # failing on df3[None] / None.replace(...).
    if y_axis is None:
        return dash.no_update

    # No copy is needed: nothing below modifies df3 in place.
    filtered_df = df3

    # The option value is also the name of the column to de-duplicate on.
    # NOTE(review): drop_duplicates treats missing keys as equal, so rows
    # lacking the chosen key collapse into a single row - confirm intended.
    if duplicate_option in ('isbn', 'isbn13', 'title'):
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Count the rows per category
    data = filtered_df[y_axis].value_counts().reset_index()
    data.columns = [y_axis, 'count']

    # value_counts sorts by frequency, so head(10) keeps the 10 most common
    if y_axis in ['author_standard', 'publisher_standard', 'lang_iso']:
        data = data.head(10)

    # Readable names for the title and the Y axis (same as the dropdown)
    y_axis_labels = {
        'author_standard': 'Author',
        'publisher_standard': 'Publisher',
        'lang_iso': 'Language'
    }
    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    fig = px.bar(
        data,
        x='count',
        y=y_axis,
        title=f'{y_label} by Number of Books',
        color_discrete_sequence=['orangered'],
        labels={'count': 'Number of Books', y_axis: y_label}
    )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 2
@app3.callback(
    Output('chart-2', 'figure'),
    [Input('y-axis-filter-2', 'value'), Input('duplicate-filter-2', 'value')]
)
def update_chart_2(y_axis, duplicate_option):
    """Plot the selected measure of ``df3`` against the average rating.

    Args:
        y_axis: ``'count_books'`` (rows per rating), ``'ratings_count'``
            (mean ratings count per rating), ``'num_pages'`` or ``'year'``
            (scatter plots). It is ``None`` when the dropdown is cleared.
        duplicate_option: ``'isbn'``, ``'isbn13'`` or ``'title'`` to drop
            rows repeating that column; any other value keeps every row.

    Returns:
        A Plotly figure, a placeholder figure when nothing is left to plot,
        or ``dash.no_update`` when no measure is selected.
    """
    def no_data_figure(message):
        """Return an axis-less placeholder figure showing ``message``."""
        placeholder = px.scatter(title='No data available for the selected filters')
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # A cleared dropdown sends None; keep the current chart instead of
    # failing on None.replace(...).
    if y_axis is None:
        return dash.no_update

    # No copy is needed: nothing below modifies df3 in place.
    filtered_df = df3

    # The option value is also the name of the column to de-duplicate on.
    # NOTE(review): drop_duplicates treats missing keys as equal, so rows
    # lacking the chosen key collapse into a single row - confirm intended.
    if duplicate_option in ('isbn', 'isbn13', 'title'):
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Keep only rated books
    filtered_df = filtered_df[filtered_df['average_rating'].notna()]

    # Readable names for the title and the Y axis (same as the dropdown)
    y_axis_labels = {
        'count_books': 'Number of Books',
        'ratings_count': 'Number of Ratings',
        'num_pages': 'Number of Pages',
        'year': 'Year'
    }
    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())
    title = f'{y_label} by Average Rating'

    if y_axis == 'count_books':
        data = filtered_df.groupby('average_rating').size().reset_index(name='count')
        if data.empty:
            return no_data_figure("No data available after filtering.")
        fig = px.line(
            data,
            x='average_rating',
            y='count',
            title=title,
            markers=True,
            color_discrete_sequence=['orangered'],
            labels={'average_rating': 'Average Rating', 'count': y_label}
        )
    elif y_axis == 'num_pages':
        data = filtered_df[['average_rating', 'num_pages']].dropna()
        if data.empty:
            return no_data_figure("No valid pages available after filtering.")
        fig = px.scatter(
            data,
            x='average_rating',
            y='num_pages',
            title=title,
            color_discrete_sequence=['orangered'],
            labels={'average_rating': 'Average Rating', 'num_pages': y_label},
            opacity=0.6
        )
    elif y_axis == 'ratings_count':
        data = filtered_df.groupby('average_rating')[y_axis].mean().reset_index()
        if data.empty:
            return no_data_figure("No data available after filtering.")
        fig = px.line(
            data,
            x='average_rating',
            y=y_axis,
            title=title,
            markers=True,
            color_discrete_sequence=['orangered'],
            labels={'average_rating': 'Average Rating', y_axis: y_label}
        )
    else:  # y_axis == 'year'
        data = filtered_df[['average_rating', 'year']].dropna()
        if data.empty:
            return no_data_figure("No valid years available after filtering.")
        fig = px.scatter(
            data,
            x='average_rating',
            y='year',
            title=title,
            color_discrete_sequence=['orangered'],
            labels={'average_rating': 'Average Rating', 'year': y_label},
            opacity=0.6
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 3 (Rating Correlation Heatmap)
@app3.callback(
    Output('chart-3', 'figure'),
    [Input('button-3', 'n_clicks')]
)
def update_chart_3(n_clicks):
    """Render the correlation heatmap of the numeric ``df3`` columns.

    Args:
        n_clicks: Click count of ``button-3``; only used as the trigger.

    Returns:
        A Plotly heatmap of the pairwise correlations, or a placeholder
        figure when no column or no complete row is available.
    """
    def error_figure(title, message):
        """Return an axis-less figure with ``title`` and a centred note."""
        blank = px.scatter(title=title)
        blank.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return blank

    # Candidate columns in the order of the Chart 2 filters; only those
    # that exist in df3 are kept
    candidates = ('average_rating', 'count_books', 'ratings_count', 'num_pages', 'year')
    numeric_cols = [name for name in candidates if name in df3.columns]

    if not numeric_cols:
        return error_figure(
            'Error: No numeric columns available for correlation.',
            "Please check the dataset columns (e.g., average_rating, count_books, ratings_count, num_pages, year)."
        )

    # Only rows with a value in every selected column are correlated
    complete_rows = df3[numeric_cols].dropna()
    if complete_rows.empty:
        return error_figure(
            'Error: No valid data after removing NaN values.',
            "All rows contain NaN values for the selected columns."
        )

    heatmap = px.imshow(
        complete_rows.corr(),
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title='Correlation of Average Rating with Other Variables'
    )
    heatmap.update_layout(
        margin={'l': 40, 'r': 40, 't': 60, 'b': 40},
        font={'size': 12},
        coloraxis_colorbar_title="Correlation"
    )
    return heatmap

# Chart 4 (Books per Year)
@app3.callback(
    Output('chart-4', 'figure'),
    [Input('duplicate-filter-4', 'value')]
)
def update_chart_4(duplicate_option):
    """Draw the line chart of the number of ``df3`` books per year.

    Args:
        duplicate_option: ``'isbn'``, ``'isbn13'`` or ``'title'`` to drop
            rows repeating that column; any other value keeps every row.

    Returns:
        A Plotly line figure, or a placeholder figure when no row is left.
    """
    # Rows without a year cannot be placed on the axis
    books = df3.copy().dropna(subset=['year'])

    # The option value is also the name of the column to de-duplicate on.
    # NOTE(review): drop_duplicates treats missing keys as equal, so rows
    # lacking the chosen key collapse into a single row - confirm intended.
    if duplicate_option in ('isbn', 'isbn13', 'title'):
        books = books.drop_duplicates(subset=[duplicate_option])

    # One row per year with the number of books
    per_year = books.groupby('year').size().rename('count').reset_index()

    if per_year.empty:
        placeholder = px.scatter(title='No data available for the selected filters')
        placeholder.update_layout(
            annotations=[
                dict(text="No data available after filtering.", x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # Readable axis names, also used to build the title
    axis_names = {'year': 'Year', 'count': 'Number of Books'}

    chart = px.line(
        per_year,
        x='year',
        y='count',
        title=f"{axis_names['count']} by {axis_names['year']}",
        markers=True,
        color_discrete_sequence=['orangered'],
        labels=axis_names
    )
    chart.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )
    return chart

# Run the app: start the app3 server on port 8052 with debug mode enabled;
# this only runs when the code is executed as the main module.
if __name__ == '__main__':
    app3.run(debug=True, port=8052)

### Dataset 4 - Book-Crossing Community

In [32]:
# Remove leading/trailing whitespace from the column names
df4.columns = df4.columns.str.strip()

# Fix the publication year: convert to numbers, unparseable values become NaN
df4['year_numeric'] = pd.to_numeric(df4['year_numeric'], errors='coerce')

# Display the resulting column names
df4.columns

Index(['isbn', 'book_title', 'book_author', 'year_of_publication',
       'publisher_raw', 'image_url_s', 'image_url_m', 'image_url_l',
       'publication_year', 'year_numeric', 'publisher', 'isbn_10', 'isbn_13'],
      dtype='object')

#### Interactive dashboard

In [33]:
# Create the Dash app; the inline stylesheet adds the hover effects used by
# the 'sidebar' and 'button' CSS classes
app4 = dash.Dash(
    __name__,
    external_stylesheets=['data:text/css;charset=utf-8,.sidebar:hover { opacity: 1; } .button:hover { opacity: 0.8; }'],
    suppress_callback_exceptions=True
)

# Page-wide style
style = dict(
    fontFamily='Arial',
    padding='10px',
    backgroundColor='#f8f9fa',
    color='#333'
)

# Sidebar style (fixed on the left, 100px from the top)
sidebar_style = dict(
    width='20%',
    backgroundColor='#e9ecef',
    padding='10px',
    borderRadius='5px',
    opacity='0.7',
    transition='opacity 0.3s',
    position='fixed',
    left='10px',
    top='100px'
)

# Button style
button_style = dict(
    backgroundColor='pink',
    color='white',
    border='none',
    padding='10px 20px',
    margin='5px',
    borderRadius='5px',
    cursor='pointer',
    fontSize='14px'
)

# App layout: title, one button per chart (only 2 charts), and the
# container that the content callback fills in
app4.layout = html.Div(
    [
        html.H1(
            "Analysis of Book-Crossing Community Dataset",
            style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'}
        ),
        html.Div(
            [
                html.Button(caption, id=button_id, style=button_style, className='button')
                for caption, button_id in (
                    ("Books Distribution", 'button-1'),
                    ("Books per Year", 'button-2'),
                )
            ],
            style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'}
        ),
        html.Div(id='content')
    ],
    style=style
)

# Callback that rebuilds the page content when a navigation button is clicked
@app4.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-2', 'n_clicks')]
)
def update_content(btn1, btn2):
    """Build the sidebar controls and graph placeholder for the selected view.

    The click counters ``btn1`` and ``btn2`` only act as triggers; the button
    that fired is read from ``dash.callback_context``. When nothing has been
    triggered, the 'button-1' view is used.

    Returns:
        A two-item list: the sidebar ``html.Div`` holding the filters and the
        ``dcc.Graph`` whose figure is supplied by the matching chart callback.
    """
    triggered = dash.callback_context.triggered
    button_id = triggered[0]['prop_id'].split('.')[0] if triggered else 'button-1'

    def duplicate_dropdown(dropdown_id):
        # Both views offer the same de-duplication choices; only the id differs
        return dcc.Dropdown(
            id=dropdown_id,
            options=[
                {'label': 'With Duplicates', 'value': 'with'},
                {'label': 'Without Duplicates (ISBN-10)', 'value': 'isbn_10'},
                {'label': 'Without Duplicates (Title)', 'value': 'book_title'}
            ],
            value='with',
            style={'width': '100%'}
        )

    def graph(graph_id):
        # Graph placed to the right of the fixed sidebar
        return dcc.Graph(id=graph_id, style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})

    if button_id != 'button-1':
        # Every other trigger shows the second view (chart-2)
        sidebar = html.Div([
            html.Label("Count Duplicates:", style={'fontSize': '14px'}),
            duplicate_dropdown('duplicate-filter-2')
        ], style=sidebar_style, className='sidebar')
        return [sidebar, graph('chart-2')]

    sidebar = html.Div([
        html.Label("Y-Axis:", style={'fontSize': '14px'}),
        dcc.Dropdown(
            id='y-axis-filter-1',
            options=[
                {'label': 'Author', 'value': 'book_author'},
                {'label': 'Publisher', 'value': 'publisher'}
            ],
            value='book_author',
            style={'width': '100%'}
        ),
        html.Label("Count Duplicates:", style={'fontSize': '14px', 'marginTop': '10px'}),
        duplicate_dropdown('duplicate-filter-1')
    ], style=sidebar_style, className='sidebar')
    return [sidebar, graph('chart-1')]

# Callbacks that update the charts
# Chart 1 (number of books per author / publisher)
@app4.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Return a bar chart counting the books per value of ``y_axis``.

    Args:
        y_axis: Name of the df4 column to count ('book_author' or 'publisher'
            from the dropdown). ``None`` (dropdown cleared) leaves the current
            figure untouched.
        duplicate_option: 'isbn_10' or 'book_title' to keep only the first row
            per value of that column; any other value keeps every row.

    Returns:
        A Plotly figure, or ``dash.no_update`` when no column is selected.
    """
    # A cleared dropdown sends None; indexing df4 with it would raise KeyError
    if y_axis is None:
        return dash.no_update

    # drop_duplicates returns a new frame and nothing below mutates the data,
    # so the global frame can be used directly without copying it on each call
    filtered_df = df4

    # Handle duplicates
    if duplicate_option in ('isbn_10', 'book_title'):
        filtered_df = filtered_df.drop_duplicates(subset=[duplicate_option])

    # Prepare the data: occurrences per distinct value, most frequent first
    data = filtered_df[y_axis].value_counts().reset_index()
    data.columns = [y_axis, 'count']

    # Limit to 10 categories for book_author and publisher
    if y_axis in ['book_author', 'publisher']:
        data = data.head(10)

    # Map column names to readable labels (same as in the dropdown)
    y_axis_labels = {
        'book_author': 'Author',
        'publisher': 'Publisher'
    }

    # Readable label for the title and the Y axis
    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    # Build the chart
    fig = px.bar(
        data,
        x='count',
        y=y_axis,
        title=f'{y_label} by Number of Books',
        color_discrete_sequence=['pink'],
        labels={'count': 'Number of Books', y_axis: y_label}
    )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 2 (number of books per year)
@app4.callback(
    Output('chart-2', 'figure'),
    [Input('duplicate-filter-2', 'value')]
)
def update_chart_2(duplicate_option):
    """Return a line chart with the number of books per publication year.

    Rows without a numeric year or with a year after 2009 are left out.
    ``duplicate_option`` ('isbn_10' or 'book_title') keeps only the first
    remaining row per value of that column; any other value keeps all rows.
    A placeholder figure with a message is returned when no rows remain.
    """
    books = df4.copy()

    # Keep rows that have a year, and of those only years up to 2009
    has_year = books['year_numeric'].notna()
    books = books[has_year]
    up_to_2009 = books['year_numeric'] <= 2009
    books = books[up_to_2009]

    # Handle duplicates
    for key_column in ('isbn_10', 'book_title'):
        if duplicate_option == key_column:
            books = books.drop_duplicates(subset=[key_column])
            break

    # Prepare the data: one row per year with its number of books
    data = books.groupby('year_numeric').size().reset_index(name='count')

    if data.empty:
        placeholder = px.scatter(title='No data available for the selected filters')
        placeholder.update_layout(
            annotations=[{
                'text': "No data available after filtering.",
                'x': 0.5,
                'y': 0.5,
                'showarrow': False,
                'xref': "paper",
                'yref': "paper",
                'font': {'size': 14},
            }],
            xaxis={'visible': False},
            yaxis={'visible': False},
        )
        return placeholder

    # Readable axis labels, also used to build the title
    x_label, y_label = 'Year', 'Number of Books'

    fig = px.line(
        data,
        x='year_numeric',
        y='count',
        title=f'{y_label} by {x_label}',
        markers=True,
        color_discrete_sequence=['pink'],
        labels={'year_numeric': x_label, 'count': y_label},
    )
    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False,
    )
    return fig

# Run the app (debug mode, port 8053) when this code is executed as the main module
if __name__ == '__main__':
    app4.run(debug=True, port=8053)

### Dataset 5 - Book Sales and Ratings

In [34]:
# Remove leading/trailing whitespace from the column names
df5.columns = df5.columns.str.strip()

# Correct data types: coerce each column to numeric, unparseable values become NaN
for numeric_column in (
    'sale_price',
    'author_rating',
    'book_average_rating',
    'book_ratings_count',
    'units_sold',
    'publishing_year',
):
    df5[numeric_column] = pd.to_numeric(df5[numeric_column], errors='coerce')

# For every row, store how many rows share that row's average rating
count_books_per_rating = df5.groupby('book_average_rating').size().to_dict()
df5['count_books'] = df5['book_average_rating'].map(count_books_per_rating)

# Last expression of the cell: display the resulting column names
df5.columns


Index(['publishing_year', 'book_name', 'author', 'language_code',
       'author_rating', 'book_average_rating', 'book_ratings_count', 'genre',
       'gross_sales', 'publisher_revenue', 'sale_price', 'sales_rank',
       'publisher', 'units_sold', 'count_books'],
      dtype='object')

#### Interactive dashboard 

In [35]:
# Create the Dash app. The inline data-URI stylesheet carries the hover rules
# for the 'sidebar' and 'button' CSS classes. Callback exceptions are
# suppressed because the chart components are inserted later by a callback
# and are therefore missing from the initial layout.
app5 = dash.Dash(
    __name__,
    external_stylesheets=[
        'data:text/css;charset=utf-8,.sidebar:hover { opacity: 1; } .button:hover { opacity: 0.8; }'
    ],
    suppress_callback_exceptions=True,
)

# Page-wide style
style = dict(
    fontFamily='Arial',
    padding='10px',
    backgroundColor='#f8f9fa',
    color='#333',
)

# Sidebar style: fixed on the left and semi-transparent
sidebar_style = dict(
    width='20%',
    backgroundColor='#e9ecef',
    padding='10px',
    borderRadius='5px',
    opacity='0.7',
    transition='opacity 0.3s',
    position='fixed',
    left='10px',
    top='150px',
)

# Navigation button style
button_style = dict(
    backgroundColor='darkred',
    color='white',
    border='none',
    padding='10px 20px',
    margin='5px',
    borderRadius='5px',
    cursor='pointer',
    fontSize='14px',
)

# App layout: title, one navigation button per chart (ids 'button-1' to
# 'button-6', in this order), and an empty container filled by the content callback
app5.layout = html.Div(
    style=style,
    children=[
        html.H1(
            "Analysis of Books Sales and Ratings Dataset",
            style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'},
        ),

        # Buttons at the top
        html.Div(
            [
                html.Button(caption, id=f'button-{position}', style=button_style, className='button')
                for position, caption in enumerate(
                    [
                        "Books Distribution",
                        "Reviews Comparations",
                        "Rating Correlation Heatmap",
                        "Books per Year",
                        "Price vs Units Sold",
                        "Revenue by Publisher",
                    ],
                    start=1,
                )
            ],
            style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'},
        ),

        # Dynamic content
        html.Div(id='content'),
    ],
)

# Callback that rebuilds the page content when a navigation button is clicked
@app5.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-2', 'n_clicks'), Input('button-3', 'n_clicks'), 
     Input('button-4', 'n_clicks'), Input('button-5', 'n_clicks'), Input('button-6', 'n_clicks')]
)
def update_content(btn1, btn2, btn3, btn4, btn5, btn6):
    """Build the sidebar controls and graph placeholder for the selected view.

    The six click counters only act as triggers; the button that fired is read
    from ``dash.callback_context``. When nothing has been triggered, the
    'button-1' view is used; an unrecognised id falls back to the view of
    'button-6'.

    Returns:
        A two-item list: the sidebar ``html.Div`` (empty for 'button-3') and
        the ``dcc.Graph`` whose figure is supplied by the matching chart
        callback.
    """
    triggered = dash.callback_context.triggered
    button_id = triggered[0]['prop_id'].split('.')[0] if triggered else 'button-1'

    def caption(text, spaced=False):
        # Label above a dropdown; 'spaced' adds the top margin used below another control
        caption_style = {'fontSize': '14px'}
        if spaced:
            caption_style['marginTop'] = '10px'
        return html.Label(text, style=caption_style)

    def y_axis_controls(view_number, choices, default):
        # Y-axis selector built from (label, column) pairs
        return [
            caption("Y-Axis:"),
            dcc.Dropdown(
                id=f'y-axis-filter-{view_number}',
                options=[{'label': text, 'value': column} for text, column in choices],
                value=default,
                style={'width': '100%'}
            ),
        ]

    def duplicate_controls(view_number, spaced=False):
        # De-duplication selector; identical in every view except for its id
        return [
            caption("Count Duplicates:", spaced),
            dcc.Dropdown(
                id=f'duplicate-filter-{view_number}',
                options=[
                    {'label': 'With Duplicates', 'value': 'with'},
                    {'label': 'Without Duplicates (Book Name)', 'value': 'book_name'}
                ],
                value='with',
                style={'width': '100%'}
            ),
        ]

    def view(view_number, controls):
        # Sidebar with the given controls plus the graph for that view
        return [
            html.Div(controls, style=sidebar_style, className='sidebar'),
            dcc.Graph(id=f'chart-{view_number}', style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})
        ]

    if button_id == 'button-1':
        distribution_choices = [
            ('Author', 'author'),
            ('Publisher', 'publisher'),
            ('Main Category', 'genre'),
            ('Price', 'sale_price'),
            ('Language', 'language_code'),
        ]
        return view(1, y_axis_controls(1, distribution_choices, 'author') + duplicate_controls(1, spaced=True))

    if button_id == 'button-2':
        comparison_choices = [
            ('Number of Books', 'count_books'),
            ('Number of Ratings', 'book_ratings_count'),
            ('Price', 'sale_price'),
            ('Units Sold', 'units_sold'),
            ('Year', 'publishing_year'),
        ]
        return view(2, y_axis_controls(2, comparison_choices, 'count_books') + duplicate_controls(2, spaced=True))

    if button_id == 'button-3':
        # The heatmap has no filters, so its sidebar stays empty
        return view(3, [])

    # Remaining views only carry the de-duplication filter; anything that is
    # not button-4 or button-5 is treated as button-6
    view_number = {'button-4': 4, 'button-5': 5}.get(button_id, 6)
    return view(view_number, duplicate_controls(view_number))

# Callbacks that update the charts
# Chart 1 (books distribution)
@app5.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Return the distribution of books over the values of ``y_axis``.

    'sale_price' is drawn as a scatter plot of price against the number of
    books at that price; every other column is drawn as a bar chart of value
    counts (top 10 for author, genre, publisher and language_code).

    Args:
        y_axis: Name of the df5 column to analyse. ``None`` (dropdown cleared)
            leaves the current figure untouched.
        duplicate_option: 'book_name' keeps only the first row per book name;
            any other value keeps every row.

    Returns:
        A Plotly figure (a placeholder with a message when the data cannot be
        plotted), or ``dash.no_update`` when no column is selected.
    """
    def message_figure(title, message):
        # Blank figure with hidden axes that only shows a centred message
        placeholder = px.scatter(title=title)
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # A cleared dropdown sends None, on which y_axis.replace() below would fail
    if y_axis is None:
        return dash.no_update

    # Check that the 'book_name' column exists
    if 'book_name' not in df5.columns:
        return message_figure('Error: Book Name column not found', "Column 'book_name' not found in the dataset.")

    # Nothing below writes to the frame, so the global data is used without copying
    filtered_df = df5

    # Handle duplicates
    if duplicate_option == 'book_name':
        filtered_df = filtered_df.drop_duplicates(subset=['book_name'])

    y_axis_labels = {
        'author': 'Author',
        'publisher': 'Publisher',
        'genre': 'Main Category',
        'sale_price': 'Price ($)',
        'language_code': 'Language'
    }

    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    if y_axis == 'sale_price':
        # Work on a standalone Series instead of assigning into a filtered
        # slice, which can raise pandas' SettingWithCopyWarning
        try:
            prices = pd.to_numeric(filtered_df['sale_price'].dropna())
        except ValueError as e:
            return message_figure(
                'Error processing price data',
                f"Error: Non-numeric values in 'sale_price' ({str(e)})."
            )

        if prices.empty:
            return message_figure('No valid price data available', "No valid price data after filtering.")

        # Count the number of books per price
        price_counts = prices.value_counts().reset_index()
        price_counts.columns = ['sale_price', 'count']

        # Scatter plot of price against number of books
        fig = px.scatter(
            price_counts,
            x='count',
            y='sale_price',
            title='Number of Books by Price (Scatter Plot)',
            labels={'count': 'Number of Books', 'sale_price': 'Price ($)'},
            color_discrete_sequence=['darkred'],
            opacity=0.6
        )

        fig.update_traces(marker=dict(size=8))  # Fixed marker size
    else:
        # Other (categorical) variables: use value_counts
        data = filtered_df[y_axis].value_counts().reset_index()
        data.columns = [y_axis, 'count']
        if y_axis in ['author', 'genre', 'publisher', 'language_code']:
            data = data.head(10)
        fig = px.bar(
            data,
            x='count',
            y=y_axis,
            title=f'{y_label} by Number of Books',
            color_discrete_sequence=['darkred'],
            labels={'count': 'Number of Books', y_axis: y_label}
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 2 (variables compared against the average rating)
@app5.callback(
    Output('chart-2', 'figure'),
    [Input('y-axis-filter-2', 'value'), Input('duplicate-filter-2', 'value')]
)
def update_chart_2(y_axis, duplicate_option):
    """Return a chart of ``y_axis`` against the book average rating.

    'count_books' (number of rows per rating) and 'book_ratings_count' (mean
    per rating) are drawn as line charts; 'sale_price' and 'units_sold' are
    drawn as scatter plots; any other value is plotted as a scatter of
    'publishing_year'. Rows without an average rating are ignored.

    Args:
        y_axis: Selected variable. ``None`` (dropdown cleared) leaves the
            current figure untouched.
        duplicate_option: 'book_name' keeps only the first row per book name;
            any other value keeps every row.

    Returns:
        A Plotly figure (a placeholder with a message when no data remains),
        or ``dash.no_update`` when no variable is selected.
    """
    def message_figure(title, message):
        # Blank figure with hidden axes that only shows a centred message
        placeholder = px.scatter(title=title)
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # A cleared dropdown sends None, on which y_axis.replace() below would fail
    if y_axis is None:
        return dash.no_update

    # Check that the 'book_name' column exists
    if 'book_name' not in df5.columns:
        return message_figure('Error: Book Name column not found', "Column 'book_name' not found in the dataset.")

    # Nothing below writes to the frame, so the global data is used without copying
    filtered_df = df5

    # Handle duplicates
    if duplicate_option == 'book_name':
        filtered_df = filtered_df.drop_duplicates(subset=['book_name'])

    filtered_df = filtered_df[filtered_df['book_average_rating'].notna()]

    y_axis_labels = {
        'count_books': 'Number of Books',
        'book_ratings_count': 'Number of Ratings',
        'sale_price': 'Price ($)',
        'units_sold': 'Units Sold',
        'publishing_year': 'Year'
    }

    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())
    empty_title = 'No data available for the selected filters'

    if y_axis in ('count_books', 'book_ratings_count'):
        # Aggregated per rating and drawn as a line
        by_rating = filtered_df.groupby('book_average_rating')
        if y_axis == 'count_books':
            y_column = 'count'
            data = by_rating.size().reset_index(name=y_column)
        else:
            y_column = y_axis
            data = by_rating[y_axis].mean().reset_index()

        if data.empty:
            return message_figure(empty_title, "No data available after filtering.")

        fig = px.line(
            data,
            x='book_average_rating',
            y=y_column,
            title=f'{y_label} by Average Rating',
            markers=True,
            color_discrete_sequence=['darkred'],
            labels={'book_average_rating': 'Average Rating', y_column: y_label}
        )
    else:
        # One point per book; anything that is not price or units sold is
        # plotted as the publishing year
        empty_messages = {
            'sale_price': "No valid prices available after filtering.",
            'units_sold': "No valid units sold available after filtering."
        }
        y_column = y_axis if y_axis in empty_messages else 'publishing_year'

        data = filtered_df[['book_average_rating', y_column]].dropna()
        if data.empty:
            return message_figure(
                empty_title,
                empty_messages.get(y_axis, "No valid years available after filtering.")
            )

        fig = px.scatter(
            data,
            x='book_average_rating',
            y=y_column,
            title=f'{y_label} by Average Rating',
            color_discrete_sequence=['darkred'],
            labels={'book_average_rating': 'Average Rating', y_column: y_label},
            opacity=0.6
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 3 (Rating Correlation Heatmap)
@app5.callback(
    Output('chart-3', 'figure'),
    [Input('button-3', 'n_clicks')]
)
def update_chart_3(n_clicks):
    """Return a heatmap of the pairwise correlations between rating variables.

    ``n_clicks`` only triggers the callback. The correlation is computed over
    the rows of df5 that have a value in every one of the candidate columns
    present in the frame; a placeholder figure with a message is returned when
    no candidate column exists or no complete row remains.
    """
    def placeholder(title, message):
        # Blank figure with hidden axes that only shows a centred message
        blank = px.scatter(title=title)
        blank.update_layout(
            annotations=[{
                'text': message,
                'x': 0.5,
                'y': 0.5,
                'showarrow': False,
                'xref': "paper",
                'yref': "paper",
                'font': {'size': 14},
            }],
            xaxis={'visible': False},
            yaxis={'visible': False},
        )
        return blank

    # Candidate columns, in display order; keep only those present in df5
    candidates = (
        'book_average_rating',
        'count_books',
        'book_ratings_count',
        'sale_price',
        'units_sold',
        'publishing_year',
    )
    numeric_cols = [column for column in candidates if column in df5.columns]

    if not numeric_cols:
        return placeholder(
            'Error: No numeric columns available for correlation.',
            "Please check the dataset columns (e.g., book_average_rating, count_books, book_ratings_count, sale_price, units_sold, publishing_year).",
        )

    # Use only rows that are complete across the selected columns
    df5_numeric = df5[numeric_cols].dropna()
    if df5_numeric.empty:
        return placeholder(
            'Error: No valid data after removing NaN values.',
            "All rows contain NaN values for the selected columns.",
        )

    fig = px.imshow(
        df5_numeric.corr(),
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title='Correlation of Variables from Reviews Comparations',
    )
    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 60, 'b': 40},
        font={'size': 12},
        coloraxis_colorbar_title="Correlation",
    )
    return fig

# Chart 4 (Books per Year)
@app5.callback(
    Output('chart-4', 'figure'),
    [Input('duplicate-filter-4', 'value')]
)
def update_chart_4(duplicate_option):
    """Return a line chart with the number of books per publishing year.

    Rows without a publishing year are ignored. ``duplicate_option`` equal to
    'book_name' keeps only the first remaining row per book name. A
    placeholder figure with a message is returned when the 'book_name' column
    is missing or no rows remain.
    """
    def placeholder(title, message):
        # Blank figure with hidden axes that only shows a centred message
        blank = px.scatter(title=title)
        blank.update_layout(
            annotations=[{
                'text': message,
                'x': 0.5,
                'y': 0.5,
                'showarrow': False,
                'xref': "paper",
                'yref': "paper",
                'font': {'size': 14},
            }],
            xaxis={'visible': False},
            yaxis={'visible': False},
        )
        return blank

    books = df5.copy()

    # Check that the 'book_name' column exists
    if 'book_name' not in books.columns:
        return placeholder('Error: Book Name column not found', "Column 'book_name' not found in the dataset.")

    # Drop rows without a year first, then de-duplicate what is left
    books = books[books['publishing_year'].notna()]
    if duplicate_option == 'book_name':
        books = books.drop_duplicates(subset=['book_name'])

    data = books.groupby('publishing_year').size().reset_index(name='count')
    if data.empty:
        return placeholder('No data available for the selected filters', "No data available after filtering.")

    # Readable axis labels, also used to build the title
    x_label, y_label = 'Year', 'Number of Books'

    fig = px.line(
        data,
        x='publishing_year',
        y='count',
        title=f'{y_label} by {x_label}',
        markers=True,
        color_discrete_sequence=['darkred'],
        labels={'publishing_year': x_label, 'count': y_label},
    )
    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False,
    )
    return fig

# Chart 5 (Price vs Units Sold)
@app5.callback(
    Output('chart-5', 'figure'),
    [Input('duplicate-filter-5', 'value')]
)
def update_chart_5(duplicate_option):
    """Return a scatter plot of sale price against units sold.

    ``duplicate_option`` equal to 'book_name' keeps only the first row per
    book name. Only rows with both a price and a units-sold value are plotted.
    A placeholder figure with a message is returned when the 'book_name'
    column is missing or no complete rows remain.
    """
    def placeholder(title, message):
        # Blank figure with hidden axes that only shows a centred message
        blank = px.scatter(title=title)
        blank.update_layout(
            annotations=[{
                'text': message,
                'x': 0.5,
                'y': 0.5,
                'showarrow': False,
                'xref': "paper",
                'yref': "paper",
                'font': {'size': 14},
            }],
            xaxis={'visible': False},
            yaxis={'visible': False},
        )
        return blank

    books = df5.copy()

    # Check that the 'book_name' column exists
    if 'book_name' not in books.columns:
        return placeholder('Error: Book Name column not found', "Column 'book_name' not found in the dataset.")

    if duplicate_option == 'book_name':
        books = books.drop_duplicates(subset=['book_name'])

    # Keep only rows where both plotted values are present
    price_and_units = books[['sale_price', 'units_sold']].dropna()
    if price_and_units.empty:
        return placeholder(
            'No data available for Sale Price vs Units Sold',
            "No valid data available after filtering.",
        )

    fig = px.scatter(
        price_and_units,
        x='sale_price',
        y='units_sold',
        title='Sale Price vs Units Sold',
        color_discrete_sequence=['darkred'],
        labels={'sale_price': 'Sale Price ($)', 'units_sold': 'Units Sold'},
        opacity=0.6,
    )
    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False,
    )
    return fig

# Chart 6 (Revenue by Publisher)
@app5.callback(
    Output('chart-6', 'figure'),
    [Input('duplicate-filter-6', 'value')]
)
def update_chart_6(duplicate_option):
    """Return a horizontal bar chart of the ten publishers with most revenue.

    Args:
        duplicate_option: 'book_name' keeps only the first row per book name
            before summing; any other value keeps every row.

    Returns:
        A Plotly figure; a placeholder with a message when the 'book_name'
        column is missing or no numeric revenue is available.
    """
    def message_figure(title, message):
        # Blank figure with hidden axes that only shows a centred message
        placeholder = px.scatter(title=title)
        placeholder.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return placeholder

    # Check that the 'book_name' column exists
    if 'book_name' not in df5.columns:
        return message_figure('Error: Book Name column not found', "Column 'book_name' not found in the dataset.")

    # Nothing below writes to the global frame, so it is used without copying
    filtered_df = df5

    if duplicate_option == 'book_name':
        filtered_df = filtered_df.drop_duplicates(subset=['book_name'])

    # 'publisher_revenue' is not coerced where the other numeric columns are,
    # so coerce it here; summing a text-typed column would concatenate strings.
    # This is a no-op when the column is already numeric.
    revenue = pd.to_numeric(filtered_df['publisher_revenue'], errors='coerce')
    no_revenue_figure_args = (
        'No data available for Revenue by Publisher',
        "No valid revenue data available after filtering."
    )
    if not revenue.notna().any():
        return message_figure(*no_revenue_figure_args)

    data_revenue = (
        filtered_df.assign(publisher_revenue=revenue)
        .groupby('publisher')['publisher_revenue']
        .sum()
        .reset_index()
    )
    data_revenue = data_revenue.sort_values('publisher_revenue', ascending=False).head(10)
    if data_revenue.empty:
        return message_figure(*no_revenue_figure_args)

    fig = px.bar(
        data_revenue,
        x='publisher_revenue',
        y='publisher',
        title='Revenue by Publisher (Top 10)',
        color_discrete_sequence=['darkred'],
        labels={'publisher_revenue': 'Revenue ($)', 'publisher': 'Publisher'},
        orientation='h'
    )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Run the app: start the app5 server on port 8054 with debug mode enabled,
# but only when this code is executed as the main module.
if __name__ == '__main__':
    app5.run(debug=True, port=8054)

### Dataset 6 - Amazon Kindle Books Dataset 2023

In [36]:
# Remove stray leading/trailing whitespace from the column names.
df6.columns = df6.columns.str.strip()

# Convert data types; values that cannot be parsed become NaT / NaN.
df6['publishedDate'] = pd.to_datetime(df6['publishedDate'], errors='coerce')
df6['year'] = df6['publishedDate'].dt.year
df6['reviews'] = pd.to_numeric(df6['reviews'], errors='coerce')
df6['stars'] = pd.to_numeric(df6['stars'], errors='coerce')
df6['price'] = pd.to_numeric(df6['price'], errors='coerce')

# Drop the rows whose author (compared case-insensitively) is one of these names.
df6 = df6.loc[
    ~df6['author'].str.lower().isin(['captivating history', 'hourly history'])
]

df6.columns


Index(['asin', 'title', 'author', 'soldBy', 'imgUrl', 'productURL', 'stars',
       'reviews', 'price', 'isKindleUnlimited', 'category_id', 'isBestSeller',
       'isEditorsPick', 'isGoodReadsChoice', 'publishedDate', 'category_name',
       'year'],
      dtype='object')

#### Interactive dashboard

In [37]:
# Initialise the Dash app. The stylesheet is inline CSS passed as a data: URI
# (hover rules for the .sidebar and .button classes). Callback exceptions are
# suppressed because the charts and filters are inserted into 'content' by a
# callback and are therefore absent from the initial layout.
app6 = dash.Dash(
    __name__,
    external_stylesheets=['data:text/css;charset=utf-8,.sidebar:hover { opacity: 1; } .button:hover { opacity: 0.8; }'],
    suppress_callback_exceptions=True
)

# Page-wide style
style = dict(
    fontFamily='Arial',
    padding='10px',
    backgroundColor='#f8f9fa',
    color='#333'
)

# Sidebar style (fixed on the left, semi-transparent)
sidebar_style = dict(
    width='20%',
    backgroundColor='#e9ecef',
    padding='10px',
    borderRadius='5px',
    opacity='0.7',
    transition='opacity 0.3s',
    position='fixed',
    left='10px',
    top='150px'
)

# Button style
button_style = dict(
    backgroundColor='royalblue',
    color='white',
    border='none',
    padding='10px 20px',
    margin='5px',
    borderRadius='5px',
    cursor='pointer',
    fontSize='14px'
)

# App layout: title, one button per view, and the container that the
# content callback fills in.
app6.layout = html.Div(
    children=[
        html.H1(
            "Analysis of Amazon Kindle Books’23 Dataset",
            style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'}
        ),

        # Buttons at the top
        html.Div(
            [
                html.Button(caption, id=component_id, style=button_style, className='button')
                for component_id, caption in (
                    ('button-1', "Books Distribution"),
                    ('button-2', "Reviews Comparations"),
                    ('button-3', "Rating Correlation Heatmap"),
                    ('button-4', "Books per Year"),
                )
            ],
            style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'}
        ),

        # Dynamic content
        html.Div(id='content')
    ],
    style=style
)

# Callback that swaps the page content
@app6.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-2', 'n_clicks'), Input('button-3', 'n_clicks'), Input('button-4', 'n_clicks')]
)
def update_content(btn1, btn2, btn3, btn4):
    """Return the sidebar and graph of the view whose button fired the callback.

    The four ``n_clicks`` arguments are not read; the triggering button is
    taken from ``dash.callback_context``. When nothing triggered the callback,
    the 'button-1' view is returned.

    Returns:
        A two-element list: the sidebar ``html.Div`` and the ``dcc.Graph``.
    """
    def _sidebar(children):
        return html.Div(children, style=sidebar_style, className='sidebar')

    def _graph(graph_id):
        return dcc.Graph(id=graph_id, style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})

    def _y_axis_controls(dropdown_id, options):
        # The first option is the pre-selected one.
        return [
            html.Label("Y-Axis:", style={'fontSize': '14px'}),
            dcc.Dropdown(
                id=dropdown_id,
                options=options,
                value=options[0]['value'],
                style={'width': '100%'}
            ),
        ]

    def _duplicate_controls(dropdown_id, label_style):
        return [
            html.Label("Count Duplicates:", style=label_style),
            dcc.Dropdown(
                id=dropdown_id,
                options=[
                    {'label': 'With Duplicates', 'value': 'with'},
                    {'label': 'Without Duplicates (Title)', 'value': 'title'}
                ],
                value='with',
                style={'width': '100%'}
            ),
        ]

    ctx = dash.callback_context
    button_id = ctx.triggered[0]['prop_id'].split('.')[0] if ctx.triggered else 'button-1'

    if button_id == 'button-1':
        controls = _y_axis_controls('y-axis-filter-1', [
            {'label': 'Author', 'value': 'author'},
            {'label': 'Seller/Publisher', 'value': 'soldBy'},
            {'label': 'Main Category', 'value': 'category_name'},
            {'label': 'Price', 'value': 'price'},
            {'label': 'Is Kindle Unlimited', 'value': 'isKindleUnlimited'},
            {'label': 'Is Best Seller', 'value': 'isBestSeller'},
            {'label': 'Is Editors Pick', 'value': 'isEditorsPick'},
            {'label': 'Is GoodReads Choice', 'value': 'isGoodReadsChoice'}
        ])
        controls += _duplicate_controls('duplicate-filter-1', {'fontSize': '14px', 'marginTop': '10px'})
        return [_sidebar(controls), _graph('chart-1')]

    if button_id == 'button-2':
        controls = _y_axis_controls('y-axis-filter-2', [
            {'label': 'Number of Reviews', 'value': 'reviews'},
            {'label': 'Price', 'value': 'price'},
            {'label': 'Year', 'value': 'year'},
        ])
        controls += _duplicate_controls('duplicate-filter-2', {'fontSize': '14px', 'marginTop': '10px'})
        return [_sidebar(controls), _graph('chart-2')]

    if button_id == 'button-3':
        # The heatmap has no filters, so its sidebar is empty.
        return [_sidebar([]), _graph('chart-3')]

    # Any other trigger shows the 'Books per Year' view.
    return [
        _sidebar(_duplicate_controls('duplicate-filter-4', {'fontSize': '14px'})),
        _graph('chart-4')
    ]

# Callbacks that update the charts
# Chart 1
@app6.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Build the 'Books Distribution' chart for the Kindle dataset (df6).

    Args:
        y_axis: Column selected in the 'y-axis-filter-1' dropdown, or None
            when the dropdown has been cleared.
        duplicate_option: Value of the 'duplicate-filter-1' dropdown. When it
            equals 'title', only the first row of each title is kept.

    Returns:
        For 'price', a scatter of how many books share each price. For any
        other column, a bar chart of value counts (top 10 for author, soldBy
        and category_name). When the request cannot be served, an axis-less
        placeholder figure carrying an explanatory annotation.
    """
    def _message_figure(title, message):
        # Empty figure whose only content is a centred message.
        notice = px.scatter(title=title)
        notice.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False,
                     xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return notice

    # dcc.Dropdown can be cleared by the user, in which case its value is
    # None; without this guard the column lookup below would raise.
    if y_axis is None:
        return _message_figure('No variable selected',
                               "Please select a Y-axis variable.")

    filtered_df = df6.copy()

    # Check that the 'title' column exists
    if 'title' not in filtered_df.columns:
        return _message_figure('Error: Title column not found',
                               "Column 'title' not found in the dataset.")

    # Same convention for the selected column, instead of an uncaught KeyError.
    if y_axis not in filtered_df.columns:
        return _message_figure('Error: Selected column not found',
                               f"Column '{y_axis}' not found in the dataset.")

    # Handle duplicates
    if duplicate_option == 'title':
        filtered_df = filtered_df.drop_duplicates(subset=['title'])

    if y_axis == 'price':
        # Work on the non-null price Series directly. Assigning a converted
        # column back into a boolean-filtered frame can trigger pandas'
        # SettingWithCopyWarning.
        try:
            prices = pd.to_numeric(
                filtered_df.loc[filtered_df['price'].notna(), 'price']
            )
        except ValueError as e:
            return _message_figure(
                'Error processing price data',
                f"Error: Non-numeric values in 'price' ({str(e)})."
            )

        if prices.empty:
            return _message_figure('No valid price data available',
                                   "No valid price data after filtering.")

        # Count the number of books at each price
        price_counts = prices.value_counts().reset_index()
        price_counts.columns = ['price', 'count']

        fig = px.scatter(
            price_counts,
            x='count',
            y='price',
            title='Number of Books by Price (Scatter Plot)',
            labels={'count': 'Number of Books', 'price': 'Price ($)'},
            color_discrete_sequence=['royalblue'],
            opacity=0.6,
            size_max=10
        )
        fig.update_traces(marker=dict(size=8))  # Fixed marker size
    else:
        # Categorical variables: count the occurrences of each value
        data = filtered_df[y_axis].value_counts().reset_index()
        data.columns = [y_axis, 'count']

        # Limit author, soldBy and category_name to the 10 most frequent values
        if y_axis in ['author', 'soldBy', 'category_name']:
            data = data.head(10)

        # Show the boolean flags as the strings 'True' / 'False'
        if y_axis in ['isKindleUnlimited', 'isBestSeller', 'isEditorsPick', 'isGoodReadsChoice']:
            data[y_axis] = data[y_axis].map({True: 'True', False: 'False'})

        # Readable names for the title and axis label
        y_axis_labels = {
            'author': 'Author',
            'soldBy': 'Seller/Publisher',
            'category_name': 'Main Category',
            'price': 'Price ($)',
            'isKindleUnlimited': 'Is Kindle Unlimited',
            'isBestSeller': 'Is Best Seller',
            'isEditorsPick': 'Is Editors Pick',
            'isGoodReadsChoice': 'Is GoodReads Choice'
        }
        y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

        fig = px.bar(
            data,
            x='count',
            y=y_axis,
            title=f'{y_label} by Number of Books',
            color_discrete_sequence=['royalblue'],
            labels={'count': 'Number of Books', y_axis: y_label}
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 2
@app6.callback(
    Output('chart-2', 'figure'),
    [Input('y-axis-filter-2', 'value'), Input('duplicate-filter-2', 'value')]
)
def update_chart_2(y_axis, duplicate_option):
    """Build the 'Reviews Comparations' chart: a df6 variable against the star rating.

    Args:
        y_axis: Column selected in the 'y-axis-filter-2' dropdown ('reviews',
            'price' or 'year'), or None when the dropdown has been cleared.
        duplicate_option: Value of the 'duplicate-filter-2' dropdown. When it
            equals 'title', only the first row of each title is kept.

    Returns:
        For 'reviews', a line chart of the mean number of reviews per rating.
        For 'price', a scatter with a logarithmic y axis. Otherwise a scatter
        of year against rating. When the request cannot be served, an
        axis-less placeholder figure carrying an explanatory annotation.
    """
    def _message_figure(title, message):
        # Empty figure whose only content is a centred message.
        notice = px.scatter(title=title)
        notice.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False,
                     xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return notice

    # dcc.Dropdown can be cleared by the user, in which case its value is
    # None; without this guard y_axis.replace(...) below would raise.
    if y_axis is None:
        return _message_figure('No variable selected',
                               "Please select a Y-axis variable.")

    filtered_df = df6.copy()

    # Check that the 'title' column exists
    if 'title' not in filtered_df.columns:
        return _message_figure('Error: Title column not found',
                               "Column 'title' not found in the dataset.")

    # Handle duplicates
    if duplicate_option == 'title':
        filtered_df = filtered_df.drop_duplicates(subset=['title'])

    # Rows without a rating cannot be placed on the x axis
    filtered_df = filtered_df[filtered_df['stars'].notna()]

    # Readable names for the title and axis label
    y_axis_labels = {
        'reviews': 'Number of Reviews',
        'price': 'Price ($)',
        'year': 'Year'
    }
    y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

    if y_axis == 'reviews':
        # Mean number of reviews for each rating value
        data = filtered_df.groupby('stars')[y_axis].mean().reset_index()
        if data.empty:
            return _message_figure('No data available for the selected filters',
                                   "No data available after filtering.")
        fig = px.line(
            data,
            x='stars',
            y=y_axis,
            title=f'{y_label} by Average Rating',
            markers=True,
            color_discrete_sequence=['royalblue'],
            labels={'stars': 'Average Rating', y_axis: y_label}
        )
    else:
        # 'price' and 'year' share one scatter layout; any value other than
        # 'price' is plotted as 'year', as before.
        column = 'price' if y_axis == 'price' else 'year'
        data = filtered_df[['stars', column]].dropna()
        if data.empty:
            empty_text = (
                "No valid prices available after filtering."
                if column == 'price'
                else "No valid years available after filtering."
            )
            return _message_figure('No data available for the selected filters',
                                   empty_text)
        fig = px.scatter(
            data,
            x='stars',
            y=column,
            title=f'{y_label} by Average Rating',
            color_discrete_sequence=['royalblue'],
            labels={'stars': 'Average Rating', column: y_label},
            opacity=0.6
        )
        if column == 'price':
            # NOTE(review): a logarithmic axis cannot place zero or negative
            # prices - confirm that leaving free books out of this view is
            # intended.
            fig.update_yaxes(type='log', title_text=y_label)

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 3 (Rating Correlation Heatmap)
@app6.callback(
    Output('chart-3', 'figure'),
    [Input('button-3', 'n_clicks')]
)
def update_chart_3(n_clicks):
    """Build the correlation heatmap of the numeric df6 columns.

    Args:
        n_clicks: Click count of 'button-3'; it only triggers the callback
            and is not read.

    Returns:
        A heatmap of the pairwise correlations between whichever of stars,
        reviews, price and year exist in df6, computed on rows with no
        missing value in those columns; or an axis-less placeholder figure
        when there is nothing to correlate.
    """
    def _placeholder(title, message):
        # Empty figure whose only content is a centred message.
        notice = px.scatter(title=title)
        notice.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False,
                     xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return notice

    # Keep the candidate columns that are present, in this fixed order.
    numeric_cols = [
        name for name in ('stars', 'reviews', 'price', 'year')
        if name in df6.columns
    ]
    if not numeric_cols:
        return _placeholder(
            'Error: No numeric columns available for correlation.',
            "Please check the dataset columns (e.g., stars, reviews, price, year)."
        )

    # Only rows that are complete across all selected columns are used.
    complete_rows = df6[numeric_cols].dropna()
    if complete_rows.empty:
        return _placeholder(
            'Error: No valid data after removing NaN values.',
            "All rows contain NaN values for the selected columns."
        )

    fig = px.imshow(
        complete_rows.corr(),
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',
        title='Correlation of Variables from Reviews Comparations'
    )
    fig.update_layout(
        margin=dict(l=40, r=40, t=60, b=40),
        font=dict(size=12),
        coloraxis_colorbar_title="Correlation"
    )
    return fig

# Chart 4 (Books per Year)
@app6.callback(
    Output('chart-4', 'figure'),
    [Input('duplicate-filter-4', 'value')]
)
def update_chart_4(duplicate_option):
    """Build the 'Number of Books by Year' line chart from df6.

    Args:
        duplicate_option: Value of the 'duplicate-filter-4' dropdown. When it
            equals 'title', only the first row of each title is kept.

    Returns:
        The line figure, or an axis-less placeholder figure carrying an
        explanatory annotation when the data cannot be plotted.
    """
    def _placeholder(title, message):
        # Empty figure whose only content is a centred message.
        notice = px.scatter(title=title)
        notice.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False,
                     xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return notice

    books = df6.copy()

    # Check that the 'title' column exists
    if 'title' not in books.columns:
        return _placeholder('Error: Title column not found',
                            "Column 'title' not found in the dataset.")

    # Rows without a year cannot be counted
    has_year = books['year'].notna()
    books = books[has_year]

    # Handle duplicates
    if duplicate_option == 'title':
        books = books.drop_duplicates(subset=['title'])

    # Number of rows per year
    books_per_year = books.groupby('year').size().reset_index(name='count')
    if books_per_year.empty:
        return _placeholder('No data available for the selected filters',
                            "No data available after filtering.")

    x_label, y_label = 'Year', 'Number of Books'

    fig = px.line(
        books_per_year,
        x='year',
        y='count',
        title=f'{y_label} by {x_label}',
        markers=True,
        color_discrete_sequence=['royalblue'],
        labels={'year': x_label, 'count': y_label}
    )
    fig.update_layout(
        margin=dict(l=40, r=40, t=40, b=40),
        font=dict(size=12),
        showlegend=False
    )
    return fig

# Run the app: start the app6 server on port 8055 with debug mode enabled,
# but only when this code is executed as the main module.
if __name__ == '__main__':
    app6.run(debug=True, port=8055)

### Dataset 7 - WonderBooks

In [38]:
# Remove stray leading/trailing whitespace from the column names.
df7.columns = df7.columns.str.strip()

# Convert data types. errors='coerce' turns values that cannot be parsed into
# NaN / NaT instead of raising, which is how the other datasets in this
# notebook are converted; a single malformed price no longer aborts the cell.
df7['price_clean'] = pd.to_numeric(df7['price_clean'], errors='coerce')
df7['publication_date'] = pd.to_datetime(df7['publication_date'], errors='coerce')
df7['year'] = df7['publication_date'].dt.year

# Drop the rows whose cleaned author (compared case-insensitively) is one of
# these names.
df7 = df7[~df7['authors_clean'].str.lower().isin(['"better homes and gardens"', 'time-life books', "reader's digest association", 'by', 'unknown'])]

df7.columns

Index(['Title', 'Authors', 'Description', 'Category', 'Publisher',
       'Publish Date', 'Price', 'publication_date', 'price_clean',
       'authors_clean', 'publisher_standard', 'year'],
      dtype='object')

#### Interactive Dashboard

In [39]:
# Initialise the Dash app. The stylesheet is inline CSS passed as a data: URI
# (hover rules for the .sidebar and .button classes). Callback exceptions are
# suppressed because the charts and filters are inserted into 'content' by a
# callback and are therefore absent from the initial layout.
app7 = dash.Dash(
    __name__,
    external_stylesheets=['data:text/css;charset=utf-8,.sidebar:hover { opacity: 1; } .button:hover { opacity: 0.8; }'],
    suppress_callback_exceptions=True
)

# Page-wide style
style = dict(
    fontFamily='Arial',
    padding='10px',
    backgroundColor='#f8f9fa',
    color='#333'
)

# Sidebar style (fixed on the left, semi-transparent)
sidebar_style = dict(
    width='20%',
    backgroundColor='#e9ecef',
    padding='10px',
    borderRadius='5px',
    opacity='0.7',
    transition='opacity 0.3s',
    position='fixed',
    left='10px',
    top='150px'
)

# Button style
button_style = dict(
    backgroundColor='purple',
    color='white',
    border='none',
    padding='10px 20px',
    margin='5px',
    borderRadius='5px',
    cursor='pointer',
    fontSize='14px'
)

# App layout: title, one button per view, and the container that the
# content callback fills in.
app7.layout = html.Div(
    children=[
        html.H1(
            "Analysis of Wonderbooks Dataset",
            style={'textAlign': 'center', 'fontSize': '24px', 'margin': '10px'}
        ),

        # Buttons at the top
        html.Div(
            [
                html.Button(caption, id=component_id, style=button_style, className='button')
                for component_id, caption in (
                    ('button-1', "Books Distribution"),
                    ('button-3', "Books per Year"),
                )
            ],
            style={'display': 'flex', 'justifyContent': 'center', 'gap': '10px', 'marginBottom': '20px'}
        ),

        # Dynamic content
        html.Div(id='content')
    ],
    style=style
)

# Callback that swaps the page content
@app7.callback(
    Output('content', 'children'),
    [Input('button-1', 'n_clicks'), Input('button-3', 'n_clicks')]
)
def update_content(btn1, btn3):
    """Return the sidebar and graph of the view whose button fired the callback.

    The two ``n_clicks`` arguments are not read; the triggering button is
    taken from ``dash.callback_context``. When nothing triggered the callback,
    the 'button-1' view is returned.

    Returns:
        A two-element list (sidebar ``html.Div`` and ``dcc.Graph``), or None
        when the trigger is neither 'button-1' nor 'button-3'.
    """
    def _sidebar(children):
        return html.Div(children, style=sidebar_style, className='sidebar')

    def _graph(graph_id):
        return dcc.Graph(id=graph_id, style={'height': '400px', 'width': '75%', 'marginLeft': '22%'})

    def _duplicate_controls(dropdown_id, label_style):
        return [
            html.Label("Count Duplicates:", style=label_style),
            dcc.Dropdown(
                id=dropdown_id,
                options=[
                    {'label': 'With Duplicates', 'value': 'with'},
                    {'label': 'Without Duplicates (Title)', 'value': 'title'}
                ],
                value='with',
                style={'width': '100%'}
            ),
        ]

    ctx = dash.callback_context
    button_id = ctx.triggered[0]['prop_id'].split('.')[0] if ctx.triggered else 'button-1'

    if button_id == 'button-1':
        dropdown_options = [
            {'label': 'Author', 'value': 'authors_clean'},
            {'label': 'Publisher', 'value': 'publisher_standard'},
            {'label': 'Categories', 'value': 'Category'}
        ]
        # The price option is offered only when the column is available.
        if 'price_clean' in df7.columns.tolist():
            dropdown_options.append({'label': 'Price', 'value': 'price_clean'})
        else:
            print("Coluna 'price_clean' não encontrada no df7.")

        y_axis_controls = [
            html.Label("Y-Axis:", style={'fontSize': '14px'}),
            dcc.Dropdown(
                id='y-axis-filter-1',
                options=dropdown_options,
                value='authors_clean',
                style={'width': '100%'}
            ),
        ]
        duplicate_controls = _duplicate_controls(
            'duplicate-filter-1', {'fontSize': '14px', 'marginTop': '10px'}
        )
        return [_sidebar(y_axis_controls + duplicate_controls), _graph('chart-1')]

    if button_id == 'button-3':
        return [
            _sidebar(_duplicate_controls('duplicate-filter-3', {'fontSize': '14px'})),
            _graph('chart-3')
        ]

    return None

# Callbacks that update the charts
# Chart 1
@app7.callback(
    Output('chart-1', 'figure'),
    [Input('y-axis-filter-1', 'value'), Input('duplicate-filter-1', 'value')]
)
def update_chart_1(y_axis, duplicate_option):
    """Build the 'Books Distribution' chart for the Wonderbooks dataset (df7).

    Args:
        y_axis: Column selected in the 'y-axis-filter-1' dropdown, or None
            when the dropdown has been cleared.
        duplicate_option: Value of the 'duplicate-filter-1' dropdown. When it
            equals 'title', only the first row of each Title is kept.

    Returns:
        For 'price_clean', a scatter of how many books share each price. For
        any other column, a bar chart of value counts (top 10 for
        authors_clean, Category and publisher_standard). When the request
        cannot be served, an axis-less placeholder figure carrying an
        explanatory annotation.
    """
    def _message_figure(title, message):
        # Empty figure whose only content is a centred message.
        notice = px.scatter(title=title)
        notice.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False,
                     xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return notice

    # dcc.Dropdown can be cleared by the user, in which case its value is
    # None; without this guard the column lookup below would raise.
    if y_axis is None:
        return _message_figure('No variable selected',
                               "Please select a Y-axis variable.")

    filtered_df = df7.copy()

    # Check that the 'Title' column exists
    if 'Title' not in filtered_df.columns:
        return _message_figure('Error: Title column not found',
                               "Column 'Title' not found in the dataset.")

    # Same convention for the selected column, instead of an uncaught KeyError.
    if y_axis not in filtered_df.columns:
        return _message_figure('Error: Selected column not found',
                               f"Column '{y_axis}' not found in the dataset.")

    # Handle duplicates
    if duplicate_option == 'title':
        filtered_df = filtered_df.drop_duplicates(subset=['Title'])

    if y_axis == 'price_clean':
        # Work on the non-null price Series directly. Assigning a converted
        # column back into a boolean-filtered frame can trigger pandas'
        # SettingWithCopyWarning.
        try:
            prices = pd.to_numeric(
                filtered_df.loc[filtered_df['price_clean'].notna(), 'price_clean']
            )
        except ValueError as e:
            return _message_figure(
                'Error processing price data',
                f"Error: Non-numeric values in 'price_clean' ({str(e)})."
            )

        if prices.empty:
            return _message_figure('No valid price data available',
                                   "No valid price data after filtering.")

        # Count the number of books at each price
        price_counts = prices.value_counts().reset_index()
        price_counts.columns = ['price_clean', 'count']

        fig = px.scatter(
            price_counts,
            x='count',
            y='price_clean',
            title='Number of Books by Price (Scatter Plot)',
            labels={'count': 'Number of Books', 'price_clean': 'Price ($)'},
            color_discrete_sequence=['purple'],
            opacity=0.6,
            size_max=10
        )
        fig.update_traces(marker=dict(size=8))  # Fixed marker size
    else:
        # Categorical variables: count the occurrences of each value
        data = filtered_df[y_axis].value_counts().reset_index()
        data.columns = [y_axis, 'count']

        # Limit authors_clean, Category and publisher_standard to the 10
        # most frequent values
        if y_axis in ['authors_clean', 'Category', 'publisher_standard']:
            data = data.head(10)

        # Readable names for the title and axis label (same as the dropdown)
        y_axis_labels = {
            'authors_clean': 'Author',
            'publisher_standard': 'Publisher',
            'Category': 'Categories',
            'price_clean': 'Price ($)'
        }
        y_label = y_axis_labels.get(y_axis, y_axis.replace("_", " ").capitalize())

        fig = px.bar(
            data,
            x='count',
            y=y_axis,
            title=f'{y_label} by Number of Books',
            color_discrete_sequence=['purple'],
            labels={'count': 'Number of Books', y_axis: y_label}
        )

    fig.update_layout(
        margin={'l': 40, 'r': 40, 't': 40, 'b': 40},
        font={'size': 12},
        showlegend=False
    )

    return fig

# Chart 3 (Books per Year)
@app7.callback(
    Output('chart-3', 'figure'),
    [Input('duplicate-filter-3', 'value')]
)
def update_chart_3(duplicate_option):
    """Build the 'Number of Books by Year' line chart from df7.

    Only rows with a known year no later than 2023 are counted.

    Args:
        duplicate_option: Value of the 'duplicate-filter-3' dropdown. When it
            equals 'title', only the first row of each Title is kept.

    Returns:
        The line figure, or an axis-less placeholder figure carrying an
        explanatory annotation when the data cannot be plotted.
    """
    def _placeholder(title, message):
        # Empty figure whose only content is a centred message.
        notice = px.scatter(title=title)
        notice.update_layout(
            annotations=[
                dict(text=message, x=0.5, y=0.5, showarrow=False,
                     xref="paper", yref="paper", font=dict(size=14))
            ],
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        return notice

    books = df7.copy()

    # Drop rows with a missing year and restrict to years up to 2023
    year = books['year']
    books = books[year.notna() & (year <= 2023)]

    # Check that the 'Title' column exists
    if 'Title' not in books.columns:
        return _placeholder('Error: Title column not found',
                            "Column 'Title' not found in the dataset.")

    # Handle duplicates
    if duplicate_option == 'title':
        books = books.drop_duplicates(subset=['Title'])

    # Number of rows per year
    books_per_year = books.groupby('year').size().reset_index(name='count')
    if books_per_year.empty:
        return _placeholder('No data available for the selected filters',
                            "No data available after filtering.")

    # Readable labels
    x_label, y_label = 'Year', 'Number of Books'

    fig = px.line(
        books_per_year,
        x='year',
        y='count',
        title=f'{y_label} by {x_label}',
        markers=True,
        color_discrete_sequence=['purple'],
        labels={'year': x_label, 'count': y_label}
    )
    fig.update_layout(
        margin=dict(l=40, r=40, t=40, b=40),
        font=dict(size=12),
        showlegend=False
    )
    return fig

# Run the app: start the app7 server on port 8056 with debug mode enabled,
# but only when this code is executed as the main module.
if __name__ == '__main__':
    app7.run(debug=True, port=8056)