In [3]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/45/6d/8c1d2570a52db6263d855c3ee3daf8f4bdf4a365cd6610772d6fce5fd904/xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata
  Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [18]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import timedelta
from sklearn.metrics import mean_squared_error
from dash import dcc, html, Input, Output, State
import dash
import dash_bootstrap_components as dbc
import plotly.graph_objs as go
import plotly.express as px

# Load the merged dataset
merged_dataset_path = 'merged_final_dataset.csv'
merged_df = pd.read_csv(merged_dataset_path)

# Load the datasets for other features
file_path = 'final_dataset.csv'
df = pd.read_csv(file_path)

# Convert date columns to datetime
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'], errors='coerce')

# Calculate delivery time in days
df['delivery_time'] = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.days

# Filter out invalid data
df = df[df['delivery_time'].notna() & df['review_score'].notna()]

# Calculate average delivery time and rating by state
state_summary = df.groupby('customer_state').agg({'delivery_time': 'mean', 'review_score': 'mean'}).reset_index()

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Sidebar layout
sidebar = html.Div(
    [
        html.H2("Olist Consulting", className="display-6"),
        html.Hr(),
        dbc.Nav(
            [
                dbc.NavLink("Demand Forecast", href="/", active="exact"),
                dbc.NavLink("Rating and Delivery Time", href="/rating-delivery", active="exact"),
                dbc.NavLink("Seller Analysis", href="/seller-analysis", active="exact"),
            ],
            vertical=True,
            pills=True,
        ),
    ],
    style={
        "position": "fixed",
        "top": 0,
        "left": 0,
        "bottom": 0,
        "width": "16rem",
        "padding": "2rem 1rem",
        "background-color": "#e0f2f1",
    },
)

top_bar = html.Div(
    [
        html.Div(
            [
                html.A(
                    href="https://olist.com/",
                    children=[
                        html.Img(
                            src="/mariafernandezgonzalez/Descargas/channels4_profile.jpeg",  
                            style={"width": "8%", "height": "auto"},
                        )
                    ],
                    style={"display": "flex", "justify-content": "flex-end", "align-items": "flex-start", "width": "98%", "padding-top": "1rem"},
                )
            ],
            style={
                "display": "flex",
                "justify-content": "flex-end",
                "align-items": "flex-start",
                "width": "98%",
                "padding-top": "1rem",
            }
        ),
    ],
    style={
        "height": "4rem",
        "background-color": "#e0f2f1",
        "margin-left": "0rem",
        "position": "fixed",
        "top": 0,
        "right": 0,
        "left": "16rem",
        "z-index": 100,
    }
)

# Demand Forecast Analysis Layout
state_dropdown = dcc.Dropdown(
    id='state-dropdown',
    options=[{'label': state, 'value': state} for state in df['customer_state'].unique()],
    placeholder="Select a customer state",
)

category_dropdown = dcc.Dropdown(
    id='category-dropdown',
    options=[{'label': category, 'value': category} for category in df['product_category_name_english'].unique()],
    placeholder="Select a product category",
)

forecast_option = dcc.RadioItems(
    id='forecast-option',
    options=[
        {'label': 'Only by Category', 'value': 'category'},
        {'label': 'Only by State', 'value': 'state'},
        {'label': 'By Both State and Category', 'value': 'both'}
    ],
    value='state'
)

go_button = html.Button('Go', id='go-button', n_clicks=0)

analysis_content = html.Div(id='analysis-content')

# Layout for Demand Forecast Analysis
analysis_content_layout = html.Div(
    [
        html.H1("Demand Forecast Analysis"),
        state_dropdown,
        category_dropdown,
        forecast_option,
        go_button,
        analysis_content
    ],
    id="analysis-content-layout",
    style={
        "margin-left": "8rem",
        "margin-top": "1rem",
        "padding": "1rem",
    },
)

# Rating and Delivery Time Analysis Layout
rating_delivery_layout = html.Div(
    [
        html.H1("Rating and Delivery Time Analysis"),
        dcc.Dropdown(
            id='metric-dropdown',
            options=[
                {'label': 'Delivery Time', 'value': 'delivery_time'},
                {'label': 'Rating', 'value': 'review_score'}
            ],
            value='delivery_time',
            clearable=False
        ),
        dcc.Graph(id='choropleth-map', style={'height': '800px', 'width': '100%'})
    ],
    id="rating-delivery-layout",
    style={
        "margin-left": "8rem",
        "margin-top": "1rem",
        "padding": "1rem",
    },
)

# Seller Analysis Layout
seller_analysis_layout = html.Div(
    [
        html.H1("Seller Analysis"),
        dcc.Dropdown(
            id='seller-metric-dropdown',
            options=[
                {'label': 'Revenue', 'value': 'revenue_final'},
                {'label': 'Delivery Time', 'value': 'delivery_time_final'},
                {'label': 'Rating', 'value': 'avg_rating'}
            ],
            placeholder="Select a metric to filter",
            clearable=True
        ),
        dcc.Dropdown(
            id='state-filter-dropdown',
            options=[{'label': state, 'value': state} for state in merged_df['customer_state_summary'].unique()],
            placeholder="Select a customer state",
            clearable=True
        ),
        dcc.Dropdown(
            id='category-filter-dropdown',
            options=[{'label': category, 'value': category} for category in merged_df['product_category_name_english_summary'].unique()],
            placeholder="Select a product category",
            clearable=True
        ),
        html.Button('Go', id='seller-go-button', n_clicks=0),
        dcc.Graph(id='seller-scatter-plot', style={'height': '800px', 'width': '100%'}),
        html.H2("Top 5 Sellers Ranking"),
        html.Div(id='top-sellers-ranking')
    ],
    id="seller-analysis-layout",
    style={
        "margin-left": "8rem",
        "margin-top": "1rem",
        "padding": "1rem",
    },
)

# App layout
app.layout = html.Div(
    [
        dcc.Location(id="url"),
        sidebar,
        html.Div(id="page-content", style={"margin-left": "8rem", "padding": "1rem"})
    ]
)

# Callbacks for URL Routing and Scatter Plot
@app.callback(
    Output("page-content", "children"),
    [Input("url", "pathname")]
)
def display_page(pathname):
    if pathname == "/rating-delivery":
        return rating_delivery_layout
    elif pathname == "/seller-analysis":
        return seller_analysis_layout
    return analysis_content_layout

# Callback for Demand Forecast Analysis
def prepare_data(data, selection_type, customer_state=None, product_category=None):
    if selection_type == 'state':
        df = data[data['customer_state'] == customer_state].copy()
    elif selection_type == 'category':
        df = data[data['product_category_name_english'] == product_category].copy()
    elif selection_type == 'both':
        df = data[(data['customer_state'] == customer_state) & (data['product_category_name_english'] == product_category)].copy()
    else:
        raise ValueError("Invalid selection_type. Choose from 'state', 'category', or 'both'.")
    
    df = df.set_index('order_purchase_timestamp').resample('D').size().reset_index(name='demand')
    return df

def analyze_orders(selection_type, state=None, category=None):
    df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
    cutoff_date = pd.to_datetime('2018-07-31')
    df_filtered = df[df['order_purchase_timestamp'] <= cutoff_date]

    prepared_df = prepare_data(df_filtered, selection_type, state, category)
    prepared_df = prepared_df.sort_values('order_purchase_timestamp')

    train = prepared_df.iloc[:-21].copy()
    test = prepared_df.iloc[-21:].copy()

    def create_features(df):
        df = df.copy()
        df['day_of_week'] = df['order_purchase_timestamp'].dt.dayofweek
        df['day_of_month'] = df['order_purchase_timestamp'].dt.day
        df['week_of_year'] = df['order_purchase_timestamp'].dt.isocalendar().week
        df['month'] = df['order_purchase_timestamp'].dt.month
        return df

    train = create_features(train)
    test = create_features(test)

    X_train = train.drop(['order_purchase_timestamp', 'demand'], axis=1)
    y_train = train['demand']
    X_test = test.drop(['order_purchase_timestamp', 'demand'], axis=1)
    y_test = test['demand']

    model = xgb.XGBRegressor(objective='reg:squarederror')
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    start_date = test['order_purchase_timestamp'].min() - timedelta(days=30)
    plot_data = prepared_df[(prepared_df['order_purchase_timestamp'] >= start_date) & (prepared_df['order_purchase_timestamp']
        .max())]

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=plot_data['order_purchase_timestamp'], y=plot_data['demand'], mode='lines', name='Historical'))
    fig.add_trace(go.Scatter(x=test['order_purchase_timestamp'], y=y_test, mode='lines', name='Test'))
    fig.add_trace(go.Scatter(x=test['order_purchase_timestamp'], y=preds, mode='lines', name='Forecast'))
    fig.add_trace(go.Scatter(x=test['order_purchase_timestamp'], y=lower_bounds, mode='lines', line=dict(color='gray', width=0.5), name='Lower Bound'))
    fig.add_trace(go.Scatter(x=test['order_purchase_timestamp'], y=upper_bounds, mode='lines', line=dict(color='gray', width=0.5), name='Upper Bound'))

    fig.update_layout(
        title='Demand Forecast',
        xaxis_title='Date',
        yaxis_title='Demand',
        showlegend=True,
        hovermode='x'
    )

    return fig, rmse, results_filtered

@app.callback(
    Output('analysis-content', 'children'),
    [Input('go-button', 'n_clicks')],
    [State('state-dropdown', 'value'), State('category-dropdown', 'value'), State('forecast-option', 'value')]
)
def update_analysis(n_clicks, state, category, forecast_option):
    if n_clicks > 0:
        plot_figure, rmse, forecast_comparison = analyze_orders(forecast_option, state, category)

        plot_figure.update_layout(height=600)

        plot_div = dcc.Graph(figure=plot_figure)

        rmse_text = html.P(f"Root Mean Square Error (RMSE): {rmse}")
        forecast_table = dbc.Table.from_dataframe(forecast_comparison, striped=True, bordered=True, hover=True)

        return [plot_div, rmse_text, forecast_table]
    return html.P("Select options and click 'Go' to run the analysis.")

# Callback for Rating and Delivery Time Analysis
@app.callback(
    Output('choropleth-map', 'figure'),
    [Input('metric-dropdown', 'value')]
)
def update_choropleth(selected_metric):
    if selected_metric == 'delivery_time':
        color_scale = 'Reds'
        color_label = 'Avg Delivery Time (days)'
    else:
        color_scale = 'Blues'
        color_label = 'Avg Rating'
    
    fig = px.choropleth(
        state_summary,
        geojson="https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson",
        locations='customer_state',
        featureidkey="properties.sigla",
        hover_name='customer_state',
        color=selected_metric,
        color_continuous_scale=color_scale,
        labels={selected_metric: color_label},
        hover_data={
            'delivery_time': True,
            'review_score': True,
            'customer_state': False
        },
        title=f'Average {color_label} by State'
    )
    
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        clickmode='event+select',
        autosize=True,
        width=1000,
        height=600,
        coloraxis_colorbar=dict(
            title=color_label,
            thicknessmode="pixels", thickness=15,
            lenmode="pixels", len=200,
            yanchor="middle", y=0.5,
            xanchor="left", x=-0.1
        )
    )

    return fig

# Callback for Seller Analysis
@app.callback(
    [Output('seller-scatter-plot', 'figure'),
     Output('top-sellers-ranking', 'children')],
    [Input('seller-go-button', 'n_clicks')],
    [State('seller-metric-dropdown', 'value'),
     State('state-filter-dropdown', 'value'),
     State('category-filter-dropdown', 'value')]
)
def update_seller_analysis(n_clicks, selected_metric, selected_state, selected_category):
    filtered_data = merged_df

    if selected_state:
        filtered_data = filtered_data[filtered_data['customer_state_summary'] == selected_state]
    if selected_category:
        filtered_data = filtered_data[filtered_data['product_category_name_english_summary'] == selected_category]

    fig = px.scatter(
        filtered_data,
        x='delivery_time_summary',
        y='revenue_final',
        size='avg_rating',
        hover_name='seller_id',
        title='Seller Analysis: Delivery Time vs. Revenue with Rating as Size',
        labels={'delivery_time_summary': 'Avg Delivery Time', 'revenue_final': 'Revenue', 'avg_rating': 'Avg Rating'},
        size_max=60
    )
    
    if selected_metric:
        if selected_metric == 'revenue_final':
            fig.update_traces(marker=dict(size=filtered_data['avg_rating'], color=filtered_data['revenue_final'], colorscale='Viridis'))
            fig.update_layout(title='Seller Analysis: Revenue', coloraxis_colorbar=dict(title='Revenue'))
        elif selected_metric == 'delivery_time_summary':
            fig.update_traces(marker=dict(size=filtered_data['avg_rating'], color=filtered_data['delivery_time_summary'], colorscale='Cividis'))
            fig.update_layout(title='Seller Analysis: Average Delivery Time', coloraxis_colorbar=dict(title='Avg Delivery Time'))
        elif selected_metric == 'avg_rating':
            fig.update_traces(marker=dict(size=filtered_data['avg_rating'], color=filtered_data['avg_rating'], colorscale='Plasma'))
            fig.update_layout(title='Seller Analysis: Average Rating', coloraxis_colorbar=dict(title='Avg Rating'))
    else:
        fig.update_traces(marker=dict(color=filtered_data['avg_rating'], colorscale='Plasma'))
    
    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        height=800,
        width=1000
    )

    fig.add_annotation(
        xref="paper", yref="paper",
        x=1.05, y=1,
        showarrow=False,
        text="Dot size represents average rating",
        font=dict(
            size=12,
            color="black"
        ),
        align="left"
    )

    top_sellers_revenue = filtered_data.nlargest(5, 'revenue_final')[['seller_id', 'revenue_final']]
    top_sellers_delivery_time = filtered_data.nsmallest(5, 'delivery_time_summary')[['seller_id', 'delivery_time_summary']]
    top_sellers_rating = filtered_data.nlargest(5, 'avg_rating')[['seller_id', 'avg_rating']]
    
    filtered_data['overall_score'] = (
        (filtered_data['revenue_final'].rank(ascending=False) +
         filtered_data['delivery_time_summary'].rank(ascending=True) +
         filtered_data['avg_rating'].rank(ascending=False)) / 3
    )
    top_sellers_overall = filtered_data.nsmallest(5, 'overall_score')[['seller_id', 'overall_score']]

    top_sellers_content = html.Div([
        html.H3("Top 5 by Revenue"),
        dbc.Table.from_dataframe(top_sellers_revenue, striped=True, bordered=True, hover=True),
        
        html.H3("Top 5 by Delivery Time"),
        dbc.Table.from_dataframe(top_sellers_delivery_time, striped=True, bordered=True, hover=True),
        
        html.H3("Top 5 by Rating"),
        dbc.Table.from_dataframe(top_sellers_rating, striped=True, bordered=True, hover=True),
        
        html.H3("Top 5 Overall"),
        dbc.Table.from_dataframe(top_sellers_overall, striped=True, bordered=True, hover=True)
    ])

    return fig, top_sellers_content

if __name__ == "__main__":
    app.run_server(port=8056)                                                                               


[2024-06-19 17:52:02,775] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/Users/mariafernandezgonzalez/anaconda3/lib/python3.11/site-packages/flask/app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mariafernandezgonzalez/anaconda3/lib/python3.11/site-packages/flask/app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mariafernandezgonzalez/anaconda3/lib/python3.11/site-packages/flask/app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mariafernandezgonzalez/anaconda3/lib/python3.11/site-packages/flask/app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

In [14]:
import pandas as pd

# Convertir a datetime si no lo está
start_date = pd.to_datetime(start_date)
prepared_df['order_purchase_timestamp'] = pd.to_datetime(prepared_df['order_purchase_timestamp'])


NameError: name 'start_date' is not defined

[2024-06-19 17:28:27,190] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "/Users/mariafernandezgonzalez/anaconda3/lib/python3.11/site-packages/pandas/core/ops/array_ops.py", line 311, in na_logical_op
    result = op(x, y)
             ^^^^^^^^
TypeError: unsupported operand type(s) for &: 'numpy.ndarray' and 'Timestamp'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/mariafernandezgonzalez/anaconda3/lib/python3.11/site-packages/pandas/core/ops/array_ops.py", line 325, in na_logical_op
    result = libops.scalar_binop(x, y, op)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas/_libs/ops.pyx", line 180, in pandas._libs.ops.scalar_binop
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'bool'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/mariafernandezgonzalez/anaco

In [16]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import timedelta
from sklearn.metrics import mean_squared_error
from dash import dcc, html, Input, Output, State
import dash
import dash_bootstrap_components as dbc
import plotly.graph_objs as go
import plotly.express as px

# Load the merged dataset
merged_dataset_path = 'merged_final_dataset.csv'
merged_df = pd.read_csv(merged_dataset_path)

# Load the datasets for other features
file_path = 'final_dataset.csv'
df = pd.read_csv(file_path)

# Convert date columns to datetime
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'], errors='coerce')

# Calculate delivery time in days
df['delivery_time'] = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.days

# Filter out invalid data
df = df[df['delivery_time'].notna() & df['review_score'].notna()]

# Calculate average delivery time and rating by state
state_summary = df.groupby('customer_state').agg({'delivery_time': 'mean', 'review_score': 'mean'}).reset_index()

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Sidebar layout
sidebar = html.Div(
    [
        html.H2("Olist Consulting", className="display-6"),
        html.Hr(),
        dbc.Nav(
            [
                dbc.NavLink("Demand Forecast", href="/", active="exact"),
                dbc.NavLink("Rating and Delivery Time", href="/rating-delivery", active="exact"),
                dbc.NavLink("Seller Analysis", href="/seller-analysis", active="exact"),
            ],
            vertical=True,
            pills=True,
        ),
    ],
    style={
        "position": "fixed",
        "top": 0,
        "left": 0,
        "bottom": 0,
        "width": "16rem",
        "padding": "2rem 1rem",
        "background-color": "#e0f2f1",
    },
)

# Definir imagen y enlace
logo_url = "https://meta.com.br/wp-content/uploads/2022/05/Logo-Olist.png"
webpage_url = "https://www.ejemplo.com"  # Reemplaza con la URL de tu elección

top_bar = html.Div(
    [
        html.Div(
            [
                html.A(
                    html.Img(
                        src=logo_url,
                        style={"width": "8%", "height": "auto"},
                    ),
                    href=webpage_url,  # Esto redirige a la página web cuando se hace clic en la imagen
                    target="_blank"  # Abre el enlace en una nueva pestaña
                )
            ],
            style={
                "display": "flex",
                "justify-content": "flex-end",
                "align-items": "flex-start",
                "width": "98%",
                "padding-top": "1rem",
            }
        ),
    ],
    style={
        "height": "4rem",
        "background-color": "#e0f2f1",
        "margin-left": "0rem",
        "position": "fixed",
        "top": 0,
        "right": 0,
        "left": "16rem",
        "z-index": 100,
    }
)

# Demand Forecast Analysis Layout
state_dropdown = dcc.Dropdown(
    id='state-dropdown',
    options=[{'label': state, 'value': state} for state in df['customer_state'].unique()],
    placeholder="Select a customer state",
)

category_dropdown = dcc.Dropdown(
    id='category-dropdown',
    options=[{'label': category, 'value': category} for category in df['product_category_name_english'].unique()],
    placeholder="Select a product category",
)

forecast_option = dcc.RadioItems(
    id='forecast-option',
    options=[
        {'label': 'Only by Category', 'value': 'category'},
        {'label': 'Only by State', 'value': 'state'},
        {'label': 'By Both State and Category', 'value': 'both'}
    ],
    value='state'
)

go_button = html.Button('Go', id='go-button', n_clicks=0)

analysis_content = html.Div(id='analysis-content')

# Layout for Demand Forecast Analysis
analysis_content_layout = html.Div(
    [
        html.H1("Demand Forecast Analysis"),
        state_dropdown,
        category_dropdown,
        forecast_option,
        go_button,
        analysis_content
    ],
    id="analysis-content-layout",
    style={
        "margin-left": "8rem",
        "margin-top": "1rem",
        "padding": "1rem",
    },
)

# Rating and Delivery Time Analysis Layout
rating_delivery_layout = html.Div(
    [
        html.H1("Rating and Delivery Time Analysis"),
        dcc.Dropdown(
            id='metric-dropdown',
            options=[
                {'label': 'Delivery Time', 'value': 'delivery_time'},
                {'label': 'Rating', 'value': 'review_score'}
            ],
            value='delivery_time',
            clearable=False
        ),
        dcc.Graph(id='choropleth-map', style={'height': '800px', 'width': '100%'})
    ],
    id="rating-delivery-layout",
    style={
        "margin-left": "8rem",
        "margin-top": "1rem",
        "padding": "1rem",
    },
)

# Seller Analysis Layout
seller_analysis_layout = html.Div(
    [
        html.H1("Seller Analysis"),
        dcc.Dropdown(
            id='seller-metric-dropdown',
            options=[
                {'label': 'Revenue', 'value': 'revenue_final'},
                {'label': 'Delivery Time', 'value': 'delivery_time_final'},
                {'label': 'Rating', 'value': 'avg_rating'}
            ],
            placeholder="Select a metric to filter",
            clearable=True
        ),
        dcc.Dropdown(
            id='state-filter-dropdown',
            options=[{'label': state, 'value': state} for state in merged_df['customer_state_summary'].unique()],
            placeholder="Select a customer state",
            clearable=True
        ),
        dcc.Dropdown(
            id='category-filter-dropdown',
            options=[{'label': category, 'value': category} for category in merged_df['product_category_name_english_summary'].unique()],
            placeholder="Select a product category",
            clearable=True
        ),
        html.Button('Go', id='seller-go-button', n_clicks=0),
        dcc.Graph(id='seller-scatter-plot', style={'height': '800px', 'width': '100%'}),
        html.H2("Top 5 Sellers Ranking"),
        html.Div(id='top-sellers-ranking')
    ],
    id="seller-analysis-layout",
    style={
        "margin-left": "8rem",
        "margin-top": "1rem",
        "padding": "1rem",
    },
)

# App layout
app.layout = html.Div(
    [
        dcc.Location(id="url"),
        sidebar,
        html.Div(id="page-content", style={"margin-left": "8rem", "padding": "1rem"})
    ]
)

# Callbacks for URL Routing and Scatter Plot
@app.callback(
    Output("page-content", "children"),
    [Input("url", "pathname")]
)
def display_page(pathname):
    if pathname == "/rating-delivery":
        return rating_delivery_layout
    elif pathname == "/seller-analysis":
        return seller_analysis_layout
    return analysis_content_layout

# Callback for Demand Forecast Analysis
def prepare_data(data, selection_type, customer_state=None, product_category=None):
    if selection_type == 'state':
        df = data[data['customer_state'] == customer_state].copy()
    elif selection_type == 'category':
        df = data[data['product_category_name_english'] == product_category].copy()
    elif selection_type == 'both':
        df = data[(data['customer_state'] == customer_state) & (data['product_category_name_english'] == product_category)].copy()
    else:
        raise ValueError("Invalid selection_type. Choose from 'state', 'category', or 'both'.")
    
    df = df.set_index('order_purchase_timestamp').resample('D').size().reset_index(name='demand')
    return df

def analyze_orders(selection_type, state=None, category=None):
    df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
    cutoff_date = pd.to_datetime('2018-07-31')
    df_filtered = df[df['order_purchase_timestamp'] <= cutoff_date]

    prepared_df = prepare_data(df_filtered, selection_type, state, category)
    prepared_df = prepared_df.sort_values('order_purchase_timestamp')

    train = prepared_df.iloc[:-21].copy()
    test = prepared_df.iloc[-21:].copy()

    def create_features(df):
        df = df.copy()
        df['day_of_week'] = df['order_purchase_timestamp'].dt.dayofweek
        df['day_of_month'] = df['order_purchase_timestamp'].dt.day
        df['week_of_year'] = df['order_purchase_timestamp'].dt.isocalendar().week
        df['month'] = df['order_purchase_timestamp'].dt.month
        return df

    train = create_features(train)
    test = create_features(test)

    X_train = train.drop(['order_purchase_timestamp', 'demand'], axis=1)
    y_train = train['demand']
    X_test = test.drop(['order_purchase_timestamp', 'demand'], axis=1)
    y_test = test['demand']

    model = xgb.XGBRegressor(objective='reg:squarederror')
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    start_date = test['order_purchase_timestamp'].min() - timedelta(days=30)
    plot_data = prepared_df[(prepared_df['order_purchase_timestamp'] >= start_date) & (prepared_df['order_purchase_timestamp'] <= test['order_purchase_timestamp'].max())]

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=plot_data['order_purchase_timestamp'], y=plot_data['demand'], mode='lines', name='Historical'))
    fig.add_trace(go.Scatter(x=test['order_purchase_timestamp'], y=y_test, mode='lines', name='Test'))
    fig.add_trace(go.Scatter(x=test['order_purchase_timestamp'], y=preds, mode='lines', name='Forecast'))

    fig.update_layout(
        title='Demand Forecast',
        xaxis_title='Date',
        yaxis_title='Demand',
        showlegend=True,
        hovermode='x'
    )

    return fig, rmse, results_filtered

@app.callback(
    Output('analysis-content', 'children'),
    [Input('go-button', 'n_clicks')],
    [State('state-dropdown', 'value'), State('category-dropdown', 'value'), State('forecast-option', 'value')]
)
def update_analysis(n_clicks, state, category, forecast_option):
    if n_clicks > 0:
        plot_figure, rmse, forecast_comparison = analyze_orders(forecast_option, state, category)

        plot_figure.update_layout(height=600)

        plot_div = dcc.Graph(figure=plot_figure)

        rmse_text = html.P(f"Root Mean Square Error (RMSE): {rmse}")
        forecast_table = dbc.Table.from_dataframe(forecast_comparison, striped=True, bordered=True, hover=True)

        return [plot_div, rmse_text, forecast_table]
    return html.P("Select options and click 'Go' to run the analysis.")

# Callback for Rating and Delivery Time Analysis
@app.callback(
    Output('choropleth-map', 'figure'),
    [Input('metric-dropdown', 'value')]
)
def update_choropleth(selected_metric):
    if selected_metric == 'delivery_time':
        color_scale = 'Reds'
        color_label = 'Avg Delivery Time (days)'
    else:
        color_scale = 'Blues'
        color_label = 'Avg Rating'
    
    fig = px.choropleth(
        state_summary,
        geojson="https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/brazil-states.geojson",
        locations='customer_state',
        featureidkey="properties.sigla",
        hover_name='customer_state',
        color=selected_metric,
        color_continuous_scale=color_scale,
        labels={selected_metric: color_label},
        hover_data={
            'delivery_time': True,
            'review_score': True,
            'customer_state': False
        },
        title=f'Average {color_label} by State'
    )
    
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        clickmode='event+select',
        autosize=True,
        width=1000,
        height=600,
        coloraxis_colorbar=dict(
            title=color_label,
            thicknessmode="pixels", thickness=15,
            lenmode="pixels", len=200,
            yanchor="middle", y=0.5,
            xanchor="left", x=-0.1
        )
    )

    return fig

# Callback for Seller Analysis
@app.callback(
    [Output('seller-scatter-plot', 'figure'),
     Output('top-sellers-ranking', 'children')],
    [Input('seller-go-button', 'n_clicks')],
    [State('seller-metric-dropdown', 'value'),
     State('state-filter-dropdown', 'value'),
     State('category-filter-dropdown', 'value')]
)
def update_seller_analysis(n_clicks, selected_metric, selected_state, selected_category):
    filtered_data = merged_df

    if selected_state:
        filtered_data = filtered_data[filtered_data['customer_state_summary'] == selected_state]
    if selected_category:
        filtered_data = filtered_data[filtered_data['product_category_name_english_summary'] == selected_category]

    fig = px.scatter(
        filtered_data,
        x='delivery_time_summary',
        y='revenue_final',
        size='avg_rating',
        hover_name='seller_id',
        title='Seller Analysis: Delivery Time vs. Revenue with Rating as Size',
        labels={'delivery_time_summary': 'Avg Delivery Time', 'revenue_final': 'Revenue', 'avg_rating': 'Avg Rating'},
        size_max=60
    )
    
    if selected_metric:
        if selected_metric == 'revenue_final':
            fig.update_traces(marker=dict(size=filtered_data['avg_rating'], color=filtered_data['revenue_final'], colorscale='Viridis'))
            fig.update_layout(title='Seller Analysis: Revenue', coloraxis_colorbar=dict(title='Revenue'))
        elif selected_metric == 'delivery_time_summary':
            fig.update_traces(marker=dict(size=filtered_data['avg_rating'], color=filtered_data['delivery_time_summary'], colorscale='Cividis'))
            fig.update_layout(title='Seller Analysis: Average Delivery Time', coloraxis_colorbar=dict(title='Avg Delivery Time'))
        elif selected_metric == 'avg_rating':
            fig.update_traces(marker=dict(size=filtered_data['avg_rating'], color=filtered_data['avg_rating'], colorscale='Plasma'))
            fig.update_layout(title='Seller Analysis: Average Rating', coloraxis_colorbar=dict(title='Avg Rating'))
    else:
        fig.update_traces(marker=dict(color=filtered_data['avg_rating'], colorscale='Plasma'))
    
    fig.update_layout(
        margin={"r":0,"t":50,"l":0,"b":0},
        height=800,
        width=1000
    )

    fig.add_annotation(
        xref="paper", yref="paper",
        x=1.05, y=1,
        showarrow=False,
        text="Dot size represents average rating",
        font=dict(
            size=12,
            color="black"
        ),
        align="left"
    )

    top_sellers_revenue = filtered_data.nlargest(5, 'revenue_final')[['seller_id', 'revenue_final']]
    top_sellers_delivery_time = filtered_data.nsmallest(5, 'delivery_time_summary')[['seller_id', 'delivery_time_summary']]
    top_sellers_rating = filtered_data.nlargest(5, 'avg_rating')[['seller_id', 'avg_rating']]
    
    filtered_data['overall_score'] = (
        (filtered_data['revenue_final'].rank(ascending=False) +
         filtered_data['delivery_time_summary'].rank(ascending=True) +
         filtered_data['avg_rating'].rank(ascending=False)) / 3
    )
    top_sellers_overall = filtered_data.nsmallest(5, 'overall_score')[['seller_id', 'overall_score']]

    top_sellers_content = html.Div([
        html.H3("Top 5 by Revenue"),
        dbc.Table.from_dataframe(top_sellers_revenue, striped=True, bordered=True, hover=True),
        
        html.H3("Top 5 by Delivery Time"),
        dbc.Table.from_dataframe(top_sellers_delivery_time, striped=True, bordered=True, hover=True),
        
        html.H3("Top 5 by Rating"),
        dbc.Table.from_dataframe(top_sellers_rating, striped=True, bordered=True, hover=True),
        
        html.H3("Top 5 Overall"),
        dbc.Table.from_dataframe(top_sellers_overall, striped=True, bordered=True, hover=True)
    ])

    return fig, top_sellers_content

if __name__ == "__main__":
    app.run_server(port=8065)
