In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from statsmodels.tsa.stattools import acf, pacf

In [2]:
# Location of the prepared Rossmann parquet file. The default is the original
# author's local checkout; set the ROSSMANN_PARQUET environment variable to
# point the notebook at the file on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "ROSSMANN_PARQUET", r"D:\Git\darts-pipeline\data\dataset\rossmann.parquet"
)
df = pd.read_parquet(DATA_PATH)

In [3]:
# Structural overview of the loaded frame: row count, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8307 entries, 19 to 8477
Data columns (total 99 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       8307 non-null   datetime64[ns]
 1   Store                      8307 non-null   int64         
 2   DayOfWeek                  8307 non-null   int64         
 3   Sales                      8307 non-null   int64         
 4   Customers                  8307 non-null   int64         
 5   Open                       8307 non-null   int64         
 6   Promo                      8307 non-null   int64         
 7   SchoolHoliday              8307 non-null   int64         
 8   month                      8307 non-null   int64         
 9   week                       8307 non-null   int64         
 10  day_of_week                8307 non-null   int64         
 11  day_of_month               8307 non-null   int64         
 12  day_o

In [4]:
# Index the frame by a datetime "Date" so the time-based resample() calls in the
# following cells can operate on it.
# Guarded on the index name so the cell can be re-run safely: once "Date" has
# been moved into the index the column no longer exists, and repeating the
# conversion unguarded would raise KeyError. If "Date" is missing altogether the
# KeyError still surfaces. The result is rebound to `df` instead of mutating
# the frame in place.
if df.index.name != "Date":
    df = df.assign(Date=pd.to_datetime(df["Date"])).set_index("Date")

In [5]:
# Weekly sales per store: within each Store group, sum Sales over weekly ("W")
# bins of the Date index, then flatten the (Store, Date) index back into columns
# so plotly can use them.
# groupby/resample only read from df, so no defensive copy of the frame is made.
resampled = df.groupby("Store")["Sales"].resample("W").sum().reset_index()

# One line per store.
fig = px.line(
    resampled, x="Date", y="Sales", color="Store", title="Weekly Sales by Store"
)

fig.show()

In [6]:
# Weekly customer counts per store: within each Store group, sum Customers over
# weekly ("W") bins of the Date index, then flatten the (Store, Date) index back
# into columns so plotly can use them.
# groupby/resample only read from df, so no defensive copy of the frame is made.
resampled = df.groupby("Store")["Customers"].resample("W").sum().reset_index()

# One line per store.
fig = px.line(
    resampled, x="Date", y="Customers", color="Store", title="Weekly Customers by Store"
)

fig.show()

In [7]:
# Monthly sales per store: within each Store group, sum Sales over monthly ("M")
# bins of the Date index, then flatten the (Store, Date) index back into columns
# so plotly can use them.
# groupby/resample only read from df, so no defensive copy of the frame is made.
resampled = df.groupby("Store")["Sales"].resample("M").sum().reset_index()

# One line per store.
fig = px.line(
    resampled, x="Date", y="Sales", color="Store", title="Monthly Sales by Store"
)

fig.show()

In [8]:
# Monthly customer counts per store: within each Store group, sum Customers over
# monthly ("M") bins of the Date index, then flatten the (Store, Date) index back
# into columns so plotly can use them.
# groupby/resample only read from df, so no defensive copy of the frame is made.
resampled = df.groupby("Store")["Customers"].resample("M").sum().reset_index()

# One line per store.
fig = px.line(
    resampled,
    x="Date",
    y="Customers",
    color="Store",
    title="Monthly Customers by Store",
)

fig.show()

In [9]:
# Correlation coefficient between the Sales and Customers columns
# (Series.corr with its default method), reported in the plot title.
correlation = df["Sales"].corr(df["Customers"])
scatter_title = f"Correlation between Sales and Customers: {correlation:.2f}"

# Scatter of every row: Sales on x, Customers on y.
fig = px.scatter(df, x="Sales", y="Customers", title=scatter_title)
fig.show()

In [10]:
# Correlation coefficient between the Sales and Promo columns
# (Series.corr with its default method), reported in the plot title.
correlation = df["Sales"].corr(df["Promo"])
scatter_title = f"Correlation between Sales and Promotions: {correlation:.2f}"

# Scatter of every row: Sales on x, Promo on y.
fig = px.scatter(df, x="Sales", y="Promo", title=scatter_title)
fig.show()

In [11]:
# Box plot of Sales split by the Promo column, with every observation drawn
# next to its box (points="all").
sales_axis_labels = {"Promo": "Promotion Status", "Sales": "Sales Amount"}
fig_sales = px.box(
    df,
    x="Promo",
    y="Sales",
    points="all",
    title="Distribution of Sales based on Promotion",
    labels=sales_axis_labels,
)

# Show readable tick labels in place of the raw 0/1 Promo values on the x axis.
fig_sales.update_layout(
    xaxis=dict(tickvals=[0, 1], ticktext=["No Promo", "Promo"])
)
fig_sales.show()

In [12]:
# Box plot of Customers split by the Promo column, with every observation drawn
# next to its box (points="all").
customers_axis_labels = {"Promo": "Promotion Status", "Customers": "Number of Customers"}
fig_customers = px.box(
    df,
    x="Promo",
    y="Customers",
    points="all",
    title="Distribution of Customers based on Promotion",
    labels=customers_axis_labels,
)

# Show readable tick labels in place of the raw 0/1 Promo values on the x axis.
fig_customers.update_layout(
    xaxis=dict(tickvals=[0, 1], ticktext=["No Promo", "Promo"])
)
fig_customers.show()

In [13]:
# Correlation matrix of Sales, Customers and every column whose name
# contains "lag".
lag_columns = [col for col in df.columns if "lag" in col]
corr = df[["Sales", "Customers"] + lag_columns].corr()

# Render the matrix as a heatmap; hovering a cell shows only its value.
heatmap = go.Heatmap(
    z=corr.values,
    x=corr.columns.tolist(),
    y=corr.index.tolist(),
    colorscale="Viridis",
    hoverongaps=False,
    hoverinfo="z",
)
fig = go.Figure(data=heatmap)

fig.show()

In [14]:
# Per-store groups of the frame; also read by the PACF cell that follows.
groups = df.groupby("Store")

# One autocorrelation curve per store, computed with acf()'s default settings.
# NOTE(review): acf() uses the rows in their stored order; this assumes each
# store's rows are already in chronological order - confirm.
acf_traces = []
for store_id, store_rows in groups:
    store_acf = acf(store_rows["Sales"])
    lags = list(range(len(store_acf)))  # x axis: lag number, starting at 0
    acf_traces.append(
        go.Scatter(x=lags, y=store_acf, mode="lines", name=f"Store {store_id} ACF")
    )

fig = go.Figure(data=acf_traces)
fig.show()

In [15]:
# One partial-autocorrelation curve per store, computed with pacf()'s default
# settings. `groups` is the per-store groupby built in the ACF cell above.
# NOTE(review): pacf() uses the rows in their stored order; this assumes each
# store's rows are already in chronological order - confirm.
pacf_traces = []
for store_id, store_rows in groups:
    store_pacf = pacf(store_rows["Sales"])
    lags = list(range(len(store_pacf)))  # x axis: lag number, starting at 0
    pacf_traces.append(
        go.Scatter(x=lags, y=store_pacf, mode="lines", name=f"Store {store_id} PACF")
    )

fig = go.Figure(data=pacf_traces)
fig.show()