# 01 - Exploratory Data Analysis - Initial EDA & Data Prep

(Python + tsforge)

**Goal:** Get a fast, working understanding of our dataset *as a forecasting panel*, prove it’s modeling-ready, and produce a simple baseline forecast. We’ll keep it practical and direct:
- Load the prepared train/test splits (full + subset) from the setup.
- Run focused EDA for time series (frequency, completeness, zeros, outliers).
- Summarize/Pad/Visualize with `tsforge` wrappers (over Nixtla + pytimetk).


> We’re not polishing models here. We’re checking **data completeness**, **readiness**, and **signal health**.



In [1]:
from tsfeatures import tsfeatures

In [2]:
import numpy as np
import pandas as pd

# Build a small synthetic weekly panel: three stores in two departments, each
# with Poisson-distributed counts and randomly placed runs of zeros.
rng = np.random.default_rng(42)  # fixed seed so every run yields the same panel

ids = ["store_a", "store_b", "store_c"]
dept_map = {"store_a": "dept_1", "store_b": "dept_1", "store_c": "dept_2"}
base_rates = {"store_a": 3.0, "store_b": 1.5, "store_c": 0.8}  # Poisson mean per store

# 52 * 4 = 208 weekly timestamps beginning at the first week-end on/after the start date.
dates = pd.date_range(start="2020-01-01", periods=52 * 4, freq="W")
n_periods = len(dates)


def _zero_out_runs(values, keep_flags, run_lengths):
    """Overwrite stretches of ``values`` with zeros, in place.

    The scan moves left to right. Wherever ``keep_flags`` is 0, the next
    ``run_lengths[pos]`` entries of ``values`` are set to 0 and the scan
    resumes just past that stretch; otherwise it advances by one position.
    A stretch that would run past the end is simply clipped by slicing.

    Returns the same (mutated) ``values`` object.
    """
    pos = 0
    while pos < len(values):
        if keep_flags[pos] != 0:
            pos += 1
            continue
        run = run_lengths[pos]
        values[pos : pos + run] = 0
        pos += run
    return values


records = []
for uid in ids:
    # The draw order (poisson -> choice -> integers) fixes the random stream,
    # so it must not be rearranged.
    counts = rng.poisson(lam=base_rates[uid], size=n_periods)
    gap_mask = rng.choice([0, 1], size=n_periods, p=[0.2, 0.8])  # 0 marks a gap start
    gap_lengths = rng.integers(1, 4, size=n_periods)  # candidate run length per position

    # Introduce intermittent zero runs.
    _zero_out_runs(counts, gap_mask, gap_lengths)

    series_frame = pd.DataFrame(
        {
            "unique_id": uid,
            "department": dept_map[uid],
            "ds": dates,
            "y": counts,
        }
    )
    records.append(series_frame)

# Long-format panel: one row per (series, week).
id_df = pd.concat(records, ignore_index=True)


In [3]:
import pandas as pd 
import numpy as np
from tsforge.eda import hierarchical_tsfeatures,datetime_diagnostics,TSFORGE_FEATURES


# Per-series date diagnostics for the long-format panel, keyed by series id.
date_table = datetime_diagnostics(
    df=id_df,
    id_col="unique_id",
    date_col="ds",
    target_col="y",  # supplying the target is what lets peak seasons be returned
)



In [4]:
# Compute the TSFORGE_FEATURES set at each level named in `hierarchy`
# (individual series first, then departments).
# NOTE(review): the rendered result shows repeated columns
# (permutation_entropy.1, MI_top_k_lags_indices.1) and NaN trend_strength in
# the rows displayed — confirm the feature list contains no duplicates.
dummy_df = hierarchical_tsfeatures(
    df=id_df,
    id_col="unique_id",
    date_col="ds",
    target_col="y",
    hierarchy=["unique_id", "department"],
    features=TSFORGE_FEATURES,
    freq=52,  # presumably the seasonal period for weekly data — TODO confirm
)

<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


In [5]:
# Preview the first rows of the feature table (bare last expression -> rich display).
dummy_df.head()

Unnamed: 0,hier_id,lumpiness,permutation_entropy,MI_top_k_lags,MI_top_k_lags_indices,trend_strength,seasonal_strength,adi,permutation_entropy.1,MI_top_k_lags_indices.1,...,entropy,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
0,store_a,0.110754,0.924562,0.558724,"[25, 28, 29, 30, 31]",,0.52026,1.540741,0.924562,"[25, 28, 29, 30, 31]",...,0.892627,84,0.042019,0.224588,0.127668,-0.338549,0.197491,-0.569496,0.370376,-0.024868
1,store_b,0.026325,0.905015,0.41065,"[25, 28, 29, 30, 31]",,0.515114,1.855856,0.905015,"[25, 28, 29, 30, 31]",...,0.92483,89,0.057619,0.028881,0.017459,-0.504763,0.275144,-0.681607,0.526486,-0.001386
2,store_c,0.010871,0.746518,0.416338,"[51, 29, 30, 31, 32]",,0.480588,2.915493,0.746518,"[51, 29, 30, 31, 32]",...,0.883664,88,0.024822,0.063542,0.038755,-0.485915,0.261282,-0.661113,0.504433,-0.063544
0,dept_1,0.833311,0.985704,0.481154,"[51, 27, 28, 29, 30]",,0.536548,1.216374,0.985704,"[51, 27, 28, 29, 30]",...,0.903325,92,0.029309,0.099902,0.072627,-0.433208,0.236663,-0.636463,0.46834,0.039849
1,dept_2,0.010871,0.746518,0.416338,"[51, 29, 30, 31, 32]",,0.480588,2.915493,0.746518,"[51, 29, 30, 31, 32]",...,0.883664,88,0.024822,0.063542,0.038755,-0.485915,0.261282,-0.661113,0.504433,-0.063544


In [6]:
# Preview the first rows of the per-series date diagnostics table.
date_table.head()

Unnamed: 0_level_0,start_date,end_date,n_obs,span_days,inferred_freq,obs_per_year,n_gaps,pct_missing,has_duplicates,peak_month,peak_quarter
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
store_a,2020-01-05,2023-12-24,208,1449.0,W-SUN,52.430642,0,0.0,False,3,1
store_b,2020-01-05,2023-12-24,208,1449.0,W-SUN,52.430642,0,0.0,False,2,1
store_c,2020-01-05,2023-12-24,208,1449.0,W-SUN,52.430642,0,0.0,False,8,3
