In [1]:
import pandas as pd

from valuation.asset.identity.dataset import DatasetID
from valuation.core.stage import DatasetStage
from valuation.infra.store.dataset import DatasetStore


## Get Training Data

In [2]:
store = DatasetStore()
dataset_id = DatasetID(name="train_val", stage=DatasetStage.MODEL)
passport = store.get_passport(dataset_id=dataset_id)
ds = store.get(passport=passport)
train_df = ds.data

[32m2025-10-23 22:53:34.158[0m | [34m[1mDEBUG   [0m | [36mvaluation.asset.dataset.base[0m:[36mload[0m:[36m338[0m - [34m[1mDataset Dataset train_val of the model stage created on 2025-10-23 at 21:11 loaded.[0m


## Check Date Differences and Number of Periods

In [3]:
# Check frequency of the 'ds' column in your train_df
# Calculate the difference between consecutive dates for a sample series
sample_id = train_df['unique_id'].iloc[0]
sample_series_dates = train_df[train_df['unique_id'] == sample_id]['ds'].sort_values()
date_diffs = sample_series_dates.diff().dropna()

print("Frequency check for a sample series:")
print(date_diffs.value_counts())

# Check for missing periods per series
completeness_check = train_df.groupby('unique_id')['ds'].agg(['min', 'max', 'count'])
expected_weeks_train = (train_df['ds'].max() - train_df['ds'].min()).days // 7 + 1 # Approximate for 6 years

print(f"\nExpected number of weeks in train_df (approx): {expected_weeks_train}")
print("\nChecking completeness per series:")
print(completeness_check.head())

# Find series that don't have the expected number of weeks
incomplete_series = completeness_check[completeness_check['count'] != expected_weeks_train]
if not incomplete_series.empty:
    print(f"\nFound {len(incomplete_series)} series with potentially missing periods:")
    print(incomplete_series.head())
else:
    print("\nAll series appear to have the expected number of periods.")

Frequency check for a sample series:
ds
7 days     304
35 days      1
21 days      1
Name: count, dtype: int64

Expected number of weeks in train_df (approx): 313

Checking completeness per series:
                            min        max  count
unique_id                                        
100_analgesics       1990-01-03 1995-12-27    307
100_bath soap        1992-02-26 1995-12-27    195
100_bathroom tissues 1990-01-03 1995-12-27    303
100_beer             1991-06-12 1995-12-27    232
100_bottled juices   1990-01-03 1995-12-27    307

Found 1924 series with potentially missing periods:
                            min        max  count
unique_id                                        
100_analgesics       1990-01-03 1995-12-27    307
100_bath soap        1992-02-26 1995-12-27    195
100_bathroom tissues 1990-01-03 1995-12-27    303
100_beer             1991-06-12 1995-12-27    232
100_bottled juices   1990-01-03 1995-12-27    307


## Series Length Statistics

In [4]:
# How much history do you have per series?
series_length = train_df.groupby('unique_id')['ds'].agg(['min', 'max', 'count'])
series_length['weeks'] = (series_length['max'] - series_length['min']).dt.days / 7

print("Series length distribution:")
print(series_length['weeks'].describe())
print(f"\nSeries with < 260 weeks: {(series_length['weeks'] < 260).sum():,}")
print(f"Series with >= 260 weeks: {(series_length['weeks'] >= 260).sum():,}")

Series length distribution:
count    2403.000000
mean      280.491469
std        52.452566
min        30.000000
25%       237.000000
50%       312.000000
75%       312.000000
max       312.000000
Name: weeks, dtype: float64

Series with < 260 weeks: 664
Series with >= 260 weeks: 1,739


## Diagnosing Time Series Horizon Issues

In [5]:
# Check 1: Verify your data's actual frequency
print("Checking data frequency...")
for uid in train_df['unique_id'].unique()[:5]:  # Check first 5 series
    ts = train_df[train_df['unique_id'] == uid].sort_values('ds')
    date_diffs = ts['ds'].diff().dt.days.value_counts()
    print(f"\n{uid}:")
    print(date_diffs.head())

# Check 2: What day of week are your dates?
print("\nDay of week distribution:")
print(train_df['ds'].dt.day_name().value_counts())

# Check 3: Are there gaps?
print("\nChecking for gaps...")
for uid in train_df['unique_id'].unique()[:10]:
    ts = train_df[train_df['unique_id'] == uid].sort_values('ds')
    expected_periods = (ts['ds'].max() - ts['ds'].min()).days // 7 + 1
    actual_periods = len(ts)
    if expected_periods != actual_periods:
        print(f"{uid}: expected {expected_periods}, got {actual_periods}")

Checking data frequency...

100_analgesics:
ds
7.0     304
35.0      1
21.0      1
Name: count, dtype: int64

100_bath soap:
ds
7.0     192
35.0      1
21.0      1
Name: count, dtype: int64

100_bathroom tissues:
ds
7.0     299
35.0      2
21.0      1
Name: count, dtype: int64

100_beer:
ds
7.0     229
35.0      1
21.0      1
Name: count, dtype: int64

100_bottled juices:
ds
7.0     304
35.0      1
21.0      1
Name: count, dtype: int64

Day of week distribution:
ds
Wednesday    662767
Name: count, dtype: int64

Checking for gaps...
100_analgesics: expected 313, got 307
100_bath soap: expected 201, got 195
100_bathroom tissues: expected 313, got 303
100_beer: expected 238, got 232
100_bottled juices: expected 313, got 307
100_canned soup: expected 313, got 301
100_canned tuna: expected 313, got 301
100_cereals: expected 313, got 283
100_cheeses-: expected 313, got 307


## Model Performance Diagnosis