In [3]:
# Load (or reload) IPython's autoreload extension so that edits to imported
# project modules are picked up without restarting the kernel.
%reload_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2

In [None]:
from valuation.config.filepaths import DATASET_PROFILE_FILEPATH
from valuation.utils.io import IOService
from valuation.utils.data import DataFrameSplitter
from valuation.utils.print import Printer

# Dominick's Finer Foods Sales Dataset

The Dominick's Finer Foods (DFF) dataset, provided by the James M. Kilts Center at the University of Chicago Booth School of Business, comprises sales transactions across 28 distinct product categories from the 100-store retail chain. Spanning nearly eight years from September 1989 to May 1997, these data constitute transactions at the Uniform Product Code (UPC) level.

***

## Key Data Fields

* **STORE**: A unique numeric identifier for each retail location.
* **UPC**: The Uniform Product Code, a unique identifier for each distinct product.
* **WEEK**: A numeric value representing the week of the transaction.
* **MOVE**: The total number of individual units sold for a given UPC in a specific store and week.
* **QTY**: The number of items in a promotional bundle (e.g., `3` for a 3-pack). This value is `1` for individually sold items.
* **PRICE**: The shelf price for the bundle or individual item. Revenue is calculated as **`(PRICE * MOVE) / QTY`** to account for bundled sales.
* **PROFIT**: The **gross margin percentage** for the product. A value of `25.3` corresponds to a 25.3% margin.
* **SALE**: A flag indicating a promotion ('B', 'C', 'S'). This flag is noted by the data providers as being inconsistently applied.
* **OK**: A data quality flag where a value of `1` indicates the record is considered valid for analysis.

## Dataset Overview
The following profile characterizes each category file across key structural dimensions: participating store counts, temporal span, data quality indicators, and storage requirements.

In [9]:
# Read the per-category dataset profile from its configured location.
df = IOService.read(DATASET_PROFILE_FILEPATH)

# Percentage and megabyte columns are all rendered with two decimal places;
# every listed column shares the same format string.
column_formatting = dict.fromkeys(
    [
        "missing_values_%",
        "memory_usage_mb",
        "invalid_records_%",
        "file_size_mb",
    ],
    '{:.2f}',
)

# Last expression of the cell: display the styled table, using a comma as the
# thousands separator.
df.style.format(column_formatting, thousands=",")

Unnamed: 0,filename,category,stores,weeks,num_records,num_columns,missing_values,missing_values_%,invalid_records,invalid_records_%,memory_usage_mb,file_size_mb
0,wana.zip,Analgesics,93,393,7339217,11,0,0.0,97562,1.33,1586.37,30.23
1,wbat.zip,Bath Soap,93,266,1644557,11,0,0.0,4031,0.25,355.25,5.82
2,wber.zip,Beer,89,303,3990672,11,0,0.0,22952,0.58,869.53,15.85
3,wbjc.zip,Bottled Juices,93,393,6222806,11,0,0.0,98797,1.59,1355.74,38.62
4,wcer.zip,Cereals,93,367,6602582,11,0,0.0,141285,2.14,1429.24,40.44
5,wche.zip,Cheeses-,93,393,9427395,11,0,0.0,176529,1.87,2055.23,60.8
6,wcig.zip,Cigarettes,93,399,5398197,11,0,0.0,66828,1.24,1163.48,20.07
7,wcoo.zip,Cookies,93,389,13447807,11,5418,0.04,267711,1.99,2920.87,59.25
8,wcra.zip,Crackers,93,381,3624688,11,1400,0.04,68781,1.9,790.0,17.15
9,wcso.zip,Canned Soup,93,379,7011243,11,0,0.0,148069,2.11,1523.27,41.14


## Data Preprocessing
The data preprocessing stage transforms the raw, transaction-level records into a series of analysis-ready datasets. These files serve distinct purposes, from model training to strategic performance analysis.

***

### Sales Dataset
* **`sales_data.csv`**: Contains cleaned sales data aggregated weekly by store and category. Revenue and gross profit are calculated for each record and the week start and end dates are added.

### Modeling Datasets
The primary dataset is partitioned chronologically for model development.

* **`train.csv`**: The **Training Set** contains the first 280 weeks (~70%) of the data and is used to train forecasting models. All performance analysis is derived from this subset.
* **`validation.csv`**: The **Validation Set** comprises the next 60 weeks of data (~15%) and is used for hyperparameter tuning.
* **`test.csv`**: The **Test Set** is the hold-out set containing the final 60 weeks of data. This dataset is set aside for unbiased model evaluation.

### Performance Analysis Datasets
Summary datasets are derived from the training set to support strategic analysis.

* **`same_store_sales_growth.csv`**: Contains the aggregated year-over-year Same-Store Sales (SSS) growth for the company.
* **`store_performance.csv`**: Details store-level metrics from the final year of the training data, including year-over-year sales growth, total gross profit, and gross margin percentage.
* **`category_performance.csv`**: Details category-level metrics, structured identically to the store performance file.

## Sales Dataset
The 28 category-level files are now processed and concatenated into a single, aggregated dataset.

In [None]:
# Obtain categories and filenames from config
#
# Purpose: read the category filenames from the project config and run the
# sales data preparation pipeline over them.
#
# NOTE(review): `ConfigReader` and `CONFIG_CATEGORY_FILEPATH` are used below
# but are not imported in any visible cell. On a fresh kernel
# (Restart Kernel -> Run All) this cell raises NameError. Import both names,
# preferably in the notebook's top import cell, before relying on this cell.

from valuation.dataset import SalesDataPrep

# Instantiate the config reader and read the category filenames
config_reader = ConfigReader()
category_filenames = config_reader.read(CONFIG_CATEGORY_FILEPATH)

# Instantiate the sales data processor
processor = SalesDataPrep()

# Run the processor pipeline
# NOTE(review): `force=True` presumably re-runs the pipeline even when a
# previously prepared output exists -- TODO confirm against SalesDataPrep.prepare.
processor.prepare(category_filenames=category_filenames, force=True)

## Modeling Datasets
For model development, the preprocessed dataset is partitioned chronologically into training, validation, and test subsets. The split is based on a 70/15/15 division of the total weeks in the dataset, which simulates a real-world forecasting scenario by training on past data to predict future outcomes.

In [None]:
# Partition the prepared sales data into train / validation / test subsets by
# week, persist each subset, and print the split metadata.
#
# All filepath constants used by this cell are imported here so the cell runs
# on a fresh kernel. Previously the three output filepaths were referenced
# without being imported, raising NameError only after the data had already
# been loaded and split.
# NOTE(review): the three output constants are assumed to live in
# `valuation.config.filepaths` alongside SALES_DATA_FILEPATH -- confirm.
from valuation.config.filepaths import (
    SALES_DATA_FILEPATH,
    TEST_DATA_FILEPATH,
    TRAIN_DATA_FILEPATH,
    VALIDATION_DATA_FILEPATH,
)

# Instantiate Services
partitioner = DataFrameSplitter()
printer = Printer()

# Load Data
sales_data = IOService.read(SALES_DATA_FILEPATH)

# Split Data: 70% of the distinct `week` values go to train and 15% to
# validation; the remainder is presumably assigned to test (the notebook text
# describes a 70/15/15 split).
splits = partitioner.split_by_proportion_of_values(df=sales_data, val_col='week', train_size=0.7, val_size=0.15)

# Save Split Data
# `splits['data']` is read as a mapping keyed by split name; `.get` yields None
# for a missing key, so a missing split is passed to the writer as None.
IOService.write(data=splits['data'].get("train",None), filepath=TRAIN_DATA_FILEPATH)
IOService.write(data=splits['data'].get("validation",None), filepath=VALIDATION_DATA_FILEPATH)
IOService.write(data=splits['data'].get("test",None), filepath=TEST_DATA_FILEPATH)

# Print Split Metadata
printer.print_dict(data=splits['meta'], title="Dataset Split Metadata")