# Introduction

This notebook splits the training data by date into a training set and a validation set for model training.

# Setup

Navigate to the project's root folder:

In [1]:
# NOTE: a relative `cd` is not idempotent - re-running this cell without
# restarting the kernel moves up one more directory.
%cd ..

/opt/rossmann-tf


# Libraries

In [2]:
import pandas as pd

# Data

In [3]:
# List the contents of the `data` directory (shell command; path is relative
# to the current working directory).
!ls -la data

total 48628
drwxrwxrwx 1 1000 staff     4096 May 21 05:48 .
drwxrwxrwx 1 1000 staff     4096 May 20 16:10 ..
-rwxrwxrwx 1 1000 staff  1318531 May 20 07:38 test_clean.parquet
-rwxrwxrwx 1 1000 staff   616017 May 20 17:40 test_model_features.parquet
-rwxrwxrwx 1 1000 staff 26111959 May 20 07:37 train_clean.parquet
-rwxrwxrwx 1 1000 staff 10853274 May 20 17:40 train_model_features.parquet
-rwxrwxrwx 1 1000 staff 10259901 May 21 05:48 train.parquet
-rwxrwxrwx 1 1000 staff   620519 May 21 05:48 valid.parquet


In [4]:
# Read the train and test feature tables from their parquet files.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/train_model_features.parquet',
        './data/test_model_features.parquet',
    )
)

# Split

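The validation split is taken from the most recent dates and is approximately the same size as the test set. The cut-off falls on a whole-day boundary, so the validation set may contain slightly more rows than the test set: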

In [5]:
# Number of rows in the test set. `len` is used instead of unpacking `.shape`
# into `_`, because IPython uses `_` for the previous cell's output.
n_test = len(test_df)
n_test

41088

In [6]:
# The validation period starts at the earliest date found among the last
# `n_test` rows once all rows are ordered by date. `.iloc` makes the slice
# explicitly positional, independent of the index labels left by the sort.
valid_start_date = train_df['Date'].sort_values().iloc[-n_test:].min()

# Split on the date itself so that no day is shared between the two sets;
# every row dated on or after the cut-off goes to validation, so the
# validation set can hold slightly more than `n_test` rows.
train_set = train_df.query('Date < @valid_start_date')
valid_set = train_df.query('Date >= @valid_start_date')

Some sanity checks:

In [7]:
# First and last date covered by the training split.
train_dates = train_set['Date']
(train_dates.min(), train_dates.max())

(Timestamp('2013-01-01 00:00:00'), Timestamp('2015-06-18 00:00:00'))

In [8]:
# First and last date covered by the validation split.
valid_dates = valid_set['Date']
(valid_dates.min(), valid_dates.max())

(Timestamp('2015-06-19 00:00:00'), Timestamp('2015-07-31 00:00:00'))

In [9]:
# Row counts of the full table and of each split. `len` is used instead of
# unpacking `.shape` into `_`, because IPython uses `_` for the previous
# cell's output.
n_total = len(train_df)
n_train = len(train_set)
n_valid = len(valid_set)

# Every row must land in exactly one split; a mismatch means some rows were
# matched by neither date filter (e.g. rows with a missing date).
assert n_train + n_valid == n_total, (
    f'split sizes do not add up: {n_train} + {n_valid} != {n_total}'
)

In [10]:
# Display the number of rows in the training and validation splits.
n_train, n_valid

(802942, 41396)

Save the results:

In [11]:
# Write each split to its own parquet file, without the DataFrame index.
for split_df, parquet_path in (
    (train_set, './data/train.parquet'),
    (valid_set, './data/valid.parquet'),
):
    split_df.to_parquet(parquet_path, index=False)