In [13]:
######## Generate and save dataset
    # Generates the processed dataset used by all model v0 candidates
    # Generates train, val, and test splits
    # Saves generated train, val, and test datasets to disk for quick loading into memory

######## Preprocess final dataset if necessary
    # Scales the train, val, and test datasets
    # Saves the scaled datasets and their scalers to disk

Import & Instantiate All Required Modules

In [1]:
import sys

# Manually add the project root to sys.path so that the `modules` and
# `model_binaries` imports in this cell can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on another machine.
PROJECT_ROOT = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


# === STEP 0: Imports
from modules.dataset_generator.helpers.configuration_loader import ConfigurationLoader as DSConfigLoader
from modules.dataset_generator.factories.data_io_factory import DataIOFactory
from modules.dataset_generator.factories.feature_processor_factory import FeatureProcessorFactory
from modules.dataset_generator.factories.join_factory import JoinFactory
from modules.dataset_generator.factories.strategy_factory import StrategyFactory
from modules.dataset_generator.dataset_generator import DatasetGenerator
from modules.processor.helpers.configuration_loader import ConfigurationLoader as PConfigLoader
from modules.processor.factories.split_strategy_factory import SplitStrategyFactory
from modules.processor.processor import Processor
from model_binaries.utils.binary_utils import save_entity, load_entity, assess_dataset_balance, scale_features


In [2]:
# === STEP 1: Dependency Instantiations And Global Variable Declarations

## === DATASET GEN
# YAML configuration file passed to both the DatasetGenerator and the Processor
yaml_path = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/configs/model_v0/model_v0_000.yaml'
ds_configuration_loader = DSConfigLoader()
# Factories injected into the DatasetGenerator, one instantiation per line
data_factory = DataIOFactory()
feature_processor_factory = FeatureProcessorFactory()
join_factory = JoinFactory()
strategy_factory = StrategyFactory()

## === PROCESSOR
# Collaborators injected into the Processor
p_configuration_loader = PConfigLoader()
split_strategy_factory = SplitStrategyFactory()

Create Dataset Generator & Generate Processed Dataset

In [3]:
# === STEP 2: DATASET GEN
# Wire the generator with the YAML config path, its configuration loader and
# the four factories created in STEP 1.
dataset_generator = DatasetGenerator(
    yaml_path,
    ds_configuration_loader,
    data_factory,
    feature_processor_factory,
    join_factory,
    strategy_factory,
)

In [None]:
# Run the dataset generator and keep its result as `processed_dataset`.
# This cell can be skipped when a saved copy already exists on disk: the
# load cell in the "Save/Load Processed Dataset to Disk" section assigns the
# same variable.
processed_dataset = dataset_generator.generate()

Save/Load Processed Dataset to Disk

In [4]:
# Folder path to save and load the processed dataset
# NOTE(review): absolute, machine-specific path — update it when running the
# notebook on another machine.
processed_dataset_folder_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v0"

In [None]:
## Use to save processed dataset to disk ##
# Hands `processed_dataset` to save_entity under the file name
# "processed_dataset.pkl" inside processed_dataset_folder_path.
# Requires `processed_dataset` to be in memory already (generated above).
save_entity(processed_dataset_folder_path, "processed_dataset.pkl", processed_dataset)

In [5]:
## Use to load processed dataset if computed already ##
# Rebinds `processed_dataset` to whatever load_entity returns for
# "processed_dataset.pkl" in processed_dataset_folder_path.
# NOTE(review): the .pkl extension suggests pickle — if so, only load files
# produced by this project; confirm load_entity's on-disk format.
processed_dataset = load_entity(processed_dataset_folder_path, "processed_dataset.pkl")

Processed Dataset Peek

In [None]:
# Display the `features` attribute of the processed dataset as the cell output.
# NOTE(review): if `features` is a large table, a bounded preview may be
# preferable here — confirm its type before changing.
processed_dataset.features

Create Processor & Generate Train, Val, and Test Datasets

In [10]:
# === STEP 3: PROCESSOR
# Wire the processor with the YAML config path, its configuration loader, the
# processed dataset and the split-strategy factory.
processor = Processor(
    yaml_path,
    p_configuration_loader,
    processed_dataset,
    split_strategy_factory,
)

In [11]:
# processor.generate() yields three datasets, unpacked here in
# train / validation / test order.
train_dataset, validation_dataset, test_dataset = processor.generate()

Save/Load Train, Val, Test DS

In [12]:
# Folder path to save and load train, test, and val datasets
# (same folder as the one used for the processed dataset)
train_test_val_folder_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v0"

In [13]:
## Use to save train, test, and val datasets to disk ##
# Each split is handed to save_entity under its own file name inside
# train_test_val_folder_path, in train -> val -> test order.
for split_file_name, split_dataset in (
    ("train.pkl", train_dataset),
    ("val.pkl", validation_dataset),
    ("test.pkl", test_dataset),
):
    save_entity(train_test_val_folder_path, split_file_name, split_dataset)

In [14]:
## Use to load train, test, and val datasets if computed already ##
# Loads the three split files in train -> val -> test order and rebinds the
# corresponding dataset variables.
train_dataset, validation_dataset, test_dataset = [
    load_entity(train_test_val_folder_path, split_file_name)
    for split_file_name in ("train.pkl", "val.pkl", "test.pkl")
]

Assess Dataset Imbalance

In [15]:
# Report the balance of the "Team_A_Wins" column for every split,
# in train -> validation -> test order.
for split_dataset in (train_dataset, validation_dataset, test_dataset):
    assess_dataset_balance(split_dataset, ["Team_A_Wins"])


Column: 'Team_A_Wins'
  - Number of 0.0s: 7290 (40.93%)
  - Number of 1.0s: 10521 (59.07%)

Column: 'Team_A_Wins'
  - Number of 1.0s: 2225 (58.29%)
  - Number of 0.0s: 1592 (41.71%)

Column: 'Team_A_Wins'
  - Number of 1.0s: 2251 (58.97%)
  - Number of 0.0s: 1566 (41.03%)


Preprocess Features

In [21]:
# Scale each split; with return_scaler=True every call hands back a
# (scaled dataset, scaler) pair.
# NOTE(review): every split is passed to scale_features on its own and gets
# its own scaler back. If scale_features fits a fresh scaler on the data it
# receives, the validation and test sets end up normalised with their own
# statistics instead of the training statistics, which makes them
# inconsistent with what a model trained on `scaled_training_dataset` sees.
# Confirm scale_features' behaviour and, if so, reuse `training_scaler` for
# the validation and test splits.
scaled_training_dataset, training_scaler = scale_features(train_dataset, return_scaler=True)
scaled_validation_dataset, val_scaler = scale_features(validation_dataset, return_scaler=True)
scaled_test_dataset, test_scaler = scale_features(test_dataset, return_scaler=True)

Save Scaled Datasets and Scalers

In [22]:
# Folder path to save and load scaled train, test, and val datasets & scalers
# (a "scaler" sub-folder of the model_v0 processed-datasets folder)
scaled_train_test_val_folder_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v0/scaler"

In [23]:
# Save scaled datasets and scalers
# Every (file name, object) pair is handed to save_entity inside
# scaled_train_test_val_folder_path; each scaled split is immediately
# followed by the scaler that was returned with it.
for scaled_file_name, scaled_entity in (
    ("scaled_train.pkl", scaled_training_dataset),
    ("train_scaler.pkl", training_scaler),
    ("scaled_val.pkl", scaled_validation_dataset),
    ("val_scaler.pkl", val_scaler),
    ("scaled_test.pkl", scaled_test_dataset),
    ("test_scaler.pkl", test_scaler),
):
    save_entity(scaled_train_test_val_folder_path, scaled_file_name, scaled_entity)

In [24]:
# Load scaled datasets and scalers
# One load per line; each scaled split is followed by its scaler.
scaled_training_dataset = load_entity(scaled_train_test_val_folder_path, "scaled_train.pkl")
training_scaler = load_entity(scaled_train_test_val_folder_path, "train_scaler.pkl")

scaled_validation_dataset = load_entity(scaled_train_test_val_folder_path, "scaled_val.pkl")
val_scaler = load_entity(scaled_train_test_val_folder_path, "val_scaler.pkl")

scaled_test_dataset = load_entity(scaled_train_test_val_folder_path, "scaled_test.pkl")
test_scaler = load_entity(scaled_train_test_val_folder_path, "test_scaler.pkl")