In [13]:
######## Generate and save dataset binary

# Generates the processed dataset used by all model v0 candidates
# Generates train, val, and test splits
# Saves generated train, val, and test datasets to disk for quick loading into memory

Import All Required Modules

In [14]:
import sys

# Project root that contains the `modules` package imported below.
# NOTE(review): machine-specific absolute path; point it at your own checkout.
PROJECT_ROOT = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/'

# Register the root only once so re-running this cell does not keep
# appending duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


# === STEP 0: Imports
print("# === STEP 0: Imports: Starting ...")
from modules.dataset_generator.helpers.configuration_loader import ConfigurationLoader as DSConfigLoader
from modules.dataset_generator.factories.data_io_factory import DataIOFactory
from modules.dataset_generator.factories.feature_processor_factory import FeatureProcessorFactory
from modules.dataset_generator.factories.join_factory import JoinFactory
from modules.dataset_generator.factories.strategy_factory import StrategyFactory
from modules.dataset_generator.dataset_generator import DatasetGenerator
from modules.processor.helpers.configuration_loader import ConfigurationLoader as PConfigLoader
from modules.processor.factories.split_strategy_factory import SplitStrategyFactory
from modules.processor.processor import Processor
print("# === STEP 0: Imports: Complete")

# === STEP 0: Imports: Starting ...
# === STEP 0: Imports: Complete


In [15]:
# === STEP 1: Dependency Instantiations And Global Variable Declarations
print("# === STEP 1: Dependency Instantiations And Global Variable Declarations: Starting ...")

## === DATASET GEN
# Model configuration file handed to the dataset generator and the processor.
yaml_path = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/configs/model_v0/model_v01_000.yaml'
ds_configuration_loader = DSConfigLoader()
# One factory instance per collaborator, created in the same order as before.
data_factory = DataIOFactory()
feature_processor_factory = FeatureProcessorFactory()
join_factory = JoinFactory()
strategy_factory = StrategyFactory()

## === PROCESSOR
p_configuration_loader = PConfigLoader()
split_strategy_factory = SplitStrategyFactory()

print("# === STEP 1: Dependency Instantiations And Global Variable Declarations: Complete")

# === STEP 1: Dependency Instantiations And Global Variable Declarations: Starting ...
# === STEP 1: Dependency Instantiations And Global Variable Declarations: Complete


Create Dataset Generator & Generate Processed Dataset

In [16]:
# === STEP 2: DATASET GEN
print("# === STEP 2: DATASET GEN: Starting ...")

# Wire the generator with the config path, its loader, and the four factories
# created in STEP 1 (positional order is significant).
dataset_generator = DatasetGenerator(
    yaml_path,
    ds_configuration_loader,
    data_factory,
    feature_processor_factory,
    join_factory,
    strategy_factory,
)

# === STEP 2: DATASET GEN: Starting ...


In [None]:
# Build the processed dataset with the generator configured above.
# This cell can be skipped when a previously saved pickle exists; the
# "load processed dataset" cell further down restores `processed_dataset` instead.
processed_dataset = dataset_generator.generate()
print("# === STEP 2: DATASET GEN: Complete")

In [17]:
import os

# Directory in which the processed dataset pickle is saved and loaded from.
# NOTE(review): machine-specific absolute path; adjust to your local checkout.
processed_dataset_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v01"

# Ensure the directory exists (exist_ok=True keeps this cell safe to re-run).
os.makedirs(processed_dataset_path, exist_ok=True)

In [37]:
## Use to save processed dataset to disk ##
import pickle
import os

# Full path of the pickle file inside the processed-dataset directory.
processed_dataset_path_w_file = f"{processed_dataset_path}/processed_dataset.pkl"

# Serialize the in-memory dataset; leaving the `with` block flushes and closes the file.
with open(processed_dataset_path_w_file, "wb") as f:
    pickle.dump(processed_dataset, f)

# Report success only after the file has actually been closed.
print("# === STEP 2: DATASET GEN: Saved to file")

# === STEP 2: DATASET GEN: Saved to file


In [18]:
## Use to load processed dataset if computed already ##
import pickle
import os

# Same pickle location that the save cell writes to.
processed_dataset_path_w_file = f"{processed_dataset_path}/processed_dataset.pkl"

print("# === STEP 2: DATASET GEN: Loading from saved file")
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open(processed_dataset_path_w_file, "rb") as f:
    processed_dataset = pickle.load(f)

# === STEP 2: DATASET GEN: Loading from saved file


In [19]:
# Display the feature table of the processed dataset as the cell's output
# (sanity check of the per-player columns and the Team_A_Wins label column).
processed_dataset.features

Unnamed: 0_level_0,A_player_1_MIN,A_player_1_PTS,A_player_1_AST,A_player_1_TO,A_player_1_PLUS_MINUS,A_player_1_OREB,A_player_1_DREB,A_player_1_PF,A_player_1_FG3_PCT,A_player_1_FG_PCT,...,B_player_8_AST,B_player_8_TO,B_player_8_PLUS_MINUS,B_player_8_OREB,B_player_8_DREB,B_player_8_PF,B_player_8_FG3_PCT,B_player_8_FG_PCT,B_player_8_FT_PCT,Team_A_Wins
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11400001,41.266667,31.0,5.0,1.0,-16.0,1.0,9.0,1.0,0.333,0.476,...,0.0,0.0,-11.0,0.0,0.0,4.0,0.0000,0.6670,0.0,0
11400002,45.083333,18.0,6.0,2.0,2.0,0.0,2.0,2.0,0.500,0.417,...,4.0,2.0,5.0,0.0,0.0,2.0,0.0000,0.4000,0.0,1
11400004,43.200000,6.0,7.0,4.0,-1.0,2.0,16.0,1.0,0.000,0.375,...,0.0,0.0,2.0,0.0,0.0,1.0,0.0000,0.0000,0.0,0
11400005,36.600000,12.0,2.0,1.0,9.0,1.0,7.0,3.0,0.000,0.545,...,1.0,0.0,6.0,0.0,5.0,4.0,0.5000,0.6670,1.0,1
11400006,36.866667,18.0,3.0,5.0,-17.0,2.0,3.0,4.0,0.429,0.389,...,0.0,0.0,13.0,0.0,3.0,4.0,0.0000,0.6000,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52100111,32.566667,22.0,0.0,2.0,18.0,0.0,2.0,1.0,1.000,0.538,...,0.0,0.0,-4.0,0.0,0.0,0.0,0.0000,0.0000,0.0,1
52100121,21.316667,13.0,2.0,0.0,-3.0,3.0,0.0,4.0,0.000,0.714,...,2.5,0.5,14.5,0.0,3.0,2.0,0.4165,0.4285,0.5,1
52100131,32.316667,19.0,6.0,3.0,-13.0,3.0,2.0,2.0,0.000,0.533,...,1.0,0.0,-1.0,2.0,2.0,0.0,0.0000,0.5000,1.0,1
52100201,40.000000,12.0,7.0,2.0,2.0,1.0,6.0,3.0,0.500,0.333,...,1.0,0.0,2.0,0.0,0.0,1.0,0.0000,1.0000,0.0,0


Create Processor & Generate Train, Val, and Test Datasets

In [20]:
# === STEP 3: PROCESSOR
print("# === STEP 3: PROCESSOR: Starting ...")

# The processor receives the same YAML config as the generator, its own
# configuration loader, the processed dataset, and the split-strategy factory.
processor = Processor(
    yaml_path,
    p_configuration_loader,
    processed_dataset,
    split_strategy_factory,
)

# === STEP 3: PROCESSOR: Starting ...


In [21]:
# Produce the three splits; the result is unpacked in train / validation / test order.
train_dataset, validation_dataset, test_dataset = processor.generate()
print("# === STEP 3: PROCESSOR: Complete")

# === STEP 3: PROCESSOR: Complete


In [22]:
# File path to save and load train, test, and val datasets
train_test_val_dataset_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v01"

# Ensure the directory exists
os.makedirs(train_test_val_dataset_path, exist_ok=True)

In [23]:
## Use to save train, test, and val datasets to disk ## 
import pickle
import os

train_dataset_path = train_test_val_dataset_path + "/train.pkl"
val_dataset_path = train_test_val_dataset_path + "/val.pkl"
test_dataset_path = train_test_val_dataset_path + "/test.pkl"

with open(train_dataset_path, "wb") as f:
        pickle.dump(train_dataset, f)
        print("Train dataset saved to file")
with open(val_dataset_path, "wb") as f:
        pickle.dump(validation_dataset, f)
        print("Val dataset saved to file")
with open(test_dataset_path, "wb") as f:
        pickle.dump(test_dataset, f)
        print("Test dataset saved to file")

Train dataset saved to file
Val dataset saved to file
Test dataset saved to file


In [43]:
## Use to load train, test, and val datasets if computed already ## 
import pickle
import os

train_dataset_path = train_test_val_dataset_path + "/train.pkl"
val_dataset_path = train_test_val_dataset_path + "/val.pkl"
test_dataset_path = train_test_val_dataset_path + "/test.pkl"

with open(train_dataset_path, "rb") as f:
    train_dataset = pickle.load(f)
    print("Train dataset leaded to memory")
with open(val_dataset_path, "rb") as f:
    validation_dataset = pickle.load(f)
    print("Val dataset leaded to memory")
with open(test_dataset_path, "rb") as f:
    test_dataset = pickle.load(f)
    print("Test dataset leaded to memory")

Train dataset leaded to memory
Val dataset leaded to memory
Test dataset leaded to memory


In [24]:
# Banner marking the start of the class-balance check for the three splits.
# NOTE(review): "inbalance" in this cell's banner strings is a typo for "imbalance";
# fix the start and completion banners together so they stay consistent.
print("# === STEP 3.5: Assessing dataset inbalance: Starting...")
from collections import Counter
# Function to calculate dataset balance
def calculate_balance(dataset, name):
    """Print the class balance of the "Team_A_Wins" label for one dataset split.

    Args:
        dataset: Object exposing an ``examples`` iterable; every example must
            provide ``features["Team_A_Wins"][0]``, which is used as its label.
        name: Split name interpolated into the printed report (e.g. "train").

    Returns:
        None. Two report lines (count and percentage of 1s, then of 0s) are
        written to stdout. An empty split is reported as 0 (0.00%) for both
        labels instead of raising ZeroDivisionError.
    """
    # Tally the labels; only the first entry of each "Team_A_Wins" feature is read.
    label_counts = Counter(
        example.features["Team_A_Wins"][0] for example in dataset.examples
    )
    # Equals the number of examples, since each one contributes exactly one label.
    total_samples = sum(label_counts.values())

    for label, description in ((1, "Team A wins"), (0, "Team A does not win")):
        count = label_counts[label]
        # Guard the division so an empty split yields 0.00% rather than an exception.
        proportion = count / total_samples * 100 if total_samples else 0.0
        print(f"Number of {label}s ({description}) in {name}: {count} ({proportion:.2f}%)")

# Report the label balance of each split; train_dataset, validation_dataset and
# test_dataset must already be in memory (generated by the processor or loaded from disk).
calculate_balance(train_dataset, "train")
calculate_balance(validation_dataset, "val")
calculate_balance(test_dataset, "test")

# NOTE(review): "inbalance" below is a typo for "imbalance"; keep it in sync with the start banner.
print("# === STEP 3.5: Assessing dataset inbalance: Complete")

# === STEP 3.5: Assessing dataset inbalance: Starting...
Number of 1s (Team A wins) in train: 10538 (59.17%)
Number of 0s (Team A does not win) in train: 7273 (40.83%)
Number of 1s (Team A wins) in val: 2249 (58.92%)
Number of 0s (Team A does not win) in val: 1568 (41.08%)
Number of 1s (Team A wins) in test: 2210 (57.90%)
Number of 0s (Team A does not win) in test: 1607 (42.10%)
# === STEP 3.5: Assessing dataset inbalance: Complete
