In [2]:
######## Generate and save dataset binary

# Generates the processed dataset used by all model v0 candidates
# Generates train, val, and test splits
# Saves generated train, val, and test datasets to disk for quick loading into memory

In [6]:
import sys
import os

# Dynamically add the project root to sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../../.."))  # Adjust based on depth
sys.path.append(project_root)

print(sys.path)


# === STEP 0: Imports
print("# === STEP 0: Imports: Starting ...")
from modules.dataset_generator.helpers.configuration_loader import ConfigurationLoader as DSConfigLoader
from modules.dataset_generator.factories.data_io_factory import DataIOFactory
from modules.dataset_generator.factories.feature_processor_factory import FeatureProcessorFactory
from modules.dataset_generator.factories.join_factory import JoinFactory
from modules.dataset_generator.factories.strategy_factory import StrategyFactory
from modules.dataset_generator.dataset_generator import DatasetGenerator
from modules.processor.helpers.configuration_loader import ConfigurationLoader as PConfigLoader
from modules.processor.factories.split_strategy_factory import SplitStrategyFactory
from modules.processor.processor import Processor
print("# === STEP 0: Imports: Complete")

AttributeError: 'str' object has no attribute 'append'

In [2]:
# === STEP 1: Dependency Instantiations And Global Variable Declarations
print("# === STEP 1: Dependency Instantiations And Global Variable Declarations: Starting ...")
## === DATASET GEN
yaml_path = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/configs/model_v0/model_v0.yaml'
ds_configuration_loader = DSConfigLoader()
data_factory, feature_processor_factory, join_factory, strategy_factory = DataIOFactory(), FeatureProcessorFactory(), JoinFactory(), StrategyFactory()
## === PROCESSOR
p_configuration_loader = PConfigLoader()
split_strategy_factory = SplitStrategyFactory()
## === MODEL MANAGER
checkpoint = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/models'
trainer = Trainer(checkpoint)
predictor = Predictor()
model_factory = ModelFactory()
mm_configuration_loader = MMConfigLoader()
print("# === STEP 1: Dependency Instantiations And Global Variable Declarations: Complete")

# === STEP 1: Dependency Instantiations And Global Variable Declarations: Starting ...
# === STEP 1: Dependency Instantiations And Global Variable Declarations: Complete


In [3]:
# === STEP 2: DATASET GEN
print("# === STEP 2: DATASET GEN: Starting ...")
dataset_generator = DatasetGenerator(yaml_path, ds_configuration_loader, data_factory, feature_processor_factory, join_factory, strategy_factory)

# === STEP 2: DATASET GEN: Starting ...


In [None]:
processed_dataset = dataset_generator.generate()
print("# === STEP 2: DATASET GEN: Complete")

In [4]:
# File path to save and load the processed dataset
processed_dataset_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/processed_dataset.pkl"

In [None]:
## Use to save processed dataset to disk ## 
import pickle
import os

with open(processed_dataset_path, "wb") as f:
        pickle.dump(processed_dataset, f)
        print("# === STEP 2: DATASET GEN: Saved to file")

In [5]:
## Use to load processed dataset if computed already ## 
import pickle
import os

print("# === STEP 2: DATASET GEN: Loading from saved file")
with open(processed_dataset_path, "rb") as f:
    processed_dataset = pickle.load(f)

# === STEP 2: DATASET GEN: Loading from saved file


In [6]:
processed_dataset.features

Unnamed: 0_level_0,A_player_1_MIN,A_player_1_PTS,A_player_1_AST,A_player_1_TO,A_player_1_PLUS_MINUS,A_player_1_OREB,A_player_1_DREB,A_player_1_PF,A_player_1_FG3_PCT,A_player_1_FG_PCT,...,B_player_8_AST,B_player_8_TO,B_player_8_PLUS_MINUS,B_player_8_OREB,B_player_8_DREB,B_player_8_PF,B_player_8_FG3_PCT,B_player_8_FG_PCT,B_player_8_FT_PCT,Team_A_Wins
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11400001,41.266667,31.0,5.0,1.0,-16.0,1.0,9.0,1.0,0.333,0.476,...,0.0,0.0,-11.0,0.0,0.0,4.0,0.0000,0.6670,0.0,0
11400002,45.083333,18.0,6.0,2.0,2.0,0.0,2.0,2.0,0.500,0.417,...,4.0,2.0,5.0,0.0,0.0,2.0,0.0000,0.4000,0.0,1
11400004,43.200000,6.0,7.0,4.0,-1.0,2.0,16.0,1.0,0.000,0.375,...,0.0,0.0,2.0,0.0,0.0,1.0,0.0000,0.0000,0.0,0
11400005,36.600000,12.0,2.0,1.0,9.0,1.0,7.0,3.0,0.000,0.545,...,1.0,0.0,6.0,0.0,5.0,4.0,0.5000,0.6670,1.0,1
11400006,36.866667,18.0,3.0,5.0,-17.0,2.0,3.0,4.0,0.429,0.389,...,0.0,0.0,13.0,0.0,3.0,4.0,0.0000,0.6000,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52100111,32.566667,22.0,0.0,2.0,18.0,0.0,2.0,1.0,1.000,0.538,...,0.0,0.0,-4.0,0.0,0.0,0.0,0.0000,0.0000,0.0,1
52100121,21.316667,13.0,2.0,0.0,-3.0,3.0,0.0,4.0,0.000,0.714,...,2.5,0.5,14.5,0.0,3.0,2.0,0.4165,0.4285,0.5,1
52100131,32.316667,19.0,6.0,3.0,-13.0,3.0,2.0,2.0,0.000,0.533,...,1.0,0.0,-1.0,2.0,2.0,0.0,0.0000,0.5000,1.0,1
52100201,40.000000,12.0,7.0,2.0,2.0,1.0,6.0,3.0,0.500,0.333,...,1.0,0.0,2.0,0.0,0.0,1.0,0.0000,1.0000,0.0,0


In [6]:
# === STEP 3: PROCESSOR
print("# === STEP 3: PROCESSOR: Starting ...")
processor = Processor(yaml_path, p_configuration_loader, processed_dataset, split_strategy_factory)

# === STEP 3: PROCESSOR: Starting ...


In [7]:
train_dataset, validation_dataset, test_dataset = processor.generate()
print("# === STEP 3: PROCESSOR: Complete")

# === STEP 3: PROCESSOR: Complete


In [8]:
len(train_dataset.examples)

17811

In [9]:
print("# === STEP 3.5: Assessing dataset inbalance: Starting...")
from collections import Counter
# Function to calculate dataset balance
def calculate_balance(dataset, name):
    # Extract "Team_A_Wins" labels
    y_labels = [example.features["Team_A_Wins"][0] for example in dataset.examples]

    # Count occurrences of each label
    label_counts = Counter(y_labels)
    num_1s = label_counts[1]
    num_0s = label_counts[0]

    # Calculate proportions
    total_samples = len(y_labels)
    proportion_1s = num_1s / total_samples * 100
    proportion_0s = num_0s / total_samples * 100

    # Display results
    print(f"Number of 1s (Team A wins) in {name}: {num_1s} ({proportion_1s:.2f}%)")
    print(f"Number of 0s (Team A does not win) in {name}: {num_0s} ({proportion_0s:.2f}%)")

# Assuming your dataset is loaded into memory as `my_dataset`
calculate_balance(train_dataset, "train")
calculate_balance(validation_dataset, "val")
calculate_balance(test_dataset, "test")

print("# === STEP 3.5: Assessing dataset inbalance: Complete")

# === STEP 3.5: Assessing dataset inbalance: Starting...
Number of 1s (Team A wins) in train: 10480 (58.84%)
Number of 0s (Team A does not win) in train: 7331 (41.16%)
Number of 1s (Team A wins) in val: 2255 (59.08%)
Number of 0s (Team A does not win) in val: 1562 (40.92%)
Number of 1s (Team A wins) in test: 2262 (59.26%)
Number of 0s (Team A does not win) in test: 1555 (40.74%)
# === STEP 3.5: Assessing dataset inbalance: Complete


In [10]:
# === STEP 4: MODEL MANAGER
print("# === STEP 4: MODEL MANAGER: Starting ...")
model_manager = ModelManager(trainer, predictor, model_factory, mm_configuration_loader)
models = model_manager.create_models([yaml_path])

# === STEP 4: MODEL MANAGER: Starting ...


In [11]:
train_dataset.examples[0].features
len(train_dataset.examples[0].features) # should be 177

177

In [12]:
# If no val dataset, code requires to push None instead
val_dataset = validation_dataset if validation_dataset is not None else None

model_manager.train(models, [(train_dataset, val_dataset)], save_after_training=True)
print("# === STEP 4: MODEL MANAGER: Complete")

INFO:root:Training model '8997782287ac95361ce10ecee1af1840' for 20 epochs with batch size 32.
INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 1/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 2/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 3/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 4/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 5/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 6/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 7/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 8/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 9/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 10/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 11/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 12/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 13/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 14/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 15/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 16/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 17/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 18/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 19/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Starting epoch 20/20.




INFO:root:Model '8997782287ac95361ce10ecee1af1840': Finished training.


Model saved successfully at: models/8997782287ac95361ce10ecee1af1840/model_weights_8997782287ac95361ce10ecee1af1840.pth
# === STEP 4: MODEL MANAGER: Complete


In [13]:
predictions = []
for model in models:
    print(model)
    predictions.append(model.predict(test_dataset.examples, return_target_labels=True))

<modules.model_manager.implementations.tensorflow_model.TensorFlowModel object at 0x7f8c19366d30>


In [14]:
for df in predictions:
    correct_predictions = (df['predictions'] == df['target_label']).sum()
    total_predictions = len(df)
    accuracy = correct_predictions / total_predictions
    print(f"Accuracy on test dataset: {accuracy}")

Accuracy on test dataset: 0.563793555148022


In [15]:
%tensorboard --logdir logs/fit/8997782287ac95361ce10ecee1af1840

UsageError: Line magic function `%tensorboard` not found.
