# Using LSTMs to Predict LeBron James' Points in the Next Game

**Team:** Taiyo Nakai & Jonathan Wu  
**Objective:** Train an LSTM neural network to predict LeBron James' performance (e.g., over/under betting) using historical game data.  
**Methods:** LSTM model with softmax activation for multiclass outcomes. Incorporating long-term and short-term performance trends.  
**Data Source:** Basketball Reference (https://www.basketball-reference.com/players/j/jamesle01.html); CSV files for individual player game data  
**Evaluation:** Precision-based evaluation with every game betting simulation.  
**Experiments:** Hyperparameter tuning for LSTM architecture, dropout, and forget gate bias initialization.

## Imports

In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import random
import matplotlib.pyplot as plt
import tensorflow as tf
import keras_tuner as kt

from io import StringIO
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sortedcontainers import SortedSet
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.regularizers import l2

# Loading and Merging LeBron James Game Logs

This code loads and combines multiple CSV files containing LeBron James' game logs from a specified folder.  
1. Recursively searches for CSV files in the `GAME_LOG_FOLDER`.  
2. Adds each file path to a sorted set to maintain the ordered date.  
3. Reads and concatenates all CSV files into a single DataFrame `df`.  


In [2]:
GAME_LOG_FOLDER = "LeBron James Game Logs"

def fetch_files(DIRECTORY="LeBron James Game Logs"):
    csv_files = SortedSet()
    for dir_, _, files in os.walk(DIRECTORY):
        for file_name in files:
            rel_dir = os.path.relpath(dir_, DIRECTORY)
            rel_file = os.path.join(DIRECTORY, file_name)
            csv_files.add(rel_file)

    df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    return df
    
df = fetch_files()
df

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1.0,10/29/2003,18-303,CLE,@,SAC,L (-14),1,42:50:00,...,4,6,9,4,0,2,3,25,24.7,-9
1,2,2.0,10/30/2003,18-304,CLE,@,PHO,L (-9),1,40:21:00,...,10,12,8,1,0,7,1,21,14.7,-3
2,3,3.0,11/1/2003,18-306,CLE,@,POR,L (-19),1,39:10:00,...,4,4,6,2,0,2,3,8,5,-21
3,4,4.0,11/5/2003,18-310,CLE,,DEN,L (-4),1,41:06:00,...,9,11,7,2,3,2,1,7,11.2,-3
4,5,5.0,11/7/2003,18-312,CLE,@,IND,L (-1),1,43:44:00,...,5,5,3,0,0,7,2,23,9,-7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,78,78.0,4/6/2025,40-098,LAL,@,OKC,,,,...,,,,,,,,-1,,
1763,79,79.0,4/8/2025,40-100,LAL,@,OKC,,,,...,,,,,,,,-1,,
1764,80,80.0,4/9/2025,40-101,LAL,@,DAL,,,,...,,,,,,,,-1,,
1765,81,81.0,4/11/2025,40-103,LAL,,HOU,,,,...,,,,,,,,-1,,


# Renaming Columns in Game Log Data

This code renames two in the game log DataFrame for better readability.  

1. Renames the columns `'Unnamed: 5'` and `'Unnamed: 7'` to `'Location'` and `'Score Differential'`, respectively.  


In [3]:
# Retain the original df before editing
original_df = df.copy()

def df_rename(df):
    df = df.rename(columns={'Unnamed: 5': 'Location', 'Unnamed: 7': 'Score Differential'})
    return df

df = df_rename(df)
df.columns.values

array(['Rk', 'G', 'Date', 'Age', 'Tm', 'Location', 'Opp',
       'Score Differential', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-'], dtype=object)

# Data Cleaning & Feature Engineering

This code performs several data preprocessing steps on the game log DataFrame to clean.  
1. Drops rows where the `'G'` (Game) column is NaN.  
2. Converts the `'Date'` column to datetime format, and extracts the day, month, and year into new columns.  
3. Drops irrelevant columns like `'Rk'`, `'G'`, `'Date'`, etc.  
4. Maps categorical values in the `'Location'` column (home or away) and updates `'Opp'` and `'Tm'` columns to numeric values representing teams.  
5. Converts the `'MP'` column (minutes played) into a float, representing total minutes played in decimal format.  
6. Calculates and updates the `'Age'` column by converting it from a year-day format to a float representing age in years.  
7. Converts all columns to `float` type for consistency.  


In [4]:
def cleansing(df):
    df.drop(df.index[df['G'].isnull()], inplace = True)

    df['Date'] = pd.to_datetime(df['Date'])
    df['Day'] = df['Date'].dt.day
    df['Month'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year

    df = df.drop(columns=['Rk', 'G', 'Date', 'Score Differential', 'GS', 'GmSc', '+/-', 'ORB', 'DRB', 'TRB', 
                          'AST', 'STL', 'BLK', 'TOV', 'PF', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
                          '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
                          'BLK', 'TOV', 'PF'])

    df['Location'] = df['Location'].map({'@': 1, np.nan: 0}) 

    df['Opp'] = df['Opp'].map({'ATL': 1, 'BOS': 2, 'BRK': 3, 'NJN': 3, 'CHA': 4, 'CHO': 4,
                               'CHI': 5, 'CLE': 6, 'DAL': 7, 'DEN': 8, 'DET': 9, 'GSW': 10, 
                               'HOU': 11, 'IND': 12, 'LAC': 13, 'LAL': 14, 'MEM': 15, 'MIA': 16, 
                               'MIL': 17, 'MIN': 18, 'NOH': 19, 'NOK': 19, 'NOP': 19, 'NYK': 20, 
                               'OKC': 21, 'ORL': 22, 'PHI': 23, 'PHO': 24, 'POR': 25, 'SAC': 26,
                               'SAS': 27, 'SEA': 28, 'TOR': 29, 'UTA': 30, 'WAS': 31}) 

    df['Tm'] = df['Tm'].map({'ATL': 1, 'BOS': 2, 'BRK': 3, 'NJN': 3, 'CHA': 4, 'CHO': 4,
                             'CHI': 5, 'CLE': 6, 'DAL': 7, 'DEN': 8, 'DET': 9, 'GSW': 10, 
                             'HOU': 11, 'IND': 12, 'LAC': 13, 'LAL': 14, 'MEM': 15, 'MIA': 16, 
                             'MIL': 17, 'MIN': 18, 'NOH': 19, 'NOK': 19, 'NOP': 19, 'NYK': 20, 
                             'OKC': 21, 'ORL': 22, 'PHI': 23, 'PHO': 24, 'POR': 25, 'SAC': 26,
                             'SAS': 27, 'SEA': 28, 'TOR': 29, 'UTA': 30, 'WAS': 31})

    for i, row in df.iterrows():
        year, days = df.loc[i]['Age'].split('-')
        if int(df.loc[i]['Year']) % 4 == 0:
            days = int(days) / 366
        else:
            days = int(days) / 365
        df.loc[i, 'Age'] = int(year) + days

    df = df.astype(float)
    return df

df = cleansing(df)
df

Unnamed: 0,Age,Tm,Location,Opp,PTS,Day,Month,Year
0,18.830137,6.0,1.0,26.0,25.0,29.0,10.0,2003.0
1,18.832877,6.0,1.0,24.0,21.0,30.0,10.0,2003.0
2,18.838356,6.0,1.0,25.0,8.0,1.0,11.0,2003.0
3,18.849315,6.0,0.0,8.0,7.0,5.0,11.0,2003.0
4,18.854795,6.0,1.0,12.0,23.0,7.0,11.0,2003.0
...,...,...,...,...,...,...,...,...
1762,40.268493,14.0,1.0,21.0,-1.0,6.0,4.0,2025.0
1763,40.273973,14.0,1.0,21.0,-1.0,8.0,4.0,2025.0
1764,40.276712,14.0,1.0,7.0,-1.0,9.0,4.0,2025.0
1765,40.282192,14.0,0.0,11.0,-1.0,11.0,4.0,2025.0


# Data Scaling and Splitting for Time-Series Model

This code scales the features and target variable, and prepares the data for training a time-series model using sliding windows.   
1. **Scaling:**  
   - Scales the feature columns (excluding `'PTS'`) using `MinMaxScaler`.  
   - Scales the target column `'PTS'` using the same scaler.  
2. **Data Splitting:**  
   - Defines a `split()` function that creates sequences of past data points (with a specified window length) to predict future values.  
3. **Train/Test Split:**  
   - Splits the scaled features into training (80%) and testing (20%) sets.  
4. **Windowing:**  
   - Applies the `split()` function to create input-output pairs for training and testing using a variable sliding window length.  
5. **Reshaping:**  
   - Reshapes the input data into 3D arrays suitable for feeding into a neural network (samples, time steps, features).  
  

In [5]:
WINDOW = 5
YEAR_LIMIT = 2022

def split_train_test(df, window, year_limit):
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df.drop(columns=['PTS']))
    scaled_target = scaler.fit_transform(df[['PTS']])

    def split(dataset, window=1):
        dataX, dataY = [], []
        for i in range(len(dataset)-window-1):
            a = dataset[i:(i+window), 0]
            dataX.append(a)
            dataY.append(dataset[i + window, 0])
        return np.array(dataX), np.array(dataY)

    train_size = df[(df['Year'] < year_limit) | ((df['Year'] == year_limit) & (df['Month'] < 7))].shape[0]
    test_size = len(scaled_target) - train_size
    train, test = scaled_features[0:train_size,:], scaled_target[train_size:len(scaled_target),:]

    trainX, trainY = split(train, window)
    testX, testY = split(test, window)

    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
    
    return trainX, testX, trainY, testY, scaler

trainX, testX, trainY, testY, scaler = split_train_test(df, WINDOW, YEAR_LIMIT)

# Hyperparameter Tuning for LSTM Model

This code tunes the hyperparameters of an LSTM-based model for predicting LeBron James' points in future games.


In [6]:
def LSTM_Tuning():
    tuning_directory = 'LSTM_Tuning_Results'
    if os.path.exists(tuning_directory):
        shutil.rmtree(tuning_directory)
    
    def build_model(hp):
        model = Sequential([
            Input(shape=(trainX.shape[1], trainX.shape[2])),
            Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.001))),
            LSTM(64, return_sequences=True, kernel_regularizer=l2(0.001)),
            Dropout(0.25),
            LSTM(50,return_sequences=True),
            Dropout(0.25),
            LSTM(50,return_sequences=True),
            Dropout(0.25),
            LSTM(50),
            Dropout(0.25),
            Dense(48, kernel_regularizer=l2(0.001)),
            Dense(1, kernel_regularizer=l2(0.001), bias_initializer='zeros')
        ])

        # Compile the model
        model.compile(
            optimizer=tf.keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
            ),
            loss='mse',
            metrics=['mae']
        )

        return model

    # Initialize the tuner
    tuner = kt.Hyperband(
        build_model,  # Model-building function
        objective='val_mae',  # Optimize for validation MAE
        max_epochs=50,  # Max epochs for training
        factor=3,  # Reduction factor
        directory=tuning_directory,
        project_name='LSTM_Hyperband'
    )

    callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_mae', patience=10)
    ]

    # Perform hyperparameter search
    tuner.search(
        trainX, trainY,
        epochs=50,
        validation_data=(testX, testY),
        callbacks=callbacks
    )

    # Get the best hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    
    # Build the best model
    model = tuner.hypermodel.build(best_hps)

    # Train the best model
    model_History = model.fit(
        trainX, trainY,
        epochs=100,
        batch_size=32,
        validation_data=(testX, testY),
        callbacks=callbacks
    )
    
    return model, model_History

# Building and Training the LSTM Model

This code defines, compiles, and trains a LSTM model for predicting LeBron James' performance.  
1. **Model Architecture:**  
   - Creates a sequential model with the layers shown below
2. **Compilation:**  
   - Compiles the model using the Adam optimizer, mean squared error (MSE) loss, and mean absolute error (MAE) metric.  
3. **Callbacks:**  
   - Implements early stopping based on validation MAE with a patience of 10 epochs.  
4. **Training:**  
   - Trains the model for up to 100 epochs, using a batch size of 32, with validation data and the early stopping callback.  
5. **Output:** The model's training history is stored in `LSTM_History`.


In [None]:
def build_LSTM(tuning=False):
    if not tuning:
        LSTM_Neurons = 128
        dropout_val = 0.25
        dense_neurons = 38
        final_dense = 1
        LSTM_Model = Sequential([
            Input(shape=(trainX.shape[1], trainX.shape[2])),
            
            Bidirectional(LSTM(LSTM_Neurons, return_sequences=True, kernel_regularizer=l2(0.001))),
            LSTM(LSTM_Neurons/2, return_sequences=True, kernel_regularizer=l2(0.001)),
            Dropout(dropout_val),
            LSTM(LSTM_Neurons/2,return_sequences=True),
            Dropout(dropout_val),
            LSTM(LSTM_Neurons/2,return_sequences=True),
            Dropout(dropout_val),
            LSTM(LSTM_Neurons/2),
            Dropout(dropout_val),
            Dense(dense_neurons, kernel_regularizer=l2(0.001)),
            Dense(final_dense, kernel_regularizer=l2(0.001), bias_initializer='zeros')
        ])

        LSTM_Model.compile(optimizer='adam', loss='mse', metrics=['mae'])

        callbacks = [
            keras.callbacks.EarlyStopping(monitor='val_mae', patience=10)
        ]

        LSTM_History = LSTM_Model.fit(trainX, trainY, epochs=100, batch_size=32, validation_data=(testX, testY), callbacks=callbacks)

        return LSTM_Model, LSTM_History
    else:
        return LSTM_Tuning()

LSTM, LSTM_History = build_LSTM(True)

Trial 7 Complete [00h 00m 07s]
val_mae: 0.19006116688251495

Best val_mae So Far: 0.1813495010137558
Total elapsed time: 00h 00m 49s

Search: Running Trial #8

Value             |Best Value So Far |Hyperparameter
0.0013664         |0.00064118        |learning_rate
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
3                 |3                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/2


# Evaluating and Making Predictions with the LSTM Model

This code performs recurrent predictions using an LSTM model, updating test data with noisy predictions to simulate variability in the forecast.

### Functions:

1. **`new_window(X_test, scaled_prediction)`**:
    - **Purpose**: Updates the `X_test` data by replacing values in the windows containing `0` with the given `scaled_prediction` value.
    - Iterates over `X_test`, and if a window contains `0`, it replaces the `0` with `scaled_prediction`, decreasing the `replacement_index` until all applicable values are replaced.

2. **`new_target(y_test, scaled_prediction)`**:
    - **Purpose**: Updates the `y_test` target values by replacing the first occurrence of `0` with `scaled_prediction`.
    - Finds the first `0` in `y_test` and replaces it with `scaled_prediction`.

3. **`predict_with_noise(prediction)`**:
    - **Purpose**: Adjusts a given prediction by adding noise based on statistical properties of historical data (`PTS`).
    - Generates noise from a normal distribution based on the mean and standard deviation of the `PTS` column in `df`. If the noise is greater than the prediction, it averages the prediction with the noise; otherwise, it takes one-third of the average of both values to introduce more uncertainty.

4. **`recurrent_predictions(X_test, y_test, current_length)`**:
    - **Purpose**: Makes recurrent predictions with the LSTM model, updates the `PTS` column of a dataframe, and modifies the test data (`X_test`, `y_test`) based on the predictions.
    - Predicts future values using the LSTM model and inverse scales the predictions.
    - The prediction is adjusted with noise by calling `predict_with_noise()`.
    - Updates `X_test` and `y_test` by calling `new_window()` and `new_target()` with the updated prediction.
    - Repeats this process until all values are predicted.

In [None]:
def LSTM_Predict(year_limit):
    def new_window(X_test, scaled_prediction):
        """
        Updates the windows in the test data (X_test) by replacing the first occurrence of 0 
        with the scaled prediction value at the appropriate index.
        """
        replacement_index = 4
        for _ in X_test:
            for window in _:
                if 0 in window and replacement_index >= 0:
                    window[replacement_index] = scaled_prediction.item()
                    replacement_index -= 1
                elif replacement_index < 0:
                    break

    def new_target(y_test, scaled_prediction):
        """
        Updates the target values (y_test) by replacing the first occurrence of 0 with the 
        scaled prediction value.
        """
        index = np.where(y_test == 0)[0][0]
        y_test[index] = scaled_prediction.item()

    def predict_with_noise(prediction):
        """
        Adds noise to the given prediction by drawing from a normal distribution based on 
        historical data statistics (mean and std of 'PTS'). The noisy prediction is adjusted 
        depending on whether the noise is greater than the original prediction.
        """
        stats = df[(df['Year'] < year_limit) | ((df['Year'] == year_limit) & (df['Month'] < 7))]['PTS'].describe()
        noise = np.random.normal(loc=stats['mean'], scale=stats['std'], size=1)
        if noise > prediction:
            return np.round((prediction + noise) / 1.8, 0) # Decided on arbitrarily
        else:
            return np.round((prediction + noise) / 2.8, 0) # Decided on arbitrarily

    def recurrent_predictions(X_test, y_test, current_length):
        """
        Makes recursive predictions using the LSTM model, updating the 'PTS' column in the 
        DataFrame with noisy predictions at each step. The process continues until all test 
        data is used.
        """
        n = len(X_test)

        while current_length < n:
            current_X, current_y = X_test[:current_length], y_test[:current_length-1]

            predictions = LSTM.predict(np.array(current_X))
            predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
            predictions = [round(points[0]) for points in predictions]
            actual = scaler.inverse_transform(np.array(current_y).reshape(-1, 1))

            index = df[df['PTS'] == -1].index[0]
            df.loc[index, 'PTS'] = predict_with_noise(predictions[-1])

            temp_scaling = scaler.fit_transform(df[['PTS']])
            scaled_index = temp_scaling.tolist().index([0])-1
            scaled_number = temp_scaling[scaled_index]

            new_window(X_test, scaled_number)
            new_target(y_test, scaled_number)

            current_length += 1

        return predictions, actual

    original_length = testY.tolist().index(0) + 1
    predictions, actual = recurrent_predictions(testX, testY, original_length)
    
    return predictions, actual, original_length

predictions, actual, original_length = LSTM_Predict(YEAR_LIMIT)

# Plotting Actual vs. Predicted Values

This code visualizes the actual and predicted values of points scored for the test set.


In [None]:
plt.plot(actual[:original_length-1], label='Actual', color='blue')
plt.axvline(x=original_length-2, color='black', linestyle='--', label='Unplayed')
plt.plot(predictions, label='Predicted', color='red')

plt.title('Actual vs. Predicted NBA Points')
plt.xlabel('Game (Most Recent From Left to Right)')
plt.ylabel('Points')
plt.legend(loc='upper right')
plt.grid(True)

plt.show()

In [None]:
plt.plot(actual, label='Actual + Simulated', color='blue')
plt.axvline(x=original_length-2, color='black', linestyle='--', label='Unplayed')
plt.plot(predictions, label='Predicted', color='red')

plt.title('Actual vs. Predicted NBA Points')
plt.xlabel('Game (Most Recent From Left to Right)')
plt.ylabel('Points')
plt.legend(loc='upper right')
plt.grid(True)

plt.show()

# True Metric: Evaluation on Betting (Prop) Lines
While we did use MAE to best fit the model on LeBron James' historical data in points scored, we evaluted the ability of the model on a second metric: how many prop lines can it accurate hit and/or predict?

To do this, we scraped data from FanDuel's historical prop lines for LeBron James from 2023 to the present and compared it the actual points he scored that day, and how many points our model predicts him to actually score.

We want to maximize the amount of money earned if we were to truly bet on Lebron James.

In [None]:
bettinglinesdf = pd.read_csv("Betting_Lines_2024_2025")
bettinglines = np.array(bettinglinesdf["Line"])

In [None]:
predictions_combined = predictions[-window-2:] + predictionstemp
y_combined = np.array(data2024["PTS"])
plt.plot(y_combined, label='Actual', color='blue')
plt.plot(predictions_combined, label='Predicted', color='red')
plt.plot(bettinglines, label='Betting Line', color='goldenrod')

plt.title('Actual vs Predicted Points by Lebron James, 2024-2025')
plt.xlabel('Game (Most Recent From Left to Right)')
plt.ylabel('Points')
plt.legend(loc='upper left')
plt.grid(True)

plt.show()

# Betting in Different Ways
We will retroactively "bet" on points scored by Lebron James in several different ways, and see if one works better than others.

## Preparation: Predicting the rest of December 2024 
(as of December 4, the future)

In [None]:
datafall2024 = pd.read_csv("December_2024_data")
datafall2024.tail()

## Baseline: Bet the Favorite Every Game
If one simply bet on the favorite between over and under for every game, how would they have fared so far?

In [None]:
bettinglinesdf['Actual Points'] = np.array(data2024['PTS'])
bettinglinesdf['Predicted Points'] = np.array(predictions_combined)
bettinglinesdf

In [None]:
spendings = []  # Use a list to collect spendings
earnings = []   # Use a list to collect earnings

for index, row in bettinglinesdf.iterrows():
    # Assign variables based on columns
    line = row['Line']
    over = row['Over']
    under = row['Under']
    actual = row['Actual Points']
    predicted = row['Predicted Points']

    # Betting over
    if over < under:
        # Positive odds
        if over > 0:
            spendings.append(100)
            if actual > line:
                earnings.append(100 + over)
            else:
                earnings.append(0)  # Add 0 if no earnings
        # Negative odds
        elif over < 0:
            spendings.append(-over)
            if actual > line:
                earnings.append(100 - over)
            else:
                earnings.append(0)  # Add 0 if no earnings
    # Betting under
    elif over > under:
        if under > 0:
            spendings.append(100)
            if actual < line:
                earnings.append(100 + under)
            else:
                earnings.append(0)  # Add 0 if no earnings
        elif under < 0:
            spendings.append(-under)
            if actual < line:
                earnings.append(100 - under)
            else:
                earnings.append(0)  # Add 0 if no earnings

# Convert lists to NumPy arrays
spendings = np.array(spendings)
earnings = np.array(earnings)

# Calculate total spendings and earnings
total_spendings = spendings.sum()
total_earnings = earnings.sum()

print(f"We bet ${total_spendings} to earn ${total_earnings} \n for a net profit of ${total_earnings - total_spendings} \n or ${round((total_earnings - total_spendings)/len(spendings), 2)} per game")
d = {'spendings': spendings, 'earnings': earnings}
pd.DataFrame(data=d)


Evidently, a person that bet on the favorite every game lost money.

## Method 1: Predicted vs. Betting Line

In [None]:
spendings = []  # Use a list to collect spendings
earnings = []   # Use a list to collect earnings

for index, row in bettinglinesdf.iterrows():
    # Assign variables based on columns
    line = row['Line']
    over = row['Over']
    under = row['Under']
    actual = row['Actual Points']
    predicted = row['Predicted Points']

    # Betting over
    if predicted > line:
        # Positive odds
        if over > 0:
            spendings.append(100)
            if actual > line:
                earnings.append(100 + over)
            else:
                earnings.append(0)  # Add 0 if no earnings
        # Negative odds
        elif over < 0:
            spendings.append(-over)
            if actual > line:
                earnings.append(100 - over)
            else:
                earnings.append(0)  # Add 0 if no earnings
    # Betting under
    elif predicted < line:
        if under > 0:
            spendings.append(100)
            if actual < line:
                earnings.append(100 + under)
            else:
                earnings.append(0)  # Add 0 if no earnings
        elif under < 0:
            spendings.append(-under)
            if actual < line:
                earnings.append(100 - under)
            else:
                earnings.append(0)  # Add 0 if no earnings

# Convert lists to NumPy arrays
spendings = np.array(spendings)
earnings = np.array(earnings)

# Calculate total spendings and earnings
total_spendings = spendings.sum()
total_earnings = earnings.sum()

print(f"We bet ${total_spendings} to earn ${total_earnings} \n for a net profit of ${total_earnings - total_spendings} \n or ${round((total_earnings - total_spendings)/len(spendings), 2)} per game")
d = {'spendings': spendings, 'earnings': earnings}
pd.DataFrame(data=d)

This ended up being a worse strategy than the baseline.