# This is the main notebook file for the machine learning program

In [78]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import classification_report
#%run read_csv_files.ipynb

## Begin with scaled data where the target value predicts whether the home team will win the game (Signal = '1')
## Part 1: Neural Network with Deep Learning for 10 Seasons

In [14]:
# Drop the 'Date' column from the dataframe
ten_seasons_nn = df_combined_2011_2021.drop(columns=['Date']).reset_index()

# Display the dataframe
ten_seasons_nn

Unnamed: 0,index,Home_Team,Away_Team,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,0,New York Knicks,Boston Celtics,195.5,5.0,-210,185,210,2
1,1,Dallas Mavericks,Miami Heat,190.5,3.5,185,-210,199,-11
2,2,Los Angeles Lakers,Chicago Bulls,186.0,1.5,205,-245,175,-1
3,3,Oklahoma City Thunder,Orlando Magic,197.0,7.5,-380,310,186,8
4,4,Golden State Warriors,Los Angeles Clippers,212.0,4.0,175,-200,191,-19
...,...,...,...,...,...,...,...,...,...
12576,1166,Phoenix Suns,Milwaukee Bucks,221.0,5.5,-180,160,226,10
12577,1167,Milwaukee Bucks,Phoenix Suns,222.0,3.5,-210,180,220,20
12578,1168,Milwaukee Bucks,Phoenix Suns,220.5,3.5,-190,170,212,6
12579,1169,Phoenix Suns,Milwaukee Bucks,218.5,4.0,-170,150,242,-4


In [15]:
# drop the 'index' column
ten_seasons_nn = ten_seasons_nn.drop(columns=['index'])

# display the dataframe
ten_seasons_nn

Unnamed: 0,Home_Team,Away_Team,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,New York Knicks,Boston Celtics,195.5,5.0,-210,185,210,2
1,Dallas Mavericks,Miami Heat,190.5,3.5,185,-210,199,-11
2,Los Angeles Lakers,Chicago Bulls,186.0,1.5,205,-245,175,-1
3,Oklahoma City Thunder,Orlando Magic,197.0,7.5,-380,310,186,8
4,Golden State Warriors,Los Angeles Clippers,212.0,4.0,175,-200,191,-19
...,...,...,...,...,...,...,...,...
12576,Phoenix Suns,Milwaukee Bucks,221.0,5.5,-180,160,226,10
12577,Milwaukee Bucks,Phoenix Suns,222.0,3.5,-210,180,220,20
12578,Milwaukee Bucks,Phoenix Suns,220.5,3.5,-190,170,212,6
12579,Phoenix Suns,Milwaukee Bucks,218.5,4.0,-170,150,242,-4


In [16]:
# Review the data types
ten_seasons_nn.dtypes

Home_Team               object
Away_Team               object
Over_Under             float64
Spread                 float64
Moneyline_Home_Team      int64
Moneyline_Away_Team      int64
Game_Total_Points        int64
Win_Margin               int64
dtype: object

In [17]:
# Create a list of the categorical variables
categorical_variables_1 = list(
    ten_seasons_nn.dtypes[ten_seasons_nn.dtypes == 'object'].index
)

# Display the list
categorical_variables_1

['Home_Team', 'Away_Team']

In [18]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(
    sparse = False
)

In [19]:
# Encode the categorical variables into numerical format
encoded_data_1 = enc.fit_transform(
    ten_seasons_nn[categorical_variables_1]
)

In [20]:
# Create a dataframe with the encoded data
encoded_ten_seasons_df = pd.DataFrame(
    encoded_data_1,
    columns = enc.get_feature_names_out(
        categorical_variables_1
    )
)

# Display the dataframe
encoded_ten_seasons_df

Unnamed: 0,Home_Team_Atlanta Hawks,Home_Team_Boston Celtics,Home_Team_Brooklyn Nets,Home_Team_Charlotte Bobcats,Home_Team_Chicago Bulls,Home_Team_Cleveland Cavaliers,Home_Team_Dallas Mavericks,Home_Team_Denver Nuggets,Home_Team_Detroit Pistons,Home_Team_Golden State Warriors,...,Away_Team_Oklahoma City Thunder,Away_Team_Orlando Magic,Away_Team_Philadelphia 76ers,Away_Team_Phoenix Suns,Away_Team_Portland Trail Blazers,Away_Team_Sacramento Kings,Away_Team_San Antonio Spurs,Away_Team_Toronto Raptors,Away_Team_Utah Jazz,Away_Team_Washington Wizards
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
12578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
12579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Create a new variable for the exisiting numerical data and reset the index
numerical_variables_1 = ten_seasons_nn.drop(
    columns = categorical_variables_1
)
numerical_variables_1

Unnamed: 0,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,195.5,5.0,-210,185,210,2
1,190.5,3.5,185,-210,199,-11
2,186.0,1.5,205,-245,175,-1
3,197.0,7.5,-380,310,186,8
4,212.0,4.0,175,-200,191,-19
...,...,...,...,...,...,...
12576,221.0,5.5,-180,160,226,10
12577,222.0,3.5,-210,180,220,20
12578,220.5,3.5,-190,170,212,6
12579,218.5,4.0,-170,150,242,-4


In [22]:
# Add the numerical data and the categorical encoded data to one dataframe
encoded_ten_seasons_df = pd.concat(
    [encoded_ten_seasons_df,
     numerical_variables_1],
    axis=1
)

# Display the dataframe
encoded_ten_seasons_df

Unnamed: 0,Home_Team_Atlanta Hawks,Home_Team_Boston Celtics,Home_Team_Brooklyn Nets,Home_Team_Charlotte Bobcats,Home_Team_Chicago Bulls,Home_Team_Cleveland Cavaliers,Home_Team_Dallas Mavericks,Home_Team_Denver Nuggets,Home_Team_Detroit Pistons,Home_Team_Golden State Warriors,...,Away_Team_San Antonio Spurs,Away_Team_Toronto Raptors,Away_Team_Utah Jazz,Away_Team_Washington Wizards,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,195.5,5.0,-210,185,210,2
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,190.5,3.5,185,-210,199,-11
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,186.0,1.5,205,-245,175,-1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,197.0,7.5,-380,310,186,8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,212.0,4.0,175,-200,191,-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,221.0,5.5,-180,160,226,10
12577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,222.0,3.5,-210,180,220,20
12578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,220.5,3.5,-190,170,212,6
12579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,218.5,4.0,-170,150,242,-4


In [23]:
# Add a "Signal" column which tracks the "Win_Margin" column
encoded_ten_seasons_df['Signal'] = 0.0

# When the "Win_Margin" is a greater than or equal to 0, set signal to 1
encoded_ten_seasons_df.loc[(encoded_ten_seasons_df['Win_Margin'] >= 0), 'Signal'] = 1

# When the "Win_Margin" is less than 0, set signal to -1
encoded_ten_seasons_df.loc[(encoded_ten_seasons_df['Win_Margin'] < 0), 'Signal'] = -1

# Display the dataframe
encoded_ten_seasons_df

Unnamed: 0,Home_Team_Atlanta Hawks,Home_Team_Boston Celtics,Home_Team_Brooklyn Nets,Home_Team_Charlotte Bobcats,Home_Team_Chicago Bulls,Home_Team_Cleveland Cavaliers,Home_Team_Dallas Mavericks,Home_Team_Denver Nuggets,Home_Team_Detroit Pistons,Home_Team_Golden State Warriors,...,Away_Team_Toronto Raptors,Away_Team_Utah Jazz,Away_Team_Washington Wizards,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin,Signal
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,195.5,5.0,-210,185,210,2,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,190.5,3.5,185,-210,199,-11,-1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,186.0,1.5,205,-245,175,-1,-1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,197.0,7.5,-380,310,186,8,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,212.0,4.0,175,-200,191,-19,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,221.0,5.5,-180,160,226,10,1.0
12577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,222.0,3.5,-210,180,220,20,1.0
12578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,220.5,3.5,-190,170,212,6,1.0
12579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,218.5,4.0,-170,150,242,-4,-1.0


In [24]:
# Change the 'Signal' column to float data type
encoded_ten_seasons_df['Signal'] = encoded_ten_seasons_df['Signal'].astype(float)
encoded_ten_seasons_df.dtypes

Home_Team_Atlanta Hawks        float64
Home_Team_Boston Celtics       float64
Home_Team_Brooklyn Nets        float64
Home_Team_Charlotte Bobcats    float64
Home_Team_Chicago Bulls        float64
                                ...   
Moneyline_Home_Team              int64
Moneyline_Away_Team              int64
Game_Total_Points                int64
Win_Margin                       int64
Signal                         float64
Length: 69, dtype: object

In [25]:
# Display the amount of each signal option
encoded_ten_seasons_df['Signal'].value_counts()

 1.0    7318
-1.0    5263
Name: Signal, dtype: int64

In [26]:
# Set the y (target value) to 'Signal' column
y = encoded_ten_seasons_df['Signal']

# Set the X (features) to all columns except 'Signal'
X = encoded_ten_seasons_df.drop(columns=['Signal'])

# Display a sample of each
display(y[:5])
display(X.head())

0    1.0
1   -1.0
2   -1.0
3    1.0
4   -1.0
Name: Signal, dtype: float64

Unnamed: 0,Home_Team_Atlanta Hawks,Home_Team_Boston Celtics,Home_Team_Brooklyn Nets,Home_Team_Charlotte Bobcats,Home_Team_Chicago Bulls,Home_Team_Cleveland Cavaliers,Home_Team_Dallas Mavericks,Home_Team_Denver Nuggets,Home_Team_Detroit Pistons,Home_Team_Golden State Warriors,...,Away_Team_San Antonio Spurs,Away_Team_Toronto Raptors,Away_Team_Utah Jazz,Away_Team_Washington Wizards,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,195.5,5.0,-210,185,210,2
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,190.5,3.5,185,-210,199,-11
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,186.0,1.5,205,-245,175,-1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,197.0,7.5,-380,310,186,8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,212.0,4.0,175,-200,191,-19


In [27]:
# Split the data into training and testing variables
# We will NOT be scaling the data in this intial model

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1
)

In [28]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
# Define the number of inputs to the model
inputs_1 = len(X_train.iloc[0])

# Display the amount of inputs
inputs_1

68

In [30]:
# Define the amount of outputs
outputs = 1

In [31]:
# Define the first hidden layer
hidden_1_layer_1 = ((inputs_1 + 1) // 2) + 33

# Display the amount of nodes
hidden_1_layer_1

67

In [32]:
# Define the second hidden layer
hidden_1_layer_2 = ((hidden_1_layer_1 + 1) // 2)

# Display the amount of nodes
hidden_1_layer_2

34

In [33]:
# Define the third hidden layer
hidden_1_layer_3 = ((hidden_1_layer_2 + 1) // 2)

# Display the amount of nodes
hidden_1_layer_3

17

In [34]:
# Create a Sequential mode instance
neural_network_1 = Sequential()

2023-02-13 18:14:19.160408: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [35]:
# Add the hidden layers
# First layer
neural_network_1.add(
    Dense(
        units=hidden_1_layer_1,
        input_dim=inputs_1,
        activation='ReLU'
    )
)

# Second layer
neural_network_1.add(
    Dense(
        units=hidden_1_layer_2,
        activation='ReLU'
    )
)

# Third layer
neural_network_1.add(
    Dense(
        units=hidden_1_layer_3,
        activation='ReLU'
    )
)

In [36]:
# Add the output layer
neural_network_1.add(
    Dense(
        units=outputs,
        activation='sigmoid'
    )
)

In [37]:
# Display the Sequential model summary
neural_network_1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 67)                4623      
                                                                 
 dense_1 (Dense)             (None, 34)                2312      
                                                                 
 dense_2 (Dense)             (None, 17)                595       
                                                                 
 dense_3 (Dense)             (None, 1)                 18        
                                                                 
Total params: 7,548
Trainable params: 7,548
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Compile the Sequential model
neural_network_1.compile(
    loss="binary_crossentropy",
    optimizer="adam", 
    metrics=["accuracy"]
)

In [43]:
# Fit the model using 85 epochs
fit_model = neural_network_1.fit(
    X_train_scaled,
    y_train,
    epochs=100
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [49]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = neural_network_1.evaluate(
    X_test_scaled,
    y_test, 
    verbose=2
)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

99/99 - 0s - loss: -9.7889e+10 - accuracy: 0.3954 - 458ms/epoch - 5ms/step
Loss: -97889476608.0, Accuracy: 0.3954227566719055


## Report:
With 10 seasons of learning data scaled, we are receiving about a 39.54% accuracy. This is very close to our goal of 40% accuracy. We hypothesize that the result will be better with less data, and we will try this with 5 seasons in parts 3 and 4. The next part, part 2, we will use SKLearn's logistic regression machine learning method and see if we can get a better result using all 10 seasons.

## Part 2: SKLearn's logistic regression machine learning model with 10 seasons

In [55]:
# Pull the original data
ten_seasons_lr = df_combined_2011_2021.copy().reset_index()

# display the dataframe
ten_seasons_lr

Unnamed: 0,index,Date,Home_Team,Away_Team,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,0,2011-12-25,New York Knicks,Boston Celtics,195.5,5.0,-210,185,210,2
1,1,2011-12-25,Dallas Mavericks,Miami Heat,190.5,3.5,185,-210,199,-11
2,2,2011-12-25,Los Angeles Lakers,Chicago Bulls,186.0,1.5,205,-245,175,-1
3,3,2011-12-25,Oklahoma City Thunder,Orlando Magic,197.0,7.5,-380,310,186,8
4,4,2011-12-25,Golden State Warriors,Los Angeles Clippers,212.0,4.0,175,-200,191,-19
...,...,...,...,...,...,...,...,...,...,...
12576,1166,2020-07-08,Phoenix Suns,Milwaukee Bucks,221.0,5.5,-180,160,226,10
12577,1167,2020-07-11,Milwaukee Bucks,Phoenix Suns,222.0,3.5,-210,180,220,20
12578,1168,2020-07-14,Milwaukee Bucks,Phoenix Suns,220.5,3.5,-190,170,212,6
12579,1169,2020-07-17,Phoenix Suns,Milwaukee Bucks,218.5,4.0,-170,150,242,-4


In [56]:
# Reset the index
ten_seasons_lr = ten_seasons_lr.drop(columns=['index', 'Date', 'Home_Team','Away_Team'])

# display the dataframe
ten_seasons_lr

Unnamed: 0,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,195.5,5.0,-210,185,210,2
1,190.5,3.5,185,-210,199,-11
2,186.0,1.5,205,-245,175,-1
3,197.0,7.5,-380,310,186,8
4,212.0,4.0,175,-200,191,-19
...,...,...,...,...,...,...
12576,221.0,5.5,-180,160,226,10
12577,222.0,3.5,-210,180,220,20
12578,220.5,3.5,-190,170,212,6
12579,218.5,4.0,-170,150,242,-4


In [57]:
# Add a column called 'Signal' to track the 'Win_Margin' column
ten_seasons_lr['Signal'] = 0.0

# When the 'Win_Margin' is greater than or equal to 0, set the signal to 1
ten_seasons_lr.loc[(ten_seasons_lr['Win_Margin'] >= 0), 'Signal'] = 1

# When the 'Win_Margin' is less than 0, set the signal to -1
ten_seasons_lr.loc[(ten_seasons_lr['Win_Margin'] < 0), 'Signal'] = -1

# Display the dataframe
ten_seasons_lr

Unnamed: 0,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin,Signal
0,195.5,5.0,-210,185,210,2,1.0
1,190.5,3.5,185,-210,199,-11,-1.0
2,186.0,1.5,205,-245,175,-1,-1.0
3,197.0,7.5,-380,310,186,8,1.0
4,212.0,4.0,175,-200,191,-19,-1.0
...,...,...,...,...,...,...,...
12576,221.0,5.5,-180,160,226,10,1.0
12577,222.0,3.5,-210,180,220,20,1.0
12578,220.5,3.5,-190,170,212,6,1.0
12579,218.5,4.0,-170,150,242,-4,-1.0


In [59]:
# Change all data types to float
ten_seasons_lr['Moneyline_Home_Team'] = ten_seasons_lr['Moneyline_Home_Team'].astype(float)
ten_seasons_lr['Moneyline_Away_Team'] = ten_seasons_lr['Moneyline_Away_Team'].astype(float)
ten_seasons_lr['Game_Total_Points'] = ten_seasons_lr['Game_Total_Points'].astype(float)
ten_seasons_lr['Win_Margin'] = ten_seasons_lr['Win_Margin'].astype(float)

In [60]:
ten_seasons_lr.dtypes

Over_Under             float64
Spread                 float64
Moneyline_Home_Team    float64
Moneyline_Away_Team    float64
Game_Total_Points      float64
Win_Margin             float64
Signal                 float64
dtype: object

In [62]:
# Set the y (target data) as 'Signal'
y = ten_seasons_lr['Signal']

# Set the X (features) as everything else
X = ten_seasons_lr.drop(columns=['Signal'])

# Display the dataframes
display(y[:5])
display(X.head())

0    1.0
1   -1.0
2   -1.0
3    1.0
4   -1.0
Name: Signal, dtype: float64

Unnamed: 0,Over_Under,Spread,Moneyline_Home_Team,Moneyline_Away_Team,Game_Total_Points,Win_Margin
0,195.5,5.0,-210.0,185.0,210.0,2.0
1,190.5,3.5,185.0,-210.0,199.0,-11.0
2,186.0,1.5,205.0,-245.0,175.0,-1.0
3,197.0,7.5,-380.0,310.0,186.0,8.0
4,212.0,4.0,175.0,-200.0,191.0,-19.0


In [69]:
# Split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    random_state = 1
)

In [70]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)

# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [71]:
# Create a logistic regression model instance
model = LogisticRegression(random_state = 1)

In [72]:
# Fit the model
ten_seasons_model = model.fit(X_train_scaled, y_train)

In [74]:
# Use the scaled testing data to predict the model
ten_seasons_pred = model.predict(X_test_scaled)

# display the predicted values
ten_seasons_pred[:10]

array([-1.,  1., -1., -1.,  1.,  1., -1.,  1.,  1., -1.])

In [75]:
# Create a classification report
ten_seasons_testing_report = classification_report(y_test, ten_seasons_pred)

# Print the report
print(ten_seasons_testing_report)

NameError: name 'classification_report' is not defined