In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import hvplot.pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump,load

In [2]:
Combined_Data = pd.read_csv("Resources/Cleaned CSVs/Combined_Data.csv")
df = pd.read_csv("Resources/Cleaned CSVs/Combined_Data.csv")

df_2024 = pd.read_csv("Resources/Cleaned CSVs/Cleaned_2024.csv")

Combined_Data.drop('Tm', axis=1, inplace=True)

# df_2024.drop("Unnamed: 0", axis=1, inplace=True)
# df_2024 = df_2024.drop([30, 31])

In [3]:
corr_data = Combined_Data.corr()['playoffs']

low_corr_df = pd.DataFrame(corr_data.loc[(corr_data <= 0.1) & (corr_data >= -0.1)])
low_corr_df.rename(columns={'playoffs': 'correlation'}, inplace=True)

index_values = low_corr_df.index.tolist()

index_values

['BK', 'CG_y', 'Ch', 'GF', 'SB', 'cSho']

In [4]:
columns_to_keep = index_values + ['playoffs']

focused_df = Combined_Data[columns_to_keep]

focused_df

Unnamed: 0,BK,CG_y,Ch,GF,SB,cSho,playoffs
0,5.0,2.0,6150.0,160.0,79.0,1.0,0
1,8.0,2.0,5889.0,160.0,90.0,1.0,1
2,8.0,2.0,5934.0,160.0,81.0,0.0,0
3,3.0,2.0,5819.0,160.0,125.0,0.0,1
4,3.0,1.0,6208.0,162.0,66.0,0.0,1
...,...,...,...,...,...,...,...
145,7.0,0.0,5944.0,162.0,101.0,0.0,0
146,1.0,0.0,5802.0,162.0,160.0,0.0,1
147,4.0,3.0,5789.0,159.0,79.0,1.0,1
148,12.0,1.0,5733.0,161.0,99.0,1.0,1


 ## Separate the Features (X) from the Target (y)

In [5]:
y = focused_df["playoffs"]
X = focused_df.drop(columns="playoffs")

 ## Split our data into training and testing

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

## Test for best regularization

In [7]:
# Fit a Logistic Regression model with L1 (Lasso) regularization
model_lasso = LogisticRegression(penalty='l1', solver='liblinear', random state = 1)
model_lasso.fit(X_train, y_train)

# Fit a Logistic Regression model with L2 (Ridge) regularization
model_ridge = LogisticRegression(penalty='l2', solver='lbfgs', random state = 1)
model_ridge.fit(X_train, y_train)

# Evaluate the models

y_pred_lasso_train = model_lasso.predict(X_train)
accuracy_lasso_train = accuracy_score(y_train, y_pred_lasso_train)

y_pred_lasso_test = model_lasso.predict(X_test)
accuracy_lasso_test = accuracy_score(y_test, y_pred_lasso_test)

y_pred_ridge_train = model_ridge.predict(X_train)
accuracy_ridge_train = accuracy_score(y_train, y_pred_ridge_train)

y_pred_ridge_test = model_ridge.predict(X_test)
accuracy_ridge_test = accuracy_score(y_test, y_pred_ridge_test)

print("Training Accuracy with L1 (Lasso) regularization:", accuracy_lasso_train)
print("Testing Accuracy with L1 (Lasso) regularization:", accuracy_lasso_test)
print("-------------------------------------------------------------------------")
print("Training Accuracy with L2 (Ridge) regularization:", accuracy_ridge_train)
print("Testing Accuracy with L2 (Ridge) regularization:", accuracy_ridge_test)

Training Accuracy with L1 (Lasso) regularization: 0.6517857142857143
Testing Accuracy with L1 (Lasso) regularization: 0.631578947368421
-------------------------------------------------------------------------
Training Accuracy with L2 (Ridge) regularization: 0.6428571428571429
Testing Accuracy with L2 (Ridge) regularization: 0.631578947368421


In [24]:
# Initialize logistic regression model
logreg = logreg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=5000)  # Set a maximum number of iterations

In [25]:
# Lists to store training and testing accuracy
train_accuracy = []
test_accuracy = []

# Train the model for different numbers of iterations
for i in range(1, 1001):  # Try different numbers of iterations
    logreg.set_params(max_iter=i)  # Set the number of iterations
    logreg.fit(X_train, y_train)  # Train the model

    # Predict on training and testing data
    y_train_pred = logreg.predict(X_train)
    y_test_pred = logreg.predict(X_test)

    # Calculate accuracy on training and testing data
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    # Append accuracy to the lists
    train_accuracy.append(train_acc)
    test_accuracy.append(test_acc)













In [26]:
# Create a DataFrame with the data
data = {
    'Iterations': range(1, 1001),
    'Training Accuracy': train_accuracy,
    'Testing Accuracy': test_accuracy
}
df_accuracy = pd.DataFrame(data)

In [27]:
# Plot the training and testing accuracy against the number of iterations using HVPlot
plot = df_accuracy.hvplot.line(x='Iterations', y=['Training Accuracy', 'Testing Accuracy'],
                               xlabel='Number of Iterations', ylabel='Accuracy',
                               title='Training and Testing Accuracy vs. Number of Iterations',
                               legend='bottom_right')

plot

In [12]:
# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Penalty type
    'solver': ['liblinear']  # Solver type
}

# Create a logistic regression model
logreg = LogisticRegression()

# Perform Grid Search Cross Validation
grid_search = GridSearchCV(logreg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Fit the model using the best hyperparameters
best_logreg = LogisticRegression(**best_params)
best_logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = best_logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.631578947368421




In [13]:
# Fit a Logistic Regression model with L1 (Lasso) regularization
model_lasso = LogisticRegression(penalty='l1', solver='liblinear')
model_lasso.fit(X_train, y_train)

# Fit a Logistic Regression model with L2 (Ridge) regularization
model_ridge = LogisticRegression(penalty='l2', solver='lbfgs')
model_ridge.fit(X_train, y_train)

# Evaluate the models

y_pred_lasso_train = model_lasso.predict(X_train)
accuracy_lasso_train = accuracy_score(y_train, y_pred_lasso_train)

y_pred_lasso_test = model_lasso.predict(X_test)
accuracy_lasso_test = accuracy_score(y_test, y_pred_lasso_test)

y_pred_ridge_train = model_ridge.predict(X_train)
accuracy_ridge_train = accuracy_score(y_train, y_pred_ridge_train)

y_pred_ridge_test = model_ridge.predict(X_test)
accuracy_ridge_test = accuracy_score(y_test, y_pred_ridge_test)

print("Training Accuracy with L1 (Lasso) regularization:", accuracy_lasso_train)
print("Testing Accuracy with L1 (Lasso) regularization:", accuracy_lasso_test)
print("-------------------------------------------------------------------------")
print("Training Accuracy with L2 (Ridge) regularization:", accuracy_ridge_train)
print("Testing Accuracy with L2 (Ridge) regularization:", accuracy_ridge_test)

Training Accuracy with L1 (Lasso) regularization: 0.6517857142857143
Testing Accuracy with L1 (Lasso) regularization: 0.631578947368421
-------------------------------------------------------------------------
Training Accuracy with L2 (Ridge) regularization: 0.6428571428571429
Testing Accuracy with L2 (Ridge) regularization: 0.631578947368421


 ## Make predictions

# Save the pre-trained model

In [14]:
dump(model_ridge, 'log_model.joblib')

['log_model.joblib']

In [15]:
model = load('log_model.joblib')

In [16]:
predictions_2024 = model.predict(df_2024[index_values])
predictions_2024

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1])

In [17]:
df_2024['predictions'] = predictions_2024
perdicted_2024 = df_2024[["Tm", "predictions"]]

perdicted_2024

Unnamed: 0,Tm,predictions
0,Arizona Diamondbacks,0
1,Atlanta Braves,0
2,Baltimore Orioles,0
3,Boston Red Sox,1
4,Chicago Cubs,0
5,Chicago White Sox,0
6,Cincinnati Reds,1
7,Cleveland Guardians,0
8,Colorado Rockies,0
9,Detroit Tigers,0


In [18]:
# Make list for all the predictions
predictions = {}

# Create loop for it to run 100 times with the model
iterations = 100
for i in range(iterations):
    y = focused_df["playoffs"]
    X = focused_df.drop(columns="playoffs")
    X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)
    model_ridge = LogisticRegression(penalty='l2', solver='lbfgs')
    model_ridge.fit(X_train, y_train)
    dump(model_ridge, 'log_model.joblib')
    model = load('log_model.joblib')
    predictions_2024 = model.predict(df_2024[index_values])
    predictions_df = pd.DataFrame({"Team": df_2024["Tm"], "Prediction": predictions_2024})
    counts = predictions_df.groupby('Team')['Prediction'].sum().astype(int)
    
    for team, count in counts.items():
        if team in predictions:
            predictions[team] += count
        else:
            predictions[team] = count

predictions

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'Arizona Diamondbacks': 0,
 'Atlanta Braves': 1,
 'Baltimore Orioles': 0,
 'Boston Red Sox': 32,
 'Chicago Cubs': 1,
 'Chicago White Sox': 6,
 'Cincinnati Reds': 65,
 'Cleveland Guardians': 3,
 'Colorado Rockies': 3,
 'Detroit Tigers': 0,
 'Houston Astros': 6,
 'Kansas City Royals': 12,
 'Los Angeles Angels': 9,
 'Los Angeles Dodgers': 0,
 'Miami Marlins': 1,
 'Milwaukee Brewers': 33,
 'Minnesota Twins': 5,
 'New York Mets': 0,
 'New York Yankees': 10,
 'Oakland Athletics': 1,
 'Philadelphia Phillies': 39,
 'Pittsburgh Pirates': 0,
 'San Diego Padres': 4,
 'San Francisco Giants': 0,
 'Seattle Mariners': 0,
 'St. Louis Cardinals': 0,
 'Tampa Bay Rays': 26,
 'Texas Rangers': 0,
 'Toronto Blue Jays': 5,
 'Washington Nationals': 48}

In [19]:
# Make list for all the predictions
scores = {}

# Create loop for it to run 100 times with the model
iterations = 100
for i in range(iterations):
    y = focused_df["playoffs"]
    X = focused_df.drop(columns="playoffs")
    X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)
    classifier = LogisticRegression(solver='liblinear', max_iter=93)
    classifier.fit(X_train, y_train)
    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
    dump(classifier, 'log_model.joblib')
    model = load('log_model.joblib')
    predictions_2024 = model.predict(df_2024[index_values])
    predictions_df = pd.DataFrame({"Team": df_2024["Tm"], "Prediction": predictions_2024})
    counts = predictions_df.groupby('Team')['Prediction'].sum().astype(int)
    
    for score, count in counts.items():
        if team in predictions:
            predictions[team] += count
        else:
            predictions[team] = count

predictions

Training Data Score: 0.6428571428571429
Testing Data Score: 0.631578947368421
Training Data Score: 0.6428571428571429
Testing Data Score: 0.631578947368421
Training Data Score: 0.6428571428571429
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6339285714285714
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6428571428571429
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6071428571428571
Testing Data Score: 0.631578947368421
Training Data Score: 0.6428571428571429
Testing Data Score: 0.631578947368421
Training Data Score: 0.6428571428571429
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6428571428571429
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6339285714285714
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6428571428571429
Testing Data Score: 0.6578947368421053
Training Data Score: 0.6517857142857143
Testing Data Score: 0.6052631578947368
Training Data Score: 0.6428571428571429
Testing Data Sco

{'Arizona Diamondbacks': 0,
 'Atlanta Braves': 1,
 'Baltimore Orioles': 0,
 'Boston Red Sox': 32,
 'Chicago Cubs': 1,
 'Chicago White Sox': 6,
 'Cincinnati Reds': 65,
 'Cleveland Guardians': 3,
 'Colorado Rockies': 3,
 'Detroit Tigers': 0,
 'Houston Astros': 6,
 'Kansas City Royals': 12,
 'Los Angeles Angels': 9,
 'Los Angeles Dodgers': 0,
 'Miami Marlins': 1,
 'Milwaukee Brewers': 33,
 'Minnesota Twins': 5,
 'New York Mets': 0,
 'New York Yankees': 10,
 'Oakland Athletics': 1,
 'Philadelphia Phillies': 39,
 'Pittsburgh Pirates': 0,
 'San Diego Padres': 4,
 'San Francisco Giants': 0,
 'Seattle Mariners': 0,
 'St. Louis Cardinals': 0,
 'Tampa Bay Rays': 26,
 'Texas Rangers': 0,
 'Toronto Blue Jays': 5,
 'Washington Nationals': 358}