In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score
from joblib import dump,load

## Read in cleaned CSVs

In [2]:
combined_data = pd.read_csv("Resources/Cleaned CSVs/Combined_Data.csv")

df_2024 = pd.read_csv("Resources/Cleaned CSVs/Cleaned_2024.csv")

 ## Prep the data for modeling

In [3]:
#Drop the teams column
combined_data.drop('Tm', axis=1, inplace=True)

# Separate the Features (X) from the Target (y)

y = combined_data["playoffs"]
X = combined_data.drop(columns="playoffs")

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

 ## Create a Logistic Regression Model

In [4]:
lr_model = LogisticRegression(solver='liblinear', max_iter=100)
# Fit the model
lr_model = lr_model.fit(X_train, y_train)

 ## Create a Random Forest Model

In [5]:
rf_model = RandomForestClassifier(n_estimators=500)
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

 ## Create a K Nearest Neighbors Model

In [6]:
knn_model = KNeighborsClassifier(n_neighbors=6)
# Fit the model
knn_model.fit(X_train_scaled, y_train)

In [7]:
# Create predictions
y_pred_train = knn_model.predict(X_train_scaled)
y_pred_test = knn_model.predict(X_test_scaled)
# Calculate the accuracy score
test_acc_score = accuracy_score(y_pred_test, y_test)
train_acc_score = accuracy_score(y_pred_train, y_train)

 ## Score the models using the test data

In [8]:
print(f"Logistic Regression Training Data Score: {lr_model.score(X_train, y_train)}")
print(f"Logistic Regression Testing Data Score: {lr_model.score(X_test, y_test)}")
print(f"-----------------------------------------------------------------")
print(f"Random Forest Training Data Score: {rf_model.score(X_train_scaled, y_train)}")
print(f"Random Forest Testing Data Score: {rf_model.score(X_test_scaled, y_test)}")
print(f"-----------------------------------------------------------------")
print(f"K Nearest Neighbors Training Data Score: {train_acc_score}")
print(f"K Nearest Neighbors Training Data Score : {test_acc_score}")

Logistic Regression Training Data Score: 1.0
Logistic Regression Testing Data Score: 0.8157894736842105
-----------------------------------------------------------------
Random Forest Training Data Score: 1.0
Random Forest Testing Data Score: 0.8421052631578947
-----------------------------------------------------------------
K Nearest Neighbors Training Data Score: 0.8660714285714286
K Nearest Neighbors Training Data Score : 0.7894736842105263


## Identifying Independant Variables to Optimize the Models

In [9]:
#Run a correlation test on the playoffs column

corr_data = combined_data.corr()['playoffs']

corr_data

#Bat       -0.272665
#Fld       -0.274598
#P         -0.175295
2B          0.302605
3B         -0.100437
              ...   
WHIP       -0.607284
WP         -0.225175
cSho        0.026995
tSho        0.454448
playoffs    1.000000
Name: playoffs, Length: 78, dtype: float64

In [10]:
#From the results limit our scope to those values closest to zero to find the most independent variables

low_corr_df = pd.DataFrame(corr_data.loc[(corr_data <= 0.1) & (corr_data >= -0.1)])
low_corr_df.rename(columns={'playoffs': 'correlation'}, inplace=True)

#Save the independent variables to a list

index_values = low_corr_df.index.tolist()

index_values

['BK', 'CG_y', 'Ch', 'GF', 'SB', 'cSho']

In [11]:
#Amend the data to only include the independent variables
columns_to_keep = index_values + ['playoffs']

focused_df = combined_data[columns_to_keep]
df_2024_focused = combined_data[columns_to_keep]

focused_df

Unnamed: 0,BK,CG_y,Ch,GF,SB,cSho,playoffs
0,5.0,2.0,6150.0,160.0,79.0,1.0,0
1,8.0,2.0,5889.0,160.0,90.0,1.0,1
2,8.0,2.0,5934.0,160.0,81.0,0.0,0
3,3.0,2.0,5819.0,160.0,125.0,0.0,1
4,3.0,1.0,6208.0,162.0,66.0,0.0,1
...,...,...,...,...,...,...,...
145,7.0,0.0,5944.0,162.0,101.0,0.0,0
146,1.0,0.0,5802.0,162.0,160.0,0.0,1
147,4.0,3.0,5789.0,159.0,79.0,1.0,1
148,12.0,1.0,5733.0,161.0,99.0,1.0,1


## Retrain the models based on independent variables

In [12]:
focused_y = focused_df["playoffs"]
focused_X = focused_df.drop(columns="playoffs")

In [13]:
# Splitting into Train and Test sets
focused_X_train, focused_X_test, focused_y_train, focused_y_test = train_test_split(focused_X, focused_y, stratify=y)

In [14]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(focused_X_train)
# Scale the training data
focused_X_train_scaled = X_scaler.transform(focused_X_train)
focused_X_test_scaled = X_scaler.transform(focused_X_test)

In [15]:
# Logistic Regression
focused_lr_model = LogisticRegression(solver='liblinear', max_iter=100)
# Fit the model
focused_lr_model = focused_lr_model.fit(focused_X_train, focused_y_train)

In [16]:
# Random Forest
focused_rf_model = RandomForestClassifier(n_estimators=500)
# Fit the model
focused_rf_model = focused_rf_model.fit(focused_X_train_scaled, focused_y_train)

In [17]:
# K Nearest Neighbors
focused_knn_model = KNeighborsClassifier(n_neighbors=6)
# Fit the model
focused_knn_model = focused_knn_model.fit(focused_X_train_scaled, focused_y_train)

In [18]:
# Create predictions
focused_y_pred_train = focused_knn_model.predict(focused_X_train_scaled)
focused_y_pred_test = focused_knn_model.predict(focused_X_test_scaled)
# Calculate the accuracy score
focused_test_acc_score = accuracy_score(focused_y_pred_test, focused_y_test)
focused_train_acc_score = accuracy_score(focused_y_pred_train, focused_y_train)

In [19]:
print(f"Logistic Regression Training Data Score: {focused_lr_model.score(focused_X_train, focused_y_train)}")
print(f"Logistic Regression Testing Data Score: {focused_lr_model.score(focused_X_test, focused_y_test)}")
print(f"-----------------------------------------------------------------")
print(f"Random Forest Training Data Score: {focused_rf_model.score(focused_X_train_scaled, focused_y_train)}")
print(f"Random Forest Testing Data Score: {focused_rf_model.score(focused_X_test_scaled, focused_y_test)}")
print(f"-----------------------------------------------------------------")
print(f"K Nearest Neighbors Training Data Score: {focused_train_acc_score}")
print(f"K Nearest Neighbors Training Data Score : {focused_test_acc_score}")

Logistic Regression Training Data Score: 0.6339285714285714
Logistic Regression Testing Data Score: 0.5789473684210527
-----------------------------------------------------------------
Random Forest Training Data Score: 1.0
Random Forest Testing Data Score: 0.39473684210526316
-----------------------------------------------------------------
K Nearest Neighbors Training Data Score: 0.75
K Nearest Neighbors Training Data Score : 0.5


## Further Optimize the Logistic Regression Model

In [20]:
# Logistic Regression
focused_lr_model = LogisticRegression(solver='liblinear', max_iter=80)
# Fit the model
focused_lr_model = focused_lr_model.fit(focused_X_train, focused_y_train)

In [21]:
print(f"Logistic Regression Training Data Score: {focused_lr_model.score(focused_X_train, focused_y_train)}")
print(f"Logistic Regression Testing Data Score: {focused_lr_model.score(focused_X_test, focused_y_test)}")

Logistic Regression Training Data Score: 0.6339285714285714
Logistic Regression Testing Data Score: 0.5789473684210527


# Save the pre-trained model

In [22]:
dump(focused_lr_model, 'log_model.joblib')

['log_model.joblib']

In [23]:
model = load('log_model.joblib')

In [24]:
predictions_2024 = model.predict(df_2024[index_values])
predictions_2024

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
df_2024['predictions'] = predictions_2024
perdicted_2024 = df_2024[["Tm", "predictions"]]

perdicted_2024

Unnamed: 0,Tm,predictions
0,Arizona Diamondbacks,0
1,Atlanta Braves,0
2,Baltimore Orioles,0
3,Boston Red Sox,1
4,Chicago Cubs,0
5,Chicago White Sox,0
6,Cincinnati Reds,1
7,Cleveland Guardians,0
8,Colorado Rockies,0
9,Detroit Tigers,0


In [26]:
# Make list for all the predictions
predictions = {}

# Create loop for it to run 100 times with the model
iterations = 100
for i in range(iterations):
    y = focused_df["playoffs"]
    X = focused_df.drop(columns="playoffs")
    focused_X_train, focused_X_test, focused_y_train, focused_y_test = train_test_split(X,y,stratify=y)
    focused_lr_model = LogisticRegression(solver='liblinear', max_iter=93)
    focused_lr_model.fit(focused_X_train, focused_y_train)
    dump(focused_lr_model, 'log_model.joblib')
    model = load('log_model.joblib')
    predictions_2024 = model.predict(df_2024[index_values])
    predictions_df = pd.DataFrame({"Team": df_2024["Tm"], "Prediction": predictions_2024})
    counts = predictions_df.groupby('Team')['Prediction'].sum().astype(int)
    
    for team, count in counts.items():
        if team in predictions:
            predictions[team] += count
        else:
            predictions[team] = count

predictions

{'Arizona Diamondbacks': 0,
 'Atlanta Braves': 4,
 'Baltimore Orioles': 0,
 'Boston Red Sox': 22,
 'Chicago Cubs': 2,
 'Chicago White Sox': 10,
 'Cincinnati Reds': 55,
 'Cleveland Guardians': 3,
 'Colorado Rockies': 1,
 'Detroit Tigers': 0,
 'Houston Astros': 11,
 'Kansas City Royals': 15,
 'Los Angeles Angels': 14,
 'Los Angeles Dodgers': 0,
 'Miami Marlins': 2,
 'Milwaukee Brewers': 32,
 'Minnesota Twins': 7,
 'New York Mets': 1,
 'New York Yankees': 10,
 'Oakland Athletics': 0,
 'Philadelphia Phillies': 29,
 'Pittsburgh Pirates': 0,
 'San Diego Padres': 4,
 'San Francisco Giants': 0,
 'Seattle Mariners': 1,
 'St. Louis Cardinals': 0,
 'Tampa Bay Rays': 21,
 'Texas Rangers': 0,
 'Toronto Blue Jays': 7,
 'Washington Nationals': 37}