In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump,load

In [2]:
# Parse the combined historical data once. `df` keeps the frame exactly as read
# (including the 'Tm' column), while `Combined_Data` is a separate copy without
# 'Tm' that the modelling cells work on.
# NOTE(review): 'Tm' is presumably the team-name column — confirm against the CSV.
df = pd.read_csv("Resources/Cleaned CSVs/Combined_Data.csv")
Combined_Data = df.drop(columns='Tm')

# 2024 data used for the final predictions. The "Unnamed: 0" column is dropped
# (presumably a row index written out when the CSV was saved — TODO confirm).
df_2024 = pd.read_csv("Resources/Cleaned CSVs/Cleaned_2024.csv").drop(columns="Unnamed: 0")

In [3]:
# Correlation of every column in Combined_Data with the 'playoffs' column.
corr_data = Combined_Data.corr()['playoffs']

# Keep only the columns whose correlation with 'playoffs' lies in [-0.1, 0.1]
# (both bounds inclusive).
# NOTE(review): this selects the columns with the *weakest* linear relationship
# to the target, and those are the columns used as features further down —
# confirm this is intentional and not an inverted filter.
weak_mask = corr_data.between(-0.1, 0.1)
low_corr_df = pd.DataFrame(corr_data.loc[weak_mask]).rename(columns={'playoffs': 'correlation'})

# Names of the selected columns, reused later as the feature list.
index_values = list(low_corr_df.index)

index_values

['BK', 'CG_y', 'Ch', 'GF', 'SB', 'cSho']

In [4]:
# Selected feature columns followed by the 'playoffs' target column.
columns_to_keep = [*index_values, 'playoffs']

# Restrict the combined data to just those columns.
focused_df = Combined_Data.loc[:, columns_to_keep]

focused_df

Unnamed: 0,BK,CG_y,Ch,GF,SB,cSho,playoffs
0,5.0,2.0,6150.0,160.0,79.0,1.0,0
1,8.0,2.0,5889.0,160.0,90.0,1.0,1
2,8.0,2.0,5934.0,160.0,81.0,0.0,0
3,3.0,2.0,5819.0,160.0,125.0,0.0,1
4,3.0,1.0,6208.0,162.0,66.0,0.0,1
...,...,...,...,...,...,...,...
145,7.0,0.0,5944.0,162.0,101.0,0.0,0
146,1.0,0.0,5802.0,162.0,160.0,0.0,1
147,4.0,3.0,5789.0,159.0,79.0,1.0,1
148,12.0,1.0,5733.0,161.0,99.0,1.0,1


 ## Separate the Features (X) from the Target (y)

In [5]:
# Features: every column of focused_df except the 'playoffs' column.
X = focused_df.drop(columns=["playoffs"])
# Target: the 'playoffs' column on its own.
y = focused_df["playoffs"]

 ## Split our data into training and testing

In [6]:
# Splitting into Train and Test sets.
# random_state is fixed so the split is identical on every run; without it the
# rows in each split (and therefore every score computed from them) change each
# time the notebook is executed.
# stratify=y keeps the proportion of each `y` class the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape

(112, 6)

 ## Create a Logistic Regression Model

In [7]:
# Logistic regression using the liblinear solver with up to 1000 iterations.
# random_state is fixed because scikit-learn documents it as being used by the
# liblinear solver; pinning it keeps the fitted model reproducible across runs.
classifier = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=1,
)
classifier

 ## Fit (train) our model using the training data

In [8]:
# Train the classifier on the training split only.
classifier.fit(X=X_train, y=y_train)

 ## Score the model using the test data

In [9]:
# Report the classifier's score on the training split and on the held-out
# test split, one per line.
print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score: {classifier.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.6339285714285714
Testing Data Score: 0.631578947368421


In [10]:
# Number of rows in the training split.
X_train.shape[0]

112

In [11]:
# Number of rows in the test split.
X_test.shape[0]

38

 ## Make predictions

In [12]:
# Predicted labels for the held-out test rows.
predictions = classifier.predict(X_test)

# Predictions next to the actual labels; the index inherited from y_test is
# discarded so rows are simply numbered from 0.
results = (
    pd.DataFrame({"Prediction": predictions, "Actual": y_test})
    .reset_index(drop=True)
)
results

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,1
4,0,0
5,0,1
6,0,1
7,0,0
8,0,1
9,0,0


## Calculate the Accuracy Score

In [13]:
# Accuracy of the test-set predictions against the actual test labels.
accuracy_score(y_true=y_test, y_pred=predictions)

0.631578947368421

## Save the trained model

In [14]:
# Persist the fitted classifier to disk with joblib.
dump(classifier, filename='log_model.joblib')

['log_model.joblib']

In [15]:
# Read the saved classifier back from disk.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
model = load(filename='log_model.joblib')

In [16]:
# Take from the 2024 data the same columns that were selected as features
# (index_values), then predict with the reloaded model.
features_2024 = df_2024[index_values]
predictions_2024 = model.predict(features_2024)
predictions_2024

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [17]:
# Attach the model's predictions to the 2024 frame as a 'predictions' column.
df_2024.loc[:, 'predictions'] = predictions_2024
df_2024

Unnamed: 0,Tm,#Bat,BatAge,R/G,G_x,PA,AB,R_x,H_x,2B,...,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,LOB_y,predictions
0,Arizona Diamondbacks,41,28.9,4.64,66,2515,2239,306,552,113,...,87,4.25,1.361,9.2,1.1,3.1,7.4,2.4,446,0
1,Atlanta Braves,42,28.6,4.43,63,2362,2133,279,521,127,...,113,3.61,1.234,8.2,0.9,2.9,8.9,3.02,417,0
2,Baltimore Orioles,40,27.1,5.17,65,2467,2240,336,559,112,...,119,3.5,1.129,7.2,0.8,2.9,8.6,2.96,418,0
3,Boston Red Sox,42,27.3,4.48,66,2524,2262,296,558,113,...,123,3.55,1.152,7.8,0.9,2.6,8.8,3.4,417,0
4,Chicago Cubs,41,28.2,4.3,66,2486,2188,284,500,99,...,106,3.92,1.291,8.3,1.1,3.3,9.0,2.72,458,0
5,Chicago White Sox,46,28.7,3.09,67,2408,2191,207,477,102,...,82,4.65,1.439,8.9,1.4,4.1,8.7,2.14,482,0
6,Cincinnati Reds,36,27.4,4.36,66,2426,2134,288,485,96,...,108,4.01,1.249,8.1,1.0,3.1,8.5,2.73,457,1
7,Cleveland Guardians,35,26.2,5.08,64,2398,2120,325,508,106,...,113,3.76,1.176,7.7,1.0,2.9,9.0,3.09,418,0
8,Colorado Rockies,39,28.2,4.0,66,2476,2250,264,551,116,...,86,4.69,1.519,9.9,1.1,3.8,6.6,1.73,468,0
9,Detroit Tigers,35,27.3,4.22,65,2425,2184,274,502,100,...,106,3.64,1.222,8.2,0.9,2.8,8.6,3.11,421,0
