In [424]:
import pandas as pd
import numpy as np 
import json

In [425]:
df = pd.read_csv("working_data.csv")

# Separate descriptors and target
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
X = df.iloc[:, 3:-1]

# Add league temporarily for sorting
X['Lg'] = league
y = cy_young_place

# Create separate datasets
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

y_al = y[league == 'AL'].to_numpy()
y_nl = y[league == 'NL'].to_numpy()



In [426]:
# Read selected features (from forest importance)
with open('forest_importance_nl.json', 'r') as f:
    selected_features_nl_importance = json.load(f)

with open('forest_importance_al.json', 'r') as f:
    selected_features_al_importance = json.load(f)

# Read selected features (from Chi-squared)
with open('chi2_selected_features_nl.json', 'r') as f:
    selected_features_nl_chi2 = json.load(f)


with open('chi2_selected_features_al.json', 'r') as f:
    selected_features_al_chi2 = json.load(f)

In [427]:
#Random Forest Importance
x_nl_importance = x_nl[selected_features_nl_importance]
x_al_importance = x_al[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2 = x_nl[selected_features_nl_chi2]
x_al_chi2 = x_al[selected_features_al_chi2]

In [428]:
#Set up matrices for x 
def createMatrix(input_df):
    matrix = input_df.to_numpy()
    matrix = np.column_stack([np.ones(len(input_df)), input_df])
    return matrix

x_nl_imp_matrix = createMatrix(x_nl_importance)
x_al_imp_matrix = createMatrix(x_al_importance)
x_nl_chi2_matrix = createMatrix(x_nl_chi2)
x_al_chi2_matrix = createMatrix(x_al_chi2)

In [429]:
y_nl = y_nl.reshape(len(y_nl), 1)
y_al = y_al.reshape(len(y_al), 1)

In [430]:
theta_nl_imp = np.linalg.solve(x_nl_imp_matrix.T@x_nl_imp_matrix, x_nl_imp_matrix.T@y_nl)
theta_al_imp = np.linalg.solve(x_al_imp_matrix.T@x_al_imp_matrix, x_al_imp_matrix.T@y_al)
theta_nl_chi2 = np.linalg.solve(x_nl_chi2_matrix.T@x_nl_chi2_matrix, x_nl_chi2_matrix.T@y_nl)
theta_al_chi2 = np.linalg.solve(x_al_chi2_matrix.T@x_al_chi2_matrix, x_al_chi2_matrix.T@y_al)


In [431]:
y_hat_nl_imp = x_nl_imp_matrix@theta_nl_imp
y_hat_al_imp = x_al_imp_matrix@theta_al_imp
y_hat_nl_chi2 = x_nl_chi2_matrix@theta_nl_chi2
y_hat_al_chi2 = x_al_chi2_matrix@theta_al_chi2


In [432]:
def findSSE(y_hat, y):
    error = y_hat - y
    sse = np.sum(error*error)
    return sse

sse_nl_imp = findSSE(y_hat_nl_imp, y_nl)
sse_nl_chi2 = findSSE(y_hat_nl_chi2, y_nl)
sse_al_imp = findSSE(y_hat_al_imp, y_al)
sse_al_chi2 = findSSE(y_hat_al_chi2, y_al)

In [433]:
#display error
print(f"Importance error: NL =  {sse_nl_imp}, AL = {sse_al_imp}")
print(f"Chi2 error: NL = {sse_nl_chi2}, AL = {sse_al_chi2}")

Importance error: NL =  1207.9600433669286, AL = 943.5261380486745
Chi2 error: NL = 1231.6595723819332, AL = 960.2641131523691


In [434]:
# Prep test data
df_test = pd.read_csv("test_data.csv")

player_descriptors_test = df_test.iloc[:, :4]
league = df_test['Lg']
x_test = df_test.iloc[:, 4:]

player_descriptors_nl = player_descriptors_test[league == 'NL'].reset_index(drop=True)
player_descriptors_al = player_descriptors_test[league == 'AL'].reset_index(drop=True)

# Add league temporarily for sorting
x_test['Lg'] = league

# Create separate datasets
x_al_test = x_test[x_test['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl_test = x_test[x_test['Lg'] == 'NL'].drop(columns=['Lg'])


In [435]:
#Random Forest Importance
x_nl_importance_test = x_nl_test[selected_features_nl_importance]
x_al_importance_test = x_al_test[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2_test = x_nl_test[selected_features_nl_chi2]
x_al_chi2_test = x_al_test[selected_features_al_chi2]

In [436]:
#generate X matrices
nl_imp_test_matrix = createMatrix(x_nl_importance_test)
al_imp_test_matrix = createMatrix(x_al_importance_test)
al_chi2_test_matrix = createMatrix(x_al_chi2_test)
nl_chi2_test_matrix = createMatrix(x_nl_chi2_test)

In [437]:
#Generate prediction matrix
y_pred_nl_imp = nl_imp_test_matrix@theta_nl_imp
y_pred_nl_chi2 = nl_chi2_test_matrix@theta_nl_chi2
y_pred_al_imp = al_imp_test_matrix@theta_al_imp
y_pred_al_chi2 = al_chi2_test_matrix@theta_al_chi2

In [438]:
#winner index of players
winner_idx_nl_imp = np.argmin(y_pred_nl_imp)
winner_idx_nl_chi2 = np.argmin(y_pred_nl_chi2)
winner_idx_al_imp = np.argmin(y_pred_al_imp)
winner_idx_al_chi2 = np.argmin(y_pred_al_chi2)


In [439]:
print(len(y_pred_al_imp), winner_idx_al_chi2, winner_idx_al_imp)
print(len(player_descriptors_al))
print(len(y_pred_nl_imp), winner_idx_nl_chi2, winner_idx_nl_imp)
print(len(player_descriptors_nl))

125 83 83
125
116 60 79
116


In [440]:
print("Predicted NL Cy Young Winner (Importance Model):", player_descriptors_nl.iloc[winner_idx_nl_imp])
print("Predicted NL Cy Young Winner (Chi-Squared Model):", player_descriptors_nl.iloc[winner_idx_nl_chi2])
print("Predicted AL Cy Young Winner (Importance Model):", player_descriptors_al.iloc[winner_idx_al_imp])
print("Predicted AL Cy Young Winner (Chi-Squared Model):", player_descriptors_al.iloc[winner_idx_al_chi2])


Predicted NL Cy Young Winner (Importance Model): Rk             152.0
Player    Kyle Leahy
yr              25.0
Team             STL
Name: 79, dtype: object
Predicted NL Cy Young Winner (Chi-Squared Model): Rk              123.0
Player    Roki Sasaki
yr               25.0
Team              LAD
Name: 60, dtype: object
Predicted AL Cy Young Winner (Importance Model): Rk                 176.0
Player    Justin Sterner
yr                  25.0
Team                 ATH
Name: 83, dtype: object
Predicted AL Cy Young Winner (Chi-Squared Model): Rk                 176.0
Player    Justin Sterner
yr                  25.0
Team                 ATH
Name: 83, dtype: object


Retrain the model using forward selection and ridge regression, Forward will choose the attributes in place of the attribute selector

In [441]:
sse_list_nl_imp = []
sse_list_nl_chi2 = []
sse_list_al_imp = []
sse_list_al_chi2 = []

selected_atts_nl_imp = []
selected_atts_nl_chi2 = []
selected_atts_al_imp = []
selected_atts_al_chi2 = []

def ForwardSelection(selectedAtts, remainingAtts, sse_list, X, y):
    for step in range(len(remainingAtts)):
        best_sse = float('inf')
        best_attribute = None

        for attribute in remainingAtts:
            current_predictors = selectedAtts + [attribute]
            
            X_subset = X.loc[:, current_predictors]
            X_matrix = createMatrix(X_subset)

            theta = np.linalg.solve(X_matrix.T @ X_matrix, X_matrix.T @ y)
            
            y_hat = X_matrix @ theta
            sse = findSSE(y_hat, y)
            
            if sse < best_sse:
                best_sse = sse
                best_attribute = attribute

        selectedAtts.append(best_attribute)
        remainingAtts.remove(best_attribute)
        
        sse_list.append(best_sse)

def results(selectedAtts, sse_list):
    for i in range(len(selectedAtts)):
        print(f"Step {i+1}: Selected {selectedAtts[i]}, SSE = {sse_list[i]:.2f}")


In [442]:
selected_atts_nl_fs = []
sse_nl_forward_sel = []

selected_atts_al_fs = []
sse_al_forward_sel = []

print("Forward Selection Results AL:\n")
ForwardSelection(selected_atts_al_fs, x_al.columns.to_list(), sse_al_forward_sel, x_al, y_al)
results(selected_atts_al_fs, sse_al_forward_sel)

print("Forward Selection Results NL:\n")
ForwardSelection(selected_atts_nl_fs, x_nl.columns.to_list(), sse_nl_forward_sel, x_nl, y_nl)
results(selected_atts_nl_fs, sse_nl_forward_sel)


Forward Selection Results AL:

Step 1: Selected SO, SSE = 1046.02
Step 2: Selected CG, SSE = 1006.63
Step 3: Selected HR, SSE = 970.26
Step 4: Selected SO9, SSE = 944.46
Step 5: Selected HR9, SSE = 899.15
Step 6: Selected HBP, SSE = 887.71
Step 7: Selected SO/BB, SSE = 880.34
Step 8: Selected IBB, SSE = 872.11
Step 9: Selected WP, SSE = 869.16
Step 10: Selected H9, SSE = 866.28
Step 11: Selected FIP, SSE = 864.43
Step 12: Selected BB, SSE = 863.15
Step 13: Selected IP, SSE = 859.63
Step 14: Selected WHIP, SSE = 857.30
Step 15: Selected BB9, SSE = 853.14
Step 16: Selected BF, SSE = 850.30
Step 17: Selected H, SSE = 848.88
Step 18: Selected GS, SSE = 847.75
Step 19: Selected G, SSE = 846.61
Step 20: Selected W, SSE = 845.90
Step 21: Selected W-L%, SSE = 845.26
Step 22: Selected ERA+, SSE = 844.61
Step 23: Selected SV, SSE = 844.07
Step 24: Selected R, SSE = 843.60
Step 25: Selected ERA, SSE = 843.17
Step 26: Selected GF, SSE = 842.89
Step 27: Selected ER, SSE = 842.61
Step 28: Selected L

In [443]:
atts_nl_fs = selected_atts_nl_fs[:10]
atts_al_fs = selected_atts_al_fs[:10]


x_nl_fs = x_nl[selected_atts_nl_fs]
x_al_fs = x_al[selected_atts_al_fs]

x_nl_fs = createMatrix(x_nl_fs)
x_al_fs = createMatrix(x_al_fs)


I will perform ridge regression on all sets

In [444]:
def ridgeRegression(x, y):
    theta = np.linalg.solve(x.T@x + 0.5*np.eye(x.shape[1]), x.T@y)
    return theta

In [445]:
theta_al_fs_ridged = ridgeRegression(x_al_fs, y_al)
theta_al_imp_ridged = ridgeRegression(x_al_imp_matrix, y_al)
theta_al_chi2_ridged = ridgeRegression(x_al_chi2_matrix, y_al)
theta_nl_fs_ridged = ridgeRegression(x_nl_fs, y_nl)
theta_nl_imp_ridged = ridgeRegression(x_nl_imp_matrix, y_nl)
theta_nl_chi2_ridged = ridgeRegression(x_nl_chi2_matrix, y_nl)


In [446]:
y_hat_nl_imp_ridged = x_nl_imp_matrix@theta_nl_imp_ridged
y_hat_al_imp_ridged = x_al_imp_matrix@theta_al_imp_ridged
y_hat_nl_chi2_ridged = x_nl_chi2_matrix@theta_nl_chi2_ridged
y_hat_al_chi2_ridged = x_al_chi2_matrix@theta_al_chi2_ridged
y_hat_nl_fs = x_nl_fs@theta_nl_fs_ridged
y_hat_al_fs = x_al_fs@theta_al_fs_ridged

In [447]:
x_nl_fs_test = x_nl_test[selected_atts_nl_fs]
x_al_fs_test = x_al_test[selected_atts_al_fs]
x_nl_fs_test = createMatrix(x_nl_fs_test)
x_al_fs_test = createMatrix(x_al_fs_test)

In [448]:
y_pred_nl_imp_ridged = nl_imp_test_matrix@theta_nl_imp_ridged
y_pred_nl_chi2_ridged = nl_chi2_test_matrix@theta_nl_chi2_ridged
y_pred_al_imp_ridged = al_imp_test_matrix@theta_al_imp_ridged
y_pred_al_chi2_ridged = al_chi2_test_matrix@theta_al_chi2_ridged
y_pred_nl_fs = x_nl_fs_test@theta_nl_fs_ridged
y_pred_al_fs = x_al_fs_test@theta_al_fs_ridged

In [449]:
winner_idx_nl_imp_ridged = np.argmin(y_pred_nl_imp_ridged)
winner_idx_nl_chi2_ridged = np.argmin(y_pred_nl_chi2_ridged)
winner_idx_al_imp_ridged = np.argmin(y_pred_al_imp_ridged)
winner_idx_al_chi2_ridged = np.argmin(y_pred_al_chi2_ridged)
winner_idx_nl_fs = np.argmin(y_pred_nl_fs)
winner_idx_al_fs = np.argmin(y_pred_al_fs)

In [450]:
print("Predicted Cy Young Winners with Ridge Regression")
print("NL (Importance Model):", player_descriptors_nl.iloc[winner_idx_nl_imp_ridged])
print("NL (Chi-Squared Model):", player_descriptors_nl.iloc[winner_idx_nl_chi2_ridged])
print("NL (Forward Selection):", player_descriptors_nl.iloc[winner_idx_nl_fs])
print("AL (Forward Selection):", player_descriptors_nl.iloc[winner_idx_al_fs])
print("AL (Importance Model):", player_descriptors_al.iloc[winner_idx_al_imp_ridged])
print("AL(Chi-Squared Model):", player_descriptors_al.iloc[winner_idx_al_chi2_ridged])


Predicted Cy Young Winners with Ridge Regression
NL (Importance Model): Rk             152.0
Player    Kyle Leahy
yr              25.0
Team             STL
Name: 79, dtype: object
NL (Chi-Squared Model): Rk              123.0
Player    Roki Sasaki
yr               25.0
Team              LAD
Name: 60, dtype: object
NL (Forward Selection): Rk             152.0
Player    Kyle Leahy
yr              25.0
Team             STL
Name: 79, dtype: object
AL (Forward Selection): Rk               153.0
Player    Jack Dreyer*
yr                25.0
Team               LAD
Name: 80, dtype: object
AL (Importance Model): Rk                 176.0
Player    Justin Sterner
yr                  25.0
Team                 ATH
Name: 83, dtype: object
AL(Chi-Squared Model): Rk                 176.0
Player    Justin Sterner
yr                  25.0
Team                 ATH
Name: 83, dtype: object
