In [159]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from scipy.spatial.distance import cdist
import json

In [160]:
# Load the training table that every model in this notebook is fitted on.
df = pd.read_csv("working_data.csv")

# Separate descriptors and target
player_descriptors = df.iloc[:, :3]   # first three columns
league = df['Lg']
cy_young_place = df['Cy_young']
# Take an explicit copy of the feature columns: the 'Lg' column is added
# below, and assigning into a bare `iloc` slice of `df` triggers
# SettingWithCopyWarning (the write may or may not reach `df`).
X = df.iloc[:, 3:-1].copy()

# Add league temporarily for sorting
X['Lg'] = league
y = cy_young_place

# Create separate datasets
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

# Targets are filtered with the same league masks, so rows stay aligned
# with x_al / x_nl.
y_al = y[league == 'AL']
y_nl = y[league == 'NL']


The following nearest-neighbor baseline uses `cdist` with Euclidean distances on all attributes, without any feature selection.

In [161]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# One independent min-max scaler per league, fitted on that league's
# full feature table and then applied to the same table.
scaler_al = MinMaxScaler()
scaler_nl = MinMaxScaler()

X_al_scaled = scaler_al.fit(x_al).transform(x_al)
X_nl_scaled = scaler_nl.fit(x_nl).transform(x_nl)

In [162]:
# Square matrices of Euclidean distances between every pair of scaled rows
# within a league (the diagonal is each row's distance to itself).
#AL cdist
x_al_dist = cdist(X_al_scaled, X_al_scaled, metric='euclidean')
#NL cdist
x_nl_dist = cdist(X_nl_scaled, X_nl_scaled, metric='euclidean')

for heading, matrix in (
    ("NL distance: \n", x_nl_dist),
    ("AL distance: \n", x_al_dist),
):
    print(heading, matrix)

NL distance: 
 [[0.         0.98695817 0.56775318 ... 2.23652283 1.8199935  2.16903755]
 [0.98695817 0.         1.03209955 ... 2.07810409 1.75828245 2.11737208]
 [0.56775318 1.03209955 0.         ... 1.97678025 1.54167331 1.96272459]
 ...
 [2.23652283 2.07810409 1.97678025 ... 0.         1.1837118  0.65915191]
 [1.8199935  1.75828245 1.54167331 ... 1.1837118  0.         1.08374042]
 [2.16903755 2.11737208 1.96272459 ... 0.65915191 1.08374042 0.        ]]
AL distance: 
 [[0.         1.16213957 1.33382651 ... 1.6825858  1.92145488 2.06929937]
 [1.16213957 0.         1.1494366  ... 1.79586316 2.00766221 2.33398518]
 [1.33382651 1.1494366  0.         ... 2.0490732  2.24144284 2.26030779]
 ...
 [1.6825858  1.79586316 2.0490732  ... 0.         0.55228623 0.96820896]
 [1.92145488 2.00766221 2.24144284 ... 0.55228623 0.         0.90288181]
 [2.06929937 2.33398518 2.26030779 ... 0.96820896 0.90288181 0.        ]]


In [163]:
def _print_nearest_labels(dist_matrix, labels):
    """Print, for each row, the label of its closest *other* row.

    The row's own entry is masked with infinity on a scratch copy so that a
    row can never be chosen as its own nearest neighbour; ``dist_matrix``
    itself is left unchanged.
    """
    for row in range(len(dist_matrix)):
        candidates = dist_matrix[row].copy()
        candidates[row] = np.inf
        print(labels.iloc[int(np.argmin(candidates))])


print("NL Predictions")
#predict NL
_print_nearest_labels(x_nl_dist, y_nl)

print("AL predictions")
#predict AL
_print_nearest_labels(x_al_dist, y_al)

NL Predictions
11
0
0
0
5
6
3
6
0
0
0
5
4
4
7
2
11
3
0
2
6
8
10
2
2
1
10
8
2
6
2
3
5
3
0
1
0
0
4
6
0
4
0
2
5
7
1
1
3
0
3
0
0
11
0
0
0
0
0
0
0
0
0
0
0
6
0
5
0
1
0
0
0
0
0
0
3
0
0
0
10
0
0
0
2
0
0
0
8
0
0
0
0
0
0
0
0
0
0
11
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
8
0
0
0
0
0
0
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
AL predictions
6
0
0
0
0
5
0
7
3
6
4
3
0
0
0
9
4
4
8
0
6
8
3
0
0
6
6
7
6
0
3
8
3
0
9
3
6
4
0
2
4
5
1
1
7
0
0
3
0
0
0
0
0
0
0
0
0
0
0
6
0
0
10
0
0
0
0
0
0
0
0
0
0
0
0
0
0
9
0
4
0
0
0
0
7
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [164]:
def _read_feature_list(path):
    """Parse the JSON file at ``path`` and return its content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Read selected features (from forest importance)
selected_features_nl_importance = _read_feature_list('forest_importance_nl.json')
selected_features_al_importance = _read_feature_list('forest_importance_al.json')

# Read selected features (from Chi-squared)
selected_features_nl_chi2 = _read_feature_list('chi2_selected_features_nl.json')
selected_features_al_chi2 = _read_feature_list('chi2_selected_features_al.json')

In [165]:
# Restrict each league's training features to the columns named in the
# loaded feature lists.

# Chi-squared Features
x_al_chi2 = x_al.loc[:, selected_features_al_chi2]
x_nl_chi2 = x_nl.loc[:, selected_features_nl_chi2]

#Random Forest Importance
x_al_importance = x_al.loc[:, selected_features_al_importance]
x_nl_importance = x_nl.loc[:, selected_features_nl_importance]


After selecting attributes, we will begin to use the k-nearest-neighbors classifier (`KNeighborsClassifier`).

In [166]:
#Scale with MinMax
# One scaler per (league, feature set); each is fitted on, and then applied
# to, its own training table.
scaler_nl_importance = MinMaxScaler().fit(x_nl_importance)
scaler_al_importance = MinMaxScaler().fit(x_al_importance)
scaler_nl_chi2 = MinMaxScaler().fit(x_nl_chi2)
scaler_al_chi2 = MinMaxScaler().fit(x_al_chi2)

X_nl_importance_min_max = scaler_nl_importance.transform(x_nl_importance)
X_al_importance_min_max = scaler_al_importance.transform(x_al_importance)
X_nl_chi2_min_max = scaler_nl_chi2.transform(x_nl_chi2)
X_al_chi2_min_max = scaler_al_chi2.transform(x_al_chi2)

In [167]:
#KNN with min max
k = 10


def _fit_and_self_predict(features, labels, n_neighbors):
    """Fit a distance-weighted Euclidean KNN and predict the rows it was fitted on.

    Returns the fitted classifier together with its predictions for
    ``features``.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean', weights='distance')
    model.fit(features, labels)
    return model, model.predict(features)


#NL importance
knn_nl_importance, y_pred_nl_importance = _fit_and_self_predict(X_nl_importance_min_max, y_nl, k)
print("Importance NL Predictions: \n", y_pred_nl_importance)

#AL Importance
knn_al_importance, y_pred_al_importance = _fit_and_self_predict(X_al_importance_min_max, y_al, k)
print("Importance AL Predictions: \n", y_pred_al_importance)

#NL Chi2
knn_nl_chi2, y_pred_nl_chi2 = _fit_and_self_predict(X_nl_chi2_min_max, y_nl, k)
print("Chi2 NL Predictions: \n", y_pred_nl_chi2)

#AL Chi2
knn_al_chi2, y_pred_al_chi2 = _fit_and_self_predict(X_al_chi2_min_max, y_al, k)
print("Chi2 AL Predictions: \n", y_pred_al_chi2)

Importance NL Predictions: 
 [11 11 11 11 10 10  9  9  8  8  8  8  7  7  7  7  7  7  6  6  6  6  6  5
  5  5  5  5  4  4  4  4  4  3  3  3  3  3  2  2  2  2  2  1  1  1  1  1
 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 

In [168]:
#Model Evaluation
# NOTE: in this cell `k` is the number of cross-validation folds, not the
# KNN neighbour count it held in the previous cell.
k = 15


def _report_cv(label, model, features, targets, folds):
    """Cross-validate ``model`` and print the per-fold and mean scores.

    ``cross_val_score`` is called without a ``scoring`` argument, so the
    estimator's own ``score`` method is used (accuracy for classifiers).
    Returns the array of per-fold scores.
    """
    fold_scores = cross_val_score(model, features, targets, cv=folds)
    print(f"Accuracy scores for each fold {label}: {fold_scores}")
    print(f"Average accuracy for {label}: {np.mean(fold_scores):.4f}")
    return fold_scores


#Importance Models
kf_nl_importance = KFold(n_splits=k, shuffle=True, random_state=42)
scores_nl_importance = _report_cv("NL Importance", knn_nl_importance, X_nl_importance_min_max, y_nl, kf_nl_importance)

kf_al_importance = KFold(n_splits=k, shuffle=True, random_state=42)
# Fixed: the AL model was previously scored with the NL splitter
# (kf_nl_importance), leaving kf_al_importance unused.
scores_al_importance = _report_cv("AL Importance", knn_al_importance, X_al_importance_min_max, y_al, kf_al_importance)

#Chi2 Models
kf_nl_chi2 = KFold(n_splits=k, shuffle=True, random_state=42)
scores_nl_chi2 = _report_cv("NL Chi2", knn_nl_chi2, X_nl_chi2_min_max, y_nl, kf_nl_chi2)

kf_al_chi2 = KFold(n_splits=k, shuffle=True, random_state=42)
scores_al_chi2 = _report_cv("AL Chi2", knn_al_chi2, X_al_chi2_min_max, y_al, kf_al_chi2)

Accuracy scores for each fold NL Importance: [0.875      0.75       0.875      0.875      0.79166667 0.91666667
 0.875      0.875      0.91304348 0.7826087  0.7826087  0.86956522
 0.82608696 0.86956522 0.95652174]
Average accuracy for NL Importance: 0.8556
Accuracy scores for each fold AL Importance: [0.8        0.8        0.96       0.84       0.84       0.91666667
 0.95833333 0.91666667 0.875      0.875      0.83333333 0.83333333
 0.875      0.91666667 0.95833333]
Average accuracy for AL Importance: 0.8799
Accuracy scores for each fold NL Chi2: [0.875      0.75       0.875      0.875      0.79166667 0.91666667
 0.91666667 0.875      0.86956522 0.82608696 0.7826087  0.82608696
 0.82608696 0.86956522 0.95652174]
Average accuracy for NL Chi2: 0.8554
Accuracy scores for each fold AL Chi2: [0.8        0.8        0.96       0.84       0.84       0.91666667
 0.91666667 0.91666667 0.91666667 0.875      0.83333333 0.83333333
 0.875      0.91666667 0.95833333]
Average accuracy for AL Chi2: 0.8

We already have the predicted classes here, but again we will evaluate them using the F1 score.

In [169]:
#Now we can use f1 to show the relation between precision and recall

def _macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    return f1_score(y_true, y_pred, average='macro')


f1_nl_importance = _macro_f1(y_nl, y_pred_nl_importance)
print("F1 Score NL Importance:", f1_nl_importance)

f1_al_importance = _macro_f1(y_al, y_pred_al_importance)
print("F1 Score AL Importance:", f1_al_importance)

f1_nl_chi2 = _macro_f1(y_nl, y_pred_nl_chi2)
print("F1 Score NL Chi2:", f1_nl_chi2)

f1_al_chi2 = _macro_f1(y_al, y_pred_al_chi2)
print("F1 Score AL Chi2:", f1_al_chi2)

F1 Score NL Importance: 1.0
F1 Score AL Importance: 1.0
F1 Score NL Chi2: 1.0
F1 Score AL Chi2: 1.0


Above we made a mistake: we evaluated each model on the same data it was fitted on, so it predicts those rows almost perfectly. Below we will use a train/test split to minimize this risk.

In [170]:
k = 2


def _holdout_knn(features, labels, n_neighbors):
    """Split 80/20 (seed 42), fit a distance-weighted Euclidean KNN on the
    training part and predict the held-out part.

    Returns ``(X_train, X_test, y_train, y_test, model, test_predictions)``.
    NOTE(review): ``features`` were min-max scaled on the full table before
    this split, so the scaler has already seen the held-out rows.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(features, labels, test_size=0.2, random_state=42)
    model = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean', weights='distance')
    model.fit(X_tr, y_tr)
    return X_tr, X_te, y_tr, y_te, model, model.predict(X_te)


# Train-test split for Importance
#NL
(X_train_nl, X_test_nl, y_train_nl, y_test_nl,
 knn_nl_importance, y_pred_nl_importance) = _holdout_knn(X_nl_importance_min_max, y_nl, k)

f1_nl_importance = f1_score(y_test_nl, y_pred_nl_importance, average='macro')
print("F1 Score NL Importance:", f1_nl_importance)

# AL
(X_train_al, X_test_al, y_train_al, y_test_al,
 knn_al_importance, y_pred_al_importance) = _holdout_knn(X_al_importance_min_max, y_al, k)

f1_al_importance = f1_score(y_test_al, y_pred_al_importance, average='macro')
print("F1 Score AL Importance:", f1_al_importance)

# Train-tests split for Chi2
#NL
(X_train_nl_chi2, X_test_nl_chi2, y_train_nl_chi2, y_test_nl_chi2,
 knn_nl_chi2, y_pred_nl_chi2) = _holdout_knn(X_nl_chi2_min_max, y_nl, k)

f1_nl_chi2 = f1_score(y_test_nl_chi2, y_pred_nl_chi2, average='macro')
print("F1 Score NL Chi2:", f1_nl_chi2)

# AL
(X_train_al_chi2, X_test_al_chi2, y_train_al_chi2, y_test_al_chi2,
 knn_al_chi2, y_pred_al_chi2) = _holdout_knn(X_al_chi2_min_max, y_al, k)

f1_al_chi2 = f1_score(y_test_al_chi2, y_pred_al_chi2, average='macro')
print("F1 Score AL Chi2:", f1_al_chi2)


F1 Score NL Importance: 0.1318181818181818
F1 Score AL Importance: 0.1334776334776335
F1 Score NL Chi2: 0.08715251690458302
F1 Score AL Chi2: 0.09606299212598425


We did more research on how to correctly classify a dataset with highly imbalanced classes. KNN relies on neighbors, but all MLB pitchers are at a professional level and there is only 1 Cy Young winner. However, most of that research dives into models we haven't worked with, so we will just mention this in our write-up.

In [171]:
# Prep test data
df_test = pd.read_csv("test_data.csv")

player_descriptors_test = df_test.iloc[:, :4]   # first four columns
league = df_test['Lg']   # NOTE: rebinds the `league` name used for the training data
# Explicit copy: 'Lg' is added to this frame below, and assigning into a
# bare `iloc` slice of `df_test` triggers SettingWithCopyWarning.
x_test = df_test.iloc[:, 3:].copy()

# Descriptor rows per league, re-indexed from 0 so they can be looked up by
# position.
player_descriptors_nl = player_descriptors_test[league == 'NL'].reset_index(drop=True)
player_descriptors_al = player_descriptors_test[league == 'AL'].reset_index(drop=True)

# Add league temporarily for sorting
x_test['Lg'] = league

# Create separate datasets
x_al_test = x_test[x_test['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl_test = x_test[x_test['Lg'] == 'NL'].drop(columns=['Lg'])

In [172]:
# Keep only the selected feature columns of each league's test table.

# Chi-squared Features
x_al_chi2_test = x_al_test.loc[:, selected_features_al_chi2]
x_nl_chi2_test = x_nl_test.loc[:, selected_features_nl_chi2]

#Random Forest Importance
x_al_importance_test = x_al_test.loc[:, selected_features_al_importance]
x_nl_importance_test = x_nl_test.loc[:, selected_features_nl_importance]

print(x_al_importance_test)

     WAR    SO   FIP   ERA+    IP   ERA     BF    W   WHIP   GS
0    1.9  47.0  2.91  381.0  51.2  1.05  208.0  6.0  0.910  8.0
1    1.2  37.0  4.39  144.0  50.2  2.84  200.0  3.0  1.105  8.0
2    1.6  38.0  3.71  112.0  49.2  3.62  208.0  1.0  1.208  8.0
3    1.4  56.0  2.84  205.0  49.0  2.02  196.0  3.0  1.061  8.0
4    1.5  53.0  2.22  188.0  48.2  2.03  186.0  3.0  0.801  8.0
..   ...   ...   ...    ...   ...   ...    ...  ...    ...  ...
120  0.6  23.0  1.92  237.0  16.0  1.69   70.0  0.0  1.250  0.0
121  0.8  22.0  2.11  237.0  16.0  1.69   63.0  1.0  0.875  0.0
122  0.4  16.0  2.23  173.0  16.0  2.25   63.0  1.0  1.125  0.0
123  0.5  18.0  2.98  178.0  16.0  2.25   63.0  2.0  1.063  0.0
124  0.4  18.0  2.42  150.0  16.0  2.81   66.0  1.0  1.063  0.0

[125 rows x 10 columns]


In [173]:
# Re-create each scaler from the *training* feature table, then apply it to
# the matching test table so both are on the training scale.
scaler_nl_importance = MinMaxScaler().fit(x_nl_importance)
scaler_al_importance = MinMaxScaler().fit(x_al_importance)
scaler_nl_chi2 = MinMaxScaler().fit(x_nl_chi2)
scaler_al_chi2 = MinMaxScaler().fit(x_al_chi2)

x_nl_imp_scaled_test = scaler_nl_importance.transform(x_nl_importance_test)
x_al_imp_scaled_test = scaler_al_importance.transform(x_al_importance_test)
x_nl_chi2_scaled_test = scaler_nl_chi2.transform(x_nl_chi2_test)
x_al_chi2_scaled_test = scaler_al_chi2.transform(x_al_chi2_test)


In [174]:
#Impute to handle NaN data
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

# Impute all four test sets.
# The fill values are the column means of the scaled *training* matrices,
# not of the test matrices themselves: learning the statistics from the data
# being predicted leaks information from it, and an all-NaN test column
# would be dropped by the imputer, breaking the feature count the KNN
# models expect. The same imputer object is simply re-fitted for each
# (league, feature set) pair.
x_nl_imp_scaled_test = imputer.fit(X_nl_importance_min_max).transform(x_nl_imp_scaled_test)
x_al_imp_scaled_test = imputer.fit(X_al_importance_min_max).transform(x_al_imp_scaled_test)
x_nl_chi2_scaled_test = imputer.fit(X_nl_chi2_min_max).transform(x_nl_chi2_scaled_test)
x_al_chi2_scaled_test = imputer.fit(X_al_chi2_min_max).transform(x_al_chi2_scaled_test)


In [175]:
#predict test set
# Each fitted classifier is paired with the imputed, scaled test matrix
# built from the same league and feature set.
y_pred_nl_importance, y_pred_al_importance, y_pred_nl_chi2, y_pred_al_chi2 = [
    model.predict(rows)
    for model, rows in (
        (knn_nl_importance, x_nl_imp_scaled_test),
        (knn_al_importance, x_al_imp_scaled_test),
        (knn_nl_chi2, x_nl_chi2_scaled_test),
        (knn_al_chi2, x_al_chi2_scaled_test),
    )
]


In [176]:
# Distances from every test row to its three closest training rows; only the
# distance array (first element of the kneighbors result) is kept.
distances_imp_nl = knn_nl_importance.kneighbors(x_nl_imp_scaled_test, n_neighbors=3)[0]
distances_imp_al = knn_al_importance.kneighbors(x_al_imp_scaled_test, n_neighbors=3)[0]
distances_chi2_nl = knn_nl_chi2.kneighbors(x_nl_chi2_scaled_test, n_neighbors=3)[0]
distances_chi2_al = knn_al_chi2.kneighbors(x_al_chi2_scaled_test, n_neighbors=3)[0]

def select_winner(winner_mask, distances, descriptors):
    """Pick one row of ``descriptors`` as the predicted winner.

    Parameters
    ----------
    winner_mask : array-like
        One entry per row; every row with a non-zero (truthy) entry is a
        candidate. NOTE(review): callers pass raw class predictions, so any
        non-zero class counts as a candidate - confirm that is intended.
    distances : array-like
        Per-row neighbour distances. For a 2-D array (one row per sample, as
        returned by ``kneighbors``) only column 0 is used; a 1-D array is
        used as-is.
    descriptors : pandas.DataFrame
        Row descriptors, positionally aligned with ``winner_mask``.

    Returns
    -------
    pandas.Series
        The descriptor row of the candidate with the smallest distance, or of
        the overall smallest-distance row when there is no candidate.
    """
    distances = np.asarray(distances)
    nearest = distances[:, 0] if distances.ndim > 1 else distances
    candidates = np.flatnonzero(np.asarray(winner_mask))
    if candidates.size:
        best_index = candidates[np.argmin(nearest[candidates])]
    else:
        # Compare rows by their nearest distance only. An argmin over the
        # whole 2-D matrix would return a flattened index, which is not a
        # valid row position.
        best_index = np.argmin(nearest)
    return descriptors.iloc[int(best_index)]

# Chi2
al_winner_chi2 = select_winner(y_pred_al_chi2, distances_chi2_al, player_descriptors_al)
nl_winner_chi2 = select_winner(y_pred_nl_chi2, distances_chi2_nl, player_descriptors_nl)

# Importance
al_winner_imp = select_winner(y_pred_al_importance, distances_imp_al, player_descriptors_al)
nl_winner_imp = select_winner(y_pred_nl_importance, distances_imp_nl, player_descriptors_nl)

# Print each headline followed by the chosen descriptor row:
# Importance output first, then Chi2 output.
for headline, winner in (
    ("🏆 Predicted NL Cy Young (Importance features):", nl_winner_imp),
    ("\n🏆 Predicted AL Cy Young (Importance features):", al_winner_imp),
    ("🏆 Predicted NL Cy Young (Chi2 features):", nl_winner_chi2),
    ("\n🏆 Predicted AL Cy Young (Chi2 features):", al_winner_chi2),
):
    print(headline)
    print(winner)


🏆 Predicted NL Cy Young (Importance features):
Rk                224.0
Player    Robert Suarez
yr                 25.0
Team                SDP
Name: 115, dtype: object

🏆 Predicted AL Cy Young (Importance features):
Rk                  41.0
Player    Yusei Kikuchi*
yr                  25.0
Team                 LAA
Name: 21, dtype: object
🏆 Predicted NL Cy Young (Chi2 features):
Rk                224.0
Player    Robert Suarez
yr                 25.0
Team                SDP
Name: 115, dtype: object

🏆 Predicted AL Cy Young (Chi2 features):
Rk               10.0
Player    Kris Bubic*
yr               25.0
Team              KCR
Name: 6, dtype: object


Add a testing dataset where we know the classifications

In [177]:
#Prep 2018 data
df_2018 = pd.read_csv("data_2018.csv")

# Separate descriptors and target
player_descriptors_2018 = df_2018.iloc[:, :3]   # first three columns
league = df_2018['Lg']   # NOTE: rebinds the `league` name used by earlier cells
cy_young_place = df_2018['Awards']
# Explicit copy: 'Lg' is added to this frame below, and assigning into a
# bare `iloc` slice of `df_2018` triggers SettingWithCopyWarning.
X = df_2018.iloc[:, 3:-1].copy()

# Descriptor rows per league, re-indexed from 0 so they can be looked up by
# position.
player_descriptors_nl_2018 = player_descriptors_2018[league == 'NL'].reset_index(drop=True)
player_descriptors_al_2018 = player_descriptors_2018[league == 'AL'].reset_index(drop=True)

# Add league temporarily for sorting
X['Lg'] = league
y = cy_young_place

# Create separate datasets
x_al_2018 = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl_2018 = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

y_al_2018 = y[league == 'AL']
y_nl_2018 = y[league == 'NL']


In [178]:
# Keep only the selected feature columns of each league's 2018 table.

# Chi-squared Features
x_al_chi2_2018 = x_al_2018.loc[:, selected_features_al_chi2]
x_nl_chi2_2018 = x_nl_2018.loc[:, selected_features_nl_chi2]

#Random Forest Importance
x_al_importance_2018 = x_al_2018.loc[:, selected_features_al_importance]
x_nl_importance_2018 = x_nl_2018.loc[:, selected_features_nl_importance]


In [179]:
# Re-create each scaler from the *training* feature table, then apply it to
# the matching 2018 table so both are on the training scale.
scaler_nl_importance = MinMaxScaler().fit(x_nl_importance)
scaler_al_importance = MinMaxScaler().fit(x_al_importance)
scaler_nl_chi2 = MinMaxScaler().fit(x_nl_chi2)
scaler_al_chi2 = MinMaxScaler().fit(x_al_chi2)

x_nl_imp_scaled_2018 = scaler_nl_importance.transform(x_nl_importance_2018)
x_al_imp_scaled_2018 = scaler_al_importance.transform(x_al_importance_2018)
x_nl_chi2_scaled_2018 = scaler_nl_chi2.transform(x_nl_chi2_2018)
x_al_chi2_scaled_2018 = scaler_al_chi2.transform(x_al_chi2_2018)


In [180]:
imputer = SimpleImputer(strategy='mean')

# Impute all four 2018 sets.
# The fill values are the column means of the scaled *training* matrices,
# not of the 2018 matrices themselves: learning the statistics from the data
# being evaluated leaks information from it, and an all-NaN column would be
# dropped by the imputer, breaking the feature count the KNN models expect.
# The same imputer object is simply re-fitted for each (league, feature set)
# pair.
x_nl_imp_scaled_2018 = imputer.fit(X_nl_importance_min_max).transform(x_nl_imp_scaled_2018)
x_al_imp_scaled_2018 = imputer.fit(X_al_importance_min_max).transform(x_al_imp_scaled_2018)
x_nl_chi2_scaled_2018 = imputer.fit(X_nl_chi2_min_max).transform(x_nl_chi2_scaled_2018)
x_al_chi2_scaled_2018 = imputer.fit(X_al_chi2_min_max).transform(x_al_chi2_scaled_2018)

In [181]:
# Predict the 2018 rows: each fitted classifier is paired with the imputed,
# scaled 2018 matrix built from the same league and feature set.
(
    y_pred_nl_importance_2018,
    y_pred_al_importance_2018,
    y_pred_nl_chi2_2018,
    y_pred_al_chi2_2018,
) = [
    model.predict(rows)
    for model, rows in (
        (knn_nl_importance, x_nl_imp_scaled_2018),
        (knn_al_importance, x_al_imp_scaled_2018),
        (knn_nl_chi2, x_nl_chi2_scaled_2018),
        (knn_al_chi2, x_al_chi2_scaled_2018),
    )
]

In [182]:
# Macro-averaged F1 of the AL Chi2 model's predictions against the known
# 2018 AL labels.
f1_al_chi2_2018 = f1_score(
    y_true=y_al_2018,
    y_pred=y_pred_al_chi2_2018,
    average='macro',
)
print("F1 Score AL Chi2:", f1_al_chi2_2018)


F1 Score AL Chi2: 0.20596736596736598


In [183]:
# Distances from every 2018 row to its three closest training rows; only the
# distance array (first element of the kneighbors result) is kept.
distances_imp_nl = knn_nl_importance.kneighbors(x_nl_imp_scaled_2018, n_neighbors=3)[0]
distances_imp_al = knn_al_importance.kneighbors(x_al_imp_scaled_2018, n_neighbors=3)[0]
distances_chi2_nl = knn_nl_chi2.kneighbors(x_nl_chi2_scaled_2018, n_neighbors=3)[0]
distances_chi2_al = knn_al_chi2.kneighbors(x_al_chi2_scaled_2018, n_neighbors=3)[0]

def select_winner(winner_mask, distances, descriptors):
    """Pick one row of ``descriptors`` as the predicted winner.

    Parameters
    ----------
    winner_mask : array-like
        One entry per row; every row with a non-zero (truthy) entry is a
        candidate. NOTE(review): callers pass raw class predictions, so any
        non-zero class counts as a candidate - confirm that is intended.
    distances : array-like
        Per-row neighbour distances. For a 2-D array (one row per sample, as
        returned by ``kneighbors``) only column 0 is used; a 1-D array is
        used as-is.
    descriptors : pandas.DataFrame
        Row descriptors, positionally aligned with ``winner_mask``.

    Returns
    -------
    pandas.Series
        The descriptor row of the candidate with the smallest distance, or of
        the overall smallest-distance row when there is no candidate.
    """
    distances = np.asarray(distances)
    nearest = distances[:, 0] if distances.ndim > 1 else distances
    candidates = np.flatnonzero(np.asarray(winner_mask))
    if candidates.size:
        best_index = candidates[np.argmin(nearest[candidates])]
    else:
        # Compare rows by their nearest distance only. An argmin over the
        # whole 2-D matrix would return a flattened index, which is not a
        # valid row position.
        best_index = np.argmin(nearest)
    return descriptors.iloc[int(best_index)]

# Chi2
al_winner_chi2 = select_winner(y_pred_al_chi2_2018, distances_chi2_al, player_descriptors_al_2018)
nl_winner_chi2 = select_winner(y_pred_nl_chi2_2018, distances_chi2_nl, player_descriptors_nl_2018)

# Importance
al_winner_imp = select_winner(y_pred_al_importance_2018, distances_imp_al, player_descriptors_al_2018)
nl_winner_imp = select_winner(y_pred_nl_importance_2018, distances_imp_nl, player_descriptors_nl_2018)

# Print each headline followed by the chosen descriptor row:
# Importance output first, then Chi2 output.
for headline, winner in (
    ("🏆 Predicted NL Cy Young (Importance features):", nl_winner_imp),
    ("\n🏆 Predicted AL Cy Young (Importance features):", al_winner_imp),
    ("🏆 Predicted NL Cy Young (Chi2 features):", nl_winner_chi2),
    ("\n🏆 Predicted AL Cy Young (Chi2 features):", al_winner_chi2),
):
    print(headline)
    print(winner)


🏆 Predicted NL Cy Young (Importance features):
Rk                     228
Player    Mike Foltynewicz
yr                      18
Name: 40, dtype: object

🏆 Predicted AL Cy Young (Importance features):
Rk                   492
Player    Charlie Morton
yr                    18
Name: 148, dtype: object
🏆 Predicted NL Cy Young (Chi2 features):
Rk                   434
Player    Germán Márquez
yr                    18
Name: 134, dtype: object

🏆 Predicted AL Cy Young (Chi2 features):
Rk                   492
Player    Charlie Morton
yr                    18
Name: 148, dtype: object
