In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.spatial.distance import cdist
import json

In [2]:
# Load the working dataset. The positional slices below define the layout
# this cell relies on: the first three columns are treated as descriptors
# and the last two columns are excluded from the feature matrix.
df = pd.read_csv("working_data.csv")

# Separate descriptors and target
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
# Take an explicit copy of the feature slice: adding a column to a bare
# slice of `df` triggers SettingWithCopyWarning and makes it ambiguous
# whether `df` itself is being modified.
X = df.iloc[:, 3:-2].copy()

# Add league temporarily so the rows can be split per league below
X['Lg'] = league
y = cy_young_place

# Create separate per-league feature sets (the helper 'Lg' column is dropped again)
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

# Matching per-league targets, selected with the same league masks
y_al = y[league == 'AL']
y_nl = y[league == 'NL']


The following is done using `cdist` with Euclidean distances on all attributes, without any feature selection.

In [3]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# One min-max scaler per league, each fitted only on that league's rows,
# so the two leagues are rescaled independently of each other.
scaler_nl, scaler_al = MinMaxScaler(), MinMaxScaler()

X_nl_scaled = scaler_nl.fit_transform(x_nl)
X_al_scaled = scaler_al.fit_transform(x_al)

In [4]:
# Square matrices of pairwise Euclidean distances between the scaled rows
# of each league (NL first, then AL); entry [i, j] is the distance between
# rows i and j of the same league.
x_nl_dist, x_al_dist = (
    cdist(scaled_rows, scaled_rows, metric='euclidean')
    for scaled_rows in (X_nl_scaled, X_al_scaled)
)

print("NL distance: \n", x_nl_dist)
print("AL distance: \n", x_al_dist)

NL distance: 
 [[0.         0.98695817 0.56775318 ... 2.23652283 1.8199935  2.16903755]
 [0.98695817 0.         1.03209955 ... 2.07810409 1.75828245 2.11737208]
 [0.56775318 1.03209955 0.         ... 1.97678025 1.54167331 1.96272459]
 ...
 [2.23652283 2.07810409 1.97678025 ... 0.         1.1837118  0.65915191]
 [1.8199935  1.75828245 1.54167331 ... 1.1837118  0.         1.08374042]
 [2.16903755 2.11737208 1.96272459 ... 0.65915191 1.08374042 0.        ]]
AL distance: 
 [[0.         1.16213957 1.33382651 ... 1.6825858  1.92145488 2.06929937]
 [1.16213957 0.         1.1494366  ... 1.79586316 2.00766221 2.33398518]
 [1.33382651 1.1494366  0.         ... 2.0490732  2.24144284 2.26030779]
 ...
 [1.6825858  1.79586316 2.0490732  ... 0.         0.55228623 0.96820896]
 [1.92145488 2.00766221 2.24144284 ... 0.55228623 0.         0.90288181]
 [2.06929937 2.33398518 2.26030779 ... 0.96820896 0.90288181 0.        ]]


In [5]:
def _nearest_other_labels(dist, labels):
    """Return, for every row i, the label of the closest row j != i.

    Parameters
    ----------
    dist : square array of pairwise distances (row i vs. row j).
    labels : pandas Series whose positional order matches the rows of `dist`.

    Returns
    -------
    list with one predicted label per row of `dist`.

    The caller's matrix is never modified: the diagonal is masked on a copy,
    so an interrupted run cannot leave `inf` values behind in `dist`.
    """
    masked = np.array(dist, dtype=float, copy=True)
    # A row is always at distance 0 from itself; exclude it from the search.
    np.fill_diagonal(masked, np.inf)
    nearest = masked.argmin(axis=1)
    return [labels.iloc[j] for j in nearest]


print("NL Predictions")
# predict NL: label of each row's nearest *other* NL row
for predicted_label in _nearest_other_labels(x_nl_dist, y_nl):
    print(predicted_label)

print("AL predictions")
# predict AL: label of each row's nearest *other* AL row
for predicted_label in _nearest_other_labels(x_al_dist, y_al):
    print(predicted_label)

NL Predictions
11
0
0
0
5
6
3
6
0
0
0
5
4
4
7
2
11
3
0
2
6
8
10
2
2
1
10
8
2
6
2
3
5
3
0
1
0
0
4
6
0
4
0
2
5
7
1
1
3
0
3
0
0
11
0
0
0
0
0
0
0
0
0
0
0
6
0
5
0
1
0
0
0
0
0
0
3
0
0
0
10
0
0
0
2
0
0
0
8
0
0
0
0
0
0
0
0
0
0
11
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
8
0
0
0
0
0
0
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
AL predictions
6
0
0
0
0
5
0
7
3
6
4
3
0
0
0
9
4
4
8
0
6
8
3
0
0
6
6
7
6
0
3
8
3
0
9
3
6
4
0
2
4
5
1
1
7
0
0
3
0
0
0
0
0
0
0
0
0
0
0
6
0
0
10
0
0
0
0
0
0
0
0
0
0
0
0
0
0
9
0
4
0
0
0
0
7
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [6]:
# Read selected features (from forest importance)
def _read_json(path):
    """Open `path` for reading and return the parsed JSON document."""
    with open(path, 'r') as f:
        return json.load(f)


# Read selected features (from forest importance)
# NOTE(review): presumably lists of column names, since they are later used
# to index columns of x_nl / x_al - confirm against the files.
selected_features_nl_importance = _read_json('forest_importance_nl.json')
selected_features_al_importance = _read_json('forest_importance_al.json')

# Read selected features (from Chi-squared)
selected_features_nl_chi2 = _read_json('chi2_selected_features_nl.json')
selected_features_al_chi2 = _read_json('chi2_selected_features_al.json')

In [7]:
#Random Forest Importance
# Restrict each league's feature frame to the columns named in the
# random-forest-importance selections loaded from JSON.
x_nl_importance, x_al_importance = (
    x_nl[selected_features_nl_importance],
    x_al[selected_features_al_importance],
)

# Same restriction, using the chi-squared selections instead.
x_nl_chi2, x_al_chi2 = (
    x_nl[selected_features_nl_chi2],
    x_al[selected_features_al_chi2],
)


After selecting attributes, we switch to scikit-learn's k-nearest-neighbors classifier (`KNeighborsClassifier`).

In [8]:
#Scale with MinMax
# Min-max scaling of the selected-feature frames. Every (league, selection
# method) combination gets its own scaler, fitted only on that subset.
scaler_nl_importance, scaler_al_importance = MinMaxScaler(), MinMaxScaler()
scaler_nl_chi2, scaler_al_chi2 = MinMaxScaler(), MinMaxScaler()

# Random-forest-importance selections
X_nl_importance_min_max = scaler_nl_importance.fit_transform(x_nl_importance)
X_al_importance_min_max = scaler_al_importance.fit_transform(x_al_importance)

# Chi-squared selections
X_nl_chi2_min_max = scaler_nl_chi2.fit_transform(x_nl_chi2)
X_al_chi2_min_max = scaler_al_chi2.fit_transform(x_al_chi2)

In [9]:
#KNN with min max
#NL importance
# 1-nearest-neighbour classifiers (Euclidean distance) on the min-max scaled,
# feature-selected data: one model per league and per selection method.
# NOTE: each model predicts the same rows it was fitted on. With
# n_neighbors=1 a row's nearest neighbour is normally itself, so these
# predictions mostly echo the training labels; they are not an estimate of
# generalisation performance.

#NL importance
knn_nl_importance = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn_nl_importance.fit(X_nl_importance_min_max, y_nl)

y_pred_nl_importance = knn_nl_importance.predict(X_nl_importance_min_max)
print("Importance NL Predictions: \n", y_pred_nl_importance)

#AL Importance
knn_al_importance = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn_al_importance.fit(X_al_importance_min_max, y_al)

# Fixed: predict with the AL model. The original called knn_nl_importance
# here, i.e. the NL model (fitted on NL rows and NL-selected features) was
# applied to AL data.
y_pred_al_importance = knn_al_importance.predict(X_al_importance_min_max)
print("Importance AL Predictions: \n", y_pred_al_importance)

#NL Chi2
knn_nl_chi2 = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn_nl_chi2.fit(X_nl_chi2_min_max, y_nl)

y_pred_nl_chi2 = knn_nl_chi2.predict(X_nl_chi2_min_max)
print("Chi2 NL Predictions: \n", y_pred_nl_chi2)

#AL Chi2
knn_al_chi2 = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn_al_chi2.fit(X_al_chi2_min_max, y_al)

y_pred_al_chi2 = knn_al_chi2.predict(X_al_chi2_min_max)
print("Chi2 AL Predictions: \n", y_pred_al_chi2)

Importance NL Predictions: 
 [11 11 11 11 10 10  9  9  8  8  8  8  7  7  7  7  7  7  6  6  6  6  6  5
  5  5  5  5  4  4  4  4  4  3  3  3  3  3  2  2  2  2  2  1  1  1  1  1
  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 

In [13]:
#Model Evaluation
#Model Evaluation
# k-fold cross-validation of the four 1-NN models. Every model gets its own
# shuffled KFold splitter with a fixed random_state so the folds are
# reproducible.
# NOTE(review): the X_*_min_max matrices were scaled with scalers fitted on
# the full per-league data before splitting, so each fold's held-out rows
# influenced the scaling; the scores may be slightly optimistic. Wrapping the
# scaler and classifier in a Pipeline would avoid that.
k = 15  # number of folds

#Importance Models
kf_nl_importance = KFold(n_splits=k, shuffle=True, random_state=42)
scores_nl_importance = cross_val_score(knn_nl_importance, X_nl_importance_min_max , y_nl, cv=kf_nl_importance)
print(f"Accuracy scores for each fold NL Importance: {scores_nl_importance}")
print(f"Average accuracy for NL Importance: {np.mean(scores_nl_importance):.4f}")

kf_al_importance = KFold(n_splits=k, shuffle=True, random_state=42)
# Fixed: use the AL splitter. The original passed kf_nl_importance here,
# leaving kf_al_importance unused (the two are configured identically, so the
# scores are unaffected, but the mismatch would bite if either were changed).
scores_al_importance = cross_val_score(knn_al_importance, X_al_importance_min_max , y_al, cv=kf_al_importance)
print(f"Accuracy scores for each fold AL Importance: {scores_al_importance}")
print(f"Average accuracy for AL Importance: {np.mean(scores_al_importance):.4f}")

#Chi2 Models
kf_nl_chi2 = KFold(n_splits=k, shuffle=True, random_state=42)
scores_nl_chi2 = cross_val_score(knn_nl_chi2, X_nl_chi2_min_max , y_nl, cv=kf_nl_chi2)
print(f"Accuracy scores for each fold NL Chi2: {scores_nl_chi2}")
print(f"Average accuracy for NL Chi2: {np.mean(scores_nl_chi2):.4f}")

kf_al_chi2 = KFold(n_splits=k, shuffle=True, random_state=42)
scores_al_chi2 = cross_val_score(knn_al_chi2, X_al_chi2_min_max , y_al, cv=kf_al_chi2)
print(f"Accuracy scores for each fold AL Chi2: {scores_al_chi2}")
print(f"Average accuracy for AL Chi2: {np.mean(scores_al_chi2):.4f}")

Accuracy scores for each fold NL Importance: [0.875      0.79166667 0.875      0.91666667 0.83333333 0.91666667
 0.875      0.875      0.91304348 0.7826087  0.73913043 0.86956522
 0.7826087  0.82608696 0.95652174]
Average accuracy for NL Importance: 0.8552
Accuracy scores for each fold AL Importance: [0.8        0.8        0.96       0.84       0.84       0.875
 0.95833333 0.91666667 0.875      0.83333333 0.79166667 0.875
 0.83333333 0.91666667 0.875     ]
Average accuracy for AL Importance: 0.8660
Accuracy scores for each fold NL Chi2: [0.83333333 0.79166667 0.875      0.875      0.79166667 0.91666667
 0.875      0.875      0.82608696 0.7826087  0.69565217 0.82608696
 0.7826087  0.82608696 0.86956522]
Average accuracy for NL Chi2: 0.8295
Accuracy scores for each fold AL Chi2: [0.76       0.8        0.92       0.84       0.8        0.875
 0.875      0.875      0.875      0.875      0.79166667 0.83333333
 0.875      0.91666667 0.95833333]
Average accuracy for AL Chi2: 0.8580
