In [3]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Load the working dataset and split it into per-league feature/target sets.
df = pd.read_csv("working_data.csv")

# Separate descriptors and target.
# The first three columns are kept aside as player descriptors; the last two
# columns are excluded from the feature table.
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
# .copy() makes X an independent frame, so adding a column below neither
# raises SettingWithCopyWarning nor risks writing through to `df`.
X = df.iloc[:, 3:-2].copy()

# Attach the league label to the feature table (it is dropped again from the
# per-league tables below).
X['Lg'] = league
y = cy_young_place

# One boolean mask per league, shared by features and target so the rows of
# x_* and y_* are guaranteed to line up.
is_al = league == 'AL'
is_nl = league == 'NL'

# Create separate datasets
x_al = X[is_al].drop(columns=['Lg'])
x_nl = X[is_nl].drop(columns=['Lg'])

y_al = y[is_al]
y_nl = y[is_nl]


1      11
2      11
3      11
4      11
7      10
       ..
713     0
714     0
715     0
716     0
717     0
Name: Cy_young, Length: 353, dtype: int64


The following is done using `cdist` with Euclidean distances, using all attributes (no attribute selection).

In [None]:
# Min-max scale each league's feature table independently.
# A separate scaler is fitted per league, so every column is rescaled using
# only that league's own minimum and maximum (default feature range [0, 1]).
# MinMaxScaler is imported in the notebook's top import cell.
scaler_nl = MinMaxScaler()
X_nl_scaled = scaler_nl.fit_transform(x_nl)

scaler_al = MinMaxScaler()
X_al_scaled = scaler_al.fit_transform(x_al)

In [8]:
# Pairwise Euclidean distance matrices, one per league (NL first, then AL).
# Each scaled feature matrix is compared against itself, so entry [i, j] is
# the distance between rows i and j of that league's scaled features.
x_nl_dist, x_al_dist = (
    cdist(scaled_features, scaled_features, metric='euclidean')
    for scaled_features in (X_nl_scaled, X_al_scaled)
)

print("NL distance: \n", x_nl_dist)
print("AL distance: \n", x_al_dist)

NL distance: 
 [[0.         0.98695817 0.56775318 ... 2.23652283 1.8199935  2.16903755]
 [0.98695817 0.         1.03209955 ... 2.07810409 1.75828245 2.11737208]
 [0.56775318 1.03209955 0.         ... 1.97678025 1.54167331 1.96272459]
 ...
 [2.23652283 2.07810409 1.97678025 ... 0.         1.1837118  0.65915191]
 [1.8199935  1.75828245 1.54167331 ... 1.1837118  0.         1.08374042]
 [2.16903755 2.11737208 1.96272459 ... 0.65915191 1.08374042 0.        ]]
AL distance: 
 [[0.         1.16213957 1.33382651 ... 1.6825858  1.92145488 2.06929937]
 [1.16213957 0.         1.1494366  ... 1.79586316 2.00766221 2.33398518]
 [1.33382651 1.1494366  0.         ... 2.0490732  2.24144284 2.26030779]
 ...
 [1.6825858  1.79586316 2.0490732  ... 0.         0.55228623 0.96820896]
 [1.92145488 2.00766221 2.24144284 ... 0.55228623 0.         0.90288181]
 [2.06929937 2.33398518 2.26030779 ... 0.96820896 0.90288181 0.        ]]


In [12]:
def predict_nearest_neighbor(dist_matrix, labels):
    """Leave-one-out 1-nearest-neighbour prediction from a distance matrix.

    For every row i, the prediction is the label of the row j != i with the
    smallest distance to i (ties resolve to the lowest j, as with np.argmin).

    Parameters
    ----------
    dist_matrix : array-like of shape (n, n)
        Square matrix of pairwise distances. It is not modified.
    labels : array-like of length n
        Label of each row, in the same positional order as ``dist_matrix``.

    Returns
    -------
    numpy.ndarray of length n
        Predicted label for each row.
    """
    # Work on a float copy so the caller's matrix is never touched.
    distances = np.array(dist_matrix, dtype=float)
    label_values = np.asarray(labels)
    if distances.shape[0] == 0:
        # Nothing to predict; argmin over an empty axis would raise.
        return label_values[:0]
    # A point must not be its own nearest neighbour.
    np.fill_diagonal(distances, np.inf)
    nearest = distances.argmin(axis=1)
    return label_values[nearest]


print("NL Predictions")
#predict NL
nl_predictions = predict_nearest_neighbor(x_nl_dist, y_nl)
for predict_class in nl_predictions:
    print(predict_class)

print("AL predictions")
#predict AL
al_predictions = predict_nearest_neighbor(x_al_dist, y_al)
for predict_class in al_predictions:
    print(predict_class)

NL Predictions
11
0
0
0
5
6
3
6
0
0
0
5
4
4
7
2
11
3
0
2
6
8
10
2
2
1
10
8
2
6
2
3
5
3
0
1
0
0
4
6
0
4
0
2
5
7
1
1
3
0
3
0
0
11
0
0
0
0
0
0
0
0
0
0
0
6
0
5
0
1
0
0
0
0
0
0
3
0
0
0
10
0
0
0
2
0
0
0
8
0
0
0
0
0
0
0
0
0
0
11
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
8
0
0
0
0
0
0
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
AL predictions
6
0
0
0
0
5
0
7
3
6
4
3
0
0
0
9
4
4
8
0
6
8
3
0
0
6
6
7
6
0
3
8
3
0
9
3
6
4
0
2
4
5
1
1
7
0
0
3
0
0
0
0
0
0
0
0
0
0
0
6
0
0
10
0
0
0
0
0
0
0
0
0
0
0
0
0
0
9
0
4
0
0
0
0
7
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
