In [103]:
import numpy as np
import pandas as pd 
import json

In [168]:
df = pd.read_csv("working_data.csv")

# Separate descriptors and target
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
X = df.iloc[:, 3:-2]

# Add league temporarily for sorting
X['Lg'] = league
y = cy_young_place

# Create separate datasets
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

y_al = y[league == 'AL']
y_nl = y[league == 'NL']



In [131]:
# Read selected features (from forest importance)
with open('forest_importance_nl.json', 'r') as f:
    selected_features_nl_importance = json.load(f)

with open('forest_importance_al.json', 'r') as f:
    selected_features_al_importance = json.load(f)

# Read selected features (from Chi-squared)
with open('chi2_selected_features_nl.json', 'r') as f:
    selected_features_nl_chi2 = json.load(f)


with open('chi2_selected_features_al.json', 'r') as f:
    selected_features_al_chi2 = json.load(f)

# Print selected features
print("Selected Features for NL:", selected_features_nl_importance)
print("Selected Features for AL:", selected_features_al_importance)
print("Selected Features for NL:", selected_features_nl_chi2)
print("Selected Features for AL:", selected_features_al_chi2)

Selected Features for NL: ['WAR', 'SO', 'ERA+', 'FIP', 'WHIP', 'IP', 'ERA', 'W', 'BF', 'SO/BB']
Selected Features for AL: ['WAR', 'SO', 'IP', 'FIP', 'BF', 'ERA+', 'ERA', 'W', 'WHIP', 'GS']
Selected Features for NL: ['WAR', 'W', 'ERA', 'CG', 'SHO', 'IP', 'SO', 'BF', 'ERA+', 'SO/BB']
Selected Features for AL: ['WAR', 'W', 'CG', 'SHO', 'IP', 'SO', 'BK', 'BF', 'ERA+', 'SO/BB']


In [132]:
#Random Forest Importance
x_nl_importance = x_nl[selected_features_nl_importance]
x_al_importance = x_al[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2 = x_nl[selected_features_nl_chi2]
x_al_chi2 = x_al[selected_features_al_chi2]

In [149]:
# generate based on importance
nl_importance = x_nl_importance.copy()
nl_importance['Cy_young'] = y_nl

for attribute in x_nl_importance:  
    breakpoints = np.percentile(x_nl_importance[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_nl_importance[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    nl_importance[attribute] = pd.cut(x_nl_importance[attribute], bins=breakpoints, labels=labels, include_lowest=True)
nl_importance = nl_importance.to_numpy()

In [None]:

Q_nl_importance = np.zeros((11,11,10))

for attribute in range(11):
  for c in range(11):
    for v in range(10):
      Q_nl_importance[attribute, c ,v] = np.sum((nl_importance[:, attribute] == v) & (nl_importance[:, 10] == c)) / np.sum(nl_importance[:, 10] == c)
      
      

In [151]:
P_nl_importance = np.zeros(11)

for c in range(11):
  P_nl_importance[c] = np.sum(nl_importance[:,10] == c) / len(nl_importance[:, 10])


In [None]:
c_nl_importance = np.zeros((10,10,10,10,10))

for v1 in range(10):
  for v2 in range(10):
    for v3 in range(10):
      for v4 in range(10):
        for v5 in range(10):
          c_nl_importance[v1, v2, v3, v4, v5] = np.argmax(Q_nl_importance[0, :, v1] * 
                                                          Q_nl_importance[1, :, v2] * 
                                                          Q_nl_importance[2, :, v3] * 
                                                          Q_nl_importance[3, :, v4] * 
                                                          Q_nl_importance[4, :, v5] * P_nl_importance)


In [160]:
#Overall NL based on forest importance
nl_importance_result = []
nl_importance_correct = 0
for i in range(len(nl_importance)):
  true_class = nl_importance[i, 10]
  estimated_class = c_nl_importance[nl_importance[i,0], nl_importance[i, 1], nl_importance[i, 2], nl_importance[i, 3], nl_importance[i,4]]
  #print("true class is:", true_class, "estimated_class:", estimated_class)
  nl_importance_result.append(estimated_class)
  if estimated_class == true_class:
        nl_importance_correct += 1
        
accuracy = nl_importance_correct / len(nl_importance)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.9065155807365439


In [156]:
# generate based on importance
al_importance = x_al_importance.copy()
al_importance['Cy_young'] = y_al

for attribute in x_al_importance:  
    breakpoints = np.percentile(x_al_importance[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_al_importance[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    al_importance[attribute] = pd.cut(x_al_importance[attribute], bins=breakpoints, labels=labels, include_lowest=True)
al_importance = al_importance.to_numpy()

In [157]:
Q_al_importance = np.zeros((11,11,10))

for attribute in range(11):
  for c in range(11):
    for v in range(10):
      Q_al_importance[attribute, c ,v] = np.sum((al_importance[:, attribute] == v) & (al_importance[:, 10] == c)) / np.sum(al_importance[:, 10] == c)


In [158]:
P_al_importance = np.zeros(11)

for c in range(11):
  P_al_importance[c] = np.sum(al_importance[:,10] == c) / len(al_importance[:, 10])


In [159]:
c_al_importance = np.zeros((10,10,10,10,10))

for v1 in range(10):
  for v2 in range(10):
    for v3 in range(10):
      for v4 in range(10):
        for v5 in range(10):
          c_al_importance[v1, v2, v3, v4, v5] = np.argmax(Q_nl_importance[0, :, v1] * 
                                                          Q_al_importance[1, :, v2] * 
                                                          Q_al_importance[2, :, v3] * 
                                                          Q_al_importance[3, :, v4] * 
                                                          Q_al_importance[4, :, v5] * P_al_importance)


In [161]:
#Overall AL based on forest importance
al_importance_result = []
al_importance_correct = 0
for i in range(len(al_importance)):
  true_class = al_importance[i, 10]
  estimated_class = c_al_importance[al_importance[i,0], al_importance[i, 1], al_importance[i, 2], al_importance[i, 3], al_importance[i,4]]
  #print("true class is:", true_class, "estimated_class:", estimated_class)
  al_importance_result.append(estimated_class)
  if estimated_class == true_class:
        al_importance_correct += 1
        
accuracy = al_importance_correct / len(al_importance)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.9205479452054794


In [169]:
# generate based on chi squared importance
nl_chi2 = x_nl_chi2.copy()
nl_chi2['Cy_young'] = y_nl

for attribute in x_nl_chi2:  
    breakpoints = np.percentile(x_nl_chi2[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_nl_chi2[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    nl_chi2[attribute] = pd.cut(x_nl_chi2[attribute], bins=breakpoints, labels=labels, include_lowest=True)
nl_chi2 = nl_chi2.to_numpy()

In [170]:
Q_nl_chi2 = np.zeros((11,11,10))

for attribute in range(11):
  for c in range(11):
    for v in range(10):
      Q_nl_chi2[attribute, c ,v] = np.sum((nl_chi2[:, attribute] == v) & (nl_chi2[:, 10] == c)) / np.sum(nl_chi2[:, 10] == c)
      
      

In [171]:
P_nl_chi2 = np.zeros(11)

for c in range(11):
  P_nl_chi2[c] = np.sum(nl_chi2[:,10] == c) / len(nl_chi2[:, 10])


In [172]:
c_nl_chi2 = np.zeros((10,10,10,10,10))

for v1 in range(10):
  for v2 in range(10):
    for v3 in range(10):
      for v4 in range(10):
        for v5 in range(10):
          c_nl_chi2[v1, v2, v3, v4, v5] = np.argmax(Q_nl_chi2[0, :, v1] * 
                                                          Q_nl_chi2[1, :, v2] * 
                                                          Q_nl_chi2[2, :, v3] * 
                                                          Q_nl_chi2[3, :, v4] * 
                                                          Q_nl_chi2[4, :, v5] * P_nl_chi2)


In [None]:
#Overall NL based on chi squared importance
nl_chi2_result = []
nl_chi2_correct = 0
for i in range(len(nl_chi2)):
  true_class = nl_chi2[i, 10]
  estimated_class = c_nl_chi2[nl_chi2[i,0], nl_chi2[i, 1], nl_chi2[i, 2], nl_chi2[i, 3], nl_chi2[i,4]]
  #print("true class is:", true_class, "estimated_class:", estimated_class)
  nl_chi2_result.append(estimated_class)
  if estimated_class == true_class:
        nl_chi2_correct += 1
        
accuracy = nl_chi2_correct / len(nl_chi2)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.8725212464589235


In [174]:
# generate based on chi squared importance
al_chi2 = x_al_chi2.copy()
al_chi2['Cy_young'] = y_al

for attribute in x_al_chi2:  
    breakpoints = np.percentile(x_al_chi2[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_al_chi2[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    al_chi2[attribute] = pd.cut(x_al_chi2[attribute], bins=breakpoints, labels=labels, include_lowest=True)
al_chi2 = al_chi2.to_numpy()

In [175]:
Q_al_chi2 = np.zeros((11,11,10))

for attribute in range(11):
  for c in range(11):
    for v in range(10):
      Q_al_chi2[attribute, c ,v] = np.sum((al_chi2[:, attribute] == v) & (al_chi2[:, 10] == c)) / np.sum(al_chi2[:, 10] == c)
      

In [176]:
P_al_chi2 = np.zeros(11)

for c in range(11):
  P_al_chi2[c] = np.sum(al_chi2[:,10] == c) / len(al_chi2[:, 10])


In [177]:
c_al_chi2 = np.zeros((10,10,10,10,10))

for v1 in range(10):
  for v2 in range(10):
    for v3 in range(10):
      for v4 in range(10):
        for v5 in range(10):
          c_al_chi2[v1, v2, v3, v4, v5] = np.argmax(Q_al_chi2[0, :, v1] * 
                                                    Q_al_chi2[1, :, v2] * 
                                                    Q_al_chi2[2, :, v3] * 
                                                    Q_al_chi2[3, :, v4] * 
                                                    Q_al_chi2[4, :, v5] * P_al_chi2)


In [None]:
#Overall aL based on chi squared importance
al_chi2_result = []
al_chi2_correct = 0
for i in range(len(al_chi2)):
  true_class = al_chi2[i, 10]
  estimated_class = c_al_chi2[al_chi2[i,0], al_chi2[i, 1], al_chi2[i, 2], al_chi2[i, 3], al_chi2[i,4]]
  #print("true class is:", true_class, "estimated_class:", estimated_class)
  al_chi2_result.append(estimated_class)
  if estimated_class == true_class:
        al_chi2_correct += 1
        
accuracy = al_chi2_correct / len(al_chi2)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.8986301369863013
