In [13]:
import numpy as np
import pandas as pd 
import json
from itertools import product


In [4]:
# Load the training table.
df = pd.read_csv("working_data.csv")

# Separate descriptors and target.
# The first three columns are kept as player descriptors; league and target
# are read by column name.
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']

# Feature columns: everything from column 3 up to (not including) the last two.
# Take an explicit copy so that adding the temporary 'Lg' column below writes to
# an independent frame rather than to a slice of `df` (avoids
# SettingWithCopyWarning and keeps `df` untouched).
X = df.iloc[:, 3:-2].copy()

# Add league temporarily so the rows can be split per league
X['Lg'] = league
y = cy_young_place

# Create separate per-league feature frames (the helper column is dropped again)
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

# Matching per-league targets, selected with the same league masks
y_al = y[league == 'AL']
y_nl = y[league == 'NL']



In [5]:
# Read selected features (from forest importance)
with open('forest_importance_nl.json', 'r') as f:
    selected_features_nl_importance = json.load(f)

with open('forest_importance_al.json', 'r') as f:
    selected_features_al_importance = json.load(f)

# Read selected features (from Chi-squared)
with open('chi2_selected_features_nl.json', 'r') as f:
    selected_features_nl_chi2 = json.load(f)

with open('chi2_selected_features_al.json', 'r') as f:
    selected_features_al_chi2 = json.load(f)

# Print selected features. Each label names the selection method as well as the
# league, so the four lists can be told apart in the output.
print("Selected Features for NL (forest importance):", selected_features_nl_importance)
print("Selected Features for AL (forest importance):", selected_features_al_importance)
print("Selected Features for NL (chi-squared):", selected_features_nl_chi2)
print("Selected Features for AL (chi-squared):", selected_features_al_chi2)

Selected Features for NL: ['WAR', 'SO', 'ERA', 'W', 'FIP', 'IP', 'ERA+', 'WHIP', 'BF', 'W-L%']
Selected Features for AL: ['WAR', 'SO', 'FIP', 'ERA+', 'IP', 'ERA', 'BF', 'W', 'WHIP', 'GS']
Selected Features for NL: ['WAR', 'W', 'ERA', 'CG', 'SHO', 'IP', 'SO', 'BF', 'ERA+', 'FIP']
Selected Features for AL: ['WAR', 'W', 'CG', 'SHO', 'IP', 'SO', 'BK', 'BF', 'ERA+', 'FIP']


In [6]:
# Restrict each league's training features to its forest-importance list
x_nl_importance = x_nl.loc[:, selected_features_nl_importance]
x_al_importance = x_al.loc[:, selected_features_al_importance]

# Same restriction, using the chi-squared lists instead
x_nl_chi2 = x_nl.loc[:, selected_features_nl_chi2]
x_al_chi2 = x_al.loc[:, selected_features_al_chi2]

In [89]:
def bin_mlb_features(df, n_bins=5, reverse_metrics=('ERA', 'WHIP', 'FIP')):
    """Replace every numeric column of ``df`` with quantile-bin indices.

    Each numeric column is NaN-filled with 0 and cut into up to ``n_bins``
    quantile bins with ``pd.qcut`` (duplicate bin edges are dropped, so a
    column may end up with fewer bins). Bin indices start at 0. For columns
    named in ``reverse_metrics`` the indices are flipped (``max - index``),
    so the smallest raw values receive the largest index.

    The bin edges are derived from the frame passed in, so two frames binned
    separately do not share edges.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    n_bins : int, default 5
        Number of quantiles requested from ``pd.qcut``.
    reverse_metrics : iterable of column labels, default ('ERA', 'WHIP', 'FIP')
        Columns whose bin order is reversed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with binned numeric columns. Non-numeric columns and
        skipped columns (fewer than two distinct values, or a ``qcut``
        ``ValueError``) are left exactly as they were, including any NaN.
    """
    binned_df = df.copy()
    reversed_columns = set(reverse_metrics)

    for col in binned_df.select_dtypes(include=[np.number]).columns:
        # Missing values are treated as 0 for the purpose of binning only.
        series = binned_df[col].fillna(0)
        if series.nunique() < 2:
            print(f"Skipping {col}: not enough unique values to bin.")
            continue

        # Only the qcut call is guarded: that is the step whose failure is
        # meant to skip the column.
        try:
            binned_col = pd.qcut(series, q=n_bins, labels=False, duplicates='drop')
        except ValueError as e:
            print(f"Skipping {col} due to qcut error: {e}")
            continue

        if col in reversed_columns:
            # Use the observed maximum, not n_bins - 1, because dropped
            # duplicate edges can leave fewer bins than requested.
            binned_col = binned_col.max() - binned_col
        binned_df[col] = binned_col

    return binned_df

In [90]:
# Each training array is built the same way: bin the selected features,
# append the league's Cy_young target as the last column (aligned on the
# row index), then convert the frame to a NumPy array.

# NL, forest-importance features
nl_importance = bin_mlb_features(x_nl_importance).assign(Cy_young=y_nl).to_numpy()

# AL, forest-importance features
al_importance = bin_mlb_features(x_al_importance).assign(Cy_young=y_al).to_numpy()

# NL, chi-squared features
nl_chi2 = bin_mlb_features(x_nl_chi2).assign(Cy_young=y_nl).to_numpy()

# AL, chi-squared features
al_chi2 = bin_mlb_features(x_al_chi2).assign(Cy_young=y_al).to_numpy()

In [91]:

def find_q(data, attributes, classes, values, label_col=None):
    """Estimate per-attribute conditional value frequencies.

    ``Q[a, c, v]`` is the fraction of rows with class label ``c`` whose
    attribute column ``a`` equals ``v``.

    Parameters
    ----------
    data : 2-D array-like
        Rows are samples. Columns ``0 .. attributes-1`` hold attribute values
        and column ``label_col`` holds the class label.
    attributes : int
        Number of attribute columns to tabulate.
    classes : int
        Number of class labels (``0 .. classes-1``).
    values : int
        Number of attribute values (``0 .. values-1``).
    label_col : int, optional
        Index of the class-label column. Defaults to ``classes``, which is the
        column the function has always read the label from.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(attributes, classes, values)``. Rows for a
        class with no samples are all zero.
    """
    data = np.asarray(data)
    if label_col is None:
        label_col = classes
    labels = data[:, label_col]

    Q_nl_importance = np.zeros((attributes, classes, values))

    for c in range(classes):
        in_class = labels == c
        # Normalise by the size of the class taken from the SAME label column
        # used to select the rows (previously a different column index was used
        # for the denominator, which only matched when classes == values).
        class_count = np.sum(in_class)
        if class_count == 0:
            # An empty class would give 0/0 = NaN, and np.argmax treats NaN as
            # the maximum; leave the frequencies at zero instead.
            continue
        for attribute in range(attributes):
            column = data[in_class, attribute]
            for v in range(values):
                Q_nl_importance[attribute, c, v] = np.sum(column == v) / class_count
    return Q_nl_importance
        
def find_p(data, classes, values):
    """Return the relative frequency of each label in column ``classes``.

    Entry ``k`` of the result is the share of rows of ``data`` whose value in
    column ``classes`` equals ``k``, for ``k`` in ``0 .. values-1``.
    Note that here ``classes`` is a column index and ``values`` is the number
    of labels counted.
    """
    shares = [
        np.sum(data[:, classes] == label) / len(data[:, classes])
        for label in range(values)
    ]
    return np.array(shares, dtype=float)

def calculate_c(Q_nl_importance, P_nl_importance, values=10, n_attributes=5):
    """Build a lookup table of the highest-scoring class per attribute combination.

    For every combination ``(v1, ..., vk)`` of attribute values the score of
    class ``c`` is ``Q[0, c, v1] * Q[1, c, v2] * ... * Q[k-1, c, vk] * P[c]``;
    the table stores the index of the class with the largest score (the first
    one on ties, as with ``np.argmax``).

    Parameters
    ----------
    Q_nl_importance : array of shape (attributes, classes, values)
        Conditional frequencies; only the first ``n_attributes`` attributes are
        used.
    P_nl_importance : array of shape (classes,)
        Per-class weights multiplied into every score.
    values : int, default 10
        Number of values per attribute; each table axis has this length.
    n_attributes : int, default 5
        Number of leading attributes combined in the table. The table has
        ``values ** n_attributes`` entries, so keep this small.

    Returns
    -------
    numpy.ndarray
        Float array with ``n_attributes`` axes of length ``values`` holding
        class indices (stored as floats, as before).

    Raises
    ------
    ValueError
        If ``n_attributes`` is smaller than 1 or ``Q_nl_importance`` does not
        cover the requested attributes/values.
    """
    Q = np.asarray(Q_nl_importance, dtype=float)
    if n_attributes < 1:
        raise ValueError("n_attributes must be at least 1")
    if Q.ndim != 3 or Q.shape[0] < n_attributes or Q.shape[2] < values:
        raise ValueError(
            "Q must have shape (attributes, classes, values) with at least "
            f"{n_attributes} attributes and {values} values; got {Q.shape}"
        )

    n_classes = Q.shape[1]
    prior = np.broadcast_to(np.asarray(P_nl_importance, dtype=float), (n_classes,))

    # Axis 0 is the class; axis k + 1 is the value of attribute k. Each factor
    # is reshaped so broadcasting forms the full product in one pass, replacing
    # values ** n_attributes Python-level iterations. Factors are multiplied in
    # the same left-to-right order as the scalar formula above.
    scores = None
    for k in range(n_attributes):
        shape = [n_classes] + [1] * n_attributes
        shape[k + 1] = values
        factor = Q[k, :, :values].reshape(shape)
        scores = factor if scores is None else scores * factor
    scores = scores * prior.reshape([n_classes] + [1] * n_attributes)

    # Collapse the class axis; keep the historical float dtype of the table.
    return np.argmax(scores, axis=0).astype(float)
    

In [92]:

# Dimensions handed to the estimators below
attributes = 10
classes = 10
values = 10

# Fit the conditional table, the class frequencies and the lookup table
Q_nl_importance = find_q(nl_importance, attributes, classes, values)
P_nl_importance = find_p(nl_importance, classes, values)
c_nl_importance = calculate_c(Q_nl_importance, P_nl_importance, values)

# Look up every row of the array the tables were fitted on and compare the
# result with column 10. Only the first five columns are used for the lookup.
nl_importance_result = []
nl_importance_correct = 0
for row in nl_importance:
    true_class = row[10]
    estimated_class = c_nl_importance[row[0], row[1], row[2], row[3], row[4]]
    nl_importance_result.append(estimated_class)
    if estimated_class == true_class:
        nl_importance_correct += 1

accuracy = nl_importance_correct / len(nl_importance)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.8526912181303116


In [93]:
# Dimensions handed to the estimators below
attributes = 10
classes = 10
values = 10

# Fit the conditional table, the class frequencies and the lookup table
Q_al_importance = find_q(al_importance, attributes, classes, values)
P_al_importance = find_p(al_importance, classes, values)
c_al_importance = calculate_c(Q_al_importance, P_al_importance, values)

# Look up every row of the array the tables were fitted on and compare the
# result with column 10. Only the first five columns are used for the lookup.
al_importance_result = []
al_importance_correct = 0
for row in al_importance:
    true_class = row[10]
    estimated_class = c_al_importance[row[0], row[1], row[2], row[3], row[4]]
    al_importance_result.append(estimated_class)
    if estimated_class == true_class:
        al_importance_correct += 1

accuracy = al_importance_correct / len(al_importance)
print("\nOverall Accuracy:", accuracy)



Overall Accuracy: 0.8931506849315068


In [94]:
# Fit the tables for the NL chi-squared features. `attributes`, `classes` and
# `values` are the names assigned in the earlier evaluation cells.
Q_nl_chi2 = find_q(nl_chi2, attributes, classes, values)
P_nl_chi2 = find_p(nl_chi2, classes, values)
c_nl_chi2 = calculate_c(Q_nl_chi2, P_nl_chi2, values)

# Look up every row of the array the tables were fitted on and compare the
# result with column 10. Only the first five columns are used for the lookup.
nl_chi2_result = []
nl_chi2_correct = 0
for row in nl_chi2:
    true_class = row[10]
    estimated_class = c_nl_chi2[row[0], row[1], row[2], row[3], row[4]]
    nl_chi2_result.append(estimated_class)
    if estimated_class == true_class:
        nl_chi2_correct += 1

accuracy = nl_chi2_correct / len(nl_chi2)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.8498583569405099


In [95]:
# Fit the tables for the AL chi-squared features. `attributes`, `classes` and
# `values` are the names assigned in the earlier evaluation cells.
Q_al_chi2 = find_q(al_chi2, attributes, classes, values)
P_al_chi2 = find_p(al_chi2, classes, values)
c_al_chi2 = calculate_c(Q_al_chi2, P_al_chi2, values)

# Look up every row of the array the tables were fitted on and compare the
# result with column 10. Only the first five columns are used for the lookup.
al_chi2_result = []
al_chi2_correct = 0
for row in al_chi2:
    true_class = row[10]
    estimated_class = c_al_chi2[row[0], row[1], row[2], row[3], row[4]]
    al_chi2_result.append(estimated_class)
    if estimated_class == true_class:
        al_chi2_correct += 1

accuracy = al_chi2_correct / len(al_chi2)
print("\nOverall Accuracy:", accuracy)


Overall Accuracy: 0.8657534246575342


In [96]:
# Prep test data
df_test = pd.read_csv("test_data.csv")

# First three columns are kept as player descriptors; league is read by name.
# NOTE: this rebinds `league`, which the training cell also assigns.
player_descriptors_test = df_test.iloc[:, :3]
league = df_test['Lg']

# Feature columns: everything from column 3 up to (not including) the last one.
# Take an explicit copy so that adding the temporary 'Lg' column below writes to
# an independent frame rather than to a slice of `df_test` (avoids
# SettingWithCopyWarning and keeps `df_test` untouched).
x_test = df_test.iloc[:, 3:-1].copy()

# Per-league descriptors, re-indexed from 0 so they can be addressed by position
player_descriptors_nl = player_descriptors_test[league == 'NL'].reset_index(drop=True)
player_descriptors_al = player_descriptors_test[league == 'AL'].reset_index(drop=True)

# Add league temporarily so the rows can be split per league
x_test['Lg'] = league

# Create separate per-league feature frames (the helper column is dropped again)
x_al_test = x_test[x_test['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl_test = x_test[x_test['Lg'] == 'NL'].drop(columns=['Lg'])

In [97]:
# Restrict each league's test features to its forest-importance list
x_nl_importance_test = x_nl_test.loc[:, selected_features_nl_importance]
x_al_importance_test = x_al_test.loc[:, selected_features_al_importance]

# Same restriction, using the chi-squared lists instead
x_nl_chi2_test = x_nl_test.loc[:, selected_features_nl_chi2]
x_al_chi2_test = x_al_test.loc[:, selected_features_al_chi2]

In [98]:
# Bin each test feature frame and convert it straight to a NumPy array.
# NOTE(review): bin_mlb_features derives its quantile edges from the frame it
# is given, so these bins come from the test rows themselves rather than from
# the training rows. Confirm that this is intended.

# NL, forest-importance features
nl_importance_test = bin_mlb_features(x_nl_importance_test).to_numpy()

# AL, forest-importance features
al_importance_test = bin_mlb_features(x_al_importance_test).to_numpy()

# NL, chi-squared features
nl_chi2_test = bin_mlb_features(x_nl_chi2_test).to_numpy()

# AL, chi-squared features
al_chi2_test = bin_mlb_features(x_al_chi2_test).to_numpy()


In [99]:
def predict_test_data(test_data, attribute_indices, c_nl_importance):
    """Look up one table entry per row of ``test_data``.

    For each row, the entries at positions ``attribute_indices`` are converted
    to ``int`` and used, in that order, as a tuple index into
    ``c_nl_importance``. The looked-up entries are returned as a NumPy array
    in row order.
    """
    return np.array([
        c_nl_importance[tuple(int(row[position]) for position in attribute_indices)]
        for row in test_data
    ])

In [101]:
# Columns of the test array used as the lookup key
attribute_indices = [0, 1, 2, 3, 4]

predictions = predict_test_data(nl_importance_test, attribute_indices, c_nl_importance)

# Keep only rows whose prediction is not 0, order them by ascending prediction
# and take the first three positions.
non_zero_indices = np.flatnonzero(predictions != 0)
non_zero_predictions = predictions[non_zero_indices]
sorted_non_zero_indices = non_zero_indices[non_zero_predictions.argsort()]
lowest_three_indices = sorted_non_zero_indices[:3]

# Positions index into the NL descriptor frame (re-indexed from 0)
lowest_players = player_descriptors_nl.iloc[lowest_three_indices]
print("NL forest importance predictions:", predictions[lowest_three_indices])
print("Corresponding players:")
print(lowest_players)

NL forest importance predictions: [3. 4. 4.]
Corresponding players:
      Rk               Player    yr
34  70.0          Kodai Senga  25.0
2    9.0           Logan Webb  25.0
41  85.0  Cristopher Sánchez*  25.0


In [105]:
# Columns of the test array used as the lookup key
attribute_indices = [0, 1, 2, 3, 4]

predictions = predict_test_data(al_importance_test, attribute_indices, c_al_importance)

# Keep only rows whose prediction is not 0, order them by ascending prediction
# and take the first three positions.
non_zero_indices = np.flatnonzero(predictions != 0)
non_zero_predictions = predictions[non_zero_indices]
sorted_non_zero_indices = non_zero_indices[non_zero_predictions.argsort()]
lowest_three_indices = sorted_non_zero_indices[:3]

# Positions index into the AL descriptor frame (re-indexed from 0)
lowest_players = player_descriptors_al.iloc[lowest_three_indices]
print("AL forest importance predictions:", predictions[lowest_three_indices])
print("Corresponding players:")
print(lowest_players)

AL forest importance predictions: [1. 1. 1.]
Corresponding players:
     Rk            Player    yr
0   1.0        Max Fried*  25.0
3   5.0  Garrett Crochet*  25.0
6  10.0       Kris Bubic*  25.0


In [109]:
# Columns of the test array used as the lookup key
attribute_indices = [0, 1, 2, 3, 4]

predictions = predict_test_data(nl_chi2_test, attribute_indices, c_nl_chi2)

# Keep only rows whose prediction is not 0, order them by ascending prediction
# and take the first three positions.
non_zero_indices = np.flatnonzero(predictions != 0)
non_zero_predictions = predictions[non_zero_indices]
sorted_non_zero_indices = non_zero_indices[non_zero_predictions.argsort()]
lowest_three_indices = sorted_non_zero_indices[:3]

# Positions index into the NL descriptor frame (re-indexed from 0)
lowest_players = player_descriptors_nl.iloc[lowest_three_indices]
print("NL Chi2 predictions:", predictions[lowest_three_indices])
print("Corresponding players:")
print(lowest_players)

NL Chi2 predictions: [7. 7. 7.]
Corresponding players:
      Rk              Player    yr
10  21.0       Hunter Greene  25.0
12  23.0      Freddy Peralta  25.0
14  27.0  Yoshinobu Yamamoto  25.0


In [114]:
# Columns of the test array used as the lookup key
attribute_indices = [0, 1, 2, 3, 4]

predictions = predict_test_data(al_chi2_test, attribute_indices, c_al_chi2)

# Keep only rows whose prediction is not 0, order them by ascending prediction
# and take the first three positions.
non_zero_indices = np.flatnonzero(predictions != 0)
non_zero_predictions = predictions[non_zero_indices]
sorted_non_zero_indices = non_zero_indices[non_zero_predictions.argsort()]
lowest_three_indices = sorted_non_zero_indices[:3]

# Positions index into the AL descriptor frame (re-indexed from 0)
lowest_players = player_descriptors_al.iloc[lowest_three_indices]
print("AL Chi2 predictions:", predictions[lowest_three_indices])
print("Corresponding players:")
print(lowest_players)

AL Chi2 predictions: [2. 2. 2.]
Corresponding players:
    Rk            Player    yr
1  3.0         Seth Lugo  25.0
3  5.0  Garrett Crochet*  25.0
4  6.0    Nathan Eovaldi  25.0
