In [1]:
import numpy as np
import pandas as pd 
import json

In [15]:
df = pd.read_csv("working_data.csv")

# Separate descriptors and target
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
X = df.iloc[:, 3:-2]

# Add league temporarily for sorting
X['Lg'] = league
y = cy_young_place

# Create separate datasets
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

y_al = y[league == 'AL']
y_nl = y[league == 'NL']



In [16]:
# Read selected features (from forest importance)
with open('forest_importance_nl.json', 'r') as f:
    selected_features_nl_importance = json.load(f)

with open('forest_importance_al.json', 'r') as f:
    selected_features_al_importance = json.load(f)

# Read selected features (from Chi-squared)
with open('chi2_selected_features_nl.json', 'r') as f:
    selected_features_nl_chi2 = json.load(f)


with open('chi2_selected_features_al.json', 'r') as f:
    selected_features_al_chi2 = json.load(f)

# Print selected features
print("Selected Features for NL:", selected_features_nl_importance)
print("Selected Features for AL:", selected_features_al_importance)
print("Selected Features for NL:", selected_features_nl_chi2)
print("Selected Features for AL:", selected_features_al_chi2)

Selected Features for NL: ['WAR', 'SO', 'ERA', 'W', 'FIP', 'IP', 'ERA+', 'WHIP', 'BF', 'W-L%']
Selected Features for AL: ['WAR', 'SO', 'FIP', 'ERA+', 'IP', 'ERA', 'BF', 'W', 'WHIP', 'GS']
Selected Features for NL: ['WAR', 'W', 'ERA', 'CG', 'SHO', 'IP', 'SO', 'BF', 'ERA+', 'FIP']
Selected Features for AL: ['WAR', 'W', 'CG', 'SHO', 'IP', 'SO', 'BK', 'BF', 'ERA+', 'FIP']


In [17]:
#Random Forest Importance
x_nl_importance = x_nl[selected_features_nl_importance]
x_al_importance = x_al[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2 = x_nl[selected_features_nl_chi2]
x_al_chi2 = x_al[selected_features_al_chi2]

In [18]:
# generate nl based on importance
nl_importance = x_nl_importance.copy()
nl_importance['Cy_young'] = y_nl

for attribute in x_nl_importance:  
    breakpoints = np.percentile(x_nl_importance[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_nl_importance[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    nl_importance[attribute] = pd.cut(x_nl_importance[attribute], bins=breakpoints, labels=labels, include_lowest=True)
nl_importance = nl_importance.to_numpy()

# generate AL based on importance
al_importance = x_al_importance.copy()
al_importance['Cy_young'] = y_al

for attribute in x_al_importance:  
    breakpoints = np.percentile(x_al_importance[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_al_importance[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    al_importance[attribute] = pd.cut(x_al_importance[attribute], bins=breakpoints, labels=labels, include_lowest=True)
al_importance = al_importance.to_numpy()

# generate NL based on chi squared importance
nl_chi2 = x_nl_chi2.copy()
nl_chi2['Cy_young'] = y_nl

for attribute in x_nl_chi2:  
    breakpoints = np.percentile(x_nl_chi2[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_nl_chi2[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    nl_chi2[attribute] = pd.cut(x_nl_chi2[attribute], bins=breakpoints, labels=labels, include_lowest=True)
nl_chi2 = nl_chi2.to_numpy()

# generate AL based on chi squared importance
al_chi2 = x_al_chi2.copy()
al_chi2['Cy_young'] = y_al

for attribute in x_al_chi2:  
    breakpoints = np.percentile(x_al_chi2[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_al_chi2[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    labels = list(range(len(breakpoints)-1))
    al_chi2[attribute] = pd.cut(x_al_chi2[attribute], bins=breakpoints, labels=labels, include_lowest=True)
al_chi2 = al_chi2.to_numpy()

In [None]:

def compute_class_priors(data, class_index):

    classes, counts = np.unique(data[:, class_index], return_counts=True)
    total = len(data)
    priors = {int(cls): count / total for cls, count in zip(classes, counts)}
    return priors

def estimate_likelihood(data, class_val, sample, class_index):

    # Filter rows where class == class_val
    class_rows = data[data[:, class_index] == class_val]
    if len(class_rows) == 0:
        return 0

    # Compare all features except the class label
    matches = np.all(class_rows[:, :class_index] == sample[:class_index], axis=1)
    likelihood = np.sum(matches) / len(class_rows)
    return likelihood

def predict_bayes(data, class_index):

    priors = compute_class_priors(data, class_index)
    predictions = []

    for i in range(len(data)):
        sample = data[i]
        probs = {}
        for cls in priors:
            likelihood = estimate_likelihood(data, cls, sample, class_index)
            probs[cls] = priors[cls] * likelihood
        if sum(probs.values()) == 0:
            predicted_class = np.random.choice(list(priors.keys()))  # fallback
        else:
            predicted_class = max(probs, key=probs.get)
        predictions.append(predicted_class)

    return np.array(predictions)


In [None]:
class_index = nl_importance.shape[1] - 1 
#predict NL using forest importance
preds_nl_importance = predict_bayes(nl_importance, class_index)
true_labels_nl_importance = nl_importance[:, class_index].astype(int)
accuracy_nl_importance = np.mean(preds_nl_importance == true_labels_nl_importance)
print(f"NL Forest importance Bayes Accuracy: {accuracy_nl_importance:.4f}")

#predict AL using forest importance
preds_al_importance = predict_bayes(al_importance, class_index)
true_labels_al_importance = al_importance[:, class_index].astype(int)
accuracy_al_importance = np.mean(preds_al_importance == true_labels_al_importance)
print(f"AL Forest importance Bayes Accuracy: {accuracy_al_importance:.4f}")

#predict NL using Chi2 importance
preds_nl_chi2 = predict_bayes(nl_chi2, class_index)
true_labels_nl_chi2 = nl_chi2[:, class_index].astype(int)
accuracy_nl_chi2 = np.mean(preds_nl_chi2 == true_labels_nl_chi2)
print(f"NL Chi2 Bayes Accuracy: {accuracy_nl_chi2:.4f}")

#predict AL using Chi2 importance
preds_al_chi2 = predict_bayes(al_chi2, class_index)
true_labels_al_chi2 = al_chi2[:, class_index].astype(int)
accuracy_al_chi2 = np.mean(preds_al_chi2 == true_labels_al_chi2)
print(f"NL Chi2 Bayes Accuracy: {accuracy_al_chi2:.4f}")


NL Forest importance Bayes Accuracy: 1.0000
AL Forest importance Bayes Accuracy: 0.9973
NL Chi2 Bayes Accuracy: 0.9972
NL Chi2 Bayes Accuracy: 0.9945


In [29]:
# Prep test data
df_test = pd.read_csv("test_data.csv")

player_descriptors_test = df_test.iloc[:, :3]
league = df_test['Lg']
x_test = df_test.iloc[:, 3:-1]

player_descriptors_nl = player_descriptors_test[league == 'NL'].reset_index(drop=True)
player_descriptors_al = player_descriptors_test[league == 'AL'].reset_index(drop=True)

# Add league temporarily for sorting
x_test['Lg'] = league

# Create separate datasets
x_al_test = x_test[x_test['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl_test = x_test[x_test['Lg'] == 'NL'].drop(columns=['Lg'])

In [30]:
#Random Forest Importance
x_nl_importance_test = x_nl_test[selected_features_nl_importance]
x_al_importance_test = x_al_test[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2_test = x_nl_test[selected_features_nl_chi2]
x_al_chi2_test = x_al_test[selected_features_al_chi2]

In [31]:
# generate nl based on importance
nl_importance_test = x_nl_importance_test.copy()

for attribute in x_nl_importance_test:
    series = x_nl_importance_test[attribute]  
    breakpoints = np.percentile(x_nl_importance_test[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_nl_importance_test[attribute].min()  
    breakpoints = np.unique(breakpoints) 
    breakpoints = breakpoints[np.diff(breakpoints, prepend=-np.inf) > 0] 
    labels = list(range(len(breakpoints)-1))
    nl_importance_test[attribute] = pd.cut(x_nl_importance_test[attribute], bins=breakpoints, labels=labels, include_lowest=True)
    mask = nl_importance_test[attribute].isna()
    print(f"{attribute} NaNs: {series[mask]}")

#nl_importance_test = nl_importance_test.to_numpy()

# generate AL based on importance
al_importance_test = x_al_importance_test.copy()

for attribute in x_al_importance_test:  
    breakpoints = np.percentile(x_al_importance_test[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_al_importance_test[attribute].min()  
    breakpoints = np.unique(breakpoints)   
    breakpoints = breakpoints[np.diff(breakpoints, prepend=-np.inf) > 0]
    labels = list(range(len(breakpoints)-1))
    al_importance_test[attribute] = pd.cut(x_al_importance_test[attribute], bins=breakpoints, labels=labels, include_lowest=True)
al_importance_test = al_importance_test.to_numpy()

# generate NL based on chi squared importance
nl_chi2_test = x_nl_chi2_test.copy()

for attribute in x_nl_chi2_test:  
    breakpoints = np.percentile(x_nl_chi2_test[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_nl_chi2_test[attribute].min()  
    breakpoints = np.unique(breakpoints)  
    breakpoints = breakpoints[np.diff(breakpoints, prepend=-np.inf) > 0] 
    labels = list(range(len(breakpoints)-1))
    nl_chi2_test[attribute] = pd.cut(x_nl_chi2_test[attribute], bins=breakpoints, labels=labels, include_lowest=True)
nl_chi2_test = nl_chi2_test.to_numpy()

# generate AL based on chi squared importance
al_chi2_test = x_al_chi2_test.copy()

for attribute in x_al_chi2_test:  
    breakpoints = np.percentile(x_al_chi2_test[attribute], [0,10,20,30,40,50,60,70,80,90,100])
    breakpoints[0] = x_al_chi2_test[attribute].min()  
    breakpoints = np.unique(breakpoints) 
    breakpoints = breakpoints[np.diff(breakpoints, prepend=-np.inf) > 0]  
    labels = list(range(len(breakpoints)-1))
    al_chi2_test[attribute] = pd.cut(x_al_chi2_test[attribute], bins=breakpoints, labels=labels, include_lowest=True)
al_chi2_test = al_chi2_test.to_numpy()


nl_importance_test

WAR NaNs: Series([], Name: WAR, dtype: float64)
SO NaNs: Series([], Name: SO, dtype: int64)
ERA NaNs: Series([], Name: ERA, dtype: float64)
W NaNs: Series([], Name: W, dtype: int64)
FIP NaNs: Series([], Name: FIP, dtype: float64)
IP NaNs: Series([], Name: IP, dtype: float64)
ERA+ NaNs: Series([], Name: ERA+, dtype: float64)
WHIP NaNs: Series([], Name: WHIP, dtype: float64)
BF NaNs: Series([], Name: BF, dtype: int64)
W-L% NaNs: Series([], Name: W-L%, dtype: float64)


Unnamed: 0,WAR,SO,ERA,W,FIP,IP,ERA+,WHIP,BF,W-L%
3,9,9,2,3,0,9,6,3,9,4
4,9,9,4,1,1,9,5,3,9,2
5,9,9,1,3,3,9,7,2,9,4
6,8,7,3,2,7,9,6,2,9,4
9,9,9,5,1,2,9,4,1,9,4
...,...,...,...,...,...,...,...,...,...,...
203,2,1,1,1,2,0,8,4,0,2
205,5,0,2,1,0,0,7,0,0,6
207,9,2,0,0,0,0,9,0,0,0
208,7,0,0,0,5,0,9,1,0,0
