In [3]:
import numpy as np
import pandas as pd 
import json

In [4]:
df = pd.read_csv("working_data.csv")

# Separate descriptors and target
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
X = df.iloc[:, 3:-2]

# Add league temporarily for sorting
X['Lg'] = league
y = cy_young_place

# Create separate datasets
x_al = X[X['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl = X[X['Lg'] == 'NL'].drop(columns=['Lg'])

y_al = y[league == 'AL']
y_nl = y[league == 'NL']



In [5]:
# Read selected features (from forest importance)
with open('forest_importance_nl.json', 'r') as f:
    selected_features_nl_importance = json.load(f)

with open('forest_importance_al.json', 'r') as f:
    selected_features_al_importance = json.load(f)

# Read selected features (from Chi-squared)
with open('chi2_selected_features_nl.json', 'r') as f:
    selected_features_nl_chi2 = json.load(f)


with open('chi2_selected_features_al.json', 'r') as f:
    selected_features_al_chi2 = json.load(f)

# Print selected features
print("Selected Features for NL:", selected_features_nl_importance)
print("Selected Features for AL:", selected_features_al_importance)
print("Selected Features for NL:", selected_features_nl_chi2)
print("Selected Features for AL:", selected_features_al_chi2)

Selected Features for NL: ['WAR', 'SO', 'ERA', 'W', 'FIP', 'IP', 'ERA+', 'WHIP', 'BF', 'W-L%']
Selected Features for AL: ['WAR', 'SO', 'FIP', 'ERA+', 'IP', 'ERA', 'BF', 'W', 'WHIP', 'GS']
Selected Features for NL: ['WAR', 'W', 'ERA', 'CG', 'SHO', 'IP', 'SO', 'BF', 'ERA+', 'FIP']
Selected Features for AL: ['WAR', 'W', 'CG', 'SHO', 'IP', 'SO', 'BK', 'BF', 'ERA+', 'FIP']


In [6]:
#Random Forest Importance
x_nl_importance = x_nl[selected_features_nl_importance]
x_al_importance = x_al[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2 = x_nl[selected_features_nl_chi2]
x_al_chi2 = x_al[selected_features_al_chi2]

In [7]:
def bin_mlb_features(df):
    binned_df = df.copy()
    reverse_metrics = ['ERA', 'WHIP', 'FIP']
    n_bins = 10

    for col in binned_df.select_dtypes(include=[np.number]).columns:
        series = binned_df[col]

        if series.nunique() < 2:
            print(f"Skipping {col}: not enough unique values to bin.")
            continue
        
        try:
            binned_col = pd.qcut(series, q=n_bins, labels=False, duplicates='drop')
            
            if col in reverse_metrics:
                max_label = binned_col.max()
                binned_col = max_label - binned_col  # Reverse order

            binned_df[col] = binned_col

        except ValueError as e:
            print(f"Skipping {col} due to qcut error: {e}")

    return binned_df


In [8]:
# generate nl based on importance
nl_importance = bin_mlb_features(x_nl_importance)
nl_importance['Cy_young'] = y_nl
nl_importance = nl_importance.to_numpy()

# generate AL based on importance
al_importance = bin_mlb_features(x_al_importance)
al_importance['Cy_young'] = y_al
al_importance = al_importance.to_numpy()

# generate NL based on chi squared importance
nl_chi2 = bin_mlb_features(x_nl_chi2)
nl_chi2['Cy_young'] = y_nl
nl_chi2 = nl_chi2.to_numpy()

# generate AL based on chi squared importance
al_chi2 = bin_mlb_features(x_al_chi2)
al_chi2['Cy_young'] = y_al
al_chi2 = al_chi2.to_numpy()


In [21]:


def compute_class_priors(data, class_index):
    classes, counts = np.unique(data[:, class_index], return_counts=True)
    total = len(data)
    priors = {int(cls): count / total for cls, count in zip(classes, counts)}
    return priors

def estimate_likelihood(train_data, target_class, sample, class_index):
    likelihood = 1.0
    n_features = len(sample)
    
    class_rows = train_data[train_data[:, class_index] == target_class]
    
    for i in range(n_features):
        if i == class_index:
            continue
        value_counts = np.sum(class_rows[:, i] == sample[i])
        total = len(class_rows)

        # Apply Laplace smoothing: +1 to numerator, +num_values to denominator
        smoothed_prob = (value_counts + 1) / (total + 10)  # assume 10 possible values per feature
        likelihood *= smoothed_prob

    return likelihood

def predict_bayes(train_data, test_data, class_index):
    # Compute priors based on the training data
    priors = compute_class_priors(train_data, class_index)
    predictions = []

    for i in range(len(test_data)):
        sample = test_data[i]
        probs = {}
        
        # Compute likelihood for each class based on the sample's features
        for cls in priors:
            likelihood = estimate_likelihood(train_data, cls, sample, class_index)
            probs[cls] = priors[cls] * likelihood

        
        # Predict the class with the highest posterior probability
        predicted_class = max(probs, key=probs.get)
        predictions.append(predicted_class)
      

    return np.array(predictions)


In [10]:
class_index = nl_importance.shape[1] - 1 
#predict NL using forest importance
preds_nl_importance = predict_bayes(nl_importance,nl_importance, class_index)
true_labels_nl_importance = nl_importance[:, class_index].astype(int)
accuracy_nl_importance = np.mean(preds_nl_importance == true_labels_nl_importance)
print(f"NL Forest importance Bayes Accuracy: {accuracy_nl_importance:.4f}")

#predict AL using forest importance
preds_al_importance = predict_bayes(al_importance,al_importance, class_index)
true_labels_al_importance = al_importance[:, class_index].astype(int)
accuracy_al_importance = np.mean(preds_al_importance == true_labels_al_importance)
print(f"AL Forest importance Bayes Accuracy: {accuracy_al_importance:.4f}")

#predict NL using Chi2 importance
preds_nl_chi2 = predict_bayes(nl_chi2,nl_chi2, class_index)
true_labels_nl_chi2 = nl_chi2[:, class_index].astype(int)
accuracy_nl_chi2 = np.mean(preds_nl_chi2 == true_labels_nl_chi2)
print(f"NL Chi2 Bayes Accuracy: {accuracy_nl_chi2:.4f}")

#predict AL using Chi2 importance
preds_al_chi2 = predict_bayes(al_chi2,al_chi2, class_index)
true_labels_al_chi2 = al_chi2[:, class_index].astype(int)
accuracy_al_chi2 = np.mean(preds_al_chi2 == true_labels_al_chi2)
print(f"NL Chi2 Bayes Accuracy: {accuracy_al_chi2:.4f}")


NL Forest importance Bayes Accuracy: 1.0000
AL Forest importance Bayes Accuracy: 0.9973
NL Chi2 Bayes Accuracy: 0.9972
NL Chi2 Bayes Accuracy: 0.9945


In [11]:
# Prep test data
df_test = pd.read_csv("test_data.csv")

player_descriptors_test = df_test.iloc[:, :3]
league = df_test['Lg']
x_test = df_test.iloc[:, 3:-1]

player_descriptors_nl = player_descriptors_test[league == 'NL'].reset_index(drop=True)
player_descriptors_al = player_descriptors_test[league == 'AL'].reset_index(drop=True)

# Add league temporarily for sorting
x_test['Lg'] = league

# Create separate datasets
x_al_test = x_test[x_test['Lg'] == 'AL'].drop(columns=['Lg'])
x_nl_test = x_test[x_test['Lg'] == 'NL'].drop(columns=['Lg'])

In [12]:
#Random Forest Importance
x_nl_importance_test = x_nl_test[selected_features_nl_importance]
x_al_importance_test = x_al_test[selected_features_al_importance]

# Chi-squared Features
x_nl_chi2_test = x_nl_test[selected_features_nl_chi2]
x_al_chi2_test = x_al_test[selected_features_al_chi2]

In [13]:
# generate nl based on importance
nl_importance_test = bin_mlb_features(x_nl_importance_test)
nl_importance_test = nl_importance_test.to_numpy()

# generate AL based on importance
al_importance_test = bin_mlb_features(x_al_importance_test)
al_importance_test = al_importance_test.to_numpy()

# generate NL based on chi squared importance
nl_chi2_test = bin_mlb_features(x_nl_chi2_test)
nl_chi2_test = nl_chi2_test.to_numpy()

# generate AL based on chi squared importance
al_chi2_test = bin_mlb_features(x_al_chi2_test)
al_chi2_test = al_chi2_test.to_numpy()



In [26]:
#predict NL using forest importance
preds_nl_importance_test = predict_bayes(nl_importance,nl_importance_test, class_index)
indices_of_ones = [index for index, pred in enumerate(preds_nl_importance_test) if pred == 1]
names_of_ones = player_descriptors_test.iloc[indices_of_ones]['Player']
print("Names where predicted class is 1:")
for name in names_of_ones:
    print(f" - {name}")

#predict AL using forest importance
preds_al_importance_test = predict_bayes(al_importance,al_importance_test, class_index)
indices_of_ones = [index for index, pred in enumerate(preds_al_importance_test) if pred == 1]
names_of_ones = player_descriptors_test.iloc[indices_of_ones]['Player']
print("Names where predicted class is 1:")
for name in names_of_ones:
    print(f" - {name}")
    
#predict NL using Chi2 importance
preds_nl_chi2_test = predict_bayes(nl_chi2, nl_chi2_test, class_index)
indices_of_ones = [index for index, pred in enumerate(preds_nl_chi2_test) if pred == 1]
names_of_ones = player_descriptors_test.iloc[indices_of_ones]['Player']
print("Names where predicted class is 1:")
for name in names_of_ones:
    print(f" - {name}")
    
#predict AL using Chi2 importance
preds_al_chi2_test = predict_bayes(al_chi2, al_chi2_test, class_index)
indices_of_ones = [index for index, pred in enumerate(preds_al_chi2_test) if pred == 1]
names_of_ones = player_descriptors_test.iloc[indices_of_ones]['Player']
print("Names where predicted class is 1:")
for name in names_of_ones:
    print(f" - {name}")



Names where predicted class is 1:
 - Fernando Cruz
 - Steven Okert*
 - Emmanuel Clase
Names where predicted class is 1:
 - Max Fried*
 - Kris Bubic*
 - Andrés Muñoz
Names where predicted class is 1:
Names where predicted class is 1:
 - Max Fried*
 - Kris Bubic*
