In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Load the working data set and split it into per-league feature/target sets.
df = pd.read_csv("working_data.csv")

# Separate descriptors and target.
# The first three columns are kept aside as player descriptors; 'Lg' holds the
# league label and 'Cy_young' is used as the prediction target.
player_descriptors = df.iloc[:, :3]
league = df['Lg']
cy_young_place = df['Cy_young']
y = cy_young_place

# Feature columns: everything from the 4th column up to (not including) the
# last two. `.assign` returns a brand-new frame, so the league label is attached
# without writing into a slice of `df` (assigning a column onto the result of
# `df.iloc[...]` is the pattern that can raise pandas' SettingWithCopyWarning).
# X still ends up carrying an 'Lg' column, exactly as before.
X = df.iloc[:, 3:-2].assign(Lg=league)

# One boolean mask per league, shared by the feature and target splits so both
# are guaranteed to select the same rows.
is_al = league == 'AL'
is_nl = league == 'NL'

# Create separate datasets (the league label is dropped again so that only the
# feature columns remain).
x_al = X.loc[is_al].drop(columns=['Lg'])
x_nl = X.loc[is_nl].drop(columns=['Lg'])

y_al = y[is_al]
y_nl = y[is_nl]

print(x_al,"\n", x_nl)


     WAR   W   L   W-L%   ERA   G  GS  GF  CG  SHO  ...  WP   BF  ERA+   FIP  \
0    4.3  11   9  0.550  3.53  31  31   0   1    0  ...  12  720   124  3.79   
5    4.0  12   4  0.750  3.15  27  27   0   1    1  ...   5  637   131  3.34   
6    2.7  16   8  0.667  3.60  33  33   0   1    1  ...   4  826   119  4.28   
9    3.2  12  11  0.522  3.45  31  31   0   2    2  ...   6  808   123  3.50   
10   3.0  12  10  0.545  3.35  31  31   0   0    0  ...   2  725   115  2.38   
..   ...  ..  ..    ...   ...  ..  ..  ..  ..  ...  ...  ..  ...   ...   ...   
409 -1.1   6   9  0.400  6.47  21  21   0   0    0  ...   3  535    68  5.19   
410 -1.2   3  10  0.231  5.64  21  21   0   0    0  ...   5  447    71  4.93   
411 -1.8   3   7  0.300  6.09  29  17   2   0    0  ...   3  485    67  4.96   
412 -1.8   5   6  0.455  6.77  19  19   0   0    0  ...   3  445    63  5.42   
413 -2.0   3  13  0.188  6.90  25  19   0   0    0  ...   2  492    63  5.81   

      WHIP    H9  HR9  BB9   SO9  SO/BB

In [6]:

# Show the feature matrix and the target of each league for a quick visual
# check. The labels are printed verbatim, followed by the corresponding object.
for section_label, section_data in (
    ("NL attributes: \n ", x_nl),
    ("NL target: \n", y_nl),
    ("AL attributes: \n", x_al),
    ("AL target: \n", y_al),
):
    print(section_label, section_data)


NL attributes: 
       WAR   W   L   W-L%   ERA   G  GS  GF  CG  SHO  ...  WP   BF  ERA+   FIP  \
1    5.1  14   7  0.667  3.25  33  33   0   1    1  ...   4  835   138  3.49   
2    4.6  15   9  0.625  2.90  32  32   0   0    0  ...   6  787   136  3.03   
3    3.7  14   8  0.636  3.57  33  33   0   1    1  ...   2  820   115  3.94   
4    3.0  12   6  0.667  3.47  32  32   0   0    0  ...   4  738   112  3.83   
7    3.7  21   5  0.808  3.19  30  30   0   0    0  ...   7  738   128  3.58   
..   ...  ..  ..    ...   ...  ..  ..  ..  ..  ...  ...  ..  ...   ...   ...   
713 -1.4   3   8  0.273  6.80  20  20   0   0    0  ...   3  417    61  5.86   
714 -1.5   4  12  0.250  5.92  29  24   4   0    0  ...   0  567    68  4.98   
715 -1.9   5  11  0.313  7.40  21  21   0   0    0  ...   1  484    59  5.99   
716 -2.3   6  19  0.240  6.31  31  31   0   1    0  ...   2  713    62  4.83   
717 -2.8   5  14  0.263  7.39  24  24   0   0    0  ...   3  469    57  6.17   

      WHIP    H9  HR9

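Use the chi-squared (χ²) test with `SelectKBest` to select the 10 attributes that are most strongly associated with the Cy Young placement, separately for each league. The attributes are min-max scaled first because the chi-squared test requires non-negative inputs (and some attributes, such as WAR, can be negative).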

In [7]:
# Number of features kept per league by the chi-squared selection below.
TOP_K_FEATURES = 10


def scale_features(features):
    """Fit a MinMaxScaler on `features` and return it with the scaled data.

    Scaling is needed because chi2 only accepts non-negative feature values,
    while the raw statistics contain negatives (e.g. WAR).

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature columns of one league.

    Returns
    -------
    tuple
        ``(scaler, scaled)`` where ``scaler`` is the fitted MinMaxScaler and
        ``scaled`` is the array returned by ``fit_transform``.
    """
    scaler = MinMaxScaler()
    return scaler, scaler.fit_transform(features)


def select_top_features(scaled, target, feature_names, k=TOP_K_FEATURES):
    """Keep the `k` best-scoring features according to the chi-squared test.

    Parameters
    ----------
    scaled : array-like
        Scaled (non-negative) feature matrix.
    target : array-like
        Target values aligned with the rows of `scaled`.
    feature_names : pandas.Index
        Column labels of the unscaled frame, in the same order as the columns
        of `scaled`.
    k : int, optional
        Number of features to keep.

    Returns
    -------
    tuple
        ``(selector, reduced, indices, names)``: the fitted SelectKBest, the
        reduced feature matrix, the integer positions of the kept columns and
        their labels.
    """
    selector = SelectKBest(chi2, k=k)
    reduced = selector.fit_transform(scaled, target)
    indices = selector.get_support(indices=True)
    return selector, reduced, indices, feature_names[indices]


scaler_nl, X_nl_scaled = scale_features(x_nl)
scaler_al, X_al_scaled = scale_features(x_al)

#NL feature selection
selector_nl, X_new_nl, selected_indices_nl, selected_features_nl = select_top_features(
    X_nl_scaled, y_nl, x_nl.columns
)

#AL feature selection
selector_al, X_new_al, selected_indices_al, selected_features_al = select_top_features(
    X_al_scaled, y_al, x_al.columns
)

print(f"Top {TOP_K_FEATURES} features affecting Cy Young prediction (NL):")
print(selected_features_nl)

print(f"\nTop {TOP_K_FEATURES} features affecting Cy Young prediction (AL):")
print(selected_features_al)

Top 10 features affecting Cy Young prediction (NL):
Index(['WAR', 'W', 'ERA', 'CG', 'SHO', 'IP', 'SO', 'BF', 'ERA+', 'SO/BB'], dtype='object')

Top 10 features affecting Cy Young prediction (AL):
Index(['WAR', 'W', 'CG', 'SHO', 'IP', 'SO', 'BK', 'BF', 'ERA+', 'SO/BB'], dtype='object')


Use a `RandomForestClassifier` to rank the attributes by importance (background research showed that this is an effective way to rank attributes).

In [8]:
def rank_feature_importance(scaled, target, feature_names, random_state=42):
    """Fit a random forest and rank the features by their importance.

    Parameters
    ----------
    scaled : array-like
        Scaled feature matrix of one league.
    target : array-like
        Target values aligned with the rows of `scaled`.
    feature_names : pandas.Index
        Column labels matching the columns of `scaled`.
    random_state : int, optional
        Seed passed to RandomForestClassifier so the ranking is reproducible.

    Returns
    -------
    tuple
        ``(model, importances, ranking)``: the fitted classifier, its
        ``feature_importances_`` array and a DataFrame with the columns
        'Feature' and 'Importance', sorted by descending importance.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(scaled, target)
    importances = model.feature_importances_
    ranking = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    return model, importances, ranking


# NL model
rf_model_nl, importances_nl, feature_importance_nl = rank_feature_importance(
    X_nl_scaled, y_nl, x_nl.columns
)

# AL model
rf_model_al, importances_al, feature_importance_al = rank_feature_importance(
    X_al_scaled, y_al, x_al.columns
)

# Display
print("Feature Importance Weights (NL):")
print(feature_importance_nl)

print("\nFeature Importance Weights (AL):")
print(feature_importance_al)

Feature Importance Weights (NL):
   Feature  Importance
0      WAR    0.120687
18      SO    0.067645
23    ERA+    0.063782
24     FIP    0.057913
25    WHIP    0.051700
11      IP    0.050659
4      ERA    0.049656
1        W    0.047850
22      BF    0.044928
30   SO/BB    0.036260
3     W-L%    0.034347
26      H9    0.031273
12       H    0.031097
13       R    0.029250
14      ER    0.027505
27     HR9    0.027441
28     BB9    0.027113
16      BB    0.027048
29     SO9    0.025958
6       GS    0.025738
15      HR    0.021177
19     HBP    0.020492
21      WP    0.019299
5        G    0.016881
2        L    0.011966
17     IBB    0.009556
8       CG    0.009018
9      SHO    0.006767
20      BK    0.006702
7       GF    0.000147
10      SV    0.000143

Feature Importance Weights (AL):
   Feature  Importance
0      WAR    0.122775
18      SO    0.102079
11      IP    0.068911
24     FIP    0.068113
22      BF    0.056652
23    ERA+    0.051794
4      ERA    0.046291
1        W   