In [1]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE, chi2
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("working_data.csv")

# Separate the player descriptors (first 3 columns) and keep them in a separate DataFrame
player_descriptors = df.iloc[:, :3]

# Extract the relevant columns: league and statistics
league = df['Lg'] 
cy_young_place = df['Cy_young']  
X = df.iloc[:, 3:-2]  # All columns between the descriptors and the target column
print(X)




     Lg  WAR   W   L   W-L%   ERA   G  GS  GF  CG  ...  WP   BF  ERA+   FIP  \
0    AL  4.3  11   9  0.550  3.53  31  31   0   1  ...  12  720   124  3.79   
1    NL  5.1  14   7  0.667  3.25  33  33   0   1  ...   4  835   138  3.49   
2    NL  4.6  15   9  0.625  2.90  32  32   0   0  ...   6  787   136  3.03   
3    NL  3.7  14   8  0.636  3.57  33  33   0   1  ...   2  820   115  3.94   
4    NL  3.0  12   6  0.667  3.47  32  32   0   0  ...   4  738   112  3.83   
..   ..  ...  ..  ..    ...   ...  ..  ..  ..  ..  ...  ..  ...   ...   ...   
713  NL -1.4   3   8  0.273  6.80  20  20   0   0  ...   3  417    61  5.86   
714  NL -1.5   4  12  0.250  5.92  29  24   4   0  ...   0  567    68  4.98   
715  NL -1.9   5  11  0.313  7.40  21  21   0   0  ...   1  484    59  5.99   
716  NL -2.3   6  19  0.240  6.31  31  31   0   1  ...   2  713    62  4.83   
717  NL -2.8   5  14  0.263  7.39  24  24   0   0  ...   3  469    57  6.17   

      WHIP    H9  HR9  BB9   SO9  SO/BB  
0    1.10

In [13]:
# Convert League to numeric (0 for AL, 1 for NL)
league_encoded = league.map({'AL': 0, 'NL': 1})

# Add the encoded league as a feature to X
X['Lg'] = league_encoded


In [5]:
# Convert League to numeric (0 for AL, 1 for NL)
league_encoded = league.map({'AL': 0, 'NL': 1})

# Add the encoded league as a feature to X
X['Lg'] = league_encoded

In [6]:
y = cy_young_place # Target variable is Cy Young place
print("attributes: \n ", X)
print("target: \n", y)


attributes: 
       Lg  WAR   W   L   W-L%   ERA   G  GS  GF  CG  ...  WP   BF  ERA+   FIP  \
0     0  4.3  11   9  0.550  3.53  31  31   0   1  ...  12  720   124  3.79   
1     1  5.1  14   7  0.667  3.25  33  33   0   1  ...   4  835   138  3.49   
2     1  4.6  15   9  0.625  2.90  32  32   0   0  ...   6  787   136  3.03   
3     1  3.7  14   8  0.636  3.57  33  33   0   1  ...   2  820   115  3.94   
4     1  3.0  12   6  0.667  3.47  32  32   0   0  ...   4  738   112  3.83   
..   ..  ...  ..  ..    ...   ...  ..  ..  ..  ..  ...  ..  ...   ...   ...   
713   1 -1.4   3   8  0.273  6.80  20  20   0   0  ...   3  417    61  5.86   
714   1 -1.5   4  12  0.250  5.92  29  24   4   0  ...   0  567    68  4.98   
715   1 -1.9   5  11  0.313  7.40  21  21   0   0  ...   1  484    59  5.99   
716   1 -2.3   6  19  0.240  6.31  31  31   0   1  ...   2  713    62  4.83   
717   1 -2.8   5  14  0.263  7.39  24  24   0   0  ...   3  469    57  6.17   

      WHIP    H9  HR9  BB9   SO9  SO

Use the chiSquared feature to select 5 attributes, these attributes have the most effect on the dataset.

In [7]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Select the top 5 features using SelectKBest with chi-squared test
selector = SelectKBest(chi2, k=10)
X_new = selector.fit_transform(X_scaled, y)

# Get the selected feature names
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]

print("Top 5 features affecting Cy Young prediction using SelectKBest with Chi-squared test:")
print(selected_features)

Top 5 features affecting Cy Young prediction using SelectKBest with Chi-squared test:
Index(['WAR', 'W', 'ERA', 'CG', 'SHO', 'IP', 'SO', 'BF', 'ERA+', 'SO/BB'], dtype='object')


Use the RandomForestClassifier(Did research and it showed that this was an effective way to rank attributes)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y)

# Get the feature importances (how much each feature contributes to the model)
importances = rf_model.feature_importances_

# Create a DataFrame to display the feature names and their corresponding importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the weighted features
print("Feature Importance Weights (sorted):")
print(feature_importance_df)


Feature Importance Weights (sorted):
   Feature  Importance
1      WAR    0.106045
25     FIP    0.077295
19      SO    0.071539
12      IP    0.069249
5      ERA    0.062902
24    ERA+    0.060072
23      BF    0.047883
2        W    0.045728
26    WHIP    0.037830
31   SO/BB    0.033313
27      H9    0.030633
30     SO9    0.028154
14       R    0.026694
13       H    0.026671
4     W-L%    0.026289
22      WP    0.026217
28     HR9    0.024227
15      ER    0.023849
7       GS    0.021656
17      BB    0.021028
20     HBP    0.020735
16      HR    0.019794
29     BB9    0.019739
3        L    0.015957
6        G    0.013229
18     IBB    0.010596
9       CG    0.009340
21      BK    0.008840
0       Lg    0.006981
10     SHO    0.006811
8       GF    0.000644
11      SV    0.000058
