In [1]:
import pandas as pd
import numpy as np

import itertools

import pickle

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
import time




In [2]:
df = pd.read_csv("./combine_2010_2025")


In [3]:
# Excludes positions with heavier players
# df = df[df["Pos"].isin(["WR", "CB", "RB", "FS", "OLB", "ILB", "SS", "TE"])]

In [4]:
# Prepare data
data = df[["Vertical", "Ht", "Wt"]].copy()
data["Relative-mass"] = df["Wt"] / df["Ht"] ** 3
data["Relative-vertical"] = df["Vertical"] / df["Ht"]
data["40yd"] = df["40yd"]
data = data.dropna()

X_original, y = data.drop(["40yd"], axis="columns"), data["40yd"]


In [5]:
# Determine potential feature combinations
feature_names = X_original.columns
feature_combinations = []
for i in range(1, len(feature_names)):
    i_length_combinations =itertools.combinations(feature_names, i)
    for combination in i_length_combinations:
        feature_combinations.append(combination)


In [6]:
data.corr()

Unnamed: 0,Vertical,Ht,Wt,Relative-mass,Relative-vertical,40yd
Vertical,1.0,-0.4091,-0.638525,-0.557512,0.971428,-0.750982
Ht,-0.4091,1.0,0.717502,0.195373,-0.611627,0.589288
Wt,-0.638525,0.717502,1.0,0.820546,-0.734143,0.862213
Relative-mass,-0.557512,0.195373,0.820546,1.0,-0.529799,0.723959
Relative-vertical,0.971428,-0.611627,-0.734143,-0.529799,1.0,-0.797393
40yd,-0.750982,0.589288,0.862213,0.723959,-0.797393,1.0


In [19]:
# Model definition
linear_regression = GridSearchCV(LinearRegression(), param_grid={})
lasso = GridSearchCV(LassoCV(max_iter = 1000000), param_grid={})
ridge = GridSearchCV(RidgeCV(), param_grid={})
elastic_net = GridSearchCV(ElasticNetCV(max_iter = 1000000), param_grid={})
polynomial_regression = GridSearchCV(make_pipeline(PolynomialFeatures(), LinearRegression()),
                                     param_grid={"polynomialfeatures__degree": [2, 3, 4, 5, 6]})
polynomial_lasso = GridSearchCV(make_pipeline(PolynomialFeatures(), LassoCV(max_iter=100000)),
                                     param_grid={"polynomialfeatures__degree": [2, 3, 4, 5, 6]})
polynomial_ridge = GridSearchCV(make_pipeline(PolynomialFeatures(), RidgeCV()),
                                     param_grid={"polynomialfeatures__degree": [2, 3, 4, 5, 6]})
polynomial_elastic_net = GridSearchCV(make_pipeline(PolynomialFeatures(), ElasticNetCV(max_iter=100000)),
                                     param_grid={"polynomialfeatures__degree": [2, 3, 4, 5, 6]})

random_forest = GridSearchCV(RandomForestRegressor(), {
    "n_estimators": [1, 10, 50, 100, 200],
    "max_depth": [5, 7, 10, 15, None],
    "max_features": ["sqrt", None],
    "bootstrap": [False, True]
})

extra_trees = GridSearchCV(ExtraTreesRegressor(), {
    "n_estimators": [10, 50, 100, 200],    
    "max_depth": [5, 7, 10, 15, None],
    "max_features": ["sqrt", None],
    "bootstrap": [False, True]
})

xgb = GridSearchCV(XGBRegressor(), {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [5, 7, 10, None],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "tree_method": ["auto", "approx"]
})

xgbrf = GridSearchCV(XGBRFRegressor(), {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [5, 7, 10, 15, None],
    "tree_method": ["auto", "approx"]
})

mlp = GridSearchCV(MLPRegressor(), {
    ""
    "max_iter": [20000],
    "solver": ["lbfgs", "adam"],
    "hidden_layer_sizes": [(100, 50, 20), (200, 100, 50), (200, 100, 50, 25), (120, 90, 68, 51), (120, 90, 68, 51, 39, 25)]
})


In [8]:
# Columns represent GridSearches for different models
# Rows represent feature combinations used
models_df = pd.DataFrame({
    "LinearRegression": [clone(linear_regression) for _ in range(len(feature_combinations))],
    "LassoCV": [clone(lasso) for _ in range(len(feature_combinations))],
    "RidgeCV": [clone(ridge) for _ in range(len(feature_combinations))],
    "ElasticNetCV": [clone(elastic_net) for _ in range(len(feature_combinations))],
    "PolynomialRegression": [clone(polynomial_regression) for _ in range(len(feature_combinations))],
    "PolynomialLasso": [clone(polynomial_lasso) for _ in range(len(feature_combinations))],
    "PolynomialRidge": [clone(polynomial_ridge) for _ in range(len(feature_combinations))],
    "PolynomialElasticNet": [clone(polynomial_elastic_net) for _ in range(len(feature_combinations))],
    "RandomForestRegressor": [clone(random_forest) for _ in range(len(feature_combinations))],
    "ExtraTreesRegressor": [clone(extra_trees) for _ in range(len(feature_combinations))],
    "XGBRegressor": [clone(xgb) for _ in range(len(feature_combinations))],
    "XGBRFRegressor": [clone(xgbrf) for _ in range(len(feature_combinations))],
    "MLPRegressor": [clone(mlp) for _ in range(len(feature_combinations))]
})

features_df = pd.DataFrame({
    "Features": feature_combinations
})
features_df["Scaler"] = features_df["Features"].apply(lambda _: StandardScaler()) # type: ignore
for i in features_df.index:
    features = features_df.loc[i, "Features"]
    scaler = features_df.loc[i, "Scaler"]
    X = X_original[list(features)].copy() # type: ignore
    scaler.fit(X) # type: ignore


In [9]:


def search_models(models_df: pd.DataFrame) -> None:
    for i in models_df.index:
        print(f"{i}/{models_df.shape[0]}")
        features = features_df.loc[i, "Features"]
        scaler = features_df.loc[i, "Scaler"]
        X = X_original[list(features)].copy() # type: ignore
        X = scaler.transform(X) # type: ignore
        for col in models_df.columns:
            start_time = time.time()
            model = models_df.loc[i, col]
            model.fit(X, y) # type: ignore
            print(f"Training {col} complete in {round(time.time() - start_time, 2)} seconds")
            score = model.cv_results_["mean_test_score"].max() # type: ignore
            print(f"Score: {score}")
            print()
    print()


In [25]:
# Take some fast models first, to see which feature combinations yield good results
fast_models_slice = models_df.loc[:, [
    "LinearRegression",
    "LassoCV",
    "RidgeCV", 
    "ElasticNetCV", 
    "PolynomialRegression", 
    "PolynomialRidge", 
    "PolynomialElasticNet", 
    "XGBRegressor"
]]

# Remaining, slower models will only be trained on the best feature combinations
remaining_models_slice = models_df.drop(fast_models_slice.columns, axis="columns")



In [23]:
search_models(fast_models_slice)

0/30
Training LinearRegression complete in 0.02 seconds
Score: 0.5590128028270787

Training LassoCV complete in 0.42 seconds
Score: 0.5590315084644141

Training RidgeCV complete in 0.02 seconds
Score: 0.5590142115238448

Training ElasticNetCV complete in 0.37 seconds
Score: 0.5590279299151237

1/30
Training LinearRegression complete in 0.02 seconds
Score: 0.333792329210553

Training LassoCV complete in 0.41 seconds
Score: 0.3337946090245477

Training RidgeCV complete in 0.03 seconds
Score: 0.3337959512189518

Training ElasticNetCV complete in 0.39 seconds
Score: 0.33378426709877757

2/30
Training LinearRegression complete in 0.02 seconds
Score: 0.7378105317074845

Training LassoCV complete in 0.4 seconds
Score: 0.7378126477464964

Training RidgeCV complete in 0.03 seconds
Score: 0.7378106591910365

Training ElasticNetCV complete in 0.46 seconds
Score: 0.7378133203764602

3/30
Training LinearRegression complete in 0.02 seconds
Score: 0.5187916473244113

Training LassoCV complete in 0.38

In [26]:
# Select only feature combinations that yield scores above a threshold
chosen_combination_indices = []
scores = []
for i in fast_models_slice.index:
    scores.append(fast_models_slice.loc[i, :].apply(lambda s: s.cv_results_["mean_test_score"].max()).max())

percentile_90 = np.percentile(scores, 90)

for i in fast_models_slice.index:
    if fast_models_slice.loc[i, :].apply(lambda s: s.cv_results_["mean_test_score"].max()).max() > percentile_90: # type: ignore
        chosen_combination_indices.append(i)

features_df.loc[chosen_combination_indices]


        

Unnamed: 0,Features,Scaler
6,"(Vertical, Wt)",StandardScaler()
19,"(Vertical, Wt, Relative-vertical)",StandardScaler()
24,"(Wt, Relative-mass, Relative-vertical)",StandardScaler()


In [13]:
chosen_indices_remaining_models_slice = remaining_models_slice.loc[chosen_combination_indices]
search_models(chosen_indices_remaining_models_slice)

6/3
Training LassoCV complete in 0.45 seconds
Score: 0.807463355429255

Training PolynomialLasso complete in 2.52 seconds
Score: 0.8194903790859442

Training RandomForestRegressor complete in 181.37 seconds
Score: 0.8172862697398356

Training ExtraTreesRegressor complete in 101.55 seconds
Score: 0.818950249160254

Training XGBRFRegressor complete in 46.32 seconds
Score: 0.8171775353722142

Training MLPRegressor complete in 5616.29 seconds
Score: 0.8166783190990093

19/3
Training LassoCV complete in 0.46 seconds
Score: 0.8082820101818861

Training PolynomialLasso complete in 6.97 seconds
Score: 0.819351175437544

Training RandomForestRegressor complete in 199.03 seconds
Score: 0.8159659696483725

Training ExtraTreesRegressor complete in 105.81 seconds
Score: 0.8187373445519162

Training XGBRFRegressor complete in 78.3 seconds
Score: 0.8155271196302895



STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Training MLPRegressor complete in 8319.27 seconds
Score: 0.8158701049229247

24/3
Training LassoCV complete in 0.51 seconds
Score: 0.8036930857138979

Training PolynomialLasso complete in 4.66 seconds
Score: 0.818892137200061

Training RandomForestRegressor complete in 250.03 seconds
Score: 0.8151174373957151

Training ExtraTreesRegressor complete in 101.3 seconds
Score: 0.8185543709887753

Training XGBRFRegressor complete in 98.46 seconds
Score: 0.8144449354953032

Training MLPRegressor complete in 7941.96 seconds
Score: 0.8204801177135073




In [27]:
final_features_df = models_df.loc[chosen_combination_indices].reset_index()
final_models_df = models_df.loc[chosen_combination_indices].reset_index()

In [28]:


with open("pickled_models/model_selection.pickle", "wb") as file:
    pickle.dump((final_features_df, final_models_df), file)