In [None]:
import itertools
import pickle
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBRFRegressor




In [None]:
# Load the raw table; later cells read its Vertical, Ht, Wt and 40yd columns
combine_data_path = "./combine_2010_2025"
df = pd.read_csv(combine_data_path)


In [None]:
# Optional (currently disabled): uncomment the filter below to exclude positions with heavier players
# df = df[df["Pos"].isin(["WR", "CB", "RB", "FS", "OLB", "ILB", "SS", "TE"])]

In [None]:
# Prepare data: raw measurements plus two height-normalised features and the target
data = (
    df[["Vertical", "Ht", "Wt"]]
    .assign(**{
        "Relative-mass": df["Wt"] / df["Ht"] ** 3,
        "Relative-vertical": df["Vertical"] / df["Ht"],
        "40yd": df["40yd"],
    })
    # Drop every row that is missing any of the features or the target
    .dropna()
)

# Target is the 40yd column; everything else is a candidate feature
y = data["40yd"]
X_original = data.drop(columns=["40yd"])


In [None]:
# Determine potential feature combinations
feature_names = X_original.columns
# Every non-empty subset of the feature columns, from single columns up to and
# including the full feature set (the upper bound of range() is exclusive, hence +1).
feature_combinations = [
    combination
    for subset_size in range(1, len(feature_names) + 1)
    for combination in itertools.combinations(feature_names, subset_size)
]


In [None]:
# Pairwise correlations between the candidate features and the 40yd target
data.corr()

In [None]:
# Model definition
# Each entry is an unfitted GridSearchCV template; later cells clone one per feature combination.

# Fixed seed so the stochastic estimators (forests, boosting, MLP) give repeatable searches
RANDOM_STATE = 42

# Polynomial degrees tried by every polynomial pipeline
polynomial_degree_grid = {"polynomialfeatures__degree": [2, 3, 4, 5, 6]}

# Linear models: the empty grid means one candidate, so the search only provides a cross-validated score
linear_regression = GridSearchCV(LinearRegression(), param_grid={})
lasso = GridSearchCV(LassoCV(), param_grid={})
ridge = GridSearchCV(RidgeCV(), param_grid={})
elastic_net = GridSearchCV(ElasticNetCV(), param_grid={})

# Polynomial variants of the linear models; only the polynomial degree is searched
polynomial_regression = GridSearchCV(make_pipeline(PolynomialFeatures(), LinearRegression()),
                                     param_grid=polynomial_degree_grid)
polynomial_lasso = GridSearchCV(make_pipeline(PolynomialFeatures(), LassoCV(max_iter=100000)),
                                param_grid=polynomial_degree_grid)
polynomial_ridge = GridSearchCV(make_pipeline(PolynomialFeatures(), RidgeCV()),
                                param_grid=polynomial_degree_grid)
polynomial_elastic_net = GridSearchCV(make_pipeline(PolynomialFeatures(), ElasticNetCV(max_iter=100000)),
                                      param_grid=polynomial_degree_grid)

# Tree ensembles
random_forest = GridSearchCV(RandomForestRegressor(random_state=RANDOM_STATE), {
    "n_estimators": [1, 10, 50, 100, 200],
    "max_depth": [5, 7, 10, 15, None],
    "max_features": ["sqrt", None],
    "bootstrap": [False, True]
})

extra_trees = GridSearchCV(ExtraTreesRegressor(random_state=RANDOM_STATE), {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [5, 7, 10, 15, None],
    "max_features": ["sqrt", None],
    "bootstrap": [False, True]
})

xgb = GridSearchCV(XGBRegressor(random_state=RANDOM_STATE), {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [5, 7, 10, None],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "tree_method": ["auto", "approx"]
})

xgbrf = GridSearchCV(XGBRFRegressor(random_state=RANDOM_STATE), {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [5, 7, 10, 15, None],
    "tree_method": ["auto", "approx"]
})

# Neural network: max_iter is a single-value grid entry, i.e. fixed for every candidate
mlp = GridSearchCV(MLPRegressor(random_state=RANDOM_STATE), {
    "max_iter": [20000],
    "solver": ["lbfgs", "adam"],
    "hidden_layer_sizes": [(100, 50, 20), (200, 100, 50), (200, 100, 50, 25), (120, 90, 68, 51), (120, 90, 68, 51, 39, 25)]
})


In [None]:
# Columns represent GridSearches for different models
# Rows represent feature combinations used
n_combinations = len(feature_combinations)

# Column name -> unfitted search template; insertion order fixes the column order
search_templates = {
    "LinearRegression": linear_regression,
    "LassoCV": lasso,
    "RidgeCV": ridge,
    "ElasticNetCV": elastic_net,
    "PolynomialRegression": polynomial_regression,
    "PolynomialLasso": polynomial_lasso,
    "PolynomialRidge": polynomial_ridge,
    "PolynomialElasticNet": polynomial_elastic_net,
    "RandomForestRegressor": random_forest,
    "ExtraTreesRegressor": extra_trees,
    "XGBRegressor": xgb,
    "XGBRFRegressor": xgbrf,
    "MLPRegressor": mlp,
}

# One independent clone of each template per feature combination
models_df = pd.DataFrame({
    column_name: [clone(template) for _ in range(n_combinations)]
    for column_name, template in search_templates.items()
})

# One row per feature combination, each paired with its own scaler
features_df = pd.DataFrame({"Features": feature_combinations})
features_df["Scaler"] = [StandardScaler() for _ in feature_combinations]

# Fit each scaler on exactly the columns of its combination.
# NOTE(review): the scalers see the full data set, i.e. they are fit before
# GridSearchCV splits the data into folds.
for combination, combination_scaler in zip(features_df["Features"], features_df["Scaler"]):
    combination_scaler.fit(X_original[list(combination)])  # type: ignore


In [None]:


def search_models(models_df: pd.DataFrame) -> None:
    """Fit every search object in ``models_df`` in place and print its best CV score.

    Each row of ``models_df`` corresponds to one feature combination and each
    column to one model family. The row's index label is used to look up the
    feature tuple and the already-fitted scaler in the notebook-level
    ``features_df``; the matching columns of the notebook-level ``X_original``
    are scaled and, together with the notebook-level ``y``, passed to ``fit``.

    Args:
        models_df: Frame whose cells are unfitted search objects exposing
            ``fit(X, y)`` and, once fitted, ``cv_results_["mean_test_score"]``.
            Its index labels must exist in ``features_df``.

    Returns:
        None. The objects stored in ``models_df`` are fitted as a side effect.
    """
    n_rows = models_df.shape[0]
    # Count positions rather than printing the index label, so that a filtered
    # slice (whose labels are not 0..n-1) still reports sensible progress.
    for position, i in enumerate(models_df.index, start=1):
        print(f"{position}/{n_rows}")
        features = features_df.loc[i, "Features"]
        scaler = features_df.loc[i, "Scaler"]
        X = X_original[list(features)].copy() # type: ignore
        X = scaler.transform(X) # type: ignore
        for col in models_df.columns:
            # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments
            start_time = time.perf_counter()
            model = models_df.loc[i, col]
            model.fit(X, y) # type: ignore
            print(f"Training {col} complete in {round(time.perf_counter() - start_time, 2)} seconds")
            # Best mean cross-validated score over all candidates of this search
            score = model.cv_results_["mean_test_score"].max() # type: ignore
            print(f"Score: {score}")
            print()
    print()


In [None]:
# Take some fast models first, to see which feature combinations yield good results
fast_model_names = [
    "LinearRegression",
    "RidgeCV",
    "ElasticNetCV",
    "PolynomialRegression",
    "PolynomialRidge",
    "PolynomialElasticNet",
    "XGBRegressor",
]
fast_models_slice = models_df[fast_model_names]

# Remaining, slower models will only be trained on the best feature combinations
remaining_models_slice = models_df.drop(columns=fast_model_names)



In [None]:
# Fit every fast search on every feature combination
search_models(fast_models_slice)

In [None]:
# Select only feature combinations that yield scores above a threshold
# Best mean CV score reached by any fast model, one entry per feature combination.
# Computed once and reused for both the threshold and the selection.
scores = [
    fast_models_slice.loc[i, :].apply(lambda s: s.cv_results_["mean_test_score"].max()).max()
    for i in fast_models_slice.index
]

# Threshold: the 90th percentile of those per-combination best scores
percentile_90 = np.percentile(scores, 90)

# Keep the index labels of the combinations that score strictly above the threshold
chosen_combination_indices = [
    i
    for i, best_score in zip(fast_models_slice.index, scores)
    if best_score > percentile_90  # type: ignore
]

features_df.loc[chosen_combination_indices]


        

In [None]:
# Fit the slower searches only on the shortlisted feature combinations
chosen_indices_remaining_models_slice = remaining_models_slice.loc[chosen_combination_indices]
search_models(chosen_indices_remaining_models_slice)

In [None]:
# Keep only the shortlisted rows. The feature/scaler table must come from
# features_df (not models_df) so the two frames describe the same rows.
# reset_index() keeps the original row label in an "index" column.
final_features_df = features_df.loc[chosen_combination_indices].reset_index()
final_models_df = models_df.loc[chosen_combination_indices].reset_index()

In [None]:


# Persist the shortlisted feature combinations together with their searches
output_path = Path("pickled_models/model_selection.pickle")
# Create the target directory first so a missing folder cannot discard the fitted models
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "wb") as file:
    pickle.dump((final_features_df, final_models_df), file)