In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler 

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

In [119]:
#number of individuals before NA is removed
print(len(df))

544


In [120]:
#this is the number of people we have when race IS INCLUDED (race removed will give us a total of 490 people)
df = df.dropna()
print(len(df))

422


In [121]:
#counting the total number of individuals who are white or black (ignore the other races for now...)
race = (df['COMB1PF7A'] == 1) | (df['COMB1PF7A'] == 2)
df = df[race]

print(len(df))

413


# MLR

In [26]:
#MLR Function

def train_test_linear_regression(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # TRAINING
    print("\nTRAINING Metrics")
    X_train_const = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_const)
    results = model.fit()
    print(results.summary())

    # TEST
    print("\nTEST Metrics")
    mlr = LinearRegression()
    mlr.fit(X_train, y_train)

    print("\nCoefficients: ")
    coefficients = pd.Series(mlr.coef_, index=X.columns)
    print(coefficients)

    print("\nIntercept: ")
    print(mlr.intercept_)

    test_pred = mlr.predict(X_test)

    mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
    mlr_diff.head()

    mse_test = metrics.mean_squared_error(y_test, test_pred)
    rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
    r2_test = metrics.r2_score(y_test, test_pred)

    print("MSE:", mse_test)
    print("RMSE:", rmse_test)
    print("R2:", r2_test, "\n")
    
    #count the frequencies in training data
    white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
    white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

    black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
    print("white train frequency: ", white_train_counts.values[0])
    print("white test frequency: ", white_test_counts.values[0], "\n")
    
    print("black train frequency: ", black_train_counts.values[0])
    print("black test frequency: ", black_test_counts.values[0])

In [32]:
print("MLR: DXA Model, Handgrip Strength (Arms)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

MLR: DXA Model, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.506
Method:                 Least Squares   F-statistic:                     43.17
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.25e-46
Time:                        22:25:46   Log-Likelihood:                -353.33
No. Observations:                 330   AIC:                             724.7
Df Residuals:                     321   BIC:                             758.8
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [34]:
print("MLR: DXA Model, Handgrip Strength (TB)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

MLR: DXA Model, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.522
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     34.79
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.80e-45
Time:                        22:26:03   Log-Likelihood:                -352.16
No. Observations:                 330   AIC:                             726.3
Df Residuals:                     319   BIC:                             768.1
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [35]:
print("MLR: DXA Model, Jump Power (Legs)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')


MLR: DXA Model, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.742
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     115.3
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.18e-89
Time:                        22:26:08   Log-Likelihood:                 93.457
No. Observations:                 330   AIC:                            -168.9
Df Residuals:                     321   BIC:                            -134.7
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [36]:
print("MLR: DXA Model, Jump Power (TB)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')


MLR: DXA Model, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.761
Method:                 Least Squares   F-statistic:                     105.6
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           6.25e-95
Time:                        22:26:13   Log-Likelihood:                 111.08
No. Observations:                 330   AIC:                            -200.2
Df Residuals:                     319   BIC:                            -158.4
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [37]:
print("MLR: BIS Model, Handgrip Strength")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')


MLR: BIS Model, Handgrip Strength

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.527
Method:                 Least Squares   F-statistic:                     29.24
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.13e-46
Time:                        22:26:17   Log-Likelihood:                -343.52
No. Observations:                 330   AIC:                             715.0
Df Residuals:                     316   BIC:                             768.2
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

In [38]:
print("MLR: BIS Model, Jump Power")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')


MLR: BIS Model, Jump Power

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.781
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     86.80
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           6.55e-96
Time:                        22:26:24   Log-Likelihood:                 120.76
No. Observations:                 330   AIC:                            -213.5
Df Residuals:                     316   BIC:                            -160.3
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
con

In [39]:
print("MLR: Combo Models, Handgrip Strength (Arms)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')


MLR: Combo Models, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.529
Method:                 Least Squares   F-statistic:                     22.75
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           7.20e-45
Time:                        22:26:40   Log-Likelihood:                -340.81
No. Observations:                 330   AIC:                             717.6
Df Residuals:                     312   BIC:                             786.0
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [40]:
print("MLR: Combo Models, Handgrip Strength (TB)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')


MLR: Combo Models, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                     21.11
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.15e-43
Time:                        22:26:46   Log-Likelihood:                -342.12
No. Observations:                 330   AIC:                             722.2
Df Residuals:                     311   BIC:                             794.4
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [41]:
print("MLR: Combo Models, Jump Power (Legs)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')


MLR: Combo Models, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.800
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     73.61
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           3.04e-98
Time:                        22:26:52   Log-Likelihood:                 135.92
No. Observations:                 330   AIC:                            -235.8
Df Residuals:                     312   BIC:                            -167.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [42]:
print("MLR: Combo Models, Jump Power (TB)")

train_test_linear_regression(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')


MLR: Combo Models, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.817
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     77.18
Date:                Sun, 22 Oct 2023   Prob (F-statistic):          4.10e-103
Time:                        22:26:58   Log-Likelihood:                 150.29
No. Observations:                 330   AIC:                            -262.6
Df Residuals:                     311   BIC:                            -190.4
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

# Random Forest

In [43]:
#Random Forest Function

def train_test_random_forest(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #random forest model
    param_grid = {
        'n_estimators': [5, 10, 20],
        'max_depth': [None, 1, 2, 4, 8, 16],
        'min_samples_split': [1, 2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': [1, 2, 4, 8, X_train.shape[1]]
    }

    rf_model = RandomForestRegressor(random_state=42)

    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_rf_model = grid_search.best_estimator_

    y_pred = best_rf_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("Best parameters:", best_params)
    print("Mean Squared Error:", mse)

    #fitting with BEST HYPERPARAMETERS
    rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
    rf.fit(X_train, y_train)

    #evaluation of the model on the TRAINING set
    y_train_pred = rf.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = rf.predict(X_test)

    #evaluation of the model on test set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    
    # Print the evaluation metrics
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
    print(feature_importances)
        
    #count the frequencies in training data
    white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
    white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

    black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
    print("white train frequency: ", white_train_counts.values[0])
    print("white test frequency: ", white_test_counts.values[0], "\n")
    
    print("black train frequency: ", black_train_counts.values[0])
    print("black test frequency: ", black_test_counts.values[0])

In [44]:
print("Random Forest: DXA Model, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (Arms)
Best parameters: {'max_depth': 4, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 1, 'n_estimators': 10}
Mean Squared Error: 0.44657126802326336

Train set metrics:
MSE: 0.39884262929731285
R-squared: 0.6144393124846439

Test set metrics:
MSE: 0.44657126802326336
R-squared: 0.53249886278416 

COMB1PF7A      0.004592
COMB4DALM      0.167855
COMB4DABM      0.263230
COMB4DAFM      0.044238
COMB1PRSEX     0.266296
COMB1PRAGE     0.033722
COMB4P1A       0.169481
COMB4DLR3MD    0.050587
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [45]:
print("Random Forest: DXA Model, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (TB)
Best parameters: {'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.5154613249487807

Train set metrics:
MSE: 0.35498232883997266
R-squared: 0.6568390119068344

Test set metrics:
MSE: 0.5154613249487807
R-squared: 0.46038007176094176 

COMB1PF7A        0.000331
COMB4IALM        0.079951
COMB4DTBBM       0.033275
COMB4DTBFM       0.034865
COMB1PRSEX       0.521460
COMB1PRAGE       0.030248
COMB4P1A         0.179256
COMB4DLR3MD      0.075413
COMB4DLFNMD      0.021641
COMB4DLSL14MD    0.023561
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [46]:
print("Random Forest: DXA Model, Jump Power (Legs)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (Legs)
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.0320661479872633

Train set metrics:
MSE: 0.009412340555681228
R-squared: 0.9268812809151483

Test set metrics:
MSE: 0.0320661479872633
R-squared: 0.7648259121460748 

COMB1PF7A      0.003438
COMB4ILLM      0.295291
COMB4DLBM      0.240072
COMB4DLFM      0.048140
COMB1PRSEX     0.011164
COMB1PRAGE     0.242393
COMB4P1A       0.125052
COMB4DLFNMD    0.034450
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [47]:
print("Random Forest: DXA Model, Jump Power (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (TB)
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.03683804220376537

Train set metrics:
MSE: 0.01128241386277427
R-squared: 0.9123538247526233

Test set metrics:
MSE: 0.03683804220376537
R-squared: 0.7298286973216734 

COMB1PF7A        0.001086
COMB4IALM        0.516556
COMB4DTBBM       0.106744
COMB4DTBFM       0.031383
COMB1PRSEX       0.001677
COMB1PRAGE       0.210139
COMB4P1A         0.049992
COMB4DLR3MD      0.028154
COMB4DLFNMD      0.023678
COMB4DLSL14MD    0.030592
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [48]:
print("Random Forest: BIS Model, Handgrip Strength")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

Random Forest: BIS Model, Handgrip Strength
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.5057526593300937

Train set metrics:
MSE: 0.2470638210624888
R-squared: 0.7611637028949205

Test set metrics:
MSE: 0.5057526593300937
R-squared: 0.4705437624024724 

COMB1PF7A       0.003704
COMB4IMECF      0.105537
COMB4IMICF      0.056053
COMB4IMFFM      0.045998
COMB4DTBFM      0.047665
COMB4IRES0      0.018222
COMB4IRESINF    0.025672
COMB4IRESEXC    0.017271
COMB4IRESINC    0.034851
COMB4IFCHAR     0.053925
COMB4IMCAP      0.061525
COMB1PRSEX      0.340812
COMB1PRAGE      0.050574
COMB4P1A        0.138191
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [49]:
print("Random Forest: BIS Model, Jump Power")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

Random Forest: BIS Model, Jump Power
Best parameters: {'max_depth': 16, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.045631931890548406

Train set metrics:
MSE: 0.009124185488330184
R-squared: 0.9291197814557817

Test set metrics:
MSE: 0.045631931890548406
R-squared: 0.6653340474934905 

COMB1PF7A       0.006428
COMB4IMECF      0.046096
COMB4IMICF      0.132052
COMB4IMFFM      0.087147
COMB4DTBFM      0.030691
COMB4IRES0      0.021064
COMB4IRESINF    0.038150
COMB4IRESEXC    0.023639
COMB4IRESINC    0.104663
COMB4IFCHAR     0.119930
COMB4IMCAP      0.162246
COMB1PRSEX      0.005981
COMB1PRAGE      0.131306
COMB4P1A        0.090609
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [50]:
print("Random Forest: Combo Models, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (Arms)
Best parameters: {'max_depth': 4, 'max_features': 18, 'min_samples_leaf': 4, 'min_samples_split': 20, 'n_estimators': 10}
Mean Squared Error: 0.5052210096742265

Train set metrics:
MSE: 0.3686615752374748
R-squared: 0.6436153009534565

Test set metrics:
MSE: 0.5052210096742265
R-squared: 0.47110032937512736 

COMB1PF7A       0.000000
COMB4DALM       0.032614
COMB4DABM       0.117028
COMB4DAFM       0.001651
COMB1PRSEX      0.597383
COMB1PRAGE      0.030702
COMB4P1A        0.058485
COMB4DLR3MD     0.034424
COMB4IMECF      0.013326
COMB4IMICF      0.020323
COMB4IMFFM      0.006799
COMB4DTBFM      0.021133
COMB4IRES0      0.000000
COMB4IRESINF    0.003970
COMB4IRESEXC    0.001865
COMB4IRESINC    0.006343
COMB4IFCHAR     0.023422
COMB4IMCAP      0.030530
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [51]:
print("Random Forest: Combo Models, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (TB)
Best parameters: {'max_depth': None, 'max_features': 20, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 10}
Mean Squared Error: 0.4749309162151855

Train set metrics:
MSE: 0.33852830359120845
R-squared: 0.6727450982208413

Test set metrics:
MSE: 0.4749309162151855
R-squared: 0.5028100566962725 

COMB1PF7A        0.000000
COMB4IALM        0.033566
COMB4DTBBM       0.029337
COMB4DTBFM       0.011467
COMB1PRSEX       0.627375
COMB1PRAGE       0.036862
COMB4P1A         0.054138
COMB4DLR3MD      0.032586
COMB4DLFNMD      0.017118
COMB4DLSL14MD    0.021187
COMB4IMECF       0.018543
COMB4IMICF       0.000957
COMB4IMFFM       0.007178
COMB4DTBFM       0.012871
COMB4IRES0       0.004131
COMB4IRESINF     0.003962
COMB4IRESEXC     0.004582
COMB4IRESINC     0.014221
COMB4IFCHAR      0.035849
COMB4IMCAP       0.034069
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequ

In [52]:
print("Random Forest: Combo Models, Jump Power (Legs)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (Legs)
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.04151959869142085

Train set metrics:
MSE: 0.008525769592119382
R-squared: 0.9337685086827769

Test set metrics:
MSE: 0.04151959869142085
R-squared: 0.6954940221009918 

COMB1PF7A       0.000891
COMB4ILLM       0.096925
COMB4DLBM       0.111705
COMB4DLFM       0.018994
COMB1PRSEX      0.027637
COMB1PRAGE      0.121553
COMB4P1A        0.058318
COMB4DLFNMD     0.017589
COMB4IMECF      0.021737
COMB4IMICF      0.098980
COMB4IMFFM      0.045864
COMB4DTBFM      0.017900
COMB4IRES0      0.022065
COMB4IRESINF    0.025409
COMB4IRESEXC    0.029209
COMB4IRESINC    0.056959
COMB4IFCHAR     0.095746
COMB4IMCAP      0.132519
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [53]:
print("Random Forest: Combo Models, Jump Power (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (TB)
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.043387997202312685

Train set metrics:
MSE: 0.0072925875517212526
R-squared: 0.9433483459888049

Test set metrics:
MSE: 0.043387997202312685
R-squared: 0.68179113157229 

COMB1PF7A        0.001211
COMB4IALM        0.230469
COMB4DTBBM       0.074969
COMB4DTBFM       0.022043
COMB1PRSEX       0.002638
COMB1PRAGE       0.119983
COMB4P1A         0.033868
COMB4DLR3MD      0.015868
COMB4DLFNMD      0.014839
COMB4DLSL14MD    0.015556
COMB4IMECF       0.010635
COMB4IMICF       0.077736
COMB4IMFFM       0.012394
COMB4DTBFM       0.013801
COMB4IRES0       0.022714
COMB4IRESINF     0.013724
COMB4IRESEXC     0.022371
COMB4IRESINC     0.026795
COMB4IFCHAR      0.073830
COMB4IMCAP       0.194555
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  2

# LASSO

In [124]:
#LASSO Function

def train_test_lasso(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create the Lasso regression model
    lasso_model = Lasso()

    # Define a parameter grid for hyperparameter tuning
    alphas = np.logspace(-4, 0, 100)
    max_iters = [10000, 20000, 30000, 50000]
    param_grid = {'alpha': alphas, 'max_iter': max_iters}

    # Perform grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    # Get the best hyperparameters from the grid search results
    best_alpha = grid_search.best_params_['alpha']
    best_max_iter = grid_search.best_params_['max_iter']

    print("Best Alpha:", best_alpha)
    print("Best Max Iterations:", best_max_iter)

    # Retrain the model with the entire training set using the best hyperparameters
    best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
    best_lasso_model.fit(X_train_scaled, y_train)
    
    # Evaluate the model on the test set
    y_pred = best_lasso_model.predict(X_test_scaled)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2_test_21 = metrics.r2_score(y_test, y_pred)

    # Evaluate the model on the train set
    y_pred_train = best_lasso_model.predict(X_train_scaled)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    r2_train_21 = metrics.r2_score(y_train, y_pred_train)

    print("\nTRAINING Set Metrics:")
    print("Mean Squared Error:", mse_train)
    print("R-squared:", r2_train_21)
    print("\nTEST Set Metrics:")
    print("Mean Squared Error:", mse)
    print("R-squared:", r2_test_21)

    # Get the feature importance (coefficients) from the Lasso model
    feature_importance = best_lasso_model.coef_

    # Create a DataFrame to associate each feature with its importance value
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

    # Sort the features based on their importance (absolute value of coefficients) in descending order
    importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)
    
    print(importance_df)
    
    #count the frequencies in training data
    white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
    white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

    black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
    print("white train frequency: ", white_train_counts.values[0])
    print("white test frequency: ", white_test_counts.values[0], "\n")
    
    print("black train frequency: ", black_train_counts.values[0])
    print("black test frequency: ", black_test_counts.values[0])

In [69]:
print("LASSO: DXA Model, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (Arms)
Best Alpha: 0.007924828983539177
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.4989804025875539
R-squared: 0.5176362481179676

TEST Set Metrics:
Mean Squared Error: 0.392553210019003
R-squared: 0.5890486351395714
       Feature  Importance
4   COMB1PRSEX   -0.246612
1    COMB4DALM    0.208893
6     COMB4P1A    0.164981
0    COMB1PF7A   -0.140794
3    COMB4DAFM   -0.130594
2    COMB4DABM    0.115591
5   COMB1PRAGE   -0.093575
7  COMB4DLR3MD   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [70]:
print("LASSO: DXA Model, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (TB)
Best Alpha: 0.010476157527896652
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.4974089369279115
R-squared: 0.519155382071141

TEST Set Metrics:
Mean Squared Error: 0.4171160566709989
R-squared: 0.5633345788056379
         Feature  Importance
4     COMB1PRSEX   -0.308693
1      COMB4IALM    0.235871
3     COMB4DTBFM   -0.192075
6       COMB4P1A    0.191922
0      COMB1PF7A   -0.128019
5     COMB1PRAGE   -0.067233
7    COMB4DLR3MD    0.047280
9  COMB4DLSL14MD   -0.021209
8    COMB4DLFNMD   -0.002756
2     COMB4DTBBM   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [71]:
print("LASSO: DXA Model, Jump Power (Legs)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (Legs)
Best Alpha: 0.002364489412645407
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.033262895076554326
R-squared: 0.7416009050391352

TEST Set Metrics:
Mean Squared Error: 0.03162560155841644
R-squared: 0.7680568928239696
       Feature  Importance
1    COMB4ILLM    0.207423
5   COMB1PRAGE   -0.155741
3    COMB4DLFM   -0.078487
2    COMB4DLBM    0.033405
4   COMB1PRSEX   -0.017261
6     COMB4P1A    0.009448
0    COMB1PF7A   -0.001918
7  COMB4DLFNMD    0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [72]:
print("LASSO: DXA Model, Jump Power (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (TB)
Best Alpha: 0.0021544346900318843
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03003332085126144
R-squared: 0.7666894926381393

TEST Set Metrics:
Mean Squared Error: 0.034907878272739924
R-squared: 0.7439845772879156
         Feature  Importance
1      COMB4IALM    0.195775
5     COMB1PRAGE   -0.138248
2     COMB4DTBBM    0.077994
9  COMB4DLSL14MD   -0.065943
3     COMB4DTBFM   -0.055462
7    COMB4DLR3MD    0.040228
0      COMB1PF7A   -0.016851
8    COMB4DLFNMD    0.008039
4     COMB1PRSEX   -0.000000
6       COMB4P1A   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [73]:
print("LASSO: BIS Model, Handgrip Strength")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

LASSO: BIS Model, Handgrip Strength
Best Alpha: 0.009545484566618348
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.4751581029317945
R-squared: 0.5406652363924997

TEST Set Metrics:
Mean Squared Error: 0.4232203045082896
R-squared: 0.5569442375317516
         Feature  Importance
13      COMB4P1A    0.307762
11    COMB1PRSEX   -0.281321
8   COMB4IRESINC   -0.259446
4     COMB4DTBFM   -0.151847
0      COMB1PF7A   -0.100740
9    COMB4IFCHAR   -0.068776
10    COMB4IMCAP   -0.062899
12    COMB1PRAGE   -0.037253
1     COMB4IMECF   -0.000000
2     COMB4IMICF   -0.000000
3     COMB4IMFFM   -0.000000
5     COMB4IRES0    0.000000
6   COMB4IRESINF    0.000000
7   COMB4IRESEXC    0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [74]:
print("LASSO: BIS Model, Jump Power")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

LASSO: BIS Model, Jump Power
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.02824004092239362
R-squared: 0.7806203879965998

TEST Set Metrics:
Mean Squared Error: 0.03444187016406068
R-squared: 0.7474022946867969
         Feature  Importance
6   COMB4IRESINF    0.620252
2     COMB4IMICF    0.538616
7   COMB4IRESEXC   -0.481319
1     COMB4IMECF   -0.378706
8   COMB4IRESINC   -0.207309
11    COMB1PRSEX   -0.175059
10    COMB4IMCAP   -0.154108
12    COMB1PRAGE   -0.118618
9    COMB4IFCHAR   -0.093582
13      COMB4P1A    0.039589
4     COMB4DTBFM   -0.039422
0      COMB1PF7A    0.005484
3     COMB4IMFFM    0.000000
5     COMB4IRES0   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [75]:
print("LASSO: Combo Models, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (Arms)
Best Alpha: 0.010476157527896652
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.47065463901894417
R-squared: 0.5450187295120752

TEST Set Metrics:
Mean Squared Error: 0.4019883127280184
R-squared: 0.5791713287339494
         Feature  Importance
6       COMB4P1A    0.245167
15  COMB4IRESINC   -0.224621
4     COMB1PRSEX   -0.224192
11    COMB4DTBFM   -0.159779
0      COMB1PF7A   -0.121956
2      COMB4DABM    0.102573
17    COMB4IMCAP   -0.075325
16   COMB4IFCHAR   -0.074986
1      COMB4DALM    0.049871
5     COMB1PRAGE   -0.043389
7    COMB4DLR3MD   -0.000000
8     COMB4IMECF   -0.000000
10    COMB4IMFFM   -0.000000
12    COMB4IRES0    0.000000
13  COMB4IRESINF    0.000000
14  COMB4IRESEXC    0.000000
3      COMB4DAFM    0.000000
9     COMB4IMICF   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [76]:
print("LASSO: Combo Models, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (TB)
Best Alpha: 0.015199110829529346
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.47810606552101953
R-squared: 0.5378154445217721

TEST Set Metrics:
Mean Squared Error: 0.42233477073360126
R-squared: 0.55787127443796
          Feature  Importance
6        COMB4P1A    0.308128
4      COMB1PRSEX   -0.275728
17   COMB4IRESINC   -0.221246
3      COMB4DTBFM   -0.133764
0       COMB1PF7A   -0.092055
18    COMB4IFCHAR   -0.037853
5      COMB1PRAGE   -0.031890
13     COMB4DTBFM   -0.014713
9   COMB4DLSL14MD   -0.011097
8     COMB4DLFNMD   -0.002270
7     COMB4DLR3MD    0.002207
14     COMB4IRES0    0.000000
16   COMB4IRESEXC    0.000000
15   COMB4IRESINF    0.000000
10     COMB4IMECF   -0.000000
12     COMB4IMFFM   -0.000000
11     COMB4IMICF   -0.000000
1       COMB4IALM    0.000000
2      COMB4DTBBM   -0.000000
19     COMB4IMCAP   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
bla

In [77]:
print("LASSO: Combo Models, Jump Power (Legs)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (Legs)
Best Alpha: 0.0003678379771828634
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.02637019890504225
R-squared: 0.7951460474105321

TEST Set Metrics:
Mean Squared Error: 0.03182755357194305
R-squared: 0.7665757707200533
         Feature  Importance
8     COMB4IMECF   -0.211593
1      COMB4ILLM    0.172588
9     COMB4IMICF    0.170072
13  COMB4IRESINF    0.169248
15  COMB4IRESINC   -0.142946
5     COMB1PRAGE   -0.106836
14  COMB4IRESEXC   -0.088450
4     COMB1PRSEX   -0.071762
17    COMB4IMCAP   -0.056435
3      COMB4DLFM   -0.045642
6       COMB4P1A    0.043514
16   COMB4IFCHAR   -0.041756
2      COMB4DLBM    0.024686
0      COMB1PF7A   -0.012486
11    COMB4DTBFM    0.009259
7    COMB4DLFNMD    0.001952
10    COMB4IMFFM    0.000000
12    COMB4IRES0   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [125]:
print("LASSO: Combo Models, Jump Power (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (TB)
Best Alpha: 0.001484968262254465
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.024608313549032368
R-squared: 0.8088330575270585

TEST Set Metrics:
Mean Squared Error: 0.03600684111676248
R-squared: 0.7359247509398611
          Feature  Importance
1       COMB4IALM    0.199195
10     COMB4IMECF   -0.117637
17   COMB4IRESINC   -0.114497
5      COMB1PRAGE   -0.099389
2      COMB4DTBBM    0.086362
9   COMB4DLSL14MD   -0.052564
6        COMB4P1A    0.043634
14     COMB4IRES0    0.043197
0       COMB1PF7A   -0.024624
7     COMB4DLR3MD    0.022286
18    COMB4IFCHAR   -0.014751
3      COMB4DTBFM   -0.007374
4      COMB1PRSEX   -0.006041
13     COMB4DTBFM   -0.004849
8     COMB4DLFNMD    0.002467
11     COMB4IMICF    0.000000
12     COMB4IMFFM   -0.000000
15   COMB4IRESINF    0.000000
16   COMB4IRESEXC    0.000000
19     COMB4IMCAP    0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black t

# Removing Race

In [126]:
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

df = df.drop('COMB1PF7A', axis=1)

In [127]:
df = df.dropna()
print(len(df))

490


# MLR

In [128]:
#MLR Function

def train_test_linear_regression(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # TRAINING
    print("\nTRAINING Metrics")
    X_train_const = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_const)
    results = model.fit()
    print(results.summary())

    # TEST
    print("\nTEST Metrics")
    mlr = LinearRegression()
    mlr.fit(X_train, y_train)

    print("\nCoefficients: ")
    coefficients = pd.Series(mlr.coef_, index=X.columns)
    print(coefficients)

    print("\nIntercept: ")
    print(mlr.intercept_)

    test_pred = mlr.predict(X_test)

    mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
    mlr_diff.head()

    mse_test = metrics.mean_squared_error(y_test, test_pred)
    rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
    r2_test = metrics.r2_score(y_test, test_pred)

    print("MSE:", mse_test)
    print("RMSE:", rmse_test)
    print("R2:", r2_test)

In [129]:
print("MLR: DXA Model, Handgrip Strength (Arms)")

train_test_linear_regression(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

MLR: DXA Model, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.532
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                     62.46
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.35e-59
Time:                        23:06:53   Log-Likelihood:                -427.26
No. Observations:                 392   AIC:                             870.5
Df Residuals:                     384   BIC:                             902.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [130]:
print("MLR: DXA Model, Handgrip Strength (TB)")

train_test_linear_regression(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

MLR: DXA Model, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.520
Model:                            OLS   Adj. R-squared:                  0.508
Method:                 Least Squares   F-statistic:                     45.92
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.33e-55
Time:                        23:07:07   Log-Likelihood:                -432.53
No. Observations:                 392   AIC:                             885.1
Df Residuals:                     382   BIC:                             924.8
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [131]:
print("MLR: DXA Model, Jump Power (Legs)")

train_test_linear_regression(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')


MLR: DXA Model, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.684
Model:                            OLS   Adj. R-squared:                  0.678
Method:                 Least Squares   F-statistic:                     118.7
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           5.33e-92
Time:                        23:07:16   Log-Likelihood:                 67.736
No. Observations:                 392   AIC:                            -119.5
Df Residuals:                     384   BIC:                            -87.70
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [132]:
print("MLR: DXA Model, Jump Power (TB)")

train_test_linear_regression(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')


MLR: DXA Model, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.705
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     101.4
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.40e-95
Time:                        23:07:26   Log-Likelihood:                 81.212
No. Observations:                 392   AIC:                            -142.4
Df Residuals:                     382   BIC:                            -102.7
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [133]:
print("MLR: BIS Model, Handgrip Strength")

train_test_linear_regression(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')


MLR: BIS Model, Handgrip Strength

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.532
Method:                 Least Squares   F-statistic:                     38.00
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.07e-57
Time:                        23:07:37   Log-Likelihood:                -421.43
No. Observations:                 392   AIC:                             868.9
Df Residuals:                     379   BIC:                             920.5
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

In [134]:
print("MLR: BIS Model, Jump Power")

train_test_linear_regression(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')


MLR: BIS Model, Jump Power

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.731
Model:                            OLS   Adj. R-squared:                  0.723
Method:                 Least Squares   F-statistic:                     85.91
Date:                Sun, 22 Oct 2023   Prob (F-statistic):          3.53e-100
Time:                        23:07:48   Log-Likelihood:                 99.434
No. Observations:                 392   AIC:                            -172.9
Df Residuals:                     379   BIC:                            -121.2
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
con

In [135]:
print("MLR: Combo Models, Handgrip Strength (Arms)")

train_test_linear_regression(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')


MLR: Combo Models, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.559
Model:                            OLS   Adj. R-squared:                  0.541
Method:                 Least Squares   F-statistic:                     29.77
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           5.78e-57
Time:                        23:07:57   Log-Likelihood:                -415.57
No. Observations:                 392   AIC:                             865.1
Df Residuals:                     375   BIC:                             932.7
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [136]:
print("MLR: Combo Models, Handgrip Strength (TB)")

train_test_linear_regression(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')


MLR: Combo Models, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     26.90
Date:                Sun, 22 Oct 2023   Prob (F-statistic):           1.47e-54
Time:                        23:08:13   Log-Likelihood:                -419.71
No. Observations:                 392   AIC:                             875.4
Df Residuals:                     374   BIC:                             946.9
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [137]:
print("MLR: Combo Models, Jump Power (Legs)")

train_test_linear_regression(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')


MLR: Combo Models, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.747
Model:                            OLS   Adj. R-squared:                  0.737
Method:                 Least Squares   F-statistic:                     69.36
Date:                Sun, 22 Oct 2023   Prob (F-statistic):          2.17e-101
Time:                        23:08:35   Log-Likelihood:                 111.66
No. Observations:                 392   AIC:                            -189.3
Df Residuals:                     375   BIC:                            -121.8
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [138]:
print("MLR: Combo Models, Jump Power (TB)")

train_test_linear_regression(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')


MLR: Combo Models, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.760
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     69.58
Date:                Sun, 22 Oct 2023   Prob (F-statistic):          1.79e-104
Time:                        23:08:39   Log-Likelihood:                 121.47
No. Observations:                 392   AIC:                            -206.9
Df Residuals:                     374   BIC:                            -135.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

# Random Forest

In [139]:
#Random Forest Function

def train_test_random_forest(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #random forest model
    param_grid = {
        'n_estimators': [5, 10, 20],
        'max_depth': [None, 1, 2, 4, 8, 16],
        'min_samples_split': [1, 2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': [1, 2, 4, 8, X_train.shape[1]]
    }

    rf_model = RandomForestRegressor(random_state=42)

    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_rf_model = grid_search.best_estimator_

    y_pred = best_rf_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("Best parameters:", best_params)
    print("Mean Squared Error:", mse)

    #fitting with BEST HYPERPARAMETERS
    rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
    rf.fit(X_train, y_train)

    #evaluation of the model on the TRAINING set
    y_train_pred = rf.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = rf.predict(X_test)

    #evaluation of the model on test set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    
    # Print the evaluation metrics
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
    print(feature_importances)

In [94]:
print("Random Forest: DXA Model, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (Arms)
Best parameters: {'max_depth': None, 'max_features': 1, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.563911553921147

Train set metrics:
MSE: 0.22113343604966268
R-squared: 0.8003518710902774

Test set metrics:
MSE: 0.563911553921147
R-squared: 0.35393336177490575 

COMB4DALM      0.226729
COMB4DABM      0.272234
COMB4DAFM      0.119894
COMB1PRSEX     0.050189
COMB1PRAGE     0.075460
COMB4P1A       0.160732
COMB4DLR3MD    0.094761
dtype: float64


In [95]:
print("Random Forest: DXA Model, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (TB)
Best parameters: {'max_depth': 4, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.512220146761612

Train set metrics:
MSE: 0.4207880156699591
R-squared: 0.6200957146196774

Test set metrics:
MSE: 0.512220146761612
R-squared: 0.4131555809624118 

COMB4IALM        0.112682
COMB4DTBBM       0.193642
COMB4DTBFM       0.066693
COMB1PRSEX       0.218109
COMB1PRAGE       0.032880
COMB4P1A         0.237244
COMB4DLR3MD      0.054533
COMB4DLFNMD      0.049660
COMB4DLSL14MD    0.034557
dtype: float64


In [96]:
print("Random Forest: DXA Model, Jump Power (Legs)")

train_test_random_forest(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (Legs)
Best parameters: {'max_depth': 16, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.03432614078384149

Train set metrics:
MSE: 0.008457943477384067
R-squared: 0.9355065135019212

Test set metrics:
MSE: 0.03432614078384149
R-squared: 0.7357049259104158 

COMB4ILLM      0.267729
COMB4DLBM      0.262804
COMB4DLFM      0.060045
COMB1PRSEX     0.058591
COMB1PRAGE     0.217393
COMB4P1A       0.070852
COMB4DLFNMD    0.062587
dtype: float64


In [97]:
print("Random Forest: DXA Model, Jump Power (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (TB)
Best parameters: {'max_depth': None, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.034639138417018885

Train set metrics:
MSE: 0.01659683698601998
R-squared: 0.8734458459162283

Test set metrics:
MSE: 0.034639138417018885
R-squared: 0.7332949919428484 

COMB4IALM        0.435438
COMB4DTBBM       0.122820
COMB4DTBFM       0.047682
COMB1PRSEX       0.045840
COMB1PRAGE       0.239257
COMB4P1A         0.038462
COMB4DLR3MD      0.024423
COMB4DLFNMD      0.023693
COMB4DLSL14MD    0.022386
dtype: float64


In [98]:
print("Random Forest: BIS Model, Handgrip Strength")

train_test_random_forest(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

Random Forest: BIS Model, Handgrip Strength
Best parameters: {'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 8, 'min_samples_split': 1, 'n_estimators': 5}
Mean Squared Error: 0.5335717482503435

Train set metrics:
MSE: 0.4202205134174768
R-squared: 0.6206080783982364

Test set metrics:
MSE: 0.5335717482503435
R-squared: 0.3886933097097188 

COMB4IMECF      0.005497
COMB4IMICF      0.014433
COMB4IMFFM      0.079380
COMB4DTBFM      0.047048
COMB4IRES0      0.003911
COMB4IRESINF    0.025725
COMB4IRESEXC    0.007084
COMB4IRESINC    0.014386
COMB4IFCHAR     0.026523
COMB4IMCAP      0.034890
COMB1PRSEX      0.240722
COMB1PRAGE      0.058735
COMB4P1A        0.441668
dtype: float64


In [99]:
print("Random Forest: BIS Model, Jump Power")

train_test_random_forest(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

Random Forest: BIS Model, Jump Power
Best parameters: {'max_depth': 16, 'max_features': 13, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.03725472079599646

Train set metrics:
MSE: 0.01183245899256238
R-squared: 0.9097751674131647

Test set metrics:
MSE: 0.03725472079599646
R-squared: 0.7131562427897626 

COMB4IMECF      0.023734
COMB4IMICF      0.080625
COMB4IMFFM      0.041055
COMB4DTBFM      0.028018
COMB4IRES0      0.022546
COMB4IRESINF    0.011644
COMB4IRESEXC    0.011550
COMB4IRESINC    0.029357
COMB4IFCHAR     0.061855
COMB4IMCAP      0.243573
COMB1PRSEX      0.103335
COMB1PRAGE      0.218421
COMB4P1A        0.124288
dtype: float64


In [100]:
print("Random Forest: Combo Models, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (Arms)
Best parameters: {'max_depth': 16, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.5476929171400978

Train set metrics:
MSE: 0.09392989992467701
R-squared: 0.9151963217157821

Test set metrics:
MSE: 0.5476929171400978
R-squared: 0.37251485752342417 

COMB4DALM       0.079282
COMB4DABM       0.099317
COMB4DAFM       0.042023
COMB1PRSEX      0.079703
COMB1PRAGE      0.043118
COMB4P1A        0.082755
COMB4DLR3MD     0.069634
COMB4IMECF      0.047458
COMB4IMICF      0.068073
COMB4IMFFM      0.065056
COMB4DTBFM      0.046821
COMB4IRES0      0.036174
COMB4IRESINF    0.042292
COMB4IRESEXC    0.039163
COMB4IRESINC    0.058239
COMB4IFCHAR     0.057498
COMB4IMCAP      0.043392
dtype: float64


In [101]:
print("Random Forest: Combo Models, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (TB)
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.6058466408382025

Train set metrics:
MSE: 0.16155274385354995
R-squared: 0.8541437079494875

Test set metrics:
MSE: 0.6058466408382025
R-squared: 0.30588884053786136 

COMB4IALM        0.073413
COMB4DTBBM       0.080002
COMB4DTBFM       0.034977
COMB1PRSEX       0.087965
COMB1PRAGE       0.035089
COMB4P1A         0.203463
COMB4DLR3MD      0.083524
COMB4DLFNMD      0.021510
COMB4DLSL14MD    0.028459
COMB4IMECF       0.041587
COMB4IMICF       0.030701
COMB4IMFFM       0.046991
COMB4DTBFM       0.047101
COMB4IRES0       0.027537
COMB4IRESINF     0.021963
COMB4IRESEXC     0.013692
COMB4IRESINC     0.052320
COMB4IFCHAR      0.039299
COMB4IMCAP       0.030405
dtype: float64


In [102]:
print("Random Forest: Combo Models, Jump Power (Legs)")

train_test_random_forest(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (Legs)
Best parameters: {'max_depth': None, 'max_features': 17, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.03315274697233576

Train set metrics:
MSE: 0.013733158239729065
R-squared: 0.8952819609307815

Test set metrics:
MSE: 0.03315274697233576
R-squared: 0.7447395041434036 

COMB4ILLM       0.110900
COMB4DLBM       0.272595
COMB4DLFM       0.010917
COMB1PRSEX      0.004399
COMB1PRAGE      0.193647
COMB4P1A        0.039944
COMB4DLFNMD     0.013584
COMB4IMECF      0.010851
COMB4IMICF      0.046590
COMB4IMFFM      0.005961
COMB4DTBFM      0.035148
COMB4IRES0      0.010326
COMB4IRESINF    0.005999
COMB4IRESEXC    0.012256
COMB4IRESINC    0.030904
COMB4IFCHAR     0.058250
COMB4IMCAP      0.137728
dtype: float64


In [103]:
print("Random Forest: Combo Models, Jump Power (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (TB)
Best parameters: {'max_depth': 8, 'max_features': 19, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.03092390614062327

Train set metrics:
MSE: 0.011644069170392308
R-squared: 0.9112116769482531

Test set metrics:
MSE: 0.03092390614062327
R-squared: 0.7619005260147769 

COMB4IALM        0.312205
COMB4DTBBM       0.084248
COMB4DTBFM       0.013182
COMB1PRSEX       0.042987
COMB1PRAGE       0.191255
COMB4P1A         0.038271
COMB4DLR3MD      0.014858
COMB4DLFNMD      0.013623
COMB4DLSL14MD    0.013138
COMB4IMECF       0.007661
COMB4IMICF       0.023891
COMB4IMFFM       0.012707
COMB4DTBFM       0.021410
COMB4IRES0       0.015832
COMB4IRESINF     0.004292
COMB4IRESEXC     0.012097
COMB4IRESINC     0.017812
COMB4IFCHAR      0.051679
COMB4IMCAP       0.108852
dtype: float64


# LASSO

In [146]:
#LASSO Function

def train_test_lasso(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create the Lasso regression model
    lasso_model = Lasso()

    # Define a parameter grid for hyperparameter tuning
    alphas = np.logspace(-4, 0, 100)
    max_iters = [10000, 20000, 30000, 50000]
    param_grid = {'alpha': alphas, 'max_iter': max_iters}

    # Perform grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    # Get the best hyperparameters from the grid search results
    best_alpha = grid_search.best_params_['alpha']
    best_max_iter = grid_search.best_params_['max_iter']

    print("Best Alpha:", best_alpha)
    print("Best Max Iterations:", best_max_iter)

    # Retrain the model with the entire training set using the best hyperparameters
    best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
    best_lasso_model.fit(X_train_scaled, y_train)
    
    # Evaluate the model on the test set
    y_pred = best_lasso_model.predict(X_test_scaled)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2_test_21 = metrics.r2_score(y_test, y_pred)
    
    # Evaluate the model on the train set
    y_pred_train = best_lasso_model.predict(X_train_scaled)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    r2_train_21 = metrics.r2_score(y_train, y_pred_train)

    print("\nTRAINING Set Metrics:")
    print("Mean Squared Error:", mse_train)
    print("R-squared:", r2_train_21, "\n")
    
    print("\nTEST Set Metrics:")
    print("Mean Squared Error:", mse)
    print("R-squared:", r2_test_21)
    
    # Get the feature importance (coefficients) from the Lasso model
    feature_importance = best_lasso_model.coef_

    # Create a DataFrame to associate each feature with its importance value
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

    # Sort the features based on their importance (absolute value of coefficients) in descending order
    importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)
    
    print(importance_df)

In [147]:
print("LASSO: DXA Model, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (Arms)
Best Alpha: 0.015199110829529346
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5191272844724935
R-squared: 0.5313110813886949 


TEST Set Metrics:
Mean Squared Error: 0.5328850007907258
R-squared: 0.38948010795752863
       Feature  Importance
0    COMB4DALM    0.248654
1    COMB4DABM    0.212920
2    COMB4DAFM   -0.186978
5     COMB4P1A    0.171863
3   COMB1PRSEX   -0.144917
4   COMB1PRAGE   -0.078825
6  COMB4DLR3MD   -0.000000


In [148]:
print("LASSO: DXA Model, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (TB)
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5320212590985757
R-squared: 0.5196698843164953 


TEST Set Metrics:
Mean Squared Error: 0.5074008887814344
R-squared: 0.4186769464679466
         Feature  Importance
3     COMB1PRSEX   -0.237801
2     COMB4DTBFM   -0.229405
0      COMB4IALM    0.228153
1     COMB4DTBBM    0.216528
5       COMB4P1A    0.166526
8  COMB4DLSL14MD   -0.093568
4     COMB1PRAGE   -0.072119
6    COMB4DLR3MD    0.064239
7    COMB4DLFNMD   -0.050742


In [149]:
print("LASSO: DXA Model, Jump Power (Legs)")

train_test_lasso(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (Legs)
Best Alpha: 0.001788649529057435
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.04145664394638318
R-squared: 0.6838849167340781 


TEST Set Metrics:
Mean Squared Error: 0.026062174975906488
R-squared: 0.7993335601118498
       Feature  Importance
0    COMB4ILLM    0.169593
4   COMB1PRAGE   -0.162373
2    COMB4DLFM   -0.074821
3   COMB1PRSEX   -0.050973
1    COMB4DLBM    0.041781
5     COMB4P1A    0.002195
6  COMB4DLFNMD    0.000000


In [150]:
print("LASSO: DXA Model, Jump Power (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (TB)
Best Alpha: 0.0016297508346206436
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03877730002354542
R-squared: 0.7043154423782021 


TEST Set Metrics:
Mean Squared Error: 0.02661177762993496
R-squared: 0.7951018792164937
         Feature  Importance
0      COMB4IALM    0.168560
4     COMB1PRAGE   -0.143604
1     COMB4DTBBM    0.098377
2     COMB4DTBFM   -0.064899
8  COMB4DLSL14MD   -0.060912
3     COMB1PRSEX   -0.030647
6    COMB4DLR3MD    0.018565
5       COMB4P1A   -0.003897
7    COMB4DLFNMD    0.002343


In [156]:
print("LASSO: BIS Model, Handgrip Strength")

train_test_lasso(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

LASSO: BIS Model, Handgrip Strength
Best Alpha: 0.02915053062825179
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5087350214301093
R-squared: 0.5406936329766134 


TEST Set Metrics:
Mean Squared Error: 0.4922774503421927
R-squared: 0.4360036867393675
         Feature  Importance
12      COMB4P1A    0.348078
10    COMB1PRSEX   -0.251379
7   COMB4IRESINC   -0.241995
3     COMB4DTBFM   -0.143444
8    COMB4IFCHAR   -0.047303
11    COMB1PRAGE   -0.022915
0     COMB4IMECF    0.000000
1     COMB4IMICF    0.000000
2     COMB4IMFFM    0.000000
4     COMB4IRES0    0.000000
5   COMB4IRESINF   -0.000000
6   COMB4IRESEXC    0.000000
9     COMB4IMCAP    0.000000


In [154]:
print("LASSO: BIS Model, Jump Power")

train_test_lasso(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

LASSO: BIS Model, Jump Power
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03534657409215116
R-squared: 0.7304754039724852 


TEST Set Metrics:
Mean Squared Error: 0.03949265393517273
R-squared: 0.6959252144446084
         Feature  Importance
5   COMB4IRESINF    1.163917
6   COMB4IRESEXC   -0.848665
1     COMB4IMICF    0.425688
7   COMB4IRESINC   -0.380461
0     COMB4IMECF   -0.336335
10    COMB1PRSEX   -0.198819
11    COMB1PRAGE   -0.130957
8    COMB4IFCHAR   -0.072360
9     COMB4IMCAP   -0.062963
12      COMB4P1A    0.042180
3     COMB4DTBFM   -0.032782
2     COMB4IMFFM    0.000000
4     COMB4IRES0   -0.000000


In [157]:
print("LASSO: Combo Models, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (Arms)
Best Alpha: 0.026560877829466867
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.49778581042646447
R-squared: 0.550579019505969 


TEST Set Metrics:
Mean Squared Error: 0.5100735511676733
R-squared: 0.4156149095386068
         Feature  Importance
5       COMB4P1A    0.255045
1      COMB4DABM    0.218064
14  COMB4IRESINC   -0.205764
3     COMB1PRSEX   -0.150038
10    COMB4DTBFM   -0.096223
2      COMB4DAFM   -0.068263
15   COMB4IFCHAR   -0.045881
4     COMB1PRAGE   -0.033069
0      COMB4DALM    0.000000
12  COMB4IRESINF    0.000000
13  COMB4IRESEXC    0.000000
8     COMB4IMICF    0.000000
11    COMB4IRES0    0.000000
9     COMB4IMFFM    0.000000
7     COMB4IMECF    0.000000
6    COMB4DLR3MD   -0.000000
16    COMB4IMCAP    0.000000


In [158]:
print("LASSO: Combo Models, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (TB)
Best Alpha: 0.03199267137797385
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5090690038938129
R-squared: 0.5403921002226426 


TEST Set Metrics:
Mean Squared Error: 0.4901736478232628
R-squared: 0.4384139878077592
          Feature  Importance
5        COMB4P1A    0.343529
3      COMB1PRSEX   -0.245836
16   COMB4IRESINC   -0.238029
2      COMB4DTBFM   -0.122377
17    COMB4IFCHAR   -0.044877
12     COMB4DTBFM   -0.020653
4      COMB1PRAGE   -0.018406
6     COMB4DLR3MD    0.014218
0       COMB4IALM    0.000000
15   COMB4IRESEXC    0.000000
14   COMB4IRESINF   -0.000000
13     COMB4IRES0    0.000000
9      COMB4IMECF    0.000000
11     COMB4IMFFM    0.000000
10     COMB4IMICF    0.000000
1      COMB4DTBBM    0.000000
8   COMB4DLSL14MD   -0.000000
7     COMB4DLFNMD   -0.000000
18     COMB4IMCAP    0.000000


In [159]:
print("LASSO: Combo Models, Jump Power (Legs)")

train_test_lasso(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (Legs)
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03323145456494629
R-squared: 0.7466036073631044 


TEST Set Metrics:
Mean Squared Error: 0.035780175946864744
R-squared: 0.7245095417989287
         Feature  Importance
12  COMB4IRESINF    1.276814
13  COMB4IRESEXC   -0.906481
14  COMB4IRESINC   -0.443315
7     COMB4IMECF   -0.337111
8     COMB4IMICF    0.261308
3     COMB1PRSEX   -0.147421
0      COMB4ILLM    0.138329
4     COMB1PRAGE   -0.120048
2      COMB4DLFM   -0.049154
5       COMB4P1A    0.038952
15   COMB4IFCHAR   -0.033114
1      COMB4DLBM    0.033018
10    COMB4DTBFM    0.009999
6    COMB4DLFNMD    0.005253
16    COMB4IMCAP   -0.001153
9     COMB4IMFFM    0.000000
11    COMB4IRES0   -0.000000


In [160]:
print("LASSO: Combo Models, Jump Power (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (TB)
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03166965347556992
R-squared: 0.7585126485785858 


TEST Set Metrics:
Mean Squared Error: 0.03360650480286075
R-squared: 0.741245783127909
          Feature  Importance
14   COMB4IRESINF    1.215816
15   COMB4IRESEXC   -0.862289
16   COMB4IRESINC   -0.434876
9      COMB4IMECF   -0.314897
10     COMB4IMICF    0.217861
0       COMB4IALM    0.143704
3      COMB1PRSEX   -0.123538
1      COMB4DTBBM    0.116239
4      COMB1PRAGE   -0.111376
8   COMB4DLSL14MD   -0.055176
17    COMB4IFCHAR   -0.027319
2      COMB4DTBFM   -0.025941
5        COMB4P1A    0.022783
7     COMB4DLFNMD    0.005134
6     COMB4DLR3MD   -0.004926
12     COMB4DTBFM   -0.000628
11     COMB4IMFFM    0.000000
13     COMB4IRES0   -0.000000
18     COMB4IMCAP    0.000000


# More Complex Models

In [6]:
#INCLUDING RACE
#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

race = (df['COMB1PF7A'] == 1) | (df['COMB1PF7A'] == 2)
race_included = df[race]
race_included = race_included.dropna()

#EXCLUDING RACE
race_removed = df.drop('COMB1PF7A', axis=1)
race_removed = race_removed.dropna()

print("number of individuals when race is INCLUDED (only white & black): ", len(race_included))
print("number of individuals when race is REMOVED: ", len(race_removed))

number of individuals when race is INCLUDED (only white & black):  413
number of individuals when race is REMOVED:  490


## Gradient Boosting Machines

In [175]:
from sklearn.ensemble import GradientBoostingRegressor

def train_test_gradient_boosting(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)  # Remove the target column from X
    
    # Dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Define the parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [3, 4, 5]
    }

    # Create the GBM model
    gbm = GradientBoostingRegressor(random_state=42)

    # Perform grid search
    grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, y)
    
    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    # Retrieve the best model with the best parameters
    best_gbm = grid_search.best_estimator_

    gbm = GradientBoostingRegressor(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=random_state)
    gbm.fit(X_train, y_train)
    

    #evaluation of the model on the TRAINING set
    y_train_pred = gbm.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = gbm.predict(X_test)

    #evaluation of the model on test set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    
    # Print the evaluation metrics
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    feature_importances = pd.Series(gbm.feature_importances_, index=X.columns)
    print(feature_importances)
    
    
    if 'COMB1PF7A' in df.columns:
    # Count the frequencies in training data
        white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
        white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

        black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
        black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
        print("White train frequency: ", white_train_counts.values[0])
        print("White test frequency: ", white_test_counts.values[0], "\n")
    
        print("Black train frequency: ", black_train_counts.values[0])
        print("Black test frequency: ", black_test_counts.values[0])

In [177]:
print("RACE INCLUDED GBM: DXA Model, Handgrip Strength (Arms)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED GBM: DXA Model, Handgrip Strength (Arms)")
train_test_gradient_boosting(race_removed, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

RACE INCLUDED GBM: DXA Model, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.30759254485534593
R-squared: 0.7026506587874803

Test set metrics:
MSE: 0.4389575269866196
R-squared: 0.5404694440731301 

COMB1PF7A      0.011185
COMB4DALM      0.076758
COMB4DABM      0.158051
COMB4DAFM      0.052611
COMB1PRSEX     0.496622
COMB1PRAGE     0.051420
COMB4P1A       0.095902
COMB4DLR3MD    0.057451
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: DXA Model, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.3268855554633875
R-squared: 0.7048746192266286

Test set metrics:
MSE: 0.5394433451450675
R-squared: 0.38196629225386136 

COMB4DALM      0.115925
COMB4DABM      0.170526
COMB4DAFM      0.063739
COMB1PRSEX     0.441402
COMB1PRAGE     0.06303

In [181]:
print("RACE INCLUDED GBM: DXA Model, Handgrip Strength (TB)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED GBM: DXA Model, Handgrip Strength (TB)")
train_test_gradient_boosting(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

RACE INCLUDED GBM: DXA Model, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 200}

Train set metrics:
MSE: 0.27413354943190216
R-squared: 0.7349954292092489

Test set metrics:
MSE: 0.5189263722780566
R-squared: 0.45675262484946966 

COMB1PF7A        0.004218
COMB4IALM        0.092793
COMB4DTBBM       0.016918
COMB4DTBFM       0.050682
COMB1PRSEX       0.572398
COMB1PRAGE       0.050143
COMB4P1A         0.087066
COMB4DLR3MD      0.052919
COMB4DLFNMD      0.037105
COMB4DLSL14MD    0.035757
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: DXA Model, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}

Train set metrics:
MSE: 0.3947447386786517
R-squared: 0.643608628880298

Test set metrics:
MSE: 0.48252762741456773
R-squared: 0.4471739407948846 

COMB4IALM        0.092817
COMB4DTBBM       0.042134
CO

In [183]:
print("RACE INCLUDED GBM: DXA Model, Jump Power (Legs)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

print("\nRACE REMOVED GBM: DXA Model, Jump Power (Legs)")
train_test_gradient_boosting(race_removed, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

RACE INCLUDED GBM: DXA Model, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.012383768908507396
R-squared: 0.9037980707693022

Test set metrics:
MSE: 0.032069921869357886
R-squared: 0.7647982343819849 

COMB1PF7A      0.003218
COMB4ILLM      0.452152
COMB4DLBM      0.138877
COMB4DLFM      0.050823
COMB1PRSEX     0.026218
COMB1PRAGE     0.260616
COMB4P1A       0.055607
COMB4DLFNMD    0.012489
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: DXA Model, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.018713157332473498
R-squared: 0.8573084860420969

Test set metrics:
MSE: 0.03553493350948658
R-squared: 0.7263977927551003 

COMB4ILLM      0.195155
COMB4DLBM      0.282682
COMB4DLFM      0.026340
COMB1PRSEX     0.105267
COMB1PRAGE     0.330451
COMB4P1

In [185]:
print("RACE INCLUDED GBM: DXA Model, Jump Power (TB)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

print("\nRACE INCLUDED GBM: DXA Model, Jump Power (TB)")
train_test_gradient_boosting(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

RACE INCLUDED GBM: DXA Model, Jump Power (TB)
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.0068269740640997
R-squared: 0.9469654124986826

Test set metrics:
MSE: 0.03528869040503813
R-squared: 0.741191689726475 

COMB1PF7A        0.000894
COMB4IALM        0.534768
COMB4DTBBM       0.064900
COMB4DTBFM       0.025757
COMB1PRSEX       0.008826
COMB1PRAGE       0.245755
COMB4P1A         0.052332
COMB4DLR3MD      0.031199
COMB4DLFNMD      0.011196
COMB4DLSL14MD    0.024374
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE INCLUDED GBM: DXA Model, Jump Power (TB)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}

Train set metrics:
MSE: 0.009916461443885636
R-squared: 0.9243850264606213

Test set metrics:
MSE: 0.03160009255953071
R-squared: 0.7566942099069189 

COMB4IALM        0.340084
COMB4DTBBM       0.125151
COMB4DTBFM     

In [186]:
print("RACE INCLUDED GBM: BIS Model, Handgrip Strength")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED GBM: BIS Model, Handgrip Strength")
train_test_gradient_boosting(race_removed, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

RACE INCLUDED GBM: BIS Model, Handgrip Strength
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.2256186146261742
R-squared: 0.7818947580282738

Test set metrics:
MSE: 0.5272904771451183
R-squared: 0.44799651173352195 

COMB1PF7A       0.010775
COMB4IMECF      0.012107
COMB4IMICF      0.043221
COMB4IMFFM      0.015318
COMB4DTBFM      0.065669
COMB4IRES0      0.003664
COMB4IRESINF    0.017841
COMB4IRESEXC    0.002430
COMB4IRESINC    0.048979
COMB4IFCHAR     0.035616
COMB4IMCAP      0.046684
COMB1PRSEX      0.526928
COMB1PRAGE      0.053941
COMB4P1A        0.116826
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: BIS Model, Handgrip Strength
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.3254135653341643
R-squared: 0.7062035909114307

Test set metrics:
MSE: 0.5112948881138201
R-squar

In [187]:
print("RACE INCLUDED GBM: BIS Model, Jump Power")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

print("\nRACE REMOVED GBM: BIS Model, Jump Power")
train_test_gradient_boosting(race_removed, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

RACE INCLUDED GBM: BIS Model, Jump Power
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.005718993128256096
R-squared: 0.9555726389712118

Test set metrics:
MSE: 0.04094979841987962
R-squared: 0.6996729543248389 

COMB1PF7A       0.002103
COMB4IMECF      0.012276
COMB4IMICF      0.075410
COMB4IMFFM      0.039101
COMB4DTBFM      0.016607
COMB4IRES0      0.021411
COMB4IRESINF    0.007712
COMB4IRESEXC    0.017618
COMB4IRESINC    0.060327
COMB4IFCHAR     0.064841
COMB4IMCAP      0.323930
COMB1PRSEX      0.021199
COMB1PRAGE      0.219079
COMB4P1A        0.118385
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: BIS Model, Jump Power
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.016811896405893936
R-squared: 0.8718059754407383

Test set metrics:
MSE: 0.037452008909673026
R-squared: 0.711

In [189]:
print("RACE INCLUDED GBM: Combo Models, Handgrip Strength (Arms)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED GBM: Combo Models, Handgrip Strength (Arms)")
train_test_gradient_boosting(race_removed, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

RACE INCLUDED GBM: Combo Models, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.28453953403122667
R-squared: 0.7249359764785857

Test set metrics:
MSE: 0.48727149876180026
R-squared: 0.4898910966387082 

COMB1PF7A       0.009970
COMB4DALM       0.023756
COMB4DABM       0.132109
COMB4DAFM       0.025778
COMB1PRSEX      0.473516
COMB1PRAGE      0.034765
COMB4P1A        0.088712
COMB4DLR3MD     0.041250
COMB4IMECF      0.005648
COMB4IMICF      0.021694
COMB4IMFFM      0.005220
COMB4DTBFM      0.024941
COMB4IRES0      0.002120
COMB4IRESINF    0.006494
COMB4IRESEXC    0.001186
COMB4IRESINC    0.044248
COMB4IFCHAR     0.019619
COMB4IMCAP      0.038975
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: Combo Models, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Tr

In [190]:
print("RACE INCLUDED GBM: Combo Models, Handgrip Strength (TB)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED GBM: Combo Models, Handgrip Strength (TB)")
train_test_gradient_boosting(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

RACE INCLUDED GBM: Combo Models, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.2772666632404921
R-squared: 0.7319666518786208

Test set metrics:
MSE: 0.5057921951225801
R-squared: 0.4705023736493851 

COMB1PF7A        0.004351
COMB4IALM        0.026819
COMB4DTBBM       0.009183
COMB4DTBFM       0.020601
COMB1PRSEX       0.556861
COMB1PRAGE       0.038465
COMB4P1A         0.103196
COMB4DLR3MD      0.030863
COMB4DLFNMD      0.018129
COMB4DLSL14MD    0.015006
COMB4IMECF       0.012293
COMB4IMICF       0.019550
COMB4IMFFM       0.005903
COMB4DTBFM       0.017735
COMB4IRES0       0.001922
COMB4IRESINF     0.007872
COMB4IRESEXC     0.002141
COMB4IRESINC     0.042218
COMB4IFCHAR      0.029847
COMB4IMCAP       0.037045
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: Combo Models, Handgrip Strength (TB)
Best Parameters: 

In [191]:
print("RACE INCLUDED GBM: Combo Models, Jump Power (Legs)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

print("\nRACE REMOVED GBM: Combo Models, Jump Power (Legs)")
train_test_gradient_boosting(race_removed, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED GBM: Combo Models, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.008955374607383744
R-squared: 0.9304311699791111

Test set metrics:
MSE: 0.03597615194636686
R-squared: 0.7361498262328976 

COMB1PF7A       0.000000
COMB4ILLM       0.161258
COMB4DLBM       0.128189
COMB4DLFM       0.008959
COMB1PRSEX      0.010048
COMB1PRAGE      0.201712
COMB4P1A        0.052146
COMB4DLFNMD     0.008060
COMB4IMECF      0.001404
COMB4IMICF      0.029074
COMB4IMFFM      0.003189
COMB4DTBFM      0.015516
COMB4IRES0      0.010090
COMB4IRESINF    0.005799
COMB4IRESEXC    0.011730
COMB4IRESINC    0.050516
COMB4IFCHAR     0.089367
COMB4IMCAP      0.212942
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: Combo Models, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metri

In [192]:
print("RACE INCLUDED GBM: Combo Models, Jump Power (TB)")
train_test_gradient_boosting(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

print("\nRACE REMOVED GBM: Combo Models, Jump Power (TB)")
train_test_gradient_boosting(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED GBM: Combo Models, Jump Power (TB)
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.0044268794665074974
R-squared: 0.9656102800127982

Test set metrics:
MSE: 0.03570368259980792
R-squared: 0.7381481245651639 

COMB1PF7A        0.000137
COMB4IALM        0.357451
COMB4DTBBM       0.036640
COMB4DTBFM       0.008823
COMB1PRSEX       0.002345
COMB1PRAGE       0.184958
COMB4P1A         0.044711
COMB4DLR3MD      0.015151
COMB4DLFNMD      0.008895
COMB4DLSL14MD    0.014610
COMB4IMECF       0.001805
COMB4IMICF       0.039013
COMB4IMFFM       0.001325
COMB4DTBFM       0.008684
COMB4IRES0       0.027316
COMB4IRESINF     0.003198
COMB4IRESEXC     0.020225
COMB4IRESINC     0.058927
COMB4IFCHAR      0.062526
COMB4IMCAP       0.103260
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED GBM: Combo Models, Jump Power (TB)
Best Parameters: {'learning_

## Multi-Layer Perceptron

In [3]:
from sklearn.neural_network import MLPRegressor

def train_test_mlp(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    y = X.pop(target_column)

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    param_grid = {
        'hidden_layer_sizes': [(100, 100), (50, 50, 50), (200,)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [1000, 2000, 3000]
    }

    # Create the MLP regressor
    mlp_model = MLPRegressor(random_state = 42)  # You can adjust hidden_layer_sizes and max_iter as needed

    # Perform grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters from the grid search results
    best_param = grid_search.best_params_
    
    print("Best Hyperparameters:", best_param)

    #best model with best parameters
    best_mlp = grid_search.best_estimator_
    mlp = MLPRegressor(hidden_layer_sizes = best_param['hidden_layer_sizes'], activation = best_param['activation'], alpha = best_param['alpha'], max_iter = best_param['max_iter'], random_state = random_state)
    
    # Fit the MLP model to the training data
    #mlp.fit(X_train_scaled, y_train)
    mlp.fit(X_train, y_train)

    #evaluation of the model on the TRAINING set
    y_train_pred = mlp.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = mlp.predict(X_test)

    #evaluation of the model on test set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    
    # Print the evaluation metrics
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    #feature_importances = pd.Series(mlp.feature_importances_, index=X.columns)
    #print(feature_importances)
    
    
    if 'COMB1PF7A' in df.columns:
    # Count the frequencies in training data
        white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
        white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

        black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
        black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
        print("White train frequency: ", white_train_counts.values[0])
        print("White test frequency: ", white_test_counts.values[0], "\n")
    
        print("Black train frequency: ", black_train_counts.values[0])
        print("Black test frequency: ", black_test_counts.values[0])


In [4]:
print("RACE INCLUDED MLP: DXA Model, Handgrip Strength (Arms)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED MLP: DXA Model, Handgrip Strength (Arms)")
train_test_mlp(race_removed, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

RACE INCLUDED MLP: DXA Model, Handgrip Strength (Arms)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.5609565579912381
R-squared: 0.4577239736223786

Test set metrics:
MSE: 0.6141835429592404
R-squared: 0.3570309481313497 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: DXA Model, Handgrip Strength (Arms)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'max_iter': 1000}

Train set metrics:
MSE: 0.7093548222473394
R-squared: 0.3595660361241595

Test set metrics:
MSE: 0.6731634712378459
R-squared: 0.2287647631717361 



In [6]:
print("RACE INCLUDED MLP: DXA Model, Handgrip Strength (TB)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED MLP: DXA Model, Handgrip Strength (TB)")
train_test_mlp(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

RACE INCLUDED MLP: DXA Model, Handgrip Strength (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.9272710459418361
R-squared: 0.10360820101827961

Test set metrics:
MSE: 0.9220933996986225
R-squared: 0.03468999497775638 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: DXA Model, Handgrip Strength (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.9797294489393855
R-squared: 0.11546098675655114

Test set metrics:
MSE: 0.8771628981555657
R-squared: -0.004954909766560434 



In [7]:
print("RACE INCLUDED MLP: DXA Model, Jump Power (Legs)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

print("\nRACE REMOVED MLP: DXA Model, Jump Power (Legs)")
train_test_mlp(race_removed, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

RACE INCLUDED MLP: DXA Model, Jump Power (Legs)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'max_iter': 1000}

Train set metrics:
MSE: 0.10929031574530591
R-squared: 0.15099035692537177

Test set metrics:
MSE: 0.11094310976693313
R-squared: 0.1863399166784938 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: DXA Model, Jump Power (Legs)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (200,), 'max_iter': 1000}

Train set metrics:
MSE: 0.08002309977323527
R-squared: 0.3898080876799871

Test set metrics:
MSE: 0.08301222962313252
R-squared: 0.36084503303921733 



In [8]:
print("RACE INCLUDED MLP: DXA Model, Jump Power (TB)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

print("\nRACE INCLUDED MLP: DXA Model, Jump Power (TB)")
train_test_mlp(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

RACE INCLUDED MLP: DXA Model, Jump Power (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.122543524944073
R-squared: 0.04803427765425927

Test set metrics:
MSE: 0.1268535368084767
R-squared: 0.06965236916428508 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE INCLUDED MLP: DXA Model, Jump Power (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.1272050597749793
R-squared: 0.030036340746237955

Test set metrics:
MSE: 0.12766022268953373
R-squared: 0.017076570696065385 



In [9]:
print("RACE INCLUDED MLP: BIS Model, Handgrip Strength")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED MLP: BIS Model, Handgrip Strength")
train_test_mlp(race_removed, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

RACE INCLUDED MLP: BIS Model, Handgrip Strength
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 1.0381932693183427
R-squared: -0.003620178207696778

Test set metrics:
MSE: 0.9567705774021565
R-squared: -0.0016124301280913134 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: BIS Model, Handgrip Strength
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 1.1289995257754808
R-squared: -0.019306021231531023

Test set metrics:
MSE: 0.9343742434060295
R-squared: -0.07050125517708716 



In [10]:
print("RACE INCLUDED MLP: BIS Model, Jump Power")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

print("\nRACE REMOVED MLP: BIS Model, Jump Power")
train_test_mlp(race_removed, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

RACE INCLUDED MLP: BIS Model, Jump Power
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (200,), 'max_iter': 1000}

Train set metrics:
MSE: 0.1230056785106854
R-squared: 0.04444408915531073

Test set metrics:
MSE: 0.12363217396110755
R-squared: 0.09327794057926986 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: BIS Model, Jump Power
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.13068794084536048
R-squared: 0.0034786859347973875

Test set metrics:
MSE: 0.13159152465381838
R-squared: -0.013192597937311756 



In [11]:
print("RACE INCLUDED MLP: Combo Models, Handgrip Strength (Arms)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED MLP: Combo Models, Handgrip Strength (Arms)")
train_test_mlp(race_removed, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

RACE INCLUDED MLP: Combo Models, Handgrip Strength (Arms)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50, 50), 'max_iter': 1000}

Train set metrics:
MSE: 0.7007090338671914
R-squared: 0.32262542416281415

Test set metrics:
MSE: 0.7802491162871842
R-squared: 0.18318222578325205 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: Combo Models, Handgrip Strength (Arms)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.8631368134150241
R-squared: 0.22072549104374672

Test set metrics:
MSE: 0.7627604101233834
R-squared: 0.1261146353307082 



In [12]:
print("RACE INCLUDED MLP: Combo Models, Handgrip Strength (TB)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED MLP: Combo Models, Handgrip Strength (TB)")
train_test_mlp(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

RACE INCLUDED MLP: Combo Models, Handgrip Strength (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 1.0000123856293117
R-squared: 0.03328923589134869

Test set metrics:
MSE: 0.9804333081703634
R-squared: -0.026384184013503953 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: Combo Models, Handgrip Strength (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 1.0921316021545302
R-squared: 0.013979817849160137

Test set metrics:
MSE: 0.9391255487269512
R-squared: -0.07594476814383522 



In [13]:
print("RACE INCLUDED MLP: Combo Models, Jump Power (Legs)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

print("\nRACE REMOVED MLP: Combo Models, Jump Power (Legs)")
train_test_mlp(race_removed, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED MLP: Combo Models, Jump Power (Legs)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (200,), 'max_iter': 1000}

Train set metrics:
MSE: 0.11043810201149282
R-squared: 0.14207390717833035

Test set metrics:
MSE: 0.14093513379201883
R-squared: -0.03362248403749302 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: Combo Models, Jump Power (Legs)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.11805628217962134
R-squared: 0.09979757359177077

Test set metrics:
MSE: 0.11088750542368603
R-squared: 0.14621857300783148 



In [14]:
print("RACE INCLUDED MLP: Combo Models, Jump Power (TB)")
train_test_mlp(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

print("\nRACE REMOVED MLP: Combo Models, Jump Power (TB)")
train_test_mlp(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED MLP: Combo Models, Jump Power (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (200,), 'max_iter': 1000}

Train set metrics:
MSE: 0.09922101153563692
R-squared: 0.2292126249713683

Test set metrics:
MSE: 0.11022919263375965
R-squared: 0.19157580627346638 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED MLP: Combo Models, Jump Power (TB)
Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'max_iter': 1000}

Train set metrics:
MSE: 0.11347874573013114
R-squared: 0.13470219148015705

Test set metrics:
MSE: 0.11700663699083017
R-squared: 0.09910414959838221 



# Support Vector Regression

In [7]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

def train_test_svr(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    y = X.pop(target_column)  # Remove the target column from X

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define a parameter grid for hyperparameter tuning
    param_grid = {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1, 10],
        'epsilon': [0.01, 0.1, 1]
    }
    
    # Create the SVR model
    svr = SVR()

    # Perform grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters from the grid search results
    best_params = grid_search.best_params_

    print("Best Hyperparameters:", best_params)
    
    best_svr = grid_search.best_estimator_    
    svr = SVR(kernel = best_params['kernel'], C = best_params['C'], epsilon = best_params['epsilon'])
    svr.fit(X_train, y_train) #X_train vs X_train_scaled...

    #evaluation of the model on the TRAINING set
    y_train_pred = svr.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = svr.predict(X_test)

    #evaluation of the model on test set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    
    # Print the evaluation metrics
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    #feature_importances = pd.Series(svr.feature_importances_, index=X.columns)
    #print(feature_importances)
    
    
    if 'COMB1PF7A' in df.columns:
    # Count the frequencies in training data
        white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
        white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

        black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
        black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
        print("White train frequency: ", white_train_counts.values[0])
        print("White test frequency: ", white_test_counts.values[0], "\n")
    
        print("Black train frequency: ", black_train_counts.values[0])
        print("Black test frequency: ", black_test_counts.values[0])

In [8]:
print("RACE INCLUDED SVR: DXA Model, Handgrip Strength (Arms)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED SVR: DXA Model, Handgrip Strength (Arms)")
train_test_svr(race_removed, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

RACE INCLUDED SVR: DXA Model, Handgrip Strength (Arms)
Best Hyperparameters: {'C': 1, 'epsilon': 1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.5564231614212547
R-squared: 0.462106402605414

Test set metrics:
MSE: 0.5446845614069274
R-squared: 0.42978720281577876 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: DXA Model, Handgrip Strength (Arms)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.5350786511325974
R-squared: 0.5169095482504522

Test set metrics:
MSE: 0.6282611508978809
R-squared: 0.2802088078075089 



In [9]:
print("RACE INCLUDED SVR: DXA Model, Handgrip Strength (TB)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED SVR: DXA Model, Handgrip Strength (TB)")
train_test_svr(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

RACE INCLUDED SVR: DXA Model, Handgrip Strength (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.7314692070217639
R-squared: 0.292889601965314

Test set metrics:
MSE: 0.686462343149862
R-squared: 0.2813645904740725 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: DXA Model, Handgrip Strength (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.6990967546952135
R-squared: 0.3688274306450293

Test set metrics:
MSE: 0.7488899609353292
R-squared: 0.14200584099103986 



In [10]:
print("RACE INCLUDED SVR: DXA Model, Jump Power (Legs)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

print("\nRACE REMOVED SVR: DXA Model, Jump Power (Legs)")
train_test_svr(race_removed, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

RACE INCLUDED SVR: DXA Model, Jump Power (Legs)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.0734930988701147
R-squared: 0.4290770484589399

Test set metrics:
MSE: 0.08350672416809969
R-squared: 0.38755918878367746 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: DXA Model, Jump Power (Legs)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07903819861916747
R-squared: 0.3973181531529547

Test set metrics:
MSE: 0.07101742509057647
R-squared: 0.45319936359403146 



In [11]:
print("RACE INCLUDED SVR: DXA Model, Jump Power (TB)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

print("\nRACE INCLUDED SVR: DXA Model, Jump Power (TB)")
train_test_svr(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

RACE INCLUDED SVR: DXA Model, Jump Power (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07802972910319693
R-squared: 0.39383474186768885

Test set metrics:
MSE: 0.0845061289820707
R-squared: 0.38022952400399124 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE INCLUDED SVR: DXA Model, Jump Power (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.08207027828621521
R-squared: 0.3741979479173998

Test set metrics:
MSE: 0.07758901791178203
R-squared: 0.4026012021955705 



In [12]:
print("RACE INCLUDED SVR: BIS Model, Handgrip Strength")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED SVR: BIS Model, Handgrip Strength")
train_test_svr(race_removed, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

RACE INCLUDED SVR: BIS Model, Handgrip Strength
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.8203407242214376
R-squared: 0.20697761373976187

Test set metrics:
MSE: 0.8460022338407541
R-squared: 0.11434739597467736 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: BIS Model, Handgrip Strength
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.8465572610518654
R-squared: 0.23569417541186755

Test set metrics:
MSE: 0.7939093901138815
R-squared: 0.09042762617711342 



In [13]:
print("RACE INCLUDED SVR: BIS Model, Jump Power")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

print("\nRACE REMOVED SVR: BIS Model, Jump Power")
train_test_svr(race_removed, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

RACE INCLUDED SVR: BIS Model, Jump Power
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07820750504761757
R-squared: 0.3924537092474558

Test set metrics:
MSE: 0.09124773162829429
R-squared: 0.3307864086778539 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: BIS Model, Jump Power
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.08565360797906423
R-squared: 0.34687435255620225

Test set metrics:
MSE: 0.0756069064258901
R-squared: 0.4178625246179333 



In [14]:
print("RACE INCLUDED SVR: Combo Models, Handgrip Strength (Arms)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED SVR: Combo Models, Handgrip Strength (Arms)")
train_test_svr(race_removed, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

RACE INCLUDED SVR: Combo Models, Handgrip Strength (Arms)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.5581077476556993
R-squared: 0.4604779151293493

Test set metrics:
MSE: 0.604409456036585
R-squared: 0.3672631262376891 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: Combo Models, Handgrip Strength (Arms)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.5422612207525189
R-squared: 0.5104248365261741

Test set metrics:
MSE: 0.6969697005435795
R-squared: 0.20149025455522762 



In [15]:
print("RACE INCLUDED SVR: Combo Models, Handgrip Strength (TB)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

print("\nRACE REMOVED SVR: Combo Models, Handgrip Strength (TB)")
train_test_svr(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

RACE INCLUDED SVR: Combo Models, Handgrip Strength (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.7283558099628141
R-squared: 0.2958993191379037

Test set metrics:
MSE: 0.7058588668147957
R-squared: 0.2610589919712192 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: Combo Models, Handgrip Strength (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.7090115076426704
R-squared: 0.3598759943090183

Test set metrics:
MSE: 0.7268986120945272
R-squared: 0.16720106303750815 



In [16]:
print("RACE INCLUDED SVR: Combo Models, Jump Power (Legs)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

print("\nRACE REMOVED SVR: Combo Models, Jump Power (Legs)")
train_test_svr(race_removed, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED SVR: Combo Models, Jump Power (Legs)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.06629234716497033
R-squared: 0.4850152859263911

Test set metrics:
MSE: 0.07811333520979548
R-squared: 0.42711446462206093 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: Combo Models, Jump Power (Legs)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07266725949069974
R-squared: 0.4458978200377701

Test set metrics:
MSE: 0.06480313416473794
R-squared: 0.5010464691280117 



In [17]:
print("RACE INCLUDED SVR: Combo Models, Jump Power (TB)")
train_test_svr(race_included, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

print("\nRACE REMOVED SVR: Combo Models, Jump Power (TB)")
train_test_svr(race_removed, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED SVR: Combo Models, Jump Power (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.06593263061804397
R-squared: 0.4878097038492536

Test set metrics:
MSE: 0.07449993514054616
R-squared: 0.4536152487410188 

White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21

RACE REMOVED SVR: Combo Models, Jump Power (TB)
Best Hyperparameters: {'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}

Train set metrics:
MSE: 0.07144826357564651
R-squared: 0.4551929042150191

Test set metrics:
MSE: 0.06560674620717549
R-squared: 0.4948590358935918 

