In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import GradientBoostingRegressor

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

df = df.drop('COMB1PF7A', axis=1)

In [2]:
#number of individuals before NA is removed
print(len(df))

544


In [3]:
#this is the total number of people when race is removed
df = df.dropna()
print(len(df))

490


## Gradient Boosting Machines

In [4]:
#GBM function
def train_test_gradient_boosting(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)
    
    #dividing our data set into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #scaling the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #defining our parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [3, 4, 5]
    }

    gbm = GradientBoostingRegressor(random_state=42)

    #perforiming grid search to find best hyperparameters
    grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, y)
    
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    best_gbm = grid_search.best_estimator_

    #fitting our model with the best hyperparameters
    gbm = GradientBoostingRegressor(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=random_state)
    gbm.fit(X_train, y_train)
    

    #evaluation of the model on the TRAINING set
    y_train_pred = gbm.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = gbm.predict(X_test)

    #evaluation of the model on TESTING set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance
    feature_importances = pd.Series(gbm.feature_importances_, index=X.columns)
    print(feature_importances)

In [5]:
#DXA: Handgrip Strength (ARMS)

print("Gradient Boosting Machine: DXA Model, Handgrip Strength (Arms)")
train_test_gradient_boosting(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: DXA Model, Handgrip Strength (Arms)


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.32688555546338754
R-squared: 0.7048746192266286

Test set metrics:
MSE: 0.5394433451450674
R-squared: 0.3819662922538616 

COMB4DALM      0.115925
COMB4DABM      0.170526
COMB4DAFM      0.063739
COMB1PRSEX     0.441402
COMB1PRAGE     0.063036
COMB4P1A       0.094921
COMB4DLR3MD    0.050452
dtype: float64


In [6]:
#DXA: Handgrip Strength (TOTAL BODY)

print("Gradient Boosting Machine: DXA Model, Handgrip Strength (TB)")
train_test_gradient_boosting(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: DXA Model, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}

Train set metrics:
MSE: 0.3947447386786517
R-squared: 0.643608628880298

Test set metrics:
MSE: 0.48252762741456784
R-squared: 0.4471739407948845 

COMB4IALM        0.092817
COMB4DTBBM       0.042134
COMB4DTBFM       0.051030
COMB1PRSEX       0.601144
COMB1PRAGE       0.056053
COMB4P1A         0.111798
COMB4DLR3MD      0.028798
COMB4DLFNMD      0.014673
COMB4DLSL14MD    0.001553
dtype: float64


In [7]:
#DXA: Jump Power (LEGS)

print("Gradient Boosting Machine: DXA Model, Jump Power (Legs)")
train_test_gradient_boosting(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

Gradient Boosting Machine: DXA Model, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.018713157332473498
R-squared: 0.8573084860420969

Test set metrics:
MSE: 0.03553493350948658
R-squared: 0.7263977927551003 

COMB4ILLM      0.195155
COMB4DLBM      0.282682
COMB4DLFM      0.026340
COMB1PRSEX     0.105379
COMB1PRAGE     0.330451
COMB4P1A       0.036119
COMB4DLFNMD    0.023875
dtype: float64


In [8]:
#DXA: Jump Power (TOTAL BODY)

print("Gradient Boosting Machine: DXA Model, Jump Power (TB)")
train_test_gradient_boosting(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

Gradient Boosting Machine: DXA Model, Jump Power (TB)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}

Train set metrics:
MSE: 0.009916461443885636
R-squared: 0.9243850264606213

Test set metrics:
MSE: 0.03160009255953071
R-squared: 0.7566942099069189 

COMB4IALM        0.340084
COMB4DTBBM       0.125151
COMB4DTBFM       0.040521
COMB1PRSEX       0.079534
COMB1PRAGE       0.292880
COMB4P1A         0.039133
COMB4DLR3MD      0.029166
COMB4DLFNMD      0.024460
COMB4DLSL14MD    0.029071
dtype: float64


In [9]:
#BIS: Handgrip Strength

print("Gradient Boosting Machine: BIS Model, Handgrip Strength")
train_test_gradient_boosting(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: BIS Model, Handgrip Strength
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.3254135653341643
R-squared: 0.7062035909114307

Test set metrics:
MSE: 0.5112948881138201
R-squared: 0.4142156385900858 

COMB4IMECF      0.015948
COMB4IMICF      0.029876
COMB4IMFFM      0.005352
COMB4DTBFM      0.045512
COMB4IRES0      0.003636
COMB4IRESINF    0.017675
COMB4IRESEXC    0.004834
COMB4IRESINC    0.053976
COMB4IFCHAR     0.034114
COMB4IMCAP      0.045894
COMB1PRSEX      0.559606
COMB1PRAGE      0.046449
COMB4P1A        0.137127
dtype: float64


In [10]:
#BIS: Jump Power

print("Gradient Boosting Machine: BIS Model, Jump Power")
train_test_gradient_boosting(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

Gradient Boosting Machine: BIS Model, Jump Power


Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.01681189640589394
R-squared: 0.8718059754407382

Test set metrics:
MSE: 0.037452008909673026
R-squared: 0.7116372174804657 

COMB4IMECF      0.006595
COMB4IMICF      0.062798
COMB4IMFFM      0.033718
COMB4DTBFM      0.016603
COMB4IRES0      0.015806
COMB4IRESINF    0.005151
COMB4IRESEXC    0.025356
COMB4IRESINC    0.032800
COMB4IFCHAR     0.049928
COMB4IMCAP      0.260186
COMB1PRSEX      0.099341
COMB1PRAGE      0.275947
COMB4P1A        0.115771
dtype: float64


In [11]:
#COMBO: Handgrip Strength (ARMS)

print("Gradient Boosting Machine: Combo Models, Handgrip Strength (Arms)")
train_test_gradient_boosting(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: Combo Models, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.22906453764103243
R-squared: 0.7931913547016374

Test set metrics:
MSE: 0.5762910073508551
R-squared: 0.3397503718985996 

COMB4DALM       0.083823
COMB4DABM       0.140756
COMB4DAFM       0.036571
COMB1PRSEX      0.402616
COMB1PRAGE      0.048345
COMB4P1A        0.089831
COMB4DLR3MD     0.031373
COMB4IMECF      0.009061
COMB4IMICF      0.008712
COMB4IMFFM      0.002666
COMB4DTBFM      0.032586
COMB4IRES0      0.005800
COMB4IRESINF    0.013195
COMB4IRESEXC    0.012868
COMB4IRESINC    0.033221
COMB4IFCHAR     0.016442
COMB4IMCAP      0.032133
dtype: float64


In [12]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("Gradient Boosting Machine: Combo Models, Handgrip Strength (TB)")
train_test_gradient_boosting(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: Combo Models, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}

Train set metrics:
MSE: 0.3694449717697456
R-squared: 0.6664502724392615

Test set metrics:
MSE: 0.4829958905563238
R-squared: 0.44663745738415206 

COMB4IALM        0.064305
COMB4DTBBM       0.028428
COMB4DTBFM       0.020744
COMB1PRSEX       0.577055
COMB1PRAGE       0.034077
COMB4P1A         0.128456
COMB4DLR3MD      0.020644
COMB4DLFNMD      0.003245
COMB4DLSL14MD    0.000671
COMB4IMECF       0.005494
COMB4IMICF       0.003776
COMB4IMFFM       0.002393
COMB4DTBFM       0.019991
COMB4IRES0       0.001999
COMB4IRESINF     0.003315
COMB4IRESEXC     0.001535
COMB4IRESINC     0.035553
COMB4IFCHAR      0.014924
COMB4IMCAP       0.033394
dtype: float64


In [13]:
#COMBO: Jump Power (LEGS)

print("Gradient Boosting Machine: Combo Models, Jump Power (Legs)")
train_test_gradient_boosting(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Gradient Boosting Machine: Combo Models, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.014620049179903784
R-squared: 0.8885192426614561

Test set metrics:
MSE: 0.0349800135575958
R-squared: 0.7306704143329912 

COMB4ILLM       0.085782
COMB4DLBM       0.207008
COMB4DLFM       0.010406
COMB1PRSEX      0.042241
COMB1PRAGE      0.255922
COMB4P1A        0.034626
COMB4DLFNMD     0.013003
COMB4IMECF      0.001431
COMB4IMICF      0.034040
COMB4IMFFM      0.005875
COMB4DTBFM      0.025427
COMB4IRES0      0.017369
COMB4IRESINF    0.000956
COMB4IRESEXC    0.009622
COMB4IRESINC    0.020102
COMB4IFCHAR     0.065163
COMB4IMCAP      0.171027
dtype: float64


In [14]:
#COMBO: Jump Power (TOTAL BODY)

print("Gradient Boosting Machine: Combo Models, Jump Power (TB)")
train_test_gradient_boosting(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Gradient Boosting Machine: Combo Models, Jump Power (TB)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.013778577231155123
R-squared: 0.8949356321668065

Test set metrics:
MSE: 0.033587214619444965
R-squared: 0.741394308430744 

COMB4IALM        0.227513
COMB4DTBBM       0.095138
COMB4DTBFM       0.011658
COMB1PRSEX       0.061042
COMB1PRAGE       0.257714
COMB4P1A         0.037141
COMB4DLR3MD      0.016317
COMB4DLFNMD      0.017099
COMB4DLSL14MD    0.008215
COMB4IMECF       0.003469
COMB4IMICF       0.027716
COMB4IMFFM       0.009540
COMB4DTBFM       0.011992
COMB4IRES0       0.013671
COMB4IRESINF     0.001814
COMB4IRESEXC     0.012855
COMB4IRESINC     0.020893
COMB4IFCHAR      0.049388
COMB4IMCAP       0.116826
dtype: float64
