# Including Race

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import GradientBoostingRegressor

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

In [2]:
#number of individuals before NA is removed
print(len(df))

544


In [3]:
#this is the number of people we have when race IS INCLUDED
df = df.dropna()
print(len(df))

422


In [4]:
#counting the total number of individuals who are white or black (ignore the other races for now...)
race = (df['COMB1PF7A'] == 1) | (df['COMB1PF7A'] == 2)
df = df[race]

print(len(df))

413


## Gradient Boosting Machines

In [5]:
#GBM function

def train_test_gradient_boosting(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)
    
    #dividing our data set into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #scaling the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #defining our parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [3, 4, 5]
    }

    gbm = GradientBoostingRegressor(random_state=42)

    #perforiming grid search to find best hyperparameters
    grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, y)
    
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

    best_gbm = grid_search.best_estimator_

    #fitting our model with the best hyperparameters
    gbm = GradientBoostingRegressor(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=random_state)
    gbm.fit(X_train, y_train)
    

    #evaluation of the model on the TRAINING set
    y_train_pred = gbm.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = gbm.predict(X_test)

    #evaluation of the model on TESTING set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance
    feature_importances = pd.Series(gbm.feature_importances_, index=X.columns)
    print(feature_importances)
    
    #counting frequencies
    if 'COMB1PF7A' in df.columns:
        white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
        white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

        black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
        black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
        print("White train frequency: ", white_train_counts.values[0])
        print("White test frequency: ", white_test_counts.values[0], "\n")
    
        print("Black train frequency: ", black_train_counts.values[0])
        print("Black test frequency: ", black_test_counts.values[0])

In [6]:
#DXA: Handgrip Strength (ARMS)

print("Gradient Boosting Machine: DXA Model, Handgrip Strength (Arms)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: DXA Model, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.30759254485534593
R-squared: 0.7026506587874803

Test set metrics:
MSE: 0.43884556491587356
R-squared: 0.540586653573935 

COMB1PF7A      0.011185
COMB4DALM      0.076758
COMB4DABM      0.158099
COMB4DAFM      0.052513
COMB1PRSEX     0.496622
COMB1PRAGE     0.051420
COMB4P1A       0.095902
COMB4DLR3MD    0.057501
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [7]:
#DXA: Handgrip Strength (TOTAL BODY)

print("Gradient Boosting Machine: DXA Model, Handgrip Strength (TB)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: DXA Model, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 200}

Train set metrics:
MSE: 0.2741335494319022
R-squared: 0.7349954292092489

Test set metrics:
MSE: 0.5189263722780566
R-squared: 0.45675262484946966 

COMB1PF7A        0.004218
COMB4IALM        0.092793
COMB4DTBBM       0.016918
COMB4DTBFM       0.050682
COMB1PRSEX       0.572398
COMB1PRAGE       0.050143
COMB4P1A         0.087066
COMB4DLR3MD      0.052919
COMB4DLFNMD      0.037105
COMB4DLSL14MD    0.035757
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [8]:
#DXA: Jump Power (LEGS)

print("Gradient Boosting Machine: DXA Model, Jump Power (Legs)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

Gradient Boosting Machine: DXA Model, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.012383768908507396
R-squared: 0.9037980707693022

Test set metrics:
MSE: 0.032069921869357886
R-squared: 0.7647982343819848 

COMB1PF7A      0.003218
COMB4ILLM      0.452152
COMB4DLBM      0.138877
COMB4DLFM      0.050823
COMB1PRSEX     0.026218
COMB1PRAGE     0.260616
COMB4P1A       0.055607
COMB4DLFNMD    0.012489
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [9]:
#DXA: Jump Power (TOTAL BODY)

print("Gradient Boosting Machine: DXA Model, Jump Power (TB)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

Gradient Boosting Machine: DXA Model, Jump Power (TB)
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.0068269740640997
R-squared: 0.9469654124986826

Test set metrics:
MSE: 0.03528869040503813
R-squared: 0.741191689726475 

COMB1PF7A        0.000894
COMB4IALM        0.534768
COMB4DTBBM       0.064900
COMB4DTBFM       0.025757
COMB1PRSEX       0.008826
COMB1PRAGE       0.245755
COMB4P1A         0.052332
COMB4DLR3MD      0.031199
COMB4DLFNMD      0.011196
COMB4DLSL14MD    0.024374
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [10]:
#BIS: Handgrip Strength

print("Gradient Boosting Machine: BIS Model, Handgrip Strength")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: BIS Model, Handgrip Strength
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.22561861462617427
R-squared: 0.7818947580282738

Test set metrics:
MSE: 0.5272904771451183
R-squared: 0.44799651173352195 

COMB1PF7A       0.010775
COMB4IMECF      0.012107
COMB4IMICF      0.043221
COMB4IMFFM      0.015318
COMB4DTBFM      0.065669
COMB4IRES0      0.003664
COMB4IRESINF    0.017841
COMB4IRESEXC    0.002430
COMB4IRESINC    0.048979
COMB4IFCHAR     0.035616
COMB4IMCAP      0.046684
COMB1PRSEX      0.526928
COMB1PRAGE      0.053941
COMB4P1A        0.116826
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [11]:
#BIS: Jump Power

print("Gradient Boosting Machine: BIS Model, Jump Power")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

Gradient Boosting Machine: BIS Model, Jump Power
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.005718993128256097
R-squared: 0.9555726389712118

Test set metrics:
MSE: 0.04098718761701438
R-squared: 0.6993987408354216 

COMB1PF7A       0.002103
COMB4IMECF      0.012232
COMB4IMICF      0.075410
COMB4IMFFM      0.039101
COMB4DTBFM      0.016607
COMB4IRES0      0.021411
COMB4IRESINF    0.007757
COMB4IRESEXC    0.017618
COMB4IRESINC    0.060327
COMB4IFCHAR     0.064841
COMB4IMCAP      0.323930
COMB1PRSEX      0.021199
COMB1PRAGE      0.219079
COMB4P1A        0.118385
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [12]:
#COMBO: Handgrip Strength (ARMS)

print("Gradient Boosting Machine: Combo Models, Handgrip Strength (Arms)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: Combo Models, Handgrip Strength (Arms)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.2845395340312267
R-squared: 0.7249359764785855

Test set metrics:
MSE: 0.48769533825248285
R-squared: 0.48944739267009385 

COMB1PF7A       0.009982
COMB4DALM       0.023756
COMB4DABM       0.132119
COMB4DAFM       0.025766
COMB1PRSEX      0.473516
COMB1PRAGE      0.034746
COMB4P1A        0.088712
COMB4DLR3MD     0.041250
COMB4IMECF      0.005664
COMB4IMICF      0.021694
COMB4IMFFM      0.005220
COMB4DTBFM      0.024941
COMB4IRES0      0.002062
COMB4IRESINF    0.006514
COMB4IRESEXC    0.001234
COMB4IRESINC    0.044248
COMB4IFCHAR     0.019590
COMB4IMCAP      0.038986
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [13]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("Gradient Boosting Machine: Combo Models, Handgrip Strength (TB)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Gradient Boosting Machine: Combo Models, Handgrip Strength (TB)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Train set metrics:
MSE: 0.2772666632404922
R-squared: 0.7319666518786208

Test set metrics:
MSE: 0.505791941810336
R-squared: 0.4705026388338487 

COMB1PF7A        0.004351
COMB4IALM        0.026819
COMB4DTBBM       0.009183
COMB4DTBFM       0.020601
COMB1PRSEX       0.556861
COMB1PRAGE       0.038465
COMB4P1A         0.103196
COMB4DLR3MD      0.030863
COMB4DLFNMD      0.018129
COMB4DLSL14MD    0.015006
COMB4IMECF       0.012293
COMB4IMICF       0.019550
COMB4IMFFM       0.005903
COMB4DTBFM       0.017735
COMB4IRES0       0.001922
COMB4IRESINF     0.007872
COMB4IRESEXC     0.002141
COMB4IRESINC     0.042218
COMB4IFCHAR      0.029950
COMB4IMCAP       0.036942
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [14]:
#COMBO: Jump Power (LEGS)

print("Gradient Boosting Machine: Combo Models, Jump Power (Legs)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Gradient Boosting Machine: Combo Models, Jump Power (Legs)
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.008955374607383744
R-squared: 0.9304311699791111

Test set metrics:
MSE: 0.036073281939255804
R-squared: 0.7354374719616583 

COMB1PF7A       0.000000
COMB4ILLM       0.161258
COMB4DLBM       0.128189
COMB4DLFM       0.008959
COMB1PRSEX      0.010048
COMB1PRAGE      0.201712
COMB4P1A        0.052146
COMB4DLFNMD     0.008060
COMB4IMECF      0.001404
COMB4IMICF      0.029147
COMB4IMFFM      0.003189
COMB4DTBFM      0.015516
COMB4IRES0      0.010090
COMB4IRESINF    0.005799
COMB4IRESEXC    0.011657
COMB4IRESINC    0.050516
COMB4IFCHAR     0.089367
COMB4IMCAP      0.212942
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21


In [15]:
#COMBO: Jump Power (TOTAL BODY)

print("RACE INCLUDED GBM: Combo Models, Jump Power (TB)")
train_test_gradient_boosting(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

RACE INCLUDED GBM: Combo Models, Jump Power (TB)
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Train set metrics:
MSE: 0.004426879466507499
R-squared: 0.9656102800127982

Test set metrics:
MSE: 0.035703682599807926
R-squared: 0.7381481245651638 

COMB1PF7A        0.000137
COMB4IALM        0.357451
COMB4DTBBM       0.036640
COMB4DTBFM       0.008823
COMB1PRSEX       0.002345
COMB1PRAGE       0.184958
COMB4P1A         0.044711
COMB4DLR3MD      0.015151
COMB4DLFNMD      0.008895
COMB4DLSL14MD    0.014610
COMB4IMECF       0.001805
COMB4IMICF       0.039013
COMB4IMFFM       0.001325
COMB4DTBFM       0.008684
COMB4IRES0       0.027316
COMB4IRESINF     0.003198
COMB4IRESEXC     0.020225
COMB4IRESINC     0.058927
COMB4IFCHAR      0.062526
COMB4IMCAP       0.103260
dtype: float64
White train frequency:  251
White test frequency:  62 

Black train frequency:  79
Black test frequency:  21
