# Including Race

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

In [4]:
#number of individuals before NA is removed
print(len(df))

544


In [5]:
#this is the number of people we have when race IS INCLUDED
df = df.dropna()
print(len(df))

422


In [6]:
#counting the total number of individuals who are white or black (ignore the other races for now...)
race = (df['COMB1PF7A'] == 1) | (df['COMB1PF7A'] == 2)
df = df[race]

print(len(df))

413


# Random Forest

In [7]:
#Random Forest Function

def train_test_random_forest(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)
    
    #dividing our data set into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #random forest model
    param_grid = { #our hyperparameters
        'n_estimators': [5, 10, 20],
        'max_depth': [None, 1, 2, 4, 8, 16],
        'min_samples_split': [1, 2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': [1, 2, 4, 8, X_train.shape[1]]
    }

    rf_model = RandomForestRegressor(random_state=42)

    #performing gridsearch to find best hyperparameters
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_rf_model = grid_search.best_estimator_

    y_pred = best_rf_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("Best parameters:", best_params)
    print("Mean Squared Error:", mse)

    #fitting with BEST HYPERPARAMETERS
    rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
    rf.fit(X_train, y_train)

    #evaluation of the model on the TRAINING set
    y_train_pred = rf.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = rf.predict(X_test)

    #evaluation of the model on TESTING set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
    print(feature_importances)
        
    #count the frequencies in training data
    white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
    white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

    black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
    print("white train frequency: ", white_train_counts.values[0])
    print("white test frequency: ", white_test_counts.values[0], "\n")
    
    print("black train frequency: ", black_train_counts.values[0])
    print("black test frequency: ", black_test_counts.values[0])

In [44]:
#DXA: Handgrip Strength (ARMS)

print("Random Forest: DXA Model, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (Arms)
Best parameters: {'max_depth': 4, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 1, 'n_estimators': 10}
Mean Squared Error: 0.44657126802326336

Train set metrics:
MSE: 0.39884262929731285
R-squared: 0.6144393124846439

Test set metrics:
MSE: 0.44657126802326336
R-squared: 0.53249886278416 

COMB1PF7A      0.004592
COMB4DALM      0.167855
COMB4DABM      0.263230
COMB4DAFM      0.044238
COMB1PRSEX     0.266296
COMB1PRAGE     0.033722
COMB4P1A       0.169481
COMB4DLR3MD    0.050587
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [45]:
#DXA: Handgrip Strength (TOTAL BODY)

print("Random Forest: DXA Model, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (TB)
Best parameters: {'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.5154613249487807

Train set metrics:
MSE: 0.35498232883997266
R-squared: 0.6568390119068344

Test set metrics:
MSE: 0.5154613249487807
R-squared: 0.46038007176094176 

COMB1PF7A        0.000331
COMB4IALM        0.079951
COMB4DTBBM       0.033275
COMB4DTBFM       0.034865
COMB1PRSEX       0.521460
COMB1PRAGE       0.030248
COMB4P1A         0.179256
COMB4DLR3MD      0.075413
COMB4DLFNMD      0.021641
COMB4DLSL14MD    0.023561
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [46]:
#DXA: Jump Power (LEGS)

print("Random Forest: DXA Model, Jump Power (Legs)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (Legs)
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.0320661479872633

Train set metrics:
MSE: 0.009412340555681228
R-squared: 0.9268812809151483

Test set metrics:
MSE: 0.0320661479872633
R-squared: 0.7648259121460748 

COMB1PF7A      0.003438
COMB4ILLM      0.295291
COMB4DLBM      0.240072
COMB4DLFM      0.048140
COMB1PRSEX     0.011164
COMB1PRAGE     0.242393
COMB4P1A       0.125052
COMB4DLFNMD    0.034450
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [47]:
#DXA: Jump Power (TOTAL BODY)

print("Random Forest: DXA Model, Jump Power (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (TB)
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.03683804220376537

Train set metrics:
MSE: 0.01128241386277427
R-squared: 0.9123538247526233

Test set metrics:
MSE: 0.03683804220376537
R-squared: 0.7298286973216734 

COMB1PF7A        0.001086
COMB4IALM        0.516556
COMB4DTBBM       0.106744
COMB4DTBFM       0.031383
COMB1PRSEX       0.001677
COMB1PRAGE       0.210139
COMB4P1A         0.049992
COMB4DLR3MD      0.028154
COMB4DLFNMD      0.023678
COMB4DLSL14MD    0.030592
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [48]:
#BIS: Handgrip Strength

print("Random Forest: BIS Model, Handgrip Strength")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

Random Forest: BIS Model, Handgrip Strength
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.5057526593300937

Train set metrics:
MSE: 0.2470638210624888
R-squared: 0.7611637028949205

Test set metrics:
MSE: 0.5057526593300937
R-squared: 0.4705437624024724 

COMB1PF7A       0.003704
COMB4IMECF      0.105537
COMB4IMICF      0.056053
COMB4IMFFM      0.045998
COMB4DTBFM      0.047665
COMB4IRES0      0.018222
COMB4IRESINF    0.025672
COMB4IRESEXC    0.017271
COMB4IRESINC    0.034851
COMB4IFCHAR     0.053925
COMB4IMCAP      0.061525
COMB1PRSEX      0.340812
COMB1PRAGE      0.050574
COMB4P1A        0.138191
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [49]:
#BIS: Jump Power

print("Random Forest: BIS Model, Jump Power")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

Random Forest: BIS Model, Jump Power
Best parameters: {'max_depth': 16, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.045631931890548406

Train set metrics:
MSE: 0.009124185488330184
R-squared: 0.9291197814557817

Test set metrics:
MSE: 0.045631931890548406
R-squared: 0.6653340474934905 

COMB1PF7A       0.006428
COMB4IMECF      0.046096
COMB4IMICF      0.132052
COMB4IMFFM      0.087147
COMB4DTBFM      0.030691
COMB4IRES0      0.021064
COMB4IRESINF    0.038150
COMB4IRESEXC    0.023639
COMB4IRESINC    0.104663
COMB4IFCHAR     0.119930
COMB4IMCAP      0.162246
COMB1PRSEX      0.005981
COMB1PRAGE      0.131306
COMB4P1A        0.090609
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [50]:
#COMBO: Handgrip Strength (ARMS)

print("Random Forest: Combo Models, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (Arms)
Best parameters: {'max_depth': 4, 'max_features': 18, 'min_samples_leaf': 4, 'min_samples_split': 20, 'n_estimators': 10}
Mean Squared Error: 0.5052210096742265

Train set metrics:
MSE: 0.3686615752374748
R-squared: 0.6436153009534565

Test set metrics:
MSE: 0.5052210096742265
R-squared: 0.47110032937512736 

COMB1PF7A       0.000000
COMB4DALM       0.032614
COMB4DABM       0.117028
COMB4DAFM       0.001651
COMB1PRSEX      0.597383
COMB1PRAGE      0.030702
COMB4P1A        0.058485
COMB4DLR3MD     0.034424
COMB4IMECF      0.013326
COMB4IMICF      0.020323
COMB4IMFFM      0.006799
COMB4DTBFM      0.021133
COMB4IRES0      0.000000
COMB4IRESINF    0.003970
COMB4IRESEXC    0.001865
COMB4IRESINC    0.006343
COMB4IFCHAR     0.023422
COMB4IMCAP      0.030530
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [51]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("Random Forest: Combo Models, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (TB)
Best parameters: {'max_depth': None, 'max_features': 20, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 10}
Mean Squared Error: 0.4749309162151855

Train set metrics:
MSE: 0.33852830359120845
R-squared: 0.6727450982208413

Test set metrics:
MSE: 0.4749309162151855
R-squared: 0.5028100566962725 

COMB1PF7A        0.000000
COMB4IALM        0.033566
COMB4DTBBM       0.029337
COMB4DTBFM       0.011467
COMB1PRSEX       0.627375
COMB1PRAGE       0.036862
COMB4P1A         0.054138
COMB4DLR3MD      0.032586
COMB4DLFNMD      0.017118
COMB4DLSL14MD    0.021187
COMB4IMECF       0.018543
COMB4IMICF       0.000957
COMB4IMFFM       0.007178
COMB4DTBFM       0.012871
COMB4IRES0       0.004131
COMB4IRESINF     0.003962
COMB4IRESEXC     0.004582
COMB4IRESINC     0.014221
COMB4IFCHAR      0.035849
COMB4IMCAP       0.034069
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequ

In [52]:
#COMBO: Jump Power (LEGS)

print("Random Forest: Combo Models, Jump Power (Legs)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (Legs)
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.04151959869142085

Train set metrics:
MSE: 0.008525769592119382
R-squared: 0.9337685086827769

Test set metrics:
MSE: 0.04151959869142085
R-squared: 0.6954940221009918 

COMB1PF7A       0.000891
COMB4ILLM       0.096925
COMB4DLBM       0.111705
COMB4DLFM       0.018994
COMB1PRSEX      0.027637
COMB1PRAGE      0.121553
COMB4P1A        0.058318
COMB4DLFNMD     0.017589
COMB4IMECF      0.021737
COMB4IMICF      0.098980
COMB4IMFFM      0.045864
COMB4DTBFM      0.017900
COMB4IRES0      0.022065
COMB4IRESINF    0.025409
COMB4IRESEXC    0.029209
COMB4IRESINC    0.056959
COMB4IFCHAR     0.095746
COMB4IMCAP      0.132519
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [53]:
#COMBO: Jump Power (TOTAL BODY)

print("Random Forest: Combo Models, Jump Power (TB)")

train_test_random_forest(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (TB)
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.043387997202312685

Train set metrics:
MSE: 0.0072925875517212526
R-squared: 0.9433483459888049

Test set metrics:
MSE: 0.043387997202312685
R-squared: 0.68179113157229 

COMB1PF7A        0.001211
COMB4IALM        0.230469
COMB4DTBBM       0.074969
COMB4DTBFM       0.022043
COMB1PRSEX       0.002638
COMB1PRAGE       0.119983
COMB4P1A         0.033868
COMB4DLR3MD      0.015868
COMB4DLFNMD      0.014839
COMB4DLSL14MD    0.015556
COMB4IMECF       0.010635
COMB4IMICF       0.077736
COMB4IMFFM       0.012394
COMB4DTBFM       0.013801
COMB4IRES0       0.022714
COMB4IRESINF     0.013724
COMB4IRESEXC     0.022371
COMB4IRESINC     0.026795
COMB4IFCHAR      0.073830
COMB4IMCAP       0.194555
dtype: float64
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  2