# Excluding Race

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

df = df.drop('COMB1PF7A', axis=1)

In [2]:
#number of individuals before NA is removed
print(len(df))

544


In [3]:
#this is the total number of people when race is removed
df = df.dropna()
print(len(df))

490


# Random Forest

In [None]:
#Random Forest Function

def train_test_random_forest(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)
    
    #dividing our data set into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #random forest model
    param_grid = { #our hyperparameters
        'n_estimators': [5, 10, 20],
        'max_depth': [None, 1, 2, 4, 8, 16],
        'min_samples_split': [1, 2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': [1, 2, 4, 8, X_train.shape[1]]
    }

    rf_model = RandomForestRegressor(random_state=42)

    #performing gridsearch to find best hyperparameters
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_rf_model = grid_search.best_estimator_

    y_pred = best_rf_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print("Best parameters:", best_params)
    print("Mean Squared Error:", mse)

    #fitting with BEST HYPERPARAMETERS
    rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
    rf.fit(X_train, y_train)

    #evaluation of the model on the TRAINING set
    y_train_pred = rf.predict(X_train)
    mse_train = metrics.mean_squared_error(y_train, y_train_pred)
    r2_train_11 = metrics.r2_score(y_train, y_train_pred)
    print("\nTrain set metrics:")
    print("MSE:", mse_train)
    print("R-squared:", r2_train_11)

    y_test_pred = rf.predict(X_test)

    #evaluation of the model on TESTING set
    mse_test = metrics.mean_squared_error(y_test, y_test_pred)
    r2_test_11 = metrics.r2_score(y_test, y_test_pred)
    print("\nTest set metrics:")
    print("MSE:", mse_test)
    print("R-squared:", r2_test_11, "\n")

    #feature importance??
    feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
    print(feature_importances)

In [94]:
#DXA: Handgrip Strength (ARMS)

print("Random Forest: DXA Model, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (Arms)
Best parameters: {'max_depth': None, 'max_features': 1, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.563911553921147

Train set metrics:
MSE: 0.22113343604966268
R-squared: 0.8003518710902774

Test set metrics:
MSE: 0.563911553921147
R-squared: 0.35393336177490575 

COMB4DALM      0.226729
COMB4DABM      0.272234
COMB4DAFM      0.119894
COMB1PRSEX     0.050189
COMB1PRAGE     0.075460
COMB4P1A       0.160732
COMB4DLR3MD    0.094761
dtype: float64


In [95]:
#DXA: Handgrip Strength (TOTAL BODY)

print("Random Forest: DXA Model, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

Random Forest: DXA Model, Handgrip Strength (TB)
Best parameters: {'max_depth': 4, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.512220146761612

Train set metrics:
MSE: 0.4207880156699591
R-squared: 0.6200957146196774

Test set metrics:
MSE: 0.512220146761612
R-squared: 0.4131555809624118 

COMB4IALM        0.112682
COMB4DTBBM       0.193642
COMB4DTBFM       0.066693
COMB1PRSEX       0.218109
COMB1PRAGE       0.032880
COMB4P1A         0.237244
COMB4DLR3MD      0.054533
COMB4DLFNMD      0.049660
COMB4DLSL14MD    0.034557
dtype: float64


In [96]:
#DXA: Jump Power (LEGS)

print("Random Forest: DXA Model, Jump Power (Legs)")

train_test_random_forest(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (Legs)
Best parameters: {'max_depth': 16, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.03432614078384149

Train set metrics:
MSE: 0.008457943477384067
R-squared: 0.9355065135019212

Test set metrics:
MSE: 0.03432614078384149
R-squared: 0.7357049259104158 

COMB4ILLM      0.267729
COMB4DLBM      0.262804
COMB4DLFM      0.060045
COMB1PRSEX     0.058591
COMB1PRAGE     0.217393
COMB4P1A       0.070852
COMB4DLFNMD    0.062587
dtype: float64


In [97]:
#DXA: Jump Power (TOTAL BODY)

print("Random Forest: DXA Model, Jump Power (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

Random Forest: DXA Model, Jump Power (TB)
Best parameters: {'max_depth': None, 'max_features': 8, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.034639138417018885

Train set metrics:
MSE: 0.01659683698601998
R-squared: 0.8734458459162283

Test set metrics:
MSE: 0.034639138417018885
R-squared: 0.7332949919428484 

COMB4IALM        0.435438
COMB4DTBBM       0.122820
COMB4DTBFM       0.047682
COMB1PRSEX       0.045840
COMB1PRAGE       0.239257
COMB4P1A         0.038462
COMB4DLR3MD      0.024423
COMB4DLFNMD      0.023693
COMB4DLSL14MD    0.022386
dtype: float64


In [98]:
#BIS: Handgrip Strength

print("Random Forest: BIS Model, Handgrip Strength")

train_test_random_forest(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

Random Forest: BIS Model, Handgrip Strength
Best parameters: {'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 8, 'min_samples_split': 1, 'n_estimators': 5}
Mean Squared Error: 0.5335717482503435

Train set metrics:
MSE: 0.4202205134174768
R-squared: 0.6206080783982364

Test set metrics:
MSE: 0.5335717482503435
R-squared: 0.3886933097097188 

COMB4IMECF      0.005497
COMB4IMICF      0.014433
COMB4IMFFM      0.079380
COMB4DTBFM      0.047048
COMB4IRES0      0.003911
COMB4IRESINF    0.025725
COMB4IRESEXC    0.007084
COMB4IRESINC    0.014386
COMB4IFCHAR     0.026523
COMB4IMCAP      0.034890
COMB1PRSEX      0.240722
COMB1PRAGE      0.058735
COMB4P1A        0.441668
dtype: float64


In [99]:
#BIS: Jump Power

print("Random Forest: BIS Model, Jump Power")

train_test_random_forest(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

Random Forest: BIS Model, Jump Power
Best parameters: {'max_depth': 16, 'max_features': 13, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.03725472079599646

Train set metrics:
MSE: 0.01183245899256238
R-squared: 0.9097751674131647

Test set metrics:
MSE: 0.03725472079599646
R-squared: 0.7131562427897626 

COMB4IMECF      0.023734
COMB4IMICF      0.080625
COMB4IMFFM      0.041055
COMB4DTBFM      0.028018
COMB4IRES0      0.022546
COMB4IRESINF    0.011644
COMB4IRESEXC    0.011550
COMB4IRESINC    0.029357
COMB4IFCHAR     0.061855
COMB4IMCAP      0.243573
COMB1PRSEX      0.103335
COMB1PRAGE      0.218421
COMB4P1A        0.124288
dtype: float64


In [100]:
#COMBO: Handgrip Strength (ARMS)

print("Random Forest: Combo Models, Handgrip Strength (Arms)")

train_test_random_forest(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (Arms)
Best parameters: {'max_depth': 16, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.5476929171400978

Train set metrics:
MSE: 0.09392989992467701
R-squared: 0.9151963217157821

Test set metrics:
MSE: 0.5476929171400978
R-squared: 0.37251485752342417 

COMB4DALM       0.079282
COMB4DABM       0.099317
COMB4DAFM       0.042023
COMB1PRSEX      0.079703
COMB1PRAGE      0.043118
COMB4P1A        0.082755
COMB4DLR3MD     0.069634
COMB4IMECF      0.047458
COMB4IMICF      0.068073
COMB4IMFFM      0.065056
COMB4DTBFM      0.046821
COMB4IRES0      0.036174
COMB4IRESINF    0.042292
COMB4IRESEXC    0.039163
COMB4IRESINC    0.058239
COMB4IFCHAR     0.057498
COMB4IMCAP      0.043392
dtype: float64


In [101]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("Random Forest: Combo Models, Handgrip Strength (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

Random Forest: Combo Models, Handgrip Strength (TB)
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.6058466408382025

Train set metrics:
MSE: 0.16155274385354995
R-squared: 0.8541437079494875

Test set metrics:
MSE: 0.6058466408382025
R-squared: 0.30588884053786136 

COMB4IALM        0.073413
COMB4DTBBM       0.080002
COMB4DTBFM       0.034977
COMB1PRSEX       0.087965
COMB1PRAGE       0.035089
COMB4P1A         0.203463
COMB4DLR3MD      0.083524
COMB4DLFNMD      0.021510
COMB4DLSL14MD    0.028459
COMB4IMECF       0.041587
COMB4IMICF       0.030701
COMB4IMFFM       0.046991
COMB4DTBFM       0.047101
COMB4IRES0       0.027537
COMB4IRESINF     0.021963
COMB4IRESEXC     0.013692
COMB4IRESINC     0.052320
COMB4IFCHAR      0.039299
COMB4IMCAP       0.030405
dtype: float64


In [102]:
#COMBO: Jump Power (LEGS)

print("Random Forest: Combo Models, Jump Power (Legs)")

train_test_random_forest(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (Legs)
Best parameters: {'max_depth': None, 'max_features': 17, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 20}
Mean Squared Error: 0.03315274697233576

Train set metrics:
MSE: 0.013733158239729065
R-squared: 0.8952819609307815

Test set metrics:
MSE: 0.03315274697233576
R-squared: 0.7447395041434036 

COMB4ILLM       0.110900
COMB4DLBM       0.272595
COMB4DLFM       0.010917
COMB1PRSEX      0.004399
COMB1PRAGE      0.193647
COMB4P1A        0.039944
COMB4DLFNMD     0.013584
COMB4IMECF      0.010851
COMB4IMICF      0.046590
COMB4IMFFM      0.005961
COMB4DTBFM      0.035148
COMB4IRES0      0.010326
COMB4IRESINF    0.005999
COMB4IRESEXC    0.012256
COMB4IRESINC    0.030904
COMB4IFCHAR     0.058250
COMB4IMCAP      0.137728
dtype: float64


In [103]:
#COMBO: Jump Power (TOTAL BODY)

print("Random Forest: Combo Models, Jump Power (TB)")

train_test_random_forest(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

Random Forest: Combo Models, Jump Power (TB)
Best parameters: {'max_depth': 8, 'max_features': 19, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 20}
Mean Squared Error: 0.03092390614062327

Train set metrics:
MSE: 0.011644069170392308
R-squared: 0.9112116769482531

Test set metrics:
MSE: 0.03092390614062327
R-squared: 0.7619005260147769 

COMB4IALM        0.312205
COMB4DTBBM       0.084248
COMB4DTBFM       0.013182
COMB1PRSEX       0.042987
COMB1PRAGE       0.191255
COMB4P1A         0.038271
COMB4DLR3MD      0.014858
COMB4DLFNMD      0.013623
COMB4DLSL14MD    0.013138
COMB4IMECF       0.007661
COMB4IMICF       0.023891
COMB4IMFFM       0.012707
COMB4DTBFM       0.021410
COMB4IRES0       0.015832
COMB4IRESINF     0.004292
COMB4IRESEXC     0.012097
COMB4IRESINC     0.017812
COMB4IFCHAR      0.051679
COMB4IMCAP       0.108852
dtype: float64
