In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('/Users/yunjuha/Desktop/SROP/DXA_BIS_Project/narrowed down list/narrowed_transformed_data.csv')
df = pd.DataFrame(data)
df = df.dropna()

# Random Forest

In [3]:
#DXA model, handgrip strength (TB)

print("Random Forest: DXA Model, Handgrip Strength (TB)")

print("UNTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['RA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = None, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

#evaluation of the model on TEST set
y_test_pred = rf.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: DXA Model, Handgrip Strength (TB)
UNTRANSFORMED
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error: 106.35842137330285

Train set metrics:
MSE: 29.705477744113786
R-squared: 0.7857792673202947

Test set metrics:
MSE: 106.35842137330285
R-squared: 0.27264227282817943 

RA4IALM       0.039156
RA4DTBBM      0.061252
RA4DTBFM      0.040352
RA1PRSEX      0.588070
RA1PF7A       0.004021
Age_40_50     0.002308
Age_51_61     0.004898
Age_61plus    0.004662
RA4P1A        0.079846
RA4DLR3MD     0.104782
RA4DLFNMD     0.041230
RA4DLSMD      0.029422
dtype: float64


In [4]:
#DXA model, jump power (TB)

print("Random Forest: DXA Model, Jump Power (TB)")

print("UNTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['jumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

#evaluation of the model on TEST set
y_test_pred = rf.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: DXA Model, Jump Power (TB)
UNTRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Mean Squared Error: 0.2963559029615469

Train set metrics:
MSE: 0.04023516205426254
R-squared: 0.9545238185384726

Test set metrics:
MSE: 0.2963559029615469
R-squared: 0.7174552907210019 

RA4IALM       0.576506
RA4DTBBM      0.099795
RA4DTBFM      0.069231
RA1PRSEX      0.003515
RA1PF7A       0.002603
Age_40_50     0.012069
Age_51_61     0.004328
Age_61plus    0.052080
RA4P1A        0.050211
RA4DLR3MD     0.031774
RA4DLFNMD     0.048601
RA4DLSMD      0.049288
dtype: float64


In [5]:
#BIS Model, handgrip strength

print("Lasso: BIS Model, Handgrip Strength")

print("UNTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['RA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)


Lasso: BIS Model, Handgrip Strength
UNTRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error: 92.20567574878139

Train set metrics:
MSE: 31.930101704878993
R-squared: 0.7697364155972187

Test set metrics:
MSE: 92.20567574878139
R-squared: 0.36942923861589083 

RA4IMECF      0.020377
RA4IMICF      0.025678
RA4IMFFM      0.016681
RA4DTBFM      0.052480
RA4IRES0      0.011143
RA4IRESINF    0.018147
RA4IRESEXC    0.009620
RA4IRESINC    0.056024
RA4IFCHAR     0.044276
RA4IMCAP      0.029440
RA1PRSEX      0.592103
RA1PF7A       0.004587
Age_40_50     0.002129
Age_51_61     0.003658
Age_61plus    0.009150
RA4P1A        0.104507
dtype: float64


In [6]:
#BIS Model, jumppower

print("Random Forest: BIS Model, Jump Power")

print("UNTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['jumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)


#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 1, min_samples_split = 10, n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: BIS Model, Jump Power
UNTRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Mean Squared Error: 0.41328474880878124

Train set metrics:
MSE: 0.10894447811185544
R-squared: 0.8768644488329762

Test set metrics:
MSE: 0.41328474880878124
R-squared: 0.6059757270406985 

RA4IMECF      0.021537
RA4IMICF      0.147514
RA4IMFFM      0.044766
RA4DTBFM      0.034409
RA4IRES0      0.011125
RA4IRESINF    0.012271
RA4IRESEXC    0.008621
RA4IRESINC    0.309988
RA4IFCHAR     0.018541
RA4IMCAP      0.177766
RA1PRSEX      0.002332
RA1PF7A       0.002595
Age_40_50     0.008825
Age_51_61     0.008302
Age_61plus    0.046535
RA4P1A        0.144871
dtype: float64


In [7]:
#Combo Models, handgrip strength (TB)

print("Random Forest: Combo Models, Handgrip Strength (TB)")

print("UNTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['RA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = None, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

#evaluation of the model on test set
y_test_pred = rf.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: Combo Models, Handgrip Strength (TB)
UNTRANSFORMED
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error: 97.37708391363199

Train set metrics:
MSE: 27.822316306363316
R-squared: 0.7993596657378562

Test set metrics:
MSE: 97.37708391363199
R-squared: 0.3340633161012897 

RA4IALM       0.016214
RA4DTBBM      0.043254
RA4DTBFM      0.013964
RA1PRSEX      0.574271
RA1PF7A       0.001706
Age_40_50     0.001392
Age_51_61     0.002278
Age_61plus    0.003839
RA4P1A        0.066079
RA4DLR3MD     0.082476
RA4DLFNMD     0.023769
RA4DLSMD      0.017661
RA4IMECF      0.010989
RA4IMICF      0.012984
RA4IMFFM      0.006529
RA4DTBFM      0.014905
RA4IRES0      0.007229
RA4IRESINF    0.011148
RA4IRESEXC    0.005892
RA4IRESINC    0.037492
RA4IFCHAR     0.027616
RA4IMCAP      0.018315
dtype: float64


In [8]:
#Combo Models, jumppower (TB)

print("Random Forest: Combo Models, Jump Power (TB)")

print("UNTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['jumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = None, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: Combo Models, Jump Power (TB)
UNTRANSFORMED
Best parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Mean Squared Error: 0.35328017474411655

Train set metrics:
MSE: 0.05946397207712237
R-squared: 0.9327902698402097

Test set metrics:
MSE: 0.35328017474411655
R-squared: 0.6631838837370427 

RA4IALM       0.369678
RA4DTBBM      0.063626
RA4DTBFM      0.016674
RA1PRSEX      0.001513
RA1PF7A       0.001721
Age_40_50     0.005457
Age_51_61     0.001822
Age_61plus    0.031240
RA4P1A        0.044128
RA4DLR3MD     0.024145
RA4DLFNMD     0.039445
RA4DLSMD      0.026234
RA4IMECF      0.009535
RA4IMICF      0.066875
RA4IMFFM      0.013710
RA4DTBFM      0.016992
RA4IRES0      0.008762
RA4IRESINF    0.007292
RA4IRESEXC    0.009436
RA4IRESINC    0.117095
RA4IFCHAR     0.018524
RA4IMCAP      0.106096
dtype: float64
