In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('/Users/yunjuha/Desktop/SROP/DXA_BIS_Project/narrowed down list/narrowed_transformed_data.csv')
df = pd.DataFrame(data)
df = df.dropna()

# Random Forest

In [21]:
#DXA model, handgrip strength (TB)

print("Random Forest: DXA Model, Handgrip Strength (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

#evaluation of the model on TEST set
y_test_pred = rf.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: DXA Model, Handgrip Strength (TB)

TRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error: 0.9243882885881357

Train set metrics:
MSE: 0.2419378087447967
R-squared: 0.7823987512947201

Test set metrics:
MSE: 0.9243882885881357
R-squared: 0.2757662608334903 

RA4IALM       0.037424
RA4DTBBM      0.070920
RA4DTBFM      0.040448
RA1PRSEX      0.591522
RA1PF7A       0.003696
Age_40_50     0.002346
Age_51_61     0.006429
Age_61plus    0.003839
RA4P1A        0.065256
RA4DLR3MD     0.096608
RA4DLFNMD     0.048646
RA4DLSMD      0.032865
dtype: float64


In [16]:
#DXA model, jump power (TB)

print("Random Forest: DXA Model, Jump Power (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

#evaluation of the model on TEST set
y_test_pred = rf.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: DXA Model, Jump Power (TB)

TRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Mean Squared Error: 0.03902086089853464

Train set metrics:
MSE: 0.009966822194863832
R-squared: 0.9146311523798262

Test set metrics:
MSE: 0.03902086089853464
R-squared: 0.72722619179498 

RA4IALM       0.527402
RA4DTBBM      0.162974
RA4DTBFM      0.065925
RA1PRSEX      0.003542
RA1PF7A       0.002298
Age_40_50     0.005692
Age_51_61     0.008397
Age_61plus    0.041519
RA4P1A        0.040242
RA4DLR3MD     0.040707
RA4DLFNMD     0.049002
RA4DLSMD      0.052300
dtype: float64


In [23]:
#BIS Model, handgrip strength

print("Random Forest: BIS Model, Handgrip Strength")

print("\nTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = None, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: BIS Model, Handgrip Strength

TRANSFORMED
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error: 0.837406031478118

Train set metrics:
MSE: 0.2522166357586576
R-squared: 0.7731538730136202

Test set metrics:
MSE: 0.837406031478118
R-squared: 0.3439145553171288 

RA4IMECF      0.019394
RA4IMICF      0.023596
RA4IMFFM      0.016541
RA4DTBFM      0.054769
RA4IRES0      0.010493
RA4IRESINF    0.020708
RA4IRESEXC    0.011086
RA4IRESINC    0.069374
RA4IFCHAR     0.047634
RA4IMCAP      0.030829
RA1PRSEX      0.592403
RA1PF7A       0.003688
Age_40_50     0.002506
Age_51_61     0.003920
Age_61plus    0.005922
RA4P1A        0.087136
dtype: float64


In [18]:
#BIS Model, jumppower

print("Random Forest: BIS Model, Jump Power")

print("\nTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)


#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = None, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: BIS Model, Jump Power

TRANSFORMED
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Mean Squared Error: 0.055120228087212335

Train set metrics:
MSE: 0.00869333592395215
R-squared: 0.9255389475910086

Test set metrics:
MSE: 0.055120228087212335
R-squared: 0.6146841925508917 

RA4IMECF      0.046467
RA4IMICF      0.240372
RA4IMFFM      0.058127
RA4DTBFM      0.046487
RA4IRES0      0.021495
RA4IRESINF    0.019006
RA4IRESEXC    0.021168
RA4IRESINC    0.190039
RA4IFCHAR     0.032146
RA4IMCAP      0.164475
RA1PRSEX      0.002918
RA1PF7A       0.002777
Age_40_50     0.003158
Age_51_61     0.007113
Age_61plus    0.030014
RA4P1A        0.114238
dtype: float64


In [19]:
#Combo Models, handgrip strength (TB)

print("Random Forest: Combo Models, Handgrip Strength (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 4, min_samples_split = 10, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

#evaluation of the model on test set
y_test_pred = rf.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: Combo Models, Handgrip Strength (TB)

TRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error: 0.8537715821874045

Train set metrics:
MSE: 0.2210954107784993
R-squared: 0.8011446093605121

Test set metrics:
MSE: 0.8537715821874045
R-squared: 0.3310925798226009 

RA4IALM       0.014710
RA4DTBBM      0.050404
RA4DTBFM      0.015758
RA1PRSEX      0.575571
RA1PF7A       0.002450
Age_40_50     0.001536
Age_51_61     0.003098
Age_61plus    0.002681
RA4P1A        0.050878
RA4DLR3MD     0.070707
RA4DLFNMD     0.024991
RA4DLSMD      0.019516
RA4IMECF      0.009854
RA4IMICF      0.012228
RA4IMFFM      0.008013
RA4DTBFM      0.015559
RA4IRES0      0.007388
RA4IRESINF    0.013753
RA4IRESEXC    0.006901
RA4IRESINC    0.044843
RA4IFCHAR     0.029371
RA4IMCAP      0.019791
dtype: float64


In [22]:
#Combo Models, jumppower (TB)

print("Random Forest: Combo Models, Jump Power (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = 10, min_samples_leaf = 2, min_samples_split = 5, n_estimators = 300, random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: Combo Models, Jump Power (TB)

TRANSFORMED
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Mean Squared Error: 0.0456739869707685

Train set metrics:
MSE: 0.009005179637623102
R-squared: 0.9228679118332519

Test set metrics:
MSE: 0.0456739869707685
R-squared: 0.6807177731337328 

RA4IALM       0.294582
RA4DTBBM      0.104087
RA4DTBFM      0.016343
RA1PRSEX      0.000998
RA1PF7A       0.001089
Age_40_50     0.003514
Age_51_61     0.003521
Age_61plus    0.020745
RA4P1A        0.039967
RA4DLR3MD     0.025993
RA4DLFNMD     0.030308
RA4DLSMD      0.026627
RA4IMECF      0.010723
RA4IMICF      0.126124
RA4IMFFM      0.020974
RA4DTBFM      0.017241
RA4IRES0      0.013379
RA4IRESINF    0.012972
RA4IRESEXC    0.012568
RA4IRESINC    0.101770
RA4IFCHAR     0.027439
RA4IMCAP      0.089036
dtype: float64
