In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('/Users/yunjuha/Desktop/SROP/DXA_BIS_Project/narrowed down list/narrowed_transformed_data.csv')
df = pd.DataFrame(data)
df = df.dropna()

# Random Forest

In [2]:
#DXA model, handgrip strength (Arms)

print("MLR: DXA Model, Handgrip Strength (Arms)")

print("\nTRANSFORMED")
columns = ['RA4DALM', 'RA4DABM', 'RA4DAFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

MLR: DXA Model, Handgrip Strength (Arms)

TRANSFORMED
Best parameters: {'max_depth': 1, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.862904326842523

Train set metrics:
MSE: 0.5532649628488784
R-squared: 0.5023880418248783

Test set metrics:
MSE: 0.862904326842523
R-squared: 0.32393731639025036 

RA4DALM       0.0
RA4DABM       0.2
RA4DAFM       0.0
RA1PRSEX      0.8
RA1PF7A       0.0
Age_40_50     0.0
Age_51_61     0.0
Age_61plus    0.0
RA4P1A        0.0
RA4DLR3MD     0.0
dtype: float64


In [3]:
#DXA model, handgrip strength (TB)

print("Random Forest: DXA Model, Handgrip Strength (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: DXA Model, Handgrip Strength (TB)

TRANSFORMED
Best parameters: {'max_depth': 1, 'max_features': 12, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.8896080023624383

Train set metrics:
MSE: 0.5779601312104581
R-squared: 0.4801769641116007

Test set metrics:
MSE: 0.8896080023624383
R-squared: 0.3030156939430697 

RA4IALM       0.0
RA4DTBBM      0.0
RA4DTBFM      0.0
RA1PRSEX      1.0
RA1PF7A       0.0
Age_40_50     0.0
Age_51_61     0.0
Age_61plus    0.0
RA4P1A        0.0
RA4DLR3MD     0.0
RA4DLFNMD     0.0
RA4DLSMD      0.0
dtype: float64


In [4]:
#DXA model, jump power (Legs)

print("MLR: DXA Model, Jump Power (Legs)")

print("\nTRANSFORMED")
columns = ['RA4ILLM', 'RA4DLBM', 'RA4DLFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLFNMD']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

MLR: DXA Model, Jump Power (Legs)

TRANSFORMED
Best parameters: {'max_depth': 16, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.03796940937599663

Train set metrics:
MSE: 0.006697000671018316
R-squared: 0.9426381630403363

Test set metrics:
MSE: 0.03796940937599663
R-squared: 0.7345763227797575 

RA4ILLM       0.223683
RA4DLBM       0.177594
RA4DLFM       0.098263
RA1PRSEX      0.081673
RA1PF7A       0.013352
Age_40_50     0.014337
Age_51_61     0.024715
Age_61plus    0.051677
RA4P1A        0.203243
RA4DLFNMD     0.111463
dtype: float64


In [5]:
#DXA model, jump power (TB)

print("Random Forest: DXA Model, Jump Power (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: DXA Model, Jump Power (TB)

TRANSFORMED
Best parameters: {'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.048457477054174414

Train set metrics:
MSE: 0.0061830542366412755
R-squared: 0.9470402697479431

Test set metrics:
MSE: 0.048457477054174414
R-squared: 0.6612598942172274 

RA4IALM       0.270152
RA4DTBBM      0.217418
RA4DTBFM      0.047394
RA1PRSEX      0.047974
RA1PF7A       0.006584
Age_40_50     0.008011
Age_51_61     0.011192
Age_61plus    0.036451
RA4P1A        0.096086
RA4DLR3MD     0.113701
RA4DLFNMD     0.079603
RA4DLSMD      0.065434
dtype: float64


In [6]:
#BIS Model, handgrip strength

print("Random Forest: BIS Model, Handgrip Strength")

print("\nTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: BIS Model, Handgrip Strength

TRANSFORMED
Best parameters: {'max_depth': 2, 'max_features': 16, 'min_samples_leaf': 8, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.8300541962449599

Train set metrics:
MSE: 0.49538576745280893
R-squared: 0.5544451603713594

Test set metrics:
MSE: 0.8300541962449599
R-squared: 0.3496745235844523 

RA4IMECF      0.012043
RA4IMICF      0.009000
RA4IMFFM      0.003927
RA4DTBFM      0.008470
RA4IRES0      0.000000
RA4IRESINF    0.006229
RA4IRESEXC    0.000000
RA4IRESINC    0.045768
RA4IFCHAR     0.002883
RA4IMCAP      0.010221
RA1PRSEX      0.847180
RA1PF7A       0.000000
Age_40_50     0.000000
Age_51_61     0.000000
Age_61plus    0.000000
RA4P1A        0.054279
dtype: float64


In [7]:
#BIS Model, jumppower

print("Random Forest: BIS Model, Jump Power")

print("\nTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: BIS Model, Jump Power

TRANSFORMED
Best parameters: {'max_depth': 16, 'max_features': 16, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.06158612031343274

Train set metrics:
MSE: 0.006331067445474895
R-squared: 0.9457724918321168

Test set metrics:
MSE: 0.06158612031343274
R-squared: 0.5694846249135124 

RA4IMECF      0.025779
RA4IMICF      0.263542
RA4IMFFM      0.080792
RA4DTBFM      0.055144
RA4IRES0      0.019343
RA4IRESINF    0.018219
RA4IRESEXC    0.024008
RA4IRESINC    0.162955
RA4IFCHAR     0.038624
RA4IMCAP      0.174097
RA1PRSEX      0.003329
RA1PF7A       0.006171
Age_40_50     0.003512
Age_51_61     0.008270
Age_61plus    0.030835
RA4P1A        0.085381
dtype: float64


In [8]:
#Combo Models, handgrip strength (Arms)
print("MLR: Combo Models, Handgrip Strength (Arms)")

print("\nTRANSFORMED")
columns = ['RA4DALM', 'RA4DABM', 'RA4DAFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

MLR: Combo Models, Handgrip Strength (Arms)

TRANSFORMED
Best parameters: {'max_depth': 4, 'max_features': 20, 'min_samples_leaf': 4, 'min_samples_split': 1, 'n_estimators': 5}
Mean Squared Error: 0.8435507781202635

Train set metrics:
MSE: 0.36431257103238746
R-squared: 0.6723336845229491

Test set metrics:
MSE: 0.8435507781202635
R-squared: 0.3391003091804473 

RA4DALM       0.040687
RA4DABM       0.135787
RA4DAFM       0.032019
RA1PRSEX      0.545644
RA1PF7A       0.000000
Age_40_50     0.000000
Age_51_61     0.000000
Age_61plus    0.007262
RA4P1A        0.045853
RA4DLR3MD     0.035471
RA4IMECF      0.014175
RA4IMICF      0.000000
RA4IMFFM      0.000000
RA4DTBFM      0.011862
RA4IRES0      0.008982
RA4IRESINF    0.018842
RA4IRESEXC    0.000000
RA4IRESINC    0.052555
RA4IFCHAR     0.013568
RA4IMCAP      0.037293
dtype: float64


In [9]:
#Combo Models, handgrip strength (TB)

print("Random Forest: Combo Models, Handgrip Strength (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: Combo Models, Handgrip Strength (TB)

TRANSFORMED
Best parameters: {'max_depth': 4, 'max_features': 22, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}
Mean Squared Error: 0.8628817085414633

Train set metrics:
MSE: 0.3411394532290612
R-squared: 0.6931758149693796

Test set metrics:
MSE: 0.8628817085414633
R-squared: 0.32395503723002084 

RA4IALM       0.000000
RA4DTBBM      0.029720
RA4DTBFM      0.009751
RA1PRSEX      0.658935
RA1PF7A       0.005334
Age_40_50     0.000000
Age_51_61     0.000000
Age_61plus    0.003638
RA4P1A        0.034203
RA4DLR3MD     0.053894
RA4DLFNMD     0.017655
RA4DLSMD      0.003416
RA4IMECF      0.015018
RA4IMICF      0.008392
RA4IMFFM      0.000000
RA4DTBFM      0.016866
RA4IRES0      0.005715
RA4IRESINF    0.018204
RA4IRESEXC    0.002584
RA4IRESINC    0.066281
RA4IFCHAR     0.017898
RA4IMCAP      0.032499
dtype: float64


In [10]:
#Combo Models, jumppower (Legs)

print("MLR: Combo Models, Jump Power (Legs)")

print("\nTRANSFORMED")
columns = ['RA4ILLM', 'RA4DLBM', 'RA4DLFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLFNMD', 'RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(X_train.shape)
print(X_test.shape)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

MLR: Combo Models, Jump Power (Legs)

TRANSFORMED
(195, 20)
(49, 20)
Best parameters: {'max_depth': 8, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.0468822067831696

Train set metrics:
MSE: 0.015093204031363671
R-squared: 0.8707221409439734

Test set metrics:
MSE: 0.0468822067831696
R-squared: 0.672271759685173 

RA4ILLM       0.107629
RA4DLBM       0.110243
RA4DLFM       0.031544
RA1PRSEX      0.037636
RA1PF7A       0.003133
Age_40_50     0.013670
Age_51_61     0.014209
Age_61plus    0.015881
RA4P1A        0.036259
RA4DLFNMD     0.043721
RA4IMECF      0.057146
RA4IMICF      0.109228
RA4IMFFM      0.093468
RA4DTBFM      0.020029
RA4IRES0      0.031006
RA4IRESINF    0.021594
RA4IRESEXC    0.030877
RA4IRESINC    0.093850
RA4IFCHAR     0.069199
RA4IMCAP      0.059678
dtype: float64


In [11]:
#Combo Models, jumppower (TB)

print("Random Forest: Combo Models, Jump Power (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(X_train.shape)
print(X_test.shape)

#random forest model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_depth': [None, 1, 2, 4, 8, 16],
    'min_samples_split': [1, 2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [1, 2, 4, 8, X_train.shape[1]]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
print("Best parameters:", best_params)
print("Mean Squared Error:", mse)

#fitting with BEST HYPERPARAMETERS
rf = RandomForestRegressor(max_depth = best_params['max_depth'],
                           min_samples_leaf = best_params['min_samples_leaf'], 
                           min_samples_split = best_params['min_samples_split'], 
                           n_estimators = best_params['n_estimators'], 
                           max_features = best_params['max_features'],random_state = 42)
rf.fit(X_train, y_train)

#evaluation of the model on the TRAINING set
y_train_pred = rf.predict(X_train)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
r2_train = metrics.r2_score(y_train, y_train_pred)
print("\nTrain set metrics:")
print("MSE:", mse_train)
print("R-squared:", r2_train)

y_test_pred = rf.predict(X_test)

#evaluation of the model on test set
mse_test = metrics.mean_squared_error(y_test, y_test_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

# Print the evaluation metrics
print("\nTest set metrics:")
print("MSE:", mse_test)
print("R-squared:", r2_test, "\n")

#feature importance??
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
print(feature_importances)

Random Forest: Combo Models, Jump Power (TB)

TRANSFORMED
(195, 22)
(49, 22)
Best parameters: {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 20}
Mean Squared Error: 0.04919392903373765

Train set metrics:
MSE: 0.007132857439323352
R-squared: 0.938904917949071

Test set metrics:
MSE: 0.04919392903373765
R-squared: 0.6561117553411095 

RA4IALM       0.347769
RA4DTBBM      0.061355
RA4DTBFM      0.023528
RA1PRSEX      0.001806
RA1PF7A       0.000974
Age_40_50     0.002938
Age_51_61     0.004521
Age_61plus    0.012909
RA4P1A        0.023812
RA4DLR3MD     0.026545
RA4DLFNMD     0.032606
RA4DLSMD      0.026259
RA4IMECF      0.021248
RA4IMICF      0.089401
RA4IMFFM      0.078145
RA4DTBFM      0.025809
RA4IRES0      0.023848
RA4IRESINF    0.016404
RA4IRESEXC    0.012989
RA4IRESINC    0.062724
RA4IFCHAR     0.017317
RA4IMCAP      0.087093
dtype: float64
