In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

data = pd.read_csv('/Users/yunjuha/Desktop/SROP/DXA_BIS_Project/narrowed down list/narrowed_transformed_data.csv')
df = pd.DataFrame(data)
df = df.dropna()

# Multiple Linear Regression

In [3]:
#DXA model, handgrip strength (TB)

print("MLR: DXA Model, Handgrip Strength (TB)\n")

print("UNTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['RA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

mlr = LinearRegression()
mlr.fit(X_train, y_train)

print("\nCoefficients: ")
coefficients = pd.Series(mlr.coef_, index=X.columns)
print(coefficients)

print("\nIntercept: ")
print(mlr.intercept_)

test_pred = mlr.predict(X_test)

mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
mlr_diff.head()

#evaluation metrics for the TEST set
mse_test = metrics.mean_squared_error(y_test, test_pred)
rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
r2_test = metrics.r2_score(y_test, test_pred)

print("\nTEST set metrics:")
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("R-squared:", r2_test)

#to see which variables are useful...
print("\nThis is the training set metrics table")
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
results = model.fit()
print(results.summary())

MLR: DXA Model, Handgrip Strength (TB)

UNTRANSFORMED

Coefficients: 
RA4IALM        0.520115
RA4DTBBM      -0.000908
RA4DTBFM      -0.000128
RA1PRSEX      -9.176805
RA1PF7A       -1.304513
Age_40_50      0.639199
Age_51_61     -0.721213
Age_61plus    -0.593316
RA4P1A         0.191791
RA4DLR3MD     10.958198
RA4DLFNMD     -2.088917
RA4DLSMD      -0.144642
dtype: float64

Intercept: 
1.179386352786338

TEST set metrics:
MSE: 87.5710032420619
RMSE: 9.35793798024233
R-squared: 0.40112456482650893

This is the training set metrics table
                            OLS Regression Results                            
Dep. Variable:            RA4IMaxGrip   R-squared:                       0.540
Model:                            OLS   Adj. R-squared:                  0.510
Method:                 Least Squares   F-statistic:                     17.80
Date:                Thu, 20 Jul 2023   Prob (F-statistic):           6.02e-25
Time:                        21:10:32   Log-Likelihood:           

In [4]:
#DXA model, jump power (TB)

print("MLR: DXA Model, Jump Power (TB)")

print("UNTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['jumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

mlr = LinearRegression()
mlr.fit(X_train, y_train)

print("\nCoefficients: ")
coefficients = pd.Series(mlr.coef_, index=X.columns)
print(coefficients)

print("\nIntercept: ")
print(mlr.intercept_)

test_pred = mlr.predict(X_test)

mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
mlr_diff.head()

#evaluation metrics for the TEST set
mse_test = metrics.mean_squared_error(y_test, test_pred)
rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
r2_test = metrics.r2_score(y_test, test_pred)

print("\nTEST set metrics:")
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("R-squared:", r2_test)

#to see which variables are useful...
print("\nThis is the training set metrics table")
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
results = model.fit()
print(results.summary())

MLR: DXA Model, Jump Power (TB)
UNTRANSFORMED

Coefficients: 
RA4IALM       0.139090
RA4DTBBM      0.000223
RA4DTBFM     -0.000024
RA1PRSEX      0.063134
RA1PF7A      -0.146916
Age_40_50    -0.033911
Age_51_61    -0.165494
Age_61plus   -0.540957
RA4P1A       -0.009854
RA4DLR3MD     0.397057
RA4DLFNMD     0.741472
RA4DLSMD     -0.838684
dtype: float64

Intercept: 
1.3816033615850893

TEST set metrics:
MSE: 0.2128496453097077
RMSE: 0.4613563105775272
R-squared: 0.7970698725647705

This is the training set metrics table
                            OLS Regression Results                            
Dep. Variable:            jumppownums   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     51.59
Date:                Thu, 20 Jul 2023   Prob (F-statistic):           4.63e-52
Time:                        21:10:43   Log-Likelihood:                -120.26
No.

In [5]:
#BIS Model, handgrip strength

print("MLR: BIS Model, Handgrip Strength")

print("UNTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['RA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

mlr = LinearRegression()
mlr.fit(X_train, y_train)

print("\nCoefficients: ")
coefficients = pd.Series(mlr.coef_, index=X.columns)
print(coefficients)

print("\nIntercept: ")
print(mlr.intercept_)

test_pred = mlr.predict(X_test)

mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
mlr_diff.head()

#evaluation metrics for the TEST set
mse_test = metrics.mean_squared_error(y_test, test_pred)
rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
r2_test = metrics.r2_score(y_test, test_pred)

print("\nTEST set metrics:")
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("R-squared:", r2_test)

#to see which variables are useful...
print("\nThis is the training set metrics table")
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
results = model.fit()
print(results.summary())

MLR: BIS Model, Handgrip Strength
UNTRANSFORMED

Coefficients: 
RA4IMECF      28.604950
RA4IMICF      29.156850
RA4IMFFM     -20.874423
RA4DTBFM      -0.000092
RA4IRES0      -0.033164
RA4IRESINF     0.179422
RA4IRESEXC    -0.033164
RA4IRESINC    -0.022791
RA4IFCHAR      0.149180
RA4IMCAP      -2.187452
RA1PRSEX     -11.182071
RA1PF7A       -0.456287
Age_40_50      0.679332
Age_51_61      0.138049
Age_61plus     0.356182
RA4P1A         0.201298
dtype: float64

Intercept: 
-3.976051853617026

TEST set metrics:
MSE: 85.90240526660394
RMSE: 9.268355046425656
R-squared: 0.4125356746880666

This is the training set metrics table
                            OLS Regression Results                            
Dep. Variable:            RA4IMaxGrip   R-squared:                       0.551
Model:                            OLS   Adj. R-squared:                  0.513
Method:                 Least Squares   F-statistic:                     14.64
Date:                Thu, 20 Jul 2023   Prob (F-stati

In [6]:
#BIS Model, jumppower

print("MLR: BIS Model, Jump Power")

columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['jumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

mlr = LinearRegression()
mlr.fit(X_train, y_train)

print("\nCoefficients: ")
coefficients = pd.Series(mlr.coef_, index=X.columns)
print(coefficients)

print("\nIntercept: ")
print(mlr.intercept_)

test_pred = mlr.predict(X_test)

mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
mlr_diff.head()

#evaluation metrics for the TEST set
mse_test = metrics.mean_squared_error(y_test, test_pred)
rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
r2_test = metrics.r2_score(y_test, test_pred)

print("\nTEST set metrics:")
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("R-squared:", r2_test)

#to see which variables are useful...
print("\nThis is the training set metrics table")
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
results = model.fit()
print(results.summary())

MLR: BIS Model, Jump Power

Coefficients: 
RA4IMECF     -5.967518
RA4IMICF     -5.172722
RA4IMFFM      4.130676
RA4DTBFM     -0.000022
RA4IRES0     -0.002877
RA4IRESINF    0.007121
RA4IRESEXC   -0.002877
RA4IRESINC    0.000530
RA4IFCHAR    -0.038460
RA4IMCAP     -1.116267
RA1PRSEX     -1.431381
RA1PF7A       0.044012
Age_40_50    -0.180462
Age_51_61    -0.223112
Age_61plus   -0.510199
RA4P1A       -0.014872
dtype: float64

Intercept: 
6.428684320823896

TEST set metrics:
MSE: 0.20959474339949072
RMSE: 0.45781518476290267
R-squared: 0.8001730849683826

This is the training set metrics table
                            OLS Regression Results                            
Dep. Variable:            jumppownums   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     37.36
Date:                Thu, 20 Jul 2023   Prob (F-statistic):           4.20e-47
Time:   

In [7]:
#Combo Models, handgrip strength (TB)

print("MLR: Combo Models, Handgrip Strength (TB)")

columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['RA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

mlr = LinearRegression()
mlr.fit(X_train, y_train)

print("\nCoefficients: ")
coefficients = pd.Series(mlr.coef_, index=X.columns)
print(coefficients)

print("\nIntercept: ")
print(mlr.intercept_)

test_pred = mlr.predict(X_test)

mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
mlr_diff.head()

#evaluation metrics for the TEST set
mse_test = metrics.mean_squared_error(y_test, test_pred)
rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
r2_test = metrics.r2_score(y_test, test_pred)

print("\nTEST set metrics:")
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("R-squared:", r2_test)

#to see which variables are useful...
print("\nThis is the training set metrics table")
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
results = model.fit()
print(results.summary())

MLR: Combo Models, Handgrip Strength (TB)

Coefficients: 
RA4IALM        0.299995
RA4DTBBM      -0.000779
RA4DTBFM      -0.000055
RA1PRSEX      -9.603598
RA1PF7A       -1.049621
Age_40_50      0.656146
Age_51_61     -0.209387
Age_61plus     0.498480
RA4P1A         0.185002
RA4DLR3MD      9.659547
RA4DLFNMD     -4.220282
RA4DLSMD       1.145274
RA4IMECF      50.750093
RA4IMICF      50.972032
RA4IMFFM     -37.034947
RA4DTBFM      -0.000055
RA4IRES0      -0.020492
RA4IRESINF     0.135839
RA4IRESEXC    -0.020492
RA4IRESINC    -0.020255
RA4IFCHAR      0.152228
RA4IMCAP      -2.355360
dtype: float64

Intercept: 
-8.58940039381623

TEST set metrics:
MSE: 84.65406677290933
RMSE: 9.200764466766298
R-squared: 0.42107273868159745

This is the training set metrics table
                            OLS Regression Results                            
Dep. Variable:            RA4IMaxGrip   R-squared:                       0.556
Model:                            OLS   Adj. R-squared:                  

In [8]:
#Combo Models, jumppower (TB)

print("MLR: Combo Models, Jump Power (TB)")

columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['jumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

mlr = LinearRegression()
mlr.fit(X_train, y_train)

print("\nCoefficients: ")
coefficients = pd.Series(mlr.coef_, index=X.columns)
print(coefficients)

print("\nIntercept: ")
print(mlr.intercept_)

test_pred = mlr.predict(X_test)

mlr_diff = pd.DataFrame({'\nActual value': y_test, 'Predicted value': test_pred})
mlr_diff.head()

#evaluation metrics for the TEST set
mse_test = metrics.mean_squared_error(y_test, test_pred)
rmse_test = metrics.mean_squared_error(y_test, test_pred, squared=False)
r2_test = metrics.r2_score(y_test, test_pred)

print("\nTEST set metrics:")
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("R-squared:", r2_test)

#to see which variables are useful...
print("\nThis is the training set metrics table")
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const)
results = model.fit()
print(results.summary())


MLR: Combo Models, Jump Power (TB)

Coefficients: 
RA4IALM        0.160832
RA4DTBBM       0.000003
RA4DTBFM      -0.000010
RA1PRSEX      -0.779018
RA1PF7A       -0.248299
Age_40_50     -0.015992
Age_51_61     -0.118445
Age_61plus    -0.317145
RA4P1A        -0.009828
RA4DLR3MD      0.289067
RA4DLFNMD      0.877062
RA4DLSMD      -0.484679
RA4IMECF     -12.979022
RA4IMICF     -12.464971
RA4IMFFM       9.294490
RA4DTBFM      -0.000010
RA4IRES0      -0.003424
RA4IRESINF     0.012040
RA4IRESEXC    -0.003424
RA4IRESINC    -0.000347
RA4IFCHAR     -0.009547
RA4IMCAP      -0.484572
dtype: float64

Intercept: 
2.726933870653932

TEST set metrics:
MSE: 0.1975540128819309
RMSE: 0.4444704859514644
R-squared: 0.8116526764649357

This is the training set metrics table
                            OLS Regression Results                            
Dep. Variable:            jumppownums   R-squared:                       0.814
Model:                            OLS   Adj. R-squared:                  0.793
