In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler 

# Read the combined CSV export and wrap it in the working DataFrame
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

# Target transforms: natural log (offset by 0.1) for jump power,
# square root for maximum grip strength
df['tjumppownums'] = np.log(0.1 + df['jumppownums'])
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

# Columns removed before modelling
column_to_drop = [
    'M2ID',
    'MIDUSID',
    'SAMPLMAJ',
    'Height.cm.',
    'Weight.kg.',
    'Age.years.',
    'TBW.litres.',
    'ECF.litres.',
    'ICF.litres.',
]
df = df.drop(columns=column_to_drop)

In [2]:
# Row count while records with missing values are still present
print(df.shape[0])

544


In [4]:
# Sample size after discarding every row that has at least one missing value
# (the race column is still part of the frame at this point)
df = df.dropna(axis=0, how='any')
print(df.shape[0])

422


In [5]:
# Keep only rows whose race code (COMB1PF7A) is 1 or 2, i.e. white or black;
# the remaining race codes are left out for now
race = df['COMB1PF7A'].isin([1, 2])
df = df.loc[race]

print(df.shape[0])

413


# MLR

In [7]:
#MLR Function

def train_test_linear_regression(df, feature_columns, target_column, test_size=0.2, random_state=42):
    """Fit a multiple linear regression on a train/test split and print a report.

    The training split is summarised with a statsmodels OLS fit (an intercept
    column is added via ``sm.add_constant``). The same training data is then
    fitted with scikit-learn's ``LinearRegression`` to print the coefficients,
    the intercept and the test-set MSE, RMSE and R2. When the race column
    'COMB1PF7A' is one of the features, the number of rows coded 1 ("white")
    and 2 ("black") in each split is printed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding every feature column and the target column.
    feature_columns : list of str
        Predictor column names. A name listed more than once is used only
        once, and the target name is ignored if it appears here.
    target_column : str
        Name of the response column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    None
        All results are printed.
    """
    # A repeated feature name would make df[...] return the same column twice,
    # i.e. a perfectly collinear design matrix whose coefficients are not
    # identifiable. Keep the first occurrence only (order preserved) and never
    # let the target leak into the predictors.
    unique_features = [
        name for name in dict.fromkeys(feature_columns) if name != target_column
    ]

    # Select X and y directly instead of popping from a slice of df, which
    # avoids mutating a derived frame (and the associated pandas warnings).
    X = df[unique_features]
    y = df[target_column]

    #dividing the data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #TRAINING
    print("\nTRAINING Metrics")
    X_train_const = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_const)
    results = model.fit()
    print(results.summary())

    #TESTING
    print("\nTEST Metrics")
    mlr = LinearRegression()
    mlr.fit(X_train, y_train)

    #MLR coefficients
    print("\nCoefficients: ")
    coefficients = pd.Series(mlr.coef_, index=X.columns)
    print(coefficients)

    #MLR intercept
    print("\nIntercept: ")
    print(mlr.intercept_)

    test_pred = mlr.predict(X_test)

    mse_test = metrics.mean_squared_error(y_test, test_pred)
    # RMSE is derived from the MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and no longer exists in recent
    # scikit-learn releases.
    rmse_test = np.sqrt(mse_test)
    r2_test = metrics.r2_score(y_test, test_pred)

    print("MSE:", mse_test)
    print("RMSE:", rmse_test)
    print("R2:", r2_test, "\n")

    #count the frequencies of each race code in the train and test splits
    race_column = 'COMB1PF7A'
    if race_column not in X_train.columns:
        # Nothing to report when the race column is not one of the predictors.
        return

    # Boolean sums yield 0 for an absent group, whereas indexing the first
    # element of an empty value_counts() result would raise IndexError.
    white_train_count = int((X_train[race_column] == 1).sum())
    white_test_count = int((X_test[race_column] == 1).sum())

    black_train_count = int((X_train[race_column] == 2).sum())
    black_test_count = int((X_test[race_column] == 2).sum())

    print("white train frequency: ", white_train_count)
    print("white test frequency: ", white_test_count, "\n")

    print("black train frequency: ", black_train_count)
    print("black test frequency: ", black_test_count)

In [10]:
# DXA model, arm-specific feature set -> square-root-transformed handgrip strength

print("MLR: DXA Model, Handgrip Strength (Arms)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4DALM',
        'COMB4DABM',
        'COMB4DAFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
    ],
    target_column='tCOMB4IMaxGrip',
)

MLR: DXA Model, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.506
Method:                 Least Squares   F-statistic:                     43.17
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.25e-46
Time:                        18:54:30   Log-Likelihood:                -353.33
No. Observations:                 330   AIC:                             724.7
Df Residuals:                     321   BIC:                             758.8
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [11]:
# DXA model, total-body feature set -> square-root-transformed handgrip strength

print("MLR: DXA Model, Handgrip Strength (TB)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
    ],
    target_column='tCOMB4IMaxGrip',
)

MLR: DXA Model, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.522
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     34.79
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.80e-45
Time:                        18:56:14   Log-Likelihood:                -352.16
No. Observations:                 330   AIC:                             726.3
Df Residuals:                     319   BIC:                             768.1
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [13]:
# DXA model, leg-specific feature set -> log-transformed jump power

print("MLR: DXA Model, Jump Power (Legs)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4ILLM',
        'COMB4DLBM',
        'COMB4DLFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLFNMD',
    ],
    target_column='tjumppownums',
)


MLR: DXA Model, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.742
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     115.3
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.18e-89
Time:                        18:56:43   Log-Likelihood:                 93.457
No. Observations:                 330   AIC:                            -168.9
Df Residuals:                     321   BIC:                            -134.7
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [14]:
# DXA model, total-body feature set -> log-transformed jump power
print("MLR: DXA Model, Jump Power (TB)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
    ],
    target_column='tjumppownums',
)


MLR: DXA Model, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.768
Model:                            OLS   Adj. R-squared:                  0.761
Method:                 Least Squares   F-statistic:                     105.6
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           6.25e-95
Time:                        18:56:59   Log-Likelihood:                 111.08
No. Observations:                 330   AIC:                            -200.2
Df Residuals:                     319   BIC:                            -158.4
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [15]:
# BIS model -> square-root-transformed handgrip strength
print("MLR: BIS Model, Handgrip Strength")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
    ],
    target_column='tCOMB4IMaxGrip',
)


MLR: BIS Model, Handgrip Strength

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.527
Method:                 Least Squares   F-statistic:                     29.24
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.13e-46
Time:                        18:57:31   Log-Likelihood:                -343.52
No. Observations:                 330   AIC:                             715.0
Df Residuals:                     316   BIC:                             768.2
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

In [16]:
# BIS model -> log-transformed jump power

print("MLR: BIS Model, Jump Power")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
    ],
    target_column='tjumppownums',
)


MLR: BIS Model, Jump Power

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.781
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     86.80
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           6.55e-96
Time:                        18:57:42   Log-Likelihood:                 120.76
No. Observations:                 330   AIC:                            -213.5
Df Residuals:                     316   BIC:                            -160.3
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
con

In [17]:
# Combined DXA (arm-specific) + BIS feature set -> square-root-transformed handgrip strength

print("MLR: Combo Models, Handgrip Strength (Arms)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4DALM',
        'COMB4DABM',
        'COMB4DAFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    target_column='tCOMB4IMaxGrip',
)


MLR: Combo Models, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.529
Method:                 Least Squares   F-statistic:                     22.75
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           7.20e-45
Time:                        18:57:58   Log-Likelihood:                -340.81
No. Observations:                 330   AIC:                             717.6
Df Residuals:                     312   BIC:                             786.0
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [18]:
# Combined DXA (total-body) + BIS feature set -> square-root-transformed handgrip strength
#
# 'COMB4DTBFM' belongs to both the DXA total-body list and the BIS list. It is
# listed only once here: passing the same name twice makes df[...] return two
# identical columns, which makes the design matrix singular and the
# coefficient for that variable unidentifiable.

print("MLR: Combo Models, Handgrip Strength (TB)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    target_column='tCOMB4IMaxGrip',
)


MLR: Combo Models, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                     21.11
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.15e-43
Time:                        18:58:15   Log-Likelihood:                -342.12
No. Observations:                 330   AIC:                             722.2
Df Residuals:                     311   BIC:                             794.4
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [19]:
# Combined DXA (leg-specific) + BIS feature set -> log-transformed jump power

print("MLR: Combo Models, Jump Power (Legs)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4ILLM',
        'COMB4DLBM',
        'COMB4DLFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLFNMD',
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    target_column='tjumppownums',
)


MLR: Combo Models, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.800
Model:                            OLS   Adj. R-squared:                  0.790
Method:                 Least Squares   F-statistic:                     73.61
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           3.04e-98
Time:                        18:58:32   Log-Likelihood:                 135.92
No. Observations:                 330   AIC:                            -235.8
Df Residuals:                     312   BIC:                            -167.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [20]:
# Combined DXA (total-body) + BIS feature set -> log-transformed jump power
#
# 'COMB4DTBFM' belongs to both the DXA total-body list and the BIS list. It is
# listed only once here: passing the same name twice makes df[...] return two
# identical columns, which makes the design matrix singular and the
# coefficient for that variable unidentifiable.

print("MLR: Combo Models, Jump Power (TB)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB1PF7A',
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    target_column='tjumppownums',
)


MLR: Combo Models, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.817
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     77.18
Date:                Wed, 03 Jan 2024   Prob (F-statistic):          4.10e-103
Time:                        18:58:46   Log-Likelihood:                 150.29
No. Observations:                 330   AIC:                            -262.6
Df Residuals:                     311   BIC:                            -190.4
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------