# Removing Race

In [4]:
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the combined MIDUS3 / Refresher CSV.
# NOTE(review): absolute, machine-specific path - the notebook only runs on the
# author's machine as written.
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

# Transformed targets: log of (jumppownums + 0.1) and square root of COMB4IMaxGrip.
# The 0.1 offset presumably guards against log(0) - confirm with the data.
df['tjumppownums'] = np.log(df['jumppownums'].add(0.1))
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

# Identifier / raw-measurement columns that are removed from the table.
column_to_drop = [
    'M2ID',
    'MIDUSID',
    'SAMPLMAJ',
    'Height.cm.',
    'Weight.kg.',
    'Age.years.',
    'TBW.litres.',
    'ECF.litres.',
    'ICF.litres.',
]

# 'COMB1PF7A' is dropped as well - presumably the race variable this notebook
# leaves out (see the notebook title); verify against the codebook.
df = df.drop(columns=column_to_drop).drop(columns='COMB1PF7A')

In [5]:
# Total number of individuals when race is not included:
# keep only rows with no missing value in any remaining column, then count them.
df = df.dropna(axis=0, how='any')
print(df.shape[0])

490


# MLR

In [6]:
#MLR Function

def train_test_linear_regression(df, feature_columns, target_column, test_size=0.2, random_state=42):
    """Fit a multiple linear regression on a train/test split and print its metrics.

    The training split is fitted with statsmodels OLS (constant added) and the
    OLS summary is printed. The same training split is then fitted with
    scikit-learn's ``LinearRegression``; its coefficients, intercept and the
    test-set MSE, RMSE and R2 are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature and target columns. Assumed to have no
        missing values in those columns (the notebook drops NaN rows first).
    feature_columns : list of str
        Predictor column names. Repeated names are used only once (a warning
        is issued), because a repeated name would put the same column into
        the design matrix twice.
    target_column : str
        Name of the response column.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If ``target_column`` is also listed in ``feature_columns``.
    """
    feature_columns = list(feature_columns)

    # Drop repeated feature names while keeping first-occurrence order.
    unique_features = list(dict.fromkeys(feature_columns))
    if len(unique_features) != len(feature_columns):
        repeated = sorted({name for name in feature_columns if feature_columns.count(name) > 1})
        warnings.warn(
            "Duplicate feature columns ignored: {}".format(repeated),
            stacklevel=2,
        )

    if target_column in unique_features:
        raise ValueError(
            "target_column {!r} must not also be listed in feature_columns".format(target_column)
        )

    # Select predictors and response separately instead of popping the target
    # out of a frame sliced from df.
    X = df[unique_features]
    y = df[target_column]

    #dividing into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #TRAINING
    print("\nTRAINING Metrics")
    X_train_const = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_const)
    results = model.fit()
    print(results.summary())

    #TESTING
    print("\nTEST Metrics")
    mlr = LinearRegression()
    mlr.fit(X_train, y_train)

    #MLR coefficients
    print("\nCoefficients: ")
    coefficients = pd.Series(mlr.coef_, index=X.columns)
    print(coefficients)

    #MLR intercept
    print("\nIntercept: ")
    print(mlr.intercept_)

    test_pred = mlr.predict(X_test)

    mse_test = metrics.mean_squared_error(y_test, test_pred)
    # RMSE is taken from the MSE directly: the `squared=False` keyword of
    # mean_squared_error is deprecated and removed in recent scikit-learn.
    rmse_test = np.sqrt(mse_test)
    r2_test = metrics.r2_score(y_test, test_pred)

    print("MSE:", mse_test)
    print("RMSE:", rmse_test)
    print("R2:", r2_test)

In [9]:
# DXA model, arm-region predictors -> transformed handgrip strength
print("MLR: DXA Model, Handgrip Strength (Arms)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB4DALM',
        'COMB4DABM',
        'COMB4DAFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
    ],
    target_column='tCOMB4IMaxGrip',
)

MLR: DXA Model, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.532
Model:                            OLS   Adj. R-squared:                  0.524
Method:                 Least Squares   F-statistic:                     62.46
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.35e-59
Time:                        19:04:05   Log-Likelihood:                -427.26
No. Observations:                 392   AIC:                             870.5
Df Residuals:                     384   BIC:                             902.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [18]:
# DXA model, total-body predictors -> transformed handgrip strength
print("MLR: DXA Model, Handgrip Strength (TB)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
    ],
    target_column='tCOMB4IMaxGrip',
)

MLR: DXA Model, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.520
Model:                            OLS   Adj. R-squared:                  0.508
Method:                 Least Squares   F-statistic:                     45.92
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.33e-55
Time:                        19:06:21   Log-Likelihood:                -432.53
No. Observations:                 392   AIC:                             885.1
Df Residuals:                     382   BIC:                             924.8
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [10]:
# DXA model, leg-region predictors -> transformed jump power
print("MLR: DXA Model, Jump Power (Legs)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB4ILLM',
        'COMB4DLBM',
        'COMB4DLFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLFNMD',
    ],
    target_column='tjumppownums',
)


MLR: DXA Model, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.684
Model:                            OLS   Adj. R-squared:                  0.678
Method:                 Least Squares   F-statistic:                     118.7
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           5.33e-92
Time:                        19:04:08   Log-Likelihood:                 67.736
No. Observations:                 392   AIC:                            -119.5
Df Residuals:                     384   BIC:                            -87.70
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [11]:
# DXA model, total-body predictors -> transformed jump power
print("MLR: DXA Model, Jump Power (TB)")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
    ],
    target_column='tjumppownums',
)


MLR: DXA Model, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.705
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     101.4
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.40e-95
Time:                        19:04:19   Log-Likelihood:                 81.212
No. Observations:                 392   AIC:                            -142.4
Df Residuals:                     382   BIC:                            -102.7
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------

In [12]:
# BIS model predictors -> transformed handgrip strength
print("MLR: BIS Model, Handgrip Strength")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
    ],
    target_column='tCOMB4IMaxGrip',
)


MLR: BIS Model, Handgrip Strength

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.532
Method:                 Least Squares   F-statistic:                     38.00
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.07e-57
Time:                        19:04:29   Log-Likelihood:                -421.43
No. Observations:                 392   AIC:                             868.9
Df Residuals:                     379   BIC:                             920.5
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

In [13]:
# BIS model predictors -> transformed jump power
print("MLR: BIS Model, Jump Power")

train_test_linear_regression(
    df,
    feature_columns=[
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
    ],
    target_column='tjumppownums',
)


MLR: BIS Model, Jump Power

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.731
Model:                            OLS   Adj. R-squared:                  0.723
Method:                 Least Squares   F-statistic:                     85.91
Date:                Wed, 03 Jan 2024   Prob (F-statistic):          3.53e-100
Time:                        19:04:38   Log-Likelihood:                 99.434
No. Observations:                 392   AIC:                            -172.9
Df Residuals:                     379   BIC:                            -121.2
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
con

In [14]:
# Combined DXA (arm-region) + BIS predictors -> transformed handgrip strength
print("MLR: Combo Models, Handgrip Strength (Arms)")

train_test_linear_regression(
    df,
    feature_columns=[
        # DXA (arms) + demographics
        'COMB4DALM',
        'COMB4DABM',
        'COMB4DAFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        # BIS
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    target_column='tCOMB4IMaxGrip',
)


MLR: Combo Models, Handgrip Strength (Arms)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.559
Model:                            OLS   Adj. R-squared:                  0.541
Method:                 Least Squares   F-statistic:                     29.77
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           5.78e-57
Time:                        19:05:08   Log-Likelihood:                -415.57
No. Observations:                 392   AIC:                             865.1
Df Residuals:                     375   BIC:                             932.7
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

In [15]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("MLR: Combo Models, Handgrip Strength (TB)")

# 'COMB4DTBFM' belongs to both the DXA total-body list and the BIS list; it is
# listed only once here so the design matrix does not contain the same column
# twice (which makes the per-column coefficients ambiguous).
train_test_linear_regression(
    df,
    [
        # DXA (total body) + demographics
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
        # BIS (without the repeated 'COMB4DTBFM')
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    'tCOMB4IMaxGrip',
)


MLR: Combo Models, Handgrip Strength (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:         tCOMB4IMaxGrip   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.530
Method:                 Least Squares   F-statistic:                     26.90
Date:                Wed, 03 Jan 2024   Prob (F-statistic):           1.47e-54
Time:                        19:05:20   Log-Likelihood:                -419.71
No. Observations:                 392   AIC:                             875.4
Df Residuals:                     374   BIC:                             946.9
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [16]:
# Combined DXA (leg-region) + BIS predictors -> transformed jump power
print("MLR: Combo Models, Jump Power (Legs)")

train_test_linear_regression(
    df,
    feature_columns=[
        # DXA (legs) + demographics
        'COMB4ILLM',
        'COMB4DLBM',
        'COMB4DLFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLFNMD',
        # BIS
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4DTBFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    target_column='tjumppownums',
)


MLR: Combo Models, Jump Power (Legs)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.747
Model:                            OLS   Adj. R-squared:                  0.737
Method:                 Least Squares   F-statistic:                     69.36
Date:                Wed, 03 Jan 2024   Prob (F-statistic):          2.17e-101
Time:                        19:05:39   Log-Likelihood:                 111.66
No. Observations:                 392   AIC:                            -189.3
Df Residuals:                     375   BIC:                            -121.8
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [17]:
#COMBO: Jump Power (TOTAL BODY)

print("MLR: Combo Models, Jump Power (TB)")

# 'COMB4DTBFM' belongs to both the DXA total-body list and the BIS list; it is
# listed only once here so the design matrix does not contain the same column
# twice (which makes the per-column coefficients ambiguous).
train_test_linear_regression(
    df,
    [
        # DXA (total body) + demographics
        'COMB4IALM',
        'COMB4DTBBM',
        'COMB4DTBFM',
        'COMB1PRSEX',
        'COMB1PRAGE',
        'COMB4P1A',
        'COMB4DLR3MD',
        'COMB4DLFNMD',
        'COMB4DLSL14MD',
        # BIS (without the repeated 'COMB4DTBFM')
        'COMB4IMECF',
        'COMB4IMICF',
        'COMB4IMFFM',
        'COMB4IRES0',
        'COMB4IRESINF',
        'COMB4IRESEXC',
        'COMB4IRESINC',
        'COMB4IFCHAR',
        'COMB4IMCAP',
    ],
    'tjumppownums',
)


MLR: Combo Models, Jump Power (TB)

TRAINING Metrics
                            OLS Regression Results                            
Dep. Variable:           tjumppownums   R-squared:                       0.760
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     69.58
Date:                Wed, 03 Jan 2024   Prob (F-statistic):          1.79e-104
Time:                        19:05:50   Log-Likelihood:                 121.47
No. Observations:                 392   AIC:                            -206.9
Df Residuals:                     374   BIC:                            -135.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------