In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

df = df.drop('COMB1PF7A', axis=1)

In [2]:
#number of individuals before NA is removed
print(len(df))

544


In [3]:
#total number of individuals when race is excluded
df = df.dropna()
print(len(df))

490


# LASSO

In [4]:
#LASSO Function

def train_test_lasso(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)
    
    #dividing our data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #scaling our features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    lasso_model = Lasso()

    #this is our parameter grid for hyperparameter tuning
    alphas = np.logspace(-4, 0, 100)
    max_iters = [10000, 20000, 30000, 50000]
    param_grid = {'alpha': alphas, 'max_iter': max_iters}

    #doing grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    best_alpha = grid_search.best_params_['alpha']
    best_max_iter = grid_search.best_params_['max_iter']

    print("Best Alpha:", best_alpha)
    print("Best Max Iterations:", best_max_iter)

    #running the model using the best hyperparameters
    best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
    best_lasso_model.fit(X_train_scaled, y_train)
    
    #evaluation of the model on the TRAINING set
    y_pred_train = best_lasso_model.predict(X_train_scaled)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    r2_train_21 = metrics.r2_score(y_train, y_pred_train)
    print("\nTRAINING Set Metrics:")
    print("Mean Squared Error:", mse_train)
    print("R-squared:", r2_train_21)

    #evaluation of the model on the TESTING set
    y_pred = best_lasso_model.predict(X_test_scaled)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2_test_21 = metrics.r2_score(y_test, y_pred)
    print("\nTEST Set Metrics:")
    print("Mean Squared Error:", mse)
    print("R-squared:", r2_test_21)

    #feature importance (coefficients) from the Lasso model
    feature_importance = best_lasso_model.coef_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

    #sorting the features based on their importance (absolute value of coefficients) in descending order
    importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)
    
    print(importance_df)

In [5]:
#DXA: Handgrip Strength (ARMS)

print("LASSO: DXA Model, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (Arms)


Best Alpha: 0.015199110829529346
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5191272844724935
R-squared: 0.5313110813886949

TEST Set Metrics:
Mean Squared Error: 0.5328850007907258
R-squared: 0.38948010795752863
       Feature  Importance
0    COMB4DALM    0.248654
1    COMB4DABM    0.212920
2    COMB4DAFM   -0.186978
5     COMB4P1A    0.171863
3   COMB1PRSEX   -0.144917
4   COMB1PRAGE   -0.078825
6  COMB4DLR3MD   -0.000000


In [6]:
#DXA: Handgrip Strength (TOTAL BODY)

print("LASSO: DXA Model, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (TB)


Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5320212590985757
R-squared: 0.5196698843164953

TEST Set Metrics:
Mean Squared Error: 0.5074008887814344
R-squared: 0.4186769464679466
         Feature  Importance
3     COMB1PRSEX   -0.237801
2     COMB4DTBFM   -0.229405
0      COMB4IALM    0.228153
1     COMB4DTBBM    0.216528
5       COMB4P1A    0.166526
8  COMB4DLSL14MD   -0.093568
4     COMB1PRAGE   -0.072119
6    COMB4DLR3MD    0.064239
7    COMB4DLFNMD   -0.050742


In [7]:
#DXA: Jump Power (LEGS)

print("LASSO: DXA Model, Jump Power (Legs)")

train_test_lasso(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (Legs)


Best Alpha: 0.001788649529057435
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.04145664394638318
R-squared: 0.6838849167340781

TEST Set Metrics:
Mean Squared Error: 0.026062174975906488
R-squared: 0.7993335601118498
       Feature  Importance
0    COMB4ILLM    0.169593
4   COMB1PRAGE   -0.162373
2    COMB4DLFM   -0.074821
3   COMB1PRSEX   -0.050973
1    COMB4DLBM    0.041781
5     COMB4P1A    0.002195
6  COMB4DLFNMD    0.000000


In [8]:
#DXA: Jump Power (TOTAL BODY)

print("LASSO: DXA Model, Jump Power (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (TB)


Best Alpha: 0.0016297508346206436
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03877730002354542
R-squared: 0.7043154423782021

TEST Set Metrics:
Mean Squared Error: 0.02661177762993496
R-squared: 0.7951018792164937
         Feature  Importance
0      COMB4IALM    0.168560
4     COMB1PRAGE   -0.143604
1     COMB4DTBBM    0.098377
2     COMB4DTBFM   -0.064899
8  COMB4DLSL14MD   -0.060912
3     COMB1PRSEX   -0.030647
6    COMB4DLR3MD    0.018565
5       COMB4P1A   -0.003897
7    COMB4DLFNMD    0.002343


In [9]:
#BIS: Handgrip Strength

print("LASSO: BIS Model, Handgrip Strength")

train_test_lasso(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

LASSO: BIS Model, Handgrip Strength
Best Alpha: 0.02915053062825179
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5087350214301093
R-squared: 0.5406936329766134

TEST Set Metrics:
Mean Squared Error: 0.4922774503421927
R-squared: 0.4360036867393675
         Feature  Importance
12      COMB4P1A    0.348078
10    COMB1PRSEX   -0.251379
7   COMB4IRESINC   -0.241995
3     COMB4DTBFM   -0.143444
8    COMB4IFCHAR   -0.047303
11    COMB1PRAGE   -0.022915
0     COMB4IMECF    0.000000
1     COMB4IMICF    0.000000
2     COMB4IMFFM    0.000000
4     COMB4IRES0    0.000000
5   COMB4IRESINF   -0.000000
6   COMB4IRESEXC    0.000000
9     COMB4IMCAP    0.000000


In [10]:
#BIS: Jump Power

print("LASSO: BIS Model, Jump Power")

train_test_lasso(df, ['COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

LASSO: BIS Model, Jump Power
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03534657409215116
R-squared: 0.7304754039724852

TEST Set Metrics:
Mean Squared Error: 0.03949265393517273
R-squared: 0.6959252144446084
         Feature  Importance
5   COMB4IRESINF    1.163917
6   COMB4IRESEXC   -0.848665
1     COMB4IMICF    0.425688
7   COMB4IRESINC   -0.380461
0     COMB4IMECF   -0.336335
10    COMB1PRSEX   -0.198819
11    COMB1PRAGE   -0.130957
8    COMB4IFCHAR   -0.072360
9     COMB4IMCAP   -0.062963
12      COMB4P1A    0.042180
3     COMB4DTBFM   -0.032782
2     COMB4IMFFM    0.000000
4     COMB4IRES0   -0.000000


In [11]:
#COMBO: Handgrip Strength (ARMS)

print("LASSO: Combo Models, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (Arms)
Best Alpha: 0.026560877829466867
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.49778581042646447
R-squared: 0.550579019505969

TEST Set Metrics:
Mean Squared Error: 0.5100735511676733
R-squared: 0.4156149095386068
         Feature  Importance
5       COMB4P1A    0.255045
1      COMB4DABM    0.218064
14  COMB4IRESINC   -0.205764
3     COMB1PRSEX   -0.150038
10    COMB4DTBFM   -0.096223
2      COMB4DAFM   -0.068263
15   COMB4IFCHAR   -0.045881
4     COMB1PRAGE   -0.033069
0      COMB4DALM    0.000000
12  COMB4IRESINF    0.000000
13  COMB4IRESEXC    0.000000
8     COMB4IMICF    0.000000
11    COMB4IRES0    0.000000
9     COMB4IMFFM    0.000000
7     COMB4IMECF    0.000000
6    COMB4DLR3MD   -0.000000
16    COMB4IMCAP    0.000000


In [12]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("LASSO: Combo Models, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (TB)
Best Alpha: 0.03199267137797385
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.5090690038938129
R-squared: 0.5403921002226426

TEST Set Metrics:
Mean Squared Error: 0.4901736478232628
R-squared: 0.4384139878077592
          Feature  Importance
5        COMB4P1A    0.343529
3      COMB1PRSEX   -0.245836
16   COMB4IRESINC   -0.238029
2      COMB4DTBFM   -0.122377
17    COMB4IFCHAR   -0.044877
12     COMB4DTBFM   -0.020653
4      COMB1PRAGE   -0.018406
6     COMB4DLR3MD    0.014218
0       COMB4IALM    0.000000
15   COMB4IRESEXC    0.000000
14   COMB4IRESINF   -0.000000
13     COMB4IRES0    0.000000
9      COMB4IMECF    0.000000
11     COMB4IMFFM    0.000000
10     COMB4IMICF    0.000000
1      COMB4DTBBM    0.000000
8   COMB4DLSL14MD   -0.000000
7     COMB4DLFNMD   -0.000000
18     COMB4IMCAP    0.000000


In [13]:
#COMBO: Jump Power (LEGS)

print("LASSO: Combo Models, Jump Power (Legs)")

train_test_lasso(df, ['COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (Legs)
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03323145456494629
R-squared: 0.7466036073631044

TEST Set Metrics:
Mean Squared Error: 0.035780175946864744
R-squared: 0.7245095417989287
         Feature  Importance
12  COMB4IRESINF    1.276814
13  COMB4IRESEXC   -0.906481
14  COMB4IRESINC   -0.443315
7     COMB4IMECF   -0.337111
8     COMB4IMICF    0.261308
3     COMB1PRSEX   -0.147421
0      COMB4ILLM    0.138329
4     COMB1PRAGE   -0.120048
2      COMB4DLFM   -0.049154
5       COMB4P1A    0.038952
15   COMB4IFCHAR   -0.033114
1      COMB4DLBM    0.033018
10    COMB4DTBFM    0.009999
6    COMB4DLFNMD    0.005253
16    COMB4IMCAP   -0.001153
9     COMB4IMFFM    0.000000
11    COMB4IRES0   -0.000000


In [14]:
#COMBO: Jump Power (TOTAL BODY)

print("LASSO: Combo Models, Jump Power (TB)")

train_test_lasso(df, ['COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (TB)
Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03166965347556992
R-squared: 0.7585126485785858

TEST Set Metrics:
Mean Squared Error: 0.03360650480286075
R-squared: 0.741245783127909
          Feature  Importance
14   COMB4IRESINF    1.215816
15   COMB4IRESEXC   -0.862289
16   COMB4IRESINC   -0.434876
9      COMB4IMECF   -0.314897
10     COMB4IMICF    0.217861
0       COMB4IALM    0.143704
3      COMB1PRSEX   -0.123538
1      COMB4DTBBM    0.116239
4      COMB1PRAGE   -0.111376
8   COMB4DLSL14MD   -0.055176
17    COMB4IFCHAR   -0.027319
2      COMB4DTBFM   -0.025941
5        COMB4P1A    0.022783
7     COMB4DLFNMD    0.005134
6     COMB4DLR3MD   -0.004926
12     COMB4DTBFM   -0.000628
11     COMB4IMFFM    0.000000
13     COMB4IRES0   -0.000000
18     COMB4IMCAP    0.000000
