In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

#loading data
data = pd.read_csv('/Users/yunjuha/Desktop/MIDUS3/ML Models/Combined_MIDUS3_Refresher.csv')
df = pd.DataFrame(data)

#TRANSFORMING THE TARGETS
df['tjumppownums'] = np.log(df['jumppownums'] + 0.1)
df['tCOMB4IMaxGrip'] = np.sqrt(df['COMB4IMaxGrip'])

#dropping columns
column_to_drop = ['M2ID', 'MIDUSID', 'SAMPLMAJ', 'Height.cm.', 'Weight.kg.', 'Age.years.', 'TBW.litres.', 'ECF.litres.', 'ICF.litres.']
df = df.drop(column_to_drop, axis=1)

In [8]:
#number of individuals before NA is removed
print(len(df))

544


In [9]:
#this is the number of people we have when race IS INCLUDED
df = df.dropna()
print(len(df))

422


In [10]:
#counting the total number of individuals who are white or black (ignore the other races for now...)
race = (df['COMB1PF7A'] == 1) | (df['COMB1PF7A'] == 2)
df = df[race]

print(len(df))

413


# LASSO

In [11]:
#LASSO Function

def train_test_lasso(df, feature_columns, target_column, test_size=0.2, random_state=42):
    columns = feature_columns + [target_column]
    X = df[columns]
    
    y = X.pop(target_column)
    
    #dividing our data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    #scaling our features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    lasso_model = Lasso()

    #this is our parameter grid for hyperparameter tuning
    alphas = np.logspace(-4, 0, 100)
    max_iters = [10000, 20000, 30000, 50000]
    param_grid = {'alpha': alphas, 'max_iter': max_iters}

    #doing grid search with cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    best_alpha = grid_search.best_params_['alpha']
    best_max_iter = grid_search.best_params_['max_iter']

    print("Best Alpha:", best_alpha)
    print("Best Max Iterations:", best_max_iter)

    #running the model using the best hyperparameters
    best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
    best_lasso_model.fit(X_train_scaled, y_train)
    
    #evaluation of the model on the TRAINING set
    y_pred_train = best_lasso_model.predict(X_train_scaled)
    mse_train = metrics.mean_squared_error(y_train, y_pred_train)
    r2_train_21 = metrics.r2_score(y_train, y_pred_train)
    print("\nTRAINING Set Metrics:")
    print("Mean Squared Error:", mse_train)
    print("R-squared:", r2_train_21)

    #evaluation of the model on the TESTING set
    y_pred = best_lasso_model.predict(X_test_scaled)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2_test_21 = metrics.r2_score(y_test, y_pred)
    print("\nTEST Set Metrics:")
    print("Mean Squared Error:", mse)
    print("R-squared:", r2_test_21)

    #feature importance (coefficients) from the Lasso model
    feature_importance = best_lasso_model.coef_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

    #sorting the features based on their importance (absolute value of coefficients) in descending order
    importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)
    
    print(importance_df)
    
    #count the frequencies in training data
    white_train_counts = X_train[X_train['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()
    white_test_counts = X_test[X_test['COMB1PF7A'] == 1]['COMB1PF7A'].value_counts()

    black_train_counts = X_train[X_train['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    black_test_counts = X_test[X_test['COMB1PF7A'] == 2]['COMB1PF7A'].value_counts()
    
    print("white train frequency: ", white_train_counts.values[0])
    print("white test frequency: ", white_test_counts.values[0], "\n")
    
    print("black train frequency: ", black_train_counts.values[0])
    print("black test frequency: ", black_test_counts.values[0])

In [12]:
#DXA: Handgrip Strength (ARMS)

print("LASSO: DXA Model, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (Arms)


Best Alpha: 0.007924828983539177
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.4989804025875539
R-squared: 0.5176362481179676

TEST Set Metrics:
Mean Squared Error: 0.392553210019003
R-squared: 0.5890486351395714
       Feature  Importance
4   COMB1PRSEX   -0.246612
1    COMB4DALM    0.208893
6     COMB4P1A    0.164981
0    COMB1PF7A   -0.140794
3    COMB4DAFM   -0.130594
2    COMB4DABM    0.115591
5   COMB1PRAGE   -0.093575
7  COMB4DLR3MD   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [13]:
#DXA: Handgrip Strength (TOTAL BODY)

print("LASSO: DXA Model, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tCOMB4IMaxGrip')

LASSO: DXA Model, Handgrip Strength (TB)
Best Alpha: 0.010476157527896652
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.4974089369279115
R-squared: 0.519155382071141

TEST Set Metrics:
Mean Squared Error: 0.4171160566709989
R-squared: 0.5633345788056379
         Feature  Importance
4     COMB1PRSEX   -0.308693
1      COMB4IALM    0.235871
3     COMB4DTBFM   -0.192075
6       COMB4P1A    0.191922
0      COMB1PF7A   -0.128019
5     COMB1PRAGE   -0.067233
7    COMB4DLR3MD    0.047280
9  COMB4DLSL14MD   -0.021209
8    COMB4DLFNMD   -0.002756
2     COMB4DTBBM   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [14]:
#DXA: Jump Power (LEGS)

print("LASSO: DXA Model, Jump Power (Legs)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (Legs)


Best Alpha: 0.002364489412645407
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.033262895076554326
R-squared: 0.7416009050391352

TEST Set Metrics:
Mean Squared Error: 0.03162560155841644
R-squared: 0.7680568928239696
       Feature  Importance
1    COMB4ILLM    0.207423
5   COMB1PRAGE   -0.155741
3    COMB4DLFM   -0.078487
2    COMB4DLBM    0.033405
4   COMB1PRSEX   -0.017261
6     COMB4P1A    0.009448
0    COMB1PF7A   -0.001918
7  COMB4DLFNMD    0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [15]:
#DXA: Jump Power (TOTAL BODY)

print("LASSO: DXA Model, Jump Power (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD'], 'tjumppownums')

LASSO: DXA Model, Jump Power (TB)
Best Alpha: 0.0021544346900318843
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.03003332085126144
R-squared: 0.7666894926381393

TEST Set Metrics:
Mean Squared Error: 0.034907878272739924
R-squared: 0.7439845772879156
         Feature  Importance
1      COMB4IALM    0.195775
5     COMB1PRAGE   -0.138248
2     COMB4DTBBM    0.077994
9  COMB4DLSL14MD   -0.065943
3     COMB4DTBFM   -0.055462
7    COMB4DLR3MD    0.040228
0      COMB1PF7A   -0.016851
8    COMB4DLFNMD    0.008039
4     COMB1PRSEX   -0.000000
6       COMB4P1A   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [16]:
#BIS: Handgrip Strength

print("LASSO: BIS Model, Handgrip Strength")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tCOMB4IMaxGrip')

LASSO: BIS Model, Handgrip Strength
Best Alpha: 0.009545484566618348
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.4751581029317945
R-squared: 0.5406652363924997

TEST Set Metrics:
Mean Squared Error: 0.4232203045082896
R-squared: 0.5569442375317516
         Feature  Importance
13      COMB4P1A    0.307762
11    COMB1PRSEX   -0.281321
8   COMB4IRESINC   -0.259446
4     COMB4DTBFM   -0.151847
0      COMB1PF7A   -0.100740
9    COMB4IFCHAR   -0.068776
10    COMB4IMCAP   -0.062899
12    COMB1PRAGE   -0.037253
1     COMB4IMECF   -0.000000
2     COMB4IMICF   -0.000000
3     COMB4IMFFM   -0.000000
5     COMB4IRES0    0.000000
6   COMB4IRESINF    0.000000
7   COMB4IRESEXC    0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [17]:
#BIS: Jump Power

print("LASSO: BIS Model, Jump Power")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A'], 'tjumppownums')

LASSO: BIS Model, Jump Power


Best Alpha: 0.0001
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.02824004092239362
R-squared: 0.7806203879965998

TEST Set Metrics:
Mean Squared Error: 0.03444187016406068
R-squared: 0.7474022946867969
         Feature  Importance
6   COMB4IRESINF    0.620252
2     COMB4IMICF    0.538616
7   COMB4IRESEXC   -0.481319
1     COMB4IMECF   -0.378706
8   COMB4IRESINC   -0.207309
11    COMB1PRSEX   -0.175059
10    COMB4IMCAP   -0.154108
12    COMB1PRAGE   -0.118618
9    COMB4IFCHAR   -0.093582
13      COMB4P1A    0.039589
4     COMB4DTBFM   -0.039422
0      COMB1PF7A    0.005484
3     COMB4IMFFM    0.000000
5     COMB4IRES0   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [18]:
#COMBO: Handgrip Strength (ARMS)

print("LASSO: Combo Models, Handgrip Strength (Arms)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4DALM', 'COMB4DABM', 'COMB4DAFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (Arms)


Best Alpha: 0.010476157527896652
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.47065463901894417
R-squared: 0.5450187295120752

TEST Set Metrics:
Mean Squared Error: 0.4019883127280184
R-squared: 0.5791713287339494
         Feature  Importance
6       COMB4P1A    0.245167
15  COMB4IRESINC   -0.224621
4     COMB1PRSEX   -0.224192
11    COMB4DTBFM   -0.159779
0      COMB1PF7A   -0.121956
2      COMB4DABM    0.102573
17    COMB4IMCAP   -0.075325
16   COMB4IFCHAR   -0.074986
1      COMB4DALM    0.049871
5     COMB1PRAGE   -0.043389
7    COMB4DLR3MD   -0.000000
8     COMB4IMECF   -0.000000
10    COMB4IMFFM   -0.000000
12    COMB4IRES0    0.000000
13  COMB4IRESINF    0.000000
14  COMB4IRESEXC    0.000000
3      COMB4DAFM    0.000000
9     COMB4IMICF   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [19]:
#COMBO: Handgrip Strength (TOTAL BODY)

print("LASSO: Combo Models, Handgrip Strength (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tCOMB4IMaxGrip')

LASSO: Combo Models, Handgrip Strength (TB)
Best Alpha: 0.015199110829529346
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.47810606552101953
R-squared: 0.5378154445217721

TEST Set Metrics:
Mean Squared Error: 0.42233477073360126
R-squared: 0.55787127443796
          Feature  Importance
6        COMB4P1A    0.308128
4      COMB1PRSEX   -0.275728
17   COMB4IRESINC   -0.221246
3      COMB4DTBFM   -0.133764
0       COMB1PF7A   -0.092055
18    COMB4IFCHAR   -0.037853
5      COMB1PRAGE   -0.031890
13     COMB4DTBFM   -0.014713
9   COMB4DLSL14MD   -0.011097
8     COMB4DLFNMD   -0.002270
7     COMB4DLR3MD    0.002207
14     COMB4IRES0    0.000000
16   COMB4IRESEXC    0.000000
15   COMB4IRESINF    0.000000
10     COMB4IMECF   -0.000000
12     COMB4IMFFM   -0.000000
11     COMB4IMICF   -0.000000
1       COMB4IALM    0.000000
2      COMB4DTBBM   -0.000000
19     COMB4IMCAP   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
bla

In [20]:
#COMBO: Jump Power (LEGS)

print("LASSO: Combo Models, Jump Power (Legs)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4ILLM', 'COMB4DLBM', 'COMB4DLFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLFNMD', 'COMB4IMECF', 'COMB4IMICF', 'COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (Legs)


Best Alpha: 0.0003678379771828634
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.02637019890504225
R-squared: 0.7951460474105321

TEST Set Metrics:
Mean Squared Error: 0.03182755357194305
R-squared: 0.7665757707200533
         Feature  Importance
8     COMB4IMECF   -0.211593
1      COMB4ILLM    0.172588
9     COMB4IMICF    0.170072
13  COMB4IRESINF    0.169248
15  COMB4IRESINC   -0.142946
5     COMB1PRAGE   -0.106836
14  COMB4IRESEXC   -0.088450
4     COMB1PRSEX   -0.071762
17    COMB4IMCAP   -0.056435
3      COMB4DLFM   -0.045642
6       COMB4P1A    0.043514
16   COMB4IFCHAR   -0.041756
2      COMB4DLBM    0.024686
0      COMB1PF7A   -0.012486
11    COMB4DTBFM    0.009259
7    COMB4DLFNMD    0.001952
10    COMB4IMFFM    0.000000
12    COMB4IRES0   -0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21


In [21]:
#COMBO: Jump Power (TOTAL BODY)

print("LASSO: Combo Models, Jump Power (TB)")

train_test_lasso(df, ['COMB1PF7A', 'COMB4IALM', 'COMB4DTBBM', 'COMB4DTBFM', 'COMB1PRSEX', 'COMB1PRAGE', 'COMB4P1A', 'COMB4DLR3MD', 'COMB4DLFNMD', 'COMB4DLSL14MD', 'COMB4IMECF', 'COMB4IMICF','COMB4IMFFM', 'COMB4DTBFM', 'COMB4IRES0', 'COMB4IRESINF', 'COMB4IRESEXC', 'COMB4IRESINC', 'COMB4IFCHAR', 'COMB4IMCAP'], 'tjumppownums')

LASSO: Combo Models, Jump Power (TB)


Best Alpha: 0.001484968262254465
Best Max Iterations: 10000

TRAINING Set Metrics:
Mean Squared Error: 0.024608313549032368
R-squared: 0.8088330575270585

TEST Set Metrics:
Mean Squared Error: 0.03600684111676248
R-squared: 0.7359247509398611
          Feature  Importance
1       COMB4IALM    0.199195
10     COMB4IMECF   -0.117637
17   COMB4IRESINC   -0.114497
5      COMB1PRAGE   -0.099389
2      COMB4DTBBM    0.086362
9   COMB4DLSL14MD   -0.052564
6        COMB4P1A    0.043634
14     COMB4IRES0    0.043197
0       COMB1PF7A   -0.024624
7     COMB4DLR3MD    0.022286
18    COMB4IFCHAR   -0.014751
3      COMB4DTBFM   -0.007374
4      COMB1PRSEX   -0.006041
13     COMB4DTBFM   -0.004849
8     COMB4DLFNMD    0.002467
11     COMB4IMICF    0.000000
12     COMB4IMFFM   -0.000000
15   COMB4IRESINF    0.000000
16   COMB4IRESEXC    0.000000
19     COMB4IMCAP    0.000000
white train frequency:  251
white test frequency:  62 

black train frequency:  79
black test frequency:  21
