In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.model_selection import GridSearchCV

data = pd.read_csv('/Users/yunjuha/Desktop/SROP/DXA_BIS_Project/narrowed down list/narrowed_transformed_data.csv')
df = pd.DataFrame(data)
df = df.dropna()

# Lasso Models

In [2]:
#DXA model, handgrip strength (Arms)

print("MLR: DXA Model, Handgrip Strength (Arms)")

print("\nTRANSFORMED")
columns = ['RA4DALM', 'RA4DABM', 'RA4DAFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

MLR: DXA Model, Handgrip Strength (Arms)

TRANSFORMED
Best Alpha: 0.08902150854450393
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.8057440379110141
R-squared: 0.3687208887155251

TRAINING Set Metrics:
Mean Squared Error: 0.5324461818143136
R-squared: 0.5211126585873147
      Feature  Importance
3    RA1PRSEX   -0.402187
1     RA4DABM    0.248962
8      RA4P1A    0.074148
0     RA4DALM    0.000000
2     RA4DAFM   -0.000000
4     RA1PF7A   -0.000000
5   Age_40_50    0.000000
6   Age_51_61   -0.000000
7  Age_61plus   -0.000000
9   RA4DLR3MD    0.000000


In [3]:
#DXA model, handgrip strength (TB)

print("Lasso: DXA Model, Handgrip Strength (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

Lasso: DXA Model, Handgrip Strength (TB)

TRANSFORMED
Best Alpha: 0.026560877829466867
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.7994562521150469
R-squared: 0.37364720233680404

TRAINING Set Metrics:
Mean Squared Error: 0.5234229958359314
R-squared: 0.5292282009498037
       Feature  Importance
3     RA1PRSEX   -0.463062
0      RA4IALM    0.167635
8       RA4P1A    0.154175
2     RA4DTBFM   -0.111588
9    RA4DLR3MD    0.043877
5    Age_40_50    0.019857
6    Age_51_61   -0.016805
4      RA1PF7A   -0.014649
1     RA4DTBBM    0.000000
7   Age_61plus   -0.000000
10   RA4DLFNMD    0.000000
11    RA4DLSMD   -0.000000


In [4]:
#DXA model, jump power (Legs)

print("MLR: DXA Model, Jump Power (Legs)")

print("\nTRANSFORMED")
columns = ['RA4ILLM', 'RA4DLBM', 'RA4DLFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLFNMD']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

MLR: DXA Model, Jump Power (Legs)

TRANSFORMED
Best Alpha: 0.0031257158496882354
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.031784446934849185
R-squared: 0.7778120618017147

TRAINING Set Metrics:
Mean Squared Error: 0.034491751002571604
R-squared: 0.7045677170042773
      Feature  Importance
0     RA4ILLM    0.230873
7  Age_61plus   -0.106555
2     RA4DLFM   -0.097609
3    RA1PRSEX   -0.044297
9   RA4DLFNMD    0.043985
6   Age_51_61   -0.032467
5   Age_40_50   -0.010648
4     RA1PF7A   -0.007053
1     RA4DLBM    0.000000
8      RA4P1A    0.000000


In [5]:
#DXA model, jump power (TB)

print("Lasso: DXA Model, Jump Power (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA4DTBFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

Lasso: DXA Model, Jump Power (TB)

TRANSFORMED
Best Alpha: 0.0041320124001153384
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.03063773190504295
R-squared: 0.7858281285495767

TRAINING Set Metrics:
Mean Squared Error: 0.03006382195190246
R-squared: 0.7424942690162275
       Feature  Importance
0      RA4IALM    0.252593
2     RA4DTBFM   -0.107874
7   Age_61plus   -0.073140
11    RA4DLSMD   -0.064418
10   RA4DLFNMD    0.062543
1     RA4DTBBM    0.025896
9    RA4DLR3MD    0.025528
6    Age_51_61   -0.020998
4      RA1PF7A   -0.018153
3     RA1PRSEX   -0.001397
5    Age_40_50   -0.000000
8       RA4P1A   -0.000000


In [6]:
#BIS Model, handgrip strength

print("Lasso: BIS Model, Handgrip Strength")

print("\nTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

Lasso: BIS Model, Handgrip Strength

TRANSFORMED
Best Alpha: 0.06734150657750829
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.7854498261319686
R-squared: 0.3846208660945817

TRAINING Set Metrics:
Mean Squared Error: 0.5267236488731454
R-squared: 0.5262595610911591
       Feature  Importance
10    RA1PRSEX   -0.463386
15      RA4P1A    0.201971
7   RA4IRESINC   -0.133330
3     RA4DTBFM   -0.008380
0     RA4IMECF    0.000000
1     RA4IMICF    0.000000
2     RA4IMFFM    0.000000
4     RA4IRES0   -0.000000
5   RA4IRESINF   -0.000000
6   RA4IRESEXC   -0.000000
8    RA4IFCHAR   -0.000000
9     RA4IMCAP    0.000000
11     RA1PF7A   -0.000000
12   Age_40_50    0.000000
13   Age_51_61   -0.000000
14  Age_61plus   -0.000000


In [7]:
#BIS Model, jumppower

print("Lasso: BIS Model, Jump Power")

print("\nTRANSFORMED")
columns = ['RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

Lasso: BIS Model, Jump Power

TRANSFORMED
Best Alpha: 0.00023101297000831605
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.030708695722430537
R-squared: 0.785332058748383

TRAINING Set Metrics:
Mean Squared Error: 0.03021443820527312
R-squared: 0.7412041952363768
       Feature  Importance
1     RA4IMICF    0.630075
0     RA4IMECF   -0.397771
9     RA4IMCAP   -0.223159
10    RA1PRSEX   -0.188537
5   RA4IRESINF    0.103823
8    RA4IFCHAR   -0.089501
14  Age_61plus   -0.076885
6   RA4IRESEXC   -0.074774
3     RA4DTBFM   -0.065637
7   RA4IRESINC   -0.057627
13   Age_51_61   -0.033408
4     RA4IRES0   -0.032187
12   Age_40_50   -0.027423
15      RA4P1A    0.026145
11     RA1PF7A    0.007058
2     RA4IMFFM    0.000000


In [8]:
#Combo Models, handgrip strength (Arms)
print("MLR: Combo Models, Handgrip Strength (Arms)")

print("\nTRANSFORMED")
columns = ['RA4DALM', 'RA4DABM', 'RA4DAFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP']
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

MLR: Combo Models, Handgrip Strength (Arms)

TRANSFORMED
Best Alpha: 0.04229242874389499
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.7680800159231048
R-squared: 0.3982296523044804

TRAINING Set Metrics:
Mean Squared Error: 0.5061607882306464
R-squared: 0.5447540005317261
       Feature  Importance
3     RA1PRSEX   -0.368550
1      RA4DABM    0.212095
8       RA4P1A    0.136006
17  RA4IRESINC   -0.115391
13    RA4DTBFM   -0.054737
5    Age_40_50    0.007846
4      RA1PF7A   -0.005901
0      RA4DALM    0.000000
18   RA4IFCHAR    0.000000
16  RA4IRESEXC   -0.000000
15  RA4IRESINF   -0.000000
14    RA4IRES0   -0.000000
10    RA4IMECF    0.000000
12    RA4IMFFM    0.000000
11    RA4IMICF    0.000000
9    RA4DLR3MD    0.000000
7   Age_61plus   -0.000000
6    Age_51_61   -0.000000
2      RA4DAFM   -0.000000
19    RA4IMCAP    0.000000


In [9]:
#Combo Models, handgrip strength (TB)

print("Lasso: Combo Models, Handgrip Strength (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tRA4IMaxGrip']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [10000, 20000, 30000, 50000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

Lasso: Combo Models, Handgrip Strength (TB)

TRANSFORMED
Best Alpha: 0.04229242874389499
Best Max Iterations: 10000

TEST Set Metrics:
Mean Squared Error: 0.775998599619775
R-squared: 0.3920256517243138

TRAINING Set Metrics:
Mean Squared Error: 0.5169995439524833
R-squared: 0.5350055168555633
       Feature  Importance
2     RA1PRSEX   -0.446021
7       RA4P1A    0.225807
18  RA4IRESINC   -0.158719
14    RA4DTBFM   -0.047405
8    RA4DLR3MD    0.015574
4    Age_40_50    0.003199
0      RA4IALM    0.000000
13    RA4IMFFM    0.000000
19   RA4IFCHAR    0.000000
17  RA4IRESEXC   -0.000000
16  RA4IRESINF   -0.000000
15    RA4IRES0   -0.000000
10    RA4DLSMD    0.000000
12    RA4IMICF    0.000000
11    RA4IMECF    0.000000
1     RA4DTBBM    0.000000
9    RA4DLFNMD    0.000000
6   Age_61plus   -0.000000
5    Age_51_61   -0.000000
3      RA1PF7A   -0.000000
20    RA4IMCAP    0.000000


In [10]:
#Combo Models, jumppower (Legs)

print("MLR: Combo Models, Jump Power (Legs)")

print("\nTRANSFORMED")
columns = ['RA4ILLM', 'RA4DLBM', 'RA4DLFM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLFNMD', 'RA4IMECF', 'RA4IMICF', 'RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP']
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [100000, 200000, 300000, 500000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

MLR: Combo Models, Jump Power (Legs)

TRANSFORMED
Best Alpha: 0.0001
Best Max Iterations: 100000

TEST Set Metrics:
Mean Squared Error: 0.0272855204692876
R-squared: 0.8092616320125097

TRAINING Set Metrics:
Mean Squared Error: 0.02483479054249781
R-squared: 0.7872825051084335
       Feature  Importance
15  RA4IRESINF    0.735387
16  RA4IRESEXC   -0.511309
10    RA4IMECF   -0.477401
11    RA4IMICF    0.382957
17  RA4IRESINC   -0.307531
0      RA4ILLM    0.291955
3     RA1PRSEX   -0.168906
19    RA4IMCAP   -0.119236
2      RA4DLFM   -0.061662
9    RA4DLFNMD    0.060938
1      RA4DLBM   -0.058924
8       RA4P1A    0.051931
7   Age_61plus   -0.049331
4      RA1PF7A   -0.031362
18   RA4IFCHAR   -0.028974
6    Age_51_61   -0.018622
13    RA4DTBFM   -0.010084
5    Age_40_50   -0.004647
12    RA4IMFFM    0.000000
14    RA4IRES0   -0.000000


In [11]:
#Combo Models, jumppower (TB)

print("Lasso: Combo Models, Jump Power (TB)")

print("\nTRANSFORMED")
columns = ['RA4IALM', 'RA4DTBBM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tjumppownums']

#dividing into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the Lasso regression model
lasso_model = Lasso()

# Define a parameter grid for hyperparameter tuning
alphas = np.logspace(-4, 0, 100)
max_iters = [100000, 200000, 300000, 500000]
param_grid = {'alpha': alphas, 'max_iter': max_iters}

# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search results
best_alpha = grid_search.best_params_['alpha']
best_max_iter = grid_search.best_params_['max_iter']

print("Best Alpha:", best_alpha)
print("Best Max Iterations:", best_max_iter)

# Retrain the model with the entire training set using the best hyperparameters
best_lasso_model = Lasso(alpha=best_alpha, max_iter=best_max_iter)
best_lasso_model.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
y_pred = best_lasso_model.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print("\nTEST Set Metrics:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Evaluate the model on the train set
y_pred_train = best_lasso_model.predict(X_train_scaled)
mse_train = metrics.mean_squared_error(y_train, y_pred_train)
r2_train = metrics.r2_score(y_train, y_pred_train)

print("\nTRAINING Set Metrics:")
print("Mean Squared Error:", mse_train)
print("R-squared:", r2_train)

# Get the feature importance (coefficients) from the Lasso model
feature_importance = best_lasso_model.coef_

# Create a DataFrame to associate each feature with its importance value
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the features based on their importance (absolute value of coefficients) in descending order
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)

print(importance_df)

Lasso: Combo Models, Jump Power (TB)

TRANSFORMED
Best Alpha: 0.0001
Best Max Iterations: 100000

TEST Set Metrics:
Mean Squared Error: 0.026975864371782433
R-squared: 0.8114262709000835

TRAINING Set Metrics:
Mean Squared Error: 0.023802768535850196
R-squared: 0.7961220858389971
       Feature  Importance
16  RA4IRESINF    0.558539
17  RA4IRESEXC   -0.378355
11    RA4IMECF   -0.372867
0      RA4IALM    0.304621
18  RA4IRESINC   -0.268781
12    RA4IMICF    0.225653
2     RA1PRSEX   -0.089887
20    RA4IMCAP   -0.071651
14    RA4DTBFM   -0.053495
9    RA4DLFNMD    0.052204
6   Age_61plus   -0.045836
10    RA4DLSMD   -0.043420
3      RA1PF7A   -0.041051
7       RA4P1A    0.038741
1     RA4DTBBM    0.021806
5    Age_51_61   -0.017769
8    RA4DLR3MD    0.015352
19   RA4IFCHAR   -0.008570
4    Age_40_50   -0.001387
13    RA4IMFFM    0.000000
15    RA4IRES0   -0.000000


In [12]:
columns = ['RA4IALM', 'RA4DTBBM', 'RA1PRSEX', 'RA1PF7A', 'Age_40_50', 'Age_51_61', 'Age_61plus', 'RA4P1A', 'RA4DLR3MD', 'RA4DLFNMD', 'RA4DLSMD', 'RA4IMECF', 'RA4IMICF','RA4IMFFM', 'RA4DTBFM', 'RA4IRES0', 'RA4IRESINF', 'RA4IRESEXC', 'RA4IRESINC', 'RA4IFCHAR', 'RA4IMCAP'] 
X = df[columns]
y = df['tjumppownums']

# Splitting the data into train, test, and validation sets (80 train, 10 test, 10 validation)
X_train, X_testval, y_train, y_testval = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_testval, y_testval, test_size=0.5, random_state=42)

sparsity_levels = [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035]
#sparsity_levels = [0.0001, 0.0002, 0.0004, 0.0008, 0.0016]

train_errors = []
val_errors = []
active_coefficients = []

for sparsity in sparsity_levels:
    print(sparsity)
    
    #fit lasso
    model = sm.OLS(y_train, X_train)
    results = model.fit_regularized(alpha=sparsity, L1_wt=1)

    #predicted for training and validation
    y_train_pred = results.predict(X_train)
    y_val_pred = results.predict(X_val)

    #mse
    train_error = np.mean((y_train_pred - y_train) ** 2)
    val_error = np.mean((y_val_pred - y_val) ** 2)

    train_errors.append(train_error)
    val_errors.append(val_error)

    #count of the non-zero coefficients
    non_zero_coeffs = np.sum(results.params != 0)

    #percentage of active coefficients
    active_percentage = non_zero_coeffs / len(X.columns) * 100
    active_coefficients.append(active_percentage)

# Plot the errors and active coefficients
fig, ax1 = plt.subplots()

ax1.plot(sparsity_levels, train_errors, label='Training error')
ax1.plot(sparsity_levels, val_errors, label='Validation error')
ax1.set_xlabel('Sparsity')
ax1.set_ylabel('Mean Squared Error')
ax1.legend(loc='upper left')

ax2 = ax1.twinx()
ax2.plot(sparsity_levels, active_coefficients, label='Active Coefficients', color='green')
ax2.set_ylabel('Active Coefficients (%)')
ax2.legend(loc='upper right')

plt.show()

0.005
0.01
0.015
0.02
0.025
0.03
0.035


NameError: name 'plt' is not defined