In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import *
from random import seed
from scipy import stats
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso
import statsmodels.api as sm
from statsmodels.stats.api import anova_lm
from statsmodels.formula.api import ols
from statsmodels.regression import linear_model
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from statsmodels.stats.mediation import Mediation
from sklearn.svm import SVR
from pingouin import ancova
import matplotlib.cm as cm
from scipy.stats import pearsonr
%matplotlib inline

# Seed Python's `random` module and NumPy's legacy global RNG with the same value.
seed(888)
np.random.seed(888)
# None lifts the limit on how many columns pandas shows when displaying a frame.
pd.set_option('display.max_columns', None)

# 1 Load validation set

In [None]:
# Read the pickled validation data and wrap the loaded object in a DataFrame.
main_test = pd.DataFrame(pd.read_pickle("2_val_test.pkl"))

# 2 Preprocessing

## 2.1 Recoding

In [None]:
# Recode imaging site (IS) and sex to numeric codes.
# Site codes are mapped in the order 11025, 11026, 11027, 11028 -> 0, 1, 2, 3
# (presumably Cheadle, Reading, Newcastle, Bristol, following the order of the
# original note -- confirm against the data dictionary). Values missing from the
# mapping become NaN.
main_test['IS'] = main_test['IS'].map({'11025': 0, '11026': 1, '11027': 2, '11028': 3})

# Female -> 0, Male -> 1; every other value becomes NaN. The integer cast is applied
# only when all rows were mapped: astype("int") raises on NaN, which would abort the
# notebook before the later "Sex other than Male or Female" filter can drop such rows.
sex_codes = main_test["Sex"].map({"Female": 0, "Male": 1})
main_test["Sex"] = sex_codes if sex_codes.isna().any() else sex_codes.astype("int")

# One-hot encode the site; drop_first=True removes the first level, leaving IS_1..IS_3.
# The dtype is pinned because pandas >= 2.0 produces boolean dummies by default, while
# the later z-scoring treats these columns as numbers.
main_test = pd.get_dummies(main_test, columns=["IS"], prefix="IS", drop_first=True, dtype="uint8")

# delete unnecessary variables
main_test = main_test.drop(columns=["ID", "MVPA", "TPA"])

## 2.2 Remove outliers on SRPA

In [None]:
# Remove outliers on self-reported physical activity, one intensity at a time.
# A row is kept only if its value lies below mean + 3 * SD of the column; each cut-off
# is computed on the rows that survived the previous intensity. Rows whose value is NaN
# fail the `<` comparison, so they are dropped (and counted) as well.
n_before = main_test.shape[0]
for sr_column in ("SRLPA", "SRMPA", "SRVPA"):
    sr_values = main_test[sr_column]
    upper_limit = np.mean(sr_values) + np.std(sr_values) * 3
    main_test = main_test[sr_values < upper_limit]
n_after = main_test.shape[0]
print("Outliers on self-reported PA:", n_before - n_after)
print(n_after)

## 2.3 Convert SRPA from MET min/week to min/week

In [None]:
# Divide each self-reported PA column by its per-intensity factor
# (3.3 for light, 4 for moderate, 8 for vigorous) to go from MET min/week to min/week.
for sr_column, met_factor in (("SRLPA", 3.3), ("SRMPA", 4), ("SRVPA", 8)):
    main_test[sr_column] = main_test[sr_column] / met_factor

## 2.4 Remove people with Sex other than Male or Female

In [None]:
# Keep only rows whose recoded Sex is 0 or 1 and report how many rows were removed.
n_before = main_test.shape[0]
is_female_or_male = main_test["Sex"].isin([0, 1])
main_test = main_test[is_female_or_male]
n_after = main_test.shape[0]
print("Sex not Male or Female:", n_before - n_after)
print(n_after)

# 3 Basic distributions of validation set

## 3.1 Data and Descriptives

In [None]:
# Display the first rows of the preprocessed validation set as the cell output.
main_test.head()

In [None]:
# Sample size per sex, followed by the mean and standard deviation of every variable.
# The per-sex subsets are only used for the two counts; the means and standard
# deviations below are computed on the whole validation set.
female = main_test[main_test["Sex"] == 0]
male = main_test[main_test["Sex"] == 1]

print("Number of females:", female.shape[0])
print("NUmber of males:", male.shape[0])

descriptive_columns = [
    "Age",
    "LPA", "MPA", "VPA",
    "SRLPA", "SRMPA", "SRVPA",
    "DBP", "SBP", "BMI", "HG", "HR",
]
for variable in descriptive_columns:
    values = main_test[variable]
    mean = values.mean()
    std = values.std()
    print(variable)
    print("Mean", mean)
    print("Std", std)

#print("IS")
#print(main_test["IS"].value_counts())

## 3.2 Physical activity

In [None]:
# Density plot of the three accelerometer-measured PA intensities on shared axes;
# the figure is also written to valpa.png.
plt.figure(figsize=(9,6))
accelerometer_series = (
    ("LPA", "Light PA"),
    ("MPA", "Moderate PA"),
    ("VPA", "Vigorous PA"),
)
for pa_column, pa_label in accelerometer_series:
    sns.kdeplot(data=main_test, x=pa_column, fill=True, label=pa_label)
plt.xlabel("physical activity [min/week]", fontsize=16)
plt.xticks(range(0, 5000, 1000), fontsize=14)
plt.ylabel("density", fontsize=16)
plt.yticks(fontsize=14)
plt.legend()
plt.savefig('valpa.png', format='png', dpi=800)
plt.show()

# Density plot of the three self-reported PA intensities on shared axes;
# the figure is also written to valsrpa.png.
plt.figure(figsize=(9,6))
sns.kdeplot(data = main_test, x= 'SRLPA', fill=True, label = "SR Light PA")
sns.kdeplot(data = main_test, x= 'SRMPA', fill=True, label = "SR Moderate PA")
sns.kdeplot(data = main_test, x= 'SRVPA', fill=True, label = "SR Vigorous PA")
plt.xlabel("physical activity [min/week]", fontsize = 16)
plt.xticks(range(0,2000,500),fontsize = 14)
plt.yticks(fontsize = 14)
plt.ylabel("density",fontsize = 16)
# The legend must be drawn before the figure is written to disk; otherwise the saved
# PNG lacks the legend that the inline figure shows.
plt.legend()
plt.savefig('valsrpa.png', format='png', dpi=800)
plt.show()

# plt.figure()
# sns.kdeplot(data = main_test, x= 'SRLPA', fill=True, label = "SR Light PA", color="#6baed6")
# sns.kdeplot(data = main_test, x= 'LPA', fill=True, label = "Light PA", color="#08519c")
# plt.xlabel("Min/week of physical activity", fontsize = 16)
# plt.xticks(range(0,4500,1000),fontsize = 14)
# plt.yticks(fontsize = 14)
# plt.ylabel("Density",fontsize = 16)
# plt.legend()
# plt.show()

# plt.figure()
# sns.kdeplot(data = main_test, x= 'SRMPA', fill=True, label = "SR Moderate PA", color="#fd8d3c")
# sns.kdeplot(data = main_test, x= 'MPA', fill=True, label = "Moderate PA", color="#a63603")
# plt.xlabel("Min/week of physical activity", fontsize = 16)
# plt.xticks(range(0,2500,500),fontsize = 14)
# plt.yticks(fontsize = 14)
# plt.ylabel("Density",fontsize = 16)
# plt.legend()
# plt.show()

# plt.figure()
# sns.kdeplot(data = main_test, x= 'SRVPA', fill=True, label = "SR Vigorous PA", color="#74c476")
# sns.kdeplot(data = main_test, x= 'VPA', fill=True, label = "Vigorous PA", color="#006d2c")
# plt.ylim(0,0.01)
# plt.xticks(range(0,1500,500),fontsize = 14)
# plt.yticks(fontsize = 14)
# plt.ylabel("Density",fontsize = 16)
# plt.xlabel("Min/week of physical activity", fontsize = 16)
# plt.legend()
# plt.show()

# Scatter of accelerometer PA (x) against self-reported PA (y) for the moderate and
# vigorous intensities; the figure is also written to valpascat.png.
plt.figure(figsize=(9,6))
scatter_series = (
    ("MPA", "SRMPA", "Moderate PA", "darkcyan"),
    ("VPA", "SRVPA", "Vigorous PA", "mediumvioletred"),
)
for measured, reported, series_label, series_color in scatter_series:
    plt.scatter(measured, reported, data=main_test, label=series_label, alpha=0.3, s=10, color=series_color)
plt.ylabel("self-reported PA [min/week]", fontsize=16)
plt.xlabel("accelerometer PA [min/week]", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(loc="upper right")
plt.savefig('valpascat.png', format='png', dpi=800)
plt.show()

# 4 Reorder Dataframe

In [None]:
# Select the analysis columns in a fixed order; columns not listed here are dropped.
column_order = [
    'Age', 'Sex', 'IS_1', "IS_2", "IS_3",
    'LPA', 'MPA', 'VPA',
    'SRLPA', 'SRMPA', 'SRVPA',
    'HG', 'HR', 'BMI', 'DBP', 'SBP',
    'BA',
]
main_test = main_test[column_order]

# 5 Standardization of all variables

In [None]:
# Replace the listed columns by their z-scores; NaNs are left out of the statistics
# (nan_policy='omit'). BA is not in the list and therefore stays on its original scale.
main_test = pd.DataFrame(main_test)
zscore_columns = [
    "LPA", "MPA", "VPA",
    "SRLPA", "SRMPA", "SRVPA",
    "HR", "DBP", "SBP", "HG", "BMI",
    "Age", "Sex",
    'IS_1', "IS_2", "IS_3",
]
main_test[zscore_columns] = stats.zscore(main_test[zscore_columns], nan_policy='omit')

# 6 Correlation Heatmap

In [None]:
def calculate_pvalues(df, decimals=4):
    """Return the p-values of the pairwise Pearson correlations between columns of ``df``.

    For every ordered pair of columns the rows in which either value is missing are
    left out, ``pearsonr`` is run on the remaining rows and its p-value is stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one variable per column.
    decimals : int or None, default 4
        Number of decimals the p-values are rounded to. ``None`` keeps them unrounded.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are both ``df.columns`` in their
        original order, so position ``[i, j]`` lines up with ``df.corr()``.
    """
    names = list(df.columns)
    # Pre-allocate a float frame with explicit labels so that row order is guaranteed
    # to match column order.
    pvalues = pd.DataFrame(np.nan, index=names, columns=names, dtype=float)
    for r in names:
        for c in names:
            complete = df[r].notnull() & df[c].notnull()
            p_value = float(pearsonr(df.loc[complete, r], df.loc[complete, c])[1])
            if decimals is not None:
                p_value = round(p_value, decimals)
            # Single .loc write: chained assignment (pvalues[r][c] = ...) is not applied
            # when pandas runs with copy-on-write.
            pvalues.loc[c, r] = p_value
    return pvalues

In [None]:
# Correlation heatmap of all variables except BA, SBP and the imaging-site dummies.
corr_test = main_test.drop(columns=["BA", "SBP", "IS_1", "IS_2", "IS_3"])
corr = corr_test.corr().round(3)

fig, ax = plt.subplots(figsize=(13,10))
sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.xticks(size=10)
plt.yticks(size=10)

# p-values as a plain array, indexed by position like `corr`
p_values = np.asarray(calculate_pvalues(corr_test))

# Significance threshold.
# NOTE(review): 0.0007575 looks like 0.05 / 66, i.e. a Bonferroni correction over the
# unique variable pairs shown in the heatmap -- confirm.
threshold = 0.0007575

# Put an asterisk into every off-diagonal cell whose p-value is below the threshold.
n_variables = len(corr.columns)
for row in range(n_variables):
    for col in range(n_variables):
        if row == col:
            continue
        if p_values[row, col] < threshold:
            ax.text(col + 0.9, row + 0.2, "*", ha='right', va='top', color='black', fontsize=12)

plt.savefig('valheatmap.png', format='png', dpi=800)
plt.show()

# 7 Models with OLS

## 7.1 Baseline Covariates

In [None]:
# Baseline model: BA regressed on the covariates only (sex, age, imaging-site dummies).
# Rows with missing values in the model variables are dropped (missing='drop').
cov_model = ols('BA ~ Sex + Age + IS_1 + IS_2 + IS_3', missing='drop', data = main_test).fit()
print(cov_model.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = cov_model
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

## 7.2 Accelerometer PA

In [None]:
# BA regressed on accelerometer light PA, the covariates and the Sex x LPA interaction.
# Rows with missing values in the model variables are dropped (missing='drop').
lpa = ols('BA ~ LPA + Age + Sex + IS_1 + IS_2 + IS_3 + Sex:LPA', missing='drop', data = main_test).fit()
print(lpa.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = lpa
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

In [None]:
# BA regressed on accelerometer moderate PA, the covariates and the Sex x MPA interaction.
# Rows with missing values in the model variables are dropped (missing='drop').
mpa = ols('BA ~ MPA + Sex + Age + IS_1 + IS_2 + IS_3 + Sex:MPA', missing='drop', data = main_test).fit()
print(mpa.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = mpa
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

In [None]:
# BA regressed on accelerometer vigorous PA, the covariates and the Sex x VPA interaction.
# Rows with missing values in the model variables are dropped (missing='drop').
vpa = ols('BA ~ VPA + Age + Sex + IS_1 + IS_2 + IS_3 + Sex:VPA', missing='drop', data = main_test).fit()
print(vpa.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = vpa
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

## 7.3 Self-report PA

In [None]:
# BA regressed on self-reported light PA, the covariates and the Sex x SRLPA interaction.
# Rows with missing values in the model variables are dropped (missing='drop').
srlpa = ols('BA ~ SRLPA + Age + Sex + IS_1 + IS_2 + IS_3 + Sex:SRLPA', missing='drop', data = main_test).fit()
print(srlpa.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = srlpa
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

In [None]:
# BA regressed on self-reported moderate PA, the covariates and the Sex x SRMPA interaction.
# Rows with missing values in the model variables are dropped (missing='drop').
srmpa = ols('BA ~ SRMPA + Sex + Age + IS_1 + IS_2 + IS_3 + Sex:SRMPA', missing='drop', data = main_test).fit()
print(srmpa.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = srmpa
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

In [None]:
# BA regressed on self-reported vigorous PA, the covariates and the Sex x SRVPA interaction.
# Rows with missing values in the model variables are dropped (missing='drop').
srvpa = ols('BA ~ SRVPA + Age + Sex + IS_1 + IS_2 + IS_3 + Sex:SRVPA', missing='drop', data = main_test).fit()
print(srvpa.summary())

# Effect size per coefficient: coefficient / (standard error * sqrt(N)).
# N is read from the fitted model (the rows actually used after missing='drop')
# instead of a hard-coded sample size, so it cannot go stale when preprocessing changes.
results = srvpa
coefficients = results.params
stderr = results.bse
effect_size = coefficients / (stderr * np.sqrt(results.nobs))
print(effect_size)

# 8 Model comparisons

## 8.1 Cov vs. PA

In [None]:
# Compare the covariate-only model with the accelerometer light-PA model via anova_lm
# and print the comparison table.
# NOTE(review): both models were fitted with missing='drop'; the comparison presumes
# they ended up using the same rows -- confirm.
model_comparison = anova_lm(cov_model, lpa)
print(model_comparison)

In [None]:
# Compare the covariate-only model with the accelerometer moderate-PA model via anova_lm
# and print the comparison table.
# NOTE(review): both models were fitted with missing='drop'; the comparison presumes
# they ended up using the same rows -- confirm.
model_comparison = anova_lm(cov_model, mpa)
print(model_comparison)

In [None]:
# Compare the covariate-only model with the accelerometer vigorous-PA model via anova_lm
# and print the comparison table.
# NOTE(review): both models were fitted with missing='drop'; the comparison presumes
# they ended up using the same rows -- confirm.
model_comparison = anova_lm(cov_model, vpa)
print(model_comparison)

## 8.2 Cov vs. SRPA

In [None]:
# Compare the covariate-only model with the self-reported light-PA model via anova_lm
# and print the comparison table.
# NOTE(review): both models were fitted with missing='drop'; the comparison presumes
# they ended up using the same rows -- confirm.
model_comparison = anova_lm(cov_model, srlpa)
print(model_comparison)

In [None]:
# Compare the covariate-only model with the self-reported moderate-PA model via anova_lm
# and print the comparison table.
# NOTE(review): both models were fitted with missing='drop'; the comparison presumes
# they ended up using the same rows -- confirm.
model_comparison = anova_lm(cov_model, srmpa)
print(model_comparison)

In [None]:
# Compare the covariate-only model with the self-reported vigorous-PA model via anova_lm
# and print the comparison table.
# NOTE(review): both models were fitted with missing='drop'; the comparison presumes
# they ended up using the same rows -- confirm.
model_comparison = anova_lm(cov_model, srvpa)
print(model_comparison)

# 9 Mediation analysis

## 9.1 BMI

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ BMI + LPA + Gender + Age", main_test)
# mediator_model = sm.OLS.from_formula("BMI ~ LPA + Gender + Age", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "BMI", exposure = "LPA").fit()
# print(res.summary())

In [None]:
# Mediation analysis with MPA as exposure, BMI as mediator and BA as outcome,
# adjusted for age, sex and imaging site.
# NOTE(review): the mediation fit presumably draws on the NumPy RNG seeded at the top
# of the notebook -- confirm before comparing runs.
covariates = "Age + Sex + IS_1 + IS_2 + IS_3"
outcome_model = sm.OLS.from_formula(f"BA ~ BMI + MPA + {covariates}", main_test)
mediator_model = sm.OLS.from_formula(f"BMI ~ MPA + {covariates}", main_test)
res = Mediation(outcome_model, mediator_model, mediator="BMI", exposure="MPA").fit()
print(res.summary())

# Separate OLS fits: mediator on exposure, then outcome on mediator (without the exposure).
modelm = ols(f"BMI ~ MPA + {covariates}", missing='drop', data=main_test).fit()
print(modelm.summary())
modely = ols(f"BA ~ BMI + {covariates}", missing='drop', data=main_test).fit()
print(modely.summary())

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ BMI + VPA + Age + Gender", main_test)
# mediator_model = sm.OLS.from_formula("BMI ~ VPA + Age + Gender", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "BMI", exposure = "VPA").fit()
# print(res.summary())

## 9.2 DBP

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ DBP + LPA + Age + Gender", main_test)
# mediator_model = sm.OLS.from_formula("DBP ~ LPA + Age + Gender", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "DBP", exposure = "LPA").fit()
# print(res.summary())

In [None]:
# Mediation analysis with MPA as exposure, DBP as mediator and BA as outcome,
# adjusted for age, sex and imaging site.
# NOTE(review): the mediation fit presumably draws on the NumPy RNG seeded at the top
# of the notebook -- confirm before comparing runs.
covariates = "Age + Sex + IS_1 + IS_2 + IS_3"
outcome_model = sm.OLS.from_formula(f"BA ~ DBP + MPA + {covariates}", main_test)
mediator_model = sm.OLS.from_formula(f"DBP ~ MPA + {covariates}", main_test)
res = Mediation(outcome_model, mediator_model, mediator="DBP", exposure="MPA").fit()
print(res.summary())

# Separate OLS fits: mediator on exposure, then outcome on mediator (without the exposure).
modelm = ols(f"DBP ~ MPA + {covariates}", missing='drop', data=main_test).fit()
print(modelm.summary())
modely = ols(f"BA ~ DBP + {covariates}", missing='drop', data=main_test).fit()
print(modely.summary())

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ DBP + VPA + Age + Gender", main_test)
# mediator_model = sm.OLS.from_formula("DBP ~ VPA + Age + Gender", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "DBP", exposure = "VPA").fit()
# print(res.summary())

## 9.3 Heart rate

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ HR + LPA + Age + Gender", main_test)
# mediator_model = sm.OLS.from_formula("HR ~ LPA + Age + Gender", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "HR", exposure = "LPA").fit()
# print(res.summary())

In [None]:
# Mediation analysis with MPA as exposure, HR as mediator and BA as outcome,
# adjusted for age, sex and imaging site.
# NOTE(review): the mediation fit presumably draws on the NumPy RNG seeded at the top
# of the notebook -- confirm before comparing runs.
covariates = "Age + Sex + IS_1 + IS_2 + IS_3"
outcome_model = sm.OLS.from_formula(f"BA ~ HR + MPA + {covariates}", main_test)
mediator_model = sm.OLS.from_formula(f"HR ~ MPA + {covariates}", main_test)
res = Mediation(outcome_model, mediator_model, mediator="HR", exposure="MPA").fit()
print(res.summary())

# Separate OLS fits: mediator on exposure, then outcome on mediator (without the exposure).
modelm = ols(f"HR ~ MPA + {covariates}", missing='drop', data=main_test).fit()
print(modelm.summary())
modely = ols(f"BA ~ HR + {covariates}", missing='drop', data=main_test).fit()
print(modely.summary())

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ HR + VPA + Age + Gender", main_test)
# mediator_model = sm.OLS.from_formula("HR ~ VPA + Age + Gender", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "HR", exposure = "VPA").fit()
# print(res.summary())

## 9.4 Hand grip

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ HG + LPA + Gender + Age", main_test)
# mediator_model = sm.OLS.from_formula("HG ~ LPA + Gender + Age", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "HG", exposure = "LPA").fit()
# print(res.summary())

In [None]:
# Mediation analysis with MPA as exposure, HG as mediator and BA as outcome,
# adjusted for age, sex and imaging site.
# NOTE(review): the mediation fit presumably draws on the NumPy RNG seeded at the top
# of the notebook -- confirm before comparing runs.
# The mediation formulas list Age last, the plain OLS formulas list it first; both term
# orders are kept exactly as they were.
mediation_covariates = "Sex + IS_1 + IS_2 + IS_3 + Age"
ols_covariates = "Age + Sex + IS_1 + IS_2 + IS_3"
outcome_model = sm.OLS.from_formula(f"BA ~ HG + MPA + {mediation_covariates}", main_test)
mediator_model = sm.OLS.from_formula(f"HG ~ MPA + {mediation_covariates}", main_test)
res = Mediation(outcome_model, mediator_model, mediator="HG", exposure="MPA").fit()
print(res.summary())

# Separate OLS fits: mediator on exposure, then outcome on mediator (without the exposure).
modelm = ols(f"HG ~ MPA + {ols_covariates}", missing='drop', data=main_test).fit()
print(modelm.summary())
modely = ols(f"BA ~ HG + {ols_covariates}", missing='drop', data=main_test).fit()
print(modely.summary())

In [None]:
# outcome_model = sm.OLS.from_formula("BA ~ HG + VPA + Gender + Age", main_test)
# mediator_model = sm.OLS.from_formula("HG ~ VPA + Gender + Age", main_test)
# res = Mediation(outcome_model, mediator_model, mediator = "HG", exposure = "VPA").fit()
# print(res.summary())

# 10 LASSO Regression

## 10.1 Accelerometer PA

In [None]:
# Complete cases for the accelerometer LASSO model. SRMPA is part of the selection
# although it is not a predictor here, so rows with missing SRMPA are dropped as well.
df = main_test[["Age","Sex", "MPA", "SRMPA", "HR", "DBP", "HG", "BMI", "BA", "IS_1", "IS_2", "IS_3"]]
df = df.dropna()
df = pd.DataFrame(df)
X = df[["Age","Sex","MPA", "HR", "DBP", "HG", "BMI", "IS_1", "IS_2", "IS_3"]]
Y = df["BA"]
# Display names of the predictors, in the same order as the columns of X.
columns = ["Age","Sex", "MPA", "HR", "DBP", "HG", "BMI", "IS 1", "IS 2", "IS 3"]

# Coefficient path: one Lasso fit per alpha on a log-spaced grid from 1e-3 to 1.
alphas = np.power(10, np.linspace(start=-3, stop=0, num=100))
coefs = pd.DataFrame([Lasso(alpha=a).fit(X, Y).coef_ for a in alphas], columns=columns)

# lasso with cross validated alpha
lassocv = LassoCV(alphas = alphas, random_state=888).fit(X,Y)
best_alpha = lassocv.alpha_
print("Optimal alpha:", best_alpha)

# Refit at the selected alpha; coefficients and R^2 are computed on the data used for fitting.
lasso = Lasso(alpha=best_alpha)
lasso.fit(X, Y)
coefficients = lasso.coef_
r_squared = lasso.score(X,Y)

# plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and no longer
# available in recent Matplotlib releases.
cmap = plt.get_cmap('turbo')
plt.figure(figsize=(7,6))
grey_shades = np.linspace(0.6, 0.75, 3)  # Three shades of mid-grey

num_colorful = 7  # First 7 columns use the colormap, the remaining ones shades of grey

# One colour per predictor, shared by its path line and its label.
colors = [
    cmap(i / (num_colorful - 1)) if i < num_colorful else (grey_shades[i - num_colorful],) * 3
    for i in range(len(columns))
]

for column, color in zip(columns, colors):
    plt.plot(alphas, coefs[column], label=column, color=color)

# Hand-tuned label placement per predictor: (offset in points, box style).
# Each label sits at x = label_x and at the height of the coefficient at best_alpha.
label_x = 0.009
label_layout = [
    ((0, -6), 'round'),     # Age
    ((0, 4), 'square'),     # Sex
    ((0, -5), 'square'),    # MPA
    ((-40, -1), 'square'),  # HR
    ((0, -2), 'square'),    # DBP
    ((0, -10), 'square'),   # HG
    ((-80, 0), 'round'),    # BMI
    ((0, -4), 'round'),     # IS 1
    ((0, -2), 'round'),     # IS 2
    ((0, -3), 'round'),     # IS 3
]
for column, y_pos, color, (offset, boxstyle) in zip(columns, coefficients, colors, label_layout):
    plt.annotate(column, (label_x, y_pos), textcoords="offset points", xytext=offset,
                 ha='center', fontsize=10, color=color,
                 bbox=dict(boxstyle=boxstyle, facecolor='white', edgecolor="white", pad=0))

plt.xticks(alphas, fontsize = 14)
plt.yticks(fontsize = 14)
plt.xscale("log")
# Dashed vertical line at the cross-validated alpha.
plt.axvline(x = best_alpha, color = 'gray', linestyle='dashed')
plt.axis('tight')
plt.xlabel('lambda', fontsize = 16)
plt.xlim(0.001, 1)
plt.ylabel('coefficients', fontsize = 16)
plt.legend().set_visible(False)
plt.savefig('vallasso.png', format='png', dpi=800)
plt.show()

# Print the coefficients
print(coefficients)
print(r_squared)

## 10.2 Self-report PA

In [None]:
# Complete cases for the self-report LASSO model. The selection also contains PA
# columns that are not predictors here (LPA, MPA, VPA, SRLPA, SRVPA), so rows with
# missing values in any of them are dropped as well.
df = main_test[["Age", "Sex", "LPA", "MPA", "VPA",  "SRLPA", "SRMPA", "SRVPA", "HR", "DBP", "HG", "BMI", "BA", "IS_1", "IS_2", "IS_3"]]
df = df.dropna()
df = pd.DataFrame(df)
X = df[["Age", "Sex", "SRMPA", "HR", "DBP", "HG", "BMI", "IS_1", "IS_2", "IS_3"]]
Y = df["BA"]
# Display names of the predictors, in the same order as the columns of X.
columns = ["Age", "Sex", "SRMPA", "HR", "DBP", "HG", "BMI", "IS 1", "IS 2", "IS 3"]

# Coefficient path: one Lasso fit per alpha on a log-spaced grid from 1e-3 to 1.
alphas = np.power(10, np.linspace(start=-3, stop=0, num=100))
coefs = pd.DataFrame([Lasso(alpha=a).fit(X, Y).coef_ for a in alphas], columns=columns)

# lasso with cross validated alpha
lassocv = LassoCV(alphas = alphas, random_state=888).fit(X,Y)
best_alpha = lassocv.alpha_
print("Optimal alpha:", best_alpha)

# Refit at the selected alpha; coefficients and R^2 are computed on the data used for fitting.
lasso = Lasso(alpha=best_alpha)
lasso.fit(X, Y)
coefficients = lasso.coef_
r_squared = lasso.score(X,Y)

# plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and no longer
# available in recent Matplotlib releases.
cmap = plt.get_cmap('turbo')
plt.figure(figsize=(7,6))
grey_shades = np.linspace(0.6, 0.75, 3)  # Three shades of mid-grey

num_colorful = 7  # First 7 columns use the colormap, the remaining ones shades of grey

# One colour per predictor, shared by its path line and its label.
colors = [
    cmap(i / (num_colorful - 1)) if i < num_colorful else (grey_shades[i - num_colorful],) * 3
    for i in range(len(columns))
]

for column, color in zip(columns, colors):
    plt.plot(alphas, coefs[column], label=column, color=color)

# Hand-tuned label placement per predictor: (offset in points, box style).
# Each label sits at x = label_x and at the height of the coefficient at best_alpha.
label_x = 0.007
label_layout = [
    ((0, -6), 'round'),     # Age
    ((0, 3), 'square'),     # Sex
    ((0, -3), 'square'),    # SRMPA
    ((0, -3), 'square'),    # HR
    ((0, -2), 'square'),    # DBP
    ((0, -9), 'square'),    # HG
    ((-40, -1), 'round'),   # BMI
    ((0, -4), 'round'),     # IS 1
    ((-40, -2), 'round'),   # IS 2
    ((0, -4), 'round'),     # IS 3
]
for column, y_pos, color, (offset, boxstyle) in zip(columns, coefficients, colors, label_layout):
    plt.annotate(column, (label_x, y_pos), textcoords="offset points", xytext=offset,
                 ha='center', fontsize=10, color=color,
                 bbox=dict(boxstyle=boxstyle, facecolor='white', edgecolor="white", pad=0))

plt.xticks(alphas, fontsize = 14)
plt.yticks(fontsize = 14)
plt.xscale("log")
plt.axis('tight')
plt.xlabel('lambda', fontsize = 16)
# Dashed vertical line at the cross-validated alpha.
plt.axvline(x = best_alpha, color = 'gray', linestyle='dashed')
plt.xlim(0.001, 1)
plt.ylabel('coefficients', fontsize = 16)
plt.legend().set_visible(False)
plt.savefig('vallassosr.png', format='png', dpi=800)
plt.show()


# Print the coefficients
print(coefficients)
print(r_squared)