In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

In [2]:
# Record the interpreter version this notebook was executed with.
from platform import python_version
print(f"{python_version()}")

3.8.6


## Load feature matrix and rescale

In [3]:
# Read the feature matrix (first CSV column becomes the row index) and cast every entry to int.
df_X = pd.read_csv('data/pattern_all_isolates.csv', index_col=0)
df_X = df_X.astype(int)
# Fit a default StandardScaler on the features, then wrap the rescaled array back
# into a DataFrame that keeps the original row labels and the scaler's feature names.
scaler = StandardScaler().fit(df_X)
df_X_scaled = pd.DataFrame(
    data=scaler.transform(df_X),
    index=df_X.index,
    columns=scaler.get_feature_names_out(),
)

## Load heteroresistance phenotype

In [4]:
# Read the phenotype table and select its rows in the same order as the feature matrix index.
df_y = pd.read_csv("data/hr_phenotype.csv", index_col=0)
df_y = df_y.loc[df_X_scaled.index]
# Store the HR column as integers.
df_y['HR'] = df_y['HR'].astype(int)

## Run Lasso

In [5]:
# 5-fold cross-validated Lasso of HR on the standardised features.
lasso_res = LassoCV(cv=5, random_state=42, n_alphas=1000, max_iter=100000, verbose=0, n_jobs=-1)
# fit() returns the estimator itself, so lasso_res ends up holding the fitted model.
lasso_res.fit(df_X_scaled.to_numpy(), df_y['HR'].to_numpy())
print('best alpha = %2.4f' % lasso_res.alpha_)

best alpha = 0.0727


## Run phylogenetic Lasso

In [6]:
# Phylogenetic correction: with C = L @ L.T the Cholesky factorisation of the
# variance-covariance matrix, X and y are pre-multiplied by inv(L) before the
# Lasso fit so that the covariance described by C is removed.
df_phy = pd.read_csv('output/variance_covariance_matrix.csv', index_col=0)
# Reorder rows and columns so the matrix lines up with the rows of df_X_scaled.
isolate_ids = list(df_X_scaled.index)
df_phy = df_phy.loc[isolate_ids, isolate_ids]
L = np.linalg.cholesky(df_phy.values) # Cholesky decomposition (lower-triangular factor)
LT = np.transpose(L)
# The original expression inv(L.T @ L) @ L.T is algebraically inv(L) because L is
# square and invertible, but forming L.T @ L squares the condition number and the
# explicit inverse amplifies round-off. Solving L @ Z = B gives the same Z = inv(L) @ B
# with better numerical behaviour.
# `LT` and `transformer` stay defined in case later cells refer to them.
transformer = np.linalg.solve(L, np.eye(L.shape[0]))
X_phy_corrected = np.linalg.solve(L, df_X_scaled.values)
y_phy_corrected = np.linalg.solve(L, df_y[['HR']].values)
# NOTE(review): LassoCV fits an intercept by default, i.e. a constant column in the
# transformed space, whereas the whitening transform would map the intercept column
# to inv(L) @ 1 -- confirm that this is the intended model.
phylo_lasso_res = LassoCV(
    cv=5,
    random_state=42,
    n_alphas=1000,
    max_iter=100000,
    verbose=0,
    n_jobs=-1,
).fit(X_phy_corrected, np.ravel(y_phy_corrected))
print('best alpha = %2.4f'%(phylo_lasso_res.alpha_))

best alpha = 12.9179


## Combine Lasso and phylogenetic Lasso results

In [7]:
# One single-column coefficient table per model, both indexed by feature name.
df_lasso_res = pd.DataFrame({'Lasso': lasso_res.coef_}, index=df_X_scaled.columns)
df_phylo_lasso_res = pd.DataFrame({'PhyloLasso': phylo_lasso_res.coef_}, index=df_X_scaled.columns)
# Outer-join on the index, fill gaps with 0, label the index and order rows by the Lasso coefficient.
df_res_joined = (
    df_lasso_res
    .join(df_phylo_lasso_res, how='outer')
    .fillna(0)
    .rename_axis('Pattern')
    .sort_values(['Lasso'])
)
# Persist the combined table, then preview its first rows.
df_res_joined.to_csv("output/lasso_phylolasso_coefs.csv")
df_res_joined.head()

Pattern,Lasso,PhyloLasso
Pattern_4020,-0.019563,-0.0
Pattern_4054,-0.016259,-0.0
Pattern_212,-0.007649,-0.0804
Pattern_667,-0.007613,-0.071362
Pattern_206,-0.0,-0.0
