# Multiple regression with titers and number of HA1 mutations
We know that neutralization titers (specifically, the fraction of individuals with low neutralization titesr, as well as mean and median titers) and number of HA1 mutations both correlate strongly with MLR-estimated strain growth rates. These metrics are also collinear. Using multiple regression, we want to determine which of the predictors (neutralization titers or HA1 mutations) more fully explain the dependent outcome variable (growth rate). 

Author: Caroline Kikawa

In [2]:
# Import modules
import os
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.linear_model import Ridge
import statsmodels.api
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Input and output directories
datadir = '../data'
resultsdir = '../results'

Select the `growth_vs_titers` input to use from the top-level `results` directory. Note there are many different model fits to choose from and neutralization titers to choose from. I selected:
* MLR growth rate estimates from models fit to HA1 sequences within 1 amino acid mutation of a library strain with minimum 80 sequencing counts
* Neutralization titers from children and pre-vaccination adults (95 total individual sera)

In [57]:
growth_vs_titers = pd.read_csv(os.path.join(resultsdir, 'growth_vs_titers/growth_vs_titers_gisaid-ha1-within1_2023-mincounts80_child-and-adultprevax-sera_scatter.csv'))

## Repeated train-test splitting (80-20)

In [19]:
X = growth_vs_titers
X = X[['HA1_protein_mutations', 'frac_below_titer']]  
y = growth_vs_titers.growth_advantage_median

In [20]:
# Initialize the scaler
scaler = MinMaxScaler()
scaler = StandardScaler() # Uncomment to use z-scoring instead of min-max scaling

# Standardize X values 
X_scaled = scaler.fit_transform(X)

In [21]:
# Determine if X raw data or standardized data should be used
X = X_scaled

In [22]:
# define model
n_splits = 500  # Number of different random splits
mse_list = []
betas = []  # Store beta coefficients

for _ in range(n_splits):
    # Randomly split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

    # Train and evaluate model
    TT_model = LinearRegression()
    TT_model.fit(X_train, y_train)
    y_pred = TT_model.predict(X_test)

    # Store MSE
    mse_list.append(mean_squared_error(y_test, y_pred))

    # Store beta coefficients (intercept + slopes)
    beta_values = np.append(TT_model.intercept_, TT_model.coef_)  # [β0, β1, β2]
    betas.append(beta_values)

# Convert betas into a DataFrame
betas_df = pd.DataFrame(betas, columns=['Intercept'] + list(growth_vs_titers[['HA1_protein_mutations', 'frac_below_titer']].columns))

# Compute summary statistics
betas_mean = betas_df.mean()
betas_std = betas_df.std()

print(f"Average train-test set MSE: {np.mean(mse_list):.4f}")
print(f"Standard deviation of MSE: {np.std(mse_list):.4f}\n")

print('Average betas:\n', betas_mean, '\n')
print('Standard deviation of betas:\n', betas_std)

Average train-test set MSE: 0.0011
Standard deviation of MSE: 0.0007

Average betas:
 Intercept                0.986701
HA1_protein_mutations    0.035113
frac_below_titer         0.047537
dtype: float64 

Standard deviation of betas:
 Intercept                0.005296
HA1_protein_mutations    0.005174
frac_below_titer         0.006979
dtype: float64


## LeaveOneOut cross validation

In [23]:
# Initialize LeaveOneOUt module
leaveOneOut = LeaveOneOut()
# Initialize lists for model values
mse_list = []
beta_list = []
intercept_list = []

for train_index, test_index in leaveOneOut.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    loo_model = LinearRegression()
    loo_model.fit(X_train, y_train)
    y_pred = loo_model.predict(X_test)

    mse_list.append(mean_squared_error(y_test, y_pred))
    beta_list.append(loo_model.coef_)
    intercept_list.append(loo_model.intercept_)
    
# Calculate average MSE across all splits
average_mse = np.mean(mse_list)
std_mse = np.std(mse_list, ddof=1) 

print("LeaveOneOut average MSE:", average_mse)
print("LeaveOneOut standard deviation of MSE:", std_mse)

# Calculate average and standard deviation of betas
mean_X1 = sum(row[0] for row in beta_list) / len(beta_list)
mean_X2 = sum(row[1] for row in beta_list) / len(beta_list)

std_values = np.std(beta_list, axis=0, ddof=1)  

print("\nIntercept average:",  np.mean(intercept_list))
print("Beta average for HA1_mutations:", mean_X1)
print("Beta average for frac_below_cutoff:", mean_X2)

print("\nIntercept standard deviation:",  np.std(intercept_list))
print("Beta standard deviation for HA1_mutations:", std_values[0])
print("Beta standard deviation for frac_below_cutoff:", std_values[1])

LeaveOneOut average MSE: 0.001007555509317768
LeaveOneOut standard deviation of MSE: 0.0010380823835710949

Intercept average: 0.9866383292270368
Beta average for HA1_mutations: 0.03517572563840705
Beta average for frac_below_cutoff: 0.047214317537812765

Intercept standard deviation: 0.002645016115010733
Beta standard deviation for HA1_mutations: 0.002614126027510468
Beta standard deviation for frac_below_cutoff: 0.0034241336974872687


## Quantify multicollinearity with variance inflation factor (VIF)

In [24]:
X_scaled_with_intercept = np.column_stack((np.ones(len(X_scaled)), X_scaled))  # Add intercept for VIF calc
vif = [variance_inflation_factor(X_scaled_with_intercept, i) for i in range(1, X_scaled_with_intercept.shape[1])]
print(vif)  # Ignore the intercept

[np.float64(2.784140488315549), np.float64(2.7841404883155483)]


This isn't a *huge* VIF, meaning the betas are somewhat reliable but definitely still influenced by multicollinearity. 

## Permutation test (feature importance via shuffling)
We can shuffle the predictors and see how model performance is affected. Larger drops in MSE from the shuffling of a given variable implies that that variable is more important.
https://scikit-learn.org/stable/modules/permutation_importance.html

In [42]:
def permutation_importance(model, X, y, n_permutations=100):
    base_mse = mean_squared_error(y, model.predict(X))
    importances = []
    
    for col in range(X.shape[1]):
        permuted_mse = []
        for _ in range(n_permutations):
            X_permuted = X.copy()
            X_permuted[:, col] = shuffle(X[:, col], random_state=42)  # Shuffle one column
            permuted_mse.append(mean_squared_error(y, model.predict(X_permuted)))
        
        importance = np.mean(permuted_mse) - base_mse
        importances.append(importance)
    
    return importances

# Lists for importance values
HA1_importances=[]
titers_importances=[]

for n in range(100):
    # Train a model 
    # Randomly split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
    # Fit the model
    TT_model = LinearRegression()
    TT_model.fit(X_train, y_train)
    
    importances = permutation_importance(TT_model, X, y)

    HA1_importances.append(importances[0])
    titers_importances.append(importances[1])

print(f"Average HA1 mutations importance: {np.mean(HA1_importances)}")
print(f"Average fraction titers below cutoff importance: {np.mean(titers_importances)}")

# Report relative importance
total_importance = np.mean(HA1_importances) + np.mean(titers_importances)
HA1_relative_importance = np.mean(HA1_importances) / total_importance * 100
titers_relative_importance = np.mean(titers_importances) / total_importance * 100

print('Percentage contribution of HA1 mutations ', HA1_relative_importance) 
print('Percentage contribution of titers ', titers_relative_importance)  

Average HA1 mutations importance: 0.003261011522286963
Average fraction titers below cutoff importance: 0.006376129068762077
Percentage contribution of HA1 mutations  33.83795734302958
Percentage contribution of titers  66.16204265697043


So by this metric, titers are the primary driver of predictions (>50%) but HA1 mutations are moderately important (20-50%).

**What if the data isn't scaled?** The relative importances shouldnt change, since the MSE isn't dependent on variable magnitude. But here I'll quickly test that...

In [43]:
# Recalculate using unscaled X
X = growth_vs_titers[['HA1_protein_mutations', 'frac_below_titer']].values

# Lists for importance values
HA1_importances=[]
titers_importances=[]

for n in range(100):
    # Train a model 
    # Randomly split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
    # Fit the model
    TT_model = LinearRegression()
    TT_model.fit(X_train, y_train)
    
    importances = permutation_importance(TT_model, X, y)

    HA1_importances.append(importances[0])
    titers_importances.append(importances[1])

print(f"Average HA1 mutations importance: {np.mean(HA1_importances)}")
print(f"Average fraction titers below cutoff importance: {np.mean(titers_importances)}")

# Report relative importance
total_importance = np.mean(HA1_importances) + np.mean(titers_importances)
HA1_relative_importance = np.mean(HA1_importances) / total_importance * 100
titers_relative_importance = np.mean(titers_importances) / total_importance * 100

print('Percentage contribution of HA1 mutations ', HA1_relative_importance) 
print('Percentage contribution of titers ', titers_relative_importance) 

Average HA1 mutations importance: 0.00312471208145247
Average fraction titers below cutoff importance: 0.006604026769537033
Percentage contribution of HA1 mutations  32.11836733735183
Percentage contribution of titers  67.88163266264817


These contributions of HA1 mutations and titers are not meaningfully different without scaling, as expected.

## Quantifying the amount of variance explained by each predictor
By re-fitting model to datasets without different predictor variables, we can estimate how much of the variance is explained by different predictors. More important "driving" predictors will explain greater proportion of variance. Here I repurposed a lot of code from Jesse (https://dms-vep.org/SARS-CoV-2_XBB.1.5_spike_DMS/notebooks/current_dms_compare_natural_ba2_ba5_xbb.html)


In [44]:
phenotype_basic_colors = {
    "HA1_protein_mutations": "red",
    "frac_below_titer": "blue",

}

phenotypes = list(phenotype_basic_colors)

In [51]:
# Initialize the scaler
# scaler = MinMaxScaler()
scaler = StandardScaler() # For smaller datasets, z-scoring is preferable over min-max scaling

# Standardize X values 
phenotypes_scaled = pd.DataFrame(scaler.fit_transform(growth_vs_titers[['HA1_protein_mutations', 'frac_below_titer']]), columns=['HA1_protein_mutations', 'frac_below_titer'])
phenotypes_scaled['growth_advantage_median'] = growth_vs_titers.growth_advantage_median.tolist()

# Standardize data
ols_df = phenotypes_scaled

In [52]:
# Number of points
n = len(ols_df)

# Number of randomizations
n_rand = 200

# Randomize data
randomized_dfs = []
for i in range(n_rand):
    randomized_df = ols_df.apply(np.random.permutation).copy()  # Shuffle each column independently
    randomized_df['randomization'] = i  # Add identifier column
    randomized_dfs.append(randomized_df)

# Concatenate all randomized dataframes
ols_df_rand = pd.concat(randomized_dfs, ignore_index=True)

In [53]:
def ols_unique_var_explained(var_endog, vars, df, full_r2):
    """Get unique variance explained by fitting model after removing each variable.

    https://blog.minitab.com/en/adventures-in-statistics-2/how-to-identify-the-most-important-predictor-variables-in-regression-models
    
    """
    unique_var = {}
    for vremove in vars:
        vremove_ols_model = statsmodels.api.OLS(
            endog=df[[var_endog]],
            exog=statsmodels.api.add_constant(df[[v for v in vars if v != vremove]].astype(float)),
        )
        vremove_res_ols = vremove_ols_model.fit()
        unique_var[vremove] = full_r2 - vremove_res_ols.rsquared
    return unique_var



# https://www.einblick.ai/python-code-examples/ordinary-least-squares-regression-statsmodels/
ols_model = statsmodels.api.OLS(
    endog=ols_df[["growth_advantage_median"]],
    exog=statsmodels.api.add_constant(ols_df[phenotypes].astype(float)),
)
res_ols = ols_model.fit()
ols_df = ols_df.assign(predicted_change_in_growth_rate=res_ols.predict())
r2 = res_ols.rsquared
r = math.sqrt(r2)
unique_var = ols_unique_var_explained("growth_advantage_median", phenotypes, ols_df, r2)

subtitle = [
    # https://stackoverflow.com/a/53966201
    f"{p}: {unique_var[p] * 100:.0f}% of variance (coef {res_ols.params[p]:.4f} \u00B1 {res_ols.bse[p]:.8f})"
    for p in phenotypes
]

# Number of points
n = len(ols_df)

# Randomized fits
rand_rs = []
for _, ols_df_rand_i in ols_df_rand.groupby("randomization"):
    ols_model_rand_i = statsmodels.api.OLS(
        endog=ols_df_rand_i[["growth_advantage_median"]],
        exog=statsmodels.api.add_constant(ols_df_rand_i[phenotypes].astype(float)),
    )
    res_ols_rand_i = ols_model_rand_i.fit()
    rand_rs.append(math.sqrt(res_ols_rand_i.rsquared))
n_ge = sum(rand_r >= r for rand_r in rand_rs)
if n_ge:
    p_str = f"P = {n_ge / len(rand_rs)}"
else:
    p_str = f"P < {1 / len(rand_rs)}" 
# Print p-values
print(f"For randomized DMS data, {p_str}: {n_ge} of {len(rand_rs)} have r >= observed value of {r:.3f}")

# Print output from model fit and variance contributions
print('OLS regression r2: ', r2)
print('Variance explained by each predictor:')
for item in subtitle: print(item)

For randomized DMS data, P < 0.005: 0 of 200 have r >= observed value of 0.954
OLS regression r2:  0.9110570127505621
Variance explained by each predictor:
HA1_protein_mutations: 7% of variance (coef 0.0352 ± 0.01358306)
frac_below_titer: 12% of variance (coef 0.0470 ± 0.01358306)


So frac_below_titer explains more variance, which is consistent with the MSE-drop permutation test I did above. The coefficients are low (because data is standardized and estimated growth rates are 0.9-1.1), and actually basically the same as from the `sklearn` model. 

Below, we rerun the analysis on unscaled data to show that variance explained by each variable doesn't change whether or not the data is scaled.

In [56]:
# Unscaled data
unscaled_ols_df = growth_vs_titers[['HA1_protein_mutations', 'frac_below_titer', 'growth_advantage_median']]

# https://www.einblick.ai/python-code-examples/ordinary-least-squares-regression-statsmodels/
ols_model = statsmodels.api.OLS(
    endog=ols_df[["growth_advantage_median"]],
    exog=statsmodels.api.add_constant(unscaled_ols_df[phenotypes].astype(float)),
)
res_ols = ols_model.fit()
unscaled_ols_df = unscaled_ols_df.assign(predicted_change_in_growth_rate=res_ols.predict())
r2 = res_ols.rsquared
r = math.sqrt(r2)
unique_var = ols_unique_var_explained("growth_advantage_median", phenotypes, unscaled_ols_df, r2)

subtitle = [
    # https://stackoverflow.com/a/53966201
    f"{p}: {unique_var[p] * 100:.0f}% of variance (coef {res_ols.params[p]:.4f} \u00B1 {res_ols.bse[p]:.8f})"
    for p in phenotypes
]

# Number of points
n = len(unscaled_ols_df)

# Randomized fits
rand_rs = []
for _, ols_df_rand_i in ols_df_rand.groupby("randomization"):
    ols_model_rand_i = statsmodels.api.OLS(
        endog=ols_df_rand_i[["growth_advantage_median"]],
        exog=statsmodels.api.add_constant(ols_df_rand_i[phenotypes].astype(float)),
    )
    res_ols_rand_i = ols_model_rand_i.fit()
    rand_rs.append(math.sqrt(res_ols_rand_i.rsquared))
n_ge = sum(rand_r >= r for rand_r in rand_rs)
if n_ge:
    p_str = f"P = {n_ge / len(rand_rs)}"
else:
    p_str = f"P < {1 / len(rand_rs)}" 
# Print p-values
print(f"For randomized DMS data, {p_str}: {n_ge} of {len(rand_rs)} have r >= observed value of {r:.3f}")

# Print output from model fit and variance contributions
print('OLS regression r2: ', r2)
print('Variance explained by each predictor:')
for item in subtitle: print(item)

For randomized DMS data, P < 0.005: 0 of 200 have r >= observed value of 0.954
OLS regression r2:  0.9110570127505621
Variance explained by each predictor:
HA1_protein_mutations: 7% of variance (coef 0.0166 ± 0.00638834)
frac_below_titer: 12% of variance (coef 0.7178 ± 0.20725169)
