In [15]:
# Basic
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load
import pickle

# GLM
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import family
from statsmodels.stats.multitest import multipletests
 
# Modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, StackingRegressor, StackingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from mlxtend.regressor import StackingCVRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import cross_val_score

# Mertrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, precision_score, roc_curve, recall_score, precision_recall_curve, precision_recall_fscore_support, roc_auc_score, ConfusionMatrixDisplay, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from scipy.stats import randint

# oversampling method by SMOTE 
# !pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

# import graphviz
%matplotlib inline

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# glance wd
os.getcwd()

'/home/jhou2/HSV434/LandscapeProject/HSV434-IFNG-mechanism'

In [16]:
os.chdir('/home/jhou2/HSV434/LandscapeProject/HSV434-IFNG-mechanism')

### Without any resampling, Just as original data

In [17]:
exp_meta_df = load('Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix')

In [18]:
exp_meta_df['IFNG_bin'].value_counts()

IFNG_bin
0    16375
1     2768
Name: count, dtype: int64

### Feature selection:

In [19]:
# scaling prior glm modeling
exp_matrix_featureSel = exp_meta_df.copy()
#scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()
columns_to_scale = exp_matrix_featureSel.columns.difference(['Subject', 'Status', 'CellType_Level3', 'IFNG_bin'])
exp_matrix_featureSel[columns_to_scale] = scaler.fit_transform(exp_matrix_featureSel[columns_to_scale])
exp_matrix_featureSel.head(2)

Unnamed: 0,MIR1302_2HG,AL627309_1,AL627309_3,AL669831_5,FAM87B,LINC00115,FAM41C,AL645608_7,SAMD11,NOC2L,...,HSV2_UL23,HSV2_UL26,HSV2_UL47,HSV2_UL49,HSV2_UL50,HSV2_US9,Subject,Status,CellType_Level3,IFNG_bin
Subject1_8WPH_AACTTTCCACTTAAGC-1,-0.007228,-0.024929,-0.007228,-0.16344,-0.015375,-0.129093,-0.04986,-0.007228,-0.024386,-0.404232,...,-0.007228,-0.007228,-0.007228,-0.007228,-0.007228,-0.007228,Subject1,Post,CD4 EM 2,0
Subject1_8WPH_AACTTTCTCAGCGATT-1,-0.007228,-0.024929,-0.007228,-0.16344,-0.015375,-0.129093,-0.04986,-0.007228,-0.024386,-0.404232,...,-0.007228,-0.007228,-0.007228,-0.007228,-0.007228,-0.007228,Subject1,Post,CD4 EM 3,0


### (1) Apply GLM model to filter variables, end up with 650+ variables highly associated with IFNG expression

In [20]:
# Use GLM model within QuasiPoisson Distribution to model most significant genes associated with IFN_bin
results = []

gene_columns = [col for col in exp_matrix_featureSel.columns if (col not in ['Subject', 'Status', 'CellType_Level3'])]

# Iterate over each gene to fit a quasi-Poisson model or try sm.families.Binomial
for gene in gene_columns:
    formula = f'IFNG_bin ~ {gene}' # + CellType_Fine + Status
    # model = smf.glm(formula=formula, data=exp_matrix_sampling, family=sm.families.Poisson(link = sm.families.links.Log())).fit()
    model = smf.glm(formula = formula, data = exp_matrix_featureSel, family = sm.families.Binomial()).fit()
    summary = model.summary()
    p_value = model.pvalues[1]  # Get the p-value for 'IFNGbin'
    results.append((gene, p_value))

# Convert results to a DataFrame
results_df = pd.DataFrame(results, columns=['Gene', 'P_value'])

# Adjusting p-values for multiple testing, for example using Bonferroni correction
rejected, corrected_p_values, _, _ = multipletests(results_df['P_value'], method='bonferroni')

# Adding corrected p-values to the results DataFrame
results_df['Corrected_P_value'] = corrected_p_values



In [21]:
# Identifying significant genes
significant_genes = results_df[results_df['Corrected_P_value'] < 0.05]
significant_genes

Unnamed: 0,Gene,P_value,Corrected_P_value
117,ACOT7,1.550692e-10,3.869596e-06
121,TNFRSF25,1.722342e-08,4.297932e-04
137,TNFRSF9,7.024480e-20,1.752889e-15
199,FBXO6,1.822252e-12,4.547247e-08
200,MAD2L2,3.824419e-11,9.543456e-07
...,...,...,...
24686,MIR155HG,2.166036e-30,5.405125e-26
24692,APP,1.195740e-10,2.983850e-06
24716,TIAM1,9.096449e-07,2.269928e-02
24792,TTC3,1.339214e-07,3.341875e-03


### (2) Apply Lasso model by using L1 penalty to further narrow down the feature selection

In [22]:
# Prepare your data (excluding the target 'IFNG_bin' and any categorical columns like 'CellType_Level3')
X = exp_matrix_featureSel.drop(columns=['Subject', 'Status', 'IFNG_bin', 'CellType_Level3'])
y = exp_matrix_featureSel['IFNG_bin']

# Subset the data to only include selected genes (from p-value filtering or feature importance)
significant_gene_list = significant_genes['Gene'].tolist()  # Extract the list of genes
X = X[significant_gene_list]

In [23]:
# Step 1: Apply LogisticRegressionCV with L1 regularization and cross-validation to find the best C
lasso_cv = LogisticRegressionCV(penalty='l1', solver='saga', max_iter=10000, cv=5)
lasso_cv.fit(X, y)

# Step 2: Retrieve the best C parameter from cross-validation
best_C = lasso_cv.C_
print(f"Best C parameter: {best_C}")

Best C parameter: [0.00599484]


In [24]:
# Step 3: Retrain the model using the best C
lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, C=float(best_C))
lasso.fit(X, y)

  lasso = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, C=float(best_C))


In [25]:
# Step 4: Extract the coefficients and create a DataFrame of features with their coefficients
coefficients = lasso.coef_[0]
feature_importance = pd.DataFrame({
    'Gene': X.columns,
    'Coefficient': coefficients
})

# Step 5: Select genes with non-zero coefficients
selected_significant_genes = feature_importance[feature_importance['Coefficient'] != 0]

# Step 6: Sort the selected genes by the absolute value of their coefficients
selected_significant_genes = selected_significant_genes.sort_values(by='Coefficient', key=abs, ascending=False)

# Display the selected genes
print(selected_significant_genes)

       Gene  Coefficient
520    CCL5     0.224936
523    CCL4     0.173872
414    CD69     0.124025
203     TNF     0.103572
169    IL7R    -0.100408
..      ...          ...
41   SLAMF7     0.000582
26     CSF1     0.000553
183    EGR1     0.000478
118   EOMES     0.000214
241   TAGAP     0.000213

[101 rows x 2 columns]


In [26]:
# Save up
dump(selected_significant_genes, 'Processed/HSV434_Tcell_IFNG_mechanism_ML_Original_significant_genes')

['Processed/HSV434_Tcell_IFNG_mechanism_ML_Original_significant_genes']