In [1]:
# Basic
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load

# GLM
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import family
from statsmodels.stats.multitest import multipletests
 
# Modelling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, StackingRegressor, StackingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from mlxtend.regressor import StackingCVRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Mertrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, precision_score, roc_curve, recall_score, precision_recall_curve, precision_recall_fscore_support, roc_auc_score, ConfusionMatrixDisplay, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
# import graphviz
%matplotlib inline

# glance wd
os.getcwd()

'/home/jhou2/HSV434/LandscapeProject/HSV434-IFNG-mechanism/Code'

In [2]:
os.chdir('/home/jhou2/HSV434/LandscapeProject/HSV434-IFNG-mechanism')

## Under-sampling (Reduce Majority Class, IFNG-)

In [3]:
exp_meta_df = load('Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix')

In [4]:
# create a balanced sampling: each cell type has equal IFNG+ and IFNG- cells for following steps, 
exp_matrix_UnderSampling = [] # Initialize an empty list to store sampled DataFrames

for cluster in ["CD4 CM", "CD4 EM 1", "CD4 EM 2", "CD4 EM 3", "CD4 TRM", 
                "CD4 ISG", "CD4 Act", "CD4 Prolif", 
                "CD8 CM", "CD8 EM 1", "CD8 EM 2", "CD8 TRM 1", "CD8 TRM 2", "CD8 ISG"]:
    sample_size = exp_meta_df[(exp_meta_df['CellType_Level3'] == cluster) & (exp_meta_df['IFNG_bin'] == 1)]['IFNG_bin'].value_counts().min()
    print(f"{cluster}, get {sample_size} cells")
    sub_temp_positive = exp_meta_df[(exp_meta_df['CellType_Level3'] == cluster) & (exp_meta_df['IFNG_bin'] == 1)].sample(n=sample_size, random_state=42)
    sub_temp_negative = exp_meta_df[(exp_meta_df['CellType_Level3'] == cluster) & (exp_meta_df['IFNG_bin'] == 0)].sample(n=sample_size, random_state=42)
    exp_matrix_UnderSampling.append(pd.concat([sub_temp_positive, sub_temp_negative], ignore_index=True))

CD4 CM, get 73 cells
CD4 EM 1, get 330 cells
CD4 EM 2, get 63 cells
CD4 EM 3, get 65 cells
CD4 TRM, get 296 cells
CD4 ISG, get 106 cells
CD4 Act, get 47 cells
CD4 Prolif, get 161 cells
CD8 CM, get 114 cells
CD8 EM 1, get 728 cells
CD8 EM 2, get 190 cells
CD8 TRM 1, get 261 cells
CD8 TRM 2, get 224 cells
CD8 ISG, get 110 cells


In [5]:
# Concatenate all sampled DataFrames into one DataFrame
exp_matrix_UnderSampling = pd.concat(exp_matrix_UnderSampling, ignore_index=True)
pd.crosstab(exp_matrix_UnderSampling['IFNG_bin'], exp_matrix_UnderSampling['CellType_Level3'])

CellType_Level3,CD4 Act,CD4 CM,CD4 EM 1,CD4 EM 2,CD4 EM 3,CD4 ISG,CD4 Prolif,CD4 TRM,CD8 CM,CD8 EM 1,CD8 EM 2,CD8 ISG,CD8 TRM 1,CD8 TRM 2
IFNG_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,47,73,330,63,65,106,161,296,114,728,190,110,261,224
1,47,73,330,63,65,106,161,296,114,728,190,110,261,224


In [6]:
# some columns are totally 0 as selected cells may not express all genes
exp_matrix_UnderSampling.columns[(exp_matrix_UnderSampling == 0).all()].tolist()
print("how many columns total empty:",len(exp_matrix_UnderSampling.columns[(exp_matrix_UnderSampling == 0).all()]))

how many columns total empty: 2919


In [7]:
# Identify columns that are not entirely 0
columns_not_entirely_zero = exp_matrix_UnderSampling.columns[(exp_matrix_UnderSampling != 0).any()]
# Select only the columns that are not entirely 0
exp_matrix_UnderSampling = exp_matrix_UnderSampling[columns_not_entirely_zero]

In [8]:
exp_matrix_UnderSampling['IFNG_bin'].value_counts()
# check how many cells per cluster and IFNG expression binary
# pd.crosstab(exp_matrix_select['IFNG_bin'], exp_matrix_select['CellType_Fine'])

IFNG_bin
1    2768
0    2768
Name: count, dtype: int64

In [9]:
# Save up
dump(exp_matrix_UnderSampling, 'Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix_UnderSampling')

['Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix_UnderSampling']

In [10]:
exp_matrix_UnderSampling.head(3)

Unnamed: 0,AL627309_1,AL669831_5,FAM87B,LINC00115,FAM41C,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,...,AC233755_2,AC233755_1,AC240274_1,HSV2_UL18,HSV2_UL26,HSV2_UL47,Subject,Status,CellType_Level3,IFNG_bin
0,0.0,0.0,0.0,0.0,0.0,0.0,1.727663,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Subject1,Prior,CD4 CM,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Subject5,Lesion,CD4 CM,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Subject2,Post,CD4 CM,1


### Feature selection: 
### GLM model within QuasiPoisson Distribution to model most significant genes associated with IFN_bin

In [11]:
# load data
exp_matrix_UnderSampling = load('Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix_UnderSampling')

In [12]:
# scaling prior glm modeling
exp_matrix_featureSel = exp_matrix_UnderSampling.copy()
#scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()
columns_to_scale = exp_matrix_featureSel.columns.difference(['Subject', 'Status', 'CellType_Level3', 'IFNG_bin'])
exp_matrix_featureSel[columns_to_scale] = scaler.fit_transform(exp_matrix_featureSel[columns_to_scale])
exp_matrix_featureSel.head(2)

Unnamed: 0,AL627309_1,AL669831_5,FAM87B,LINC00115,FAM41C,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,...,AC233755_2,AC233755_1,AC240274_1,HSV2_UL18,HSV2_UL26,HSV2_UL47,Subject,Status,CellType_Level3,IFNG_bin
0,-0.027434,-0.168653,-0.013441,-0.136639,-0.050214,-0.023083,2.676805,-0.146396,-0.159186,-0.013441,...,-0.022969,-0.029765,-0.068425,-0.013441,-0.013441,-0.013441,Subject1,Prior,CD4 CM,1
1,-0.027434,-0.168653,-0.013441,-0.136639,-0.050214,-0.023083,-0.419984,-0.146396,-0.159186,-0.013441,...,-0.022969,-0.029765,-0.068425,-0.013441,-0.013441,-0.013441,Subject5,Lesion,CD4 CM,1


In [13]:
# Use GLM model within QuasiPoisson Distribution to model most significant genes associated with IFN_bin
results = []

gene_columns = [col for col in exp_matrix_featureSel.columns if (col not in ['Subject', 'Status', 'CellType_Level3'])]

# Iterate over each gene to fit a quasi-Poisson model or try sm.families.Binomial
for gene in gene_columns:
    formula = f'IFNG_bin ~ {gene}' # + CellType_Fine + Status
    # model = smf.glm(formula=formula, data=exp_matrix_sampling, family=sm.families.Poisson(link = sm.families.links.Log())).fit()
    model = smf.glm(formula = formula, data = exp_matrix_featureSel, family = sm.families.Binomial()).fit()
    summary = model.summary()
    p_value = model.pvalues[1]  # Get the p-value for 'IFNGbin'
    results.append((gene, p_value))

# Convert results to a DataFrame
results_df = pd.DataFrame(results, columns=['Gene', 'P_value'])

# Adjusting p-values for multiple testing, for example using Bonferroni correction
rejected, corrected_p_values, _, _ = multipletests(results_df['P_value'], method='bonferroni')

# Adding corrected p-values to the results DataFrame
results_df['Corrected_P_value'] = corrected_p_values

# Identifying significant genes
significant_genes = results_df[results_df['Corrected_P_value'] < 0.1]
significant_genes



Unnamed: 0,Gene,P_value,Corrected_P_value
176,MAD2L2,2.094232e-06,4.614640e-02
340,RSRP1,2.170102e-06,4.781820e-02
800,JUN,8.423955e-24,1.856218e-19
845,IL12RB2,5.191066e-10,1.143851e-05
1031,S1PR1,2.961729e-06,6.526169e-02
...,...,...,...
20739,FOSB,2.311671e-23,5.093768e-19
20847,PPP1R15A,3.548654e-18,7.819460e-14
20945,NKG7,7.839605e-07,1.727457e-02
21078,UBE2S,1.815557e-13,4.000579e-09


In [14]:
# save significant_genes
significant_genes.to_csv('Results/UnderSampling_significant_genes.csv')
dump(significant_genes, 'Processed/HSV434_Tcell_IFNG_mechanism_ML_UnderSampling_significant_genes')

['Processed/HSV434_Tcell_IFNG_mechanism_ML_UnderSampling_significant_genes']