In [1]:
# Basic
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump, load

# GLM
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import family
from statsmodels.stats.multitest import multipletests
 
# Modelling
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, StackingRegressor, StackingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from mlxtend.regressor import StackingCVRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Metrics and model selection
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, precision_score, roc_curve, recall_score, precision_recall_curve, precision_recall_fscore_support, roc_auc_score, ConfusionMatrixDisplay, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
# import graphviz
%matplotlib inline

# Show the kernel's current working directory
current_dir = os.getcwd()
current_dir

'/home/jhou2/HSV434/LandscapeProject/HSV434-IFNG-mechanism/Code'

In [2]:
# Switch to the project root so the relative 'Processed/...' paths used in later cells resolve.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
project_root = '/home/jhou2/HSV434/LandscapeProject/HSV434-IFNG-mechanism'
os.chdir(project_root)

In [3]:
# Prior to this, refer to the "Data Processing for ML in Python.R" script in the local folder "GitHub/HSV434-IFNG-mechanism"
# to see how the data were selected and transferred. The 2 CSV files were uploaded to the cloud and are processed in the following cells.

## Data loading and processing

In [5]:
# Read the expression matrix CSV, using its first column as the row index.
# As loaded, rows are genes and columns are cell barcodes (see the preview).
exp_matrix_path = 'Processed/HSV434_Tcell_IFNG_mechanism_exp_data.csv'
exp_matrix = pd.read_csv(exp_matrix_path, index_col=0)
exp_matrix.head(n=2)

Unnamed: 0,Subject1_8WPH_AACTTTCCACTTAAGC.1,Subject1_8WPH_AACTTTCTCAGCGATT.1,Subject1_8WPH_ACAGCCGCATATACGC.1,Subject1_8WPH_ACATCAGAGACCCACC.1,Subject1_8WPH_ACATCAGTCAGGCGAA.1,Subject1_8WPH_ACATCAGTCTGGCGAC.1,Subject1_8WPH_ACATGGTAGCCTCGTG.1,Subject1_8WPH_ACCAGTAAGAGCTATA.1,Subject1_8WPH_ACCAGTATCACATGCA.1,Subject1_8WPH_ACGATGTGTGGACGAT.1,...,Subject18_Entry_TGCACCTGTCTAGTGT.1,Subject18_Entry_TGCCCTAGTTACGCGC.1,Subject18_Entry_TGGCTGGGTCTCTCTG.1,Subject18_Entry_TGGGCGTTCCTGCCAT.1,Subject18_Entry_TGGTTCCCACAAGCCC.1,Subject18_Entry_TGGTTCCCAGCGAACA.1,Subject18_Entry_TTCTCAAAGAACAACT.1,Subject18_Entry_TTGTAGGTCTGCAAGT.1,Subject18_Entry_TTTGGTTTCCAGGGCT.1,Subject18_Entry_TTTGTCATCCCAAGAT.1
MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Flip the matrix so that each row is a cell ID and each column is a gene
exp_matrix_T = exp_matrix.T
exp_matrix_T.head(n=2)

Unnamed: 0,MIR1302-2HG,AL627309.1,AL627309.3,AL669831.5,FAM87B,LINC00115,FAM41C,AL645608.7,SAMD11,NOC2L,...,AC240274.1,AC213203.1,HSV1-UL36,HSV2-UL18,HSV2-UL23,HSV2-UL26,HSV2-UL47,HSV2-UL49,HSV2-UL50,HSV2-US9
Subject1_8WPH_AACTTTCCACTTAAGC.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Subject1_8WPH_AACTTTCTCAGCGATT.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Restore the trailing "-1" suffix on each cell ID so the index matches the metadata index for the later merge.
# The pattern is an escaped, end-anchored regex on purpose: an unanchored '.1' is interpreted as a regular
# expression by pandas < 2.0 (where '.' matches any character, so e.g. the "t1" in "Subject1" would be rewritten),
# and even as a literal it would replace a ".1" occurring anywhere in the ID rather than only the suffix.
exp_matrix_T.index = exp_matrix_T.index.str.replace(r'\.1$', '-1', regex=True)

In [8]:
# Read the per-cell metadata CSV, using its first column (the cell ID) as the row index
meta_path = 'Processed/HSV434_Tcell_IFNG_mechanism_meta_data.csv'
meta_df = pd.read_csv(meta_path, index_col=0)
meta_df.head(n=2)

Unnamed: 0,orig.ident,Batch,Subject,Status,CellType_Level3
Subject1_8WPH_AACTTTCCACTTAAGC-1,Subject1_8WPH,Batch7,Subject1,Post,CD4 EM 2
Subject1_8WPH_AACTTTCTCAGCGATT-1,Subject1_8WPH,Batch7,Subject1,Post,CD4 EM 3


In [9]:
# Merge expression data and metadata on the cell-ID index.
# merge() defaults to an inner join, so only cell IDs present in both tables are kept.
exp_meta_df = exp_matrix_T.merge(meta_df, left_index=True, right_index=True)
# Only the last expression of a cell is displayed, so print the shape explicitly;
# the row count shows how many cells survived the join.
print(exp_meta_df.shape)
exp_meta_df.columns

Index(['MIR1302-2HG', 'AL627309.1', 'AL627309.3', 'AL669831.5', 'FAM87B',
       'LINC00115', 'FAM41C', 'AL645608.7', 'SAMD11', 'NOC2L',
       ...
       'HSV2-UL26', 'HSV2-UL47', 'HSV2-UL49', 'HSV2-UL50', 'HSV2-US9',
       'orig.ident', 'Batch', 'Subject', 'Status', 'CellType_Level3'],
      dtype='object', length=24958)

In [10]:
# Sanitise the column names: every '.' and every '-' becomes '_'
sanitised_columns = exp_meta_df.columns.str.replace('[.-]', '_', regex=True)
exp_meta_df.columns = sanitised_columns
exp_meta_df.columns

Index(['MIR1302_2HG', 'AL627309_1', 'AL627309_3', 'AL669831_5', 'FAM87B',
       'LINC00115', 'FAM41C', 'AL645608_7', 'SAMD11', 'NOC2L',
       ...
       'HSV2_UL26', 'HSV2_UL47', 'HSV2_UL49', 'HSV2_UL50', 'HSV2_US9',
       'orig_ident', 'Batch', 'Subject', 'Status', 'CellType_Level3'],
      dtype='object', length=24958)

In [11]:
# Remove the metadata columns that are not needed
unused_columns = ['orig_ident', 'Batch']
exp_meta_df = exp_meta_df.drop(columns=unused_columns)

In [12]:
# Binary IFNG label: 1 where the cell's IFNG value is above zero, otherwise 0.
# A vectorised comparison replaces the per-element Python lambda; missing values
# compare as False and therefore still map to 0, exactly as before.
exp_meta_df['IFNG_bin'] = exp_meta_df['IFNG'].gt(0).astype('int64')

In [14]:
# Persist the merged table to disk with joblib
output_path = 'Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix'
dump(exp_meta_df, output_path)

['Processed/HSV434_Tcell_IFNG_mechanism_exp_matrix']