In [9]:
## Load in all necessary packages
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from scipy.stats import kurtosis

from sklearn.base import clone
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, make_scorer, mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides any warnings emitted by the model fits further down - confirm that is intended.
warnings.filterwarnings(action='ignore')

# Font settings reused by the plots
font = dict(fontname='Arial')

In [13]:
# Load the individually VST-normalised expression tables and the label files.
# NOTE(review): the previous comment mentioned "raw" data for an optional MAD filter,
# but no raw table is read in this cell.
def _read_tsv_transposed(path):
    """Read a tab-separated file with pandas and return the transposed DataFrame."""
    return pd.read_csv(path, sep="\t").T


# Expression tables (kept as DataFrames)
train_data = _read_tsv_transposed("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/human_train_vst_normalised_data.txt")
test_data = _read_tsv_transposed("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/human_test_vst_normalised_data.txt")
val_data = _read_tsv_transposed("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/human_val_vst_normalised_data.txt")

# Label tables (converted to NumPy arrays after transposing)
train_labels = _read_tsv_transposed("/home/workspace/jogrady/ML4TB/data/human/train_labels.txt").to_numpy()
test_labels = _read_tsv_transposed("/home/workspace/jogrady/ML4TB/data/human/test_labels.txt").to_numpy()
val_labels = _read_tsv_transposed("/home/workspace/jogrady/ML4TB/data/human/val_labels.txt").to_numpy()

In [14]:
# Read the normalised tables used for the pairwise comparisons.
# Every file is tab-separated and is transposed right after reading.
# NOTE(review): file names follow human_<A>_vst_normalised_human_<B>.txt; presumably cohort A
# normalised together with cohort B - confirm against the normalisation step.
_TSV_OPTIONS = {"sep": "\t"}

train_data_test_normalised = pd.read_csv(
    "/home/workspace/jogrady/ML4TB/work/normalisation/vst_combined/human_train_vst_normalised_human_test.txt",
    **_TSV_OPTIONS,
).transpose()
train_data_val_normalised = pd.read_csv(
    "/home/workspace/jogrady/ML4TB/work/normalisation/vst_combined/human_train_vst_normalised_human_val.txt",
    **_TSV_OPTIONS,
).transpose()
test_data_train_normalised = pd.read_csv(
    "/home/workspace/jogrady/ML4TB/work/normalisation/vst_combined/human_test_vst_normalised_human_train.txt",
    **_TSV_OPTIONS,
).transpose()
test_data_val_normalised = pd.read_csv(
    "/home/workspace/jogrady/ML4TB/work/normalisation/vst_combined/human_test_vst_normalised_human_val.txt",
    **_TSV_OPTIONS,
).transpose()
val_data_train_normalised = pd.read_csv(
    "/home/workspace/jogrady/ML4TB/work/normalisation/vst_combined/human_val_vst_normalised_human_train.txt",
    **_TSV_OPTIONS,
).transpose()
val_data_test_normalised = pd.read_csv(
    "/home/workspace/jogrady/ML4TB/work/normalisation/vst_combined/human_val_vst_normalised_human_test.txt",
    **_TSV_OPTIONS,
).transpose()

In [15]:
def _first_occurrence_columns(frame):
    """Return a copy of ``frame`` that keeps only the first column for each repeated column label."""
    is_repeat = frame.columns.duplicated()
    return frame.loc[:, ~is_repeat].copy()


# Individually normalised tables
train_data = _first_occurrence_columns(train_data)
test_data = _first_occurrence_columns(test_data)
val_data = _first_occurrence_columns(val_data)

# Tables for the pairwise comparison
train_data_test_normalised = _first_occurrence_columns(train_data_test_normalised)
train_data_val_normalised = _first_occurrence_columns(train_data_val_normalised)
test_data_train_normalised = _first_occurrence_columns(test_data_train_normalised)
test_data_val_normalised = _first_occurrence_columns(test_data_val_normalised)
val_data_train_normalised = _first_occurrence_columns(val_data_train_normalised)
val_data_test_normalised = _first_occurrence_columns(val_data_test_normalised)

# 1. Training Set
## Gene based

We need to train the models on the training set and evaluate them internally via k-fold cross-validation. Then we need to do the same for the test and validation sets, and finally cross-compare the sets against each other.

In [16]:
# Cast the training labels to integers; astype returns a new array that replaces the old one.
train_labels = train_labels.astype(dtype=int)

In [17]:
# Calculate the per-column (gene) variance of the VST-normalised training data
variances = train_data.var(axis=0)

# Keep the genes whose variance is strictly above the 80th percentile (the top 20 %)
threshold = variances.quantile(0.8)
genes = variances.loc[variances > threshold].index

# NOTE: this overwrites train_data with the filtered frame, so re-running the cell
# would compute a new threshold on the already-filtered genes and shrink it again.
train_data = train_data.filter(items=genes, axis=1)

# NOTE: the same gene subset is not applied to test_data / val_data in this cell.

In [20]:
# Report the shape of the three expression tables on one line
print(*(frame.shape for frame in (train_data, test_data, val_data)))

(130, 11092) (61, 55460) (60, 55460)


In [23]:
# 5-fold splitter with shuffling; the fixed random_state keeps the fold assignment reproducible
KF = KFold(n_splits=5, random_state=42, shuffle=True)

In [24]:
# Logistic-regression pipeline: scale the features, then fit the classifier.
# The penalty type and l1_ratio are supplied by the parameter grid below.
log_pipe = Pipeline(
    steps=[
        # scaling step (the original comment points to a markdown note for the rationale)
        ('scaler', StandardScaler()),
        # classifier step
        ('classifier', LogisticRegression(solver='saga', max_iter=10000, tol=0.0001, random_state=42)),
    ]
)

# Scorers for the metrics tracked during the search.
# zero_division was set explicitly because the default was giving problems; with 1, precision
# is reported as 1 whenever its denominator is zero.
accuracy_scorer = make_scorer(accuracy_score)
f1_scorer = make_scorer(f1_score)
precision_scorer = make_scorer(precision_score, zero_division=1)
recall_scorer = make_scorer(recall_score)

# Scoring dictionary handed to GridSearchCV
scoring = dict(
    accuracy=accuracy_scorer,
    f1=f1_scorer,
    precision=precision_scorer,
    recall=recall_scorer,
)

# Parameter grid - every combination is searched (elastic net with l1_ratio 0.1 ... 0.9)
param_grid = {
    'classifier__penalty': ["elasticnet"],
    'classifier__l1_ratio': [step / 10 for step in range(1, 10)],
}

# GridSearchCV object; the final model is refit using the accuracy metric.
# NOTE(review): n_jobs=60 is a hard-coded worker count - adjust to the machine running the notebook.
grid_search = GridSearchCV(
    estimator=log_pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="accuracy",
    cv=KF,
    n_jobs=60,
    verbose=1,
)

In [25]:
# Run the grid search on the training table; the label array is flattened to 1-D first
grid_search.fit(X=train_data, y=train_labels.ravel())

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [None]:
# Post-processing: count the non-zero coefficients for every grid candidate.
# Unfortunately, GridSearchCV does not return coefficients for all models, so each
# parameter combination is fitted again here on the entire training set.
non_zero_counts = []
for params, mean_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_accuracy']):
    # Fit an unfitted copy of the pipeline: set_params() returns the object it was called on,
    # so calling it on log_pipe directly would mutate (and fit) the shared pipeline.
    model = clone(log_pipe).set_params(**params)
    model.fit(train_data, train_labels.ravel())  # re-fit on the entire dataset
    non_zero_count = np.sum(model.named_steps['classifier'].coef_ != 0)
    # one record per candidate: (parameters, mean CV accuracy, number of non-zero coefficients)
    non_zero_counts.append((params, mean_score, non_zero_count))

# Columns are left unnamed (0, 1, 2) in the order of the tuple above
non_zero_counts_df = pd.DataFrame(non_zero_counts)

non_zero_counts_df