# Applying logistic regression in python to discriminate between control and M. bovis infected animals using peripheral blood transcriptomics data from kirsten_pbl et al., 2020
### This analysis considers two approaches, one using logistic regression on variable genes that have been preprocessed using DESeq2 (vst normalised) and the other using latent variables inferred using PCA, ICA and NMF

In [3]:
## Load in all necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from scipy import stats
from scipy.stats import kurtosis

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,roc_curve,auc, make_scorer,mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, PredefinedSplit
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
import warnings
import pickle
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

#font for plots
font = {'fontname':'Arial'}

In [8]:
kirsten_pbl_data_raw = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/kirsten_pbl_vst_normalised_data.txt", sep = "\t").T
kirsten_pbl_data_raw.head()

Unnamed: 0,ENSBTAG00000006648,ENSBTAG00000049697,ENSBTAG00000047028,ENSBTAG00000053686,ENSBTAG00000054829,ENSBTAG00000046619,ENSBTAG00000001753,ENSBTAG00000046015,RIPK4,RCAN1,...,ENSBTAG00000054007,ENSBTAG00000051847,ENSBTAG00000049255,ENSBTAG00000049755,ENSBTAG00000054374,ENSBTAG00000049857,ENSBTAG00000049227,ENSBTAG00000050558,ENSBTAG00000052952,ENSBTAG00000048929
A016_CON,4.402651,4.402651,4.402651,4.402651,5.068936,4.402651,4.702737,4.402651,4.402651,8.654799,...,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651
A017_CON,4.402651,4.402651,4.402651,4.402651,4.718415,4.402651,4.402651,4.402651,4.402651,8.361204,...,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651
A018_CON,4.402651,4.402651,4.402651,4.402651,4.989332,4.402651,4.625704,4.402651,4.402651,7.788779,...,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651
A020_CON,4.402651,4.402651,4.402651,4.402651,4.692148,4.402651,4.402651,4.402651,4.402651,7.929821,...,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651
A022_CON,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,8.141159,...,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651,4.402651


In [14]:
kirsten_pbl_data_raw = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/kirsten_pbl_vst_normalised_data.txt", sep = "\t").T
kirsten_pbl_labels = pd.read_csv("/home/workspace/jogrady/ML4TB/data/kirsten_pbl/kirsten_pbl_samples.csv", sep = "\t")

kirsten_pbl_labels
np.array(kirsten_pbl_labels["Run_Code"]) == np.array(kirsten_pbl_data_raw.index)
kirsten_pbl_labels["Fold"] = kirsten_pbl_labels.groupby("Run_Code").ngroup()


kirsten_pbl_labels_simple = kirsten_pbl_labels["Status"].to_numpy()
kirsten_pbl_folds = kirsten_pbl_labels["Fold"].to_numpy()


kirsten_pbl_labels_simple = np.where(kirsten_pbl_labels_simple == "Control", 0, np.where(kirsten_pbl_labels_simple == "Infected", 1, kirsten_pbl_labels_simple))
kirsten_pbl_labels_simple = kirsten_pbl_labels_simple.astype(int)

In [16]:
kirsten_pbl_labels_simple

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [17]:
# Calcualte variances for VST normalised genes
variances = kirsten_pbl_data_raw.var(axis=0)
# take top 20% and filter
threshold = variances.quantile(.80) 
genes = variances > threshold
genes= genes.loc[genes==True].index
kirsten_pbl_data = kirsten_pbl_data_raw.filter(items = genes, axis=1)

In [18]:
kirsten_pbl_data.head()

Unnamed: 0,SMIM11,ENSBTAG00000054497,C2CD2,ZBTB21,ITSN1,ABCG1,TFF2,TMPRSS3,UBASH3A,RSPH1,...,ENSBTAG00000007238,ENSBTAG00000050585,ENSBTAG00000052194,ENSBTAG00000053934,SERPINB4,ENSBTAG00000050608,ENSBTAG00000052012,MAD2L1,ENSBTAG00000054081,ENSBTAG00000049569
A016_CON,7.303273,5.336907,5.87131,7.622638,9.276541,8.902467,4.402651,5.188291,9.188335,5.462578,...,4.402651,5.241103,5.537489,7.60897,5.422674,4.826278,4.402651,6.486422,6.4386,5.462578
A017_CON,7.289366,5.103228,6.290879,7.589244,9.346436,8.409673,4.947421,4.718415,9.706054,5.430159,...,4.402651,4.848327,5.827994,7.038607,6.140536,5.700765,5.103228,7.159211,6.388111,6.045418
A018_CON,7.709903,4.946339,6.090871,7.180881,8.79784,8.652466,8.010655,5.066603,9.923994,5.333688,...,4.847438,4.402651,5.282074,7.095979,5.763729,5.779466,4.717783,6.430543,6.030005,5.53364
A020_CON,7.532594,4.811381,6.131371,7.6737,8.338821,9.244143,7.035411,4.902414,9.919084,5.105976,...,4.402651,5.387923,4.402651,6.422834,8.742676,5.92004,5.965498,6.31095,6.294123,8.698735
A022_CON,7.700725,4.402651,6.025026,7.361728,8.789821,8.889933,5.105398,4.849724,9.814909,5.559028,...,4.402651,5.477132,5.387128,7.067954,6.212295,4.949122,4.402651,6.751722,5.973176,6.807558


In [19]:
# Convert the custom folds array to a PredefinedSplit object
ps = PredefinedSplit(test_fold=kirsten_pbl_folds)

In [20]:

# Make a pipeline for logistic regression and set the paramaters
log_pipe = Pipeline(steps=[
('scaler', StandardScaler()), # see comment above (in markdown)
('classifier', LogisticRegression(max_iter=10000, solver='saga', tol=0.0001, random_state=42))]) # classifier

precision_scorer = make_scorer(precision_score, zero_division=1)  # had to modify zero_division as it was giving problems
f1_scorer = make_scorer(f1_score)
accuracy_scorer = make_scorer(accuracy_score)
recall_scorer = make_scorer(recall_score)

# Define scoring dictionary for GridSearchCV
scoring = {
    'accuracy': accuracy_scorer,
    #'f1': f1_scorer,
    #'precision': precision_scorer,
    #'recall': recall_scorer
}

# Create a parameter grid - we will search through all these combinations
param_grid = {
    'classifier__penalty': ["elasticnet"],
    'classifier__l1_ratio': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
}
 
# Create GridSearchCV object
grid_search = GridSearchCV(log_pipe, param_grid, cv=ps, verbose=3, n_jobs=60, scoring=scoring, refit="accuracy")

In [21]:
kirsten_pbl_labels_simple

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [22]:
grid_search.fit(kirsten_pbl_data, kirsten_pbl_labels_simple)

Fitting 16 folds for each of 10 candidates, totalling 160 fits


[CV 13/16] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   5.3s
[CV 4/16] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   5.4s
[CV 11/16] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   5.4s
[CV 5/16] END classifier__l1_ratio=0.4, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   5.7s
[CV 14/16] END classifier__l1_ratio=0.3, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   5.8s
[CV 10/16] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   6.0s
[CV 8/16] END classifier__l1_ratio=0.3, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   5.9s
[CV 12/16] END classifier__l1_ratio=0.3, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   6.0s
[CV 16/16] END classifier__l1_ratio=0.3, classifier__penalty=elasticnet; accuracy: 

In [23]:
# Look at best paramaters and accuracy and save results to a data frame
print("Best Parameters:", grid_search.best_params_)
print(f"Best CV average accuracy: {grid_search.best_score_:.3f}")
results_genes = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),
           pd.DataFrame(grid_search.cv_results_["mean_test_accuracy"], columns = ["Average Accuracy"]),
           pd.DataFrame(grid_search.cv_results_["std_test_accuracy"], columns=["SD accuracy"])],axis=1)
results_genes.sort_values(by='Average Accuracy', inplace=True)

Best Parameters: {'classifier__l1_ratio': 0.1, 'classifier__penalty': 'elasticnet'}
Best CV average accuracy: 1.000


In [24]:
results_genes
results_models = pd.DataFrame(grid_search.cv_results_)
results_models.sort_values(by='rank_test_accuracy', inplace=True)
results_genes

Unnamed: 0,classifier__l1_ratio,classifier__penalty,Average Accuracy,SD accuracy
0,0.1,elasticnet,1.0,0.0
1,0.2,elasticnet,1.0,0.0
2,0.3,elasticnet,1.0,0.0
3,0.4,elasticnet,1.0,0.0
4,0.5,elasticnet,1.0,0.0
5,0.6,elasticnet,1.0,0.0
6,0.7,elasticnet,1.0,0.0
7,0.8,elasticnet,1.0,0.0
8,0.9,elasticnet,1.0,0.0
9,1.0,elasticnet,1.0,0.0


In [None]:
data_test = pd.DataFrame(zip(kirsten_pbl_data.columns, np.transpose(grid_search.best_estimator_.named_steps["classifier"].coef_)), columns=['features', 'coef'])#.sort_values(by='coef', inplace=True)
data_test.sort_values(by='coef', inplace = True)
data_test = data_test.loc[(data_test != 0).all(axis=1), :]
data_test

Unnamed: 0,features,coef
888,DIRAS3,[-0.05842370002683579]
3543,ENSBTAG00000048885,[-0.05254238789476099]
5484,ENSBTAG00000052325,[-0.05120259011795917]
2029,FGF1,[-0.05059383055309786]
2886,MS4A14,[-0.048294800587800446]
...,...,...
1891,HK3,[0.05488966822134522]
2293,ADCY3,[0.05818909888388803]
3109,ENSBTAG00000054718,[0.0636322316731776]
2819,MDK,[0.06398508405895612]
