# Applying logistic regression in python to discriminate between control and M. bovis infected animals using peripheral blood transcriptomics data from abdelaal et al., 2020
### This analysis considers two approaches, one using logistic regression on variable genes that have been preprocessed using DESeq2 (vst normalised) and the other using latent variables inferred using PCA, ICA and NMF

In [1]:
## Load in all necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from scipy import stats
from scipy.stats import kurtosis

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,roc_curve,auc, make_scorer,mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, PredefinedSplit
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
import warnings
import pickle
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

#font for plots
font = {'fontname':'Arial'}

In [2]:
abdelaal_data_raw = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/abdelaal_vst_normalised_data.txt", sep = "\t").T

In [46]:
abdelaal_data_raw = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/vst_individual/abdelaal_vst_normalised_data.txt", sep = "\t").T
abdelaal_labels = pd.read_csv("/home/workspace/jogrady/ML4TB/data/abdelaal/abdelaal_samples.csv", sep = "\t")
abdelaal_labels = abdelaal_labels[["Animal_Code", "Week", "Status"]].drop_duplicates()



np.array(abdelaal_labels["Animal_Code"]) == np.array(abdelaal_data_raw.index)
abdelaal_labels["ID"] = abdelaal_labels["Animal_Code"].str.extract(r"^(.*)_[^_]+$", expand=False)
abdelaal_labels["Fold"] = abdelaal_labels.groupby("ID").ngroup()
abdelaal_labels.loc[abdelaal_labels["Week"] == "W0", "Status"] = "Control" # Note the infected samples were sampled after infection (immediately) so may be worthwhile labelling them as Control (note this increased the CV accuracy from ~ 50 to 80)
# Check right order
np.array(abdelaal_labels["Animal_Code"]) == np.array(abdelaal_data_raw.index)

abdelaal_labels_simple = abdelaal_labels["Status"].to_numpy()
abdelaal_folds = abdelaal_labels["Fold"].to_numpy()


abdelaal_labels_simple = np.where(abdelaal_labels_simple == "Control", 0, np.where(abdelaal_labels_simple == "Infected", 1, abdelaal_labels_simple))
abdelaal_labels_simple = abdelaal_labels_simple.astype(int)

In [47]:
abdelaal_labels

Unnamed: 0,Animal_Code,Week,Status,ID,Fold
0,Infected_1_20,W20,Infected,Infected_1,0
2,Infected_1_8,W8,Infected,Infected_1,0
4,Infected_2_20,W20,Infected,Infected_2,1
6,Infected_2_8,W8,Infected,Infected_2,1
8,Infected_3_20,W20,Infected,Infected_3,2
10,Infected_3_8,W8,Infected,Infected_3,2
12,Infected_4_20,W20,Infected,Infected_4,3
14,Infected_4_8,W8,Infected,Infected_4,3
16,Infected_5_20,W20,Infected,Infected_5,4
18,Infected_5_8,W8,Infected,Infected_5,4


In [48]:
# Calcualte variances for VST normalised genes
variances = abdelaal_data_raw.var(axis=0)
# take top 20% and filter
threshold = variances.quantile(.80) 
genes = variances > threshold
genes= genes.loc[genes==True].index
abdelaal_data = abdelaal_data_raw.filter(items = genes, axis=1)

In [49]:
abdelaal_data.head()

Unnamed: 0,ENSBTAG00000054829,RCAN1,SMIM11,ITSN1,ABCG1,TFF2,TMPRSS3,RSPH1,IL1RAP,P3H2,...,ENSBTAG00000053997,ENSBTAG00000054086,ENSBTAG00000050585,ENSBTAG00000052194,SERPINB4,ENSBTAG00000050153,ENSBTAG00000050608,MAD2L1,ENSBTAG00000054081,ENSBTAG00000049569
Infected_1_20,5.020736,9.472897,8.265235,9.607765,10.199095,6.4232,5.446811,5.020736,8.26928,6.30895,...,4.543355,8.865097,4.933724,6.391624,4.543355,4.543355,6.238502,7.58131,6.291721,5.020736
Infected_1_8,4.824982,9.568001,8.278523,9.761322,10.372165,6.454131,5.738425,5.421722,8.782285,5.952382,...,4.543355,10.476969,4.941008,6.33842,4.941008,4.543355,6.500611,7.688112,7.109312,4.824982
Infected_2_20,4.543355,9.08321,8.664853,8.918923,10.963202,5.516665,5.000588,5.070595,8.26221,6.481582,...,4.543355,7.116487,6.936616,5.370276,5.000588,6.467869,5.48248,6.983877,6.337144,4.808073
Infected_2_8,5.016344,9.322084,8.500184,9.392623,10.808786,6.327843,5.209333,5.677687,9.055278,6.423233,...,4.543355,7.195843,6.634089,5.842797,5.355504,5.985045,5.707205,7.254159,6.110927,5.8171
Infected_3_20,4.924988,9.444165,9.05813,8.918001,11.077527,5.945029,5.501468,5.465131,8.492798,6.445798,...,4.543355,7.618384,7.150779,6.221806,5.081516,6.557695,4.924988,7.116096,5.720759,5.465131


In [50]:
# Convert the custom folds array to a PredefinedSplit object
ps = PredefinedSplit(test_fold=abdelaal_folds)

In [51]:

# Make a pipeline for logistic regression and set the paramaters
log_pipe = Pipeline(steps=[
('scaler', StandardScaler()), # see comment above (in markdown)
('classifier', LogisticRegression(max_iter=10000, solver='saga', tol=0.0001, random_state=42))]) # classifier

precision_scorer = make_scorer(precision_score, zero_division=1)  # had to modify zero_division as it was giving problems
f1_scorer = make_scorer(f1_score)
accuracy_scorer = make_scorer(accuracy_score)
recall_scorer = make_scorer(recall_score)

# Define scoring dictionary for GridSearchCV
scoring = {
    'accuracy': accuracy_scorer,
    #'f1': f1_scorer,
    #'precision': precision_scorer,
    #'recall': recall_scorer
}

# Create a parameter grid - we will search through all these combinations
param_grid = {
    'classifier__penalty': ["elasticnet"],
    'classifier__l1_ratio': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
}
 
# Create GridSearchCV object
grid_search = GridSearchCV(log_pipe, param_grid, cv=ps, verbose=3, n_jobs=60, scoring=scoring, refit="accuracy")

In [52]:
abdelaal_labels_simple

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [53]:
grid_search.fit(abdelaal_data, abdelaal_labels_simple)

Fitting 12 folds for each of 10 candidates, totalling 120 fits


[CV 1/12] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=0.500) total time=   8.6s
[CV 12/12] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=0.500) total time=   8.9s
[CV 11/12] END classifier__l1_ratio=0.2, classifier__penalty=elasticnet; accuracy: (test=0.000) total time=   8.8s
[CV 5/12] END classifier__l1_ratio=0.2, classifier__penalty=elasticnet; accuracy: (test=0.500) total time=   8.4s
[CV 7/12] END classifier__l1_ratio=0.1, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   8.8s
[CV 2/12] END classifier__l1_ratio=0.4, classifier__penalty=elasticnet; accuracy: (test=0.000) total time=   8.9s
[CV 2/12] END classifier__l1_ratio=0.3, classifier__penalty=elasticnet; accuracy: (test=0.500) total time=   9.1s
[CV 9/12] END classifier__l1_ratio=0.3, classifier__penalty=elasticnet; accuracy: (test=1.000) total time=   9.3s
[CV 4/12] END classifier__l1_ratio=0.5, classifier__penalty=elasticnet; accuracy: (tes

In [54]:
# Look at best paramaters and accuracy and save results to a data frame
print("Best Parameters:", grid_search.best_params_)
print(f"Best CV average accuracy: {grid_search.best_score_:.3f}")
results_genes = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),
           pd.DataFrame(grid_search.cv_results_["mean_test_accuracy"], columns = ["Average Accuracy"]),
           pd.DataFrame(grid_search.cv_results_["std_test_accuracy"], columns=["SD accuracy"])],axis=1)
results_genes.sort_values(by='Average Accuracy', inplace=True)

Best Parameters: {'classifier__l1_ratio': 0.1, 'classifier__penalty': 'elasticnet'}
Best CV average accuracy: 0.667


In [55]:
results_genes
results_models = pd.DataFrame(grid_search.cv_results_)
results_models.sort_values(by='rank_test_accuracy', inplace=True)
results_genes

Unnamed: 0,classifier__l1_ratio,classifier__penalty,Average Accuracy,SD accuracy
6,0.7,elasticnet,0.5,0.408248
7,0.8,elasticnet,0.5,0.408248
8,0.9,elasticnet,0.5,0.408248
4,0.5,elasticnet,0.541667,0.431003
5,0.6,elasticnet,0.541667,0.431003
9,1.0,elasticnet,0.541667,0.379601
3,0.4,elasticnet,0.583333,0.399653
2,0.3,elasticnet,0.625,0.360844
0,0.1,elasticnet,0.666667,0.372678
1,0.2,elasticnet,0.666667,0.372678


In [None]:
data_test = pd.DataFrame(zip(abdelaal_data.columns, np.transpose(grid_search.best_estimator_.named_steps["classifier"].coef_)), columns=['features', 'coef'])#.sort_values(by='coef', inplace=True)
data_test.sort_values(by='coef', inplace = True)
data_test = data_test.loc[(data_test != 0).all(axis=1), :]
data_test

Unnamed: 0,features,coef
888,DIRAS3,[-0.05842370002683579]
3543,ENSBTAG00000048885,[-0.05254238789476099]
5484,ENSBTAG00000052325,[-0.05120259011795917]
2029,FGF1,[-0.05059383055309786]
2886,MS4A14,[-0.048294800587800446]
...,...,...
1891,HK3,[0.05488966822134522]
2293,ADCY3,[0.05818909888388803]
3109,ENSBTAG00000054718,[0.0636322316731776]
2819,MDK,[0.06398508405895612]


# PCA