# Applying logistic regression in python to discriminate between control and M. bovis infected animals using peripheral blood transcriptomics data
### This analysis considers two approaches, one using logistic regression on variable genes that have been preprocessed using DESeq2 (vst normalised) and the other using latent variables inferred using PCA, ICA and NMF


RNA-seq data often suffers from a curse of dimensionality whereby there are many more features/genes (p) than samples (n) and this can lead to model overfitting, spurious correlations and poor generalizability. Hence, there are strategies to mitigate against this issue.
Such strategies include: applying a penalization to input features to account for multi-collinearity. Other strategies involve projecting the data into a reduced dimensional space and using these latent variabels (e.g. PCs) as input for a classification model

In [137]:
## Load in all necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import kurtosis

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,roc_curve,auc, make_scorer,mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
import warnings
import pickle
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

#font for plots
font = {'fontname':'Arial'}

In [2]:
# Load in the data
# Note the raw will be for if individuals wish to put in a MAD filter on the data
train_data_raw = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/Train_raw_data.txt", sep = "\t").T
test_data_raw = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/Test_raw_data.txt", sep = "\t").T
train_data = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/Train_vst_normalised_data.txt", sep = "\t").T
test_data = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/Test_vst_normalised_data.txt", sep = "\t").T
train_labels = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/Train_labels.txt", sep = "\t").T 
test_labels = pd.read_csv("/home/workspace/jogrady/ML4TB/work/normalisation/Test_labels.txt", sep = "\t").T.to_numpy() 

In [3]:
# Need to do some data wrangling to ensure labels are in right format for ML functions in scikit learn
# Convert labels to numeric really is the main thing
train_labels = train_labels.to_numpy()
train_labels = np.where(train_labels == "Control", 0, np.where(train_labels == "Infected", 1, train_labels))
test_labels = np.where(test_labels == "Control", 0, np.where(test_labels == "Infected", 1, test_labels))
train_labels = train_labels.astype(int)
test_labels = test_labels.astype(int)

I was unsure as to whether or not VST normalsied gene expression data should be scaled further (e.g. using StandardScaler) as this could increase technical noise (which would is initially removed in the VST step). However, Dr Mike love (author of DESeq2) clears it up with comments below in the context of glmnet (R package)

- "Scaling (for each gene, across samples) and VST are to some degree at odds. The VST shrinks technical variance so that biological differences are not overwhelmed. And doing so it outperforms simply transformations such as log(x + 1). But then if you force all genes to have unit variance, you undo that effect, increasing technical noise which was just shrunk.

 - I'd suggest you use the VST, then use a variance filter on the VST data to remove genes with minimal variance (take a look at the meanSdPlot to get a sense of the genes which likely have no biological signal, see vignette), then feed the remaining genes to glmnet with standardize=TRUE". - https://support.bioconductor.org/p/93160/

In [4]:
test_data.head()

In [5]:
# Calcualte variances for VST normalised genes
variances = train_data.var(axis=0)
# take top 20% and filter
threshold = variances.quantile(.80) 
genes = variances > threshold
genes= genes.loc[genes==True].index
train_data = train_data.filter(items = genes, axis=1)

# Reapply to test
test_data = test_data.filter(items = genes, axis = 1)

In [6]:
# 5522 most variable genes (note this is still more than we had DE)
test_data.shape
test_data.head()

In [7]:
# set up k fold cross validation and set random_state to 42 to ensure reproducibility - THis ensures each fold in evaluation is the same
KF =  KFold(n_splits=10, shuffle=True, random_state=42)

In [15]:

# Make a pipeline for logistic regression and set the paramaters
log_pipe = Pipeline(steps=[
('scaler', StandardScaler()), # see comment above (in markdown)
('classifier', LogisticRegression(max_iter=10000, solver='saga', tol=0.0001, random_state=42))]) # classifier

precision_scorer = make_scorer(precision_score, zero_division=1)  # had to modify zero_division as it was giving problems
f1_scorer = make_scorer(f1_score)
accuracy_scorer = make_scorer(accuracy_score)
recall_scorer = make_scorer(recall_score)

# Define scoring dictionary for GridSearchCV
scoring = {
    'accuracy': accuracy_scorer,
    'f1': f1_scorer,
    'precision': precision_scorer,
    'recall': recall_scorer
}

# Create a parameter grid - we will search through all these combinations
param_grid = {
    'classifier__penalty': ["elasticnet"],
    'classifier__l1_ratio': [0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1]
}
 
# Create GridSearchCV object
grid_search = GridSearchCV(log_pipe, param_grid, cv=KF, verbose=1, n_jobs=60, scoring=scoring, refit="accuracy")

In [16]:
# Fit the grid search
grid_search.fit(train_data, train_labels.ravel())

In [17]:
# Look at best paramaters and accuracy and save results to a data frame
print("Best Parameters:", grid_search.best_params_)
print(f"Best CV average accuracy: {grid_search.best_score_:.2f}")
results_genes = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),
           pd.DataFrame(grid_search.cv_results_["mean_test_accuracy"], columns = ["Average Accuracy"]),
           pd.DataFrame(grid_search.cv_results_["std_test_accuracy"], columns=["SD accuracy"]),
           pd.DataFrame(grid_search.cv_results_["mean_test_precision"], columns = ["Average precision"]),
           pd.DataFrame(grid_search.cv_results_["std_test_precision"], columns=["SD precision"]),
           pd.DataFrame(grid_search.cv_results_["mean_test_recall"], columns=["Average recall"]),
           pd.DataFrame(grid_search.cv_results_["std_test_recall"], columns=["SD recall"])],axis=1)
results_genes.sort_values(by='Average Accuracy', inplace=True)

In [21]:
grid_search.best_estimator_.named_steps["classifier"].coef_

In [22]:
results_genes
results_models = pd.DataFrame(grid_search.cv_results_)
results_models.sort_values(by='rank_test_accuracy', inplace=True)
results_genes

In [23]:
params_full_lasso = results_models.iloc[7]['params']
print(params_full_lasso)
clf_2nd_best_full_lasso = grid_search.best_estimator_.set_params(**params_full_lasso)

In [47]:
# Save model in case we ever need it again e.g. for external data
with open('/home/workspace/jogrady/ML4TB/work/models/Logistic_regression_CV_search.pkl', 'wb') as f:
    pickle.dump(grid_search, f)

In [31]:
data_test = pd.DataFrame(zip(train_data.columns, np.transpose(grid_search.best_estimator_.named_steps["classifier"].coef_)), columns=['features', 'coef'])#.sort_values(by='coef', inplace=True)
data_test.sort_values(by='coef', inplace = True)
data_test = data_test.loc[(data_test != 0).all(axis=1), :]
data_test

In [32]:
confusion_matrix(clf_2nd_best_full_lasso.predict(test_data), test_labels.ravel()) # Note decision threshold is 0.5 meaning there are some difficult to classify samples

In [33]:
target_names = ['Control', 'Infected']
print(classification_report(test_labels.ravel(), grid_search.predict(test_data), target_names=target_names))
Gene_cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(test_labels.ravel(),grid_search.predict(test_data)), display_labels = ["Control", "Infected"])

# Using latent variables as a means of reducing dimensionality and extracting biological insight

### 2.1 PCA

People generally choose the elbow method for picking the optimal number of PCs. However, others choose the numbr of PCs that explain e.g 80, 90 95% of the variation as input. Unclear which is best so we will settle for 80% as an upper limit and evaluate the contribution of differening number of PCs before picking the best model to evaluate on the test set

- Side point: The number of PCs is important to specify for ICA

In [34]:
 
# Do not need to scale for PCA - not recommended in VST
pca = PCA(random_state=88, n_components=87)
pca.fit(train_data)
explained_variance = pca.explained_variance_ratio_
# Plotting the elbow curve
plt.figure(figsize=(15, 6))
plt.plot(range(1, 87 + 1), explained_variance, marker='o')
plt.xlabel('Number of Principal Components based on training set')
plt.ylabel('Variance explained')
plt.title('Elbow Method for Optimal Number of Components in training set')
plt.grid()
plt.xticks(range(1, 87 + 1,2))
plt.axvline(x=11, color='r', linestyle='--', label='Optimal Components (based on elbow)')
plt.axvline(x=36, color='g', linestyle='--', label='80% variance captured')
plt.legend()
plt.show()
print(f'Variance captured by 36 PCs:{pca.explained_variance_ratio_[:36].sum():.2f}')


### 2.2. Set up the PCA logistic regression pipeline and Evaluate

In [35]:
# Set up the pipeline
PCA_Pipeline = Pipeline(steps=[('pca', PCA(random_state=42)),
('classifier', LogisticRegression(max_iter=10000, penalty="none", solver='saga', tol=0.0001, random_state=42))])

# Set up a grid for each PC
pca_param_grid = {'pca__n_components': list(range(1, 37))}

# Apply
LR_pca_search_model = GridSearchCV(PCA_Pipeline, pca_param_grid, cv=KF, scoring=scoring, refit="accuracy")

# Fit
LR_pca_search_model.fit(train_data, train_labels.ravel())

In [38]:
# Look at best paramaters and accuracy and save results to a data frame
print(f"Best Paramater:", LR_pca_search_model.best_params_)
print(f"Best Score: {LR_pca_search_model.best_score_:.2f}")
results_pca = pd.concat([pd.DataFrame(LR_pca_search_model.cv_results_["params"]),
           pd.DataFrame(LR_pca_search_model.cv_results_["mean_test_accuracy"], columns = ["CV Accuracy"]),
           pd.DataFrame(LR_pca_search_model.cv_results_["std_test_accuracy"], columns=["SD accuracy"]),
           pd.DataFrame(LR_pca_search_model.cv_results_["mean_test_precision"], columns = ["CV precision"]),
           pd.DataFrame(LR_pca_search_model.cv_results_["std_test_precision"], columns=["SD precision"]),
           pd.DataFrame(LR_pca_search_model.cv_results_["mean_test_recall"], columns=["CV recall"]),
           pd.DataFrame(LR_pca_search_model.cv_results_["std_test_recall"], columns=["SD recall"])],axis=1)
results_pca.sort_values(by='CV Accuracy', inplace=True)

In [37]:
results_pca

### 2.3 Plot each of the components and find the genes that are driving each of them and also, find components that are different between the two groups

In [152]:
# 'residuals_all.columns' contains the names of the genes

pca = PCA(random_state=42, n_components=30)
pca_fit = pca.fit(train_data)
comp_genes= []
pca_genes = pd.DataFrame()
pca_results = pca_fit.fit_transform(train_data)
colors = ['steelblue' if label == 0 else 'crimson' for label in train_labels.ravel()]
plt.scatter(pca_results[:, 0], pca_results[:, 1], label="Training data", c=colors, s=20)
handles = [plt.Line2D([0], [0], marker='s', color='w', markerfacecolor='steelblue', markersize=5, label='Control'),
           plt.Line2D([0], [0], marker='s', color='w', markerfacecolor='crimson', markersize=5, label='Infected')]
plt.title("PCA Visualization of eigenvectors", fontweight='bold')
plt.legend(handles=handles)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
# Iterate through each component
for component_idx in range(30):
    # Calculate loadings for the current component
    loadings = pca_fit.components_.T * np.sqrt(pca_fit.explained_variance_)

    # Sort genes by their influence on the current component
    sorted_genes = train_data.columns[np.argsort(loadings[:, component_idx])]

    # Visualize the top N genes that contribute the most to the current component
    N = pca_fit.components_.shape[1]
    top_genes = sorted_genes[-N:]

    pca_genes[component_idx] = top_genes[::-1]
    comp_genes = np.append(comp_genes, top_genes)
pca_genes.columns = ["Component_1","Component_2","Component_3","Component_4",
                     "Component_5","Component_6","Component_7","Component_8",
                     "Component_9","Component_10","Component_11","Component_12",
                     "Component_13","Component_14","Component_15","Component_16",
                     "Component_17","Component_18","Component_19","Component_20",
                     "Component_21","Component_22","Component_23","Component_24",
                     "Component_25","Component_26","Component_27","Component_28",
                     "Component_29","Component_30"]                      
pca_genes   

In [83]:
# Set up the grid for subplots (10 rows, 3 columns)
fig, axes = plt.subplots(10, 3, figsize=(18, 30))
fig.suptitle("PCA Visualization of Eigenvectors", fontweight='bold', fontsize=16)

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Loop to generate scatter plots from PC1 vs each PC up to PC30
for i in range(1, 30):
    ax = axes[i-1]
    ax.scatter(pca_results[:, 0], pca_results[:, i], c=colors, s=20)
    ax.set_title(f"PC1 vs PC{i+1}")
    ax.set_xlabel("PC1")
    ax.set_ylabel(f"PC{i+1}")

# Custom legend only in the first subplot (for cleanliness)
handles = [
    plt.Line2D([0], [0], marker='s', color='w', markerfacecolor='steelblue', markersize=5, label='Control'),
    plt.Line2D([0], [0], marker='s', color='w', markerfacecolor='crimson', markersize=5, label='Infected')
]
axes[0].legend(handles=handles)

# Hide any unused subplots (since there are only 29 plots for a 30-slot grid)
for j in range(29, len(axes)):
    axes[j].axis('off')

plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust layout to fit title
plt.show()


In [87]:
Feature_weight_pca = pd.DataFrame(zip(pca_genes.columns, np.transpose(LR_pca_search_model.best_estimator_.named_steps["classifier"].coef_)), columns=['features', 'coef'])#.sort_values(by='coef', inplace=True)
Feature_weight_pca.sort_values(by='coef', inplace = True)
Feature_weight_pca = Feature_weight_pca.loc[(Feature_weight_pca != 0).all(axis=1), :]
Feature_weight_pca

In [113]:
df_pca = pd.DataFrame(pca_results)
Control = df_pca.iloc[:44, :] # 45 samples in control
Infected = df_pca.iloc[45:, :] # 42 in infected

# Perform t-test for each component
results = pd.DataFrame(columns=['Component', 'T-Statistic', 'P-Value'])

for column in df_pca.columns:
    t_statistic, p_value = stats.ttest_ind(Control[column], Infected[column])
    
    new_row = pd.DataFrame({'Component': [column], 'T-Statistic': [t_statistic], 'P-Value': [p_value]})
    results = pd.concat([results, new_row], ignore_index=True)

# Print the results
print(results)

# Identify components with significantly different means

from statsmodels.stats.multitest import multipletests


reject, pvals_corrected, _, _ = multipletests(results["P-Value"], alpha=0.05, method='fdr_bh')
results['Corrected P-Value'] = pvals_corrected
results['Significant (BH)'] = reject 


significantly_different = results[results['Corrected P-Value'] < 0.05]
print("Components with significantly different means:")
print(significantly_different)

In [150]:
# Bar plot for each component
plt.figure(figsize=(15, 8))
plt.bar(results['Component'], -np.log10(results['Corrected P-Value']), color='teal', alpha=0.5)

plt.xlabel('PCA Component Number', fontsize=18)
plt.ylabel('-log10(P-Value)', fontsize=18)
plt.plot([-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],np.repeat(-np.log10(0.05),32),linestyle='--', label='Padj=0.05')
plt.title('PCA: Difference per Component between Control and Infected samples', 
            fontsize=20, fontweight='bold')
plt.xticks([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30], fontsize=16)
plt.ylim(0,4)
plt.legend(fontsize=14)
plt.xlim(-0.5,29.5)
plt.show()

Can see there is a significnat difference in the components for PC1, 5 and 12

### 2.4 Evaluate on Test data

In [153]:
# Performance on Test data
print(classification_report(test_labels.ravel(), LR_pca_search_model.predict(test_data), target_names=target_names))
PCA_cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(test_labels.ravel(),LR_pca_search_model.predict(test_data)), display_labels = ["Control", "Infected"])

### 2. ICA

ICA and PCA are similar to each other however ICA attemps to transform the data into statistically significant non-Gaussian components. Often times this is estimated by Kurtosis but the number of  !'PCs'! used for the whitening procedure is often those that reach 80,90,95% etc, hence we will use this for below  

In [19]:
# ICA
from sklearn.decomposition import FastICA
ICA_transformer = FastICA(n_components=36, # from PCA - 80% of variance
        random_state=42,
        max_iter=1000, tol=0.0001,
        whiten='unit-variance')

df_train_ica = ICA_transformer.fit_transform(train_data)
kurtosis_scores = [kurtosis(df_train_ica[:, i]) for i in range(df_train_ica.shape[1])]
n_components = np.argmax(kurtosis_scores) + 1

In [20]:
# Number of non-normal components in training data = 14 - however we will use the same as above for PCA
n_components

It appears that 14 components are non-gaussian

In [21]:
# Set up the pipeline
ICA_Pipeline = Pipeline(steps=[('ica', FastICA( 
        random_state=42,
        max_iter=5000, tol=0.0001,
        whiten='unit-variance')),
('classifier', LogisticRegression(max_iter=10000, penalty="none", solver='saga', tol=0.0001, random_state=42))])

# Set up grid of components - 80% from PCA above
ica_param_grid = {'ica__n_components': list(range(1, 37))}


# Apply
LR_ica_search_model = GridSearchCV(ICA_Pipeline, ica_param_grid, cv=KF, scoring=scoring, refit="accuracy")


# Fit
LR_ica_search_model.fit(train_data, train_labels.ravel())


In [22]:
# Look at best paramaters and accuracy and save results to a data frame
print("Best Parameters:", LR_ica_search_model.best_params_)
print(f"Best Score: {LR_ica_search_model.best_score_:.2f}")
results_ica = pd.concat([pd.DataFrame(LR_pca_search_model.cv_results_["params"]),
           pd.DataFrame(LR_ica_search_model.cv_results_["mean_test_accuracy"], columns = ["CV Accuracy"]),
           pd.DataFrame(LR_ica_search_model.cv_results_["std_test_accuracy"], columns=["SD accuracy"]),
           pd.DataFrame(LR_ica_search_model.cv_results_["mean_test_precision"], columns = ["CV precision"]),
           pd.DataFrame(LR_ica_search_model.cv_results_["std_test_precision"], columns=["SD precision"]),
           pd.DataFrame(LR_ica_search_model.cv_results_["mean_test_recall"], columns=["CV recall"]),
           pd.DataFrame(LR_ica_search_model.cv_results_["std_test_recall"], columns=["SD recall"])],axis=1)
results_ica.sort_values(by='CV Accuracy', inplace=True)

In [23]:
results_ica

In [156]:
print(classification_report(test_labels.ravel(),LR_ica_search_model.predict(test_data), target_names=target_names))
ICA_cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(test_labels.ravel(),LR_ica_search_model.predict(test_data)), display_labels = ["Control", "Infected"])
# display matrix

### 3. Non-negative matrix factorization

In [25]:

NMF_Pipeline = Pipeline(steps=[('nmf', NMF(
    init="random", solver='cd', 
    beta_loss='frobenius', tol=0.0001, 
    max_iter=10000, random_state=42, verbose=0)),
('classifier', LogisticRegression(max_iter=10000, penalty=None, solver='saga', tol=0.0001, random_state=42))])

nmf_param_grid = {'nmf__n_components': list(range(1, 37))}

LR_nmf_search_model = GridSearchCV(NMF_Pipeline, nmf_param_grid, cv=KF, n_jobs= 30, scoring=scoring, refit="accuracy")

LR_nmf_search_model.fit(train_data, train_labels.ravel())

In [26]:
# Look at best paramaters and accuracy and save results to a data frame
print("Best Parameters:", LR_nmf_search_model.best_params_)
print(f"Best Score: {LR_nmf_search_model.best_score_:.2f}")
results_nmf = pd.concat([pd.DataFrame(LR_pca_search_model.cv_results_["params"]),
           pd.DataFrame(LR_nmf_search_model.cv_results_["mean_test_accuracy"], columns = ["CV Accuracy"]),
           pd.DataFrame(LR_nmf_search_model.cv_results_["std_test_accuracy"], columns=["SD accuracy"]),
           pd.DataFrame(LR_nmf_search_model.cv_results_["mean_test_precision"], columns = ["CV precision"]),
           pd.DataFrame(LR_nmf_search_model.cv_results_["std_test_precision"], columns=["SD precision"]),
           pd.DataFrame(LR_nmf_search_model.cv_results_["mean_test_recall"], columns=["CV recall"]),
           pd.DataFrame(LR_nmf_search_model.cv_results_["std_test_recall"], columns=["SD recall"])],axis=1)
results_nmf.sort_values(by='CV Accuracy', inplace=True)

In [27]:
results_nmf

In [28]:
print(classification_report(test_labels.ravel(), LR_nmf_search_model.predict(test_data), target_names=target_names))
NMF_cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix(test_labels.ravel(),LR_nmf_search_model.predict(test_data)), display_labels = ["Control", "Infected"])

## Evaluation
### 1. Confusion matrix
 - Note, this is based on a theta value of 0.5.

In [154]:
# Genes
Gene_cm_display.plot()
plt.title("Performance of best gene-based model on test data")

# PCA
PCA_cm_display.plot()
plt.title('PCA best model on test data')

# ICA
ICA_cm_display.plot()
plt.title('ICA best model on test data')

# NMF
NMF_cm_display.plot()
plt.title('NMF best model on test data')

plt.show()


### 2. ROC curve

In [30]:
# predict continuous value
# For different decision thresholds
y_score_gene = grid_search.predict_proba(test_data)
fprG_gene, tprG_gene, t_gene = roc_curve(test_labels.ravel(), y_score_gene[:,1])
roc_aucG_gene = auc(fprG_gene, tprG_gene)


y_score_pca = LR_pca_search_model.predict_proba(test_data)
fprG_pca, tprG_pca, t_pca = roc_curve(test_labels.ravel(), y_score_pca[:,1])
roc_aucG_pca = auc(fprG_pca, tprG_pca)


y_score_ica = LR_pca_search_model.predict_proba(test_data)
fprG_ica, tprG_ica, t_ica = roc_curve(test_labels.ravel(), y_score_ica[:,1])
roc_aucG_ica = auc(fprG_ica, tprG_ica)



y_score_nmf = LR_pca_search_model.predict_proba(test_data)
fprG_nmf, tprG_nmf, t_nmf = roc_curve(test_labels.ravel(), y_score_nmf[:,1])
roc_aucG_nmf = auc(fprG_nmf, tprG_nmf)

In [155]:
%matplotlib inline
plt.figure()
lw = 2
plt.plot(fprG_gene, tprG_gene, color='red',
         lw=lw, label='ROC Gene (area = %0.2f)' % roc_aucG_gene)
plt.plot(fprG_pca, tprG_pca, color='green',
         lw=lw, label='ROC PCA (area = %0.2f)' % roc_aucG_pca)
plt.plot(fprG_ica, tprG_ica, color='red',
         lw=lw, label='ROC ICA (area = %0.2f)' % roc_aucG_ica)
plt.plot(fprG_nmf, tprG_nmf, color='blue',
         lw=lw, label='ROC NMF (area = %0.2f)' % roc_aucG_nmf)

plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Analysis Different models')
plt.legend(loc="lower right")
plt.show()

# Bite NMF covers both PCA and ICA as they are identical

### 3. Look at coefficients of model.

In [32]:
grid_search.best_estimator_.named_steps['classifier'].coef_
coefficient_data_frame = pd.DataFrame({"GeneID": np.array(train_data.columns),
                                       "Coefficient": grid_search.best_estimator_.named_steps['classifier'].coef_.ravel()})
coefficient_data_frame

Write results to files

In [33]:
results_genes.to_csv(path_or_buf = "/home/workspace/jogrady/ML4TB/work/LogisticRegression/Gene_based_CV_search.txt", sep = "\t", index = False)
results_pca.to_csv(path_or_buf = "/home/workspace/jogrady/ML4TB/work/LogisticRegression/PCA_CV_search.txt", sep = "\t", index = False)
results_ica.to_csv(path_or_buf = "/home/workspace/jogrady/ML4TB/work/LogisticRegression/ICA_based_CV_search.txt", sep = "\t", index = False)
results_nmf.to_csv(path_or_buf = "/home/workspace/jogrady/ML4TB/work/LogisticRegression/NMF_based_CV_search.txt", sep = "\t", index = False)