# Notebook para ejecutar NSAUC-ROC

Este notebook está preparado para ejecutarse en Google Colab por necesidades de RAM.

In [2]:
# Mount Google Drive; the data files and helper scripts used below are read from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install scanpy
import os

import numpy as np
import pandas as pd
import scanpy as sc



In [4]:
import sys
# Extend the import search path with the scripts folder on Drive
# (presumably where the NSAUCROC module imported in the next cell lives — confirm).
sys.path.append('/content/drive/MyDrive/TFM/src/NSForest/scripts/')

## Import libraries

In [5]:
# NOTE(review): wildcard import. NSAUCROC and the myDecisionTreeEvaluationTest* helpers called
# below are not defined in this notebook, so they presumably come from this module;
# consider importing the needed names explicitly.
from NSAUCROC_v3dot9_2_test import *

## Load data

In [6]:
# Load the two .h5ad files from Drive.
# `file` is kept in a variable because its base name is reused later to name the output folder.
file = "/content/drive/MyDrive/TFM/src/NSForest/data/adata_switched_2_2.h5ad"
# `adata` is the dataset passed to NSAUCROC below.
adata = sc.read_h5ad(file)
# `adata_test` is the second dataset passed to the evaluation helpers together with `adata`.
adata_test = sc.read_h5ad("/content/drive/MyDrive/TFM/src/NSForest/data/adata_selected_park_6cl.h5ad")

## Quick view of datasets

In [7]:
adata  # quick look at the data (AnnData summary)

AnnData object with n_obs × n_vars = 85429 × 16653
    obs: 'Origin', 'suspension_type', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'author_cell_type', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'development_stage_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'donor_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'batch'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'

In [8]:
# Quick look at the second dataset (AnnData summary)
adata_test

AnnData object with n_obs × n_vars = 20571 × 16653
    obs: 'Origin', 'suspension_type', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'author_cell_type', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'cell_type_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'development_stage_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'donor_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'

## Get cluster names

In [9]:
# Name of the `adata.obs` column holding the cluster labels; passed to NSAUCROC and to every evaluation call below.
cluster_header = "cell_type" #<---
# Distinct cluster labels present in `adata`
set(adata.obs[cluster_header])

{'endothelial cell',
 'epithelial cell of proximal tubule',
 'kidney collecting duct principal cell',
 'kidney connecting tubule epithelial cell',
 'kidney distal convoluted tubule epithelial cell',
 'renal beta-intercalated cell'}

In [10]:
# Same distinct labels, as a sorted NumPy array
np.unique(adata.obs[cluster_header])

array(['endothelial cell', 'epithelial cell of proximal tubule',
       'kidney collecting duct principal cell',
       'kidney connecting tubule epithelial cell',
       'kidney distal convoluted tubule epithelial cell',
       'renal beta-intercalated cell'], dtype=object)

In [11]:
# Number of distinct clusters
len(np.unique(adata.obs[cluster_header]))

6

In [12]:
# Derive the dataset name from the input file name (directory and extension stripped).
dataset_name = os.path.splitext(os.path.basename(file))[0]
# Prefix passed to NSAUCROC and the evaluation helpers as `outputfilename_prefix`.
outputfilename = "AUCROC"
# NOTE(review): relative path — it is resolved against the kernel's current working directory,
# which is not necessarily the mounted Drive folder; confirm this is where results should land.
output_folder = f"../outputs_experimentation/AUCROC/{dataset_name}/"
# Create the folder only when it is missing. `isdir` (instead of `exists`) makes a stray regular
# file at this path fail loudly in makedirs, and `exist_ok=True` keeps the call safe if the
# directory appears between the check and the creation.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder, exist_ok=True)
    print(f"Creating new directory...\n{output_folder}")

## Running NSAUCROC to obtain the biomarkers

### Running NSAUCROC using all clusters

In [13]:
# Run NSAUCROC over all clusters found in `adata.obs[cluster_header]`.
# NOTE(review): `beta` is presumably the F-beta weight (the run log prints "fbeta") and
# `output_folder` / `outputfilename_prefix` presumably control where result files are written —
# confirm against the NSAUCROC script.
# This is the long-running step of the notebook (the log reports several hundred seconds per cluster).
NSAUCROC_results = NSAUCROC(adata, cluster_header=cluster_header, n_genes_eval=6, beta = 0.2,
                            output_folder = output_folder, outputfilename_prefix = outputfilename) #<---

Preparing data...
Calculating medians...
--- 33.50292110443115 seconds ---
Number of clusters to evaluate: 6
1 out of 6:
	endothelial cell
Tiempo de ejecución: 430.17 segundos
	['ENSMUSG00000021759', 'ENSMUSG00000039706']
	fbeta: 0.952259847093361
2 out of 6:
	renal beta-intercalated cell
Tiempo de ejecución: 329.13 segundos
	['ENSMUSG00000020566', 'ENSMUSG00000028238']
	fbeta: 0.9562839364274107
3 out of 6:
	epithelial cell of proximal tubule
Tiempo de ejecución: 407.24 segundos
	['ENSMUSG00000030945', 'ENSMUSG00000076441', 'ENSMUSG00000022613', 'ENSMUSG00000055373']
	fbeta: 0.9316028625964944
4 out of 6:
	kidney connecting tubule epithelial cell
Tiempo de ejecución: 399.96 segundos
	['ENSMUSG00000054640', 'ENSMUSG00000031618', 'ENSMUSG00000054728']
	fbeta: 0.8060660391453758
5 out of 6:
	kidney distal convoluted tubule epithelial cell
Tiempo de ejecución: 414.84 segundos
	['ENSMUSG00000031766', 'ENSMUSG00000028017']
	fbeta: 0.9357226100224669
6 out of 6:
	kidney collecting duct princ

In [14]:
# Per-cluster results table returned by NSAUCROC
NSAUCROC_results

Unnamed: 0,clusterName,clusterSize,f_score,PPV,TN,FP,FN,TP,marker_count,NSAUCROC_markers,thresholds,binary_genes
0,endothelial cell,4824,0.95226,0.990658,80583,22,2491,2333,2,"[ENSMUSG00000021759, ENSMUSG00000039706]","[0.8481884002685547, 0.7238551378250122]","[ENSMUSG00000027210, ENSMUSG00000021759, ENSMU..."
1,renal beta-intercalated cell,1756,0.956284,0.988285,83662,11,828,928,2,"[ENSMUSG00000020566, ENSMUSG00000028238]","[0.4649316668510437, 0.4547166973352432]","[ENSMUSG00000024485, ENSMUSG00000063296, ENSMU..."
2,epithelial cell of proximal tubule,59680,0.931603,0.960311,24434,1315,27862,31818,4,"[ENSMUSG00000030945, ENSMUSG00000076441, ENSMU...","[0.40984900295734406, 0.31009024381637573, 0.3...","[ENSMUSG00000030945, ENSMUSG00000076441, ENSMU..."
3,kidney connecting tubule epithelial cell,3072,0.806066,0.875527,82239,118,2242,830,3,"[ENSMUSG00000054640, ENSMUSG00000031618, ENSMU...","[2.118518114089966, 0.9148422181606293, 1.9899...","[ENSMUSG00000054640, ENSMUSG00000031558, ENSMU..."
4,kidney distal convoluted tubule epithelial cell,11491,0.935723,0.966446,73730,208,5500,5991,2,"[ENSMUSG00000031766, ENSMUSG00000028017]","[1.1833228468894958, 1.0618106722831726]","[ENSMUSG00000031766, ENSMUSG00000022490, ENSMU..."
5,kidney collecting duct principal cell,4606,0.89829,0.954416,80743,80,2931,1675,2,"[ENSMUSG00000023013, ENSMUSG00000004988]","[1.4653971195220947, 1.1409903764724731]","[ENSMUSG00000023013, ENSMUSG00000004988, ENSMU..."
6,Average,14238,0.913371,0.955941,70898,292,6975,7262,-,-,-,-


### Testing the model performance:

### Approach 1:
- Train a decision tree for each of the genes selected as biomarkers by NSForest
- Obtain the prediction as the dot product of the individual predictions for each gene

In [15]:
# Drop the trailing "Average" summary row of the results table so only real clusters are evaluated.
# Filtering by name instead of `.iloc[:-1]` keeps this cell idempotent: re-running it no longer
# strips one more (real) cluster from `NSAUCROC_results` each time.
NSAUCROC_results = NSAUCROC_results[NSAUCROC_results["clusterName"] != "Average"]
# Approach 1: evaluate the selected markers with myDecisionTreeEvaluationTest on `adata` / `adata_test`.
df_test_result = myDecisionTreeEvaluationTest(adata, adata_test, cluster_header, NSAUCROC_results, beta = 0.2,
                                              output_folder = output_folder, outputfilename_prefix = outputfilename)

In [16]:
# Evaluation table for approach 1
df_test_result

Unnamed: 0,clusterName,clusterSize,f_score,PPV,TN,FP,FN,TP,marker_count,NSAUCROC_markers,threshold
0,endothelial cell,638,0.271147,1.0,19933,0,629,9,2,"['ENSMUSG00000021759', 'ENSMUSG00000039706']","[0.8481884002685547, 0.7238551378250122]"
1,renal beta-intercalated cell,691,0.903678,0.973799,19874,6,468,223,2,"['ENSMUSG00000020566', 'ENSMUSG00000028238']","[0.4649316668510437, 0.4547166973352432]"
2,epithelial cell of proximal tubule,13152,0.907854,0.974201,7302,117,8734,4418,4,"['ENSMUSG00000030945', 'ENSMUSG00000076441', '...","[0.40984900295734406, 0.31009024381637573, 0.3..."
3,kidney connecting tubule epithelial cell,831,0.0,0.0,19740,0,831,0,3,"['ENSMUSG00000054640', 'ENSMUSG00000031618', '...","[2.118518114089966, 0.9148422181606293, 1.9899..."
4,kidney distal convoluted tubule epithelial cell,4890,0.911411,0.958373,15594,87,2887,2003,2,"['ENSMUSG00000031766', 'ENSMUSG00000028017']","[1.1833228468894958, 1.0618106722831726]"
5,kidney collecting duct principal cell,369,0.937311,0.957627,20192,10,143,226,2,"['ENSMUSG00000023013', 'ENSMUSG00000004988']","[1.4653971195220947, 1.1409903764724731]"
6,Average,3428,0.655233,0.810667,17105,36,2282,1146,-,-,-


### Approach 2:
- Train a decision tree with all NSForest-selected genes at the same time

In [17]:
# Approach 2: evaluate the same marker table with myDecisionTreeEvaluationTestCombined,
# using the same datasets, beta and output settings as approach 1.
df_combined_test_result = myDecisionTreeEvaluationTestCombined(adata, adata_test, cluster_header, NSAUCROC_results, beta = 0.2,
                                                               output_folder = output_folder, outputfilename_prefix = outputfilename)

In [18]:
# Evaluation table for approach 2
df_combined_test_result

Unnamed: 0,clusterName,clusterSize,f_score,PPV,TN,FP,FN,TP,marker_count,NSAUCROC_markers,threshold
0,endothelial cell,638,0.662252,0.943396,19930,3,588,50,2,"['ENSMUSG00000021759', 'ENSMUSG00000039706']",[0.8481884002685547]
1,renal beta-intercalated cell,691,0.707209,0.714815,19726,154,305,386,2,"['ENSMUSG00000020566', 'ENSMUSG00000028238']",[0.4547166973352432]
2,epithelial cell of proximal tubule,13152,0.828174,0.832054,5450,1969,3397,9755,4,"['ENSMUSG00000030945', 'ENSMUSG00000076441', '...",[0.40984900295734406]
3,kidney connecting tubule epithelial cell,831,0.405983,0.423469,19514,226,665,166,3,"['ENSMUSG00000054640', 'ENSMUSG00000031618', '...",[2.118518114089966]
4,kidney distal convoluted tubule epithelial cell,4890,0.833702,0.845893,15135,546,1893,2997,2,"['ENSMUSG00000031766', 'ENSMUSG00000028017']",[1.1833228468894958]
5,kidney collecting duct principal cell,369,0.711595,0.709924,20088,114,90,279,2,"['ENSMUSG00000023013', 'ENSMUSG00000004988']",[1.4653971195220947]
6,Average,3428,0.691486,0.744925,16640,502,1156,2272,-,-,-


In [19]:
# Third evaluation variant: myDecisionTreeEvaluationTestMove with the same inputs plus `coef`.
# NOTE(review): `coef` presumably controls how far the thresholds are shifted (the result table
# reports both `original_threshold` and `new_threshold`) — confirm against the helper's source.
new_eval = myDecisionTreeEvaluationTestMove(adata, adata_test, cluster_header, NSAUCROC_results, beta = 0.2,
                                                               output_folder = output_folder, outputfilename_prefix = outputfilename, coef=0.5)

In [20]:
# Evaluation table for the threshold-move variant
new_eval

Unnamed: 0,clusterName,clusterSize,f_score,PPV,TN,FP,FN,TP,marker_count,NSAUCROC_markers,original_threshold,new_threshold,NSAUCRIC_markers
0,endothelial cell,638,0.223862,1.0,19933,0,631,7,2,"['ENSMUSG00000021759', 'ENSMUSG00000039706']","[0.8481884002685547, 0.7238551378250122]","[0.884131882339716, 0.7579623945057392]",
1,renal beta-intercalated cell,691,0.896081,0.9801,19876,4,494,197,2,"['ENSMUSG00000020566', 'ENSMUSG00000028238']","[0.4649316668510437, 0.4547166973352432]","[0.5622370913624763, 0.5515786409378052]",
2,epithelial cell of proximal tubule,13152,0.874163,0.98236,7364,55,10089,3063,4,"['ENSMUSG00000030945', 'ENSMUSG00000076441', '...","[0.40984900295734406, 0.31009024381637573, 0.3...","[0.824158564209938, 0.7408173680305481, 0.7460...",
3,kidney connecting tubule epithelial cell,831,0.0,0.0,19740,0,831,0,3,"['ENSMUSG00000054640', 'ENSMUSG00000031618', '...","[2.118518114089966, 0.9148422181606293, 1.9899...","[2.376619905233383, 0.9522566311061382, 2.0418...",
4,kidney distal convoluted tubule epithelial cell,4890,0.887445,0.976401,15649,32,3566,1324,2,"['ENSMUSG00000031766', 'ENSMUSG00000028017']","[1.1833228468894958, 1.0618106722831726]","[1.6282113194465637, 1.4157971143722534]",
5,kidney collecting duct principal cell,369,0.938741,0.96347,20194,8,158,211,2,"['ENSMUSG00000023013', 'ENSMUSG00000004988']","[1.4653971195220947, 1.1409903764724731]","[1.6408644318580627, 1.2931244373321533]",
6,Average,3428,0.636715,0.817055,17126,16,2628,800,-,,-,-,-
