## Notebook for transferring labels from Healthy epithelial reference to cancer cells using `scBalance`

- **Developed by**: Anna Maguza
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- 8th June 2023

### Load required modules

In [1]:
import scBalance as sb
import scBalance.scbalance_IO as ss
import scanpy as sc
import pandas as pd
import numpy as np

### Data upload

In [2]:
input_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_epithelial_cells_all_genes.h5ad'
Healthy_adata = sc.read(input_healthy)

In [3]:
input_cancer = '/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_tumor_cells.h5ad'
Cancer_adata = sc.read(input_cancer)

In [4]:
# Filter epithelial cells
Cancer_adata = Cancer_adata[Cancer_adata.obs['Cell Type'] == 'Epithelial',:]

### Preprocess

In [5]:
Healthy_adata.layers['counts'] = Healthy_adata.X.copy()

### HVGs selection
# Calculate HVGs for cancer dataset
sc.pp.highly_variable_genes(
    Healthy_adata,
    flavor = "seurat_v3",
    n_top_genes = 7000,
    layer = "counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [6]:
Cancer_adata.layers['counts'] = Cancer_adata.X.copy()

# Extract same HVGs in the cancer dataset as in the healthy dataset

#Make indexes as string
Cancer_adata.var.index = Cancer_adata.var.index.astype(str)

# Ensure indexes are unique
Cancer_adata.var_names_make_unique()

# Identify common genes
common_genes = list(set(Healthy_adata.var_names) & set(Cancer_adata.var_names))

# Filter genes
Healthy_adata = Healthy_adata[:, common_genes]
Cancer_adata = Cancer_adata[:, common_genes]

#Ensure the same order of the genes
Cancer_adata = Cancer_adata[:, Healthy_adata.var_names]

In [7]:
sc.pp.normalize_total(Healthy_adata, target_sum=1e4)
sc.pp.log1p(Healthy_adata)

sc.pp.normalize_total(Cancer_adata, target_sum=1e4)
sc.pp.log1p(Cancer_adata)

  view_to_actual(adata)


In [8]:
gene = Healthy_adata.var_names & Cancer_adata.var_names

  gene = Healthy_adata.var_names & Cancer_adata.var_names


In [9]:
X_train = Healthy_adata.to_df()[gene]
X_test = Cancer_adata.to_df()[gene]

In [10]:
y_train = pd.DataFrame(Healthy_adata.obs['Unified Cell States'])

# Rename 'Unified Cell States' column in the dataframe to 'Label' to be consistent with the scBalance input
y_train = y_train.rename(columns={'Unified Cell States': 'Label'})

In [11]:
# Convert y_train to category and store the categories
y_train['Label'] = y_train['Label'].astype('category')
categories = y_train['Label'].cat.categories

# Convert to integer codes
y_train_values = y_train['Label'].cat.codes.values

# then, convert it back to DataFrame for sb.scBalance()
y_train = pd.DataFrame(y_train_values, columns=['Label'])

In [12]:
pred_result = sb.scBalance(X_test, X_train, y_train, processing_unit = 'cpu', weighted_sampling = True)

--------Start annotating----------
Computational unit be used is: cpu
--------Annotation Finished----------


In [13]:
# Create a DataFrame from the prediction result
pred_result_df = pd.DataFrame(pred_result, columns=['Label'])

# Map integers back to original labels using categories
pred_result_df['Label'] = categories[pred_result_df['Label']]

In [14]:
# Convert dataframe to numpy array
pred_result_array = pred_result_df['Label'].values

# Assign the numpy array to the 'Predicted Label' column in your AnnData object
Cancer_adata.obs['Predicted Label'] = pred_result_array

In [15]:
Cancer_adata.obs['Predicted Label'].value_counts()

TA                        22094
Paneth cells               7622
Colonocyte                 2136
Enterocyte                 1661
Tuft cells                 1400
Stem cells                  555
Epithelial cells            103
Goblet cells                 94
Enteroendocrine cells        31
Microfold cell                9
Enterochromaffin cells        6
L cells                       3
Name: Predicted Label, dtype: int64

In [16]:
Healthy_adata.obs['Unified Cell States'].value_counts() 

Enterocyte                56398
TA                        40570
Stem cells                27658
Goblet cells              13051
Colonocyte                11707
Paneth cells               3468
Tuft cells                 1204
Epithelial cells           1141
Microfold cell              340
Enteroendocrine cells       311
L cells                     228
Enterochromaffin cells      119
Name: Unified Cell States, dtype: int64

In [18]:
# Save the output
Cancer_adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/Labels_transfer/scBalance/Joanito_predicted_labels_with_scBalance_7000.h5ad')