# Run Scyan predictions on CD45+ preprocessed data

Run Scyan cell type labeling on flow data.

Start from the preprocessed AnnData objects (one per batch) written by the preprocessing script.

1. Import and format preprocessed AnnData
2. Run Scyan cell type labeling
3. Visualize predictions
   - Validate presence of level 1 populations with UMAP and bar charts
   - Re-run with batch correction (external notebook) if invalid
4. Save predictions

## Setup

In [None]:
# Start in minimal Python kernel
import hisepy
import os

In [None]:
# Import Lilly flow notes metadata
# `meta_fid` is recorded later together with the other input file IDs;
# `fres` is overwritten by the next `read_files` call.
meta_fid = "9dadc265-cf58-4a79-b42e-69c135c794bd"
fres = hisepy.read_files([meta_fid])

In [None]:
# Exploratory: display the available project stores (the result is not stored)
hisepy.list_project_stores()

In [None]:
# Exploratory: print the hisepy package help
help(hisepy)

In [None]:
# Locate the preprocessed AnnData files of both batches in the project store
panel = "PL1"
batch_ref = "B237"
batch_new = "B175"
qda_version = "v3.2"

ps = hisepy.list_files_in_project_store("PD-1")

# Narrow the listing one pattern at a time: either batch, the panel,
# the preprocessing folder, then the processed AnnData files.
name_patterns = (
    f"{batch_ref}|{batch_new}",
    panel,
    'flow/scyan/preprocess',
    'processed_adata',
)
for name_pattern in name_patterns:
    ps = ps[ps['name'].str.contains(name_pattern)]
ps

In [None]:
# File IDs of the project-store entries kept by the filters above
ps_fid = ps['id'].to_list()

In [None]:
# Read those files through hisepy; this replaces the earlier value of `fres`
fres = hisepy.read_files(ps_fid)

In [None]:
# Input file IDs to record for the upload: AnnData files first, metadata last
in_fids = list(map(str, ps_fid))
in_fids.append(meta_fid)
in_fids

In [None]:

# Write the input file IDs to disk, one per line, so the upload cell
# (which runs in a different kernel) can read them back
base_path = '/home/workspace/lilly-pd1-analysis/03_flow/'
output_path = base_path + '02-scyan-predictions/output/'
os.makedirs(output_path, exist_ok=True)

with open(output_path + "in_fids.txt", "w") as f:
    f.writelines(str(fid) + "\n" for fid in in_fids)


In [None]:
# Switch to python scyan kernel

import scyan as sy
import os
import glob
import anndata
import re
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import scanpy as sc
import uuid
import sys
import gc
import random
import torch

import warnings
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Record the Scyan version in the notebook output
print(sy.__version__)

# Set scanpy's n_jobs setting to 12
sc.settings.n_jobs=12

In [None]:

# Define the working parameters

batch_ref = "B237"
batch_new = "B175"
panel = "PL1"
qda_version = 'v3.2'
panel_version = 'v2'
# Prefix shared by the output file names written by this notebook
proj_name = 'EL_flow_label_pred_scyan_' + panel + '_' + batch_ref + '_' + batch_new + '_'

# Paths and files inside repository
base_path = '/home/workspace/lilly-pd1-analysis/03_flow/'

fig_path = base_path + '02-scyan-predictions/results/' + panel + '/'

# Knowledge table; the first two CSV columns become a two-level row index
know_tb_file = base_path + 'knowledge_tables/Lilly_flow_population_knowledge_table_' + panel + '_' + panel_version + '.csv'
know_tb = pd.read_csv(know_tb_file, index_col=[0, 1])

panel_file = base_path + 'panels/AIFI_flow_' + panel + '_panel_breakdown.csv'
panel_meta = pd.read_csv(panel_file)

output_path = base_path + '02-scyan-predictions/output/'

# Paths and files imported from HISE
input_path = '/home/workspace/input/2506117363/PD-1/'

# Each HISE input is found with a recursive glob and later indexed with [0].
# Fail here, naming what is missing, instead of with an IndexError further on.
meta_file = glob.glob(os.path.join(input_path, '**', 'Lilly_flow_cytometry_sample_metadata_all_batches_updated.csv'), recursive=True)
if not meta_file:
    raise FileNotFoundError("No sample metadata CSV found under " + input_path)
meta_data = pd.read_csv(meta_file[0], index_col=0)

adata_file_pattern_ref = "flow/scyan/preprocess/EL_flow_label_pred_scyan_" + panel + "_" + batch_ref + "_processed_adata.h5ad"
adata_file_ref = glob.glob(os.path.join(input_path, '**', adata_file_pattern_ref), recursive=True)
if not adata_file_ref:
    raise FileNotFoundError("No file matching " + adata_file_pattern_ref + " under " + input_path)
# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(adata_file_ref)

adata_file_pattern_new = "flow/scyan/preprocess/EL_flow_label_pred_scyan_" + panel + "_" + batch_new + "_processed_adata.h5ad"
adata_file_new = glob.glob(os.path.join(input_path, '**', adata_file_pattern_new), recursive=True)
if not adata_file_new:
    raise FileNotFoundError("No file matching " + adata_file_pattern_new + " under " + input_path)
adata_file_new


In [None]:

# Function for importing data
def importAData(adata_path, batch, file_suffix='_processed_adata.h5ad', panel='PL1'):
    """Load the preprocessed AnnData file of one batch.

    The file read is
    ``<adata_path>/EL_flow_label_pred_scyan_<panel>_<batch><file_suffix>``.

    The knowledge table is deliberately not read here. Padding a local copy
    of it with NaN columns had no effect outside this function, and the
    marker checks that followed were evaluated but never used.

    Parameters
    ----------
    adata_path : str
        Directory that contains the ``.h5ad`` file.
    batch : str
        Batch identifier used in the file name.
    file_suffix : str
        Trailing part of the file name, extension included.
    panel : str
        Panel identifier used in the file name.

    Returns
    -------
    The object returned by ``sc.read_h5ad`` for that file.
    """
    adata_file = f"{adata_path}/EL_flow_label_pred_scyan_{panel}_{batch}{file_suffix}"
    return sc.read_h5ad(adata_file)


In [None]:

# One seed for every random number generator used in this notebook
seed = 42

# Python's random module, NumPy, then PyTorch on CPU and on all GPUs
# (the PyTorch seeds matter only if Scyan uses PyTorch internally)
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

# Ensure deterministic behavior in PyTorch's cuDNN backend
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False


## Import preprocessed data

In [None]:

# Import new and ref batches
# NOTE(review): the names look crossed. `adata_new` is loaded from the
# `batch_ref` file and `adata_ref` from the `batch_new` file, so the
# 'batch_ref' label given at concatenation (and used as the reference for
# batch correction) points at `batch_new` data. The "# odd" / "# ref"
# comments suggest this may be intentional; confirm which batch should be
# the reference before changing anything.
adata_new = importAData(os.path.dirname(adata_file_ref[0]), batch_ref) # odd
adata_ref = importAData(os.path.dirname(adata_file_new[0]), batch_new) # ref



In [None]:

# Number of rows in `.obs` of the object named `adata_new`
print(len(adata_new.obs))


In [None]:

# Number of rows in `.obs` of the object named `adata_ref`
print(len(adata_ref.obs))


### Scale data

In [None]:

# Scale data
# Each batch is scaled on its own, before the two are concatenated.
sy.preprocess.scale(adata_new)
sy.preprocess.scale(adata_ref)


### Concatenate data

In [None]:

# Stack both objects. `join='inner'` keeps only the variables they share, and
# the new `batch` column in `.obs` is 'batch_ref' for rows from `adata_ref`
# and 'batch_new' for rows from `adata_new` (keys follow the list order).
adata = anndata.concat([adata_ref, adata_new], join='inner', label='batch', keys=['batch_ref', 'batch_new'])
adata


In [None]:

# Total number of observations after concatenation
print(adata.n_obs)


## Scyan predictions

### Build model

In [None]:

# Build the model from the concatenated data and the knowledge table, using
# the `batch` column created at concatenation as the batch key
model = sy.Scyan(adata, know_tb, batch_key='batch')


In [None]:

# When training for the first time on a new dataset, Scyan advises starting
# from the default arguments (i.e., without 'prior_std' and 'lr'). Neither is
# passed here; only `patience` and `min_delta` are set.
model.fit(patience=50, min_delta=0.1)


### Save model

In [None]:

# Write the fitted model next to the other outputs
model.save(f"{output_path}{proj_name}model.pt")


### Batch correction

In [None]:

# Correct batch effect using reference batch
# 'batch_ref' here is the label assigned in the `batch` column at
# concatenation, not the value of the `batch_ref` variable. The corrected
# values are stored under `adata.obsm["scyan_corrected"]`.
adata.obsm["scyan_corrected"] = model.batch_effect_correction(batch_ref='batch_ref').numpy(force=True)


### Predict cell types

In [None]:

# Predict
# The return value is not used; later cells read the `scyan_pop` and
# `scyan_pop_l1_labels` columns of `adata.obs` (presumably written by this
# call; confirm against the Scyan documentation).
model.predict()
adata


In [None]:

# Rename NaN predictions to "unknown" in both label columns
for label_col in ('scyan_pop', 'scyan_pop_l1_labels'):
    labels = adata.obs[label_col]
    # add_categories raises ValueError for a category that already exists,
    # which made re-running this cell fail; only add it when it is missing.
    if 'unknown' not in labels.cat.categories:
        labels = labels.cat.add_categories('unknown')
    adata.obs[label_col] = labels.fillna('unknown')
adata.obs.head()


In [None]:
# Create a new column "unknown" that is 'unknown' where `scyan_pop` is
# 'unknown' and missing everywhere else.
# np.where must settle on a single dtype for its two branches, so pairing the
# string with np.nan does not keep NaN as a missing value (where the promotion
# is allowed it becomes the text 'nan'). Pairing it with None keeps an object
# array, and the categorical then records None as missing.
is_unknown = (adata.obs['scyan_pop'] == 'unknown').to_numpy()
adata.obs['unknown'] = pd.Categorical(
    np.where(is_unknown, 'unknown', None),
    categories=['unknown'],
)
adata.obs.head()

## Visualize predictions

### UMAP

In [None]:

# Antigens flagged 'YES' in the panel file's `used_for_cyanno` column that
# are also variables of `adata`
panel_meta = pd.read_csv(panel_file)
is_gating = panel_meta['used_for_cyanno'] == 'YES'
is_in_adata = panel_meta['antigen'].isin(adata.var_names)
gating_antigens = panel_meta.loc[is_gating & is_in_adata, 'antigen'].tolist()
gating_antigens


In [None]:

# Compute the UMAP from the batch-corrected values stored in
# `obsm['scyan_corrected']`, restricted to the gating antigens
sy.tools.umap(adata, obsm='scyan_corrected', markers=gating_antigens)


In [None]:
# Each call below rebinds `p1`. The `savefig` calls are commented out, so no
# figure is written to disk; `fig_path` is also never created in this notebook.

# site information
p1=sy.plot.umap(adata, color=['cohort','batch','Visit', 'subject'],ncols=2, wspace=0.5,show=False, return_fig=True)
# p1.savefig(fig_path+proj_name +  "cohort_umap.png",  bbox_inches='tight')


# one panel per variable, in sorted name order
p1=sy.plot.umap(adata, color=adata.var_names.sort_values(),ncols=6, show=False,   return_fig=True)
# p1.savefig(fig_path+proj_name+'expression_umap.png')

# `scyan_pop` labels (the save name calls these the L2 prediction)
p1=sy.plot.umap(adata, color=["scyan_pop"], ncols=1,  show=False, return_fig=True)
# p1.savefig(fig_path+proj_name+'scyan_pop_l2_prediction_umap.png', bbox_inches='tight')

# level 1 labels
p1=sy.plot.umap(adata, color=["scyan_pop_l1_labels"], ncols=1, show=False, return_fig=True)
# p1.savefig(fig_path+proj_name+'scyan_pop_l1_prediction_umap.png', bbox_inches='tight')

# the `unknown` flag column created after prediction
p1=sy.plot.umap(adata, color=["unknown"], ncols=1, show=False, return_fig=True)
# p1.savefig(fig_path+proj_name+'scyan_pop_l1_unknown_umap.png', bbox_inches='tight')

### Percentages

In [None]:

# Population percentages per cohort and per subject, first for the level 1
# labels and then for the `scyan_pop` labels
for group_col in ('cohort', 'subject'):
    for label_key in ('scyan_pop_l1_labels', 'scyan_pop'):
        sy.plot.pop_percentage(adata, groupby=group_col, key=label_key)


## Output frequency tables

In [None]:

# Calculate the number of cells (rows of `adata.obs`) for each sample
per_sample_sizes = adata.obs.groupby(['sample_id']).size()
cell_counts = per_sample_sizes.reset_index(name='cell_numbers')

cell_counts.head()

counts_csv = f"{output_path}{proj_name}total_cell_counts_v2.csv"
cell_counts.to_csv(counts_csv)


In [None]:

# Frequency table of the L1 prediction labels per sample
# (the L2 table is built in a later cell)
l1_freq_table = sy.tools.cell_type_ratios(adata, groupby='sample_id',normalize=True, key='scyan_pop_l1_labels', among=None)
l1_freq_table.head()


In [None]:

# Preview the sample metadata before it is merged into the frequency table
meta_data.head()


In [None]:

# merge basic metadata
# Left merge: every row of the frequency table is kept. The table's
# `sample_id` is matched against the index of `meta_data`.
# NOTE(review): `sample_id` is presumably an index level of `l1_freq_table`
# (it was the `groupby` key); confirm. The commented-out lines are an
# earlier variant keyed on `AIFI_barcodes`.
# l1_freq_table = l1_freq_table.merge(meta_data, how='left', 
#                                     left_index=True, 
#                                     right_on='AIFI_barcodes')
l1_freq_table = l1_freq_table.merge(meta_data, how='left', left_on='sample_id', right_index=True)
l1_freq_table.head()


In [None]:

# l1_freq_table.rename(columns={'AIFI_barcodes': 'sample_id'}, inplace=True)

# Write the L1 frequency table, with the merged metadata, to the output folder
l1_freq_table.to_csv(output_path + proj_name +  'l1_prediction_cell_frequency_v2.csv')


In [None]:

# Find the sample kits measured at least twice: count the non-missing
# 'b_cells ratio' entries per kit, then keep the rows of those kits
kit_counts = l1_freq_table.groupby(['sample_kit'])['b_cells ratio'].count()
reap_kits = kit_counts.index[kit_counts >= 2]
is_repeated = l1_freq_table['sample_kit'].isin(reap_kits)
l1_freq_table_rep = l1_freq_table[is_repeated]
l1_freq_table_rep.head()


In [None]:

# Frequency table of the L2 prediction labels (`scyan_pop`) per sample
l2_freq_table = sy.tools.cell_type_ratios(adata, groupby='sample_id', normalize=True, key='scyan_pop', among=None)

# add the sample metadata, using the same left merge as for the L1 table
l2_freq_table = l2_freq_table.merge(meta_data, how='left', left_on='sample_id', right_index=True)

l2_freq_table.to_csv(output_path + proj_name + 'l2_prediction_cell_frequency_v2.csv')


## Save prediction AnnData object

In [None]:

# Save the AnnData object, now carrying the predictions, the corrected
# values in `obsm` and the `unknown` flag, to the output folder
adata.write_h5ad(output_path + proj_name + "predicted_adata.h5ad")


In [None]:

# Final look at the per-cell annotations
adata.obs.head()


In [None]:
# Number of entries in the `scyan_pop` column
print(len(adata.obs["scyan_pop"]))

In [None]:
# Number of rows labelled "neutrophils"
print(sum(adata.obs["scyan_pop"] == "neutrophils"))

## Upload to HISE

In [None]:

# Switch to minimal Python kernel
import hisepy
import os
import glob

# Paths are redefined because this cell runs in a different kernel
base_path = '/home/workspace/lilly-pd1-analysis/03_flow/'
output_path = base_path + '02-scyan-predictions/output/'

# Read back the input file IDs saved during setup, one per line
with open(output_path + "in_fids.txt") as f:
    in_fids = [line.rstrip('\n') for line in f]

in_fids


In [None]:

# Collect the regular files in the output folder, skipping the saved
# input-ID list and anything whose name ends with ".ipynb_checkpoints"
candidate_paths = (os.path.join(output_path, entry) for entry in os.listdir(output_path))
outputs = [
    path
    for path in candidate_paths
    if os.path.isfile(path)
    and not path.endswith(("in_fids.txt", ".ipynb_checkpoints"))
]
outputs


In [None]:
# Display the study spaces; the upload below hard-codes a study space ID
hisepy.get_study_spaces()

In [None]:

# `batch` and `panel` are retyped here because this cell runs in a different
# kernel. NOTE(review): keep them in step with `batch_ref`, `batch_new` and
# `panel` used for the predictions.
batch = "B237_B175"
panel = "PL1"

# Upload every collected output file, recording the saved input file IDs
hisepy.upload_files(
    files = outputs,
    study_space_id = 'cea64a3f-6050-4b24-960c-bbda4dd9a2ee',
    title = 'Lilly Flow Scyan Predictions, ' + panel + ' ' + batch,
    input_file_ids = in_fids,
    destination = 'flow/scyan/predict'
)
