# Preprocess CD45+ data for Scyan

Flow data preparation for Scyan cell type labeling. 

Start from CD45+ datasets per batch from QDA viability calling. 

1) Create the Python AnnData object and format data
2) Identify and remove autofluorescent and other infiltrating cells/debris  
   a) Identify poorly clustered/autofluorescent infiltrants using UMAP  
   b) Set gating thresholds (CXCR5, TCRva72, SSC-A, FSC-A) to exclude infiltrants  
   c) Verify gates remove infiltrants
3) Identify samples with poor quality data and remove (if indicated)
4) Transform and save data without infiltrants and poor quality samples

## Setup

In [None]:

# Start in minimal Python kernel
import hisepy
import os


In [None]:

# Import Lilly flow notes metadata
meta_fid = "9dadc265-cf58-4a79-b42e-69c135c794bd"
fres = hisepy.read_files([meta_fid])


In [None]:

# Locate this batch's CD45+ QDA files in the PD-1 project store
panel = "PL1"
batch = "B237"
qda_version = "v3.2"

ps = hisepy.list_files_in_project_store("PD-1")

# Narrow the listing step by step: a file is kept only if its name
# contains every one of these fragments.
for name_fragment in (batch, panel, 'flow/qda/', 'allsamples'):
    ps = ps[ps['name'].str.contains(name_fragment)]

ps


In [None]:
ps_fid = ps['id'].to_list()

In [None]:

fres = hisepy.read_files(ps_fid)


In [None]:

# All HISE file IDs used as inputs: the CD45+ files followed by the metadata file
in_fids = list(map(str, ps_fid))
in_fids.append(meta_fid)
in_fids


In [None]:

# Write the input file IDs to disk (one per line); the upload cell at the end
# of the notebook reads them back from this file.
base_path = '/home/workspace/lilly-pd1-analysis/03_flow/'
output_path = base_path + '02-scyan-predictions/output/'
os.makedirs(output_path, exist_ok=True)

with open(output_path + "in_fids.txt", "w") as f:
    f.writelines(str(fid) + "\n" for fid in in_fids)


In [None]:

# Switch to Python scyan kernel
import scyan as sy
import os
import glob
import anndata
import re
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import scanpy as sc
import uuid
import sys
import gc
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

print(sy.__version__)

sc.settings.n_jobs=12


In [None]:

# Define the working parameters

batch = "B237"
panel = "PL1"
qda_version = 'v3.2'
panel_version = 'v2'
# Prefix shared by every file this notebook writes for the panel/batch
proj_name = 'EL_flow_label_pred_scyan_' + panel + '_' + batch + '_'

base_path = '/home/workspace/lilly-pd1-analysis/03_flow/'

fig_path = base_path + '02-scyan-predictions/results/' + panel + '/'

# Population knowledge table; the first two CSV columns form its index
know_tb_file = base_path + 'knowledge_tables/Lilly_flow_population_knowledge_table_' + panel + '_' + panel_version + '.csv'
know_tb = pd.read_csv(know_tb_file, index_col=[0, 1])

# Panel breakdown (FCS column name / antigen per channel)
panel_file = base_path + 'panels/AIFI_flow_' + panel + '_panel_breakdown.csv'
panel_meta = pd.read_csv(panel_file)

output_path = base_path + '02-scyan-predictions/output/'

# Paths and files imported from HISE
input_path = '/home/workspace/input/2506117363/PD-1/'

# Sample metadata: searched recursively because the cache sub-folder is not fixed here
meta_file_name = 'Lilly_flow_cytometry_sample_metadata_all_batches_updated.csv'
meta_file = glob.glob(os.path.join(input_path, '**', meta_file_name), recursive=True)
if not meta_file:
    # Fail with an explicit message rather than an IndexError on meta_file[0]
    raise FileNotFoundError(
        f"Could not find '{meta_file_name}' under '{input_path}'; "
        "was the metadata file downloaded in the setup cells?"
    )
meta_data = pd.read_csv(meta_file[0],index_col=0)

# CD45+ events for this batch/panel/QDA version (list of matching paths)
panel_cd45_file_pattern = "flow/qda/" + batch + "_" + panel+ "_" + qda_version + "_CD45+_predicted_allsamples.csv"
panel_cd45_file = glob.glob(os.path.join(input_path, '**', panel_cd45_file_pattern), recursive=True)
panel_cd45_file


In [None]:
def generate_uuids(n):
    """Return a list of ``n`` newly generated ``uuid.uuid1()`` values as strings."""
    barcodes = []
    for _ in range(n):
        barcodes.append(str(uuid.uuid1()))
    return barcodes

In [None]:

# Function for plotting marker expression data
def plotBivariateMarkers(adata, marker_x, marker_y, populations=None, palette=None, key='scyan_pop'):
    """Scatter-plot two markers against each other, coloured by an ``obs`` column.

    Parameters
    ----------
    adata
        Annotated data object. It is copied before anything else happens, so
        the relabelling done here does not touch the caller's object.
    marker_x, marker_y
        Names forwarded to ``sc.pl.scatter`` as the x and y axes.
    populations
        Optional label (or iterable of labels) from ``adata.obs[key]`` to
        highlight. Every other label is collapsed into ``'Other'``. The order
        given here becomes the category order, with ``'Other'`` appended
        unless it was listed explicitly. When ``None``, ``key`` is plotted
        unchanged.
    palette
        Palette forwarded to ``sc.pl.scatter``; defaults to
        ``sns.color_palette()``.
    key
        Column of ``adata.obs`` used for colouring.

    Returns
    -------
    None
    """
    temp = adata.copy()

    if palette is None:
        palette = sns.color_palette()

    if populations is None:
        sc.pl.scatter(temp, x=marker_x, y=marker_y, color=key, palette=palette)
        return

    # A bare string would otherwise be iterated character by character
    if isinstance(populations, str):
        populations = [populations]

    # Build the category list from any iterable (list, tuple, ...) and drop
    # repeats: pd.Categorical rejects duplicated categories, which previously
    # happened whenever 'Other' was already part of `populations`.
    categories = list(dict.fromkeys(populations))
    if 'Other' not in categories:
        categories.append('Other')

    temp.obs[key] = temp.obs[key].apply(
        lambda label: label if label in categories else 'Other'
    )
    temp.obs["pop_ordered"] = pd.Categorical(
        values=temp.obs[key], categories=categories, ordered=True
    )
    sc.pl.scatter(temp, x=marker_x, y=marker_y, color='pop_ordered', palette=palette)
    


## Create AnnData object

### Import CD45+ data

In [None]:

adata = sy.read_csv(panel_cd45_file[0],marker_regex='cd|hla|tcr|ig|ccr|cx3cr1')
print(f"Created anndata object with {adata.n_obs} cells and {adata.n_vars} markers.\n\n-> The markers names are: {', '.join(adata.var_names)}\n-> The non-marker names are: {', '.join(adata.obs.columns)}")


### Format marker names and metadata

In [None]:

# name variables based on panel data
adata.var


In [None]:

# Restrict the panel sheet to rows whose FCS column name starts with "FJComp-"
is_marker_row = panel_meta['fcs_colname'].str.startswith("FJComp-")
marker_panel = panel_meta[is_marker_row]

fcs_colname_list = marker_panel['fcs_colname'].tolist()
antigen_list = marker_panel['antigen'].tolist()

# Keys of the mapping have the form "<fcs_colname>_<antigen>"
combined_values = [
    "{}_{}".format(fcs, antigen)
    for fcs, antigen in zip(fcs_colname_list, antigen_list)
]

# Mapping from the combined name to the bare antigen name
name_mapping = dict(zip(combined_values, antigen_list))
name_mapping


In [None]:

# Rename the markers to their antigen names using the mapping dictionary.
# Index.map() with a dict returns NaN for every name that has no entry, which
# would leave markers without a usable name; keep the original name for those
# and report them so the panel sheet can be checked.
unmapped_markers = [name for name in adata.var_names if name not in name_mapping]
if unmapped_markers:
    print(f"Warning: no antigen mapping for {', '.join(map(str, unmapped_markers))}; keeping original names.")

adata.var_names = pd.Index([name_mapping.get(name, name) for name in adata.var_names])

print(f"Created anndata object with {adata.n_obs} cells and {adata.n_vars} markers.\n\n-> The markers names are: {', '.join(adata.var_names)}\n-> The non-marker names are: {', '.join(adata.obs.columns)}")


In [None]:

# filter to relevant metadata
meta_data = meta_data[meta_data['batch'] == batch]
meta_data.head()


In [None]:

# keep only the metadata columns needed downstream (no merge happens in this cell)
meta_data = meta_data[['subject', "cohort", "sample_kit", "Visit", "disease_status", "Notes"]]
meta_data.head()


In [None]:

# check if all the samples are in the metadata
all(adata.obs['sample_id'].isin(meta_data.index))


In [None]:
adata.obs['sample_id'][~adata.obs['sample_id'].isin(meta_data.index)].unique()

In [None]:

# unique sample IDs
adata.obs['sample_id'].unique()


In [None]:

# sample IDs by cohort
meta_data.loc[adata.obs['sample_id'].unique(), :].groupby(['cohort']).size()


### Merge with metadata

In [None]:

# Add sample-level metadata to every cell: left join from the cells' sample_id
# onto the metadata index. validate='many_to_one' makes pandas raise if a
# sample ID occurs more than once in the metadata index, which would otherwise
# duplicate cell rows in the merged table.
adata.obs = adata.obs.merge(
    meta_data,
    how='left',
    left_on='sample_id',
    right_index=True,
    validate='many_to_one',
)

#check if any missing fields in sample ID
adata.obs.sample_id.isnull().any()


In [None]:

# preview data
adata.obs.head()


In [None]:

# add unique barcodes
adata.obs['barcodes'] = generate_uuids(adata.n_obs)
adata.obs.head()


### Remove improperly mixed samples

In [None]:

# remove improperly mixed samples
samples_to_remove = meta_data[meta_data['Notes'] == "Improperly Mixed Test Sample"]["sample_kit"]

len(samples_to_remove)



In [None]:

adata.obs['sample_kit'] = adata.obs['sample_kit'].astype(str)

# The obs column was just cast to str, so compare against string kit IDs as
# well; otherwise non-string kit IDs in the metadata would never match.
# Missing kit IDs are dropped first so they cannot match the literal 'nan'.
kits_to_drop = samples_to_remove.dropna().astype(str)

# True for cells whose kit is NOT flagged for removal
condition = ~adata.obs['sample_kit'].isin(kits_to_drop)

# Keep only those cells. copy() turns the subset into a standalone object,
# because later cells assign into adata.obs and a view is not meant for that.
adata = adata[condition].copy()
len(adata.obs)


In [None]:

#check
adata.obs['sample_kit'].isin(samples_to_remove).any()


In [None]:

# adata.obs["Notes"] = "Mixed"
adata.obs.columns


## Save initial AnnData object

In [None]:

adata.obs["Notes"] = adata.obs["Notes"].fillna("").astype(str)


In [None]:

adata.write_h5ad(output_path + proj_name + 'raw_adata.h5ad')


## Visualize initial data (UMAP)

In [None]:

adata = sc.read_h5ad(output_path + proj_name + "raw_adata.h5ad")


In [None]:

# Antigens marked used_for_cyanno == 'YES' in the panel sheet that are also
# present among the marker names of adata
flagged_for_cyanno = panel_meta['used_for_cyanno'] == 'YES'
present_in_adata = panel_meta['antigen'].isin(adata.var_names)
gating_antigens = panel_meta.loc[flagged_for_cyanno & present_in_adata, 'antigen'].tolist()
gating_antigens


In [None]:

# Save raw data for transformation after QC
adata.layers['raw'] = adata.X.copy()


In [None]:

# Initial transformation for UMAP
sy.preprocess.auto_logicle_transform(adata)
adata.X


In [None]:

sy.tools.umap(adata, markers=gating_antigens)


In [None]:

# UMAP coloured by sample-level metadata (2 panels per row)
p1=sy.plot.umap(adata, color=['cohort','batch','subject','Visit'],  ncols=2, wspace=0.4,show=False, return_fig=True)

# UMAP coloured by individual markers, one figure per marker
p1=sy.plot.umap(adata, color=['CD185 (CXCR5)'], show=False, return_fig=True)

markers_to_plot = [
    'TCR va72', 'CD279 (PD-1)',
    'IgA', 'IgD', 'IgG', 'IgM',
    'CD16', 'CD4', 'CD8', 'CD3', 'CD15', 'CD14',
]
for marker in markers_to_plot:
    p1=sy.plot.umap(adata, color=[marker], ncols=3, show=False, return_fig=True)


In [None]:

adata.obsm['umap_initial'] = adata.obsm['X_umap'].copy()


## Assess data quality

### Identify and remove nonviable cells 

#### Set marker thresholds 

In [None]:

plotBivariateMarkers(adata, 'CD185 (CXCR5)', 'TCR va72', key = 'subject')


In [None]:

plotBivariateMarkers(adata, 'CD16', 'SSC-A', key = 'subject')


In [None]:

plotBivariateMarkers(adata, 'FSC-A', 'SSC-A', key = 'subject')


In [None]:

# Upper limits per channel; an event above any of them is excluded.
# The two marker limits are compared against adata.X, the scatter limits
# against the SSC-A / FSC-A columns of adata.obs.
thresholds = {
    "CD185 (CXCR5)": 0.55,
    "TCR va72": 0.5,
    "SSC-A": 4000000,
    "FSC-A": 4000000,
}

# One boolean flag per criterion (True = limit exceeded)
cxcr5_high = (adata[:, "CD185 (CXCR5)"].X > thresholds["CD185 (CXCR5)"]).flatten()
tcr_high = (adata[:, "TCR va72"].X > thresholds["TCR va72"]).flatten()
ssc_high = adata.obs["SSC-A"] > thresholds["SSC-A"]
fsc_high = adata.obs["FSC-A"] > thresholds["FSC-A"]

# Cells to keep: none of the limits exceeded.
# NOTE: no lower FSC-A bound is applied here.
mask = ~(cxcr5_high | tcr_high | ssc_high | fsc_high)



In [None]:

# Create masks for each combination of exclusion criteria (for visualization)
cxcr5_high = (adata[:, "CD185 (CXCR5)"].X > thresholds["CD185 (CXCR5)"]).flatten()
tcr_high = (adata[:, "TCR va72"].X > thresholds["TCR va72"]).flatten()
ssc_high = adata.obs["SSC-A"] > thresholds["SSC-A"]

# Either marker above its limit
marker_high = cxcr5_high | tcr_high

# Either marker OR SSC-A above its limit
mask1 = marker_high | ssc_high

# Either marker above its limit while SSC-A is within its limit
mask2 = marker_high & ~ssc_high

# Both markers within their limits while SSC-A is above its limit
mask3 = ~marker_high & ssc_high


#### Initial filtering and assessment

In [None]:

adata_filtered = adata[mask].copy()


In [None]:

# Look at initial vs filtered data UMAP 
print('Original data')
p1=sy.plot.umap(adata, color=["subject"], ncols=1, show=False, return_fig=True)


In [None]:

print('CXCR5 high + TCRva72 high + SSC-A high')
p1=sy.plot.umap(adata[~mask1], color=["subject"], ncols=1, show=False, return_fig=True)



In [None]:

print('CXCR5 high + TCRva72 high + SSC-A normal')
p1=sy.plot.umap(adata[~mask2], color=["subject"], ncols=1, show=False, return_fig=True)



In [None]:

print('CXCR5 normal + TCRva72 normal + SSC-A high')
p1=sy.plot.umap(adata[~mask3], color=["subject"], ncols=1, show=False, return_fig=True)



In [None]:

print('All criteria applied')
p1=sy.plot.umap(adata[mask], color=["subject"], ncols=1, show=False, return_fig=True)


In [None]:

plotBivariateMarkers(adata_filtered, 'CD185 (CXCR5)', 'TCR va72', key = 'subject')


In [None]:

plotBivariateMarkers(adata_filtered, 'CD16', 'SSC-A', key = 'subject')


In [None]:

plotBivariateMarkers(adata_filtered, 'FSC-A', 'SSC-A', key = 'subject')


In [None]:

print(len(adata.obs))


In [None]:

print(len(adata_filtered.obs))


In [None]:

# Per-criterion flags (True = limit exceeded)
cxcr5_high = (adata[:, "CD185 (CXCR5)"].X > thresholds["CD185 (CXCR5)"]).flatten()
tcr_high = (adata[:, "TCR va72"].X > thresholds["TCR va72"]).flatten()
ssc_high = adata.obs["SSC-A"] > thresholds["SSC-A"]
fsc_high = adata.obs["FSC-A"] > thresholds["FSC-A"]
marker_high = cxcr5_high | tcr_high

# Mutually exclusive exclusion categories:
# a marker limit AND the SSC-A limit exceeded
all_three_conditions = marker_high & ssc_high
# a marker limit exceeded, SSC-A within limit
cxcr5_tcr_only = marker_high & ~ssc_high
# only the SSC-A limit exceeded
ssca_only = ~marker_high & ssc_high
# only the FSC-A limit exceeded
fsca_only = ~marker_high & ~ssc_high & fsc_high

# Count the number of cells for each condition
counts = {
    "All three conditions": all_three_conditions.sum(),
    "CXCR5 and TCR va72 only": cxcr5_tcr_only.sum(),
    "SSC-A only": ssca_only.sum(),
    "FSC-A only": fsca_only.sum(),
}

counts_table = pd.DataFrame(list(counts.items()), columns=["Condition", "Cell Count"])
counts_table


#### Metadata association with nonviable cells

In [None]:

# Per-criterion flags (True = limit exceeded)
cxcr5_high = (adata[:, "CD185 (CXCR5)"].X > thresholds["CD185 (CXCR5)"]).flatten()
tcr_high = (adata[:, "TCR va72"].X > thresholds["TCR va72"]).flatten()
ssc_high = adata.obs["SSC-A"] > thresholds["SSC-A"]
fsc_high = adata.obs["FSC-A"] > thresholds["FSC-A"]
marker_high = cxcr5_high | tcr_high

# Store each exclusion category as a boolean column on the cells
adata.obs["all_three_conditions"] = marker_high & ssc_high
adata.obs["cxcr5_tcr_only"] = marker_high & ~ssc_high
adata.obs["ssca_only"] = ~marker_high & ssc_high
adata.obs["fsca_only"] = ~marker_high & ~ssc_high & fsc_high

# Count flagged cells per sample (summing a boolean column counts its True values)
criteria_columns = ["all_three_conditions", "cxcr5_tcr_only", "ssca_only", "fsca_only"]
grouped_counts = (
    adata.obs.groupby(["sample_id"])[criteria_columns]
    .sum()
    .reset_index()
)

# Attach the sample-level metadata to the per-sample counts
grouped_counts = grouped_counts.merge(meta_data, how='left', left_on='sample_id', right_index=True)

# grouped_counts.to_csv(fig_path + batch + "_excluded_cells_sample_visit.csv", index=False)


In [None]:

# Set the sample_id as the index so it labels the bars. Guarded so the cell
# can be re-run: once sample_id is the index it is no longer a column.
if "sample_id" in grouped_counts.columns:
    grouped_counts = grouped_counts.set_index("sample_id")

# Stacked bar chart: one bar per sample, one segment per exclusion criterion
criteria_columns = ["all_three_conditions", "cxcr5_tcr_only", "ssca_only", "fsca_only"]
grouped_counts[criteria_columns].plot(
    kind="bar", stacked=True, figsize=(12, 6), color=["blue", "orange", "green", "red"]
)

# Add labels and title
plt.xlabel("Sample ID", fontsize=12)
plt.ylabel("Cell Count", fontsize=12)
plt.title("Contribution per sample to each exclusion criterion", fontsize=14)
plt.xticks(rotation=90)
plt.legend(title="Criteria", fontsize=10)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:

# Intended display order of the visits. `categories` must be this list of
# visit labels (passing the grouped_counts DataFrame itself is not a valid
# list of visit labels); any Visit value not listed here becomes NaN.
visit_order = ['Bloodworks', 'Lilly PD1 Baseline', 'Lilly PD1 W4', 'Lilly PD1 W8', 'Lilly PD1 W12', 'Lilly PD1 W24']
grouped_counts["Visit"] = pd.Categorical(grouped_counts["Visit"], categories=visit_order, ordered=True)


In [None]:

# Number of flagged cells per visit for each exclusion criterion
criteria_columns = ["all_three_conditions", "cxcr5_tcr_only", "ssca_only", "fsca_only"]
visit_composition = (
    adata.obs.groupby("Visit")[criteria_columns]
    .sum()
    .reset_index()
)

# Stacked bar chart: one bar per visit, one segment per criterion
visit_composition.set_index("Visit").plot(
    kind="bar", stacked=True, figsize=(10, 6), color=["blue", "orange", "green", "red"]
)

# Add labels and title
plt.xlabel("Visit", fontsize=12)
plt.ylabel("Cell Count", fontsize=12)
plt.title("Contribution per visit to each exclusion criterion", fontsize=14)
plt.legend(title="Criteria", fontsize=10)
plt.xticks(rotation=45)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Total number of cells per Visit in the unfiltered adata object
total_counts = adata.obs["Visit"].value_counts()

# Align the totals with the row order of visit_composition (0 where a visit has no cells)
visit_order = visit_composition["Visit"].unique()
total_counts = total_counts.reindex(visit_order, fill_value=0)
visit_composition["Total"] = total_counts.values

# Bar geometry: one slot per visit, criteria bars left of centre, total bar right of centre
x = np.arange(len(visit_composition["Visit"]))
width = 0.35

fig, ax1 = plt.subplots(figsize=(12, 6))

# Stack the criteria on the primary y-axis; stack_base carries the running
# height so each segment starts where the previous one ended.
criteria_bars = [
    ("all_three_conditions", "All Three Conditions", "blue"),
    ("cxcr5_tcr_only", "CXCR5 and TCR va72 Only", "orange"),
    ("ssca_only", "SSC-A Only", "green"),
    ("fsca_only", "FSC-A Only", "red"),
]
stack_base = np.zeros(len(visit_composition))
for column, label, color in criteria_bars:
    heights = visit_composition[column].to_numpy()
    ax1.bar(x - width / 2, heights, width, bottom=stack_base, label=label, color=color)
    stack_base = stack_base + heights

# Labels and title for the primary y-axis
ax1.set_xlabel("Visit", fontsize=12)
ax1.set_ylabel("Cell Count (Criteria)", fontsize=12)
ax1.set_title("Contribution per visit to each exclusion criterion, compared with total visit composition of data (dual axes)", fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(visit_composition["Visit"], rotation=45)
ax1.legend(loc="upper left", title="Criteria", fontsize=10)

# Secondary y-axis for the total bar, so it has its own scale
ax2 = ax1.twinx()
ax2.bar(x + width / 2, visit_composition["Total"], width, label="Total (Original Data)", color="gray", alpha=0.7)

# Labels and legend for the secondary y-axis
ax2.set_ylabel("Cell Count (Total)", fontsize=12)
ax2.legend(loc="upper right", title="Total", fontsize=10)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:

# Aggregate by Cohort
cohort_counts = grouped_counts.groupby("cohort")[["all_three_conditions", "cxcr5_tcr_only", "ssca_only", "fsca_only"]].sum()

# Plot a stacked bar chart
cohort_counts.plot(
    kind="bar", stacked=True, figsize=(10, 6), color=["blue", "orange", "green", "red"]
)

# Add labels and title
plt.xlabel("Cohort", fontsize=12)
plt.ylabel("Cell Count", fontsize=12)
plt.title("Contribution per cohort to each exclusion criterion", fontsize=14)
plt.legend(title="Criteria", fontsize=10)

# Show the plot
plt.tight_layout()
plt.show()


#### Final filtering of nonviable cells

In [None]:

# Revert to raw data that can be re-transformed without weird data
adata_filtered.X = adata_filtered.layers['raw'].copy()


In [None]:

# Remove layers for sake of space
del adata
del adata_filtered.layers['raw']

gc.collect()


### Identify and remove nonviable samples

In [None]:

print(len(adata_filtered.obs))


In [None]:

# Number of cells per sample after filtering.
# value_counts() tallies all samples in a single vectorised pass; the earlier
# per-sample `sum(obs == sample)` rescanned every cell once per sample using
# Python's built-in sum.
sample_ids = list(adata_filtered.obs['sample_id'].unique())
cells_per_sample = adata_filtered.obs['sample_id'].value_counts()

counts_df = pd.DataFrame({
    'sample_id': sample_ids,
    # reindex keeps the order of first appearance; a missing (NaN) sample_id gets 0
    'count': cells_per_sample.reindex(sample_ids, fill_value=0).to_numpy(),
})

# Same information as a list of {'sample_id', 'count'} records
counts = counts_df.to_dict('records')

print(counts_df)


In [None]:

# Define the coloring logic
def assign_color(cell_count):
    """Map a per-sample cell count to a traffic-light colour name.

    'red'    : more than 2M or fewer than 50K cells
    'yellow' : above 1M up to and including 2M, or from 50K up to (not including) 200K
    'green'  : everything else
    """
    if cell_count < 5e4 or cell_count > 2e6:
        return 'red'

    low_warning = 5e4 <= cell_count < 2e5
    high_warning = 1e6 < cell_count <= 2e6
    if low_warning or high_warning:
        return 'yellow'

    return 'green'

# Apply the coloring logic to the DataFrame
counts_df['color'] = counts_df['count'].apply(assign_color)

# Plot the bar chart with colored bars
plt.figure(figsize=(12, 6))
bars = plt.bar(counts_df['sample_id'], counts_df['count'], color=counts_df['color'])

# Add labels and title
plt.xlabel('Sample ID', fontsize=12)
plt.ylabel('Cell Numbers', fontsize=12)
plt.title('Sample Counts', fontsize=14)
plt.xticks(rotation=90, ha='center')

# Add a legend for the colors
legend_labels = {'red': '> 2M or < 50K', 'yellow': '1M-2M or 50K-200K', 'green': 'Other'}
handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in legend_labels.keys()]
plt.legend(handles, legend_labels.values(), title="Cell Count Range", fontsize=10)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:

# If samples exist with concerning counts, look at sample viability:
# one FSC-A vs SSC-A scatter plot per sample.
for sample in adata_filtered.obs['sample_id'].unique():
    print(sample)
    # No explicit .copy() here: plotBivariateMarkers copies its input itself,
    # so copying the subset first would only duplicate the data a second time.
    temp = adata_filtered[adata_filtered.obs['sample_id'] == sample]
    plotBivariateMarkers(temp, 'FSC-A', 'SSC-A', key = 'sample_id')
    

In [None]:

# If needed, remove 'problem' samples (to be conservative, only choose those with low counts and obviously abnormal MFI expression; more can always be removed in/after scyan prediction step)
kits_to_exclude = ['']

adata_filtered_2 = adata_filtered[~adata_filtered.obs['sample_kit'].isin(kits_to_exclude)].copy()


In [None]:

print(len(adata_filtered_2.obs))


In [None]:

# Look at initial vs filtered data UMAP
p1=sy.plot.umap(adata_filtered, color=["sample_id"], ncols=1, show=False, return_fig=True)

# Show the cells of the kits marked for exclusion, but only when there are
# any: with the default kits_to_exclude no cell matches and the subset is empty.
excluded_cells = adata_filtered.obs['sample_kit'].isin(kits_to_exclude)
if excluded_cells.any():
    p1=sy.plot.umap(adata_filtered[excluded_cells], color=["sample_id"], size=2, ncols=1, show=False, return_fig=True)
else:
    print("No cells belong to kits_to_exclude; skipping the excluded-sample UMAP.")


In [None]:

# If the new data looks good on UMAP, replace initial filtered adata object and remove second object for lower memory overhead
# adata_filtered = adata_filtered_2.copy()
del adata_filtered_2
gc.collect()


## Transform data

In [None]:

# Logicle transform on all data
sy.preprocess.auto_logicle_transform(adata_filtered)
adata_filtered.X


In [None]:

adata_filtered.X.shape


## Visualize preprocessed data (UMAP)

In [None]:

sy.tools.umap(adata_filtered, markers=gating_antigens)


In [None]:

p1=sy.plot.umap(adata_filtered, color=['cohort','batch','subject','Visit'],  ncols=2, wspace=0.4,show=False, return_fig=True)

for var in gating_antigens:
    p1=sy.plot.umap(adata_filtered, color=[var], show=False, return_fig=True)


In [None]:

# Store an independent copy of the embedding (same pattern as 'umap_initial'
# earlier in this notebook) so it does not share memory with X_umap.
adata_filtered.obsm['umap_processed'] = adata_filtered.obsm['X_umap'].copy()


## Save preprocessed AnnData object

In [None]:

adata_filtered.write_h5ad(output_path + proj_name + "processed_adata.h5ad")
 

## Upload to HISE

In [None]:

# Switch to minimal Python kernel
import hisepy
import os
import glob

base_path = '/home/workspace/lilly-pd1-analysis/03_flow/'
output_path = base_path + '02-scyan-predictions/output/'

with open(output_path + "in_fids.txt") as f:
    in_fids = [line.rstrip('\n') for line in f]

in_fids


In [None]:

# Files to upload: every regular file in output_path, excluding the input-FID
# list and anything whose path ends with ".ipynb_checkpoints"
candidate_paths = (os.path.join(output_path, entry) for entry in os.listdir(output_path))
outputs = [
    path
    for path in candidate_paths
    if not path.endswith("in_fids.txt")
    and not path.endswith(".ipynb_checkpoints")
    and os.path.isfile(path)
]
outputs


In [None]:
hisepy.get_study_spaces()

In [None]:

batch = "B237"
panel = "PL1"

hisepy.upload_files(
    files = outputs,
    study_space_id = 'cea64a3f-6050-4b24-960c-bbda4dd9a2ee',
    title = 'Lilly Flow Scyan Preprocessing, ' + panel + ' ' + batch, 
    input_file_ids = in_fids, 
    destination = 'flow/scyan/preprocess'
)
