In [1]:
#### ---------
#### Use the scanpy.yml environment
#### ---------
import os 
import pandas as pd

In [2]:
## load each panel's Leiden-clustered cells and label every cell with its annotated cell type
panels = ['immune','tumor','tracked']
panel_df_dict = {}

for panel in panels:

    # lookup table for this panel: Leiden cluster id -> refined cell type
    # (if a cluster id appears more than once in the annotation file, the last row wins)
    cluster_annotations = pd.read_csv(f'annotations/E06_{panel}_annotations.csv')
    cluster_to_cell_type = cluster_annotations.set_index('cluster')['cell_type_refined'].to_dict()

    # per-cell Leiden cluster assignments for this panel
    clustered_cells = pd.read_csv(f'data/E06_{panel}_indiv_leiden.csv')

    # translate each cell's cluster into a cell type; clusters absent from the lookup become NaN
    clustered_cells[f'cell_type_{panel}'] = clustered_cells['leiden'].map(cluster_to_cell_type)

    # keep the annotated frame, keyed by panel name
    panel_df_dict[panel] = clustered_cells

In [3]:
## merge immune and tumor cell types into the tracked dataframe
# keep only the join key and the cell type label from each single-panel table
immune = panel_df_dict['immune'][['CellID','cell_type_immune']]
tumor = panel_df_dict['tumor'][['CellID','cell_type_tumor']]

# Attach each tracked cell's immune-panel and tumor-panel cell type through the
# per-panel cell ids stored on the tracked table.
#  - how='inner' (the pandas default, made explicit here): tracked cells whose
#    immune_CellID / tumor_CellID has no match in the panel table are dropped.
#  - validate='many_to_one': pandas raises MergeError if a CellID occurs more than
#    once in a panel table, instead of silently duplicating tracked rows and
#    inflating the frequencies computed from this frame.
# NOTE(review): the right-hand 'CellID' key columns are kept in the result exactly
# as before (with _x/_y suffixes wherever the left frame already has a 'CellID').
tracked = pd.merge(
    panel_df_dict['tracked'], immune,
    left_on = 'immune_CellID', right_on = 'CellID',
    how = 'inner', validate = 'many_to_one',
)
tracked = pd.merge(
    tracked, tumor,
    left_on = 'tumor_CellID', right_on = 'CellID',
    how = 'inner', validate = 'many_to_one',
)

In [4]:
# reshape into the frequency table ggalluvial works with:
# one row per observed (Tumor, Combined, Immune) cell type combination
panel_labels = {
    'cell_type_tumor': 'Tumor',
    'cell_type_tracked': 'Combined',
    'cell_type_immune': 'Immune',
}
tracked = tracked[list(panel_labels)].rename(columns=panel_labels)

# count each combination, then expose the row number as the 'subject' id column
# (value_counts skips rows containing NaN by default and sorts by descending count)
tracked = (
    tracked
    .value_counts(['Tumor', 'Combined','Immune'])
    .rename('frequency')
    .reset_index()
    .rename_axis('subject')
    .reset_index()
)

# wide to long: one row per (subject, panel) carrying that panel's cell type
tracked_melt = tracked.melt(
    id_vars=['subject','frequency'],
    var_name='panel',
    value_name='cell_type',
)

In [6]:
# persist the long-format table as CSV
# NOTE(review): with the default index=True the row index is also written as an
# unnamed first column — confirm the downstream reader expects that
tracked_melt.to_csv(path_or_buf='data/E06_alluvial_input.csv')