In [2]:
# Repository root, given relative to the directory the notebook is run from.
REPO = '../'
# Data folders derived from REPO. REPO already ends with '/', so each path
# contains a doubled separator (e.g. '..//data/table').
RESULT_TABLE = f'{REPO}/data/table'        # CSV / Excel tables read and written below
RESULT_OBJ = f'{REPO}/data/object'         # pickled objects
FIGURE_FOLDER= f'{REPO}/data/figure'
SETTING_FOLDER = f'{REPO}/data/setting'
EXTERNAL_DATA=f'{REPO}/data/external'


import numpy as np
import pandas as pd
import warnings
from IPython.display import display
import pickle
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Load clinical data

In [3]:
# Cell annotation: one row per cell, indexed by the first CSV column.
obs = pd.read_csv(f'{RESULT_TABLE}/GEX_OBS_Lineage.csv',index_col=0)
## add celltype: start from the lineage label, then overwrite it with the
## finer per-lineage annotation for the cells listed in each annotation table
obs['Celltype'] = obs['Lineage']
for lineage in ['T','Myeloid','Epithelial']:
    lineage_anno = pd.read_csv(f'{RESULT_TABLE}/annotation/{lineage}.csv',index_col=0)
    # fall back to the table's 'Lineage' column when it has no 'Celltype' column
    celltype = lineage_anno['Celltype'] if 'Celltype' in lineage_anno else lineage_anno['Lineage']
    obs.loc[celltype.index,'Celltype'] = celltype
## add cellstate: start from the cell type, then overwrite with the
## per-celltype state annotation where one exists
obs['Cellstate'] = obs['Celltype']
for celltype in ['CD8T','Macs','Tumor']:
    cellstate = pd.read_csv(f'{RESULT_TABLE}/MPs/{celltype}/Annotation.csv',index_col=0)['Cellstate']
    obs.loc[cellstate.index,'Cellstate'] = cellstate

# clin: sample-level metadata; missing values become the literal string 'N/A'
sample_meta =  pd.read_excel(f'{RESULT_TABLE}/Supplementary Table 1.xlsx',index_col=0).replace(np.nan,'N/A')
# Short response label. Any BestResponse value not listed here (including
# 'N/A') maps to NaN in 'br_short'.
sample_meta['br_short'] = sample_meta['BestResponse'].map({
    'favorable response\n(RCB 0-I)': 'R',
    'unfavorable response\n(RCB II-III)': 'NR'
})
# merge sample meta with cell meta
n_cells_before_merge = len(obs)
obs = obs.reset_index().merge(sample_meta,left_on='Sample',right_on='CCG_ID',how='left').set_index('index')
# A left merge duplicates a cell for every extra metadata row sharing its
# CCG_ID. Fail loudly instead of writing a silently inflated table.
assert len(obs) == n_cells_before_merge, (
    f'merge changed the number of cells ({n_cells_before_merge} -> {len(obs)}); '
    "check 'CCG_ID' for duplicated sample ids"
)
obs.to_csv(f'{RESULT_TABLE}/GEX_OBS_Cellstate.csv')

## Breakdown of samples at each timepoint, grouped by best response

In [4]:
# Number of samples collected at each timepoint.
sample_meta['Timepoint'].value_counts()

Baseline     12
W3D1          9
W7D1          8
AfterSurg     7
Surg+AC       4
Name: Timepoint, dtype: int64

In [5]:
# Samples per timepoint (rows) split by best response (columns).
pd.crosstab(
    index=sample_meta['Timepoint'],
    columns=sample_meta['BestResponse'],
)

BestResponse,favorable response\n(RCB 0-I),unfavorable response\n(RCB II-III)
Timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1
AfterSurg,1,6
Baseline,3,9
Surg+AC,1,3
W3D1,2,7
W7D1,4,4


## Number of patients with biopsies at both timepoints of a pair, grouped by best response

In [6]:
# Rows: (BestResponse, Patient); columns: Timepoint; values: number of samples.
tmp = pd.crosstab([sample_meta.BestResponse,sample_meta.Patient],sample_meta.Timepoint)
timepoint_pairs = [('Baseline','W3D1'),
                   ('W3D1','W7D1'),
                   ('Baseline','W7D1'),
                   ('Baseline','Surg+AC'),
                   ('Baseline','AfterSurg')]
for bp in sample_meta.BestResponse.unique():
    for pair in timepoint_pairs:
        # A patient qualifies only with at least one sample at *each* timepoint
        # of the pair. Testing `sum == 2` would also accept two samples at a
        # single timepoint and reject patients with extra samples.
        has_both = (tmp.loc[bp,list(pair)] > 0).all(axis=1)
        n_pts = has_both.sum()
        # only report pairs covered by at least two patients in this group
        if n_pts > 1:
            print(f"{bp}: N of pts ({','.join(pair)})={n_pts}")

favorable response
(RCB 0-I): N of pts (W3D1,W7D1)=2
favorable response
(RCB 0-I): N of pts (Baseline,W7D1)=2
unfavorable response
(RCB II-III): N of pts (Baseline,W3D1)=4
unfavorable response
(RCB II-III): N of pts (W3D1,W7D1)=3
unfavorable response
(RCB II-III): N of pts (Baseline,W7D1)=3
unfavorable response
(RCB II-III): N of pts (Baseline,Surg+AC)=2
unfavorable response
(RCB II-III): N of pts (Baseline,AfterSurg)=3


W7D1 is the only follow-up timepoint for which at least two patients in each response group have biopsies at both baseline and that timepoint, so the differential temporal analysis is limited to **baseline vs. W7D1**.

## Select patients with single-cell profiles at both baseline and W7D1

In [32]:
# Keep patients with at least one sample at both Baseline and W7D1. Checking
# each timepoint separately avoids counting two samples from one timepoint as
# a pair.
has_both_timepoints = (tmp[['Baseline','W7D1']] > 0).all(axis=1)
selected_pts_dt = tmp.loc[has_both_timepoints,:].reset_index()
selected_pts = selected_pts_dt.Patient
# Show the selection as an R character vector literal.
"c('"+"','".join(selected_pts)+"')"

"c('P01','P08','P12','P13','P18')"

In [33]:
# Treatment arm and best response of the selected patients, one row per
# distinct combination, ordered by treatment arm.
(
    sample_meta
    .loc[sample_meta['Patient'].isin(selected_pts),
         ['Patient','Treatment_Arm','BestResponse']]
    .drop_duplicates()
    .sort_values('Treatment_Arm')
)

Unnamed: 0_level_0,Patient,Treatment_Arm,BestResponse
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P01.T1,P01,Chemo->Combo,favorable response\n(RCB 0-I)
P13.T1,P13,Chemo->Combo,unfavorable response\n(RCB II-III)
P08.T1,P08,ICI->Combo,favorable response\n(RCB 0-I)
P12.T1,P12,ICI->Combo,unfavorable response\n(RCB II-III)
P18.T1,P18,ICI->Combo,unfavorable response\n(RCB II-III)


In [34]:
def sanity_check_n_samples_for_comparison(df,cond_column,celltype_col,
                                          sample_col='Sample_Short',
                                          min_cells=10):
    """Count, per cell type and condition, the samples that hold enough cells.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell; must contain `cond_column`, `sample_col` and
        `celltype_col`.
    cond_column : str
        Column holding the condition each cell belongs to.
    celltype_col : str
        Column holding the cell type (or state) label.
    sample_col : str, default 'Sample_Short'
        Column identifying the sample a cell comes from.
    min_cells : int, default 10
        A sample counts for a cell type when it has at least this many cells
        of that type.

    Returns
    -------
    pandas.DataFrame
        Indexed by cell type. One column per condition (in order of first
        appearance in `df`) with the number of qualifying samples, plus
        'Min_N', the minimum of those counts across conditions.
    """
    # Rows with a missing condition are dropped by crosstab, so a NaN label
    # must not be looked up below.
    conditions = df[cond_column].dropna().unique()
    if len(conditions) == 0:
        # Nothing to compare: return an empty result instead of failing in concat.
        empty = pd.DataFrame(index=pd.Index([], name=celltype_col))
        empty['Min_N'] = pd.Series(dtype='int64')
        return empty

    # Rows: (condition, sample); columns: cell type; values: number of cells.
    table = pd.crosstab([df[cond_column],df[sample_col]],df[celltype_col])
    n_samples = []
    for c in conditions:
        # n of samples with at least min_cells per celltype (inclusive bound)
        n_sample_per_celltype = (table.loc[c,:]>=min_cells).sum(axis=0)
        n_sample_per_celltype.name = c
        n_samples.append(n_sample_per_celltype)
    n_samples = pd.concat(n_samples,axis=1)
    n_samples['Min_N'] = n_samples.min(axis=1)
    return n_samples

# Characters replaced by '_' in cell-type names. Inside the class, ')-/' is a
# range, so the set is: . + ( ) * , - / and space. Kept exactly as in the
# original pattern so the sanitised names do not change.
_CELLTYPE_NAME_PATTERN = r'[.+()-/\ ]'


def _as_r_character_vector(names):
    """Format an Index of labels as an R-style `c("a","b")` literal with sanitised names."""
    # regex=True is explicit: since pandas 2.0 the default is a literal match,
    # which would leave every name untouched.
    sanitised = names.str.replace(_CELLTYPE_NAME_PATTERN,'_',regex=True)
    return 'c("'+'","'.join(sanitised.tolist())+'")'


def _report_single_timepoint(df,timepoint,cond_column,celltype_col,min_cells,min_sample):
    """Display and print the cell types usable for a between-group comparison at one timepoint."""
    n_samples = sanity_check_n_samples_for_comparison(
        df=df.loc[df.Timepoint==timepoint,:],
        cond_column=cond_column,
        celltype_col=celltype_col,
        min_cells=min_cells
    )
    passing = n_samples.Min_N>=min_sample
    # the index holds the cell-type labels, so this orders rows by cell type
    display(n_samples.loc[passing,:].sort_index())
    print(f'Name of cell types with at least {min_cells} cells in at least {min_sample} samples in each condition:')
    print(_as_r_character_vector(n_samples.index[passing]))


def iter_comparison_of_interest(df,celltype_col,selected_pts,min_cells=10,min_sample=2):
    """Report which cell types have enough samples for each planned comparison.

    Three comparisons are checked: responders vs non-responders at Baseline,
    the same at W7D1, and Baseline vs W7D1 within each response group for the
    patients in `selected_pts`. Results are displayed and printed; nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell with columns 'Timepoint', 'Patient', 'BestResponse',
        'br_short', `celltype_col` and the sample column expected by
        `sanity_check_n_samples_for_comparison`.
    celltype_col : str
        Column with the cell type (or state) label; rows labelled 'NS' are
        excluded.
    selected_pts : iterable of str
        Patients included in the paired Baseline-W7D1 comparison.
    min_cells : int, default 10
        Minimum number of cells a sample needs for a cell type to count.
    min_sample : int, default 2
        Minimum number of qualifying samples required in every condition.
    """
    df = df.loc[df[celltype_col]!='NS',:].copy() # remove unspecified cells

    # Baseline is grouped on the full 'BestResponse' label and W7D1 on the short
    # 'br_short' label, as in the original analysis.
    print('Baseline: R vs NR')
    _report_single_timepoint(df,'Baseline','BestResponse',celltype_col,min_cells,min_sample)

    print('W7D1: R vs NR')
    _report_single_timepoint(df,'W7D1','br_short',celltype_col,min_cells,min_sample)

    print('\nBaseline-W7D1: R vs NR')
    paired = df.loc[df.Timepoint.isin(['Baseline','W7D1'])&
                    df.Patient.isin(selected_pts),:]
    # Within each response group, compare the two timepoints.
    r_n_samples = sanity_check_n_samples_for_comparison(
        df=paired.loc[paired['br_short']=='R',:],
        cond_column='Timepoint',
        celltype_col=celltype_col,
        min_cells=min_cells
    )
    nr_n_samples = sanity_check_n_samples_for_comparison(
        df=paired.loc[paired['br_short']=='NR',:],
        cond_column='Timepoint',
        celltype_col=celltype_col,
        min_cells=min_cells
    )
    r_passing = r_n_samples.Min_N>=min_sample
    nr_passing = nr_n_samples.Min_N>=min_sample
    # a cell type is usable only when it passes in both response groups
    selected_celltypes = r_n_samples.index[r_passing].intersection(
        nr_n_samples.index[nr_passing])

    print('\nBaseline-W7D1: R')
    display(r_n_samples.loc[r_passing,:])
    print('Baseline-W7D1: NR')
    display(nr_n_samples.loc[nr_passing,:])
    print(f"Celltype present at least {min_sample} pts per timepoint per group in selected patients({','.join(selected_pts)})")
    print(_as_r_character_vector(selected_celltypes))
                   

### Check the number of samples in each comparison

In [35]:
# Run the sample-count checks at cell-type resolution for the selected patients.
iter_comparison_of_interest(
    df=obs,
    celltype_col='Celltype',
    selected_pts=selected_pts,
    min_sample=2,
)

Baseline: R vs NR


Unnamed: 0_level_0,unfavorable response\n(RCB II-III),favorable response\n(RCB 0-I),Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adipocytes,3,2,2
B,7,3,3
CAF,9,3,3
CD4T,6,3,3
CD8T,9,3,3
Endothelial,9,3,3
Epithelial,9,2,2
Immune,7,3,3
Macs,9,3,3
Pericyte,9,3,3


Name of cell types with at least 10 cells in at least 2 samples in each condition:
c("Adipocytes","B","CAF","CD4T","CD8T","Endothelial","Epithelial","Immune","Macs","Pericyte","Plasma","SMC","Stromal","Tumor")
W7D1: R vs NR


Unnamed: 0_level_0,R,NR,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adipocytes,2,2,2
B,4,2,2
CAF,4,4,4
CD8T,4,4,4
Endothelial,4,4,4
Epithelial,4,4,4
Immune,3,3,3
Macs,4,4,4
Pericyte,4,4,4
Plasma,3,3,3


Name of cell types with at least 10 cells in at least 2 samples in each condition:
c("Adipocytes","B","CAF","CD8T","Endothelial","Epithelial","Immune","Macs","Pericyte","Plasma","SMC","Stromal","Tumor")

Baseline-W7D1: R vs NR

Baseline-W7D1: R


Unnamed: 0_level_0,Baseline,W7D1,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B,2,2,2
CAF,2,2,2
CD4T,2,2,2
CD8T,2,2,2
Endothelial,2,2,2
Epithelial,2,2,2
Macs,2,2,2
Pericyte,2,2,2
Tumor,2,2,2


Baseline-W7D1: NR


Unnamed: 0_level_0,Baseline,W7D1,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B,3,2,2
CAF,3,3,3
CD8T,3,3,3
Endothelial,3,3,3
Epithelial,3,3,3
Immune,3,2,2
Macs,3,3,3
Pericyte,3,3,3
Plasma,3,3,3
SMC,3,3,3


Celltype present at least 2 pts per timepoint per group in selected patients(P01,P08,P12,P13,P18)
c("B","CAF","CD8T","Endothelial","Epithelial","Macs","Pericyte","Tumor")


# ATAC Metadata

In [29]:
# Load the pickled object stored under RESULT_OBJ (presumably a pycisTopic
# object, judging by the file name -- TODO confirm). Only its
# `cell_data.index` is used in the next cell.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it was produced by a trusted run of this project.
with open(f'{RESULT_OBJ}/cistopic_obj.pickle','rb') as f:
    cistopic_obj = pickle.load(f)

In [30]:
## Add ATAC and GEX Barcodes
# Move the cell index into a 'GEX_Barcodes' column. The guard makes the cell
# re-runnable: once the index has been reset it holds integers, and deriving
# barcodes from it again would fail.
if 'GEX_Barcodes' not in obs.columns:
    obs = obs.rename_axis(index='GEX_Barcodes').reset_index()
# ATAC barcode = text before the first '_' of the GEX barcode + '___' + sample id
obs['ATAC_Barcodes'] = obs['GEX_Barcodes'].str.split('_').str[0] + '___' + obs['Sample']
# Replace special characters in cell-state names with '_'. regex=True is
# explicit because pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the names unchanged. The character class is kept
# as in the original ( ')-/' is a range: ) * + , - . / ).
obs['Cellstate'] = obs['Cellstate'].str.replace(r'[.+()-/\ ]','_',regex=True)
# keep only cells that are also present in the ATAC object
obs_mo = obs.loc[obs['ATAC_Barcodes'].isin(cistopic_obj.cell_data.index),:]
obs_mo.to_csv(f"{RESULT_TABLE}/MO_OBS_Cellstate.csv",index=False)