In [1]:
# Cohort names; ATAC_Cohort is used below to locate the cisTopic pickle under WORKFLOW_DATA.
GEX_Cohort='GEX_CCG1112_LowMt'
ATAC_Cohort='CCG1112_ATAC_QC'
# All data locations are derived from the repository root, relative to this notebook.
REPO = '../..'
WORKFLOW_DATA = f'{REPO}/data/workflow'
EXTERNAL_DATA = f'{REPO}/data/external'
# NOTE: these two keep their trailing slash; cells below join them with an extra '/'.
RESULT_DATA = f'{REPO}/data/result/cleaned_files/'
Manuscript_RESULT = f'{REPO}/data/result/manuscript_table/'
# Fixed typo: the folder was spelled '{REPO}/ata/...' while every other path lives under '{REPO}/data/...'.
FIGURE_FOLDER=f'{REPO}/data/result/cleaned_figures'


import numpy as np
import pandas as pd
import warnings
from IPython.display import display
import pickle
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Load clinical data

In [7]:
# Clinical observations; the patient ID is the part of Sample_Short before the first '.'.
clin = pd.read_csv(f'{RESULT_DATA}/Clin_Obs.csv')
clin['Patient'] = clin['Sample_Short'].map(lambda x: x.split('.')[0])
# Relabel the treatment arms. Series.map with a dict yields NaN for any arm not listed here.
clin['Treatment_Arm'] = clin['Treatment_Arm'].map({'ICI->Chemo':'ICI->Combo','Chemo->ICI':'Chemo->Combo'})

# Load the lineage-level annotation of every compartment and stack them.
annotation_columns = ['Sample_Short','Sample','Compartment','Lineage']
lineage = pd.concat([
    pd.read_csv(f'{RESULT_DATA}/Annotation_{compartment}.csv', index_col=0)[annotation_columns]
    for compartment in ['Immune','Epithelial','Stromal']
], axis=0)

# T cells were separated into CD8+ and CD4+ T cells in their own annotation file,
# so the generic 'T' rows are dropped here and replaced by the finer annotation below.
orig_n_t = (lineage.Lineage=='T').sum()  # number of 'T' rows before removal (not used further in this cell)
# .copy() so the column assignment below writes to an independent frame rather than
# to a slice of the concatenated table (the warning would be hidden by filterwarnings).
lineage = lineage.loc[lineage.Lineage!='T',:].copy()
# Placeholder: use the lineage as the cell type, e.g. B and Plasma.
lineage['Celltype'] = lineage['Lineage']

t_lineage = pd.read_csv(f'{RESULT_DATA}/Annotation_T.csv', index_col=0)[annotation_columns + ['Celltype']]
# Append 'T' to the T-cell subtype label (the table shown below contains e.g. CD8T / CD4T).
t_lineage['Celltype'] = t_lineage['Celltype']+'T'
# Add back the T lineage, now carrying its cell type annotation.
lineage = pd.concat([
    lineage,
    t_lineage
])

# Macs: overwrite Celltype for the cells listed in the myeloid annotation with its Myeloid_scSHC label.
myeloid_lineage = pd.read_csv(f'{Manuscript_RESULT}/Interm_Annotation/Myeloid.csv', index_col=0)
lineage.loc[myeloid_lineage.index,'Celltype'] = myeloid_lineage['Myeloid_scSHC']
# Cellstate starts as a copy of Celltype; downstream code treats the label 'NS' as non-specified.
lineage['Cellstate'] = lineage['Celltype']
lineage.head()

Unnamed: 0,Sample_Short,Sample,Compartment,Lineage,Celltype,Cellstate
GTATGTGGTCCTCCAA-1_CCG1112_16_T4_A1_CCG1112_MO_Batch1,P16.T4,CCG1112_16_T4_A1,Immune,Plasma,Plasma,Plasma
AACAAGCCAACAGGAT-1_CCG1112_16_T4_A1_CCG1112_MO_Batch1,P16.T4,CCG1112_16_T4_A1,Immune,Myeloid,Macs,Macs
TTATAGCCATATAACC-1_CCG1112_16_T4_A1_CCG1112_MO_Batch1,P16.T4,CCG1112_16_T4_A1,Immune,Myeloid,Macs,Macs
ATCACAATCCCTCTAA-1_CCG1112_16_T4_A1_CCG1112_MO_Batch1,P16.T4,CCG1112_16_T4_A1,Immune,Myeloid,Macs,Macs
CCTCAAACAGGCATGA-1_CCG1112_16_T4_A1_CCG1112_MO_Batch1,P16.T4,CCG1112_16_T4_A1,Immune,Myeloid,Macs,Macs


## Breakdown of patients at each timepoint, grouped by best response

In [8]:
# Number of rows in the clinical table per timepoint.
clin['Timepoint'].value_counts()

Baseline     12
W3D1          9
W7D1          8
AfterSurg     7
Surg+AC       4
Name: Timepoint, dtype: int64

In [9]:
# Timepoint (rows) against best response (columns).
pd.crosstab(index=clin['Timepoint'], columns=clin['BestResponse'])

BestResponse,0-I,II-III
Timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1
AfterSurg,1,6
Baseline,3,9
Surg+AC,1,3
W3D1,2,7
W7D1,4,4


## Number of patients with biopsies at both baseline and a later timepoint, grouped by best response

In [10]:
# Rows: (BestResponse, Patient); columns: Timepoint; values: number of clinical rows (biopsies).
tmp = pd.crosstab([clin.BestResponse,clin.Patient],clin.Timepoint)
# dropna(): crosstab drops missing responses, so a NaN group would not exist in tmp.
for bp in clin.BestResponse.dropna().unique():
    for temp in [('Baseline','W3D1'),
                 ('Baseline','W7D1'),
                 ('Baseline','Surg+AC'),
                 ('Baseline','AfterSurg')]:
        # A patient counts as paired only when BOTH timepoints have at least one biopsy.
        # Summing the two counts and testing ==2 would also accept two biopsies at a single
        # timepoint, and would reject a patient with more than one biopsy at a timepoint.
        # list(temp) makes the column selection an explicit list of labels.
        n_pts = (tmp.loc[bp, list(temp)] > 0).all(axis=1).sum()
        # Only report pairs available in at least two patients of the group.
        if n_pts > 1:
            print(f"{bp}: N of pts ({','.join(list(temp))})={n_pts}")

II-III: N of pts (Baseline,W3D1)=4
II-III: N of pts (Baseline,W7D1)=3
II-III: N of pts (Baseline,Surg+AC)=2
II-III: N of pts (Baseline,AfterSurg)=3
0-I: N of pts (Baseline,W7D1)=2


W7D1 is the only follow-up timepoint at which at least two patients in each response group have paired biopsies (one at baseline and one at that timepoint). The differential temporal analysis is therefore limited to **baseline vs. W7D1**.

## Select patients with single-cell profiles at both baseline and W7D1

In [11]:
# Keep patients with at least one biopsy at Baseline AND at least one at W7D1.
# (Testing that the two counts sum to 2 would also accept two biopsies at one timepoint only.)
has_both_timepoints = (tmp[['Baseline','W7D1']] > 0).all(axis=1)
selected_pts_dt = tmp.loc[has_both_timepoints,:].reset_index()
selected_pts = selected_pts_dt.Patient
# Render the patient IDs as an R character vector literal.
"c('"+"','".join(selected_pts)+"')"

"c('P01','P08','P12','P13','P18')"

In [12]:
# Treatment arm and best response of each selected patient (one row per distinct combination).
(
    clin.loc[clin['Patient'].isin(selected_pts), ['Patient','Treatment_Arm','BestResponse']]
    .drop_duplicates()
    .sort_values(by='Treatment_Arm')
)

Unnamed: 0,Patient,Treatment_Arm,BestResponse
4,P01,Chemo->Combo,0-I
37,P13,Chemo->Combo,II-III
11,P12,ICI->Combo,II-III
20,P18,ICI->Combo,II-III
34,P08,ICI->Combo,0-I


In [13]:
def sanity_check_n_samples_for_comparison(df,cond_column,celltype_col,
                                          sample_col='Sample_Short',
                                          min_cells=10):
    """Count, per cell type and condition, the samples holding at least ``min_cells`` cells.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell; must contain ``cond_column``, ``sample_col`` and ``celltype_col``.
    cond_column : str
        Column defining the conditions being compared.
    celltype_col : str
        Column holding the cell type (or cell state) label.
    sample_col : str, default 'Sample_Short'
        Column identifying the sample each cell comes from.
    min_cells : int, default 10
        Minimum number of cells (inclusive) a sample needs for a cell type to be counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by cell type, one column per condition (in order of first appearance)
        with the number of qualifying samples, plus ``Min_N``: the row-wise minimum
        across conditions.
    """
    # Cells per (condition, sample) row and cell type column.
    table = pd.crosstab([df[cond_column],df[sample_col]],df[celltype_col])
    # dropna(): crosstab discards rows with a missing condition, so NaN has no entry in `table`.
    conditions = df[cond_column].dropna().unique()
    per_condition = [
        # ">=" implements "at least min_cells"; the previous strict ">" silently
        # excluded samples with exactly min_cells cells.
        (table.loc[condition,:]>=min_cells).sum(axis=0).rename(condition)
        for condition in conditions
    ]
    n_samples = pd.concat(per_condition,axis=1)
    n_samples['Min_N'] = n_samples.min(axis=1)
    return n_samples

def _as_r_character_vector(names):
    """Format an Index of labels as a ``c("a","b")`` literal, replacing special characters with ``_``."""
    # Explicit form of the original class '[.+()-/\ ]' (its ')-/' part is a range):
    # ( ) * + , - . / and space. regex=True is spelled out because pandas 2.0 changed
    # the default of str.replace to literal matching, which would leave names untouched.
    sanitized = names.str.replace(r'[()*+,\-./ ]', '_', regex=True)
    return 'c("' + '","'.join(sanitized.tolist()) + '")'


def _report_response_comparison(df, timepoint, celltype_col, min_cells, min_sample):
    """Display and print the cell types with enough samples in every BestResponse group at one timepoint."""
    n_samples = sanity_check_n_samples_for_comparison(
        df=df.loc[df.Timepoint==timepoint,:],
        cond_column='BestResponse',
        celltype_col=celltype_col,
        min_cells=min_cells
    )
    enough_samples = n_samples.Min_N>=min_sample
    display(n_samples.loc[enough_samples,:].sort_values(celltype_col))
    print(f'Name of cell types with at least {min_cells} cells in at least {min_sample} samples in each condition:')
    print(_as_r_character_vector(n_samples.index[enough_samples]))


def iter_comparison_of_interest(df,celltype_col,selected_pts,min_cells=10,min_sample=2):
    """Report which cell types have enough samples for each comparison of interest.

    Three comparisons are reported, with BestResponse '0-I' labelled R and 'II-III'
    labelled NR: R vs NR at Baseline, R vs NR at W7D1, and Baseline vs W7D1 within
    each response group restricted to ``selected_pts``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell with columns ``Timepoint``, ``BestResponse``, ``Patient``,
        ``celltype_col`` and the sample column used by
        ``sanity_check_n_samples_for_comparison``.
    celltype_col : str
        Column holding the cell type / cell state label; rows labelled 'NS' are ignored.
    selected_pts : iterable of str
        Patients included in the Baseline-vs-W7D1 comparison.
    min_cells : int, default 10
        Per-sample cell threshold, forwarded to ``sanity_check_n_samples_for_comparison``.
    min_sample : int, default 2
        Minimum number of qualifying samples required in every condition.

    Returns
    -------
    None
        Results are displayed and printed only.
    """
    df = df.loc[df[celltype_col]!='NS',:].copy() # remove unspecified cells

    print('Baseline: R vs NR')
    _report_response_comparison(df, 'Baseline', celltype_col, min_cells, min_sample)

    print('W7D1: R vs NR')
    _report_response_comparison(df, 'W7D1', celltype_col, min_cells, min_sample)

    print('\nBaseline-W7D1: R vs NR')
    # Cells of the selected patients at the two timepoints being contrasted.
    paired = df.loc[df.Timepoint.isin(['Baseline','W7D1'])&
                    df.Patient.isin(selected_pts),:]
    # min_cells is forwarded so the applied threshold matches the one printed above
    # (previously the helper always fell back to its own default).
    r_n_samples = sanity_check_n_samples_for_comparison(
        df=paired.loc[paired['BestResponse']=='0-I',:],
        cond_column='Timepoint',
        celltype_col=celltype_col,
        min_cells=min_cells
    )
    nr_n_samples = sanity_check_n_samples_for_comparison(
        df=paired.loc[paired['BestResponse']=='II-III',:],
        cond_column='Timepoint',
        celltype_col=celltype_col,
        min_cells=min_cells
    )
    r_enough = r_n_samples.Min_N>=min_sample
    nr_enough = nr_n_samples.Min_N>=min_sample
    # Only cell types passing the threshold in both response groups are comparable.
    selected_celltypes = r_n_samples.index[r_enough].intersection(
        nr_n_samples.index[nr_enough])

    print('\nBaseline-W7D1: R')
    display(r_n_samples.loc[r_enough,:])
    print('Baseline-W7D1: NR')
    display(nr_n_samples.loc[nr_enough,:])
    print(f"Celltype present at least {min_sample} pts per timepoint per group in selected patients({','.join(selected_pts)})")
    print(_as_r_character_vector(selected_celltypes))
                   

# GEX Annotation

In [14]:
## Relative abundance of a cell type in the lineage which it belongs to
# data= pd.concat([
#     pd.read_csv(f'{RESULT_DATA}/MP_CellState_{celltype}.csv',
#                  index_col=0)[['Sample_Short','Sample','Celltype','Cellstate']
#                              ] for celltype in ['Macs','CD8T','CAF','Endothelial']
# ],axis=0)

# data = pd.concat([data,
#                   pd.read_csv(f'{RESULT_DATA}/Annotation_Tumor.csv',index_col=0)[['Sample_Short','Sample','Lineage','Celltype']].rename(columns={'Celltype':'Cellstate'})
# ],axis=0)

data= pd.concat([
    pd.read_csv(f'{Manuscript_RESULT}/MPs/{celltype}/Annotation.csv',
                 index_col=0)
                             for celltype in ['Macs','CD8T','Tumor']
],axis=0)

obs = lineage.copy()
obs.loc[data.index,'Cellstate'] = data['Cellstate']
obs['Tech'] = obs.index.map(lambda x:'MO' if 'MO' in x else 'GEX')
obs = obs.reset_index().merge(clin,on='Sample_Short',how='left').set_index('index')
assert obs.shape[0]==lineage.shape[0],'The number of cell does not match.'
print(f'N(sample)={obs.Sample_Short.nunique()};N(Pts)={obs.Patient.nunique()}')
obs.to_csv(f'{Manuscript_RESULT}/GEX_OBS.csv',index=True)

N(sample)=40;N(Pts)=20


### Check on n of samples in each comparison

In [13]:
# Sample availability per cell type for each comparison, requiring at least 2 samples per condition.
iter_comparison_of_interest(
    df=obs,
    celltype_col='Celltype',
    selected_pts=selected_pts,
    min_sample=2,
)

Baseline: R vs NR


Unnamed: 0_level_0,II-III,0-I,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adipocytes,3,2,2
B,7,3,3
CAF,9,3,3
CD4T,6,3,3
CD8T,9,3,3
Endothelial,9,3,3
Epithelial,9,2,2
Immune,7,3,3
Macs,9,3,3
Pericyte,9,3,3


Name of cell types with at least 10 cells in at least 2 samples in each condition:
c("Adipocytes","B","CAF","CD4T","CD8T","Endothelial","Epithelial","Immune","Macs","Pericyte","Plasma","SMC","Stromal","Tumor")
W7D1: R vs NR


Unnamed: 0_level_0,0-I,II-III,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adipocytes,2,2,2
B,4,2,2
CAF,4,4,4
CD8T,4,4,4
Endothelial,4,4,4
Epithelial,4,4,4
Immune,3,3,3
Macs,4,4,4
Pericyte,4,4,4
Plasma,3,3,3


Name of cell types with at least 10 cells in at least 2 samples in each condition:
c("Adipocytes","B","CAF","CD8T","Endothelial","Epithelial","Immune","Macs","Pericyte","Plasma","SMC","Stromal","Tumor")

Baseline-W7D1: R vs NR

Baseline-W7D1: R


Unnamed: 0_level_0,Baseline,W7D1,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B,2,2,2
CAF,2,2,2
CD4T,2,2,2
CD8T,2,2,2
Endothelial,2,2,2
Epithelial,2,2,2
Macs,2,2,2
Pericyte,2,2,2
Tumor,2,2,2


Baseline-W7D1: NR


Unnamed: 0_level_0,Baseline,W7D1,Min_N
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B,3,2,2
CAF,3,3,3
CD8T,3,3,3
Endothelial,3,3,3
Epithelial,3,3,3
Immune,3,2,2
Macs,3,3,3
Pericyte,3,3,3
Plasma,3,3,3
SMC,3,3,3


Celltype present at least 2 pts per timepoint per group in selected patients(P01,P08,P12,P13,P18)
c("B","CAF","CD8T","Endothelial","Epithelial","Macs","Pericyte","Tumor")


# ATAC Metadata

In [12]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# this file if it was produced by a trusted run of the workflow.
cistopic_pickle_path = f'{WORKFLOW_DATA}/{ATAC_Cohort}/cistopic_obj.pickle'
with open(cistopic_pickle_path, 'rb') as pickle_file:
    cistopic_obj = pickle.load(pickle_file)

In [13]:
## Add ATAC and GEX Barcodes
obs['ATAC_Barcodes']=obs.index.map(lambda x:x.split('_')[0]+'___') + obs['Sample']
obs.index.name='GEX_Barcodes'
obs.reset_index(inplace=True)
obs.Cellstate=obs.Cellstate.str.replace('[.+()-/\ ]','_')
obs_mo = obs.loc[obs.ATAC_Barcodes.isin(cistopic_obj.cell_data.index),:]
obs_mo .to_csv(f"{Manuscript_RESULT}/MO_OBS.csv",index=False)