## KP Lung CD8 T-Cell 10X 5' Immune Profiling Metadata & Clonotype Processing

Import package dependencies:

In [None]:
import pandas as pd
import numpy as np
from datetime import date

Set input file paths:

In [None]:
# Dated input/output directory: everything for a run lives under
# Final_Analysis/<YYYY-MM-DD>/Metadata/ and every file name carries the same date stamp.
today = date.today()
date_stamp = today.strftime("%Y-%m-%d")
od = f"Final_Analysis/{date_stamp}/Metadata/"

# metadata after doublet removal, and metadata straight after hashing
md_path = f"{od}{date_stamp}_md_doublets_out.csv"
md_path_pre = f"{od}{date_stamp}_md_posthash.csv"

# per-cell ProjecTILs attributes (not date-stamped)
projectTils_path = "data/mouse_samples_per_cell_attributes.csv"

Load Data:

In [None]:
# `singlets` is read from the doublets-out metadata file (md_path);
# `doublets` is read from the post-hash metadata file (md_path_pre).
singlets = pd.read_csv(filepath_or_buffer=md_path)
doublets = pd.read_csv(filepath_or_buffer=md_path_pre)

Change data types and format dataframes:

In [None]:
# Post-hash table: index by cell barcode and store every column as a string.
doublets = doublets.set_index('barcode').astype(str)

# Doublets-out table: the barcode arrives in the unnamed first CSV column, so
# name it, index by it, drop the listed columns, and store everything as strings.
columns_to_drop = ['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'demux']
singlets = (
    singlets
    .rename(columns={"Unnamed: 0": "barcode"})
    .set_index('barcode')
    .drop(columns=columns_to_drop)
    .astype(str)
)

Filter out cells that were removed in doublet removal step:

In [None]:
# Left join on the barcode index: only barcodes present in `singlets` are kept,
# and each one picks up the columns of the post-hash table.
singlets_md = pd.merge(
    singlets,
    doublets,
    how='left',
    left_index=True,
    right_index=True,
)

## Add in Project TIL data

In [None]:
# Read the per-cell ProjecTILs attribute table.
projectTils = pd.read_csv(projectTils_path)

# Keep the part of each Cell.ID that follows the first 'Q_' so it can be used
# as the barcode index (taken before the string cast, exactly as the raw ids are read).
barcode_part = projectTils['Cell.ID'].str.split('Q_').str.get(1)

projectTils = (
    projectTils
    .assign(**{'Cell.ID': barcode_part})
    .rename(columns={
        'Cell.ID': 'barcode',
        'query.projected.functional.cluster': 'projTIL_Guo_fxCluster',
        'query.projected.TILPRED': 'projTIL_Guo_TILPRED',
    })
    .astype(str)            # every column stored as a string
    .set_index('barcode')   # index by barcode for the index-based merge
)

Merge in ProjectTIL data:

In [None]:
# Left join on the barcode index: every row of the metadata is kept and gains
# the ProjecTILs columns where a matching barcode exists.
singlets_md = pd.merge(
    singlets_md,
    projectTils,
    how='left',
    left_index=True,
    right_index=True,
)

Write final metadata to CSV:

In [None]:
# Write the merged per-cell metadata into the dated output directory.
final_md_path = f"{od}{today.strftime('%Y-%m-%d')}_md_final.csv"
singlets_md.to_csv(final_md_path)

# Determine TCR Clonotypes Derived From Multiple Mice

It is important for us to know which clonotypes arose from multiple different mice. Based on the hashing and VDJ data collected, the vast majority of clonotypes are derived from a single mouse. A small subset of clonotypes, however, is detected in two or more mice, so we want to distinguish between the two groups.

In [None]:
# Classify every TCR clonotype by the number of distinct hashed mice ('demux'
# identities) it was detected in, then write the multi-mouse clonotypes and a
# per-(clonotype, mouse) cell-count table to the dated output directory.
#
# In both 'raw_clonotype_id' and 'demux' the string '0' is the "no call" placeholder.

clonotypes = singlets_md['raw_clonotype_id'].unique() # all clonotype ids, in order of first appearance

# generate subsets
singlets_demux = singlets_md.loc[singlets_md['raw_clonotype_id'] != '0'] # look only at cells with VDJ
singlets_demuxHash = singlets_md.loc[singlets_md['demux'] != '0'] # look only at cells with hashing

# Score each clonotype for its number of demux identities.
# Only cells with both a VDJ call and a hashing call are informative. The '0'
# clonotype placeholder is excluded so that "cells without a TCR" are never
# reported as a clonotype shared between mice.
scorable_cells = singlets_demuxHash.loc[singlets_demuxHash['raw_clonotype_id'] != '0']
mice_per_clonotype = (
    scorable_cells
    .groupby('raw_clonotype_id')['demux']
    .nunique()                            # distinct mice per clonotype (missing values ignored)
    .reindex(clonotypes, fill_value=0)    # keep first-appearance order; unscored ids get 0 mice
)

good_clonotypes = mice_per_clonotype.index[mice_per_clonotype == 1].tolist()        # exactly one mouse
bad_clonotypes = mice_per_clonotype.index[mice_per_clonotype > 1].tolist()          # two or more mice
double_clonotypes = mice_per_clonotype.index[mice_per_clonotype == 2].tolist()      # exactly two mice
tripleplus_clonotypes = mice_per_clonotype.index[mice_per_clonotype >= 3].tolist()  # three or more mice

# summary dataframe
# Exact-match replacement of the '0' placeholder only; a regex replacement would
# also rewrite the digit 0 inside real hashtag labels.
singlets_demux = singlets_demux.replace({'demux': {'0': 'unmapped'}})
singlets_mult = singlets_demux.loc[singlets_demux['raw_clonotype_id'].isin(bad_clonotypes)] # cells of multi-mouse clonotypes
# Name the count column explicitly: the default column label of value_counts()
# differs between pandas versions (0 before 2.0, 'count' from 2.0 onwards).
mult_clonotypes = (
    singlets_mult[['raw_clonotype_id', 'demux']]
    .value_counts()                       # cells per (clonotype, mouse) pair
    .rename("cell_number_detected")
    .to_frame()
)

# convert everything to dataframe
bad_clonotypes = pd.DataFrame(data=bad_clonotypes)
double_clonotypes = pd.DataFrame(data=double_clonotypes)
tripleplus_clonotypes = pd.DataFrame(data=tripleplus_clonotypes)

# write everything to file
out_prefix = od + today.strftime("%Y-%m-%d") + "_"
bad_clonotypes.to_csv(out_prefix + "multipleMouse_clonotypes.csv")
double_clonotypes.to_csv(out_prefix + "twoMouse_clonotypes.csv")
tripleplus_clonotypes.to_csv(out_prefix + "threeOrMoreMouse_clonotypes.csv")
mult_clonotypes.to_csv(out_prefix + "multi_clonotypes_aggregate.csv")