# Add in VDJ + Hashing information to Metadata File

## Start with reading & formatting the dataset's current metadata file from Seurat

In [None]:
import pandas as pd
import numpy as np
from datetime import date
# Date of this run; it names both the dated folder and the dated files.
today = date.today()
date_stamp = today.strftime("%Y-%m-%d")

# input/output directory and the pre-filter metadata file inside it
od = "Final_Analysis/" + date_stamp + "/Metadata/"
md_path = od + date_stamp + "_" + "md_prefilter.csv"

In [None]:
# Load the metadata CSV, label its unnamed first column as the barcode,
# and use that column as the row index.
md_merge = (
    pd.read_csv(md_path)
    .rename(columns={"Unnamed: 0": "barcode"})
    .set_index(['barcode'])
)
md_merge

## Read & Format VDJ Data

In [None]:
# read in VDJ contig annotations, one file per library
vdj_siy = pd.read_csv("data/VDJ_Contig_Annotations/siy_all_contig_annotations.csv")
vdj_siin = pd.read_csv("data/VDJ_Contig_Annotations/siin_all_contig_annotations.csv")

# Both files must already carry a 'barcode' column (a KeyError is raised below
# if not). No rename is needed: renaming the integer label 0 cannot match the
# string column labels read_csv builds from a header row, so that step and the
# temporary copies it produced had no effect.

# prefix each barcode with its library identifier so the two libraries
# cannot collide once they are combined
vdj_siy['barcode'] = "siy_" + vdj_siy['barcode']
vdj_siin['barcode'] = "siin_" + vdj_siin['barcode']

## Read & Format Hashing Data

In [None]:
# Load the hashing calls for both libraries and give the two columns
# their working names: unnamed first column -> barcode, "x" -> demux.
hash_siin, hash_siy = (
    pd.read_csv(csv_path).rename(columns={"Unnamed: 0": "barcode", "x": "demux"})
    for csv_path in (
        "data/HashingIDs/200916_siin_hash_hashids.csv",
        "data/HashingIDs/200916_siy_hash_hashids.csv",
    )
)

# prefix each barcode with its library identifier
hash_siin['barcode'] = "siin_" + hash_siin['barcode']
hash_siy['barcode'] = "siy_" + hash_siy['barcode']


## Filter VDJ Data

In [None]:
# filter VDJ data: keep only contigs flagged as cell-associated, full-length
# and high-confidence. The explicit `== True` is deliberate: it also yields
# False (row dropped) for missing or non-boolean values instead of raising.
vdj_siy = vdj_siy.loc[
    (vdj_siy['is_cell'] == True)
    & (vdj_siy['full_length'] == True)
    & (vdj_siy['high_confidence'] == True)
]
vdj_siin = vdj_siin.loc[
    (vdj_siin['is_cell'] == True)
    & (vdj_siin['full_length'] == True)
    & (vdj_siin['high_confidence'] == True)
]

# append library identifiers to clonotypes
vdj_siy['raw_clonotype_id'] = vdj_siy['raw_clonotype_id'] + "_siy"
vdj_siin['raw_clonotype_id'] = vdj_siin['raw_clonotype_id'] + "_siin"

# final filter: drop contigs without a consensus id (filters in place, no new
# dataframe is created). The notna() test is required because read_csv may
# parse a blank field -- and, in newer pandas releases, the literal text
# "None" -- as NaN, and `NaN != 'None'` is True, which would let those rows
# slip through a plain string comparison.
vdj_siy = vdj_siy.loc[
    vdj_siy['raw_consensus_id'].notna() & (vdj_siy['raw_consensus_id'] != 'None')
]
vdj_siin = vdj_siin.loc[
    vdj_siin['raw_consensus_id'].notna() & (vdj_siin['raw_consensus_id'] != 'None')
]



In [None]:
# Cast every column of the metadata table to string.
md_merge = md_merge.astype(str)

# Cast each VDJ / hashing table to string as well, then index it by barcode
# so all tables share the same merge key.
vdj_siy, vdj_siin, hash_siin, hash_siy = (
    table.astype(str).set_index(['barcode'])
    for table in (vdj_siy, vdj_siin, hash_siin, hash_siy)
)

# VDJ only: a barcode appears once per contig, so keep just its first row.
## per the original author's note, rows sharing a barcode also share a clonotype
vdj_siy = vdj_siy.loc[~vdj_siy.index.duplicated(keep='first')]
vdj_siin = vdj_siin.loc[~vdj_siin.index.duplicated(keep='first')]


In [None]:
# merge library datasets prior to merge with metadata
vdj_merge = pd.concat([vdj_siy, vdj_siin])  # stack VDJ rows of both libraries
hash_merge = pd.concat([hash_siy, hash_siin])  # stack hashing rows of both libraries

# merge to metadata: left joins on the index keep every metadata row,
# leaving NaN where a barcode has no VDJ / hashing entry
md_mergeV = md_merge.merge(vdj_merge, how='left', left_index=True, right_index=True)  # merge VDJ
md_mergeV = md_mergeV.merge(hash_merge, how='left', left_index=True, right_index=True)  # merge hashing

## extract strings + format for plotting of top10 clonotypes
# first run of digits in the clonotype id; raw string so `\d` is passed to the
# regex engine verbatim instead of being an invalid string escape
md_mergeV['clonotype'] = md_mergeV['raw_clonotype_id'].str.extract(r'(\d+)')
# replaces NaN with "0" in EVERY column (not only 'clonotype') so the
# clonotype number can be cast to an integer below
md_mergeV = md_mergeV.fillna("0")
md_mergeV['clonotype'] = md_mergeV['clonotype'].astype(int)
# keep the clonotype id only for clonotype numbers <= 10, blank otherwise.
# NOTE(review): rows without a VDJ match have clonotype 0 and id "0" after the
# fillna above, so they also pass `<= 10` and get "0" here -- confirm whether
# the plotting code expects that or whether the test should be 1..10.
md_mergeV['top_clonotypes'] = np.where(md_mergeV['clonotype'] <= 10, md_mergeV['raw_clonotype_id'], '')
# take a look @ final product!
md_mergeV

In [None]:
# Write the merged table next to the input file, under today's date stamp.
md_mergeV.to_csv("".join([od, today.strftime("%Y-%m-%d"), "_", "md_posthash.csv"]))

In [None]:
# Count the distinct clonotype ids left in the filtered SIIN table.
print("Number of SIIN Clonotypes Detected by CellRanger..")
print(vdj_siin['raw_clonotype_id'].unique().size)

In [None]:
# Count the distinct clonotype ids left in the filtered SIY table.
print("Number of SIY Clonotypes Detected by CellRanger..")
print(vdj_siy['raw_clonotype_id'].unique().size)

In [None]:
# Rows per clonotype id in the SIY table (one row per barcode after dedup).
print("SIY Clonotype Size Breakdown..")
siy_counts = vdj_siy.loc[:, ["raw_clonotype_id"]].value_counts()
siy_counts

In [None]:
# Rows per clonotype id in the SIIN table (one row per barcode after dedup).
print("SIIN Clonotype Size Breakdown..")
siin_counts = vdj_siin.loc[:, ["raw_clonotype_id"]].value_counts()
siin_counts

In [None]:
# SIY clonotypes whose count is exactly one.
print("SIY Clonotypes with One Cell...")
siy_counts[siy_counts.eq(1)]

In [None]:
# SIY clonotypes with more than five cells. The comparison is `> 5` to match
# the printed label; `< 5` selected the small clonotypes instead.
print("SIY Clonotypes with More than Five Cells...")
siy_counts.loc[siy_counts > 5]

In [None]:
# SIIN clonotypes whose count is exactly one.
print("SIIN Clonotypes with One Cell...")
siin_counts[siin_counts.eq(1)]

In [None]:
# SIIN clonotypes with more than five cells. The comparison is `> 5` to match
# the printed label; `< 5` selected the small clonotypes instead.
print("SIIN Clonotypes with More than Five Cells...")
siin_counts.loc[siin_counts > 5]