In [1]:
#Author Kanishk Asthana kasthana@eng.ucsd.edu
import pysam
from datetime import datetime,timedelta
import argparse
import os
import operator
import pandas as pd
import re

#Default Values
#Tagged BAM file that is scanned for Time-Tags.
bamFileName="/stg1/data2/kanishk/Chronoseq_project/Nov_Dec_2023_processed_files/18156_35/Bulk_13_chronoseq_V4/Bulk_13_chronoseq_V4.aligned.sorted.merged.tagged.bam"
#The results table is written next to the BAM file: same path with ".bam" replaced by ".time_tags.csv".
outFileName=os.path.splitext(bamFileName)[0]+".time_tags.csv"
#Count threshold per Cell Barcode. CellBarcode.computeHasEnoughCounts keeps a barcode only when
#its total Time-Tag count is strictly greater than this value.
MIN_TT_COUNT=20

In [2]:
#Wall-clock reference used for the "Total Execution Time" message at the end of the run.
script_start_time=datetime.now()

#Complement of every nucleotide letter, including the IUPAC mixed-base codes.
#source https://www.idtdna.com/pages/products/custom-dna-rna/mixed-bases
letter_dict={
    'R':'Y','Y':'R',
    'M':'K','K':'M',
    'S':'S','W':'W',
    'H':'D','B':'V','V':'B','D':'H',
    'N':'N',
    'A':'T','T':'A','G':'C','C':'G',
}
#Regular-expression fragment matching every concrete base a letter can stand for.
regex_dict={
    'R':"[AG]",'Y':"[CT]",
    'M':"[AC]",'K':"[GT]",
    'S':"[GC]",'W':"[AT]",
    'H':"[ACT]",'B':"[GCT]",'V':"[ACG]",'D':"[AGT]",
    'N':"[ACTG]",
    'A':'A','G':'G','C':'C','T':'T',
}
#Time-Tag Sequences Synthesized by IDT (keys) and the Name you gave them (values).
time_tag_oligos={
    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAATNTHGHGBAAAAAAAAAAAAAAAAAAA":"TTGG",
    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAACDCDTNTBAAAAAAAAAAAAAAAAAAA":"CCTT",
    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGHGHANABAAAAAAAAAAAAAAAAAAA":"GGAA",
    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAATNTDCDCBAAAAAAAAAAAAAAAAAAA":"TTCC",
}
print("Time-Tag Oligos Synthesized by IDT:",time_tag_oligos,sep="\n")
def getReverseCompliment(inputString):
    """Return the reverse complement of ``inputString``.

    The input is read from its last letter to its first and every letter is
    replaced by its entry in the module-level ``letter_dict`` table.

    Raises:
        KeyError: if a letter of ``inputString`` is not a key of ``letter_dict``.
    """
    return "".join(letter_dict[base] for base in reversed(inputString))
    
def getRegexSearchString(inputString):
    """Translate a nucleotide string into a regular-expression pattern.

    Each letter of ``inputString`` is replaced, in order, by its entry in the
    module-level ``regex_dict`` table and the pieces are concatenated.

    Raises:
        KeyError: if a letter of ``inputString`` is not a key of ``regex_dict``.
    """
    return "".join(map(regex_dict.__getitem__,inputString))

def getRegexSearchDictForOligos(oligo_dict):
    """Build the regex -> Time-Tag name lookup for a set of oligos.

    For every oligo sequence in ``oligo_dict`` all leading and trailing 'A'
    characters are stripped, six 'A's are put back on each side, and the result
    is reverse-complemented and converted to a regex pattern. The pattern
    becomes the key and the oligo's name the value of the returned dict.

    Note: ``strip('A')`` removes every 'A' at either end, so a variable region
    that itself begins or ends with a literal 'A' would be shortened.
    """
    flank="A"*6
    return {
        getRegexSearchString(getReverseCompliment(flank+sequence.strip('A')+flank)):name
        for sequence,name in oligo_dict.items()
    }

#Regex pattern -> Time-Tag name lookup shared by the rest of the script; printed for the record.
regexOligoSearchDict=getRegexSearchDictForOligos(time_tag_oligos)
print("We Will be Searching for the Following Strings for Time-Tags:",regexOligoSearchDict,sep="\n")

def convertRegexDictToTimeTagDict(regex_dict):
    """Re-key a dict from regex patterns to Time-Tag names.

    Args:
        regex_dict: mapping whose keys are patterns present in the module-level
            ``regexOligoSearchDict``. Inside this function the parameter hides
            the module-level ``regex_dict`` table of the same name.

    Returns:
        A new dict holding the same values, keyed by
        ``regexOligoSearchDict[pattern]`` instead of ``pattern``.

    Raises:
        KeyError: if a key is not present in ``regexOligoSearchDict``.
    """
    return {regexOligoSearchDict[pattern]:value for pattern,value in regex_dict.items()}

class CellBarcode:
    """Tally of Time-Tag detections for a single Cell Barcode.

    Attributes:
        barcode: the Cell Barcode value this tally belongs to.
        total_time_tag_count: number of regex hits counted so far, summed over
            all patterns.
        time_tag_counts_dict: hit count per pattern of ``regexOligoSearchDict``.
        final_time_tag: Time-Tag name set by ``getFinalTimeTag``; holds a
            placeholder message until then.
    """
    #Barcodes recorded by computeHasEnoughCounts(). The list lives on the class, so it is
    #shared by all instances and calling computeHasEnoughCounts() twice on one instance
    #records its barcode twice.
    CellBarcodesWithEnoughCounts=[]

    def __init__(self,barcode,umi,time_tag):
        """Start a tally for ``barcode`` and count its first ``time_tag`` read.

        ``umi`` is accepted for interface compatibility but is not used.
        """
        self.barcode=barcode
        self.total_time_tag_count=0
        #Initializing Counts for Each Time Tag for a Specific Cell Barcode.
        self.time_tag_counts_dict={key:0 for key in regexOligoSearchDict}
        self.final_time_tag="Too Few Counts. Could not Assign."
        self.update(time_tag)

    def update(self,time_tag):
        """Search ``time_tag`` with every pattern and count each pattern that matches.

        Every pattern is tried, so a string matched by more than one pattern
        increments ``total_time_tag_count`` once per matching pattern.
        """
        for key in regexOligoSearchDict:
            if re.search(key,time_tag) is not None:
                self.total_time_tag_count+=1
                self.time_tag_counts_dict[key]+=1

    def computeHasEnoughCounts(self):
        """Record this barcode in ``CellBarcodesWithEnoughCounts`` if it has enough hits.

        NOTE(review): the comparison is strict, so a barcode with exactly
        MIN_TT_COUNT hits is not recorded - confirm this is intended.
        """
        if self.total_time_tag_count>MIN_TT_COUNT:
            CellBarcode.CellBarcodesWithEnoughCounts.append(self.barcode)

    def getFinalTimeTag(self):
        """Set ``final_time_tag`` to the name of the most frequently detected Time-Tag.

        When the top tag accounts for more than 95% of all hits its name is
        stored as is; otherwise the name is stored followed by a collision
        warning. A barcode with no hits at all keeps the placeholder message.
        """
        if self.total_time_tag_count==0:
            #Nothing was detected: there is no primary tag to report, and the ratio
            #below would otherwise raise ZeroDivisionError.
            self.final_time_tag="Too Few Counts. Could not Assign."
            return
        tag_with_max_counts=max(self.time_tag_counts_dict,key=self.time_tag_counts_dict.get)
        #Fraction (0-1) of all hits that belong to the most frequent tag.
        fraction_detected=self.time_tag_counts_dict[tag_with_max_counts]/self.total_time_tag_count
        if fraction_detected>0.95:
            self.final_time_tag=regexOligoSearchDict[tag_with_max_counts]
        else:
            self.final_time_tag=regexOligoSearchDict[tag_with_max_counts]+" . Time Tag Collision Detected. Primary Tag is present less than 95% of total Time Tags detected."
        

#Cell Barcode (XC tag value) -> CellBarcode tally object.
barcode_dict={}

#Only the first MAX_RECORDS records of the BAM file are tallied. This is a development
#limit: set it to None to process the entire file before finalizing the script.
MAX_RECORDS=3000000

#If you get an error here your file is probably not correctly formatted. Make sure it has a header.
#The with-block closes the BAM file even when a record raises part-way through the loop.
with pysam.AlignmentFile(bamFileName,"rb") as bamFile:
    BamRecords=bamFile.fetch(until_eof=True)

    start_time=datetime.now()
    prevMil=start_time
    print("Started Processing BAM file at",start_time,".")

    total_records=0
    for record in BamRecords:
        #Stop before counting the record, so total_records is exactly the number processed.
        if MAX_RECORDS is not None and total_records>=MAX_RECORDS:
            break

        #For printing progress
        total_records+=1
        if total_records%1000000==0:
            time_taken=datetime.now()-prevMil
            print("Finished processing ",total_records,"\trecords at",datetime.now(),". Previous 1000000 Records took ",time_taken.total_seconds(),"s")
            prevMil=datetime.now()

        #Main Logic
        #NOTE(review): pysam documents get_tag as raising KeyError when a tag is absent, so a
        #record without XC, XM or YT stops the run - confirm every record carries all three.
        cell_barcode=record.get_tag('XC')
        umi=record.get_tag('XM')
        time_tag=record.get_tag('YT')

        if cell_barcode in barcode_dict:
            barcode_dict[cell_barcode].update(time_tag)
        else:
            barcode_dict[cell_barcode]=CellBarcode(cell_barcode,umi,time_tag)

    total_time=datetime.now()-start_time
    print("Finished processing BAM file at ",datetime.now(),". Total time taken ",total_time)

#Let every tallied barcode register itself if it passed the count threshold.
for tally in barcode_dict.values():
    tally.computeHasEnoughCounts()

print(len(CellBarcode.CellBarcodesWithEnoughCounts),"Cell barcodes have enough Time Tags Detected for further processing.")

#Column name -> list of values; three fixed columns followed by one count column per Time-Tag name.
barcodes_dict_df={"CELL BARCODES":[],"TIME TAG COUNTS":[],"FINAL TIME TAG":[]}
for tag_name in time_tag_oligos.values():
    barcodes_dict_df[tag_name]=[]
print(barcodes_dict_df)

#One table row per barcode that passed the threshold.
for passing_barcode in CellBarcode.CellBarcodesWithEnoughCounts:
    tally=barcode_dict[passing_barcode]
    tally.getFinalTimeTag()
    barcodes_dict_df["CELL BARCODES"].append(passing_barcode)
    barcodes_dict_df["TIME TAG COUNTS"].append(tally.total_time_tag_count)
    barcodes_dict_df["FINAL TIME TAG"].append(tally.final_time_tag)
    counts_by_name=convertRegexDictToTimeTagDict(tally.time_tag_counts_dict)
    for tag_name,tag_count in counts_by_name.items():
        barcodes_dict_df[tag_name].append(tag_count)

new_df=pd.DataFrame.from_dict(barcodes_dict_df)

#The table is written tab-separated, without the index column.
new_df.to_csv(outFileName,index=False,sep="\t")

print("Total Execution Time:",datetime.now()-script_start_time)

Time-Tag Oligos Synthesized by IDT:
{'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAATNTHGHGBAAAAAAAAAAAAAAAAAAA': 'TTGG', 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAACDCDTNTBAAAAAAAAAAAAAAAAAAA': 'CCTT', 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGHGHANABAAAAAAAAAAAAAAAAAAA': 'GGAA', 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAATNTDCDCBAAAAAAAAAAAAAAAAAAA': 'TTCC'}
We Will be Searching for the Following Strings for Time-Tags:
{'TTTTTT[ACG]C[AGT]C[AGT]A[ACTG]ATTTTTT': 'TTGG', 'TTTTTT[ACG]A[ACTG]A[ACT]G[ACT]GTTTTTT': 'CCTT', 'TTTTTT[ACG]T[ACTG]T[AGT]C[AGT]CTTTTTT': 'GGAA', 'TTTTTT[ACG]G[ACT]G[ACT]A[ACTG]ATTTTTT': 'TTCC'}
Started Processing BAM file at 2024-01-08 14:30:28.397916 .
Finished processing  1000000 	records at 2024-01-08 14:30:37.279254 . Previous 1000000 Records took  8.881325 s
Finished processing  2000000 	records at 2024-01-08 14:30:46.430056 . Previous 1000000 Records took  9.14936 s
Finished processing  3000000 	records at 2024-01-08 14:30:55.351044 . Previous 1000000 Records took  8.920714 s
Finished processing BAM file at  

In [3]:
#Display the per-barcode summary table built in the previous cell.
new_df

Unnamed: 0,CELL BARCODES,TIME TAG COUNTS,FINAL TIME TAG,TTGG,CCTT,GGAA,TTCC
0,CACCGAAATACG,31,TTGG,30,0,0,1
1,GCCGCTATGGCA,99,TTCC,0,0,0,99
2,ATCCCTTAACGG,45,TTGG,45,0,0,0
3,TTAACATTGCCT,47,TTGG,47,0,0,0
4,AATCCGCCCCGA,103,TTCC,0,0,0,103
...,...,...,...,...,...,...,...
4561,TTAACATTACCT,21,TTGG,21,0,0,0
4562,GGGCAAACTGGT,49,TTGG,49,0,0,0
4563,CCCTTCATGTTT,35,TTGG,35,0,0,0
4564,ACTGACAAGTTA,24,TTGG,24,0,0,0


In [4]:
#Spot check on a single barcode: recompute its final Time-Tag call, then display it.
barcode_dict["CACCGAAATACG"].getFinalTimeTag()
barcode_dict["CACCGAAATACG"].final_time_tag

'TTGG'