In [1]:
import pysam
import pandas as pd
import operator
from datetime import datetime,timedelta
import pygtrie

In [2]:
# --- Input files ---
summaryFileName = "/home/kanishk/mixedHumanMouse/star_gene_exon_tagged.dge_Long.summary.txt"
bamFileName = "/home/kanishk/mixedHumanMouse/star_gene_exon_tagged_Long.bam"
dgeFileName = "/home/kanishk/mixedHumanMouse/star_gene_exon_tagged_Long.dge.txt.gz"

# --- Output prefix: every file written by this notebook is named outfilepath + suffix ---
outfilepath = "/home/kanishk/chrono-seq-tools/dge.summary"

# --- Tunable parameters ---
min_transcripts = 100  # barcodes must have strictly more transcripts than this to be kept
num_cell_bases_missing = 3  # how many trailing barcode positions are checked for synthesis errors
cell_barcode_length = 12  # length of a cell barcode, in bases

In [3]:
#Load the tab-separated summary table; header=5 makes pandas take the column names from row 5 (0-based)
summaryFile=pd.read_csv(summaryFileName,sep="\t",header=5)
barcodes_ordered_by_num_of_transcripts=summaryFile["CELL_BARCODE"]

In [4]:
#Keep only the rows whose transcript count is strictly greater than min_transcripts
has_enough_transcripts=summaryFile["NUM_TRANSCRIPTS"]>min_transcripts
barcodesWithEnoughTranscripts=summaryFile.loc[has_enough_transcripts]

In [5]:
#Order the retained barcodes by genic read count (largest first), then save them without header or index
ranked_cells=barcodesWithEnoughTranscripts.sort_values(by="NUM_GENIC_READS",ascending=False)
barcodes=ranked_cells["CELL_BARCODE"]
barcodes.to_csv(outfilepath+".barcodesAboveCuttoff.txt",header=None,index=False)

In [6]:
#Open the BAM for reading with the sequence-header check disabled (check_sq=False)
bamFile=pysam.AlignmentFile(bamFileName,mode="rb",check_sq=False)
#until_eof=True asks for every record through to the end of the file
BamRecords=bamFile.fetch(until_eof=True)

In [7]:
#Load the tab-separated digital gene expression table (consensus_time_tag looks barcodes up as columns)
dge=pd.read_csv(dgeFileName,sep="\t")

In [8]:
#Objects from this class will form a Key:Value pair in a dictionary with barcodes we selected
class consensus_time_tag:
    """Accumulates per-position base counts of the time tags seen for one cell barcode.

    Instances are meant to be stored as the values of a barcode -> tag mapping.

    Class attributes:
        tag_length: number of leading tag positions that are counted.
        umi_last_base: not referenced anywhere inside this class.
        gene_expression_matrix: copy of the module-level ``dge`` table, taken once
            when the class body is executed (``dge`` must already exist).
    """
    #Common Values for entire class
    tag_length=10
    umi_last_base=6
    gene_expression_matrix=dge.copy()

    def __init__(self,barcode=""):
        """Create an empty counter, optionally bound to a cell barcode.

        Args:
            barcode: column label in ``gene_expression_matrix``. When empty, no
                column is looked up and ``gene_expression_values`` stays 0.

        Raises:
            KeyError: if a non-empty barcode is not a column of the matrix.
        """
        #Associating Complete Gene Expression Values for a Cell Barcode and time tag in the object
        self.barcode=barcode
        self.gene_expression_values=0
        if len(self.barcode)!=0:
            column_position=self.gene_expression_matrix.columns.get_loc(barcode)
            self.gene_expression_values=self.gene_expression_matrix.iloc[:,column_position]
        #One count dictionary per tag position
        self.consensusBaseMap=[{'A':0,'T':0,'G':0,'C':0,'N':0} for _ in range(self.tag_length)]
        self.baseProbabilityMap={'A':[],'T':[],'G':[],'C':[],'N':[]}
        self.baseProbabilityMapStrings={'A':"",'T':"",'G':"",'C':"",'N':""}

    def update(self,tag):
        """Add one observed tag to the per-position counts.

        Only the first ``tag_length`` characters of ``tag`` are read.

        Raises:
            IndexError: if ``tag`` is shorter than ``tag_length``.
            KeyError: if a character is not one of A, T, G, C, N.
        """
        for i in range(self.tag_length):
            self.consensusBaseMap[i][tag[i]]+=1

    def get_consensus_tag(self):
        """Return the string made of the most counted base at every position.

        Ties go to the base that comes first in the count dictionary (A, T, G, C, N),
        so a counter that has seen no tags returns all 'A'.
        """
        return "".join(
            max(position_counts.items(),key=operator.itemgetter(1))[0]
            for position_counts in self.consensusBaseMap[:self.tag_length]
        )

    def get_total_reads_per_tag(self):
        """Return the number of tags counted, measured at the first tag position."""
        return sum(self.consensusBaseMap[0].values())

    def get_base_percentage(self,base):
        """Compute the fraction of ``base`` at every position and cache the result.

        The fractions are stored in ``baseProbabilityMap[base]`` and their
        two-decimal, space-terminated rendering in ``baseProbabilityMapStrings[base]``.
        A position with no counts yields 0.0 instead of dividing by zero.
        """
        fractions=[]
        for i in range(self.tag_length):
            total=sum(self.consensusBaseMap[i].values())
            total_for_base=self.consensusBaseMap[i][base]
            fractions.append(total_for_base/total if total else 0.0)
        #Replace rather than append, so calling this more than once does not grow the list
        self.baseProbabilityMap[base]=fractions
        self.baseProbabilityMapStrings[base]="".join("%.2f " % fraction for fraction in fractions)

    def get_base_percentage_per_base(self,base):
        """Recompute and return the per-position fraction string for ``base``."""
        self.get_base_percentage(base)
        return self.baseProbabilityMapStrings[base]

    def calculate_base_percentages(self):
        """Refresh the cached fractions for all five bases."""
        for base in "ATGCN":
            self.get_base_percentage(base)

    def combine(self,tag2):
        """Return a new tag holding the sum of this tag and ``tag2``.

        Base counts are added position by position and ``gene_expression_values``
        are added together. The result carries ``tag2``'s barcode; neither input
        is modified.
        """
        combined_tag=consensus_time_tag()
        #Combining Base Maps
        for i in range(self.tag_length):
            for base in "ATGCN":
                combined_tag.consensusBaseMap[i][base]=self.consensusBaseMap[i][base]+tag2.consensusBaseMap[i][base]
        combined_tag.gene_expression_values=self.gene_expression_values+tag2.gene_expression_values
        combined_tag.barcode=tag2.barcode
        return combined_tag

In [9]:
#Barcode lookup: a character trie mapping each selected barcode to its own consensus_time_tag object.
barcode_dict=pygtrie.CharTrie()
barcode_dict.update((selected_barcode,consensus_time_tag(selected_barcode)) for selected_barcode in barcodes)

In [10]:
#Query BAM file for our selected Barcodes. If the Barcodes match then we update the time-tag counts
progress_interval=1000000 #Number of records between two progress messages
start_time=datetime.now()
prev_checkpoint=start_time
print("Started Processing BAM file to get Time Tags for the Cell Barcodes at ",start_time)
total_records=0
records_missing_tags=0
for record in BamRecords:
    total_records+=1
    if total_records%progress_interval==0:
        #Take the timestamp once so the printed time and the next interval start agree
        now=datetime.now()
        time_taken=now-prev_checkpoint
        print("Finished processing ",total_records,"\trecords at",now,". Previous",progress_interval,"Records took ",time_taken.total_seconds(),"s")
        prev_checkpoint=now
    #get_tag raises KeyError on an absent tag; skip and count such records instead of aborting the scan
    if not (record.has_tag('YT') and record.has_tag('XC')):
        records_missing_tags+=1
        continue
    time_tag=record.get_tag('YT')
    cell_barcode=record.get_tag('XC')
    if cell_barcode in barcode_dict:
        barcode_dict[cell_barcode].update(time_tag)
total_time=datetime.now()-start_time
print("Finished processing BAM file at ",datetime.now(),". Total time taken ",total_time)
if records_missing_tags:
    print("Skipped ",records_missing_tags," records without a YT or XC tag")

Started Processing BAM file to get Time Tags for the Cell Barcodes at  2021-01-30 23:28:47.889478


KeyboardInterrupt: 

In [None]:
#Compute the consensus tags
concensus_tags=[]
time_tag_stats=[]
for cell_barcode in barcodes:
    tag_counter=barcode_dict[cell_barcode] #Look the counter up once per barcode
    tagBarcodePair=[cell_barcode,tag_counter.get_consensus_tag()]
    #Exact equality: str.contains would treat the barcode as a regex and also match substrings
    summary_rows=summaryFile.loc[summaryFile.CELL_BARCODE==cell_barcode]
    #Every column after the first one, taken from the first matching row
    summary_vals=summary_rows.iloc[:,1:].values[0].tolist()
    concensus_tags.append(tagBarcodePair+summary_vals)
    #Total read count followed by the per-position fraction strings for A, T, G, C, N (in that order)
    additional_tag_stats=[tag_counter.get_total_reads_per_tag()]
    additional_tag_stats+=[tag_counter.get_base_percentage_per_base(base) for base in "ATGCN"]
    time_tag_stats.append(tagBarcodePair+additional_tag_stats)

In [None]:
print("Writing Initial Statistics for Time Tags to Files..")
#Column layouts of the two result tables; they must line up with the rows collected in the lists
summary_columns=["CELL_BARCODE","TIME_TAG","NUM_GENIC_READS","NUM_TRANSCRIPTS","NUM_GENES"]
stats_columns=["CELL_BARCODE","TIME_TAG","TOTAL_COUNTS","A%","T%","G%","C%","N%"]
concensus_tag_df=pd.DataFrame(data=concensus_tags,columns=summary_columns)
time_tag_stats_df=pd.DataFrame(data=time_tag_stats,columns=stats_columns)

In [13]:
#Both tables are written tab-separated, without the index column, as outfilepath + ".<suffix>"
concensus_tag_df.to_csv(outfilepath+".time_tags.txt",sep="\t",index=False)
#The "." separator was missing here, which glued the suffix straight onto the prefix
time_tag_stats_df.to_csv(outfilepath+".detailed_infor_time_tags.txt",sep="\t",index=False)

In [14]:
#Raise the row display limit above the table length so the whole table is rendered
pd.options.display.max_rows=len(time_tag_stats_df)+1
time_tag_stats_df

Unnamed: 0,CELL_BARCODE,TIME_TAG,TOTAL_COUNTS,A%,T%,G%,C%,N%
0,CCGATTAAAGGC,GGGGAATTTT,15995138,0.23 0.23 0.23 0.25 0.84 0.92 0.19 0.01 0.00 0...,0.25 0.23 0.24 0.23 0.05 0.07 0.80 0.99 0.99 0...,0.31 0.31 0.34 0.35 0.07 0.00 0.00 0.00 0.00 0...,0.20 0.23 0.18 0.17 0.05 0.00 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
1,AAAGAGCCATCT,CCGGGGTTTT,15579101,0.25 0.25 0.23 0.22 0.22 0.20 0.00 0.00 0.00 0...,0.24 0.24 0.15 0.15 0.16 0.17 0.99 1.00 1.00 0...,0.25 0.25 0.37 0.41 0.41 0.44 0.00 0.00 0.00 0...,0.25 0.26 0.24 0.22 0.21 0.20 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
2,TTCGCAAGCCTA,AGAGGTTTTT,10859576,0.26 0.25 0.27 0.24 0.24 0.23 0.00 0.00 0.00 0...,0.25 0.24 0.22 0.21 0.20 0.29 0.99 1.00 1.00 0...,0.24 0.27 0.24 0.32 0.35 0.24 0.00 0.00 0.00 0...,0.25 0.24 0.27 0.23 0.21 0.24 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
3,ACAACAAATATG,GGGGGGAAAT,10124750,0.23 0.24 0.23 0.21 0.20 0.31 0.99 0.95 0.68 0...,0.26 0.24 0.25 0.24 0.27 0.23 0.01 0.04 0.31 0...,0.31 0.30 0.34 0.33 0.31 0.31 0.00 0.00 0.00 0...,0.20 0.22 0.18 0.22 0.21 0.15 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
4,GTCACCAATCCG,GGGGGGAAAT,8664346,0.24 0.24 0.23 0.23 0.21 0.31 1.00 0.99 0.82 0...,0.24 0.24 0.24 0.24 0.24 0.21 0.00 0.01 0.18 0...,0.29 0.35 0.33 0.34 0.31 0.32 0.00 0.00 0.00 0...,0.23 0.17 0.20 0.19 0.24 0.16 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
5,AGGAACTATTTG,GGGGGGTTTT,9153451,0.25 0.23 0.22 0.20 0.18 0.17 0.01 0.01 0.01 0...,0.19 0.16 0.12 0.11 0.10 0.13 0.98 0.99 0.99 0...,0.33 0.40 0.45 0.51 0.55 0.52 0.01 0.00 0.00 0...,0.22 0.21 0.22 0.19 0.17 0.18 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
6,GTTAAACGGGTT,GGGAAAATTT,7733308,0.24 0.22 0.21 0.34 0.90 0.95 0.76 0.11 0.01 0...,0.24 0.27 0.25 0.21 0.04 0.03 0.22 0.88 0.97 0...,0.31 0.29 0.31 0.25 0.04 0.01 0.01 0.01 0.01 0...,0.22 0.22 0.23 0.20 0.03 0.01 0.01 0.01 0.01 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
7,AACACCTGCTTG,TTGGGGTTTT,7278824,0.25 0.24 0.22 0.19 0.19 0.17 0.00 0.00 0.00 0...,0.26 0.26 0.15 0.11 0.10 0.13 1.00 1.00 1.00 0...,0.24 0.25 0.39 0.49 0.52 0.51 0.00 0.00 0.00 0...,0.25 0.25 0.24 0.20 0.19 0.19 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
8,CTAATCGCACAG,GGGGGGTTTT,6525155,0.25 0.25 0.17 0.16 0.15 0.13 0.00 0.00 0.00 0...,0.14 0.14 0.08 0.07 0.07 0.09 0.99 0.99 0.99 0...,0.38 0.38 0.54 0.61 0.65 0.63 0.00 0.00 0.00 0...,0.22 0.23 0.21 0.16 0.14 0.15 0.00 0.00 0.00 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...
9,TGTAGATTTCGC,GGGGTAAATT,5160524,0.23 0.23 0.24 0.22 0.24 0.96 0.97 0.91 0.04 0...,0.25 0.26 0.26 0.27 0.29 0.02 0.02 0.08 0.95 0...,0.30 0.28 0.32 0.31 0.23 0.01 0.00 0.01 0.01 0...,0.22 0.23 0.18 0.20 0.24 0.01 0.01 0.01 0.01 0...,0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0...


In [15]:
print("Checking for Cell Barcode Synthesis Errors..")
#Walk backwards over the last num_cell_bases_missing positions of the barcode (0-based indices)
last_index=cell_barcode_length-1
for base_position in range(last_index,last_index-num_cell_bases_missing,-1):
    print("Checking for Synthesis Errors in Cell Barcode at Base Position ",base_position+1)
    barcodes_to_be_combined=[]
    #Snapshot of the keys, because the trie itself is modified inside the loop below
    current_barcodes=list(barcode_dict.keys())
    for cell_barcode in current_barcodes:
        prefix=cell_barcode[:base_position]
        try:
            members=barcode_dict.keys(prefix)
            member_items=barcode_dict.items(prefix)
        except KeyError:
            #The prefix is gone: its barcodes were already removed earlier in this pass
            continue
        if len(members)<=1:
            continue
        #Several barcodes share this prefix: remember them and remove them from the main trie
        print(prefix," has ",len(members)," barcodes associated with it.")
        barcodes_to_be_combined.append(member_items)
        del barcode_dict[prefix:]

    for member_items in barcodes_to_be_combined:
        merged_tag=consensus_time_tag()
        for _,member_tag in member_items:
            merged_tag=merged_tag.combine(member_tag)
        #Mask the position under test with N and store the merged tag under that barcode
        masked_bases=list(merged_tag.barcode)
        masked_bases[base_position]="N"
        merged_tag.barcode="".join(masked_bases)
        barcode_dict[merged_tag.barcode]=merged_tag

Checking for Cell Barcode Synthesis Errors..
Checking for Synthesis Errors in Cell Barcode at Base Position  12
CCAGCTGGCGG  has  4  barcodes associated with it.
CATAGTCAAAT  has  4  barcodes associated with it.
AAGGTCCTACC  has  4  barcodes associated with it.
ACTCTTCGTAC  has  4  barcodes associated with it.
ATCTAACCTCA  has  4  barcodes associated with it.
ATAGAACATCG  has  4  barcodes associated with it.
ATAGAACATCT  has  4  barcodes associated with it.
ATAGAACATCA  has  4  barcodes associated with it.
ATAGAACATCC  has  4  barcodes associated with it.
GTCTAGGTCCT  has  2  barcodes associated with it.
Checking for Synthesis Errors in Cell Barcode at Base Position  11
ATAGAACATC  has  4  barcodes associated with it.
GTCTAGGTCC  has  4  barcodes associated with it.
Checking for Synthesis Errors in Cell Barcode at Base Position  10
CCTGTAATG  has  2  barcodes associated with it.
ACAACAAAT  has  2  barcodes associated with it.
TTATTGGGC  has  2  barcodes associated with it.


In [16]:
#Release the BAM file handle; BamRecords comes from this file object, so finish reading before this runs
bamFile.close()

In [17]:
#Count each time tag by exact value once; str.contains would do a regex substring match per tag
tag_counts=concensus_tag_df.TIME_TAG.value_counts()
#Iterate over unique() to keep the order of first appearance in the table
for tag in concensus_tag_df.TIME_TAG.unique():
    print(tag,tag_counts.loc[tag])

GGGGAATTTT 3
CCGGGGTTTT 2
AGAGGTTTTT 1
GGGGGGAAAT 10
GGGGGGTTTT 19
GGGAAAATTT 9
TTGGGGTTTT 1
GGGGTAAATT 1
GGGGAAATTT 12
AGGGGGTTTT 2
GGTGTTTTTT 1
GGGGGAAAAT 6
GGGGGGAATT 1
GGAAAATTTT 23
GGGGGAAATT 5
GGAGGGTTTT 1
GGAGGTTTTT 1
GGGGAAAATT 7
GAAAATTTTT 3
GGGGGAATTT 2
GAAATTTTTT 1
GGCCCTTTTT 1
GGAAATTTTT 1
GGGGGAACAT 21
TGAGGTTTTT 1
GTGGGAACTT 1
GGTGAAACAT 1
GGGGGAACTT 1
GGGGGGACAT 1
GGAGGAACAT 1
GAAAAATTTT 1
TGGGGGTTTT 1
AAGGGGTTTT 1
GGTGGGACAT 1
CGGGGAACAT 1
GTATTTTTTT 1
AGGGGATTTT 1
ACCGGCTTTT 1
