In [1]:
import pysam
import os
import sys
from sys import getsizeof
import time
import pandas as pd
import polars as pl

# Put ../src (resolved to an absolute path) on sys.path, but only once, so
# re-running this cell does not append the same entry repeatedly.
directory_path = os.path.abspath(os.path.join('../src/'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    
from read_process import get_contig_lengths_dict,\
incorporate_replaced_pos_info,incorporate_insertions_and_deletions,\
get_positions_from_md_tag,reverse_complement,get_edit_information,get_edit_information_wrapper,\
has_edits,get_total_coverage_for_contig_at_position,\
print_read_info, update_coverage_array, get_read_information, get_hamming_distance, remove_softclipped_bases,find

from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_bam, \
write_read_to_bam_file, remove_file_if_exists, make_folder

import os, psutil


# Preload which barcodes to use...

In [2]:
barcodes_list_path = '/projects/ps-yeolab3/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/cellranger/results/ms_hippo_stamp_EIF4A_batch2/outs/filtered_feature_bc_matrix/barcodes.tsv.gz'

In [3]:
# Read the barcode file as a single column named 'barcodes' and keep the values
# in a set so membership checks are cheap.
barcode_whitelist = set(
    pd.read_csv(barcodes_list_path, names=['barcodes'])['barcodes'].tolist()
)

In [4]:
len(barcode_whitelist)

39200

# ~~~~~~~~~~~~~~~~~~
# Multi-processing enabled
# ~~~~~~~~~~~~~~~~~~

# An example on a full 10x bam

#### In a 10x BAM file, xf=25 means that the read was uniquely mapped to the genome and was used for UMI counting, so we should only look at reads with xf=25 from the 10x BAM.

In [5]:
# Alternative input BAM, kept commented out for reference:
#bampath = '/projects/ps-yeolab5/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/filtered_possorted_ms_hippo_stamp_bam/filtered_keep_xf25_possorted_genome_with_header.bam_MD.bam'
# Input BAM actually used by the rest of the notebook.
bampath = '/projects/ps-yeolab3/ekofman/sailor2/data/groups_0_1_2_3_4_5_6_7_8_9_10_11_merged.bam'


# Open the BAM in binary read mode; later cells read samfile.header and pass
# samfile to get_contig_lengths_dict.
samfile = pysam.AlignmentFile(bampath, "rb")

In [6]:
samfile_header = str(samfile.header)

In [7]:
getsizeof(samfile_header)/1000

19338.323

# Helper functions

In [8]:
def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_whitelist=None, verbose=False):
    """Scan the reads of one contig interval, write their edit rows, and collect the reads per barcode.

    Every read fetched from ``contig:start-end`` is passed to ``get_read_information``.
    Rows for reads that come back without an error code are appended to
    ``<output_folder>/edit_info/<contig>_<split_index>_<start>_<end>_edit_info.tsv``.
    Each processed read is also kept in its SAM text form, with its reference-name
    field (tab-separated field index 2) rewritten to ``<contig>_<barcode>``.

    Parameters:
        bampath: path of the BAM file to read.
        contig, start, end: region handed to ``samfile.fetch``.
        split_index: label of this interval, used only in the output file name.
        output_folder: run folder; its ``edit_info`` subfolder must already exist.
        barcode_whitelist: optional container of accepted "CB" tag values. Reads with
            any other barcode are counted under 'Barcode Filtered' and skipped.
        verbose: forwarded to ``get_read_information``.

    Returns:
        A 5-tuple ``(barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting)``:
        barcode -> newline-joined rewritten reads; number of reads fetched (including
        filtered ones); contig -> barcode -> number of reads that passed the barcode
        filter; contig -> outcome code -> number of reads; and reads-seen -> elapsed
        seconds, sampled every 1000 reads plus once at the end.

    Note: ``EDITED_CODE`` is read from module scope; it is not defined in this function.
    """
    time_reporting = {}
    start_time = time.perf_counter()

    counts = defaultdict(lambda: defaultdict(lambda: 0))
    # Tally kept local to this call, so the returned value describes only this
    # interval instead of whatever a shared module-level dict has accumulated.
    barcodes = defaultdict(lambda: defaultdict(lambda: 0))
    read_lists_for_barcodes = defaultdict(list)
    total_reads = 0

    # Derived from the output_folder argument rather than from a notebook global.
    edit_info_subfolder = '{}/edit_info'.format(output_folder)
    output_file = '{}/{}_{}_{}_{}_edit_info.tsv'.format(edit_info_subfolder, contig, split_index, start, end)
    remove_file_if_exists(output_file)

    # Both handles are context-managed so they are closed even if a read raises.
    with pysam.AlignmentFile(bampath, "rb") as samfile, open(output_file, 'w') as f:
        write_header_to_bam(f)

        for read in samfile.fetch(contig, start, end, multiple_iterators=True):
            total_reads += 1

            if total_reads % 1000 == 0:
                time_reporting[total_reads] = time.perf_counter() - start_time

            barcode = read.get_tag("CB")
            if barcode_whitelist and barcode not in barcode_whitelist:
                counts[contig]['Barcode Filtered'] += 1
                continue

            barcodes[contig][barcode] += 1

            try:
                error_code, list_of_rows, num_edits_of_each_type = get_read_information(read, contig, verbose=verbose)
            except Exception as e:
                # Best effort, as before: report the offending read and stop scanning
                # this interval. The cause is now reported too.
                print("Failed on\n{}".format(read.to_string()))
                print("\t{}: {}".format(type(e).__name__, e))
                break

            if error_code:
                counts[contig][error_code] += 1
            else:
                counts[contig][EDITED_CODE] += 1
                write_rows_to_info_file(list_of_rows, f)

            # Keep the read as SAM text, pointing its reference name at a
            # per-barcode pseudo-contig.
            read_tab_separated = read.to_string().split('\t')
            read_tab_separated[2] = '{}_{}'.format(contig, barcode)
            read_lists_for_barcodes[barcode].append('\t'.join(read_tab_separated))

    # One newline-joined block of reads per barcode.
    barcode_to_concatted_reads = {
        barcode: '\n'.join(read_list)
        for barcode, read_list in read_lists_for_barcodes.items()
    }

    time_reporting[total_reads] = time.perf_counter() - start_time

    return barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting


def find_edits_and_split_bams(bampath, contig, split_index, start, end, output_folder, barcode_whitelist=None, verbose=False):
    """Forward all arguments to find_edits and hand back its five results unchanged."""
    outcome = find_edits(
        bampath, contig, split_index, start, end, output_folder,
        barcode_whitelist=barcode_whitelist,
        verbose=verbose,
    )
    reads_by_barcode, n_reads, barcode_tally, outcome_tally, timings = outcome
    return reads_by_barcode, n_reads, barcode_tally, outcome_tally, timings
    
def find_edits_and_split_bams_wrapper(parameters):
    """Pool entry point: run find_edits_and_split_bams for one job and package the results.

    Parameters:
        parameters: sequence of
            ``(bampath, contig, split_index, start, end, output_folder, barcode_whitelist, verbose)``.

    Returns:
        An 8-tuple
        ``(contig, label, barcode_to_concatted_reads_pl, total_reads, barcodes_df, counts_df, time_df, total_time)``
        where ``label`` is ``'<contig>(<split_index>):<start>-<end>'``.

        On failure the error is printed and a tuple with the same layout is returned,
        carrying empty frames and a read count of 0, so a caller that indexes the
        result by position does not break on one failed job.
    """
    start_time = time.perf_counter()
    # Bound before the try block so the error path can always report and return them,
    # even if unpacking `parameters` itself is what fails.
    contig, label = None, 'unknown'
    try:
        bampath, contig, split_index, start, end, output_folder, barcode_whitelist, verbose = parameters
        label = '{}({}):{}-{}'.format(contig, split_index, start, end)

        barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting = find_edits_and_split_bams(
            bampath, contig, split_index, start, end, output_folder,
            barcode_whitelist=barcode_whitelist,
            verbose=verbose)

        barcodes_df = pd.DataFrame.from_dict(barcodes)
        counts_df = pd.DataFrame.from_dict(counts)
        time_df = pd.DataFrame.from_dict(time_reporting, orient='index')
        if len(barcode_to_concatted_reads) > 0:
            # One row per barcode: columns 'barcode' and 'contents'.
            barcode_to_concatted_reads_pl = pl.from_dict(barcode_to_concatted_reads).transpose(include_header=True, header_name='barcode').rename({"column_0": "contents"})
        else:
            # No transposes are allowed on empty dataframes
            barcode_to_concatted_reads_pl = pl.from_dict(barcode_to_concatted_reads)

        total_time = time.perf_counter() - start_time
        return contig, label, barcode_to_concatted_reads_pl, total_reads, barcodes_df, counts_df, time_df, total_time
    except Exception as e:
        print('Contig {}: {}'.format(label, e))
        total_time = time.perf_counter() - start_time
        return contig, label, pl.DataFrame(), 0, pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), total_time

# Go through every read and identify all edits

In [9]:
from collections import defaultdict
import pandas as pd
#from matplotlib import pyplot as plt
import numpy as np
import time
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm

start_time = time.perf_counter()

print("CPU count: {}".format(multiprocessing.cpu_count()))

output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig'

contig_lengths_dict = get_contig_lengths_dict(samfile)

# Print info?
verbose = False 
EDITED_CODE = 'edited'

# Number of sub-intervals each contig is cut into, so the work can be spread over processes
num_intervals = 16

num_reads_to_coverage_dict_kb = {}
num_reads_to_seconds = {}


# Timing reference used for the progress bookkeeping below
start_time = time.perf_counter()
total_seconds_for_reads = {0: 1}

barcodes = defaultdict(lambda:defaultdict(lambda:0))

# Build one job (a parameter list) per contig interval
jobs = []
for contig in contig_lengths_dict:
    # Skip the 'Stamp' contig and any contig whose name is longer than 5 characters
    if contig == 'Stamp' or len(contig) > 5:
        continue

    print("Contig {}".format(contig))
    contig_length = contig_lengths_dict.get(contig)
    intervals_for_contig = get_intervals(contig, contig_lengths_dict, num_intervals)

    # Make the subfolder in which to store information about edits
    edit_info_subfolder = '{}/edit_info'.format(output_folder)
    make_folder(edit_info_subfolder)

    for interval_number, interval in enumerate(intervals_for_contig):
        split_index = str(interval_number).zfill(3)
        jobs.append([bampath, contig, split_index, interval[0], interval[1], output_folder, barcode_whitelist, verbose])

print("{} total jobs".format(len(jobs)))

# Run the jobs in a pool and collect results as they complete
results = []
overall_total_reads = 0

# contig -> interval label -> per-barcode reads frame
overall_label_to_list_of_contents = defaultdict(dict)

with Pool(processes=16) as pool:
    with tqdm(total=len(jobs)) as progress_bar:
        for job_result in pool.imap_unordered(find_edits_and_split_bams_wrapper, jobs):
            progress_bar.update()

            overall_label_to_list_of_contents[job_result[0]][job_result[1]] = job_result[2]
            results.append([job_result[3], job_result[4], job_result[5], job_result[6], job_result[7]])

            total_reads = job_result[3]
            total_time = time.perf_counter() - start_time

            overall_total_reads += total_reads

            # Cumulative reads processed -> seconds elapsed so far
            total_seconds_for_reads[overall_total_reads] = total_time

overall_time = time.perf_counter() - start_time 


CPU count: 64
Contig 1
Contig 10
Contig 11
Contig 12
Contig 13
Contig 14
Contig 15
Contig 16
Contig 17
Contig 18
Contig 19
Contig 2
Contig 3
Contig 4
Contig 5
Contig 6
Contig 7
Contig 8
Contig 9
Contig MT
Contig X
Contig Y
352 total jobs


100%|██████████████████████████████████████████████████████████████████████████████| 352/352 [00:44<00:00,  7.86it/s]


In [11]:
print("Total time: {} seconds".format(overall_time))
print("Total time: {} minutes".format(overall_time/60))

Total time: 45.14981406927109 seconds
Total time: 0.7524969011545182 minutes


Memory: 230 Gigabytes

# More helper functions

# Combine all of the reads (string representation) for each barcode
## Groups the results from each sub-contig segment above, for example the reads from the first half of chr1 and those from the second half.

In [12]:
print("Overall contigs:\n\n\t", overall_label_to_list_of_contents.keys())
print("\nSubcontig regions for an example contig (1):\n\n\t",sorted(overall_label_to_list_of_contents.get('1').keys()))

Overall contigs:

	 dict_keys(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y'])

Subcontig regions for an example contig (1):

	 ['1(000):0-12216999', '1(001):12216999-24433998', '1(002):24433998-36650997', '1(003):36650997-48867996', '1(004):48867996-61084995', '1(005):61084995-73301994', '1(006):73301994-85518993', '1(007):85518993-97735992', '1(008):97735992-109952991', '1(009):109952991-122169990', '1(010):122169990-134386989', '1(011):134386989-146603988', '1(012):146603988-158820987', '1(013):158820987-171037986', '1(014):171037986-183254985', '1(015):183254985-195471984']


450 GB

### Generate list of jobs to be multiprocessed

In [13]:
def get_job_params(contig, df_dict, header_string):
    """Build the bam-writing jobs for one contig, one job per barcode suffix A/C/G/T.

    Parameters:
        contig: contig name, used for folder and file names.
        df_dict: mapping of subcontig label -> polars frame with columns 'barcode'
            and 'contents', where 'contents' holds newline-joined reads.
        header_string: BAM header text passed through to every job.

    Returns:
        A list of ``[reads_deduped, bam_file_name, header_string]`` entries, or ``[]``
        when there is nothing to concatenate. ``reads_deduped`` keeps the first
        occurrence of each read string, in order.

    Reads ``split_bams_folder`` from module scope and creates
    ``<split_bams_folder>/<contig>_<N>/`` for each suffix.
    """
    # Sort the subcontig regions such that the reads are properly ordered
    sorted_subcontig_dfs = [df_dict.get(name) for name in sorted(df_dict.keys())]
    # Frames without any columns (produced for intervals that yielded no reads)
    # carry nothing and cannot be stacked with the two-column frames.
    sorted_subcontig_dfs = [df for df in sorted_subcontig_dfs if len(df.columns) > 0]

    if len(sorted_subcontig_dfs) == 0:
        print("Empty")
        return []

    print("\t{}: num subcontigs to concat: {}".format(contig, len(sorted_subcontig_dfs)))
    # All of the reads for all of the barcodes are in this dataframe
    print("\t{}: concatting".format(contig))
    all_contents_df = pl.concat(sorted_subcontig_dfs)

    job_params = []
    for n in ["A", "C", "G", "T"]:
        suffix = "{}-1".format(n)

        all_contents_for_suffix = all_contents_df.filter(pl.col('barcode').str.ends_with(suffix))

        # Flatten the newline-joined read blocks row by row and drop repeated reads,
        # keeping the first occurrence. This yields the same sequence as joining every
        # block with "\n" and splitting again, without transposing the frame, and it
        # also works when no barcode matches the suffix (giving an empty list).
        reads_deduped = list(dict.fromkeys(
            read_str
            for contents in all_contents_for_suffix.get_column('contents').to_list()
            for read_str in contents.split('\n')
        ))

        # Make a sub-subfolder to put the bams for this specific contig
        contig_folder = '{}/{}_{}/'.format(split_bams_folder, contig, n)
        os.makedirs(contig_folder, exist_ok=True)

        bam_file_name = '{}/{}_{}.bam'.format(contig_folder, contig, n)

        # Add parameters to list of jobs
        job_params.append([reads_deduped, bam_file_name, header_string])

    return job_params

In [14]:
from collections import OrderedDict
import time

start_time = time.perf_counter()

# The header text of the input bam is reused for each of the split bams
header_string = str(samfile.header)

# Subfolder that will receive the split bams
split_bams_folder = '{}/split_bams'.format(output_folder)
if not os.path.exists(split_bams_folder):
    os.mkdir(split_bams_folder)

num_contigs = 0

all_bam_jobs = []

# Collect the per-contig job lists into one flat list
for contig, df_dict in overall_label_to_list_of_contents.items():
    num_contigs += 1
    job_params = get_job_params(contig, df_dict, header_string)
    all_bam_jobs.extend(job_params)

overall_job_builder_time = time.perf_counter() - start_time 

	1: num subcontigs to concat: 16
	1: concatting
	2: num subcontigs to concat: 16
	2: concatting
	3: num subcontigs to concat: 16
	3: concatting
	4: num subcontigs to concat: 16
	4: concatting
	5: num subcontigs to concat: 16
	5: concatting
	6: num subcontigs to concat: 16
	6: concatting
	7: num subcontigs to concat: 16
	7: concatting
	8: num subcontigs to concat: 16
	8: concatting
	9: num subcontigs to concat: 16
	9: concatting
	10: num subcontigs to concat: 16
	10: concatting
	11: num subcontigs to concat: 16
	11: concatting
	12: num subcontigs to concat: 16
	12: concatting
	13: num subcontigs to concat: 16
	13: concatting
	14: num subcontigs to concat: 16
	14: concatting
	15: num subcontigs to concat: 16
	15: concatting
	16: num subcontigs to concat: 16
	16: concatting
	17: num subcontigs to concat: 16
	17: concatting
	18: num subcontigs to concat: 16
	18: concatting
	19: num subcontigs to concat: 16
	19: concatting


In [15]:
print("Total time to prepare list for multiprocess-writing bams: {} minutes".format(round(overall_job_builder_time/60)))

Total time to prepare list for multiprocess-writing bams: 46 minutes


# Generate bams

In [16]:
#overall_label_to_list_of_contents.clear()

In [None]:
len(all_bam_jobs)

In [18]:
def sort_bam(bam_file_name):
    """Sort a BAM with pysam.sort and return the path of the sorted file.

    The sorted file is written to the input path with ".sorted.bam" appended.
    """
    sorted_path = ''.join((bam_file_name, ".sorted.bam"))
    pysam.sort("-o", sorted_path, bam_file_name)
    return sorted_path


def write_reads_to_file(reads, bam_file_name, header_string):
    """Write SAM-text reads to a new BAM whose header also lists each read's reference name.

    Parameters:
        reads: list of tab-separated SAM text lines; field index 2 is the reference
            name, expected in the form ``<contig>_<barcode>``.
        bam_file_name: path of the BAM file to create.
        header_string: text of the original BAM header.

    For every distinct reference name found in ``reads`` an @SQ entry is appended to
    the header, with the length of the contig it was derived from.

    Raises:
        ValueError: if a reference name maps to a contig that is not in the header.
        RuntimeError: if a read cannot be parsed or written (original error chained).
    """
    header = pysam.AlignmentHeader.from_text(header_string)

    header_dict = header.as_dict()
    header_dict_sq = header_dict.get("SQ")
    lengths_for_sn = {entry.get("SN"): entry.get("LN") for entry in header_dict_sq}

    print("\tCurrent header length for {}: {}".format(bam_file_name, len(lengths_for_sn)))

    # Sorted so that the order of the added header entries is the same on every run
    # (iteration order of a set of strings is not).
    all_barcodes_for_contig = sorted(set(r.split('\t')[2] for r in reads))
    print("\tNum barcodes for {}: {}".format(bam_file_name, len(all_barcodes_for_contig)))

    for new_sn in all_barcodes_for_contig:
        # Strip only the trailing "_<barcode>" so contig names that themselves contain
        # underscores are preserved (assumes barcodes contain no underscore).
        new_sn_chrom = new_sn.rsplit("_", 1)[0]

        new_ln = lengths_for_sn.get(new_sn_chrom)
        if new_ln is None:
            raise ValueError("No length found in header for contig '{}' (from reference name '{}')".format(
                new_sn_chrom, new_sn))
        header_dict_sq.append({"SN": new_sn, "LN": new_ln})

    header_dict['SQ'] = header_dict_sq

    print("\tNew header length: {}".format(len(header_dict.get("SQ"))))

    new_header = pysam.AlignmentHeader.from_dict(header_dict)

    num_reads = len(reads)

    # The with-block closes the handle; no explicit close() is needed afterwards.
    with pysam.AlignmentFile(bam_file_name, "wb", text=str(new_header)) as bam_handle:
        for i, read_str in enumerate(reads):
            if i % 100000 == 0:
                print('file {}: {}/{} reads'.format(bam_file_name.split('/')[-1], i, num_reads))

            try:
                read = pysam.AlignedSegment.fromstring(read_str, new_header)
                bam_handle.write(read)
            except Exception as e:
                # Raise instead of sys.exit(): this runs inside Pool workers, where
                # SystemExit ends the worker process without delivering a result,
                # while a regular exception is passed back to the parent.
                raise RuntimeError('{}\n\nfile {}: Failed to write read with str representation of:\n\t {}'.format(
                    e, bam_file_name.split('/')[-1], read_str)) from e
    
            
def write_reads_to_file_wrapper(parameters):
    """Pool entry point: write one BAM, then sort and index it.

    Parameters:
        parameters: sequence of ``(reads, bam_file_name, header_string)``.

    Errors from writing propagate to the caller. Sorting and indexing are best
    effort: a failure there is printed, together with its cause, and swallowed.
    """
    reads, bam_file_name, header_string = parameters
    write_reads_to_file(reads, bam_file_name, header_string)

    try:
        print("\tSorting {}...".format(bam_file_name))
        sorted_bam_file_name = sort_bam(bam_file_name)
        print("\tIndexing {}...".format(sorted_bam_file_name))
        index_bam(sorted_bam_file_name)

    except Exception as e:
        # Either step may have failed, so the message names both and includes the cause.
        print("Failed at sorting/indexing {}: {}".format(bam_file_name, e))
        

In [None]:
start_time = time.perf_counter()

# Hand every bam job to a pool of 16 worker processes and track completion
with Pool(processes=16) as pool:
    with tqdm(total=len(all_bam_jobs)) as progress_bar:
        for _ in pool.imap_unordered(write_reads_to_file_wrapper, all_bam_jobs):
            progress_bar.update()

total_bam_generation_time = time.perf_counter() - start_time


  0%|                                                                                         | 0/76 [00:00<?, ?it/s]

In [None]:
print("Total time to write bams: {} minutes".format(round(total_bam_generation_time/60)))

# Second loop to get coverage at sites with edits

In [None]:
from glob import glob
import os
import sys
from collections import defaultdict
import pandas as pd
import polars as pl

# Put ../src (resolved to an absolute path) on sys.path if it is not there yet,
# so this section can also be run on its own.
directory_path = os.path.abspath(os.path.join('../src/'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    
from utils import get_edit_info_for_barcode_in_contig_wrapper

output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig'


# One entry per file in <output_folder>/edit_info: the file name up to the first '_edit'
splits = [
    edit_info_path.split("/")[-1].split('_edit')[0]
    for edit_info_path in glob('{}/edit_info/*'.format(output_folder))
]
print("Accessing split bams: {}".format(', '.join(sorted(splits))))

### Gather the edit information generated for each subcontig, and group by contig so we only have 1 edit information dataframe to process per contig

In [None]:
all_edit_info_for_barcodes = []

# "<contig>_<N>" -> list of per-split frames, and (filled in later) the combined frame
edit_info_grouped_per_contig = defaultdict(list)
edit_info_grouped_per_contig_combined = defaultdict(list)

num_splits = len(splits)
print("Grouping edit information outputs by contig...")
for i, split in enumerate(splits):
    if i % 10 == 0:
        print("\t{}/{}...".format(i, num_splits))
    # The contig is the part of the split name before the first underscore
    contig = split.split("_")[0]

    barcode_to_coverage_dict = defaultdict()

    edit_info_file = '{}/edit_info/{}_edit_info.tsv'.format(output_folder, split)
    edit_info_df = pd.read_csv(edit_info_file, sep='\t')
    # Force the numeric columns to int before handing the frame to polars
    for integer_column in ('position', 'base_quality', 'mapping_quality', 'dist_from_end'):
        edit_info_df[integer_column] = edit_info_df[integer_column].astype(int)

    edit_info = pl.from_pandas(edit_info_df)

    # Split the edits by the last base of the barcode (barcodes ending in "<N>-1")
    for n in ["A", "C", "G", "T"]:
        edit_info_grouped_per_contig["{}_{}".format(contig, n)].append(
            edit_info.filter(pl.col("barcode").str.ends_with('{}-1'.format(n)))
        )

    del edit_info_df

print("Done grouping! Concatenating ...")



In [None]:
# Collapse each group's list of frames into a single frame, group by group
edit_info_grouped_per_contig_combined.update(
    (group_name, pl.concat(group_frames))
    for group_name, group_frames in edit_info_grouped_per_contig.items()
)

print("Done concatenating!")

### Get coverage at edit positions for each contig

##### Merge across contigs for each barcode???

In [None]:
import pandas as pd

pd.options.mode.chained_assignment = None 


def get_job_params_for_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, output_folder, min_base_quality=15):
    """Build one coverage-counting job per group of edits, after a base quality filter.

    Parameters:
        edit_info_grouped_per_contig_combined: mapping of group name -> polars frame
            of edits with a 'base_quality' column.
        output_folder: passed through unchanged into every job.
        min_base_quality: edits are kept only when their base quality is strictly
            greater than this value. Defaults to 15, the previously hard-coded cutoff.

    Returns:
        A list of ``[edit_info, contig, output_folder]`` entries, one per group, where
        ``edit_info`` is the filtered frame.
    """
    job_params = []

    for contig, edit_info in edit_info_grouped_per_contig_combined.items():
        print(contig)
        print('Num edits pre filter: {}'.format(len(edit_info)))
        edit_info = edit_info.filter(pl.col("base_quality") > min_base_quality)
        print('\tNum edits post filter: {}'.format(len(edit_info)))

        job_params.append([edit_info, contig, output_folder])
    return job_params
    
coverage_counting_job_params = get_job_params_for_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, 
                                                                output_folder)
len(coverage_counting_job_params)

### This runs at a rate of about 1 item per second, which would take several hours...

In [None]:
#edit_info_plus_coverage_df = get_edit_info_for_barcode_in_contig_wrapper(coverage_counting_job_params[0])

In [None]:
from multiprocessing import get_context
import time
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm
import numpy as np

start_time = time.perf_counter()

results = []
# Spawn has to be used instead of the default fork when using the polars library
with get_context("spawn").Pool(processes=16) as pool:
    with tqdm(total=len(coverage_counting_job_params)) as progress_bar:
        for job_result in pool.imap_unordered(get_edit_info_for_barcode_in_contig_wrapper, coverage_counting_job_params):
            progress_bar.update()
            results.append(job_result)

total_time = time.perf_counter() - start_time

In [None]:
print(total_time)

In [None]:
all_edit_info = pd.concat(results)

In [None]:
total_time/60

In [None]:
all_edit_info.to_csv('{}/all_edit_info.tsv'.format(output_folder), sep='\t')

# Group by site to get final total edit and coverage counts at each site

In [None]:
all_edit_info.head()


# Verify C>T ratios

In [None]:
all_edit_info.groupby(['ref', 'alt']).count()


In [None]:
base_quality_thresh = 15

# Titles are set on the Axes returned by pandas' .plot(), so this cell does not
# depend on a pyplot import (the notebook never imports matplotlib.pyplot as plt).
all_edits_ax = all_edit_info.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
all_edits_ax.set_title("All edits")

# .copy() gives later cells an independent frame to add columns to, rather than
# a slice of all_edit_info.
all_edit_info_filtered = all_edit_info[all_edit_info.base_quality > base_quality_thresh].copy()

filtered_edits_ax = all_edit_info_filtered.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
filtered_edits_ax.set_title("Edits with base quality > {}".format(base_quality_thresh));


In [None]:
# C>T edits among the quality-filtered edits, ordered by position
is_c_to_t = (all_edit_info_filtered.ref == 'C') & (all_edit_info_filtered.alt == 'T')
example_new_ct = all_edit_info_filtered[is_c_to_t].sort_values('position')

In [None]:
len(example_new_ct)

In [None]:
example_new_ct

# Cells that express STAMP versus cells that don't...?

In [None]:
stamp_expression_path = \
'/projects/ps-yeolab3/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_combined/\
4.1_cells_with_middling_stamp/stamp_expression_for_all_used_cells.tsv'

In [None]:
stamp_expression_df = pd.read_csv(stamp_expression_path, sep='\t', index_col=0)

In [None]:
stamp_expression_df.Stamp.hist(bins=50)

In [None]:
# Add an 'edit' column holding the "<ref>><alt>" label, e.g. "C>T"
all_edit_info_filtered = all_edit_info_filtered.assign(
    edit=all_edit_info_filtered['ref'] + '>' + all_edit_info_filtered['alt']
)

In [None]:
# For each minimum STAMP expression level, plot the edit type distribution of the
# cells above it and record the fraction of their edits that are C>T.
fractions_ct = []
threshs = [0, 0.5, 1, 2, 3, 4, 5, 6, 6.5, 6.6]
for thresh in threshs:
    print(thresh)
    barcodes_at_stamp_thresh = stamp_expression_df[stamp_expression_df.Stamp > thresh].index

    all_edit_info_filtered_in_stamp_level = all_edit_info_filtered[
        all_edit_info_filtered.barcode.isin(barcodes_at_stamp_thresh)]

    num_edits_in_stamp_level = len(all_edit_info_filtered_in_stamp_level)
    if num_edits_in_stamp_level == 0:
        # Nothing to plot and no fraction to compute; NaN keeps fractions_ct
        # aligned with threshs instead of dividing by zero.
        fractions_ct.append(float('nan'))
        continue

    # Title is set on the Axes returned by .plot(); plt is not imported in this notebook.
    ax = all_edit_info_filtered_in_stamp_level.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
    ax.set_title("Edit Type Distribution for Cells with STAMP expression above {}".format(thresh))

    fraction_ct = len(all_edit_info_filtered_in_stamp_level[all_edit_info_filtered_in_stamp_level['edit'] == 'C>T'])/num_edits_in_stamp_level
    fractions_ct.append(fraction_ct)
    
    

In [None]:
# Line plot drawn through pandas, which returns the Axes to label; this avoids
# the undefined `plt` name (matplotlib.pyplot is never imported in this notebook).
ax = pd.Series(fractions_ct, index=threshs).plot()
ax.set_ylabel("Fraction of total edits that are C>T")
ax.set_xlabel("STAMP expression minimum")
ax.set_title("Enrichment for C>T edits within cells filtered by STAMP threshold");

In [None]:
# For each maximum STAMP expression level, plot the edit type distribution of the
# cells below it and record the fraction of their edits that are C>T.
fractions_ct_low = []
threshs = [1, 2, 3, 4, 5, 6, 6.5, 6.6]
for thresh in threshs:
    print(thresh)
    barcodes_at_stamp_thresh = stamp_expression_df[stamp_expression_df.Stamp < thresh].index

    all_edit_info_filtered_in_stamp_level = all_edit_info_filtered[
        all_edit_info_filtered.barcode.isin(barcodes_at_stamp_thresh)]

    num_edits_in_stamp_level = len(all_edit_info_filtered_in_stamp_level)
    if num_edits_in_stamp_level == 0:
        # Nothing to plot and no fraction to compute; NaN keeps fractions_ct_low
        # aligned with threshs instead of dividing by zero.
        fractions_ct_low.append(float('nan'))
        continue

    # Title is set on the Axes returned by .plot(); plt is not imported in this notebook.
    ax = all_edit_info_filtered_in_stamp_level.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
    ax.set_title("Edit Type Distribution for Cells with STAMP expression below {}".format(thresh))

    fraction_ct = len(all_edit_info_filtered_in_stamp_level[all_edit_info_filtered_in_stamp_level['edit'] == 'C>T'])/num_edits_in_stamp_level
    fractions_ct_low.append(fraction_ct)

In [None]:
# Line plot drawn through pandas, which returns the Axes to label; this avoids
# the undefined `plt` name (matplotlib.pyplot is never imported in this notebook).
ax = pd.Series(fractions_ct_low, index=threshs).plot()
ax.set_ylabel("Fraction of total edits that are C>T")
ax.set_xlabel("STAMP expression maximum")
ax.set_title("Enrichment for C>T edits within cells filtered by STAMP threshold");