In [20]:
import pysam
import os
import sys
from sys import getsizeof
import time
import pandas as pd

directory_path = os.path.abspath(os.path.join('../src/'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    
from read_process import get_contig_lengths_dict,\
incorporate_replaced_pos_info,incorporate_insertions_and_deletions,\
get_positions_from_md_tag,reverse_complement,get_edit_information,get_edit_information_wrapper,\
has_edits,get_total_coverage_for_contig_at_position,\
print_read_info, update_coverage_array, get_read_information, get_hamming_distance, remove_softclipped_bases,find

from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_bam, \
write_read_to_bam_file, remove_file_if_exists, make_folder

import os, psutil


# Preload which barcodes to use...

In [21]:
barcodes_list_path = '/projects/ps-yeolab3/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/cellranger/results/ms_hippo_stamp_EIF4A_batch2/outs/filtered_feature_bc_matrix/barcodes.tsv.gz'

In [22]:
barcode_whitelist = set(pd.read_csv(barcodes_list_path, names=['barcodes']).barcodes.tolist())

In [23]:
len(barcode_whitelist)

39200

# ~~~~~~~~~~~~~~~~~~
# Multi-processing enabled
# ~~~~~~~~~~~~~~~~~~

# An example on a full 10x bam

#### in 10X's bam file, xf=25 means that read is uniquely mapped to a genome, and was used for counting UMI. So we should only look at reads with xf=25 from the 10X bam.

In [24]:
#bampath = '/projects/ps-yeolab5/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/filtered_possorted_ms_hippo_stamp_bam/filtered_keep_xf25_possorted_genome_with_header.bam_MD.bam'
bampath = '/projects/ps-yeolab3/ekofman/sailor2/data/groups_0_1_2_3_4_5_6_7_8_9_10_11_merged.bam'


samfile = pysam.AlignmentFile(bampath, "rb")

In [25]:
samfile_header = str(samfile.header)

In [26]:
getsizeof(samfile_header)/1000

19338.323

# Helper functions

In [27]:
def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_whitelist=None, verbose=False):  
    time_reporting = {}
    start_time = time.perf_counter()
    
    samfile = pysam.AlignmentFile(bampath, "rb")
        
    counts = defaultdict(lambda:defaultdict(lambda:0))
    total_reads = 0
    
    bam_handles_for_barcodes = {}
    read_lists_for_barcodes = defaultdict(lambda:[])
    
    reads_for_contig = samfile.fetch(contig, start, end, multiple_iterators=True)

    output_file = '{}/{}_{}_{}_{}_edit_info.tsv'.format(edit_info_subfolder, contig, split_index, start, end)
    remove_file_if_exists(output_file)

    with open(output_file, 'w') as f:        
        write_header_to_bam(f)

        for i, read in enumerate(reads_for_contig):
            total_reads += 1
            
            if total_reads % 1000 == 0:
                time_reporting[total_reads] = time.perf_counter() - start_time

            barcode = read.get_tag("CB")
            if barcode_whitelist:
                if barcode not in barcode_whitelist:
                    counts[contig]['Barcode Filtered'] += 1
                    continue
                
            barcodes[contig][barcode] += 1

            verbose = False
            
            try:
                error_code, list_of_rows, num_edits_of_each_type = get_read_information(read, contig, verbose=verbose)
            except Exception as e:
                print("Failed on\n{}".format(read.to_string()))
                break
                
            if error_code:
                counts[contig][error_code] += 1
            else:
                counts[contig][EDITED_CODE] += 1
                write_rows_to_info_file(list_of_rows, f)
            
            # Store each read using its string representation
            read_as_string = read.to_string()
                    
            read_lists_for_barcodes[barcode].append(read_as_string)
            
    
    # Add all reads to dictionary for contig and barcode, in their string representation
    num_barcodes = 0
    total_bams = len(read_lists_for_barcodes)
    
    
    barcode_to_concatted_reads = {}
    for barcode, read_list in read_lists_for_barcodes.items():
        num_barcodes += 1
        if num_barcodes % 100 == 0:
            #print('{}/{} processed'.format(num_barcodes, total_bams))
            pass
        # Concatenate the string representations of all reads for each bam-contig combination
        all_reads_concatted = '\n'.join(read_list)
            
        # Save this concatenated block of text to dictionary
        barcode_to_concatted_reads[barcode] = all_reads_concatted
        
    time_reporting[total_reads] = time.perf_counter() - start_time
    
    samfile.close()
    
    return barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting


def find_edits_and_split_bams(bampath, contig, split_index, start, end, output_folder, barcode_whitelist=None, verbose=False):
    barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting = find_edits(bampath, contig, split_index,
                                                                         start, end, output_folder, barcode_whitelist=barcode_whitelist, verbose=verbose)    
    return barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting
    
def find_edits_and_split_bams_wrapper(parameters):
    try:
        start_time = time.perf_counter()
        bampath, contig, split_index, start, end, output_folder, barcode_whitelist, verbose = parameters
        label = '{}({}):{}-{}'.format(contig, split_index, start, end)

        #print("{} ({}):{}-{}\tfind_edits_and_split_bams".format(contig, split_index, start, end))
        barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting = find_edits_and_split_bams(bampath, contig, split_index, start, end,                                                                                        
                                                                                                              output_folder, 
                                                                                                              barcode_whitelist=barcode_whitelist,
                                                                                                              verbose=False)
        barcodes_df = pd.DataFrame.from_dict(barcodes)
        counts_df = pd.DataFrame.from_dict(counts)
        time_df = pd.DataFrame.from_dict(time_reporting, orient='index')
        barcode_to_concatted_reads_df = pd.DataFrame.from_dict(barcode_to_concatted_reads, orient='index')
        
        total_time = time.perf_counter() - start_time
        return barcode_to_concatted_reads_df, total_reads, barcodes_df, label, counts_df, time_df, total_time
    except Exception as e:
        print('Contig {}: {}'.format(label, e))
        return 0, pd.DataFrame(), label, pd.DataFrame()

# Go through every read and identify all edits

In [28]:
from collections import defaultdict
import pandas as pd
#from matplotlib import pyplot as plt
import numpy as np
import time
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm

start_time = time.perf_counter()

print("CPU count: {}".format(multiprocessing.cpu_count()))

output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem'

contig_lengths_dict = get_contig_lengths_dict(samfile)

# Print info?
verbose = False 
EDITED_CODE = 'edited'

# How many subcontigs to split each contig into to leverage multi-processing
num_intervals = 16

num_reads_to_coverage_dict_kb = {}
num_reads_to_seconds = {}


start_time = time.perf_counter()
total_seconds_for_reads = {0: 1}

barcodes = defaultdict(lambda:defaultdict(lambda:0))

jobs = []
for contig in contig_lengths_dict.keys():
    # Skip useless contigs
    if len(contig) > 5 or contig == 'Stamp':# or contig != '17':
        continue
        
    print("Contig {}".format(contig))
    contig_length = contig_lengths_dict.get(contig)
    intervals_for_contig = get_intervals(contig, contig_lengths_dict, num_intervals)
    
    # Make subfolder in which to information about edits
    edit_info_subfolder = '{}/edit_info'.format(output_folder)
    make_folder(edit_info_subfolder)
        
    # Set up for pool
    for split_index, interval in enumerate(intervals_for_contig):
        split_index = str(split_index).zfill(3)
        parameters = [bampath, contig, split_index, interval[0], interval[1], output_folder, barcode_whitelist, verbose]
        jobs.append(parameters)
    
print("{} total jobs".format(len(jobs)))

# Pooling
results = []
overall_total_reads = 0
with Pool(processes=6) as p:
    max_ = len(jobs)
    with tqdm(total=max_) as pbar:
        for _ in p.imap_unordered(find_edits_and_split_bams_wrapper, jobs):
            pbar.update()
            results.append(_)
            
            total_reads = _[1]
            total_time = time.perf_counter() - start_time
            
            overall_total_reads += total_reads

            total_seconds_for_reads[overall_total_reads] = total_time

overall_time = time.perf_counter() - start_time 


CPU count: 36
Contig 1
Contig 10
Contig 11
Contig 12
Contig 13
Contig 14
Contig 15
Contig 16
Contig 17
Contig 18
Contig 19
Contig 2
Contig 3
Contig 4
Contig 5
Contig 6
Contig 7
Contig 8
Contig 9
Contig MT
Contig X
Contig Y
352 total jobs


100%|█████████████████████████████████████████| 352/352 [01:40<00:00,  3.51it/s]


In [29]:
print("Total time: {} seconds".format(overall_time))
print("Total time: {} minutes".format(overall_time/60))

Total time: 101.14009240269661 seconds
Total time: 1.6856682067116102 minutes


Memory: 345 Gigabytes

# More helper functions

# Combine all of the reads (string representation) for each barcode
## Groups the results from each sub-contig segment above, for example the reads from the first half of chr1 and those from the second half.

In [30]:
import polars as pl

edit_finder_results = results

num_barcodes_to_time = {}
num_contigs_to_time = {}

overall_label_to_list_of_contents = defaultdict(lambda:{})

for barcode_to_concatted_reads_df, total_reads, barcodes_df, label, counts_df, time_df, total_time in edit_finder_results:
    try:
        
        if len(barcode_to_concatted_reads_df.columns) < 3:
            barcode_to_concatted_reads_df.columns = ['contents']
            barcode_to_concatted_reads_df['barcode'] = [b.split('/')[-1].split('.bam')[0] for b in barcode_to_concatted_reads_df.index]
            contig = label.split('(')[0]
            barcode_to_concatted_reads_df['barcode_contig'] = barcode_to_concatted_reads_df['barcode'] + '_' + contig
                   
        overall_label_to_list_of_contents[contig][label] = pl.from_pandas(barcode_to_concatted_reads_df)
    except Exception as e:
        print(e, label)

Length mismatch: Expected axis has 0 elements, new values have 1 elements MT(000):0-1019
Length mismatch: Expected axis has 0 elements, new values have 1 elements MT(001):1019-2038
Length mismatch: Expected axis has 0 elements, new values have 1 elements Y(004):22936176-28670220
Length mismatch: Expected axis has 0 elements, new values have 1 elements Y(005):28670220-34404264
Length mismatch: Expected axis has 0 elements, new values have 1 elements Y(008):45872352-51606396
Length mismatch: Expected axis has 0 elements, new values have 1 elements Y(009):51606396-57340440
Length mismatch: Expected axis has 0 elements, new values have 1 elements Y(012):68808528-74542572
Length mismatch: Expected axis has 0 elements, new values have 1 elements Y(014):80276616-86010660


In [31]:
print("Overall contigs:\n\n\t", overall_label_to_list_of_contents.keys())
print("\nSubcontig regions for an example contig (1):\n\n\t",sorted(overall_label_to_list_of_contents['1'].keys()))

Overall contigs:

	 dict_keys(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y'])

Subcontig regions for an example contig (1):

	 ['1(000):0-12216999', '1(001):12216999-24433998', '1(002):24433998-36650997', '1(003):36650997-48867996', '1(004):48867996-61084995', '1(005):61084995-73301994', '1(006):73301994-85518993', '1(007):85518993-97735992', '1(008):97735992-109952991', '1(009):109952991-122169990', '1(010):122169990-134386989', '1(011):134386989-146603988', '1(012):146603988-158820987', '1(013):158820987-171037986', '1(014):171037986-183254985', '1(015):183254985-195471984']


### Generate list of jobs to be multiprocessed

In [34]:
def get_job_params(contig, df_dict):
    job_params = []
    # Make a sub-subfolder to put the bams for this specific contig
    contig_folder = '{}/{}/'.format(split_bams_folder, contig)
    if not os.path.exists(contig_folder):
        os.mkdir(contig_folder)
    
    # Sort the subcontig regions such that the reads are properly ordered 
    sorted_subcontig_names = sorted(df_dict.keys())
    sorted_subcontig_dfs = []
    for n in sorted_subcontig_names:
        sorted_subcontig_dfs.append(df_dict.get(n))
        
    print("\t{}: num subcontigs to concat: {}".format(contig, len(sorted_subcontig_dfs)))
    # All of the reads for all of the barcodes are in this dataframe
    print("\t{}: concatting".format(contig))
    all_contents_df = pl.concat(sorted_subcontig_dfs)
    
         
    all_contents_df_partitioned = all_contents_df.partition_by("barcode")
    total_barcodes = len(all_contents_df_partitioned)
    
    for i, contents_for_barcode in enumerate(all_contents_df_partitioned):            
        if i % 10000 == 0:
            print('Contig {}: {}/{} barcodes'.format(contig, i, total_barcodes))

        # Combine the reads (in string representation) for all rows corresponding to a barcode        
        all_reads_for_barcode_concatted = contents_for_barcode.transpose().with_columns(
            pl.concat_str(
                [pl.col(c) for c in contents_for_barcode.transpose().columns],
                separator="\n"
                 ).alias("combined_text")
        )[['combined_text']][0].item()
            
        # Turn the newline-delimited block of text back into list of reads as strings
        # OrderedDict line is to remove duplicates
        reads_deduped = list(OrderedDict.fromkeys(all_reads_for_barcode_concatted.split('\n')))
                
        # Establish the name of the split bam that will be generated
        barcode = contents_for_barcode.unique('barcode')['barcode'].item()
        bam_file_name = '{}/{}_{}.bam'.format(contig_folder, contig, barcode)
        
        # Add parameters to list of jobs
        job_params.append([reads_deduped, bam_file_name, header_string])
    return job_params

In [35]:
from collections import OrderedDict

start_time = time.perf_counter()

# Get the bam header, which will be used for each of the split bams too
header_string = str(samfile.header)

# Make a subfolder into which the split bams will be placed
split_bams_folder = '{}/split_bams'.format(output_folder)
if not os.path.exists(split_bams_folder):
    os.mkdir(split_bams_folder)

num_contigs = 0

job_list = []

for contig, df_dict in overall_label_to_list_of_contents.items():
    num_contigs += 1
    job_params = get_job_params(contig, df_dict)
    job_list.append(job_params)
    
overall_job_builder_time = time.perf_counter() - start_time 

	1: num subcontigs to concat: 16
	1: concatting
Contig 1: 0/1500 barcodes
	10: num subcontigs to concat: 16
	10: concatting
Contig 10: 0/1500 barcodes
	11: num subcontigs to concat: 16
	11: concatting
Contig 11: 0/1500 barcodes
	12: num subcontigs to concat: 16
	12: concatting
Contig 12: 0/1500 barcodes
	13: num subcontigs to concat: 16
	13: concatting
Contig 13: 0/1500 barcodes
	14: num subcontigs to concat: 16
	14: concatting
Contig 14: 0/1500 barcodes
	15: num subcontigs to concat: 16
	15: concatting
Contig 15: 0/1500 barcodes
	16: num subcontigs to concat: 16
	16: concatting
Contig 16: 0/1500 barcodes
	17: num subcontigs to concat: 16
	17: concatting
Contig 17: 0/1500 barcodes
	18: num subcontigs to concat: 16
	18: concatting
Contig 18: 0/1500 barcodes
	19: num subcontigs to concat: 16
	19: concatting
Contig 19: 0/1500 barcodes
	2: num subcontigs to concat: 16
	2: concatting
Contig 2: 0/1500 barcodes
	3: num subcontigs to concat: 16
	3: concatting
Contig 3: 0/1500 barcodes
	4: num 

In [36]:
print("Total time to prepare list for multiprocess-writing bams: {} minutes".format(round(overall_job_builder_time/60)))

Total time to prepare list for multiprocess-writing bams: 1 minutes


# Generate bams

In [37]:
overall_label_to_list_of_contents.clear()

In [40]:
len(job_list)

22

In [41]:
all_bam_jobs = []
for r in job_list:
    for i in r:
        all_bam_jobs.append(i)

In [42]:
len(all_bam_jobs)

32562

In [43]:
def sort_bam(bam_file_name):
    output_name = bam_file_name.split("bam")[0] + ".sorted.bam"
    pysam.sort("-o", output_name, bam_file_name)  
    return output_name

def write_reads_to_file(reads, bam_file_name, header_string):
    with pysam.AlignmentFile(bam_file_name, "wb", text=header_string) as bam_handle:
        for read_str in reads:
            read = pysam.AlignedSegment.fromstring(read_str, bam_handle.header)
            bam_handle.write(read) 
    bam_handle.close()
            
def write_reads_to_file_wrapper(parameters):
    reads, bam_file_name, header_string = parameters
    write_reads_to_file(reads, bam_file_name, header_string)
    
    try:
        index_bam(bam_file_name)
    except Exception as e:
        print("Failed at indexing {}".format(bam_file_name))
        

In [44]:
start_time = time.perf_counter()

with Pool(processes=16) as p:
    max_ = len(all_bam_jobs)
    with tqdm(total=max_) as pbar:
        for _ in p.imap_unordered(write_reads_to_file_wrapper, all_bam_jobs):
            pbar.update()

total_time = time.perf_counter() - start_time


100%|█████████████████████████████████████| 32562/32562 [19:35<00:00, 27.69it/s]


In [45]:
print("Total time to write bams: {} minutes".format(round(total_time/60)))

Total time to write bams: 20 minutes


# Time profiling of the edit-counting step

In [None]:
total_contig_times = {}
all_read_info_dfs = []
all_time_dfs = []

total_times = {}
for result in results:
    
    label = result[3]

    try:
        total_time = result[6]
        total_times[label] = total_time
        
        time_df = result[5]
        all_time_dfs.append(time_df)
        total_time_for_contig = (float(time_df.max()))
        total_contig_times[label] = total_time_for_contig
        
        read_info_df = result[4]
        read_info_df.columns = [label]
        all_read_info_dfs.append(read_info_df)
    except Exception as e:
        print(e, label)
        
total_contig_times_df = pd.DataFrame.from_dict(total_contig_times, orient='index', columns=['seconds']).sort_values('seconds')
print('Total time without threading: {} minutes'.format(round(total_contig_times_df.seconds.sum()/60, 2)))

total_reads_df = pd.concat(all_read_info_dfs,axis=1).T

total_reads_and_times_df = total_reads_df.join(total_contig_times_df)


# Rates

rates = []
for reads, secs in zip(list(total_seconds_for_reads.keys()), list(total_seconds_for_reads.values())):
    rate = reads/secs
    rates.append(rate)
average_rate = np.mean(rates)
print("Average of {} reads/second".format(average_rate))

In [None]:
plt.scatter(total_reads_and_times_df.edited, total_reads_and_times_df.seconds, s=1)
plt.scatter(total_reads_and_times_df.no_edits, total_reads_and_times_df.seconds, s=1)

plt.title("Total processing time vs number of reads")
plt.ylabel("Time (seconds)")
plt.xlabel("Reads")
plt.legend(['Reads with edits', 'Read without edits'])

pd.DataFrame.from_dict(total_seconds_for_reads, orient='index').sort_index().plot(legend=False)
plt.xlabel("Reads")
plt.ylabel("Time (seconds)")
plt.title("Runtime vs number of reads processed")

In [None]:
plt.plot(range(len(rates)), rates)
plt.title("Mean rate (reads per second)")
plt.ylabel("Reads/Second")
plt.xlabel("Number of reads (e10^6)")
plt.axhline(average_rate, color='r')


In [None]:
seconds_per_read = 1/average_rate

In [None]:
import math

reads_per_cell = 50000
total_cells = 15000
total_reads = reads_per_cell * total_cells
print(total_reads)

total_estimated_time = total_reads * seconds_per_read
print('Estimated total time in minutes for {} cells with {} reads per cell ({} total reads): {} minutes'.format(total_cells,
                                                                                               reads_per_cell,
                                                                                                                total_reads,
                                                                                               math.ceil(total_estimated_time/60), 3))


# Second loop to get coverage at sites with edits

In [1]:
from glob import glob
import os
import sys
from collections import defaultdict
import pandas as pd
import polars as pl

directory_path = os.path.abspath(os.path.join('../src/'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    
from utils import get_edit_info_for_barcode_in_contig, get_edit_info_for_barcode_in_contig_wrapper

output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem'


splits = [i.split("/")[-1].split('_edit')[0] for i in glob('{}/edit_info/*'.format(output_folder))]
print("Accessing split bams: {}".format(', '.join(sorted(splits))))

Accessing split bams: 10_000_0_8168438, 10_001_8168438_16336876, 10_002_16336876_24505314, 10_003_24505314_32673752, 10_004_32673752_40842190, 10_005_40842190_49010628, 10_006_49010628_57179066, 10_007_57179066_65347504, 10_008_65347504_73515942, 10_009_73515942_81684380, 10_010_81684380_89852818, 10_011_89852818_98021256, 10_012_98021256_106189694, 10_013_106189694_114358132, 10_014_114358132_122526570, 10_015_122526570_130695008, 11_000_0_7630159, 11_001_7630159_15260318, 11_002_15260318_22890477, 11_003_22890477_30520636, 11_004_30520636_38150795, 11_005_38150795_45780954, 11_006_45780954_53411113, 11_007_53411113_61041272, 11_008_61041272_68671431, 11_009_68671431_76301590, 11_010_76301590_83931749, 11_011_83931749_91561908, 11_012_91561908_99192067, 11_013_99192067_106822226, 11_014_106822226_114452385, 11_015_114452385_122082544, 12_000_0_7508064, 12_001_7508064_15016128, 12_002_15016128_22524192, 12_003_22524192_30032256, 12_004_30032256_37540320, 12_005_37540320_45048384, 12_00

### Gather the edit information generated for each subcontig, and group by contig so we only have 1 edit information dataframe to process per contig

In [2]:
all_edit_info_for_barcodes = []

edit_info_grouped_per_contig = defaultdict(lambda:[])
edit_info_grouped_per_contig_combined = defaultdict(lambda:[])

num_splits = len(splits)
print("Grouping edit information outputs by contig...")
for i, split in enumerate(splits):
    if i%10 == 0:
        print("\t{}/{}...".format(i, num_splits))
    contig = split.split("_")[0]
    
    barcode_to_coverage_dict = defaultdict()    
    
    barcode_to_coverage_dict = defaultdict()
    edit_info_file = '{}/edit_info/{}_edit_info.tsv'.format(output_folder, split)
    edit_info_df = pd.read_csv(edit_info_file, sep='\t')
    edit_info_df['position'] = edit_info_df['position'].astype(int)
    edit_info_df['base_quality'] = edit_info_df['base_quality'].astype(int)
    edit_info_df['mapping_quality'] = edit_info_df['mapping_quality'].astype(int)
    edit_info_df['dist_from_end'] = edit_info_df['dist_from_end'].astype(int)

    edit_info = pl.from_pandas(edit_info_df) 
    edit_info_grouped_per_contig[contig].append(edit_info)
    
    del edit_info_df
    
print("Done grouping! Concatenating ...")



Grouping edit information outputs by contig...
	0/352...
	10/352...
	20/352...
	30/352...
	40/352...
	50/352...
	60/352...
	70/352...
	80/352...
	90/352...
	100/352...
	110/352...
	120/352...
	130/352...
	140/352...
	150/352...
	160/352...
	170/352...
	180/352...
	190/352...
	200/352...
	210/352...
	220/352...
	230/352...
	240/352...
	250/352...
	260/352...
	270/352...
	280/352...
	290/352...
	300/352...
	310/352...
	320/352...
	330/352...
	340/352...
	350/352...
Done grouping! Concatenating ...


In [3]:
for contig, list_of_edit_info_dfs in edit_info_grouped_per_contig.items():
    edit_info_grouped_per_contig_combined[contig] = pl.concat(list_of_edit_info_dfs)

print("Done concatenating!")

Done concatenating!


### Get coverage at edit positions for each contig

In [4]:
import pandas as pd

pd.options.mode.chained_assignment = None 


def get_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, output_folder):
    job_params = []
    
    for contig, edit_info in edit_info_grouped_per_contig_combined.items():
        unique_barcodes = list(edit_info["barcode"].unique())

        for i, barcode in enumerate(unique_barcodes):                 
            job_params.append([edit_info, contig, barcode, output_folder])  
    return job_params
    
coverage_counting_job_params = get_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, 
                                                                output_folder)

In [None]:
from multiprocessing import get_context
import time
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm
import numpy as np

start_time = time.perf_counter()

results = []
with get_context("spawn").Pool(processes=16) as p:
    max_ = len(coverage_counting_job_params)
    with tqdm(total=max_) as pbar:
        for _ in p.imap_unordered(get_edit_info_for_barcode_in_contig_wrapper, coverage_counting_job_params):
            pbar.update()
            results.append(_)
            
total_time = time.perf_counter() - start_time

 45%|████████████████▊                    | 14541/32034 [09:52<20:03, 14.53it/s]

In [None]:
print(total_time)

In [None]:
all_edit_info = pd.concat(results)

In [None]:
all_edit_info.to_csv('{}/all_edit_info.tsv'.format(output_folder), sep='\t')

# Group by site to get final total edit and coverage counts at each site

# Verify C>T ratios

In [None]:
all_edit_info.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
plt.title("All edits")

base_quality_thresh = 15
all_edit_info[all_edit_info.base_quality > base_quality_thresh].groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
plt.title("Edits with base quality > {}".format(base_quality_thresh))

all_edit_info_filtered = all_edit_info[all_edit_info.base_quality > base_quality_thresh]


In [None]:
example_new_ct =  all_edit_info_filtered[(all_edit_info_filtered.ref == 'C') & (all_edit_info_filtered.alt == 'T')].sort_values('position')

In [None]:
len(example_new_ct)

In [None]:
example_new_ct

# Cells that do have STAMP expressed versus don't...?

In [None]:
stamp_expression_path = \
'/projects/ps-yeolab3/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_combined/\
4.1_cells_with_middling_stamp/stamp_expression_for_all_used_cells.tsv'

In [None]:
stamp_expression_df = pd.read_csv(stamp_expression_path, sep='\t', index_col=0)

In [None]:
stamp_expression_df.Stamp.hist(bins=50)

In [None]:
all_edit_info_filtered['edit'] = all_edit_info_filtered['ref'] + '>' + all_edit_info_filtered['alt']

In [None]:
fractions_ct = []
threshs = [0, 0.5, 1, 2, 3, 4, 5, 6, 6.5, 6.6]
for thresh in threshs:
    print(thresh)
    barcodes_at_stamp_thresh = stamp_expression_df[stamp_expression_df.Stamp > thresh].index
    
    all_edit_info_filtered_in_stamp_level = all_edit_info_filtered[
        all_edit_info_filtered.barcode.isin(barcodes_at_stamp_thresh)]
    
    all_edit_info_filtered_in_stamp_level.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
    plt.title("Edit Type Distribution for Cells with STAMP expression above {}".format(thresh))
    
    fraction_ct = len(all_edit_info_filtered_in_stamp_level[all_edit_info_filtered_in_stamp_level['edit'] == 'C>T'])/len(all_edit_info_filtered_in_stamp_level)
    fractions_ct.append(fraction_ct)
    
    

In [None]:
plt.plot(threshs, fractions_ct)
plt.ylabel("Fraction of total edits that are C>T")
plt.xlabel("STAMP expression minimum")
plt.title("Enrichment for C>T edits within cells filtered by STAMP threshold")

In [None]:
fractions_ct_low = []
threshs = [1, 2, 3, 4, 5, 6, 6.5, 6.6]
for thresh in threshs:
    print(thresh)
    barcodes_at_stamp_thresh = stamp_expression_df[stamp_expression_df.Stamp < thresh].index
    
    all_edit_info_filtered_in_stamp_level = all_edit_info_filtered[
        all_edit_info_filtered.barcode.isin(barcodes_at_stamp_thresh)]
    
    all_edit_info_filtered_in_stamp_level.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
    plt.title("Edit Type Distribution for Cells with STAMP expression below {}".format(thresh))
    
    fraction_ct = len(all_edit_info_filtered_in_stamp_level[all_edit_info_filtered_in_stamp_level['edit'] == 'C>T'])/len(all_edit_info_filtered_in_stamp_level)
    fractions_ct_low.append(fraction_ct)

In [None]:
plt.plot(threshs, fractions_ct_low)
plt.ylabel("Fraction of total edits that are C>T")
plt.xlabel("STAMP expression maximum")
plt.title("Enrichment for C>T edits within cells filtered by STAMP threshold")