In [15]:
import pysam
import os
import sys
from sys import getsizeof
import time
import pandas as pd
import polars as pl

directory_path = os.path.abspath(os.path.join('../src/'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    
from read_process import get_contig_lengths_dict,\
incorporate_replaced_pos_info,incorporate_insertions_and_deletions,\
get_positions_from_md_tag,reverse_complement,get_edit_information,get_edit_information_wrapper,\
has_edits,get_total_coverage_for_contig_at_position,\
print_read_info, update_coverage_array, get_read_information, get_hamming_distance, remove_softclipped_bases,find

from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_bam, \
write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper

import os, psutil


# Preload which barcodes to use...

In [None]:
barcodes_list_path = '/projects/ps-yeolab3/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/cellranger/results/ms_hippo_stamp_EIF4A_batch2/outs/filtered_feature_bc_matrix/barcodes.tsv.gz'

In [None]:
barcode_whitelist = set(pd.read_csv(barcodes_list_path, names=['barcodes']).barcodes.tolist())

In [None]:
len(barcode_whitelist)

# ~~~~~~~~~~~~~~~~~~
# Multi-processing enabled
# ~~~~~~~~~~~~~~~~~~

# An example on a full 10x bam

#### In a 10x BAM file, `xf=25` means that the read is uniquely mapped to the genome and was used for UMI counting, so we should only look at reads with `xf=25` from the 10x BAM.

In [None]:
bampath = '/projects/ps-yeolab5/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_batch2/filtered_possorted_ms_hippo_stamp_bam/filtered_keep_xf25_possorted_genome_with_header.bam_MD.bam'
#bampath = '/projects/ps-yeolab3/ekofman/sailor2/data/groups_0_1_2_3_4_5_6_7_8_9_10_11_merged.bam'


samfile = pysam.AlignmentFile(bampath, "rb")

In [None]:
samfile_header = str(samfile.header)

In [None]:
getsizeof(samfile_header)/1000

# Helper functions

In [None]:
def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_whitelist=None, verbose=False):
    """Scan the reads of one contig interval, record their edits, and bucket the reads by barcode.

    Every read fetched from ``contig:start-end`` of the BAM at ``bampath`` is handed to
    ``get_read_information``. For reads that come back without an error code, the returned rows
    are written to ``<output_folder>/edit_info/<contig>_<split_index>_<start>_<end>_edit_info.tsv``.
    Every read that passes the barcode filter (with or without an error code) is also kept in its
    text form, with its third tab-separated field replaced by ``<contig>_<barcode>``.

    Args:
        bampath: Path of the BAM file to read.
        contig: Name of the contig to fetch reads from.
        split_index: Label of this interval within the contig; only used in the output file name.
        start: Start coordinate passed to ``fetch``.
        end: End coordinate passed to ``fetch``.
        output_folder: Folder under which the ``edit_info`` subfolder is created.
        barcode_whitelist: Optional collection of barcodes; when truthy, reads whose ``CB`` tag is
            not in it are counted under ``'Barcode Filtered'`` and skipped.
        verbose: Forwarded to ``get_read_information``.

    Returns:
        A 5-tuple ``(barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting)``:
        ``barcode_to_concatted_reads`` maps barcode -> newline-joined read strings,
        ``total_reads`` is the number of reads fetched, ``barcodes`` maps contig -> barcode ->
        number of reads kept, ``counts`` maps contig -> outcome label -> number of reads, and
        ``time_reporting`` maps a read count -> seconds elapsed when that count was reached.
    """
    time_reporting = {}
    start_time = time.perf_counter()

    # All tallies are local to this call. A shared module-level dict would keep accumulating
    # across the jobs a pool worker processes, so later results would include earlier intervals.
    counts = defaultdict(lambda: defaultdict(lambda: 0))
    barcodes = defaultdict(lambda: defaultdict(lambda: 0))
    read_lists_for_barcodes = defaultdict(list)
    total_reads = 0

    # Derive the output location from the argument rather than from a notebook global.
    edit_info_subfolder = '{}/edit_info'.format(output_folder)
    os.makedirs(edit_info_subfolder, exist_ok=True)
    output_file = '{}/{}_{}_{}_{}_edit_info.tsv'.format(edit_info_subfolder, contig, split_index, start, end)
    remove_file_if_exists(output_file)

    samfile = pysam.AlignmentFile(bampath, "rb")
    try:
        reads_for_contig = samfile.fetch(contig, start, end, multiple_iterators=True)

        with open(output_file, 'w') as f:
            write_header_to_bam(f)

            for read in reads_for_contig:
                total_reads += 1

                # Sample the elapsed time every 1000 reads.
                if total_reads % 1000 == 0:
                    time_reporting[total_reads] = time.perf_counter() - start_time

                # get_tag raises KeyError for a missing tag; count such reads instead of
                # aborting the whole interval.
                if not read.has_tag("CB"):
                    counts[contig]['No Barcode'] += 1
                    continue

                barcode = read.get_tag("CB")
                if barcode_whitelist and barcode not in barcode_whitelist:
                    counts[contig]['Barcode Filtered'] += 1
                    continue

                barcodes[contig][barcode] += 1

                try:
                    error_code, list_of_rows, num_edits_of_each_type = get_read_information(read, contig, verbose=verbose)
                except Exception as e:
                    # NOTE(review): a failure on one read abandons the rest of the interval
                    # (original behaviour, kept as-is) -- confirm this is intended.
                    print("Failed on\n{}".format(read.to_string()))
                    print("\t{}: {}".format(type(e).__name__, e))
                    break

                if error_code:
                    counts[contig][error_code] += 1
                else:
                    counts[contig][EDITED_CODE] += 1
                    write_rows_to_info_file(list_of_rows, f)

                # Store each read using its string representation, with the third
                # tab-separated field rewritten to "<contig>_<barcode>".
                read_tab_separated = read.to_string().split('\t')
                read_tab_separated[2] = '{}_{}'.format(contig, barcode)
                read_lists_for_barcodes[barcode].append('\t'.join(read_tab_separated))
    finally:
        # Release the BAM handle even if reading or writing fails part-way through.
        samfile.close()

    # Concatenate the string representations of all reads for each barcode.
    barcode_to_concatted_reads = {
        barcode: '\n'.join(read_list)
        for barcode, read_list in read_lists_for_barcodes.items()
    }

    time_reporting[total_reads] = time.perf_counter() - start_time

    return barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting


def find_edits_and_split_bams(bampath, contig, split_index, start, end, output_folder, barcode_whitelist=None, verbose=False):
    """Run ``find_edits`` on one contig interval and hand back its five results unchanged."""
    edit_results = find_edits(
        bampath,
        contig,
        split_index,
        start,
        end,
        output_folder,
        barcode_whitelist=barcode_whitelist,
        verbose=verbose,
    )
    # Unpack so that a result of the wrong length fails here, as it did before.
    reads_by_barcode, read_total, barcode_tallies, outcome_tallies, timings = edit_results
    return reads_by_barcode, read_total, barcode_tallies, outcome_tallies, timings
    
def find_edits_and_split_bams_wrapper(parameters):
    """Pool-friendly entry point: unpack one job, run it, and package the results as dataframes.

    Args:
        parameters: Sequence of ``(bampath, contig, split_index, start, end, output_folder,
            barcode_whitelist, verbose)``.

    Returns:
        An 8-tuple ``(contig, label, barcode_to_concatted_reads_pl, total_reads, barcodes_df,
        counts_df, time_df, total_time)``. If the job raises, the error is printed and the same
        8-tuple shape is returned with empty dataframes and ``total_reads == 0``, so callers that
        index the result positionally keep working.
    """
    start_time = time.perf_counter()
    # Defined up-front so the except branch can always report something, even when
    # unpacking `parameters` itself is what failed.
    contig = None
    label = 'unknown'
    try:
        bampath, contig, split_index, start, end, output_folder, barcode_whitelist, verbose = parameters
        label = '{}({}):{}-{}'.format(contig, split_index, start, end)

        barcode_to_concatted_reads, total_reads, barcodes, counts, time_reporting = find_edits_and_split_bams(
            bampath, contig, split_index, start, end, output_folder,
            barcode_whitelist=barcode_whitelist,
            verbose=verbose)

        barcodes_df = pd.DataFrame.from_dict(barcodes)
        counts_df = pd.DataFrame.from_dict(counts)
        time_df = pd.DataFrame.from_dict(time_reporting, orient='index')
        if len(barcode_to_concatted_reads) > 0:
            barcode_to_concatted_reads_pl = pl.from_dict(barcode_to_concatted_reads).transpose(include_header=True, header_name='barcode').rename({"column_0": "contents"})
        else:
            # No transposes are allowed on empty dataframes
            barcode_to_concatted_reads_pl = pl.from_dict(barcode_to_concatted_reads)

        total_time = time.perf_counter() - start_time
        return contig, label, barcode_to_concatted_reads_pl, total_reads, barcodes_df, counts_df, time_df, total_time
    except Exception as e:
        print('Contig {}: {}'.format(label, e))
        total_time = time.perf_counter() - start_time
        return contig, label, pl.DataFrame(), 0, pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), total_time

# Go through every read and identify all edits

In [None]:
from collections import defaultdict
import pandas as pd
#from matplotlib import pyplot as plt
import numpy as np
import time
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm

start_time = time.perf_counter()

print("CPU count: {}".format(multiprocessing.cpu_count()))

#output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig-subset'
output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig'

contig_lengths_dict = get_contig_lengths_dict(samfile)

# Print info?
verbose = False 
# Key under which find_edits tallies reads that came back without an error code.
EDITED_CODE = 'edited'

# How many subcontigs to split each contig into to leverage multi-processing
num_intervals = 16

num_reads_to_coverage_dict_kb = {}
num_reads_to_seconds = {}


# Restart the clock here so the timings below exclude the setup above.
start_time = time.perf_counter()
# Cumulative number of reads processed -> seconds elapsed since start_time.
total_seconds_for_reads = {0: 1}

# Module-level tally that find_edits refers to by name (contig -> barcode -> count).
# NOTE(review): each pool worker presumably works on its own copy of this dict -- confirm.
barcodes = defaultdict(lambda:defaultdict(lambda:0))

# Build one job per (contig, interval); only contig '6' is processed in this run.
jobs = []
for contig in ['6']:#contig_lengths_dict.keys():
    # Skip useless contigs
    if len(contig) > 5 or contig == 'Stamp':# or contig != '17':
        continue
        
    print("Contig {}".format(contig))
    contig_length = contig_lengths_dict.get(contig)
    intervals_for_contig = get_intervals(contig, contig_lengths_dict, num_intervals)
    
    # Make subfolder in which to store information about edits
    edit_info_subfolder = '{}/edit_info'.format(output_folder)
    make_folder(edit_info_subfolder)
        
    # Set up for pool
    for split_index, interval in enumerate(intervals_for_contig):
        # Zero-pad the interval index to three digits for use in labels and file names.
        split_index = str(split_index).zfill(3)
        parameters = [bampath, contig, split_index, interval[0], interval[1], output_folder, barcode_whitelist, verbose]
        jobs.append(parameters)
    
print("{} total jobs".format(len(jobs)))

# Pooling
results = []
overall_total_reads = 0

# contig -> interval label -> polars frame of concatenated reads per barcode.
overall_label_to_list_of_contents = defaultdict(lambda:{})

with Pool(processes=16) as p:
    max_ = len(jobs)
    with tqdm(total=max_) as pbar:
        # Results arrive in completion order, not submission order.
        for _ in p.imap_unordered(find_edits_and_split_bams_wrapper, jobs):
            pbar.update()
            
            # _ is the wrapper's tuple: (contig, label, reads frame, total_reads,
            # barcodes_df, counts_df, time_df, total_time).
            overall_label_to_list_of_contents[_[0]][_[1]] =  _[2]
            results.append([_[3], _[4], _[5], _[6], _[7]])
            
            total_reads = _[3]
            total_time = time.perf_counter() - start_time
            
            overall_total_reads += total_reads

            total_seconds_for_reads[overall_total_reads] = total_time

overall_time = time.perf_counter() - start_time 


In [None]:
print("Total time: {} seconds".format(overall_time))
print("Total time: {} minutes".format(overall_time/60))

Memory: 230 Gigabytes

# More helper functions

# Combine all of the reads (string representation) for each barcode
## Groups the results from each sub-contig segment above, for example the reads from the first half of chr1 and those from the second half.

In [None]:
print("Overall contigs:\n\n\t", overall_label_to_list_of_contents.keys())

# Contig '1' is only present when it was actually processed; fall back to any available
# contig instead of calling .keys() on None.
example_contig = '1' if '1' in overall_label_to_list_of_contents else next(iter(overall_label_to_list_of_contents), None)
if example_contig is None:
    print("\nNo contigs have been processed yet.")
else:
    print("\nSubcontig regions for an example contig ({}):\n\n\t".format(example_contig),
          sorted(overall_label_to_list_of_contents[example_contig].keys()))

### Which contigs have finished?

In [None]:
# Make a subfolder into which the split bams will be placed
split_bams_folder = '{}/split_bams'.format(output_folder)
# makedirs also creates missing parent folders and does not fail if the folder already exists.
os.makedirs(split_bams_folder, exist_ok=True)
    

In [None]:
from glob import glob

def get_contigs_that_need_bams_written(split_bams_folder, expected_contigs=None, expected_bams_per_contig=4):
    """List the contigs that do not yet have all of their indexed split bams on disk.

    Index files are looked up as ``<split_bams_folder>/*/*.sorted.bam.bai``. The part of each
    file name before the first ``.bam`` is read as ``<contig>_<subset>``; the split is made on
    the last underscore so contig names that contain underscores are handled.

    Args:
        split_bams_folder: Folder containing one subfolder per split bam.
        expected_contigs: Contigs to check. Defaults to the keys of the notebook-level
            ``overall_label_to_list_of_contents``.
        expected_bams_per_contig: Number of indexed bams a contig needs to count as complete.

    Returns:
        The contigs from ``expected_contigs``, in order, with fewer than
        ``expected_bams_per_contig`` index files.
    """
    if expected_contigs is None:
        expected_contigs = list(overall_label_to_list_of_contents.keys())

    subsets_per_contig = defaultdict(int)
    for index_path in glob('{}/*/*.sorted.bam.bai'.format(split_bams_folder)):
        bam_label = os.path.basename(index_path).split('.bam')[0]
        contig_label, separator, subset_label = bam_label.rpartition('_')
        if not separator:
            # Not a "<contig>_<subset>" file name; ignore it rather than fail.
            continue
        subsets_per_contig[contig_label] += 1

    contigs_to_write_bams_for = []
    for c in expected_contigs:
        num_written_indices = subsets_per_contig.get(c, 0)
        if num_written_indices < expected_bams_per_contig:
            print("Contig {} has {}/{} bams generated".format(c, num_written_indices, expected_bams_per_contig))
            contigs_to_write_bams_for.append(c)

    return contigs_to_write_bams_for

In [None]:
contigs_to_generate_bams_for = get_contigs_that_need_bams_written(split_bams_folder)

In [None]:
contigs_to_generate_bams_for

### Generate list of jobs to be multiprocessed

In [None]:
from collections import OrderedDict
import time
from multiprocessing import get_context

num_contigs = 0

start_time = time.perf_counter()

# Get the bam header, which will be used for each of the split bams too
header_string = str(samfile.header)

# One job per contig that still needs bams: [contig, {interval label: reads frame},
# header text, output folder]. Workers are started with the "spawn" context.
with get_context("spawn").Pool(processes=16) as p:
    max_ = len(contigs_to_generate_bams_for)
    with tqdm(total=max_) as pbar:
        # The wrapper's return value is not used here; the loop only advances the progress bar.
        for _ in p.imap_unordered(concat_and_write_bams_wrapper, [[i[0], i[1], header_string, split_bams_folder] for i in overall_label_to_list_of_contents.items() if i[0] in contigs_to_generate_bams_for]):
            pbar.update()

total_bam_generation_time = time.perf_counter() - start_time


In [None]:
print("Total time to concat and write bams: {} minutes".format(round(total_bam_generation_time/60)))

# Second loop to get coverage at sites with edits

In [3]:
from glob import glob
import os
import sys
from collections import defaultdict
import pandas as pd
import polars as pl

directory_path = os.path.abspath(os.path.join('../src/'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    
from utils import get_edit_info_for_barcode_in_contig_wrapper

#output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig-subset'
output_folder = '/projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig'


# Every entry in <output_folder>/edit_info is reduced to its file name up to "_edit",
# e.g. "10_000_0_8168438" (contig, interval index, start, end).
splits = [i.split("/")[-1].split('_edit')[0] for i in sorted(glob('{}/edit_info/*'.format(output_folder)))]
print("Accessing split bams: {}".format(', '.join(sorted(splits))))

Accessing split bams: 10_000_0_8168438, 10_001_8168438_16336876, 10_002_16336876_24505314, 10_003_24505314_32673752, 10_004_32673752_40842190, 10_005_40842190_49010628, 10_006_49010628_57179066, 10_007_57179066_65347504, 10_008_65347504_73515942, 10_009_73515942_81684380, 10_010_81684380_89852818, 10_011_89852818_98021256, 10_012_98021256_106189694, 10_013_106189694_114358132, 10_014_114358132_122526570, 10_015_122526570_130695008, 11_000_0_7630159, 11_001_7630159_15260318, 11_002_15260318_22890477, 11_003_22890477_30520636, 11_004_30520636_38150795, 11_005_38150795_45780954, 11_006_45780954_53411113, 11_007_53411113_61041272, 11_008_61041272_68671431, 11_009_68671431_76301590, 11_010_76301590_83931749, 11_011_83931749_91561908, 11_012_91561908_99192067, 11_013_99192067_106822226, 11_014_106822226_114452385, 11_015_114452385_122082544, 12_000_0_7508064, 12_001_7508064_15016128, 12_002_15016128_22524192, 12_003_22524192_30032256, 12_004_30032256_37540320, 12_005_37540320_45048384, 12_00

### Gather the edit information generated for each subcontig, and group by contig so we only have 1 edit information dataframe to process per contig

In [4]:
all_edit_info_for_barcodes = []

# "<contig>_<barcode suffix base>" -> list of polars frames, one per interval file.
edit_info_grouped_per_contig = defaultdict(list)
edit_info_grouped_per_contig_combined = defaultdict(list)

num_splits = len(splits)
print("Grouping edit information outputs by contig...")
for i, split in enumerate(splits):
    # Progress line every tenth interval file.
    if not i % 10:
        print("\t{}/{}...".format(i, num_splits))

    # The contig is everything before the first underscore of the split label.
    contig = split.split("_")[0]
    print("contig: {}, {}".format(contig, split))

    barcode_to_coverage_dict = defaultdict()

    edit_info_file = '{}/edit_info/{}_edit_info.tsv'.format(output_folder, split)
    edit_info_df = pd.read_csv(edit_info_file, sep='\t')
    # Force the numeric columns to integers in one pass.
    edit_info_df = edit_info_df.astype({
        'position': int,
        'base_quality': int,
        'mapping_quality': int,
        'dist_from_end': int,
    })

    edit_info = pl.from_pandas(edit_info_df)

    # Partition the rows by the last base of the barcode (barcodes end in "<base>-1").
    for n in "ACGT":
        suffix = '{}-1'.format(n)
        print("\tsuffix: {}".format(suffix))
        edit_info_subset = edit_info.filter(pl.col("barcode").str.ends_with(suffix))

        group_key = "{}_{}".format(contig, n)
        edit_info_grouped_per_contig[group_key].append(edit_info_subset)

    # Drop the pandas copy; only the polars subsets are kept.
    del edit_info_df

print("Done grouping! Concatenating ...")



Grouping edit information outputs by contig...
	0/352...
contig: 10, 10_000_0_8168438
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_001_8168438_16336876
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_002_16336876_24505314
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_003_24505314_32673752
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_004_32673752_40842190
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_005_40842190_49010628
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_006_49010628_57179066
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_007_57179066_65347504
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_008_65347504_73515942
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
contig: 10, 10_009_73515942_81684380
	suffix: A-1
	suffix: C-1
	suffix: G-1
	suffix: T-1
	10/352...
contig: 10, 10_010_81684380_89852818
	suffix: A-1
	

In [None]:
glob(edit_info_file)

In [5]:
for contig, list_of_edit_info_dfs in edit_info_grouped_per_contig.items():
    edit_info_grouped_per_contig_combined[contig] = pl.concat(list_of_edit_info_dfs)

print("Done concatenating!")

Done concatenating!


### Get coverage at edit positions for each contig

##### Merge across contigs for each barcode???

In [6]:
import pandas as pd

pd.options.mode.chained_assignment = None 


def get_job_params_for_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, output_folder, min_base_quality=15):
    """Build one coverage-counting job per contig group from its quality-filtered edits.

    Args:
        edit_info_grouped_per_contig_combined: Mapping of contig group label -> polars frame
            of edits with a ``base_quality`` column.
        output_folder: Folder passed through unchanged as the third element of each job.
        min_base_quality: Only edits with ``base_quality`` strictly greater than this value
            are kept. Defaults to 15, the threshold previously hard-coded here.

    Returns:
        A list of ``[filtered_edit_info, contig, output_folder]`` lists, one per input entry.
    """
    job_params = []

    for contig, edit_info in edit_info_grouped_per_contig_combined.items():
        print(contig)
        print('Num edits pre filter: {}'.format(len(edit_info)))
        edit_info = edit_info.filter(pl.col("base_quality") > min_base_quality)
        print('\tNum edits post filter: {}'.format(len(edit_info)))

        job_params.append([edit_info, contig, output_folder])
    return job_params
    
coverage_counting_job_params = get_job_params_for_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, 
                                                                output_folder)
len(coverage_counting_job_params)

10_A
Num edits pre filter: 1065209
	Num edits post filter: 533419
10_C
Num edits pre filter: 1088889
	Num edits post filter: 548510
10_G
Num edits pre filter: 1036531
	Num edits post filter: 523817
10_T
Num edits pre filter: 1144092
	Num edits post filter: 576963
11_A
Num edits pre filter: 1710684
	Num edits post filter: 873201
11_C
Num edits pre filter: 1741277
	Num edits post filter: 893696
11_G
Num edits pre filter: 1657485
	Num edits post filter: 851934
11_T
Num edits pre filter: 1833903
	Num edits post filter: 942440
12_A
Num edits pre filter: 918904
	Num edits post filter: 486963
12_C
Num edits pre filter: 942087
	Num edits post filter: 502891
12_G
Num edits pre filter: 895472
	Num edits post filter: 478802
12_T
Num edits pre filter: 985185
	Num edits post filter: 526857
13_A
Num edits pre filter: 702487
	Num edits post filter: 359375
13_C
Num edits pre filter: 726242
	Num edits post filter: 374026
13_G
Num edits pre filter: 682186
	Num edits post filter: 353135
13_T
Num edits pr

88

### This is running at a rate of about 1 item per second, which would take several hours...

In [7]:
#edit_info_plus_coverage_df = get_edit_info_for_barcode_in_contig_wrapper(coverage_counting_job_params[0])

In [8]:
len(coverage_counting_job_params)

88

In [9]:
from multiprocessing import get_context
import time
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm
import numpy as np

start_time = time.perf_counter()

# Collects whatever each job returns, in completion order.
results = []
# Spawn has to be used instead of the default fork when using the polars library
with get_context("spawn").Pool(processes=16) as p:
    max_ = len(coverage_counting_job_params)
    with tqdm(total=max_) as pbar:
        # Each job is one [edit_info, contig, output_folder] entry built above.
        for _ in p.imap_unordered(get_edit_info_for_barcode_in_contig_wrapper, coverage_counting_job_params):
            pbar.update()
            results.append(_)
            
# Wall-clock seconds for the whole pool run.
total_time = time.perf_counter() - start_time

 81%|████████████████████████████████████████████████████████████████▌               | 71/88 [13:05<02:04,  7.30s/it]

Contig 12_A. Loading /projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig/split_bams/12_A/12_A.bam.sorted.bam bamfile...
Contig 12_A. Loaded bamfile...
Contig 12_A. Iterating through barcodes...
0/10105 barcodes for 12_A...
300/10105 barcodes for 12_A...
600/10105 barcodes for 12_A...
900/10105 barcodes for 12_A...
1200/10105 barcodes for 12_A...
1500/10105 barcodes for 12_A...
1800/10105 barcodes for 12_A...
2100/10105 barcodes for 12_A...
2400/10105 barcodes for 12_A...
2700/10105 barcodes for 12_A...
3000/10105 barcodes for 12_A...
3300/10105 barcodes for 12_A...
3600/10105 barcodes for 12_A...
3900/10105 barcodes for 12_A...
4200/10105 barcodes for 12_A...
4500/10105 barcodes for 12_A...
4800/10105 barcodes for 12_A...
5100/10105 barcodes for 12_A...
5400/10105 barcodes for 12_A...
5700/10105 barcodes for 12_A...
6000/10105 barcodes for 12_A...
6300/10105 barcodes for 12_A...
6600/10105 barcodes for 12_A...
6900/10105 barcodes for 12_A...
7200/10105 barcodes for 

100%|████████████████████████████████████████████████████████████████████████████████| 88/88 [14:56<00:00, 10.19s/it]


600/3959 barcodes for Y_T...
900/3959 barcodes for Y_T...
1200/3959 barcodes for Y_T...
1500/3959 barcodes for Y_T...
1800/3959 barcodes for Y_T...
2100/3959 barcodes for Y_T...
2400/3959 barcodes for Y_T...
2700/3959 barcodes for Y_T...
3000/3959 barcodes for Y_T...
3300/3959 barcodes for Y_T...
3600/3959 barcodes for Y_T...
3900/3959 barcodes for Y_T...
Contig 12_C. Loading /projects/ps-yeolab3/ekofman/sailor2/scripts/full_test-highmem_bccontig/split_bams/12_C/12_C.bam.sorted.bam bamfile...
Contig 12_C. Loaded bamfile...
Contig 12_C. Iterating through barcodes...
0/9685 barcodes for 12_C...
300/9685 barcodes for 12_C...
600/9685 barcodes for 12_C...
900/9685 barcodes for 12_C...
1200/9685 barcodes for 12_C...
1500/9685 barcodes for 12_C...
1800/9685 barcodes for 12_C...
2100/9685 barcodes for 12_C...
2400/9685 barcodes for 12_C...
2700/9685 barcodes for 12_C...
3000/9685 barcodes for 12_C...
3300/9685 barcodes for 12_C...
3600/9685 barcodes for 12_C...
3900/9685 barcodes for 12_C...


In [10]:
print(total_time/60)

14.965925415356955


In [11]:
len(results)

88

In [12]:
all_edit_info = pd.concat(results)

In [14]:
all_edit_info.groupby(['ref', 'alt']).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,barcode,contig,position,read_id,strand,dist_from_end,base_quality,mapping_quality,barcode_position,coverage
ref,alt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
A,C,3154432,3154432,3154432,3154432,3154432,3154432,3154432,3154432,3154432,3154432
A,G,5581219,5581219,5581219,5581219,5581219,5581219,5581219,5581219,5581219,5581219
A,N,29998,29998,29998,29998,29998,29998,29998,29998,29998,29998
A,T,2016649,2016649,2016649,2016649,2016649,2016649,2016649,2016649,2016649,2016649
C,A,4585917,4585917,4585917,4585917,4585917,4585917,4585917,4585917,4585917,4585917
C,G,1394182,1394182,1394182,1394182,1394182,1394182,1394182,1394182,1394182,1394182
C,N,24246,24246,24246,24246,24246,24246,24246,24246,24246,24246
C,T,11494552,11494552,11494552,11494552,11494552,11494552,11494552,11494552,11494552,11494552
G,A,4821013,4821013,4821013,4821013,4821013,4821013,4821013,4821013,4821013,4821013
G,C,1643154,1643154,1643154,1643154,1643154,1643154,1643154,1643154,1643154,1643154


In [None]:
total_time/60

In [13]:
all_edit_info.to_csv('{}/all_edit_info.tsv'.format(output_folder), sep='\t')

# Group by site to get final total edit and coverage counts at each site

In [None]:
all_edit_info.head()


# Verify C>T ratios

In [None]:
all_edit_info.groupby(['ref', 'alt']).count()


In [None]:
# `plt` is used by this and the later plotting cells, but the notebook's only matplotlib
# import is commented out, so bring it into scope here.
from matplotlib import pyplot as plt

base_quality_thresh = 15

all_edit_info.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
plt.title("All edits")

# Take an explicit copy so that columns added to the filtered frame later do not write
# into a slice of all_edit_info.
all_edit_info_filtered = all_edit_info[all_edit_info.base_quality > base_quality_thresh].copy()

all_edit_info_filtered.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
plt.title("Edits with base quality > {}".format(base_quality_thresh))


In [None]:
example_new_ct =  all_edit_info_filtered[(all_edit_info_filtered.ref == 'C') & (all_edit_info_filtered.alt == 'T')].sort_values('position')

In [None]:
len(example_new_ct)

In [None]:
example_new_ct

# Cells that express STAMP versus cells that don't

In [None]:
stamp_expression_path = \
'/projects/ps-yeolab3/ekofman/Sammi/MouseBrainEF1A_SingleCell_EPR_combined/\
4.1_cells_with_middling_stamp/stamp_expression_for_all_used_cells.tsv'

In [None]:
stamp_expression_df = pd.read_csv(stamp_expression_path, sep='\t', index_col=0)

In [None]:
stamp_expression_df.Stamp.hist(bins=50)

In [None]:
# Build a new frame with the "ref>alt" label instead of assigning a column in place on
# what may be a slice of all_edit_info; re-running the cell gives the same result.
all_edit_info_filtered = all_edit_info_filtered.assign(
    edit=all_edit_info_filtered['ref'] + '>' + all_edit_info_filtered['alt'])

In [None]:
# Fraction of quality-filtered edits that are C>T, for cells whose STAMP expression is
# above each threshold.
fractions_ct = []
threshs = [0, 0.5, 1, 2, 3, 4, 5, 6, 6.5, 6.6]
for thresh in threshs:
    print(thresh)
    barcodes_at_stamp_thresh = stamp_expression_df[stamp_expression_df.Stamp > thresh].index

    all_edit_info_filtered_in_stamp_level = all_edit_info_filtered[
        all_edit_info_filtered.barcode.isin(barcodes_at_stamp_thresh)]

    num_edits_in_stamp_level = len(all_edit_info_filtered_in_stamp_level)
    if num_edits_in_stamp_level == 0:
        # No cells pass this threshold: there is nothing to plot and the fraction is
        # undefined, so record NaN to keep fractions_ct aligned with threshs.
        fractions_ct.append(float('nan'))
        continue

    all_edit_info_filtered_in_stamp_level.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
    plt.title("Edit Type Distribution for Cells with STAMP expression above {}".format(thresh))

    # Compare ref/alt directly so this cell does not depend on a derived 'edit' column.
    is_ct = (all_edit_info_filtered_in_stamp_level['ref'] == 'C') & (all_edit_info_filtered_in_stamp_level['alt'] == 'T')
    fraction_ct = int(is_ct.sum()) / num_edits_in_stamp_level
    fractions_ct.append(fraction_ct)
    
    

In [None]:
plt.plot(threshs, fractions_ct)
plt.ylabel("Fraction of total edits that are C>T")
plt.xlabel("STAMP expression minimum")
plt.title("Enrichment for C>T edits within cells filtered by STAMP threshold")

In [None]:
# Fraction of quality-filtered edits that are C>T, for cells whose STAMP expression is
# below each threshold.
fractions_ct_low = []
threshs = [1, 2, 3, 4, 5, 6, 6.5, 6.6]
for thresh in threshs:
    print(thresh)
    barcodes_at_stamp_thresh = stamp_expression_df[stamp_expression_df.Stamp < thresh].index

    all_edit_info_filtered_in_stamp_level = all_edit_info_filtered[
        all_edit_info_filtered.barcode.isin(barcodes_at_stamp_thresh)]

    num_edits_in_stamp_level = len(all_edit_info_filtered_in_stamp_level)
    if num_edits_in_stamp_level == 0:
        # No cells fall below this threshold: there is nothing to plot and the fraction is
        # undefined, so record NaN to keep fractions_ct_low aligned with threshs.
        fractions_ct_low.append(float('nan'))
        continue

    all_edit_info_filtered_in_stamp_level.groupby(['ref', 'alt']).count().plot(kind='barh', legend=False)
    plt.title("Edit Type Distribution for Cells with STAMP expression below {}".format(thresh))

    # Compare ref/alt directly so this cell does not depend on a derived 'edit' column.
    is_ct = (all_edit_info_filtered_in_stamp_level['ref'] == 'C') & (all_edit_info_filtered_in_stamp_level['alt'] == 'T')
    fraction_ct = int(is_ct.sum()) / num_edits_in_stamp_level
    fractions_ct_low.append(fraction_ct)

In [None]:
plt.plot(threshs, fractions_ct_low)
plt.ylabel("Fraction of total edits that are C>T")
plt.xlabel("STAMP expression maximum")
plt.title("Enrichment for C>T edits within cells filtered by STAMP threshold")