In [1]:
from __future__ import print_function
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pylab
import pandas as pd
import numpy as np
import os
import sys
import gzip
import itertools
import operator
import subprocess
import twobitreader
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pysam
import shutil

from uditas.uditas_replace_helpers import *

#  Pipeline1
# This, for the most part, is the UDITAS pipeline modified to allow for REPLACE targeting. 

## There is a second pipeline below to do global alignments to quantify large deletions, translocatins, concatomers, etc by aligning the sequence past the break site to the entire genome+targeting sequence.

## Inputs
#### next few window are different inputs to make it run well

In [3]:
#things taken from original uditas args

check_plasmid_insertions = 1
ncpu = 4
window_size = 15
amplicon_window_around_cut = 1000
min_MAPQ = 5
min_AS = -180
process_AMP_seq_run = 0 #off

############# Directory ############
#this is a minimal directory of only ~300 files for quick debugging
#directory = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5/10000Reads'

directory = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5'

print(directory)

/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5


# Master Number

In [4]:
# number in list of 

#0 is polbRev
#1 is polbRev 
#2 is mcherry_rev
#3 is mcherry_fwd


master_number = 1



#######################################################################################
#primer sequences for mispriming. Primer sequence + 12nt downstream 
#this will only be avialable for read2

##### the sequences need to be capital for this to work########

primerlist = [['TGGTTTTATTTCACCCCATGATAGTAATTATATCACT', 'this is for polb fwd (row 0 in sheet)'],
              ['ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGC', 'this is for pipeline1 polb rev (row 1 in sheet'],
              ['GTTAGCAGACTTCCTCTGCCCTCAATCTTTAAGAA', 'this is for pipeline1 mCherryRev, row2 in sheet'],
              ['GCAGGAGCTCGTCGACCCATGGGGGCCCGCCCC', 'this is for pipeline1 mCherryfwd, row3 in sheet'],
              ['TGGTTTTATTTCACCCCATGATAGTAATTATATCACT', 'this is for polb fwd WT (row 4 in sheet) was contaminated'],
              ['ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGC', 'this is for pipeline1 WT polb rev (row 5 in sheet']]

                

primer_seq_plus_downstream = primerlist[master_number][0]          
print(primerlist[master_number][1], '\n')    
print('primer_seq_plus_downstream', primer_seq_plus_downstream)


#check what is printed below!!!

#############################################################################################
# Function to make the 'amplicon_info' list. Taking the line of our experiment csv file

def get_csv_data(dir_sample, line_of_data_from_csv):
    sample_info_filename = os.path.join(dir_sample, 'sample_info.csv')
    experiments = pd.read_csv(sample_info_filename)
    return experiments.loc[line_of_data_from_csv]


#the amplicon info is related to the line on the csv file. It is indexed from 0. For PolbF we use 0.
#it is on my extera space so need to make sure that is mounted. (first thing to check if it throws and error)


amplicon_info = get_csv_data(directory, master_number)

###########check waht is printed below #######
amplicon_info

this is for pipeline1 polb rev (row 1 in sheet 

primer_seq_plus_downstream ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGC


NGS_req-ID                                                                     A000H2GWGF
name                                                                        Tn5_Pol50_pbR
Sample                                                                               Polb
description                                   MiniSeq_K562_PolbHIROS_50per_unsorted_polbR
Control sample (Y/N)                                                                    N
Notes                                                                                 NaN
Dilution                                                                              NaN
Cell name_type                                                                       K562
UMI_Len                                                                        NNNNNNNNNN
IndexI7Primer                                                                      prE369
I7_Index_ID                                                          P7_N702_SBS12nextera
index_I1  

### Set global Variables

In [22]:
# Assign the file_genome_2bit location. This is needed for pulling sequence from the referene genome by location
assembly = amplicon_info['genome']
file_genome_2bit = os.path.join('/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes', assembly + '.2bit')
print(file_genome_2bit)

# BOWTIE2_INDEXES are needed for global alignments
#not sure if this will work
#normally in bash: export BOWTIE2_INDEXES=/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes
#check in bash: > ECHO $GENOMES_2BIT

#this is the refrence genome for Bowtie from Illumina. It does not have our targeting vector
%env BOWTIE2_INDEXES=/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes

#check the env variables
#%env

/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes/hg38.2bit
env: BOWTIE2_INDEXES=/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes


### Functions to process the reads 
I imported them into the helper_replace python file

### Discard Mispriming Reads
When you put a universal primer on the ends of everything, every mispriming event will amplify. An effect we normally don't deal with. I did nested PCR to reduce this. However 85% of the alignments in the UDITAS data I looked at seemed to be mispriming. They did all their blasting and analysis before removing mispriming. But to save computational power and remove error early on I will discard mispriming events. 
They discard these only for plasmid alignments analyze_alignments_plasmid for some reason which comes from the bam file.

By eye it looks like 50-90% of my reads are correctly primed which is amaizng. Nesting helepd a lot.


In [23]:
correct_priming(directory, amplicon_info, primer_seq_plus_downstream)



reads_in_experiment_list_count 189785
 reads with good priming: 136172
reads misprimed 53613


### Trim off illumina adapter from shorter reads

In [149]:
#Running the trimming

trim_fastq(directory, amplicon_info, 0)



In [150]:
# Running the reference plasmid creation
create_plasmid_reference(directory, amplicon_info)


In [6]:
#Test the get_reaction_type is working
#uditas_replace_helpers.get_reaction_type(amplicon_info)
get_reaction_type(amplicon_info)

'replace'

In [29]:
#Make the amplicons by calling the create_amplicon function

create_amplicon(directory, amplicon_info, file_genome_2bit)

In [153]:
#try out the alignment to the plasmid
align_plasmid_local(directory, amplicon_info, ncpu=4)


In [154]:
#extract the unmapped reads
extract_unmapped_reads_plasmid(directory, amplicon_info)

In [25]:
##### for some reason it throws errors when running from the python helper file so I just copied it here 
#analyze plasmid alignments

def analyze_alignments_plasmid(dir_sample, amplicon_info, min_MAPQ, file_genome_2bit, do_plasmid):
    N7 = amplicon_info['index_I1']
    N5 = amplicon_info['index_I2']
        
    exp_dir = create_filename(dir_sample, N7, N5, 'mainfolder')

    file_UMI = create_filename(dir_sample, N7, N5, 'umifastqgz')
    UMI_dict = create_barcode_dict(file_UMI)
    
    results_folder = os.path.join(exp_dir, 'results')
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    results_file = create_filename(dir_sample, N7, N5, 'results_plasmid')

    if do_plasmid:
        file_sorted_bam_plasmid_local = create_filename(dir_sample, N7, N5, 'sorted_bam_plasmid_local')

        bam_in_alignment_file = pysam.AlignmentFile(file_sorted_bam_plasmid_local, 'rb')
        bam_in = bam_in_alignment_file.fetch()

        genome = twobitreader.TwoBitFile(file_genome_2bit)  # Load genome. Used for getting the sequences
        
        length_to_test = 15  # We check this number of bases after the primer
        uditas_primer_length = amplicon_info['end'] - amplicon_info['start']
        
        if amplicon_info['strand'] == '+':  # This is the UDiTaS oligo strand
            #I had to add int() command to make this work for some reason
            seq_after_uditas_primer = genome[amplicon_info['chr']][int(amplicon_info['end']):int((amplicon_info['end'] + length_to_test))]
            
        elif amplicon_info['strand'] == '-':
            seq_after_uditas_primer = reverse_complement(genome[amplicon_info['chr']][int((amplicon_info['start'] - length_to_test)):(int(amplicon_info['start']))])
        n_max_mismatches = 2  # We allow this number of mismatches between the read and the sequence after the primer

        names_list_plasmid_genome = []
        UMI_list_plasmid_genome = []
        names_list_plasmid_only = []
        UMI_list_plasmid_only = []
        
        for read in bam_in:
            if read.mapping_quality >= min_MAPQ and not read.is_unmapped and not read.is_secondary:
                if read.is_read2:  # R2 is the UDiTaS primer
                    if read.is_reverse:
                        seq_test = reverse_complement(read.query_sequence)[int(uditas_primer_length):int((uditas_primer_length + length_to_test))]
                    else:
                        seq_test = read.query_sequence[int(uditas_primer_length): int(uditas_primer_length + length_to_test)]
                    # Sometimes, after cutadapt we have a read shorter than uditas_primer_length + length_to_test
                    # We skip those directly without calculating hamm_dist, which doesn't make sense
                    if (len(seq_test) == len(seq_after_uditas_primer.upper()) and
                        hamm_dist(seq_test, seq_after_uditas_primer.upper()) <= n_max_mismatches):
                        # Reads for which the R2 has genomic sequence after the UDiTaS primer
                        UMI_list_plasmid_genome.append(UMI_dict[read.query_name][0])
                        names_list_plasmid_genome.append(read.query_name)
                    else: # We put those short reads into the plasmid only bucket
                        UMI_list_plasmid_only.append(UMI_dict[read.query_name][0])
                        names_list_plasmid_only.append(read.query_name)

        total_reads_plasmid_genome = len(set(names_list_plasmid_genome))
        total_reads_collapsed_plasmid_genome = len(set(UMI_list_plasmid_genome))
        total_reads_plasmid_only = len(set(names_list_plasmid_only))
        total_reads_collapsed_plasmid_only = len(set(UMI_list_plasmid_only))

        results_df = pd.DataFrame({'target_plus_plasmid_total_reads': [total_reads_plasmid_genome],
                                   'target_plus_plasmid_total_reads_collapsed': [total_reads_collapsed_plasmid_genome],
                                   'plasmid_only_total_reads': [total_reads_plasmid_only],
                                   'plasmid_only_total_reads_collapsed': [total_reads_collapsed_plasmid_only]
                                   },
                                  columns=['target_plus_plasmid_total_reads',
                                           'target_plus_plasmid_total_reads_collapsed',
                                           'plasmid_only_total_reads',
                                           'plasmid_only_total_reads_collapsed'])
    else:
        results_df = pd.DataFrame(index=np.arange(1),
                                  columns=['target_plus_plasmid_total_reads',
                                           'target_plus_plasmid_total_reads_collapsed',
                                           'plasmid_only_total_reads',
                                           'plasmid_only_total_reads_collapsed'])

    results_df.to_excel(results_file)

    return results_df


In [30]:
#run the plasmid analysis to count plasmid integration events
result_plasmid_df = analyze_alignments_plasmid(directory, amplicon_info, min_MAPQ, file_genome_2bit, True)
result_plasmid_df

Unnamed: 0,target_plus_plasmid_total_reads,target_plus_plasmid_total_reads_collapsed,plasmid_only_total_reads,plasmid_only_total_reads_collapsed
0,0,0,3095,67


In [31]:
#align against our suite of amplicons
# took about 5 min with 300k reads
align_amplicon(directory, amplicon_info, check_plasmid_insertions, ncpu)

In [32]:
#this will extract unmap reads new folder (files that did not align to the predicted structural variants)
extract_unmapped_reads_amplicons(directory, amplicon_info)

## This section is to do the analysis of the amplicon alignments

In [44]:
result_reads_in_all_amplicons_df = analyze_alignments_all_amplicons(directory, amplicon_info, min_MAPQ, min_AS)
result_reads_in_all_amplicons_df

number of reads before mispriming cleanup 407906
number of unique umis before mispriming cleanup 174757


Unnamed: 0,all_amplicons_total_reads,all_amplicons_total_reads_collapsed
0,95912,2160


In [35]:
#took 
result_amplicon_df = analyze_alignments(directory, amplicon_info, window_size, amplicon_window_around_cut, min_MAPQ, min_AS)
result_amplicon_df

Unnamed: 0,wt_cut1_total_reads,wt_cut1_total_indels,wt_cut1_total_deletions,wt_cut1_total_insertions,wt_cut1_total_reads_collapsed,wt_cut1_total_indels_collapsed,wt_cut1_total_deletions_collapsed,wt_cut1_total_insertions_collapsed,wt_cut2_total_reads,wt_cut2_total_indels,...,1a_1a_cut1_total_insertions_collapsed,2b_2b_cut1_total_reads,2b_2b_cut1_total_indels,2b_2b_cut1_total_deletions,2b_2b_cut1_total_insertions,2b_2b_cut1_total_reads_collapsed,2b_2b_cut1_total_indels_collapsed,2b_2b_cut1_total_deletions_collapsed,2b_2b_cut1_total_insertions_collapsed,median_fragment_size
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,165.0


### This is the Global alignment of the remaining reads

#### What I need here is to modify it so that 

In [36]:
#Run the allignment
assembly = amplicon_info['genome']
align_genome_global(directory, amplicon_info, assembly)

In [37]:
#Do the global alignment analysis
results_genome_global_df = analyze_alignments_genome_global(directory, amplicon_info, min_MAPQ, min_AS,  file_genome_2bit)
results_genome_global_df



the number of alignments that were attempted to align 34949


Unnamed: 0,genomewide_total_reads,genomewide_total_reads_collapsed,genomewide_target_only_reads,genomewide_target_only_reads_collapsed
0,16490,414,0,0


### Summerize data

In [38]:
read_count = pd.DataFrame()
read_count.loc[0,'read_count'] = count_reads(directory, amplicon_info)

In [39]:
#summerize the data

summary_all_alignments = get_summary_all_alignments(directory, amplicon_info, read_count.loc[[0]], result_plasmid_df, result_reads_in_all_amplicons_df, results_genome_global_df)
summary_all_alignments

Unnamed: 0,read_count,target_plus_plasmid_total_reads,target_plus_plasmid_total_reads_collapsed,plasmid_only_total_reads,plasmid_only_total_reads_collapsed,all_amplicons_total_reads,all_amplicons_total_reads_collapsed,genomewide_total_reads,genomewide_total_reads_collapsed,genomewide_target_only_reads,genomewide_target_only_reads_collapsed,total_aligned,total_aligned_collapsed,percent_aligned,percent_aligned_all_amplicons
0,222201.0,0,0,3095,67,87042,2020,16490,414,0,0,106627,2501,47.986733,81.632232


In [40]:
##### you can't run this cell alone. It requires analysis higher in the pipeline

results_alignments_junction = pd.concat([result_amplicon_df, result_plasmid_df], axis=1)

sample_info_filename = os.path.join(directory, 'sample_info.csv')

experiments = pd.read_csv(sample_info_filename)

N7 = amplicon_info['index_I1']
N5 = amplicon_info['index_I2']

results_summary = summarize_results(results_alignments_junction)
results_summary
#this outputs it in a directory above the N501_701 folder as a summary file. Need to label it or it will be overwritten by next tone
results_summary_with_experiments = pd.concat([experiments, results_summary], axis=1)
results_summary_with_experiments.to_excel(os.path.join(directory, N7 + '_' + N5, N7 + '_' + N5 + '_' + 'results_summary.xlsx'))

results_pivot = melt_results(results_summary_with_experiments)
results_pivot.to_excel(os.path.join(directory, N7 + '_' + N5, N7 + '_' + N5+'_'+'results_summary_pivot.xlsx'))
results_pivot

Unnamed: 0,NGS_req-ID,name,Sample,description,Control sample (Y/N),Notes,Dilution,Cell name_type,UMI_Len,IndexI7Primer,...,doner_tail_tail_cut1_total_reads_percent,doner_head_tail_cut1_total_reads_percent,doner_head_head_cut1_total_reads_percent,1a_1a_cut1_total_reads_percent,2b_2b_cut1_total_reads_percent,target_plus_plasmid_total_reads_percent,plasmid_only_total_reads_percent,total_aligned_junctions_collapsed,Type,Percent Editing
0,A000H2GWGF,Tn5_Pol50_pbF,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_polbF,N,,,K562,NNNNNNNNNN,prE368,...,1.033386,38.022922,0.0,0.0,0.0,0.0,3.672021,1976.0,wt_cut1_total_reads_collapsed_percent,0.000000
1,A000H2GWGF,Tn5_Pol50_pbR,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_polbR,N,,,K562,NNNNNNNNNN,prE369,...,,,,,,,,,wt_cut1_total_reads_collapsed_percent,
2,A000H2GWGF,Tn5_Pol50_mCh,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_mCherryp...,N,,,K562,NNNNNNNNNN,prE370,...,,,,,,,,,wt_cut1_total_reads_collapsed_percent,
3,A000H2GWGF,Tn5_Pol50_bpa,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_mCherry_Fwd,N,,,K562,NNNNNNNNNN,prE371,...,,,,,,,,,wt_cut1_total_reads_collapsed_percent,
4,A000H2GWGF,Tn5_WT_pbF,Polb,MiniSeq_K562_PolbHIROS_Wtcontaminated_w_50per_...,N,,,K562,NNNNNNNNNN,prE372,...,,,,,,,,,wt_cut1_total_reads_collapsed_percent,
5,A000H2GWGF,Tn5_WT_pbR,Polb,MiniSeq_K562_PolbHIROS_Wtcontaminated_w_50per_...,N,,,K562,NNNNNNNNNN,prE373,...,,,,,,,,,wt_cut1_total_reads_collapsed_percent,
6,A000H2GWGF,Tn5_Pol50_pbF,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_polbF,N,,,K562,NNNNNNNNNN,prE368,...,1.033386,38.022922,0.0,0.0,0.0,0.0,3.672021,1976.0,wt_cut2_total_reads_collapsed_percent,0.000000
7,A000H2GWGF,Tn5_Pol50_pbR,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_polbR,N,,,K562,NNNNNNNNNN,prE369,...,,,,,,,,,wt_cut2_total_reads_collapsed_percent,
8,A000H2GWGF,Tn5_Pol50_mCh,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_mCherryp...,N,,,K562,NNNNNNNNNN,prE370,...,,,,,,,,,wt_cut2_total_reads_collapsed_percent,
9,A000H2GWGF,Tn5_Pol50_bpa,Polb,MiniSeq_K562_PolbHIROS_50per_unsorted_mCherry_Fwd,N,,,K562,NNNNNNNNNN,prE371,...,,,,,,,,,wt_cut2_total_reads_collapsed_percent,


# Pipeline2
#  This is a second pipeline. Different than Uditas but built in a simliar form. 
## It uses the read distal to the gene specic primer. This read covers the region downstream of the break. After clipping off anything remaing upstream of the break we make a global alignment to look at genomic integrations and translocations. This also quantifies the alignments.


### 0. Use correct_prime function in pipeline1 to arrive at reads that were correctly primed.
### 1. Trim correctly primed reads to the break site  
### 2. Only use a single end of the pair end reads (distal gene specific primer)
### 3. Use bowtie2 genome that has the targeting seq in it added
### 4. Do global end-to-end alignment
### 5. Do a local alignment
### 6. process bam file and pull out the high mapQ reads

The idea here is to trim the sequence up to the cut on both reads and then do a global alignment end-to-end agasint the human genome.

The idea here is to take the files that are 'trimmed' and 'correct priming' checking. Then to do another round of trimming to get down to the cut site. I need to make another folder for 'trimmed_to_cut'

For Tn5 setup the 'Read 1' contains the sequence away from the gene specific primer. So it is the read we should concentration on if we don't do paired end.

For LAM the 'Read 2' is the the sequence distal to the gene specific primer


## Need to make an new bowtie2 index file that includes targeting for alignment agains the whole genome so it is all in one sheet together. 

### Other option would be to align it to the amplicon, extract unaligned files and then align to the genome but seems cleaner this way. 
#### Ideally every sequence is unique between the targeting vector and genome.


1. Build your fastas of interest and label .fa files.
    1. You need fasta of hg38 or reference genome. You can pull this from downloaded bowtie indexed sampels and then use the following command to turn the index into a fasta file: bowtie2-inspect hg38 > hg38.fa   
    2. Put all the fasta files in the same folder
2. index the files with bowtie
    1. use the command bowtie2-build -f pE049,pe038_mc.fa,hg38.fa -p hg38_plus_targetvectorandplasmid
    2. this has the -p to make it take less ram in my case.
    3. In this case it adds the hg38 and the minicircle targeting file together
    4. I have a Intel® Core™ i7-5500U CPU @ 2.40GHz × 4 with 15.1 GiB ram and it required about 13.8 gigs of ram and 2 hours to do a hg38+small fasta index
    5. be sure to pay attention to the name of the new indexed file. "hg38_plus_targetvector" in example above. Add it to the sample_info.csv sheet. Under the tab 'genome_plus_targeting'.
    6. you can check it indexed correcctly: bowtie2-inspect -s hg38_plus_targetvector
    

    



### A simple loop for runnign pipeline 2

In [5]:
### summary function that analyzes the numbers in list. It puts together everything in following cells

# Assign the file_genome_2bit location. This is needed for pulling sequence from the referene genome by location
assembly_plus_targetvector = amplicon_info['genome_plus_targeting']
file_genome_2bit_plus_target = os.path.join('/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes', assembly_plus_targetvector + '.2bit')
print(file_genome_2bit_plus_target)
#Needs to have a bowtie build in which the targeting vector is included!!
%env BOWTIE2_INDEXES=/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes
#check the env variables
#%env


check_plasmid_insertions = 1
ncpu = 4
window_size = 15
amplicon_window_around_cut = 1000
min_MAPQ = 5
min_AS = -180
process_AMP_seq_run = 0 #off

def get_csv_data(dir_sample, line_of_data_from_csv):
    sample_info_filename = os.path.join(dir_sample, 'sample_info.csv')
    experiments = pd.read_csv(sample_info_filename)
    return experiments.loc[line_of_data_from_csv]

############# Directory ############
#this is a minimal directory of only ~300 files for quick debugging
#directory = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5/10000Reads'

directory = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5'

print('the directory is:', directory)


seq_to_breaksite = [['TGGTTTTATTTCACCCCATGATAGTAATTATATCACTTCTGATCTGTTAAGAATAGACCTTTTAAAAGTATTGGATAACTTAGAGATGAGACATCTTCAGTTACTCTGTTATTCACCTATTACTCCTTAGGTTACTTGTGAATAATTTTGTGTGGGTCA', 'this is for pipeline2 polb fwd (row 0 in sheet)'],
                    ['ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGCACACGGAGGAAATGGTAGTGGAGTTCAGAGGAGGGAGAGGTTCTTTCTGCCTGGG','this is for pipeline2 polb rev (row 1 in sheet)'],
                    ['GTTAGCAGACTTCCTCTGCCCTCAATCTTTAAGAAAAAAAAAAGTCTAACAATGATTTAGGAATGCTTTGAGGACTTAAATGATCTTATTGGAAACATACCAGTCTGCTAAAAGACTAATTTTGTGTGGGTCA','this is for pipeline2 mCherryRev, row2 in sheet'],
                    ['GCAGGAGCTCGTCGACCCATGGGGGCCCGCCCCAACTGGGGTAACCTTTGGGCTCCCCGGGCGCGACTAGTGAATTCAGATCTGATATCTCTAGAAGTCCTGGG', 'this is for pipeline2 mCherryFWD, row3 in sheet'],
                    ['TGGTTTTATTTCACCCCATGATAGTAATTATATCACTTCTGATCTGTTAAGAATAGACCTTTTAAAAGTATTGGATAACTTAGAGATGAGACATCTTCAGTTACTCTGTTATTCACCTATTACTCCTTAGGTTACTTGTGAATAATTTTGTGTGGGTCA', 'this is for pipeline2 WT polb fwd (row 4 in sheet)'],
                    ['ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGCACACGGAGGAAATGGTAGTGGAGTTCAGAGGAGGGAGAGGTTCTTTCTGCCTGGG','this is for pipeline2 WTpo lb rev (row 5 in sheet)']]


# type the number in the list for the runs to run: 
    #0 is polbRev
    #1 is polbRev 
    #2 is mcherry_rev
    #3 is mcherry_fwd

lines_to_run = [0,1,2,3]

#type in 'on' or 'off'
trim = 'off'
local_align = 'off'
global_align = 'on'

for i in lines_to_run:
    print('running row', i)
    
    #for each round choose the corresponding seq to breaksite
    seq_primer_to_breaksite = seq_to_breaksite[i][0]
    print(seq_to_breaksite[i][1],'\n')
    
    #make the correct vector of the csv sheet
    amplicon_info = get_csv_data(directory, i)
    
    #run trimming
    if trim == 'on':
        trim_fastq_to_break(directory, amplicon_info, seq_primer_to_breaksite, length = 35)

    if global_align == 'on':
        #####this aligns global (end-to-end).  keep_sam=1 means keep the sam file
        align_afterbreak_end_to_end_genome_global(directory, amplicon_info, assembly_plus_targetvector, keep_sam=0)


        #### Process the alignment files to make a bam file of only high quality reads
        final_trimmed_bam_filtered_mapq_AS_primary(directory, 'global', amplicon_info, 'tn5')

        ### global alignment bed generation  
        print('-----  now making bed file   ------')

        N7 = amplicon_info['index_I1']
        N5 = amplicon_info['index_I2']
        file_trimmed_genome_global_bed = create_filename(directory, N7, N5, 'break_trimmed_genome_global_bed')
        file_sorted_bam_genome_global = create_filename(directory, N7, N5, 'break_trimmed_filtered_and_sorted_genome_global')

        !bam2bed < {file_sorted_bam_genome_global} > {file_trimmed_genome_global_bed}
        bed_folder = os.path.abspath(create_filename(directory, N7, N5, 'all_bed'))

        if not os.path.exists(bed_folder):
            os.mkdir(bed_folder)
        shutil.copy(file_trimmed_genome_global_bed, bed_folder)
        print('made global bed file:', N7, '_', N5 )
        
        #quantify the global alignment
        print('doing analysis on the global alignment', N7, '_', N5 )
        quantify_global = quantify_pipeline2_alignments(directory, 'global', amplicon_info)
        print(quantify_global)
        
    if local_align == 'on':
        
        # aligns local (soft clip the ends) keep_sam=1 means keep the sam file
        align_afterbreaks_genome_local(directory, 1, amplicon_info, assembly_plus_targetvector, keep_sam=0)
        
        #sort out the good alignments
        final_trimmed_bam_filtered_mapq_AS_primary(directory, 'local', amplicon_info, 'tn5')

        # making a bed file for local
        print('-----  now making bed file   ------')
        N7 = amplicon_info['index_I1']
        N5 = amplicon_info['index_I2']
        file_trimmed_genome_local_bed = create_filename(directory, N7, N5, 'break_trimmed_genome_local_bed')
        file_sorted_bam_genome_local = create_filename(directory, N7, N5, 'break_trimmed_filtered_and_sorted_bam_genome_local')

        !bam2bed < {file_sorted_bam_genome_local} > {file_trimmed_genome_local_bed}
        bed_folder = os.path.abspath(create_filename(directory, N7, N5, 'all_bed'))

        if not os.path.exists(bed_folder):
            os.mkdir(bed_folder)
        shutil.copy(file_trimmed_genome_local_bed, bed_folder)
        print('made local bed file:', N7, '_', N5 )
        
        # quantify the local alignment
        print('doing the quantification of local alignments')
        quantify_local = quantify_pipeline2_alignments(directory, 'local', amplicon_info, 'lam')
        print(quantify_local)
   
    
    
    


/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes/hg38_plus_targetvector.2bit
env: BOWTIE2_INDEXES=/media/edanner/NewUbuntuSpace/Workspace/Ref_Genomes
the directory is: /media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5
running row 0
this is for pipeline2 polb fwd (row 0 in sheet) 

minimum length after trimming is: 35
tn5 analysis
tn5 analysis
sam file deleted
-----  now making bed file   ------
made global bed file: N701 _ N501
doing analysis on the global alignment N701 _ N501
   %_target_site_reads  %_target_vector_reads  %_translocations  \
0            70.912312              28.807988            0.2797   

   %_target_site_reads_collapsed  %_target_vector_reads_collapsed  \
0                      71.733871                        28.104839   

   %_translocations_collapsed  alignments_target_site_all  \
0                     0.16129                      127272   

   alignments_target_vector_all  alignments_translocations_all  

## this is if you want to do pipeline2 one piece at a time

In [14]:
########### all inputs required for pipeline2 #####
#This is for the sequence to the breaksite for the pipeline of genomic alignment 
# make sure the sequences are capital

# number in list of 

#0 is polbRev
#1 is polbRev 
#2 is mcherry_rev
#3 is mcherry_fwd



check_plasmid_insertions = 1
ncpu = 4
window_size = 15
amplicon_window_around_cut = 1000
min_MAPQ = 5
min_AS = -180
process_AMP_seq_run = 0 #off

############# Directory ############
#this is a minimal directory of only ~300 files for quick debugging
#directory = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5/10000Reads'

directory = '/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5'

print(directory)


print()
seq_to_breaksite = [['TGGTTTTATTTCACCCCATGATAGTAATTATATCACTTCTGATCTGTTAAGAATAGACCTTTTAAAAGTATTGGATAACTTAGAGATGAGACATCTTCAGTTACTCTGTTATTCACCTATTACTCCTTAGGTTACTTGTGAATAATTTTGTGTGGGTCA', 'this is for pipeline2 polb fwd (row 0 in sheet)'],
                    ['ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGCACACGGAGGAAATGGTAGTGGAGTTCAGAGGAGGGAGAGGTTCTTTCTGCCTGGG','this is for pipeline2 polb rev (row 1 in sheet)'],
                    ['GTTAGCAGACTTCCTCTGCCCTCAATCTTTAAGAAAAAAAAAAGTCTAACAATGATTTAGGAATGCTTTGAGGACTTAAATGATCTTATTGGAAACATACCAGTCTGCTAAAAGACTAATTTTGTGTGGGTCA','this is for pipeline2 mCherryRev, row2 in sheet'],
                    ['GCAGGAGCTCGTCGACCCATGGGGGCCCGCCCCAACTGGGGTAACCTTTGGGCTCCCCGGGCGCGACTAGTGAATTCAGATCTGATATCTCTAGAAGTCCTGGG', 'this is for pipeline2 mCherryFWD, row3 in sheet'],
                    ['TGGTTTTATTTCACCCCATGATAGTAATTATATCACTTCTGATCTGTTAAGAATAGACCTTTTAAAAGTATTGGATAACTTAGAGATGAGACATCTTCAGTTACTCTGTTATTCACCTATTACTCCTTAGGTTACTTGTGAATAATTTTGTGTGGGTCA', 'this is for pipeline2 WT polb fwd (row 4 in sheet)'],
                    ['ACAAAAGAGGCCAAGCTGGAGCAGGAAATAGATGCACACGGAGGAAATGGTAGTGGAGTTCAGAGGAGGGAGAGGTTCTTTCTGCCTGGG','this is for pipeline2 WTpo lb rev (row 5 in sheet)']]

  
master_number = 0
    
    
seq_primer_to_breaksite = seq_to_breaksite[master_number][0]
print(seq_to_breaksite[master_number][1],'\n')    
print('seq_to_breaksite', seq_primer_to_breaksite, '\n')

#check amplicon info and directory
print('directory', directory)

def get_csv_data(dir_sample, line_of_data_from_csv):
    sample_info_filename = os.path.join(dir_sample, 'sample_info.csv')
    experiments = pd.read_csv(sample_info_filename)
    return experiments.loc[line_of_data_from_csv]


#the amplicon info is related to the line on the csv file. It is indexed from 0. For PolbF we use 0.
#it is on my extera space so need to make sure that is mounted. (first thing to check if it throws and error)


amplicon_info = get_csv_data(directory, master_number)

###########check waht is printed below #######
amplicon_info




/media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5

this is for pipeline2 polb fwd (row 0 in sheet) 

seq_to_breaksite TGGTTTTATTTCACCCCATGATAGTAATTATATCACTTCTGATCTGTTAAGAATAGACCTTTTAAAAGTATTGGATAACTTAGAGATGAGACATCTTCAGTTACTCTGTTATTCACCTATTACTCCTTAGGTTACTTGTGAATAATTTTGTGTGGGTCA 

directory /media/edanner/NewUbuntuSpace/Workspace/LinearAmp/Sequence2_191129_MN00157_0047_A000H2GWGF/P_Eric4_Tn5


NGS_req-ID                                                                     A000H2GWGF
name                                                                        Tn5_Pol50_pbF
Sample                                                                               Polb
description                                   MiniSeq_K562_PolbHIROS_50per_unsorted_polbF
Control sample (Y/N)                                                                    N
Notes                                                                                 NaN
Dilution                                                                              NaN
Cell name_type                                                                       K562
UMI_Len                                                                        NNNNNNNNNN
IndexI7Primer                                                                      prE368
I7_Index_ID                                                          P7_N701_SBS12nextera
index_I1  

In [31]:
#Trim the break up to the cut site

trim_fastq_to_break(directory, amplicon_info, seq_primer_to_breaksite, length = 35)

minimum length after trimming is: 35
tn5 analysis


### alignments of sequences trimmed to the break

In [9]:
#this aligns everything after the break to end-to-end keep_sam=1 means keep the sam file
align_afterbreak_end_to_end_genome_global(directory, amplicon_info, assembly_plus_targetvector, keep_sam=1)


tn5 analysis
-----  now making bed file   ------
made global bed file: N701 _ N501


In [33]:
#this aligns everything in the local format wehre it can soft clip the ends keep_sam=1 means keep the sam file
align_afterbreaks_genome_local(directory, 1, amplicon_info, assembly_plus_targetvector, keep_sam=0)



tn5 analysis
sam file deleted
-----  now making bed file   ------
made local bed file: N704 _ N501


## Preparing the files for plotting in R. This make the bed files
#### The bed files are filtered for primary reads of high quality

In [15]:
#this filteres all the GLOBAL reads.

final_trimmed_bam_filtered_mapq_AS_primary(directory, 'global', amplicon_info)


# THE BED IS MADE FROM THE FILTERED READS
### This should work for automatic bed generation, but it doesn't produce anythign and no ideaw way

print('-----  now making bed file   ------')


N7 = amplicon_info['index_I1']
N5 = amplicon_info['index_I2']
file_trimmed_genome_global_bed = create_filename(directory, N7, N5, 'break_trimmed_genome_global_bed')
file_sorted_bam_genome_global = create_filename(directory, N7, N5, 'break_trimmed_filtered_and_sorted_genome_global')

!bam2bed < {file_sorted_bam_genome_global} > {file_trimmed_genome_global_bed}


bed_folder = os.path.abspath(create_filename(directory, N7, N5, 'all_bed'))


if not os.path.exists(bed_folder):
    os.mkdir(bed_folder)

shutil.copy(file_trimmed_genome_global_bed, bed_folder)

print('made global bed file:', N7, '_', N5 )


working
-----  now making bed file   ------
made global bed file: N701 _ N501


In [None]:
# Filtering and making bed file for local alignments.  BED FILE IS MADE OF FILTERED READS

final_trimmed_bam_filtered_mapq_AS_primary(directory, 'local', amplicon_info)


print('-----  now making bed file   ------')

N7 = amplicon_info['index_I1']
N5 = amplicon_info['index_I2']
file_trimmed_genome_local_bed = create_filename(directory, N7, N5, 'break_trimmed_genome_local_bed')
file_sorted_bam_genome_local = create_filename(directory, N7, N5, 'break_trimmed_filtered_and_sorted_bam_genome_local')


!bam2bed < {file_sorted_bam_genome_local} > {file_trimmed_genome_local_bed}


bed_folder = os.path.abspath(create_filename(directory, N7, N5, 'all_bed'))

if not os.path.exists(bed_folder):
    os.mkdir(bed_folder)

shutil.copy(file_trimmed_genome_local_bed, bed_folder)

print('made local bed file:', N7, '_', N5 )

### take the bed files and then use R script to plot

In [16]:
#polb fwd
print(amplicon_info['name'])
quantify_pipeline2_alignments(directory, 'global', amplicon_info)

Unnamed: 0,%_target_site_reads,%_target_vector_reads,%_translocations,%_target_site_reads_collapsed,%_target_vector_reads_collapsed,%_translocations_collapsed,alignments_target_site_all,alignments_target_vector_all,alignments_translocations_all,collapsed_alignments_target_site,collapsed_alignments_target_vector,collapsed_alignments_translocations,total aligned reads,total collpased reads
0,70.714524,28.727636,0.55784,71.733871,28.104839,0.16129,127272,51704,1004,1779,697,4,179980.0,2480.0


In [45]:
#polb rev
print(amplicon_info['name'])
quantify_pipeline2_alignments(directory, 'global', amplicon_info)


Unnamed: 0,alignments_target_site_all,alignments_target_vector_all,alignments_translocations_all,collapsed_alignments_target_site,collapsed_alignments_target_vector,collapsed_alignments_translocations,total aligned reads,total collpased reads,%_target_site_reads,%_target_vector_reads,%_translocations,%_target_site_reads_collapsed,%_target_vector_reads_collapsed,%_translocations_collapsed
0,44362,53745,200,1094,1062,138,98307.0,2294.0,45.125983,54.670573,0.203444,47.689625,46.294682,6.015693


In [39]:
#mcherry fwd
print(amplicon_info['name'])
quantify_pipeline2_alignments(directory, 'global', amplicon_info)

Unnamed: 0,alignments_target_site_all,alignments_target_vector_all,alignments_translocations_all,collapsed_alignments_target_site,collapsed_alignments_target_vector,collapsed_alignments_translocations,total aligned reads,total collpased reads,%_target_site_reads,%_target_vector_reads,%_translocations,%_target_site_reads_collapsed,%_target_vector_reads_collapsed,%_translocations_collapsed
0,50325,35465,41458,1004,937,813,127248.0,2754.0,39.548755,27.870772,32.580473,36.456064,34.023239,29.520697


In [43]:
#mcherry rev
print(amplicon_info['name'])
quantify_pipeline2_alignments(directory, 'global', amplicon_info)

Unnamed: 0,alignments_target_site_all,alignments_target_vector_all,alignments_translocations_all,collapsed_alignments_target_site,collapsed_alignments_target_vector,collapsed_alignments_translocations,total aligned reads,total collpased reads,%_target_site_reads,%_target_vector_reads,%_translocations,%_target_site_reads_collapsed,%_target_vector_reads_collapsed,%_translocations_collapsed
0,17974,40909,12349,493,1157,351,71232.0,2001.0,25.233041,57.430649,17.33631,24.637681,57.821089,17.541229
