In [1]:
import os
import itertools
from itertools import permutations
import re
import pandas as pd


In [89]:
# A CBE produces C>T on the sense strand when the guide is fwd, and G>A on the sense strand when it is rev.
# An ABE produces A>G on the sense strand when the guide is fwd, and T>C on the sense strand when it is rev.

# usable for PAMs: Sp NGG, SpG NGN, SpRY NNN
# 
# 

class BE_gRNAs():
    """Enumerate candidate base-editor guide RNAs (gRNAs) over a gene's exons.

    Exon sequences are read from a FASTA-style file: every ``>`` header starts
    a new record. Upper-case letters of a record are taken as the exon itself;
    the full record (including any other characters, e.g. lower-case flanking
    bases) is kept in ``exons_extra`` so guides may extend into the flanks.

    Two modes, selected by ``end_goals``:

    * non-empty: find in-frame codons that a single editing event could turn
      into one of ``end_goals`` and report the 23-nt guides (20-nt protospacer
      + 3-nt PAM) that place the codon at the hard-coded offsets used by
      ``get_gRNAs``;
    * empty: report every 23-nt window with a matching PAM that has an
      editable base inside ``editing_window`` (``get_all_gRNAs``).

    Results are stored in four parallel lists: ``potential_gRNAs``,
    ``targets``, ``sense_antisense`` ('fwd'/'rev') and ``exon_num`` (1-based).
    """

    # references
    bases = 'ACGT'
    complements = {'A':'T', 'T':'A', 'G':'C', 'C':'G',
                   'a':'t', 't':'a', 'g':'c', 'c':'g'}
    cas_key = {'Sp': 'NGG', 'SpG': 'NGN', 'SpRY': 'NNN'}

    def __init__(self, be_type, editing_window, gene, end_goals, cas_type, exon_filename, exon_dir=''):
        """Load the exon file and compute the guide lists.

        Args:
            be_type: 'CBE' or 'ABE'.
            editing_window: (first, last) 1-based inclusive guide positions;
                only consulted by ``get_all_gRNAs``.
            gene: label written to the 'gene' column by ``save_data``.
            end_goals: codons to generate, or an empty sequence to list all
                guides.
            cas_type: key of ``cas_key`` ('Sp', 'SpG' or 'SpRY').
            exon_filename: FASTA-style file name.
            exon_dir: directory joined in front of ``exon_filename``.

        Raises:
            KeyError: if ``cas_type`` is not a key of ``cas_key``.
            OSError: if the exon file cannot be opened.
            ValueError: from ``parse_exons`` / ``base_edits`` on bad input.
        """
        # vars
        self.be_type = be_type
        self.gene = gene
        self.end_goals = end_goals
        self.PAM = self.cas_key[cas_type]
        self.editing_window = editing_window

        # load file of exons; the context manager guarantees the handle is closed
        self.filename = os.path.join(exon_dir, exon_filename)
        with open(self.filename, "r") as f:
            self.file_content = f.read()
        # list of exons and list of exons with flanking (non-upper-case) bases;
        # both are needed because a guide can extend onto DNA that is not expressed
        self.exons_extra, self.exons = self.parse_exons()

        if len(self.end_goals) > 0:

            # find all sense and antisense codons that can be mutated
            self.target_codons, self.target_codons_compls = self.generate_target_codons()
            # find all indices with in frame codon, in range PAM
            self.indices = self.find_all_indices()
            # find all full guide RNAs
            self.potential_gRNAs, self.sense_antisense, self.exon_num, self.targets = self.get_gRNAs()

        else:

            # find all full guide RNAs without a target codon to change
            self.potential_gRNAs, self.sense_antisense, self.exon_num, self.targets = self.get_all_gRNAs()

    ###################################################################################################

    # parse exons with and without intron ends from fasta file format
    def parse_exons(self):
        """Split ``self.file_content`` into per-record sequences.

        Returns:
            (exons_extra, exons): the full sequence of every ``>`` record, and
            the same sequences reduced to their upper-case characters.

        Raises:
            ValueError: if sequence data appears before the first header.
        """
        exons_extra = []
        # splitlines() + strip() also copes with CRLF files, whose stray '\r'
        # would otherwise end up inside the sequence
        for raw_line in self.file_content.splitlines():
            line = raw_line.strip()
            if not line:
                continue  # blank lines carry no sequence
            if line[0] == '>':
                exons_extra.append('')
            elif not exons_extra:
                raise ValueError('sequence data found before the first ">" header')
            else:
                exons_extra[-1] += line

        exons = [''.join(base for base in exon if base.isupper()) for exon in exons_extra]
        return exons_extra, exons

    def rev_complement(self, seq):
        """Return the reverse complement of ``seq`` (case preserved).

        Raises KeyError for characters missing from ``complements``.
        """
        return self.complement(seq)[::-1]

    def complement(self, seq):
        """Return the base-wise complement of ``seq`` (case preserved).

        Raises KeyError for characters missing from ``complements``.
        """
        return ''.join(self.complements[base] for base in seq)

    ###################################################################################################

    # generate target codons based on type of base editor, and what codon we want to generate
    def generate_target_codons(self):
        """Collect the distinct source codons for every goal codon.

        Returns:
            (codons, anticodons): de-duplicated, sorted lists for the 'sense'
            and 'anti' modes of ``base_edits``. Source codons that are
            themselves members of ``end_goals`` are not filtered out.
        """
        codons = []
        anticodons = []
        for goal in self.end_goals:
            codons += self.base_edits(goal, 'sense')
            anticodons += self.base_edits(goal, 'anti')
        # sorted() keeps the result independent of string-hash ordering
        return sorted(set(codons)), sorted(set(anticodons))

    def base_edits(self, goal, mode):
        """Return the codons the configured editor could convert into ``goal``.

        CBE: sense looks for C where ``goal`` has T, anti for G where it has A.
        ABE: sense looks for A where ``goal`` has G, anti for T where it has C.

        Raises:
            ValueError: for an unknown ``be_type`` or ``mode`` (previously a
                message was printed and None returned, which crashed the
                caller with an unrelated TypeError).
        """
        # (editor, mode) -> (base before editing, base after editing)
        substitutions = {
            ('CBE', 'sense'): ('C', 'T'),
            ('CBE', 'anti'): ('G', 'A'),
            ('ABE', 'sense'): ('A', 'G'),
            ('ABE', 'anti'): ('T', 'C'),
        }
        if self.be_type not in ('CBE', 'ABE'):
            raise ValueError('BE type invalid: %r' % (self.be_type,))
        if (self.be_type, mode) not in substitutions:
            raise ValueError('mode invalid: %r' % (mode,))
        before, after = substitutions[(self.be_type, mode)]
        return self.replace(goal, before, after)

    def replace(self, goal, x, y):
        """List codons obtained from ``goal`` by putting ``x`` where it has ``y``.

        Every non-empty subset of the three codon positions is tried; a subset
        contributes a codon only if ``goal`` has ``y`` at all of its positions.
        Order: single positions, adjacent pairs, the outer pair, all three.
        """
        position_sets = ((0,), (1,), (2,), (0, 1), (1, 2), (0, 2), (0, 1, 2))
        result = []
        for positions in position_sets:
            if all(goal[p] == y for p in positions):
                chars = list(goal)
                for p in positions:
                    chars[p] = x
                result.append(''.join(chars))
        return result

    ###################################################################################################

    def find_all_indices(self):
        """Locate in-frame target codons that have a PAM in range.

        Returns:
            One list per exon of ``[index, direction, exon_number]`` entries
            (index into the ``exons_extra`` string, direction 'fwd' or 'rev',
            1-based exon number), sorted.
        """
        # independent lists per exon (``[[]] * n`` would alias one list n times)
        result = [[] for _ in self.exons_extra]
        searches = (('fwd', self.target_codons), ('rev', self.target_codons_compls))
        prev_len = 0  # total length of the exons preceding the current one
        for i, exon in enumerate(self.exons):
            exon_extra = self.exons_extra[i]
            indices = []
            for direction, codons in searches:
                for codon in codons:
                    # zero-width lookahead so overlapping occurrences are all
                    # reported; a plain pattern would skip e.g. the second
                    # 'AAA' in 'AAAA' and could miss the in-frame one
                    pattern = '(?=' + re.escape(codon) + ')'
                    for match in re.finditer(pattern, exon_extra):
                        ind = match.start()
                        # reading-frame filter; the +1 presumably compensates
                        # for a 20-base flank at the start of each record
                        # (20 + 1 is a multiple of 3) -- TODO confirm
                        if (ind + prev_len + 1) % 3 != 0:
                            continue
                        if self.has_PAM_CBE(i, ind, direction):
                            indices.append([ind, direction, i + 1])
            indices.sort()
            result[i] = indices
            prev_len += len(exon)
        return result

    def has_PAM_CBE(self, i, ind, direction):
        """Coarse check that the fixed PAM bases occur near the codon at ``ind``.

        fwd: the PAM's non-N bases must occur in the last 6 bases of
        ``exons_extra[i][ind:ind+20]``. rev: their reverse complement must
        occur in the first 6 bases of ``exons_extra[i][ind-17:ind+3]``.
        The comparison ignores case, matching ``get_all_gRNAs``. Returns False
        when the rev window would start before the sequence or the direction
        is unknown.
        """
        real_PAM = self.PAM.replace("N", "")
        exon = self.exons_extra[i]
        if direction == 'fwd':
            frame = exon[ind:ind+20]
            return real_PAM in frame[-6:].upper()
        if direction == 'rev':
            start = ind - 17
            if start < 0:
                # a negative start would wrap around to the end of the string
                return False
            frame = exon[start:ind+3]
            return self.rev_complement(real_PAM) in frame[:6].upper()
        return False

    ###################################################################################################

    def get_gRNAs(self):
        """Build the guides for every entry of ``self.indices``.

        For each codon hit a 27-base frame is taken (``find_frame``) and the
        five 23-base windows at offsets 0-4 are tested: a window is kept when
        the bases starting at its position 22 equal the PAM's non-N part
        (case-insensitive). These offsets are hard-coded;
        ``self.editing_window`` is not consulted here.

        Returns:
            (potential_gRNAs, sense_antisense, exon_num, potential_target):
            parallel lists. The target is the guide itself for 'fwd' and its
            reverse complement for 'rev'.
        """
        potential_gRNAs = []
        potential_target = []
        sense_antisense = []
        exon_num = []
        real_PAM = self.PAM.replace("N", "")

        # make list of all guide RNAs possible
        for i in range(len(self.exons)):
            for ind, direction, exon_number in self.indices[i]:
                frame = self.find_frame(self.exons_extra[i], ind, direction)
                if not frame:
                    continue  # window not available this close to the sequence edge
                for k in range(5):
                    guide = frame[k:k+23]
                    if len(guide) < 23:
                        break  # frame truncated; later offsets are shorter still
                    if frame[k+21:k+21+len(real_PAM)].upper() != real_PAM:
                        continue
                    potential_gRNAs.append(guide)
                    sense_antisense.append(direction)
                    if direction == 'fwd':
                        potential_target.append(guide)
                    elif direction == 'rev':
                        potential_target.append(self.rev_complement(guide))
                    exon_num.append(exon_number)

        return potential_gRNAs, sense_antisense, exon_num, potential_target

    def find_frame(self, exon, ind, direction):
        """Return the 27-base search frame around the codon at ``ind``.

        fwd: ``exon[ind-7:ind+20]``; rev: reverse complement of
        ``exon[ind-17:ind+10]``. Returns '' when the frame would start before
        the sequence (a negative slice start would wrap around) or, for rev,
        run past its end (the reverse complement would lose its leading bases
        and shift every offset). Returns None for an unknown direction.
        """
        if direction == 'fwd':
            start = ind - 7
            if start < 0:
                return ''
            return exon[start:ind+20]
        if direction == 'rev':
            start, end = ind - 17, ind + 10
            if start < 0 or end > len(exon):
                return ''
            return self.rev_complement(exon[start:end])
        return None

    ###################################################################################################

    def get_all_gRNAs(self):
        """List every guide with a PAM and an editable base in the window.

        Each full 23-base window of every ``exons_extra`` record is tested on
        both strands. A guide is kept when its bases from position 22 match
        the PAM's non-N part (case-insensitive) and the slice covered by
        ``editing_window`` (1-based, inclusive) contains an upper-case 'C'
        (CBE) or 'A' (ABE); lower-case bases do not count.

        Returns:
            (potential_gRNAs, sense_antisense, exon_num, potential_target):
            parallel lists; the target is always the forward-strand window.
        """
        potential_gRNAs = []
        potential_target = []
        sense_antisense = []
        exon_num = []
        real_PAM = self.PAM.replace("N", "")
        editable_base = {'CBE': 'C', 'ABE': 'A'}.get(self.be_type)
        window_start = self.editing_window[0] - 1
        window_end = self.editing_window[1]

        # make list of all guide RNAs possible
        for i in range(len(self.exons)):
            exon_i = self.exons_extra[i]
            # stop 22 bases before the end so every window is a full 23-mer;
            # otherwise truncated windows pass the PAM test whenever the fixed
            # PAM part is empty (SpRY) or one base long (SpG)
            for j in range(len(exon_i) - 22):
                frame = exon_i[j:j+23]
                # NGG and NGN can be handled alike since the fixed bases start at index 21;
                # the rev case is the same test applied to the opposite strand
                for direction, guide in (('fwd', frame), ('rev', self.rev_complement(frame))):
                    if guide[21:21+len(real_PAM)].upper() != real_PAM:
                        continue
                    sub_frame = guide[window_start:window_end]
                    if editable_base is None or editable_base not in sub_frame:
                        continue
                    potential_gRNAs.append(guide)
                    potential_target.append(frame)
                    sense_antisense.append(direction)
                    exon_num.append(i+1)

        return potential_gRNAs, sense_antisense, exon_num, potential_target

    ###################################################################################################

    def save_data(self, filename):
        """Write the guide table to ``filename`` as CSV (no index column).

        Columns: gene, guide RNA, target, direction, exon.
        """
        head = ['gene', 'guide RNA', 'target', 'direction', 'exon']
        rows = list(zip([self.gene]*len(self.potential_gRNAs),
                        self.potential_gRNAs,
                        self.targets,
                        self.sense_antisense,
                        self.exon_num))
        df = pd.DataFrame(rows, columns=head)
        df.to_csv(filename, index=False)
    

In [90]:
# cas_type should be one of 'Sp', 'SpG', 'SpRY'
# base_editor_type should be 'ABE' or 'CBE'
# target_codons can be all stop codons, or any other codons you want to generate,
#     or an empty list [] if you want a list of all possible guides


In [96]:
# example: design CBE guides that introduce stop codons, then save them as CSV

###
base_editor_type = 'CBE'
target_codons = ['TAG', 'TAA', 'TGA'] # to generate stop codons = ['TAG', 'TAA', 'TGA']
cas_type = 'Sp'
editing_window = (4, 8) # inclusive
notes = 'stopCodons'
###

gene = 'ARexon8'
exon_filename = '221210_AR_Exon8_Input.fasta'
output_dir = 'results/'

# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, gene + '_' + cas_type + base_editor_type + '_' + notes + '_gRNAs.csv')

AR_exon8_BE = BE_gRNAs(base_editor_type, editing_window, gene, target_codons, cas_type, exon_filename)
AR_exon8_BE.save_data(output_path)


In [80]:
# known bugs / TODO
# a stop codon turning into another stop codon should be excluded
# for SpRY, and to a lesser extent SpG and Sp:
#    the same site is targeted by 5 gRNAs, since SpRY has no PAM requirement and shifting by one base gives a new gRNA
#    these should somehow be grouped together as part of a subgroup
# add a direction option


In [97]:
# Scratch check: character count of a hard-coded nucleotide string (presumably taken from the AR exon 8 input -- TODO confirm)
len('ATTGCGAGAGAGCTGCATCAGTTCGCTTTTGACCTGCTAATCAAGTCACACATGGTGAGCGTGGACTTTCCGGAAATGATGGCAGAGATCATCTCTGTGCAAGTGCCCAAGATCCTTTCTGGGAAAGTCAAGCCCA')

136