In [1]:
from Bio.Seq import Seq
import re

In [2]:
def read_alignment(file):
    """Read an alignment file and return its records as three-line groups.

    Lines whose first character is ``#`` are discarded.  The remaining lines
    are consumed in consecutive groups of five, and from every group the
    second, third and fourth line are kept (the first and fifth are dropped).
    Presumably each group is one MAF block -- header line, two sequence lines,
    one quality line, blank separator -- TODO confirm against the aligner
    output.

    Parameters
    ----------
    file : str or path-like
        Path of the alignment file to read.

    Returns
    -------
    list of list of str
        One three-element list per complete group, in file order.  The lines
        are returned unmodified (trailing newlines included).  A trailing
        group with fewer than five lines is ignored.
    """
    # Context manager so the handle is released even if reading fails.
    with open(file, "r") as f:
        lines = [x for x in f if not x.startswith('#')]

    # Only whole groups of five lines are used.
    usable = len(lines) // 5 * 5
    return [lines[start + 1:start + 4] for start in range(0, usable, 5)]

def turn_to_sense(alignment):
    """Re-orient every alignment record according to the query strand.

    Parameters
    ----------
    alignment : list of list of str
        Records of three raw text lines each (subject line, query line,
        quality line), e.g. as returned by ``read_alignment``.

    Returns
    -------
    list of list of str
        One entry per input record, in order.  For strand ``+`` the entry is
        ``[subject, query, quality]`` with both sequences upper-cased.  For
        strand ``-`` both sequences are reverse-complemented and the quality
        string is reversed.  For any other strand value the entry is an empty
        list, so the output always has the same length as the input.
    """
    out = []
    for a in alignment:
        # The sequence / quality text is the last space-separated token.
        sbj = a[0].split(" ")[-1].strip().upper()
        qry = a[1].split(" ")[-1].strip().upper()
        qlt = a[2].split(" ")[-1].strip()

        # Strand is the third-from-last whitespace-separated field of the
        # query line.  str.split() drops leading/trailing whitespace, so the
        # lookup no longer depends on the line ending with a newline.
        strnd = a[1].split()[-3]

        if strnd == "+":
            temp = [sbj, qry, qlt]
        elif strnd == "-":
            temp = [
                str(Seq(sbj).reverse_complement()),
                str(Seq(qry).reverse_complement()),
                qlt[::-1],
            ]
        else:
            # Unknown strand: keep an empty placeholder to preserve positions.
            temp = []

        out.append(temp)

    return out


def get_indel_matches(alignment, flank=7):
    """Collect windows around single-position gaps whose quality is all ``~``.

    For every record the pattern "``flank`` capital letters, one ``-``,
    ``flank`` capital letters" is searched, first in the subject sequence and
    then in the query sequence.  Only the first occurrence in each sequence is
    examined.  A window is kept when the quality string over the same span
    consists solely of ``~`` characters.

    Parameters
    ----------
    alignment : iterable of sequence of str
        Records of the form ``[subject, query, quality]`` (the shape produced
        by ``turn_to_sense``).  Records with fewer than three items, such as
        the empty placeholders ``turn_to_sense`` emits, are skipped.
    flank : int, optional
        Number of letters required on each side of the gap.  Defaults to 7.

    Returns
    -------
    list of tuple of (str, str)
        ``(gapped_window, other_window)`` pairs, upper-cased: the window taken
        from the sequence that contains the gap, followed by the same span of
        the other sequence.
    """
    regex = re.compile(rf"[A-Z]{{{flank}}}-[A-Z]{{{flank}}}")
    all_top_quality = "~" * (2 * flank + 1)

    indel_matches = []

    for record in alignment:
        if len(record) < 3:
            continue
        quality = record[2]

        # Gap in the subject first, then gap in the query.
        for gapped, other in ((record[0], record[1]), (record[1], record[0])):
            match = regex.search(gapped)
            if match is None:
                continue
            start, end = match.span()
            if quality[start:end] == all_top_quality:
                indel_matches.append(
                    (gapped[start:end].upper(), other[start:end].upper())
                )
    return indel_matches



def get_double_indel_matches(alignment, flank=9):
    """Collect windows around two-position gaps whose quality is all ``~``.

    For every record the pattern "``flank`` capital letters, ``--``,
    ``flank`` capital letters" is searched, first in the subject sequence and
    then in the query sequence.  Only the first occurrence in each sequence is
    examined.  The extracted window has the same length as the match but
    starts one column to its left; it is kept when the quality string over
    that window consists solely of ``~`` characters.

    Parameters
    ----------
    alignment : iterable of sequence of str
        Records of the form ``[subject, query, quality]`` (the shape produced
        by ``turn_to_sense``).  Records with fewer than three items, such as
        the empty placeholders ``turn_to_sense`` emits, are skipped.
    flank : int, optional
        Number of letters required on each side of the gap.  Defaults to 9.

    Returns
    -------
    list of tuple of (str, str)
        ``(gapped_window, other_window)`` pairs, upper-cased: the window taken
        from the sequence that contains the gap, followed by the same span of
        the other sequence.
    """
    regex = re.compile(rf"[A-Z]{{{flank}}}--[A-Z]{{{flank}}}")
    all_top_quality = "~" * (2 * flank + 2)

    indel_matches = []

    for record in alignment:
        if len(record) < 3:
            continue
        quality = record[2]

        # Gap in the subject first, then gap in the query.
        for gapped, other in ((record[0], record[1]), (record[1], record[0])):
            match = regex.search(gapped)
            if match is None:
                continue

            # NOTE(review): the window is deliberately kept one column to the
            # left of the regex match, as in the original code -- confirm this
            # offset is intended.
            start = match.start() - 1
            end = match.end() - 1
            if start < 0:
                # A match at column 0 leaves no room for the shift.  A negative
                # slice start would wrap around to the end of the string and
                # produce an empty window, so such matches are skipped.
                continue

            if quality[start:end] == all_top_quality:
                indel_matches.append(
                    (gapped[start:end].upper(), other[start:end].upper())
                )
    return indel_matches

In [3]:
# Load the alignment records and re-orient them in one step.
alignment = turn_to_sense(read_alignment("crass_focardii_forward.maf"))

# Windows around single-position gaps ...
indel_matches = get_indel_matches(alignment)

# ... and around two-position gaps.
double_indel_matches = get_double_indel_matches(alignment)

In [6]:
# Write the second element of every single-gap pair (the window taken from the
# sequence that does not carry the matched gap).
# NOTE(review): writelines() adds no separators, so the windows are written
# back-to-back with no newline between them -- confirm that is what the
# downstream consumer expects.
# The context manager closes the handle even if writing fails.
with open('transcriptome-genome-align-plus1-fs.txt','w') as file:
    file.writelines([x[1] for x in indel_matches])

In [7]:
# Write the second element of every double-gap pair (the window taken from the
# sequence that does not carry the matched gap).
# NOTE(review): writelines() adds no separators, so the windows are written
# back-to-back with no newline between them -- confirm that is what the
# downstream consumer expects.
# The context manager closes the handle even if writing fails.
with open('transcriptome-genome-align-plus2-fs.txt','w') as file:
    file.writelines([x[1] for x in double_indel_matches])