In [2]:
import os
import subprocess
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
import glob
from Bio import SeqRecord
import pandas as pd
#pip install genomicranges #this worked 
import genomicranges as gr
from Bio.Seq import Seq
#from Bio.SeqRecord import SeqRecord 
import pyranges as pr
from Bio import pairwise2
from Bio.SeqUtils import GC
import random 

In [None]:
# Path to the local copy of the GitHub folder, without a trailing slash.
# The cells below build their paths by concatenating onto this string
# (e.g. myPath + '/sequences/...', myPath + '/tables/...').
myPath = ""  # PATH TO GITHUB FOLDER
if not myPath:
    # Fail here with a clear message instead of producing paths rooted at "/".
    raise ValueError("Set myPath to the path of the GitHub folder before running this notebook.")

In [None]:
def make_sbatch_file(filename):
    """Create (or overwrite) ``filename`` with the fixed sbatch header.

    The file is opened in "w" mode, so any previous content is discarded.
    Every header line below is written followed by a newline; callers append
    the actual commands afterwards.
    """
    header_lines = (
        "#!/bin/bash",
        "#SBATCH --job-name=all_bz",
        "#SBATCH --nodes=1",
        "#SBATCH --ntasks=1",
        "#SBATCH --cpus-per-task=10",
        "#SBATCH --mem=20gb",
        "#SBATCH --partition=20",
        "##SBATCH --output all_bz-%j.out",
        "#SBATCH --mail-type=ALL",
        "#SBATCH --mail-user=hlharris@wi.mit.edu",
    )

    with open(filename, "w") as handle:
        handle.write("".join(line + '\n' for line in header_lines))
    
def calc_zeros(alignment):
    """Number the columns of the first alignment row, marking gaps with 0.

    Returns a list with one entry per character of ``alignment[0]``: 0 where
    the character is "-", otherwise the 1-based running count of non-gap
    characters seen so far in that row.
    """
    numbering = []
    bases_seen = 0
    for base in alignment[0]:
        if base == "-":
            numbering.append(0)
        else:
            bases_seen += 1
            numbering.append(bases_seen)
    return numbering

def crop_alignment(start, end, alignment):
    """Crop an alignment to the columns between two first-row coordinates.

    ``start`` and ``end`` are looked up in the numbering produced by
    ``calc_zeros`` (1-based count of non-gap characters in the first row).

    Returns ``alignment[:, ix_start:ix_end]``, i.e. the column holding
    ``start`` is included and the column holding ``end`` is excluded.
    Returns None when either coordinate does not occur in the numbering
    (for example when it is larger than the number of non-gap characters).
    """
    zeros_seq = calc_zeros(alignment)
    try:
        ix_start = zeros_seq.index(start)
        ix_end = zeros_seq.index(end)
    except ValueError:
        # list.index raises ValueError for a missing coordinate; anything
        # else (including KeyboardInterrupt) is no longer swallowed.
        return None
    return alignment[:, ix_start:ix_end]
    

def concat_alignment(gtf_file, alignment):
    """Concatenate the alignment pieces selected by the rows of ``gtf_file``.

    For every row of ``gtf_file`` the alignment is cropped with
    ``crop_alignment(row[3], row[4], alignment)`` and the piece is appended,
    record by record, to the result. Rows for which the crop returns None
    are skipped.

    Returns a MultipleSeqAlignment with one record per record of
    ``alignment`` (same ids); the sequences are empty if no row matched.
    """
    # Seed the result with one empty record per input record.
    # `SeqRecord` is the Bio.SeqRecord module here (imported in the first cell).
    stitched = MultipleSeqAlignment([])
    for source_record in alignment:
        blank = SeqRecord.SeqRecord("")
        blank.id = source_record.id
        blank.seq = Seq("")
        stitched.append(blank)

    for _, row in gtf_file.iterrows():
        piece = crop_alignment(row[3], row[4], alignment)
        if piece is None:
            continue
        # Records are matched by position: i-th piece onto i-th result record.
        for target, fragment in zip(stitched, piece):
            target.seq += fragment.seq

    return stitched

In [None]:
#make directory for each gene pair 
genes = ["EIF1AX", "EIF1AY", "KDM5D" , "KDM5C","UTY", "KDM6A", "ZFY", "ZFX", "DDX3Y" ,"DDX3X", "USP9Y" , "USP9X", "RPS4Y1", "RPS4X"] 

gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"],  "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1'], "KDM6A_UTY": ["KDM6A", "UTY"]}

#extract human X and Y genes 
# For each pair, the first record of <gene>.fa is copied into the pair's
# directory as 'human_Xgene' (first gene of the pair) and 'human_Ygene'
# (second gene of the pair).
for gene_pair, two_genes in gene_pair_dict.items():
    pair_dir = myPath + '/sequences/pair_align/' + gene_pair
    # os.makedirs creates missing parent directories and does nothing when the
    # directory already exists; the earlier `mkdir` subprocess failed silently
    # (stderr was captured and dropped) when 'pair_align' itself was missing.
    os.makedirs(pair_dir, exist_ok=True)

    for gene_name, out_name in zip(two_genes, ('human_Xgene', 'human_Ygene')):
        gene_records = list(SeqIO.parse(myPath + '/sequences/' + gene_name + ".fa", "fasta"))
        SeqIO.write(gene_records[0], pair_dir + '/' + out_name, 'fasta')


In [None]:
#align X-Y pair genes 
# For every pair directory: capture the stdout of `all_bz - <tree>` in
# all_bz.log, write an sbatch script that runs that log with bash followed by
# tba / maf_project / msa_view, and submit the script with sbatch.
# NOTE(review): the cells below read <gene_pair>_msa.fa, which is produced by
# the submitted job - presumably they must only be run after the jobs finish.

pair_align_dir = myPath + '/sequences/pair_align/'

for gene_pair, two_genes in gene_pair_dict.items():

    tree = '(human_Xgene human_Ygene)'

    os.chdir(pair_align_dir + gene_pair)
    try:
        # Opening with "w" creates (or truncates) the log, so no `touch` is needed.
        with open("all_bz.log", "w") as log_file:
            result = subprocess.run(['all_bz', '-', tree], stdout=log_file, stderr=subprocess.PIPE, check=True)

        make_sbatch_file("testj1.sh") #make sbatch file - it makes a new one each time through so you dont have to delete the old one 
        print(gene_pair)
        with open("testj1.sh", "a") as file: #append to the file 
            file.write("bash ")
            file.write("all_bz.log" + "\n")
            file.write("tba '" + tree + "' *.*.maf tba.maf >&tba.log" + "\n")
            file.write("maf_project tba.maf human_Xgene '" + tree + "' > human_proj.maf" + "\n") #project from the perspective of the X chr 
            file.write("msa_view -o FASTA human_proj.maf > " + gene_pair + "_msa.fa")

        submission = subprocess.run(["sbatch", 'testj1.sh'], stderr=subprocess.PIPE)
        if submission.returncode != 0:
            # Report a failed submission instead of dropping it silently.
            print(gene_pair, "sbatch failed:", submission.stderr.decode(errors="replace"))
    finally:
        # Always return to the pair_align directory, even when all_bz fails
        # (check=True raises), so the kernel's working directory stays predictable.
        os.chdir(pair_align_dir)
    

In [None]:
#write alignments as phylip files
# Each <gene_pair>_msa.fa is re-written next to itself as <gene_pair>_msa.phy.

gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"],  "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1'], "KDM6A_UTY": ["KDM6A", "UTY"]}

for gene_pair, two_genes in gene_pair_dict.items():
    pair_prefix = myPath + "/sequences/pair_align/" +  gene_pair + "/" +  gene_pair
    records = SeqIO.parse(pair_prefix +  "_msa.fa", "fasta")
    new_name = pair_prefix + "_msa.phy"
    SeqIO.write(records, new_name, 'phylip')
    

In [33]:
#extract regions
# For every pair, the X-projected alignment is cut down to the promoter, exon
# and intron rows of the X gene's table (column 2 holds the region type) and
# each concatenated result is written as both FASTA and phylip.
gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}

regions = ["promoter", "exon", "intron"]   

for gene_pair, two_genes in gene_pair_dict.items():
    pair_prefix = myPath + "/sequences/pair_align/" + gene_pair + "/" + gene_pair
    alignment = AlignIO.read(pair_prefix + "_msa.fa", "fasta")

    gtf_file = pd.read_csv(myPath + "/tables/" + two_genes[0] + "_gtf_all103023.txt", delimiter="\t", header = None)

    for region in regions:
        gtf_file1 = gtf_file[gtf_file[2] == region]
        returned_align = concat_alignment(gtf_file1, alignment)
        SeqIO.write(returned_align, pair_prefix + "_" + region + "_msa.fa", "fasta") #as fasta
        # The phylip copy previously lacked the myPath prefix and was written
        # under "/sequences/..." at the filesystem root; it now sits next to
        # the FASTA file, where the later cells look for pair files.
        SeqIO.write(returned_align, pair_prefix + "_" + region + "_msa.phy", "phylip") #as phylip


After generating the homolog alignments:

    - use trimal to remove all gaps
    - find the promoter % identity
    - find the exon % identity
    - find the intronic % identity

- leave the gaps in these alignments


In [34]:
def _run_trimal(trimal_path, in_path, out_path, *options):
    """Run trimal on ``in_path``, writing ``out_path``; stderr is captured and dropped."""
    # Remove any earlier output first: both passes (and earlier notebook runs)
    # write to the same file, and a failed trimal run must not be mistaken for
    # a fresh result.
    if os.path.exists(out_path):
        os.remove(out_path)
    subprocess.run([trimal_path, '-in', in_path, '-out', out_path, *options], stderr=subprocess.PIPE)


def edit_alignment(records_X, pair, region, trimal_path='/lab/page_scratch/hannah/trimal/source/trimal'):
    """Filter the two-sequence X/Y alignment of one region with two trimal passes.

    1. The first two records of ``records_X`` are written as phylip to
       human_XY_<region>_msa.phy and trimal is run with ``-noallgaps``
       (get rid of regions with gaps in X and Y), producing
       human_XY_<region>_msa_filtered.phy.
    2. The filtered records are re-read and '-' and '*' are exchanged in each
       sequence, so that trimal (``-gt 1``) can remove what used to be '*'
       while the alignment gaps, now written as '*', are kept for the percent
       identity calculation. Both files are overwritten by this pass.
    3. The final filtered file is parsed and returned.

    Parameters
    ----------
    records_X : sequence of records; only indices 0 and 1 are used.
    pair : name of the pair directory under myPath/sequences/pair_align/.
    region : region label used in the file names.
    trimal_path : path of the trimal executable (CHANGE TO YOUR TRIMAL LOCATION).

    Returns
    -------
    list of records from the final filtered file, or [] when that file is
    missing or cannot be parsed as phylip.
    """
    pair_dir = myPath + "/sequences/pair_align/" + pair + "/"
    unfiltered_path = pair_dir + "human_XY_" + region + "_msa.phy"
    filtered_path = pair_dir + "human_XY_" + region + "_msa_filtered.phy"

    # Pass 1: X and Y rows only, trimal -noallgaps.
    first_pass = MultipleSeqAlignment([])
    for ix in (0, 1):
        first_pass.append(records_X[ix])
    SeqIO.write(first_pass, unfiltered_path, "phylip") #as phylip
    _run_trimal(trimal_path, unfiltered_path, filtered_path, '-noallgaps')

    first_pass_records = list(SeqIO.parse(filtered_path, "phylip"))

    # Pass 2: exchange '-' and '*' in one step. The previous three-step swap
    # used 'K' as a temporary stand-in, which also turned every real 'K'
    # (an IUPAC nucleotide code) into '*'.
    swap_gap_and_star = str.maketrans('-*', '*-')
    second_pass = MultipleSeqAlignment([])
    for ix in (0, 1):
        record = first_pass_records[ix]
        record.seq = Seq(str(record.seq).translate(swap_gap_and_star))
        second_pass.append(record)
    SeqIO.write(second_pass, unfiltered_path, "phylip") #as phylip
    _run_trimal(trimal_path, unfiltered_path, filtered_path, '-gt', '1')

    try:
        return list(SeqIO.parse(filtered_path, "phylip"))
    except (OSError, ValueError):
        # Missing output file (trimal wrote nothing) or unparsable phylip.
        return []
    

## calculate percent ID

In [35]:
# Percent identity of the filtered X/Y alignment for every pair and region,
# collected in the parallel lists p (pair), r (region) and perc (percent).
pairs = [ "EIF1AX_EIF1AY","KDM5C_KDM5D", "KDM6A_UTY", "ZFX_ZFY", "DDX3X_DDX3Y", "USP9X_USP9Y", "RPS4X_RPS4Y1"]

gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}

# (An earlier duplicate assignment of `regions` was immediately overwritten
# by this one and has been dropped.)
regions = ["intron", "exon"]


p = []
r = []
perc = []
combos = set()
combo_dict = {}

for pair in gene_pair_dict:
    for region in regions:
        records_X = list(SeqIO.parse(myPath + '/sequences/pair_align/' + pair + "/" + pair + "_" + region + "_msa.fa", "fasta"))

        new_alignment = edit_alignment(records_X, pair, region)

        # Both rows and at least one column are needed; otherwise indexing
        # row 1 or dividing by the alignment length would raise.
        if len(new_alignment) < 2 or len(new_alignment[0]) == 0:
            print(pair, region, "NONE")
            continue

        x_gene = new_alignment[0]
        y_gene = new_alignment[1]
        # Identity = matching columns / alignment length (gap columns included).
        num_matches = sum(a == b for a, b in zip(x_gene, y_gene))
        percent_identity = (num_matches / len(x_gene)) * 100
        p.append(pair)
        r.append(region)
        perc.append(percent_identity)

# Every X gene is combined with every Y gene (not only its own partner);
# combo_dict maps each (X, Y) tuple to the label "<X>_<Y>".
for genes1 in gene_pair_dict.values():
    for genes2 in gene_pair_dict.values():
        combo = (genes1[0], genes2[1])
        combos.add(combo)
        combo_dict[combo] = genes1[0] + "_" + genes2[1]


In [None]:
# One row per (pair, region) collected by the percent-identity cell.
percent_rows = list(zip(p, r, perc))
df = pd.DataFrame(percent_rows, columns=['pair', 'region', 'percent'])
print(df)

In [None]:
#generate files of alignment %'s
# NOTE: get_perc is defined in a later cell ("generate alignment for x and y
# promoters"); that cell has to be run before this one.

# Promoter identity for every X/Y combination. The rows are kept in their own
# list instead of being appended to p / r / perc, so re-running this cell does
# not add the promoter rows a second time.
promoter_rows = [
    (combo_dict[combo], "promoter", get_perc(combo, 'promoter')[0])
    for combo in combos
]

df = pd.DataFrame(list(zip(p, r, perc)) + promoter_rows, columns = ['pair', 'region', 'percent'])

# os.path.join supplies the separator that plain concatenation omitted
# (myPath + 'percent_alignment_0129.csv' only worked if myPath ended in '/').
df.to_csv(os.path.join(myPath, 'percent_alignment_0129.csv'))

## get GC content

In [None]:
# GC content per pair, gene (X/Y) and region, collected in the parallel lists
# gene, pairs, reg and GC_perc.
# NOTE: get_perc is defined in a later cell ("generate alignment for x and y
# promoters"); that cell has to be run before this one.
gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}
regions = ["exon", "intron"]   

gene = []
pairs = []
reg = []
GC_perc = []

for pair, gl in gene_pair_dict.items():

    # Promoter GC values come straight from get_perc (X first, then Y).
    prom, gc_xp, gc_yp, algn = get_perc(gl, 'promoter')
    for label, promoter_gc in (('X_gene', gc_xp), ('Y_gene', gc_yp)):
        GC_perc.append(promoter_gc)
        reg.append('promoter')
        pairs.append(pair)
        gene.append(label)

    # The sequence file and the coordinate table do not depend on the region,
    # so each is read once per gene instead of once per region.
    pair_dir = myPath + "/sequences/pair_align/" + pair + "/"
    per_gene_inputs = []
    for label, gene_name, fasta_name in (('X_gene', gl[0], "human_Xgene"), ('Y_gene', gl[1], "human_Ygene")):
        gene_alignment = AlignIO.read(pair_dir + fasta_name, "fasta")
        gene_gtf = pd.read_csv(myPath + "/tables/" + gene_name + "_gtf_all103023.txt", delimiter="\t", header = None)
        per_gene_inputs.append((label, gene_alignment, gene_gtf))

    for region in regions:
        for label, gene_alignment, gene_gtf in per_gene_inputs:
            # Concatenate all rows of this region type, then take GC of row 0.
            region_align = concat_alignment(gene_gtf[gene_gtf[2] == region], gene_alignment)
            GC_perc.append(GC(region_align[0].seq))
            reg.append(region)
            pairs.append(pair)
            gene.append(label)
        
        

In [40]:
gc_rows = list(zip(gene, pairs, reg, GC_perc))
df1 = pd.DataFrame(gc_rows, columns=['gene', 'pairs', 'region', 'GC_perc'])
# NOTE(review): relative path - the file is written to the current working
# directory (the alignment cell leaves it at .../sequences/pair_align/),
# whereas the other tables in this notebook are written under
# myPath + '/tables/'. Confirm the intended location.
df1.to_csv('GC_0129.csv')

## calculate CpG for promoters 

In [41]:
def get_perc_fornormCG(combo,region): 
    """Return the X and Y sequence slices for the first ``region`` row of each gene.

    ``combo[0]`` / ``combo[1]`` name the X and Y gene. For each gene the
    coordinate table is filtered on column 2 == ``region`` and columns 3 and 4
    of the first matching row are used as Python slice bounds on the first
    record of the gene's FASTA file.

    Returns ``(promoter_x, promoter_y)``. Raises IndexError when a table has
    no row for ``region`` or a FASTA file has no record.

    NOTE(review): the FASTA files are read from
    /sequences/pair_align/<gene>/human, while get_perc reads
    /sequences/primates/<gene>/human - confirm which location is intended.
    """
    gene_names = (combo[0], combo[1])

    # Coordinate tables first, restricted to the requested region type.
    region_rows = []
    for gene_name in gene_names:
        table = pd.read_csv(myPath + "/tables/" + gene_name + "_gtf_all103023.txt", delimiter="\t", header = None)
        region_rows.append(table[table[2] == region])

    # Then the sequences.
    record_lists = [
        list(SeqIO.parse(myPath + '/sequences/pair_align/' + gene_name + "/human", "fasta"))
        for gene_name in gene_names
    ]

    sliced = []
    for rows, records in zip(region_rows, record_lists):
        sliced.append(records[0].seq[rows.iloc[0,3]: rows.iloc[0,4]])

    promoter_x, promoter_y = sliced
    return promoter_x, promoter_y

In [42]:
# CpG statistics of the X and Y promoter of every pair:
#   CpG_norm = (#CG * 500) / (#C * #G)
#   raw_cpGs = #CG / 500 * 100
# NOTE(review): 500 is hard-coded - presumably the promoter length; confirm
# that every promoter slice is exactly 500 bases long.
gene = []
pairs = []
reg = []
CpG_norm = []
raw_cpGs = []

gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}

for pair, gl in gene_pair_dict.items():
    p_x, p_y = get_perc_fornormCG(gl, "promoter")
    promoter_seqs = (p_x, p_y)

    # Both normalized values are computed before anything is appended, so a
    # failing division leaves the result lists untouched for this pair.
    normalized = [
        (seq.count("CG") * 500) / (seq.count("C") * seq.count("G"))
        for seq in promoter_seqs
    ]

    for label, seq, norm_value in zip(('X_gene', 'Y_gene'), promoter_seqs, normalized):
        CpG_norm.append(norm_value)
        reg.append("promoter")
        pairs.append(pair)
        gene.append(label)
        raw_cpGs.append(seq.count("CG")/500 *100)

cpg_rows = list(zip(gene, pairs, reg, CpG_norm, raw_cpGs))
df1 = pd.DataFrame(cpg_rows, columns=['gene', 'pairs', 'region', 'CpG_norm', 'raw_cpGs'])
df1.to_csv(myPath + '/tables/CpG_0129.csv')

## generate alignment for x and y promoters (defined by gtf coordinates)

In [None]:
def get_perc(combo,region): 
    """Globally align the X and Y ``region`` sequences of a gene combination.

    ``combo[0]`` / ``combo[1]`` name the X and Y gene. For each gene the
    coordinate table is filtered on column 2 == ``region``; columns 3 and 4 of
    the first matching row are used as Python slice bounds on the first record
    of /sequences/primates/<gene>/human.

    Returns ``(percent_identity, GC_x, GC_y, best_alignment)`` where
    percent_identity is matching columns / aligned length * 100 (gap columns
    count towards the length), GC_x / GC_y are ``GC()`` of the unaligned
    slices and best_alignment is the highest-scoring result of
    ``pairwise2.align.globalxx``.
    """
    table_suffix = "_gtf_all103023.txt"

    table_x = pd.read_csv(myPath + "/tables/" + combo[0] + table_suffix, delimiter="\t", header = None)
    rows_x = table_x[table_x[2] == region]

    table_y = pd.read_csv(myPath + "/tables/" + combo[1] + table_suffix, delimiter="\t", header = None)
    rows_y = table_y[table_y[2] == region]

    records_x = list(SeqIO.parse(myPath + '/sequences/primates/' + combo[0] + "/human", "fasta"))
    records_y = list(SeqIO.parse(myPath + '/sequences/primates/' + combo[1] + "/human", "fasta"))

    # Slice bounds come from the first row of the requested region type.
    promoter_x = records_x[0].seq[rows_x.iloc[0,3]: rows_x.iloc[0,4]]
    promoter_y = records_y[0].seq[rows_y.iloc[0,3]: rows_y.iloc[0,4]]

    GC_x = GC(promoter_x)
    GC_y = GC(promoter_y)

    # Generate the promoter alignments and keep the one with the top score
    # (index 2 of each alignment tuple).
    candidates = pairwise2.align.globalxx(promoter_x, promoter_y)
    best_alignment = max(candidates, key=lambda candidate: candidate[2])

    aligned_x, aligned_y, _score, _begin, _end = best_alignment

    match_count = sum(1 for a, b in zip(aligned_x, aligned_y) if a == b)
    percent_identity = (match_count / len(aligned_x)) * 100

    return percent_identity, GC_x, GC_y, best_alignment


## rolling alignment of 50 bp from X perspective:

In [None]:
# Seq, MultipleSeqAlignment and the Bio.SeqRecord module are already imported
# in the first cell. The record class is reached as SeqRecord.SeqRecord so the
# name `SeqRecord` keeps referring to the module: concat_alignment calls
# SeqRecord.SeqRecord(...) and would raise AttributeError on any later re-run
# if the name were rebound to the class here.

WINDOW_SIZE = 50          # window length, counted in X (first row) bases
FIRST_WINDOW_START = 1    # first window start, 1-based X coordinate
WINDOW_START_STOP = 450   # exclusive upper bound for the window start

p = []
r = []
perc = []
end_nums = []
start_nums = []

gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}

for gp_d, gl in gene_pair_dict.items():
    print(gp_d)
    promoter_percent, GC_x, GC_y, best_alignment = get_perc(gl,  'promoter') #different alignment method

    # Rebuild the pairwise result (aligned X, aligned Y) as a two-row alignment
    # so that crop_alignment can slice it by X coordinate.
    seq_records = [SeqRecord.SeqRecord(Seq(aligned)) for aligned in (best_alignment[0], best_alignment[1])]
    alignment = MultipleSeqAlignment(seq_records)

    for num in range(FIRST_WINDOW_START, WINDOW_START_STOP, 1):
        end_num = num + WINDOW_SIZE
        crop_align_best = crop_alignment(num, end_num, alignment)
        if crop_align_best is None:
            # crop_alignment returns None when a coordinate is not present in
            # the X row (e.g. the window runs past the last X base); skip the
            # window instead of failing on None[0].
            continue

        aligned_seq1 = crop_align_best[0]
        aligned_seq2 = crop_align_best[1]

        # Identity = matching columns / window columns (gap columns included).
        num_matches = sum(a == b for a, b in zip(aligned_seq1, aligned_seq2))
        percent_identity = (num_matches / len(aligned_seq1)) * 100

        p.append(gp_d)
        r.append("promoter")
        perc.append(percent_identity)
        end_nums.append(end_num)
        start_nums.append(num)

df = pd.DataFrame((list(zip(p,r,perc, end_nums, start_nums))), columns = ['pair', 'region', 'percent', "end_nums", "start_nums"])


In [None]:
# Show the rolling-window table and save it with the other tables.
rolling_csv = myPath + '/tables/perc_rolling_0129.csv'
print(df)
df.to_csv(rolling_csv)

## get GC content promoters (from X perspective) 

In [None]:
# GC percentage of the two rows of each pair's filtered promoter alignment,
# computed over the A/C/G/T characters only (upper case; other characters
# are not counted).
g = []
gc = []
type_ofgene = []
regions1 = []

regions = [ "promoter"]  
gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}

for gp, gl in gene_pair_dict.items():
    print(gp)
    # KDM5C_KDM5D is skipped on purpose by the original analysis.
    if gp != "KDM5C_KDM5D":
        records = AlignIO.read(myPath + "/sequences/pair_align/" + gp + "/" + "human_XY_promoter_msa_filtered.phy", "phylip")
        human_seq = str(records[0].seq)
        animal_seq = str(records[1].seq)

        # Both values are computed before anything is appended.
        gc_values = []
        for row_seq in (human_seq, animal_seq):
            gc_count = row_seq.count('G') + row_seq.count('C')
            acgt_count = gc_count + row_seq.count('A') + row_seq.count('T')
            gc_values.append((gc_count / acgt_count) *100)
        h_gc, a_gc = gc_values

        # Row 0 is recorded as the X gene, row 1 as the Y gene.
        for label, value in (('X_gene', h_gc), ('Y_gene', a_gc)):
            gc.append(value)
            g.append(gp)
            type_ofgene.append(label)

            


In [70]:
# One row per (pair, gene) from the promoter GC cell.
promoter_gc_rows = list(zip(g, gc, type_ofgene))
df = pd.DataFrame(promoter_gc_rows, columns=['pair', 'gc', 'type'])


## scrambled promoter controls 

In [90]:
def get_perc_mixedp(combo, region, seed=None):
    """Percent identity of the shuffled X and Y ``region`` sequences (control).

    The X and Y slices are obtained as in get_perc (first ``region`` row of
    each gene's coordinate table, columns 3 and 4 as slice bounds on the first
    record of /sequences/primates/<gene>/human). The characters of each slice
    are then randomly permuted and the two shuffled strings are globally
    aligned with ``pairwise2.align.globalxx``.

    Parameters
    ----------
    combo : pair of gene names, ``combo[0]`` = X gene, ``combo[1]`` = Y gene.
    region : value matched against column 2 of the coordinate tables.
    seed : optional seed for a private random generator, for reproducible
        shuffles; with the default None the module-level ``random`` state is
        used, as before.

    Returns
    -------
    ``(percent_identity, best_alignment)`` with percent_identity = matching
    columns / aligned length * 100 (gap columns count towards the length).
    """
    # The tables are read from myPath + "/tables/", the location used by every
    # other reader in this notebook (this function alone pointed at
    # "/sequences/tables/").
    gtf_file_x = pd.read_csv(myPath + "/tables/" + combo[0] + "_gtf_all103023.txt", delimiter="\t", header = None)
    gtf_file_x1 = gtf_file_x[gtf_file_x[2] == region]

    gtf_file_y = pd.read_csv(myPath + "/tables/" + combo[1] + "_gtf_all103023.txt", delimiter="\t", header = None)
    gtf_file_y1 = gtf_file_y[gtf_file_y[2] == region]

    rec_x = list(SeqIO.parse(myPath + '/sequences/primates/' + combo[0] + "/human", "fasta"))
    rec_y = list(SeqIO.parse(myPath + '/sequences/primates/' + combo[1] + "/human", "fasta"))

    promoter_x = rec_x[0].seq[gtf_file_x1.iloc[0,3]: gtf_file_x1.iloc[0,4]]
    promoter_y = rec_y[0].seq[gtf_file_y1.iloc[0,3]: gtf_file_y1.iloc[0,4]]

    # Shuffle each sequence (X first, then Y); composition is preserved,
    # order is not.
    rng = random if seed is None else random.Random(seed)
    promoter_x = ''.join(rng.sample(list(promoter_x), len(promoter_x)))
    promoter_y = ''.join(rng.sample(list(promoter_y), len(promoter_y)))

    alignments = pairwise2.align.globalxx(promoter_x, promoter_y)

    # Keep the alignment with the top score (index 2 of each alignment tuple).
    best_alignment = max(alignments, key=lambda alignment: alignment[2])
    aligned_x, aligned_y, score, start, end = best_alignment

    num_matches = sum(a == b for a, b in zip(aligned_x, aligned_y))
    percent_identity = (num_matches / len(aligned_x)) * 100

    return percent_identity, best_alignment

In [None]:
# Scrambled-promoter control: percent identity of the shuffled X and Y
# promoters of each pair, saved with the other tables.
p = []
r = []
perc = []

gene_pair_dict = {"EIF1AX_EIF1AY": ["EIF1AX", "EIF1AY"], "KDM5C_KDM5D":["KDM5C", "KDM5D"], "KDM6A_UTY": ["KDM6A", "UTY"], "ZFX_ZFY": ["ZFX", "ZFY"], "DDX3X_DDX3Y": ["DDX3X", "DDX3Y"],
                 "USP9X_USP9Y": ["USP9X", "USP9Y"], "RPS4X_RPS4Y1": ['RPS4X', 'RPS4Y1']}

for gp, combo in gene_pair_dict.items():
    control_result = get_perc_mixedp(combo, 'promoter')
    promoter_percent = control_result[0]

    p.append(gp)
    r.append("promoter")
    perc.append(promoter_percent)

scramble_rows = list(zip(p, r, perc))
df = pd.DataFrame(scramble_rows, columns=['pair', 'region', 'percent'])
print(df)
df.to_csv(myPath + '/tables/percent_alignment_scramble.csv')