In [1]:
import os
import subprocess
import Bio
from Bio import SeqIO 
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
import glob
from Bio import SeqRecord
import pandas as pd
import genomicranges as gr
from Bio.Seq import Seq
import pyranges as pr
from Bio import AlignIO, SeqIO

In [3]:
def make_sbatch_file(filename):
    
    my_list = ["#!/bin/bash", 
               "#SBATCH --job-name=all_bz", 
               "#SBATCH --nodes=1",  
               "#SBATCH --ntasks=1",                     
               "#SBATCH --cpus-per-task=10",              
               "#SBATCH --mem=20gb",                    
               "#SBATCH --partition=20",                
               "##SBATCH --output all_bz-%j.out",  
               "#SBATCH --mail-type=ALL",               
               "#SBATCH --mail-user=hlharris@wi.mit.edu"] 
    
    with open(filename, "w") as file: 
        for item in my_list:
            file.write(item + '\n') 
    
def calc_zeros(alignment): 
    count = 0 
    return [0 if base == "-" else (count := count + 1) for base in alignment[0]]


def crop_alignment(start, end, alignment):
        
    zeros_seq = calc_zeros(alignment)
    try:
        ix_start = zeros_seq.index(start)
        ix_end = zeros_seq.index(end) 
        cropped_alignment = alignment[:, ix_start:ix_end]
        return cropped_alignment
    except: 
        return None
    
    
def concat_alignment(gtf_file, alignment):
    complete_align_type = MultipleSeqAlignment([]) 
    for recordix in range(len(alignment)): 
        new_record = SeqRecord("")
        new_record.id = alignment[recordix].id 
        new_record.seq = Seq("") #added this 

        complete_align_type.append(new_record)
    
    for index, row in gtf_file.iterrows():     
        crop_align = crop_alignment(row[3], row[4], alignment)
        #print(crop_align)
        if crop_align is not None: 
            for recordix in range(len(crop_align)): 
                complete_align_type[recordix].seq += crop_align[recordix].seq #append additional sequence
        
    return complete_align_type

def has_ACGT(sequence):
    count_ACGT = sum(c in 'ACTG' for c in sequence)
    return count_ACGT > len(sequence) / 4 #

In [1]:
#generate alignment 

myPath = #PATH TO GITHUB FOLDER 

genes = ['DDX3X', 'EIF1AX', 'KDM5C', 'KDM6A', 'RPS4X', 'USP9X', 'ZFX']


for gene in genes: 

    tree = '(humanmasked (bull dog))'
    
    os.chdir(myPath + '/sequences/' + gene)

    subprocess.run(["touch", "all_bz.log"])  #making a new file 
    
    with open("all_bz.log", "w") as log_file:
        result = subprocess.run(['all_bz', '-', tree], stdout=log_file, stderr=subprocess.PIPE, check=True)
        
    make_sbatch_file("testj1.sh") #make sbatch file - it makes a new one each time through so you dont have to delete the old one 
    print(gene) 
    with open("testj1.sh", "a") as file: #append to the file 
        file.write("bash ") 
        file.write("all_bz.log" + "\n")
        file.write("tba '" + tree + "' *.*.maf tba.maf >&tba.log" + "\n") 
        file.write("maf_project tba.maf humanmasked '" + tree + "' > human_proj.maf" + "\n") #project from the perspective of the X chr 
        file.write("msa_view -o FASTA human_proj.maf > " + gene + "_msa.fa")
        
    subprocess.run(["sbatch", 'testj1.sh'], stderr=subprocess.PIPE) 
    print(gene)
    os.chdir(myPath + '/sequences/')  

    

SyntaxError: invalid syntax (1432213991.py, line 3)

In [2]:
#get alignments with no editing of gaps w trimal
genes = ['DDX3X', 'EIF1AX', 'KDM5C', 'KDM6A', 'RPS4X', 'USP9X', 'ZFX']
regions = ["promoter", "exon", "intron"]   

for gene in genes: 
    alignment = AlignIO.read(myPath + "/sequences/" + gene + "/" + gene + "_msa.fa", "fasta")
    
    gtf_file = pd.read_csv(myPath + "/tables/" + gene + "_gtf_all103023.txt", delimiter="\t", header = None) 
    
    for region in regions: 
        
        gtf_file1 = gtf_file[gtf_file[2] == region] 
        #print(gtf_file1)
        returned_align = concat_alignment(gtf_file1,alignment) 
        #print(gene_pair, returned_align, region)
        SeqIO.write(returned_align, myPath + "/sequences/" + gene + "/" + gene + "_" + region + "_msa.fa", "fasta") #as fasta
        SeqIO.write(returned_align, myPath + "/sequences/" + gene + "/" + gene + "_" + region + "_msa.phy", "phylip") #as phylip


NameError: name 'AlignIO' is not defined

## generate percent calculations for regions/alignments

In [24]:
def edit_alignment(records_X, gene, region, specy):

    new_alignment = MultipleSeqAlignment([])
    
    if specy == "bull":
        records_x_lost = [0,1] 
    else:
        records_x_lost = [0,2]
        
    for ix in records_x_lost:
        
        new_record = records_X[ix]
     
        new_alignment.append(new_record)
    
    SeqIO.write(new_alignment, myPath + "/sequences/" + gene + "/" + "humanmasked." + specy + region + "_gaps_msa.phy", "phylip") #as phylip
    
    #NEED TO ADD YOUR PATH TO SOFTWARE PROGRAM TRIMAL 
    subprocess.run(['/lab/page_scratch/hannah/trimal/source/trimal', '-in', myPath + "/sequences/" + gene + "/" + "humanmasked." + specy + region + "gaps_msa.phy", '-out',  myPath +  "/sequences/" + gene + "/" + "humanmasked." + specy + region + "_gaps_msa.phy", '-noallgaps'],  stderr=subprocess.PIPE)

    records_X = list(SeqIO.parse( myPath + "/sequences/" + gene + "/" + "humanmasked." + specy + region + "_gaps_msa.phy", "phylip"))
    
    
    new_alignment = MultipleSeqAlignment([])
    records_x_lost = [0,1] #take edit the alignment since now its only 2 sequences long
    
    for ix in records_x_lost:
    # Replace '*' with '-' in the sequence
        
        modified_seq  = str(records_X[ix].seq).replace('-', 'K')
        modified_seq = str(modified_seq).replace('*', '-')
        modified_seq = str(modified_seq).replace('K', '*') #replacing the - w stars - want to keep the gaps for the calculations of % alignments 
        new_record = records_X[ix]
        new_record.seq = Seq(modified_seq)     
        new_alignment.append(new_record)
                
    
    SeqIO.write(new_alignment, myPath + "/sequences/" + gene + "/" + specy + region + "replgaps_msa.phy", "phylip") #as phylip

    subprocess.run(['/lab/page_scratch/hannah/trimal/source/trimal', '-in', myPath + "/sequences/" + gene + "/" + specy + region + "replgaps_msa.phy", '-out', myPath + "/sequences/" + gene + "/" + specy + region + "nogaps_msa.phy", '-gt', '1'],  stderr=subprocess.PIPE)
        
    try:
        new_alignment = list(SeqIO.parse(myPath + "/sequences/" + gene + "/" + specy + region + "nogaps_msa.phy", "phylip"))    
        return new_alignment
          
    except: 
        return []
    

In [3]:
regions = ["exon", "intron", "promoter"]  
genes = ['DDX3X', 'EIF1AX', 'KDM5C', 'KDM6A', 'RPS4X', 'USP9X', 'ZFX']
species = ["dog", "bull"]

p = []
r = []
perc = []
speci = []

for gene in genes: 
   
    
    for region in regions: 
           # print(records_X, "here")     
            records_X = list(SeqIO.parse(myPath + '/sequences/' + gene + "/" + gene + "_" + region + "_msa.fa", "fasta")) 
            #print(records_X[0])
            for specy in species: 
                new_alignment = edit_alignment(records_X, gene, region, specy)
            
                if new_alignment:
                    x_gene = new_alignment[0] 
                    y_gene = new_alignment[1] 
                    num_matches = sum(a == b for a, b in zip(x_gene, y_gene))
             
                    percent_identity = (num_matches / len(x_gene)) * 100
                    p.append(gene)
                    r.append(region) 
                    speci.append(specy)
                    perc.append(percent_identity)
                else: 
                    print(pair, region, "NONE")
                


df = pd.DataFrame((list(zip(p,r,perc, speci))), columns = ['pair', 'region', 'percent', "species"]) 
        
        

NameError: name 'SeqIO' is not defined

In [28]:

df.to_csv(myPath + '/tables/percent_alignment_1211_dogbull.csv')

In [76]:
g = []
gc = []
speci = []
regions1 = []

regions = ["exon", "intron", "promoter"]  
genes = ['DDX3X', 'EIF1AX', 'KDM5C', 'KDM6A', 'RPS4X', 'USP9X', 'ZFX']
species = ["dog", "bull"]


for gene in genes: 
    for region in regions:
        for specy in species: 
            records = AlignIO.read(myPath + "/sequences/" + gene + "/" + "humanmasked." + specy + region + "_gaps_msa.phy", "phylip")
            human_seq = str(records[0].seq)
            animal_seq = str(records[1].seq)
            
            h_gc = ((human_seq.count('G') + human_seq.count('C')) / (human_seq.count('G') + human_seq.count('C') + human_seq.count('A') + human_seq.count('T'))) *100
            a_gc = ((animal_seq.count('G') + animal_seq.count('C')) / (animal_seq.count('G') + animal_seq.count('C') + animal_seq.count('A') + animal_seq.count('T'))) *100
            
            gc.append(h_gc)
            g.append(gene)
            speci.append('human')
            
            regions1.append(region)
            
            gc.append(a_gc)
            g.append(gene)
            speci.append(specy)
            regions1.append(region) 


In [77]:
df = pd.DataFrame((list(zip(g,gc,speci, regions1))), columns = ['gene', 'gc', 'species', 'region']) 

In [4]:
df.to_csv(myPath + '/tables/percent_GC_1211_dogbull.csv')

NameError: name 'df' is not defined