In [None]:
import glob
import os
import subprocess

import Bio
import genomicranges as gr
import pandas as pd
import pyranges as pr
from Bio import SeqIO
from Bio import AlignIO
from Bio import SeqRecord
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
# Deliberately placed after `from Bio import SeqRecord`: this rebinds the name
# `SeqRecord` from the module to the class, which is what the helper functions
# defined in the following cells call.
from Bio.SeqRecord import SeqRecord

In [None]:
def make_sbatch_file(filename):
    """Write a fixed sbatch header to ``filename``.

    The file is opened in write mode, so any existing content is replaced.
    Only the shebang and the ``#SBATCH`` directive lines are written, one per
    line; no job command is added.

    Parameters
    ----------
    filename : str or path-like
        Destination of the batch script header.
    """
    header_directives = (
        "#!/bin/bash",
        "#SBATCH --job-name=all_bz",
        "#SBATCH --nodes=1",
        "#SBATCH --ntasks=1",
        "#SBATCH --cpus-per-task=10",
        "#SBATCH --mem=20gb",
        "#SBATCH --partition=20",
        # Written with a doubled '#'; presumably this disables the directive
        # -- confirm before relying on the output file name.
        "##SBATCH --output all_bz-%j.out",
        "#SBATCH --mail-type=ALL",
        "#SBATCH --mail-user=hlharris@wi.mit.edu",
    )

    with open(filename, "w") as handle:
        handle.writelines(directive + '\n' for directive in header_directives)
    
def calc_zeros(alignment):
    """Map every column of the first alignment row to an ungapped position.

    Walks the first row (``alignment[0]``) left to right. A gap character
    (``"-"``) yields ``0``; any other character yields the running count of
    non-gap characters seen so far, so the first non-gap column is ``1``.

    Returns
    -------
    list of int
        One entry per column of the first row.
    """
    reference_row = alignment[0]
    position_map = []
    residues_seen = 0
    for symbol in reference_row:
        if symbol == "-":
            position_map.append(0)
        else:
            residues_seen += 1
            position_map.append(residues_seen)
    return position_map


def crop_alignment(start, end, alignment):
    """Return the alignment columns between two positions of the first row.

    ``start`` and ``end`` are looked up in the position map produced by
    ``calc_zeros`` (ungapped, 1-based positions of the first row) and turned
    into column indices; the alignment is then sliced on those columns.

    Parameters
    ----------
    start, end : int
        Ungapped positions in the first row of ``alignment``.
    alignment : alignment object supporting ``alignment[:, i:j]``
        The alignment to crop.

    Returns
    -------
    The column slice ``alignment[:, ix_start:ix_end]``, or ``None`` when
    either position does not occur in the first row.

    Notes
    -----
    * The slice is half-open, so the column holding ``end`` itself is NOT part
      of the result. NOTE(review): if the coordinates are inclusive on both
      ends (as in standard GTF), the last base of every feature is dropped --
      confirm this is intended.
    * A position of ``0`` is not rejected: it matches the first gap column of
      the first row, because gaps are encoded as ``0`` in the position map.
    """
    zeros_seq = calc_zeros(alignment)
    try:
        ix_start = zeros_seq.index(start)
        ix_end = zeros_seq.index(end)
    except ValueError:
        # list.index raises ValueError when the position is absent from the
        # first row. Only that case means "cannot crop"; any other exception
        # (including KeyboardInterrupt) now propagates instead of being hidden.
        return None
    return alignment[:, ix_start:ix_end]
    
    
def concat_alignment(gtf_file, alignment):
    """Concatenate the alignment slices selected by every row of ``gtf_file``.

    For each row, the values in the columns labelled ``3`` and ``4`` are passed
    to ``crop_alignment`` as ``start`` and ``end``. Every slice that could be
    cropped is appended, in row order, to a per-record running sequence.

    Parameters
    ----------
    gtf_file : pandas.DataFrame
        Table whose columns ``3`` and ``4`` hold the start and end positions.
    alignment : MultipleSeqAlignment
        Source alignment; it is only read, never modified.

    Returns
    -------
    MultipleSeqAlignment
        One record per record of ``alignment`` (same order, same ``id``) whose
        sequence is the concatenation of all cropped slices. Rows for which
        ``crop_alignment`` returns ``None`` are skipped, so the sequences are
        empty when no row could be cropped.
    """
    complete_align_type = MultipleSeqAlignment([])
    # Seed the result with one empty record per input record. The record is
    # built directly with an empty Seq instead of a placeholder str that is
    # overwritten afterwards.
    for source_record in alignment:
        complete_align_type.append(SeqRecord(Seq(""), id=source_record.id))

    for _, row in gtf_file.iterrows():
        crop_align = crop_alignment(row[3], row[4], alignment)
        if crop_align is None:
            # Coordinates not found in the first row: nothing to add.
            continue
        # Both alignments keep the record order of `alignment`, so pairing
        # them positionally extends each record with its own slice.
        for target_record, cropped_record in zip(complete_align_type, crop_align):
            target_record.seq += cropped_record.seq

    return complete_align_type

In [None]:
def has_ACGT(sequence):
    """Tell whether more than a quarter of ``sequence`` is upper-case A/C/G/T.

    The comparison is strict, so a sequence in which exactly 25% of the
    characters are A, C, G or T returns ``False``, as does an empty sequence.
    Lower-case letters are not counted.

    Returns
    -------
    bool
    """
    called_bases = 0
    for symbol in sequence:
        if symbol in 'ACTG':
            called_bases += 1
    quarter_length = len(sequence) / 4
    return called_bases > quarter_length

In [16]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# ---------------------------------------------------------------------------
# Per-gene driver.
# For every gene: pair the first record of the masked MSA with each other
# species, write that pair as FASTA, then for every region type concatenate the
# matching GTF intervals, filter/clean the records, write PHYLIP and run trimAl.
# ---------------------------------------------------------------------------
genes = [ "EIF1AX", "EIF1AY", "KDM5D" , "KDM5C","UTY", "KDM6A", "ZFY", "ZFX", "DDX3Y" ,"DDX3X", "USP9Y" , "USP9X", "RPS4Y1", "RPS4X"]

regions = ["exon", "intron", "promoter"]

# Not used below; kept because other cells may read them.
species_dict = {0:"human", 1:"chimp", 2: "mac", 3: "marm"}
files_generated = []

spec1 = 'humanmasked'

# Locations named once instead of being repeated in every path expression.
ALIGN_DIR = '/lab/solexa_page/hannah/220516_mpra/msa/long_alignments/'
MULTIZ_DIR = ALIGN_DIR + 'multiz_7sp/'
TRIMAL_BIN = '/lab/page_scratch/hannah/trimal/source/trimal'

for gene in genes:
    print(gene)
    gene_dir = MULTIZ_DIR + gene + "/"
    records = list(SeqIO.parse(ALIGN_DIR + gene + ".fa", "fasta"))

    # Pick the species set from the number of records in the gene FASTA.
    # NOTE(review): why exactly 3 records selects the set without marm is not
    # visible here -- confirm. `tree` is assigned but not used in this cell.
    if len(records) == 3:
        tree = '((((((humanmasked chimp) gorilla) orangutan) pileatedgibbon) mac) lemur loris)'
        list_of_species = ['chimp', 'gorilla', 'orangutan', 'pileatedgibbon', 'mac']
    else:
        tree = '((((((humanmasked chimp) gorilla) orangutan) pileatedgibbon) mac marm) lemur loris)'
        list_of_species = ['chimp', 'gorilla', 'orangutan', 'pileatedgibbon', 'mac', 'marm']

    # Switch into the gene's directory. This does not create it: the directory
    # must already exist.
    os.chdir(MULTIZ_DIR + gene)

    gtf_file = pd.read_csv(ALIGN_DIR + gene + "_gtf_all103023.txt", delimiter="\t", header = None)

    # The masked MSA is the same for every species of this gene, so parse it
    # once per gene rather than once per species.
    msa = list(SeqIO.parse(gene_dir + gene + "_msamasked.fa", "fasta"))

    # Index 0 of `msa` is paired with every other species; species i of
    # `list_of_species` sits at index i + 1.
    for spec2ix, spec2 in enumerate(list_of_species, start=1):
        second_align_file_name = spec1 + "." + spec2
        new_filepath = gene_dir + second_align_file_name + "_msa.fa"

        pair_alignment = MultipleSeqAlignment([msa[0]])
        pair_alignment.append(msa[spec2ix])
        SeqIO.write(pair_alignment, new_filepath, "fasta")

        # Read the pair back once; it does not change between regions.
        pair_from_disk = AlignIO.read(new_filepath, "fasta")

        # Split into one file per region type, driven by column 2 of the GTF.
        for region in regions:
            gtf_file1 = gtf_file[gtf_file[2] == region]
            returned_align = concat_alignment(gtf_file1, pair_from_disk)

            # NOTE(review): if only one of the two records passes the filter,
            # a single-sequence file is still written and trimmed -- confirm
            # that this is intended.
            filtered_sequences = [record for record in returned_align if has_ACGT(record.seq)]
            if not filtered_sequences:
                print("check")
                continue

            region_prefix = gene_dir + second_align_file_name + "_" + region
            phylip_path = region_prefix + "_msa.phy"
            trimmed_path = region_prefix + "_msa_filtered.phy"

            # Turn '*' and upper-case 'N' into gap characters before writing.
            new_alignment = MultipleSeqAlignment([])
            for record in filtered_sequences:
                modified_seq = str(record.seq).replace('*', '-').replace('N', '-')
                new_alignment.append(SeqRecord(Seq(modified_seq), id=record.id))

            # Written once: the uncleaned records used to be written to this
            # same path first and were immediately overwritten.
            SeqIO.write(new_alignment, phylip_path, "phylip")

            trimal_run = subprocess.run(
                [TRIMAL_BIN, '-in', phylip_path, '-out', trimmed_path, '-gt', '1'],
                stderr=subprocess.PIPE,
            )
            if trimal_run.returncode != 0:
                # stderr is captured above; report it instead of dropping it so
                # a failed trim does not go unnoticed.
                print(
                    "trimal failed (exit " + str(trimal_run.returncode) + ") for "
                    + phylip_path + ": "
                    + trimal_run.stderr.decode(errors="replace").strip()
                )

EIF1AX
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
EIF1AY
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter region
now im here and UGH
promoter_a region
stupid check
now im here and UGH
promoter 