In [None]:
## use RNA-STAR conda environment
from pathlib import Path
import traceback
import argparse
import pandas as pd
import pysam
import subprocess

### Loading in data

In [None]:
# Motif sites per transcript (tab-separated). The motifs are written with
# Uracil (U), while each BAM file has sequences with Thymine (T), so the
# motifs are rewritten to use T before joining.
left = pd.read_csv("UNUAR_motif_sites_mRNA.tsv", sep="\t")
left["Motif"] = left["Motif"].str.replace("U", "T")

# Per-motif table from the supplementary spreadsheet.
right = pd.read_excel("SupplementaryTable1.xlsx")

# Left join on "Motif": every motif-site row is kept.
df = left.merge(right, how="left", on="Motif")

In [24]:
# Show the column labels of the merged table.
df.columns

Index(['TranscriptID', 'Motif', 'Region', 'Chrom', 'Strand',
       'TranscriptPosStart', 'TranscriptPosEnd', 'TranscriptModBase',
       'GenomicModBase', 'TranscriptLength', 'DistFromAUG', 'DistFromSTOP',
       'DistFromExonStart', 'DistFromExonEnd', 'fit_A', 'fit_B', 'fit_R'],
      dtype='object')

In [23]:
# Preview the last five rows of the merged table.
df.tail(5)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,DistFromAUG,DistFromSTOP,DistFromExonStart,DistFromExonEnd,fit_A,fit_B,fit_R
3425688,rna-XM_047417842.1,TATAA,3UTR,NC_000001.11,+,2262,2267,2264,248918727,2684,2133,364,2125,419,0.829824,1e-08,0.68498
3425689,rna-XM_047417842.1,TCTAA,3UTR,NC_000001.11,+,2267,2272,2269,248918732,2684,2138,369,2130,414,0.753295,0.03700348,0.763335
3425690,rna-XM_047417842.1,TATAA,3UTR,NC_000001.11,+,2286,2291,2288,248918751,2684,2157,388,2149,395,0.829824,1e-08,0.68498
3425691,rna-XM_047417842.1,TGTAA,3UTR,NC_000001.11,+,2297,2302,2299,248918762,2684,2168,399,2160,384,0.763066,0.03536472,0.670875
3425692,rna-XM_047417842.1,TTTAG,3UTR,NC_000001.11,+,2380,2385,2382,248918845,2684,2251,482,2243,301,0.841842,0.08050644,0.610735


In [21]:
# Total number of missing cells across every column of the merged table.
null_val = df.isna().sum().sum()
print(f"Null values: {null_val}")

Null values: 0


### Opening BAM file

In [25]:
# Display the merged table (the rendering is truncated to the first and last rows).
df

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,DistFromAUG,DistFromSTOP,DistFromExonStart,DistFromExonEnd,fit_A,fit_B,fit_R
0,rna-NM_001134855.2-2,TCTAG,CDS,NW_018654708.1,-,1358,1363,1360,373,1733,880,151,101,372,0.716983,5.129877e-02,0.678453
1,rna-NM_001134855.2-2,TCTAA,3UTR,NW_018654708.1,-,1530,1535,1532,201,1733,1052,21,273,200,0.753295,3.700348e-02,0.763335
2,rna-NM_001171747.2-2,TCTAA,CDS,NW_019805492.1,+,98,103,100,101,1500,100,187,100,1399,0.753295,3.700348e-02,0.763335
3,rna-NM_001171747.2-2,TTTAG,3UTR,NW_019805492.1,+,357,362,359,360,1500,359,72,359,1140,0.841842,8.050644e-02,0.610735
4,rna-NM_001171747.2-2,TTTAG,3UTR,NW_019805492.1,+,383,388,385,386,1500,385,98,385,1114,0.841842,8.050644e-02,0.610735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3425688,rna-XM_047417842.1,TATAA,3UTR,NC_000001.11,+,2262,2267,2264,248918727,2684,2133,364,2125,419,0.829824,1.000000e-08,0.684980
3425689,rna-XM_047417842.1,TCTAA,3UTR,NC_000001.11,+,2267,2272,2269,248918732,2684,2138,369,2130,414,0.753295,3.700348e-02,0.763335
3425690,rna-XM_047417842.1,TATAA,3UTR,NC_000001.11,+,2286,2291,2288,248918751,2684,2157,388,2149,395,0.829824,1.000000e-08,0.684980
3425691,rna-XM_047417842.1,TGTAA,3UTR,NC_000001.11,+,2297,2302,2299,248918762,2684,2168,399,2160,384,0.763066,3.536472e-02,0.670875


In [None]:
def deletion_rate(input_bam_name):
    """Open a BAM file for deletion-rate analysis.

    This is currently a stub: the alignment file is opened (which also checks
    that it is readable as BAM) and closed again; no rates are computed or
    returned yet.

    Parameters
    ----------
    input_bam_name : str or path-like
        Path of the BAM file to open in binary read mode ("rb").

    Returns
    -------
    None
    """
    # The context manager closes the file handle even if the processing that
    # will live in this block raises.
    with pysam.AlignmentFile(input_bam_name, "rb") as bamfile:
        # TODO: compute the deletion rates from `bamfile`.
        pass

In [None]:
def open_bam(folder_name):
    """Run the deletion-rate step on every BAM file of one realignment folder.

    Looks in ``<current working directory>/realignments/<folder_name>`` and,
    for each immediate subdirectory, passes every ``*.bam`` file found directly
    inside it to ``deletion_rate``. Files that sit directly in
    ``<folder_name>`` (not in a subdirectory) are ignored.

    Parameters
    ----------
    folder_name : str
        Name of the folder under ``realignments`` to process.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Any error raised while listing the directories or while processing a
        BAM file is reported (message plus traceback) and then re-raised
        unchanged, e.g. ``FileNotFoundError`` when the folder does not exist.
    """
    input_dir = Path.cwd() / "realignments" / folder_name

    try:
        # Sorted so the processing order is reproducible across runs and
        # filesystems (directory listing order is otherwise arbitrary).
        for subfolder in sorted(input_dir.iterdir()):
            if not subfolder.is_dir():
                continue

            # glob() already yields Path objects, so no conversion is needed.
            for bam in sorted(subfolder.glob("*.bam")):
                deletion_rate(bam)

    except Exception as e:
        print(f"Failed to calculate deletion rates in {folder_name}: {e}")
        traceback.print_exc()
        raise

In [None]:
if __name__ == "__main__":
    # Command-line entry point: takes the folder to process via --folder_name
    # and hands it to open_bam().
    # NOTE(review): parse_args() reads sys.argv, so this cell presumably targets
    # script execution rather than an interactive kernel - confirm.
    parser = argparse.ArgumentParser(description = "Calculates observed and real deletion rates for every read in a BAM file.")
    parser.add_argument("--folder_name", help = "Name of processed_fastqs folder", required = True)
    args = parser.parse_args()

    # Status messages name the step this script performs (deletion-rate
    # calculation), matching the parser description above.
    print("Starting deletion rate calculation...")
    open_bam(args.folder_name)
    print("Deletion rate calculation finished.")