In [1]:
import os

# switch into the folder that holds the example files, then list its contents
examples_dir = os.path.realpath('examples')
os.chdir(examples_dir)
print(os.listdir())

['483510-hisat2-hybrid-alignment-trimmed.out', '483513-hisat2-hybrid-alignment.out']


In [2]:
# Load 483510-hisat2-hybrid-alignment-trimmed.out as a list of lines.
# The file is opened by name: os.listdir() returns entries in arbitrary order,
# so indexing its result with [0] is not guaranteed to pick this file.
with open('483510-hisat2-hybrid-alignment-trimmed.out', 'r') as f:
    text = f.read().split("\n")  # one list element per line of the log

# Drop the leading junk lines.
# NOTE(review): this slice removes 118 lines (indices 0-117); confirm the junk
# header really is 118 lines long and not 117.
text = text[118:]

print(text[0:5])



In [4]:
# a function to extract sample names from the log, given as a list with one line of text per element
def get_sample_names(list):
    """Return the sample IDs found in a log given as a list of lines.

    A sample ID is read from the line that immediately follows each marker
    line ("Aligning reads to the indexed genome using HISAT2 ..."). That
    following line is expected to be a read file name ending in
    "_1.trimmed.fq.gz" or "_1.fq.gz"; the suffix is removed and the remaining
    prefix is the sample ID. Following lines with any other ending are skipped.

    Parameters
    ----------
    list : sequence of str
        The log, one line per element. (The name shadows the builtin but is
        kept so existing keyword callers keep working.)

    Returns
    -------
    list of str
        Sample IDs in the order their marker lines appear.
    """
    marker = "Aligning reads to the indexed genome using HISAT2 and generating sorted BAM files using Samtools..."
    # the longer suffix is tested first; the two cannot both match one name
    suffixes = ("_1.trimmed.fq.gz", "_1.fq.gz")

    sample_ids = []
    # pair every line with its successor; a marker on the very last line has
    # no successor and is therefore ignored instead of raising IndexError
    for line, following in zip(list, list[1:]):
        if line != marker:
            continue
        for suffix in suffixes:
            # match on the actual file-name ending rather than a fixed
            # character offset, so sample names of any length are handled
            if following.endswith(suffix):
                sample_ids.append(following[:-len(suffix)])
                break
    return sample_ids

# show the sample IDs extracted from the log lines
print(get_sample_names(text))

['amelRNA_1', 'amelRNA_2', 'amelRNA_3', 'amelRNA_4', 'amelRNA_5', 'amelRNA_6', 'amelRNA_7', 'amelRNA_8', 'amelRNA_9', 'amelRNA_10', 'amelRNA_11', 'amelRNA_12', 'amelRNA_13', 'amelRNA_14', 'amelRNA_15', 'amelRNA_16', 'amelRNA_18', 'amelRNA_19']


In [5]:
# a function to extract alignment statistics
def get_alignment_stats(list):
    """Extract alignment statistics from a log given as a list of lines.

    Each line is checked against a fixed set of known line endings (for
    example "reads; of these:" or "overall alignment rate"). When a line ends
    with one of them, the text in front of that ending is the statistic of
    interest; it is stripped of surrounding whitespace and collected. Lines
    that end with none of the known endings are ignored.

    Parameters
    ----------
    list : iterable of str
        The log, one line per element. (The name shadows the builtin but is
        kept so existing keyword callers keep working.)

    Returns
    -------
    list of str
        One flat list of the extracted values, in the order the matching
        lines appear in the input.
    """
    # The wanted value always precedes one of these endings on the same line.
    # Order is kept from the original if/elif chain: the first match wins.
    suffixes = (
        "reads; of these:",  # total number of paired reads
        "aligned concordantly exactly 1 time",  # pairs aligned concordantly exactly once
        "aligned concordantly >1 times",  # pairs aligned concordantly more than once
        "pairs aligned concordantly 0 times; of these:",  # pairs aligned concordantly 0 times
        "aligned discordantly 1 time",  # pairs aligned discordantly once
        "pairs aligned 0 times concordantly or discordantly; of these:",  # pairs not aligned as a pair
        "mates make up the pairs; of these:",  # individual mates making up those pairs
        "aligned exactly 1 time",  # mates aligned exactly once
        "aligned >1 times",  # mates aligned more than once
        "overall alignment rate",  # overall alignment rate
    )

    alignment_stats = []
    for element in list:
        # ignore trailing whitespace so it cannot hide a known ending
        line = element.rstrip()
        for suffix in suffixes:
            if line.endswith(suffix):
                alignment_stats.append(line[:-len(suffix)].strip())
                break
    return alignment_stats

# show the flat list of alignment statistics extracted from the log lines
print(get_alignment_stats(text))

['29620775', '26049478 (87.94%)', '525302 (1.77%)', '3045995', '225090 (7.39%)', '2820905', '5641810', '1780566 (31.56%)', '56423 (1.00%)', '93.58%', '30911037', '27239267 (88.12%)', '680166 (2.20%)', '2991604', '274388 (9.17%)', '2717216', '5434432', '1792804 (32.99%)', '72653 (1.34%)', '94.23%', '31395825', '27971795 (89.09%)', '572134 (1.82%)', '2851896', '272897 (9.57%)', '2578999', '5157998', '1602742 (31.07%)', '85290 (1.65%)', '94.47%', '32112536', '28430984 (88.54%)', '535309 (1.67%)', '3146243', '228136 (7.25%)', '2918107', '5836214', '1931165 (33.09%)', '76512 (1.31%)', '94.04%', '32081947', '28367725 (88.42%)', '637570 (1.99%)', '3076652', '279069 (9.07%)', '2797583', '5595166', '1874079 (33.49%)', '66125 (1.18%)', '94.30%', '31950608', '28587787 (89.47%)', '460095 (1.44%)', '2902726', '276220 (9.52%)', '2626506', '5253012', '1735708 (33.04%)', '55250 (1.05%)', '94.58%', '30718531', '27126080 (88.31%)', '396309 (1.29%)', '3196142', '395726 (12.38%)', '2800416', '5600832', '1