# Primer - Nanostat

In [29]:
import pandas as pd
import fnmatch
import os

In [30]:
def get_absolute_file_paths(folder_name: str, filter: str = "*") -> list[str]:
    absolute_paths = []
    for dirpath, _, filenames in os.walk(folder_name):
        for filename in filenames:
            if fnmatch.fnmatch(filename, filter):
                absolute_path = os.path.abspath(os.path.join(dirpath, filename))
                absolute_paths.append(absolute_path)
    return absolute_paths

def parse_line(line: str) -> float:
    line_splitted = line.strip().split()
    number = line_splitted[-1].replace(",","")
    return float(number)
    

def parse_general_summary(data: list[str]) -> dict:
    results = {}
    for line in data:
        if "Mean read length:" in line:
            results["mean_rl"] = parse_line(line)
        elif "Mean read quality:" in line:
            results["mean_rq"] = parse_line(line)
        elif "Median read length:" in line:
            results["median_rl"] = parse_line(line)
        elif "Median read quality:" in line:
            results["median_rq"] = parse_line(line)
        elif "Number of reads:" in line:
            results["number_of_reads"] = parse_line(line)
        elif "Read length N50:" in line:
            results["read_len_50"] = parse_line(line)
        elif "STDEV read length:" in line:
            results["stdev_read_len"] = parse_line(line)
        elif "Total bases:" in line:
            results["total_bases"] = parse_line(line)
    return results

In [39]:
file_paths= get_absolute_file_paths("data/nanostat", "*")
general_summary_clean_data_list = [] 
file_names_index = []

for path in file_paths:
    file_name = os.path.basename(path)
    with open(path, "r") as test_file:
        data = test_file.readlines()
        general_summary = parse_general_summary(data)
        general_summary_clean_data_list.append(general_summary)

clean_data = pd.DataFrame(general_summary_clean_data_list, index=["a","b","a","b","a","b","a","b","a","b","c"])
clean_data
    

Unnamed: 0,mean_rl,mean_rq,median_rl,median_rq,number_of_reads,read_len_50,stdev_read_len,total_bases
a,3434.6,12.5,1943.0,12.6,1725.0,6773.0,4219.5,5924628.0
b,3838.9,12.6,2465.0,12.8,1245.0,7062.0,4343.6,4779463.0
a,8141.9,13.2,4968.0,13.4,1681.0,15409.0,9368.7,13686597.0
b,4561.4,12.4,2988.0,12.5,2893.0,8476.0,5012.0,13196187.0
a,6394.6,12.9,4065.0,13.0,12895.0,11739.0,7396.4,82458084.0
b,7979.2,12.9,4155.0,13.0,14327.0,17926.0,10603.5,114318078.0
a,7372.6,12.5,4498.5,12.6,10314.0,13871.0,8034.5,76041040.0
b,6612.1,12.5,4039.0,12.6,13054.0,12859.0,7633.7,86314616.0
a,6165.7,12.7,3810.5,12.7,26236.0,10078.0,7401.5,161763592.0
b,6366.0,12.7,3621.5,12.7,17076.0,11346.0,8202.1,108705615.0


In [2]:
data

['General summary:         \n',
 'Mean read length:              3,434.6\n',
 'Mean read quality:                12.5\n',
 'Median read length:            1,943.0\n',
 'Median read quality:              12.6\n',
 'Number of reads:               1,725.0\n',
 'Read length N50:               6,773.0\n',
 'STDEV read length:             4,219.5\n',
 'Total bases:               5,924,628.0\n',
 'Number, percentage and megabases of reads above quality cutoffs\n',
 '>Q5:\t1725 (100.0%) 5.9Mb\n',
 '>Q7:\t1725 (100.0%) 5.9Mb\n',
 '>Q10:\t1568 (90.9%) 5.4Mb\n',
 '>Q12:\t1080 (62.6%) 3.9Mb\n',
 '>Q15:\t96 (5.6%) 0.2Mb\n',
 'Top 5 highest mean basecall quality scores and their read lengths\n',
 '1:\t19.7 (270)\n',
 '2:\t19.6 (242)\n',
 '3:\t18.6 (145)\n',
 '4:\t18.0 (432)\n',
 '5:\t17.9 (296)\n',
 'Top 5 longest reads and their mean basecall quality score\n',
 '1:\t36198 (10.3)\n',
 '2:\t35371 (11.6)\n',
 '3:\t29367 (9.7)\n',
 '4:\t28874 (13.1)\n',
 '5:\t27986 (13.1)\n']