# Primer - NanoStat

In [29]:
import pandas as pd
import fnmatch
import os

In [30]:
def get_absolute_file_paths(folder_name: str, filter: str = "*") -> list[str]:
    """Recursively collect absolute paths of files whose name matches a pattern.

    Args:
        folder_name: Directory to walk; every sub-directory is visited as well.
        filter: ``fnmatch``-style pattern tested against the bare file name
            (not the full path). The default ``"*"`` accepts every file.
            The name shadows the built-in ``filter``; it is kept so that
            existing keyword callers keep working.

    Returns:
        Absolute paths of all matching files, sorted lexicographically.
        The list is empty when nothing matches or when ``folder_name``
        does not exist (``os.walk`` yields nothing in that case).
    """
    absolute_paths = [
        os.path.abspath(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(folder_name)
        for filename in filenames
        if fnmatch.fnmatch(filename, filter)
    ]
    # os.walk yields directory entries in arbitrary, platform-dependent order;
    # sorting keeps the result reproducible between runs and machines.
    absolute_paths.sort()
    return absolute_paths

def parse_line(line: str) -> float:
    """Return the last whitespace-separated token of ``line`` as a float.

    Commas inside the token are removed before conversion, so ``"1,234.5"``
    is read as ``1234.5``.

    Raises:
        IndexError: if ``line`` contains no tokens (empty or whitespace only).
        ValueError: if the last token is not a valid number.
    """
    # str.split() without arguments already ignores leading/trailing whitespace.
    last_token = line.split()[-1]
    return float(last_token.replace(",", ""))
    

def parse_general_summary(data: list[str]) -> dict:
    """Extract the general-summary statistics from the lines of a report.

    Each line is checked against a fixed set of labels; the first label found
    in the line decides the key under which the value returned by
    ``parse_line`` is stored. Lines containing none of the labels are ignored,
    and a label seen more than once keeps the value of its last occurrence.

    Args:
        data: Lines of the report, e.g. the result of ``file.readlines()``.

    Returns:
        Dict mapping a short statistic name (``"mean_rl"``, ``"total_bases"``,
        ...) to its value as a float. Only statistics whose label was found
        are present.
    """
    # (label searched for in the line, key used in the result); the order is
    # the order in which the labels are tested.
    label_to_key = (
        ("Mean read length:", "mean_rl"),
        ("Mean read quality:", "mean_rq"),
        ("Median read length:", "median_rl"),
        ("Median read quality:", "median_rq"),
        ("Number of reads:", "number_of_reads"),
        ("Read length N50:", "read_len_50"),
        ("STDEV read length:", "stdev_read_len"),
        ("Total bases:", "total_bases"),
    )

    results = {}
    for line in data:
        for label, key in label_to_key:
            if label in line:
                results[key] = parse_line(line)
                break  # only the first matching label applies to a line
    return results

In [40]:
# Collect every file below data/nanostat; each one is read as one report.
file_paths = get_absolute_file_paths("data/nanostat", "*")

# Row labels of the summary table: the bare file name of each report.
file_names_index = [os.path.basename(path) for path in file_paths]

# One dict of parsed statistics per report, in the same order as file_paths.
general_summary_clean_data_list = []
for path in file_paths:
    with open(path, "r") as report_file:
        report_lines = report_file.readlines()
    general_summary_clean_data_list.append(parse_general_summary(report_lines))

clean_data = pd.DataFrame(
    general_summary_clean_data_list,
    index=file_names_index,
)
clean_data


Unnamed: 0,mean_rl,mean_rq,median_rl,median_rq,number_of_reads,read_len_50,stdev_read_len,total_bases
CFBP_1719,3434.6,12.5,1943.0,12.6,1725.0,6773.0,4219.5,5924628.0
CFBP_2502,3838.9,12.6,2465.0,12.8,1245.0,7062.0,4343.6,4779463.0
CFBP_3157_ONT_merged,8141.9,13.2,4968.0,13.4,1681.0,15409.0,9368.7,13686597.0
CFBP_3395,4561.4,12.4,2988.0,12.5,2893.0,8476.0,5012.0,13196187.0
CFBP_3445_ONT_merged,6394.6,12.9,4065.0,13.0,12895.0,11739.0,7396.4,82458084.0
NCPPB_3253_ONT_merged,7979.2,12.9,4155.0,13.0,14327.0,17926.0,10603.5,114318078.0
NIB_Z_2806,7372.6,12.5,4498.5,12.6,10314.0,13871.0,8034.5,76041040.0
NIB_Z_2809,6612.1,12.5,4039.0,12.6,13054.0,12859.0,7633.7,86314616.0
NIB_Z_3057,6165.7,12.7,3810.5,12.7,26236.0,10078.0,7401.5,161763592.0
NIB_Z_3516,6366.0,12.7,3621.5,12.7,17076.0,11346.0,8202.1,108705615.0


In [44]:
# Save the summary table to a CSV file in the current working directory.
clean_data.to_csv("general_summary.csv")

In [50]:
# Save the summary table as an Excel workbook in the current working directory.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed
# for .xlsx output — confirm the environment provides one.
clean_data.to_excel("general_summary.xlsx")

In [59]:
# Column access: three equivalent ways to select the "mean_rl" column and sum it.
# Only the value of the last expression is shown as the cell output.
clean_data.loc[:, "mean_rl"].sum()  # label-based indexing with .loc
clean_data["mean_rl"].sum()  # bracket (key) access
clean_data.mean_rl.sum()  # attribute access

68850.1

In [None]:
# Plotting: bar chart of the "mean_rl" column, one bar per row of the table.
clean_data["mean_rl"].plot(kind="bar")