# Process all files

In this notebook, we load the list of files and process them in parallel.
The result for each file is saved as an HDF5 (`.h5`) file in the destination folder.


## Load the list of files

In [None]:
from pathlib import Path
from ipyfilechooser import FileChooser
import pandas as pd
import dnasufo 

# Default location handed to both choosers: Path("") is the relative path ".".
root = dst = Path("")

# One chooser per folder; the selections are read back in the next code cell.
fc1 = FileChooser(root, select_desc="Root")
fc2 = FileChooser(dst, select_desc="Destination")
display(fc1, fc2)

In [None]:
# `selected` is None until the user has picked something in the chooser, and
# Path(None) would fail with an unhelpful TypeError, so check explicitly.
if fc1.selected is None or fc2.selected is None:
    raise ValueError(
        "Select both the root and the destination folder in the file "
        "choosers above before running this cell."
    )

root = Path(fc1.selected)
dst = Path(fc2.selected)

# The file list is read from the destination folder; its first column is used
# as the index of the resulting DataFrame.
filelist = pd.read_csv(dst / 'filelist.csv', index_col=0)
print(f"Number of files {len(filelist)}")
filelist.head()

## Processing

Files are processed in parallel. To do this, we first define a function that processes a single row of the file list.

In [None]:
def process_and_save_result(root, row):
    """Process the file described by one row of the file list and save the result.

    Parameters
    ----------
    root : path-like supporting ``/``
        Folder onto which ``row['path']`` is joined to locate the input file.
    row : pandas.Series
        One row of the file list. The ``path``, ``membrane``, ``dna`` and
        ``name`` entries are read; the row label (``row.name``) is formatted
        as a zero-padded six-digit integer to build the output file name.

    Returns
    -------
    str
        Always ``"ok"``.

    Notes
    -----
    The output file is written under the notebook-level ``dst`` folder, which
    must be defined before this function runs.
    """
    source = root / row['path']
    channels = [row['membrane'], row['dna']]
    outputs = dnasufo.process(source, channels=channels)

    # row.name is the row's index label, whereas row['name'] is the 'name' column.
    target = dst / f"{row.name:06d}.h5"
    dnasufo.save_result(target, row['name'], *outputs)
    return "ok"


Process the files in parallel and save the results in the destination folder. Note that the cell below only submits the first 5 rows of the file list.

In [None]:
from dask import delayed, compute
from dask.distributed import LocalCluster
# Start a local Dask cluster, obtain a client for it, and resize it to two workers.
cluster = LocalCluster()
client = cluster.get_client()
cluster.scale(2)

# Upper bound on the number of rows submitted; set to None to process every
# row of `filelist`.
MAX_FILES = 5
selected_rows = filelist if MAX_FILES is None else filelist.iloc[:MAX_FILES]

# Build one lazy task per row. Rows are fetched by explicit position instead of
# iterating over the `.iloc` indexer object itself, which relies on Python's
# legacy `__getitem__`-based iteration fallback.
tsk = [
    delayed(process_and_save_result)(root, selected_rows.iloc[pos])
    for pos in range(len(selected_rows))
]

# Run the tasks; the returned tuple is shown as the cell output.
compute(tsk)