In [1]:
import sys
from pathlib import Path

# put the parent of the current working directory on the import path (only once)
module_path = str(Path.cwd().parents[0])
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
sys.path, module_path

(['/nfs/projects5/basecalling-jorge/basecalling/notebooks',
  '/home/javila/micromamba/envs/basecalling-cuda117/lib/python310.zip',
  '/home/javila/micromamba/envs/basecalling-cuda117/lib/python3.10',
  '/home/javila/micromamba/envs/basecalling-cuda117/lib/python3.10/lib-dynload',
  '',
  '/home/javila/micromamba/envs/basecalling-cuda117/lib/python3.10/site-packages',
  '/nfs/projects5/basecalling-jorge/basecalling'],
 '/nfs/projects5/basecalling-jorge/basecalling')

___

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader # load batches to the network

from ont_fast5_api.fast5_interface import get_fast5_file
from feito.dataloaders import DatasetBasecalling


In [3]:
# dataset built from two fast5 files; the index is saved to path_save_index
dataset_basecalling = DatasetBasecalling(
    [
        "/projects5/basecalling-jorge/basecalling/data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5",
        "/projects5/basecalling-jorge/basecalling/data/RODAN/test/mouse-dataset/0/0a8787dc-a4b9-45da-b4e0-8711ec36897e.fast5",
    ],
    path_save_index="../output/basecalling/index.csv",
)



In [4]:
dataset_basecalling.index

[Index(index=0, path_fast5='/projects5/basecalling-jorge/basecalling/data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5', read_id='0a0bf68b-3b64-4fc6-ba34-d853db589f4b', subsignal_id=0, start=0, end=4095),
 Index(index=1, path_fast5='/projects5/basecalling-jorge/basecalling/data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5', read_id='0a0bf68b-3b64-4fc6-ba34-d853db589f4b', subsignal_id=1, start=4096, end=8191),
 Index(index=2, path_fast5='/projects5/basecalling-jorge/basecalling/data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5', read_id='0a0bf68b-3b64-4fc6-ba34-d853db589f4b', subsignal_id=2, start=8192, end=12287),
 Index(index=3, path_fast5='/projects5/basecalling-jorge/basecalling/data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5', read_id='0a0bf68b-3b64-4fc6-ba34-d853db589f4b', subsignal_id=3, start=12288, end=16383),
 Index(index=4, path_fast5='/projects5/basecalling-jorge/basecalling

In [32]:
# batches are drawn in index order (no shuffling)
BATCH_SIZE = 5
dataloader_basecalling = DataLoader(
    dataset=dataset_basecalling,
    batch_size=BATCH_SIZE,
    shuffle=False,
)


In [35]:
iter_dl = iter(dataloader_basecalling)

In [40]:
batch = next(iter_dl)

In [41]:
batch.shape

torch.Size([2, 4096])

In [7]:
def print_all_raw_data(fast5_filepath: str = "../data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5"):
    """Print the id, raw data and number of samples of every read in a fast5 file.

    Args:
        fast5_filepath (str, optional): path to the fast5 file to inspect.
            This can be a single- or multi-read file. Defaults to the example
            read of the RODAN mouse test dataset used in this notebook.
    """
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read in f5.get_reads():
            raw_data = read.get_raw_data()
            # one line per read: read id, raw signal, number of samples
            print(read.read_id, raw_data, len(raw_data))

In [8]:
print_all_raw_data()

0a0bf68b-3b64-4fc6-ba34-d853db589f4b [554 547 571 ... 559 567 556] 17611


In [9]:
path_fast5 = "../data/RODAN/test/mouse-dataset/0/0a0bf68b-3b64-4fc6-ba34-d853db589f4b.fast5" # This can be a single- or multi-read file

def load_signal(path_fast5: str):
    """Load a raw signal from a fast5 file.

    Every read in the file is visited and the values of the LAST one are
    returned, so for a single-read file this is simply that read.

    Args:
        path_fast5 (str): path to the fast5 file.

    Returns:
        tuple: ``(raw_signal, read_id, len_signal)`` with the raw data of the
        read, its id and the number of samples in the raw data.

    Raises:
        ValueError: if the file contains no reads.
    """
    raw_signal = None
    read_id = None
    n_reads = 0

    with get_fast5_file(path_fast5, mode="r") as f5:
        for read in f5.get_reads():
            # extract while the file is still open; later reads overwrite earlier ones
            raw_signal = read.get_raw_data()
            read_id = read.read_id
            n_reads += 1

    # without this guard an empty file would fail with an unbound-variable error
    if n_reads == 0:
        raise ValueError(f"No reads found in fast5 file: {path_fast5}")

    return raw_signal, read_id, len(raw_signal)

In [10]:
raw_signal, read_id, len_signal = load_signal(path_fast5)

In [11]:
raw_signal

array([554, 547, 571, ..., 559, 567, 556], dtype=int16)

In [12]:
read_id

'0a0bf68b-3b64-4fc6-ba34-d853db589f4b'

In [13]:
len_signal

17611

In [15]:
raw_signal[:100]


array([554, 547, 571, 561, 570, 559, 536, 510, 512, 518, 516, 515, 507,
       507, 512, 519, 520, 519, 510, 511, 508, 522, 500, 519, 519, 511,
       522, 525, 512, 519, 514, 512, 515, 510, 522, 490, 502, 512, 521,
       480, 719, 846, 848, 826, 839, 821, 844, 866, 857, 856, 862, 871,
       873, 883, 875, 884, 866, 856, 880, 876, 881, 880, 879, 856, 858,
       835, 857, 876, 888, 887, 878, 877, 873, 870, 871, 836, 834, 835,
       845, 848, 836, 838, 897, 818, 834, 810, 820, 802, 836, 827, 819,
       826, 823, 828, 828, 838, 827, 841, 870, 873], dtype=int16)

In [19]:
raw_signal[:100].reshape((10,-1))

array([[554, 547, 571, 561, 570, 559, 536, 510, 512, 518],
       [516, 515, 507, 507, 512, 519, 520, 519, 510, 511],
       [508, 522, 500, 519, 519, 511, 522, 525, 512, 519],
       [514, 512, 515, 510, 522, 490, 502, 512, 521, 480],
       [719, 846, 848, 826, 839, 821, 844, 866, 857, 856],
       [862, 871, 873, 883, 875, 884, 866, 856, 880, 876],
       [881, 880, 879, 856, 858, 835, 857, 876, 888, 887],
       [878, 877, 873, 870, 871, 836, 834, 835, 845, 848],
       [836, 838, 897, 818, 834, 810, 820, 802, 836, 827],
       [819, 826, 823, 828, 828, 838, 827, 841, 870, 873]], dtype=int16)

In [54]:
import numpy as np
from typing import Optional

def split_raw_signal(signal: np.ndarray, len_subsignals: int = 4096, left_trim: int= 0, right_trim: int = 0, len_overlap: int = 0):
    """Return an array with non-overlapping subsignals of the same length.

    First the signal is trimmed: ``left_trim`` samples are dropped from the
    start and ``right_trim`` samples from the end. The remainder is then padded
    with 0 at the end, only as much as needed for its length to be a multiple
    of ``len_subsignals``, and reshaped to one subsignal per row.

    Args:
        signal (np.ndarray): input raw read (expected to be 1-D).
        len_subsignals (int, optional): fixed length of the subsignals, it must
            match the input length of the basecaller. Defaults to 4096.
        left_trim (int, optional): number of samples removed from the start of
            the signal; values <= 0 mean no trimming. Defaults to 0.
        right_trim (int, optional): number of samples removed from the end of
            the signal; values <= 0 mean no trimming. Defaults to 0.
        len_overlap (int, optional): currently unused, the subsignals never
            overlap. Defaults to 0.

    Returns:
        np.ndarray: array of shape ``(n_subsignals, len_subsignals)``. No extra
        all-zero row is added when the trimmed length is already a multiple of
        ``len_subsignals``; ``n_subsignals`` is 0 when nothing is left after
        trimming.

    Raises:
        ValueError: if ``len_subsignals`` is not a positive integer.
    """
    if len_subsignals <= 0:
        raise ValueError(f"len_subsignals must be a positive integer, got {len_subsignals}")

    len_signal = len(signal)

    # trim signal: clamp both bounds so that oversized trims give an empty
    # signal instead of wrapping around through negative slice indices
    start = min(max(left_trim, 0), len_signal)
    end   = max(len_signal - max(right_trim, 0), start)
    trimmed_signal = signal[start:end].copy()

    # pad signal at the end with zeros to make the length divisible by len_subsignals
    # (the modulo of the negated length is 0 when it is already divisible)
    len_padd = -len(trimmed_signal) % len_subsignals
    trimmed_signal = np.pad(trimmed_signal, (0,len_padd), 'constant', constant_values=(0,0))

    # reshape trimmed signal: one subsignal per row
    return trimmed_signal.reshape((-1,len_subsignals))

In [55]:
# split the loaded read using the default arguments of split_raw_signal
s = split_raw_signal(raw_signal)
s.shape

(5, 4096)

In [None]:
def preprocessing(signal):
    """Apply some preprocessing to the signal.

    Placeholder: nothing is implemented yet, so the signal is left untouched
    and the function returns None.
    """