```bibtex
@INPROCEEDINGS{szekely2019casting,
  author={Székely, Éva and Henter, Gustav Eje and Gustafson, Joakim},
  booktitle={Proc. ICASSP 2019}, 
  title={Casting to corpus: Segmenting and selecting spontaneous dialogue for {TTS} with a {CNN-LSTM} speaker-dependent breath detector}, 
  year={2019},
  pages={6925--6929},
  doi={10.1109/ICASSP.2019.8683846}
}
```

In [12]:
import numpy as np

In [2]:
import requests

# Download the reference helper module from the upstream `tmh` repository.
# NOTE(review): the URL tracks the moving `master` branch; pin it to a commit
# hash so the position-based line slice applied to `helpers.text` later in this
# notebook cannot silently shift.
helpers = requests.get(
    "https://raw.githubusercontent.com/BirgerMoell/tmh/master/tmh/breath_detection/support_scripts/helpers.py",
    timeout=30,  # requests has no default timeout; avoid hanging the kernel
)
# Fail loudly on 4xx/5xx instead of handing an HTTP error page to later cells.
helpers.raise_for_status()

In [9]:
# SECURITY(review): this executes source code that was downloaded at runtime;
# whoever controls the remote file controls what runs in this kernel.
# The slice [83:94] picks lines 84-94 of the downloaded text purely by position,
# so any upstream edit that shifts lines changes (or breaks) what is executed.
# Presumably those lines hold the `zcr_rate` definition that the following cell
# refers to - verify against the upstream file.
exec("\n".join(helpers.text.split("\n")[83:94]))

In [10]:
# Bind the existing `zcr_rate` object to the additional name `a`.
# NOTE(review): `zcr_rate` must already exist in the kernel when this cell runs;
# no `def zcr_rate` appears in the cells above this one, so it presumably comes
# from the preceding `exec` cell - confirm, and consider a descriptive name.
a = zcr_rate

In [13]:
def colorvec(inp, maxzcr=0.4,
             low_slow=np.array([0., 255., 255.]),
             low_fast=np.array([255., 255., 255.]),
             high_slow=np.array([0., 0., 0.]),
             high_fast=np.array([255., 0., 0.])):
    """Render a spectrogram as an RGB image tinted by zero-crossing rate.

    Parameters
    ----------
    inp : tuple
        ``(spec, zcr)`` where ``spec`` is a 2-D array (rows x frames) and
        ``zcr`` is a 1-D zero-crossing-rate track of any length.
    maxzcr : float
        ZCR value that maps to the "fast" end of the colour ramp; larger
        values are clipped.
    low_slow, low_fast, high_slow, high_fast : np.ndarray
        RGB triples (0-255). ``low_*`` is used where ``|spec| / 80`` is 0 and
        ``high_*`` where it is 1; ``*_slow`` / ``*_fast`` are the ZCR extremes.

    Returns
    -------
    np.ndarray
        Array of shape ``(rows, frames, 3)`` with values divided by 255.
    """
    spectrogram, zcr_track = inp
    n_frames = spectrogram.shape[1]

    # Resample the ZCR track so that every spectrogram column has one value.
    zcr_positions = np.linspace(0, n_frames, len(zcr_track))
    zcr_per_frame = np.interp(range(n_frames), zcr_positions, zcr_track)
    speed = np.clip(zcr_per_frame / maxzcr, 0, 1)

    # Magnitude scaled by 80 (presumably a dB range of [-80, 0] - verify).
    level = np.abs(spectrogram) / 80

    # Per-frame endpoint colours, each of shape (3, n_frames).
    low = low_slow[:, np.newaxis] + (low_fast - low_slow)[:, np.newaxis] * speed
    high = high_slow[:, np.newaxis] + (high_fast - high_slow)[:, np.newaxis] * speed

    # (1, frames, 3) + (rows, frames, 1) * (1, frames, 3) -> (rows, frames, 3)
    image = low.T[np.newaxis] + level[:, :, np.newaxis] * (high - low).T[np.newaxis]
    return image / 255

In [None]:
def zcr_rate(wav_in, step=240, sz=960):
    """Compute a windowed zero-crossing rate of a 1-D signal.

    A crossing is any change of ``np.sign(wav_in + 1e-8)`` between two
    consecutive samples. The result holds, for each window of ``sz``
    transitions taken every ``step`` samples, the fraction that are crossings.

    Parameters
    ----------
    wav_in : np.ndarray
        1-D waveform.
    step : int
        Hop between window starts, in samples.
    sz : int
        Window length, in transitions.

    Returns
    -------
    np.ndarray
        ``int((len(wav_in) - 1 - sz) / step)`` rates (empty if not positive).
    """
    signs = np.sign(wav_in + 1e-8)  # small offset applied before taking the sign
    crossings = np.minimum(np.abs(np.diff(signs)), 1)

    n_windows = int((len(crossings) - sz) / step)

    window_means = [
        np.mean(crossings[start:start + sz])
        for start in range(0, n_windows * step, step)
    ]
    return np.array(window_means)

In [None]:
import torch
import torchaudio
import triton
import triton.language as tl

@triton.jit
def zcr_kernel(
    wav_ptr, zrate_ptr,
    step, sz, n_elements,
    BLOCK_SIZE: tl.constexpr
):
    """Windowed zero-crossing rate; one lane per analysis window.

    Each program handles ``BLOCK_SIZE`` consecutive windows. Window ``w`` starts
    at sample ``w * step`` and averages ``sz`` sign transitions (it reads
    ``sz + 1`` samples), matching the NumPy ``zcr_rate`` in this notebook.

    wav_ptr    -- pointer to the 1-D waveform
    zrate_ptr  -- pointer to the output, one value per window
    step, sz   -- hop and window length, in samples / transitions
    n_elements -- number of samples behind ``wav_ptr``
    """
    pid = tl.program_id(0)
    window = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)

    # Same window count as the NumPy reference: (len(diff) - sz) // step.
    n_windows = (n_elements - 1 - sz) // step
    active = window < n_windows
    first = window * step

    # Three-valued sign built from comparisons (mirrors np.sign(x + 1e-8)).
    x0 = tl.load(wav_ptr + first, mask=active, other=0.0) + 1e-8
    prev_sign = tl.where(x0 > 0, 1.0, tl.where(x0 < 0, -1.0, 0.0))

    crossings = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
    # j runs to sz inclusive so that exactly sz transitions are counted.
    for j in range(1, sz + 1):
        x = tl.load(wav_ptr + first + j, mask=active, other=0.0) + 1e-8
        curr_sign = tl.where(x > 0, 1.0, tl.where(x < 0, -1.0, 0.0))
        # Any sign change counts once, like min(|diff(sign)|, 1).
        crossings += tl.where(curr_sign != prev_sign, 1.0, 0.0)
        prev_sign = curr_sign

    tl.store(zrate_ptr + window, crossings / sz, mask=active)

def zcr_rate_gpu(wav_in, step=240, sz=960):
    """Compute the windowed zero-crossing rate of a 1-D tensor with Triton.

    Parameters
    ----------
    wav_in : torch.Tensor
        1-D waveform; must live on a device Triton can launch on.
    step : int
        Hop between window starts, in samples.
    sz : int
        Window length, in sign transitions.

    Returns
    -------
    torch.Tensor
        float32 tensor on ``wav_in.device`` with
        ``max((len(wav_in) - 1 - sz) // step, 0)`` entries.
    """
    wav_in = wav_in.contiguous()  # the kernel indexes the buffer with unit stride
    n_elements = wav_in.shape[0]
    steps = max((n_elements - 1 - sz) // step, 0)
    zrate = torch.zeros(steps, dtype=torch.float32, device=wav_in.device)
    if steps == 0:
        # Nothing to compute; also avoids launching an empty grid.
        return zrate

    # Size the grid by the number of windows, not by the number of samples.
    grid = lambda meta: (triton.cdiv(steps, meta['BLOCK_SIZE']),)
    zcr_kernel[grid](wav_in, zrate, step, sz, n_elements, BLOCK_SIZE=128)

    return zrate

# Load input wav and convert to tensor
y, sr = torchaudio.load(input_root + input_file)
wav_out = y[0].cuda()  # Assuming mono audio
# Split into non-overlapping two-second clips; the trailing remainder is dropped.
samples = len(wav_out) // (2 * sr)
wav_in = wav_out[:2*sr*samples].reshape(samples, 2*sr)

# Create mel-spectrogram
# power=1 plus the Slaney mel scale/normalisation mirror the librosa call in
# `create_melspec` (librosa's defaults), so both paths yield comparable features.
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=sr,
    n_fft=sr // 50,
    hop_length=(sr // 50) // 8,
    n_mels=128,
    power=1.0,
    norm="slaney",
    mel_scale="slaney",
).cuda()

# The transform expects (..., time): feed (samples, 2*sr) directly so that the
# result is (samples, n_mels, frames). Transposing first would treat every
# sample index as its own signal.
mel_amp = mel_spec(wav_in)

# Amplitude -> dB relative to each clip's maximum, floored at -80 dB. This is
# what `librosa.amplitude_to_db(S, ref=np.max)` does in `create_melspec`, and
# what the `abs(spec) / 80` scaling in the colouring step relies on.
mel_db = 20.0 * torch.log10(mel_amp.clamp(min=1e-5))
mel_db = mel_db - mel_db.amax(dim=(-2, -1), keepdim=True)
melspecs = mel_db.clamp(min=-80.0)

# Calculate zero crossing rate
zrates = torch.stack([zcr_rate_gpu(wav) for wav in wav_in])

# Create zcr-colored melspectrograms
def colorvec2_gpu(inp, maxzcr=0.4, low_slow=torch.tensor([0., 255., 255.]),
                  low_fast=torch.tensor([255., 255., 255.]),
                  high_slow=torch.tensor([0., 0., 0.]),
                  high_fast=torch.tensor([255., 0., 0.])):
    """Torch version of the ZCR-tinted spectrogram colouring.

    Parameters
    ----------
    inp : tuple
        ``(spec, zcr)``: ``spec`` is a 2-D float tensor (rows x frames) and
        ``zcr`` a 1-D float tensor of any length.
    maxzcr : float
        ZCR value mapped to the "fast" end of the ramp; larger values clip.
    low_slow, low_fast, high_slow, high_fast : torch.Tensor
        RGB triples (0-255). ``low_*`` applies where ``|spec| / 80`` is 0,
        ``high_*`` where it is 1.

    Returns
    -------
    torch.Tensor
        ``(rows, frames, 3)`` tensor on ``spec.device``, values divided by 255.
    """
    spec, zcr = inp
    device = spec.device

    # The default colour tensors are created on the CPU when the function is
    # defined; mixing them with CUDA inputs raises a device-mismatch error.
    low_slow, low_fast, high_slow, high_fast = (
        colour.to(device) for colour in (low_slow, low_fast, high_slow, high_fast)
    )
    zcr = zcr.to(device)

    # Resample the ZCR track to one value per spectrogram column.
    zcr2 = torch.nn.functional.interpolate(
        zcr[None, None, :], size=spec.shape[1], mode='linear', align_corners=False
    )[0, 0]
    spec2 = spec.abs() / 80
    z = torch.clamp(zcr2 / maxzcr, 0, 1)

    # Per-frame endpoint colours, each of shape (3, frames).
    low = low_slow.unsqueeze(1) + (low_fast - low_slow).unsqueeze(1) * z
    high = high_slow.unsqueeze(1) + (high_fast - high_slow).unsqueeze(1) * z

    # (1, frames, 3) + (rows, frames, 1) * (1, frames, 3) -> (rows, frames, 3)
    outp = low.T.unsqueeze(0) + spec2.unsqueeze(-1) * (high - low).T.unsqueeze(0)
    return outp / 255

# Tint each mel-spectrogram with its zero-crossing-rate track, then gather the
# per-clip images into one float32 batch and hand it to NumPy on the host.
colspecs = [colorvec2_gpu(pair) for pair in zip(melspecs, zrates)]
x_complete = torch.stack(colspecs).float().cpu().numpy()

In [None]:
import numpy as np
import librosa

def create_melspec(wav_in, sr=None, n_fft=960, hop_length=120, n_mels=128):
    """Return a log-scaled mel spectrogram of ``wav_in`` as float32.

    Parameters
    ----------
    wav_in : np.ndarray
        1-D waveform.
    sr : int or None
        Sample rate. When ``None`` it is taken as ``min(48000, len(wav_in) // 2)``
        (presumably assuming a two-second clip - verify), and ``n_fft`` /
        ``hop_length`` are overridden with ``sr // 50`` and ``sr // 400``.
    n_fft, hop_length, n_mels : int
        Passed through to ``librosa.feature.melspectrogram``.

    Returns
    -------
    np.ndarray
        ``librosa.amplitude_to_db`` of the amplitude (``power=1``) mel
        spectrogram, referenced to its maximum, cast to ``np.float32``.
    """
    if sr is None:
        # No rate supplied: derive it, and the STFT geometry, from the clip length.
        sr = min(48000, len(wav_in) // 2)
        n_fft, hop_length = sr // 50, sr // 400

    mel_kwargs = dict(
        y=wav_in,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        power=1,
    )
    mel_amplitude = librosa.feature.melspectrogram(**mel_kwargs)
    return librosa.amplitude_to_db(mel_amplitude, ref=np.max).astype(np.float32)

def zcr_rate(wav_in, step=240, sz=960):
    """Windowed zero-crossing rate of a 1-D signal.

    Marks every change of ``np.sign(wav_in + 1e-8)`` between neighbouring
    samples, then averages those marks over windows of ``sz`` transitions
    spaced ``step`` apart. Returns ``(len(wav_in) - 1 - sz) // step`` values
    (an empty array when that count is not positive).
    """
    sign_jumps = np.abs(np.diff(np.sign(wav_in + 1e-8)))
    is_crossing = np.minimum(sign_jumps, 1)  # a jump of 2 still counts once
    n_windows = (len(is_crossing) - sz) // step

    rates = []
    for w in range(n_windows):
        start = w * step
        rates.append(np.mean(is_crossing[start:start + sz]))
    return np.array(rates)

def colorvec2(inp, maxzcr=0.4, low_slow=np.array([0., 255., 255.]),
              low_fast=np.array([255., 255., 255.]),
              high_slow=np.array([0., 0., 0.]),
              high_fast=np.array([255., 0., 0.])):
    """Colour a spectrogram by magnitude and zero-crossing rate.

    ``inp`` is a ``(spec, zcr)`` pair: a 2-D array (rows x frames) and a 1-D
    ZCR track. The track is linearly resampled to one value per frame, scaled
    by ``maxzcr`` and clipped to [0, 1]; ``|spec| / 80`` then blends between the
    ``low_*`` and ``high_*`` RGB triples for that frame.

    Returns an array of shape ``(rows, frames, 3)`` with values divided by 255.
    """
    spectrogram, zcr_track = inp
    n_cols = spectrogram.shape[1]

    # One ZCR value per spectrogram column.
    zcr_per_col = np.interp(np.arange(n_cols),
                            np.linspace(0, n_cols, len(zcr_track)),
                            zcr_track)
    speed = np.clip(zcr_per_col / maxzcr, 0, 1)
    level = np.abs(spectrogram) / 80

    # Endpoint colours per column, each of shape (3, n_cols).
    low = low_slow[:, np.newaxis] + (low_fast - low_slow)[:, np.newaxis] * speed
    high = high_slow[:, np.newaxis] + (high_fast - high_slow)[:, np.newaxis] * speed

    # Blend each RGB channel separately, then stack them on the last axis.
    channels = [low[c] + level * (high[c] - low[c]) for c in range(3)]
    return np.stack(channels, axis=-1) / 255

In [None]:
import numpy as np
from multiprocessing import Pool
from codes.helpers import load_wav, create_melspec, zcr_rate, colorvec2

# Load input wav and split into two second samples
y = load_wav(input_root + input_file, sr=sr)
wav_out = np.asarray(y[1])
samples = len(wav_out) // (2 * sr)
wav_in = np.reshape(wav_out[:(2*sr*samples)], (samples, 2*sr))

ins = [wav_in[r, :] for r in range(samples)]

# The context manager tears the worker processes down even when a task raises;
# map/starmap block until all results are in, so nothing is cut short.
with Pool() as pool:
    # Create mel-spectrogram and calculate zero crossing rate
    melspecs = pool.starmap(create_melspec, [(wav, sr) for wav in ins])
    zrates = pool.map(zcr_rate, ins)

    # Create zcr-colored melspectrograms
    # `colorvec2` (as defined in this notebook) takes ONE `(spec, zcr)` tuple,
    # so use `map`; `starmap` would unpack the pair into two positional
    # arguments and pass the ZCR track as `maxzcr`.
    colspecs = pool.map(colorvec2, zip(melspecs, zrates))

# Convert to numpy array if needed
x_complete = np.array(colspecs)

In [None]:
import torch
import torch.nn as nn

class BreathAnalysisModel(nn.Module):
    """Per-frame CNN followed by a bidirectional LSTM and a linear classifier.

    Input:  ``(batch, timesteps, 3, img_rows, img_cols)``
    Output: ``(batch, timesteps, num_classes)`` - one score vector per timestep.

    Parameters
    ----------
    timesteps : int
        Nominal sequence length; stored on the instance. ``forward`` reads the
        actual length from its input.
    img_rows, img_cols : int
        Spatial size of each frame; used to size the LSTM input.
    num_classes : int
        Width of the output layer.
    """

    def __init__(self, timesteps, img_rows, img_cols, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(kernel_size=(5, 4))
        self.conv2 = nn.Conv2d(16, 8, kernel_size=(4, 1), stride=(4, 1))
        self.bn2 = nn.BatchNorm2d(8)
        self.pool2 = nn.MaxPool2d(kernel_size=(6, 5))
        self.lstm = nn.LSTM(input_size=self.calculate_lstm_input_size(img_rows, img_cols),
                            hidden_size=8, bidirectional=True, batch_first=True)
        # 16 = 2 directions x hidden_size 8
        self.fc = nn.Linear(16, num_classes)
        self.timesteps = timesteps

    def calculate_lstm_input_size(self, img_rows, img_cols):
        """Return the flattened CNN feature count for one frame of the given size.

        Only the shape-changing layers are applied: BatchNorm and ReLU keep the
        shape, and running a dummy batch through train-mode BatchNorm would
        modify its running statistics. A zero tensor is used so the global RNG
        state is left untouched.
        """
        with torch.no_grad():
            probe = torch.zeros(1, self.conv1.in_channels, img_rows, img_cols)
            probe = self.pool1(self.conv1(probe))
            probe = self.pool2(self.conv2(probe))
        return probe[0].numel()

    def forward(self, x):
        """Classify every timestep of ``x``.

        ``x`` has shape ``(batch, timesteps, channels, height, width)``; the
        result has shape ``(batch, timesteps, num_classes)``.
        """
        batch_size, timesteps, channels, height, width = x.shape

        # Fold time into the batch axis so the CNN sees individual frames.
        # reshape (not view) also accepts non-contiguous inputs.
        frames = x.reshape(batch_size * timesteps, channels, height, width)

        frames = self.pool1(torch.relu(self.bn1(self.conv1(frames))))
        frames = self.pool2(torch.relu(self.bn2(self.conv2(frames))))

        # Unfold back to sequences of flattened per-frame features.
        sequence = frames.reshape(batch_size, timesteps, -1)
        sequence, _ = self.lstm(sequence)
        return self.fc(sequence)

In [None]:
import h5py
import torch
import numpy as np
import os

def _keras_param_to_torch(layer_name, path, param_name, weight):
    """Translate one Keras weight array into PyTorch state-dict entries.

    layer_name -- key prefix (the Keras layer's group name)
    path       -- lower-cased full HDF5 path, used to detect the layer type
    param_name -- dataset name, e.g. ``kernel:0``
    weight     -- the dataset contents as a NumPy array

    Returns a dict with one entry, or two for an LSTM bias.
    """
    weight = np.asarray(weight)
    param = param_name.split(':')[0]  # drop the TensorFlow ':0' suffix

    def as_tensor(array):
        # Transposed arrays are non-contiguous views; copy before wrapping.
        return torch.tensor(np.ascontiguousarray(array), dtype=torch.float32)

    if 'lstm' in path:
        # Weights of the backward half of a Bidirectional wrapper get
        # PyTorch's '_reverse' suffix.
        suffix = '_reverse' if 'backward' in path else ''
        # 'recurrent_kernel' must be tested before 'kernel' (substring match).
        if 'recurrent_kernel' in param:
            # Keras (units, 4*units) -> PyTorch (4*hidden, hidden)
            return {f'{layer_name}.weight_hh_l0{suffix}': as_tensor(weight.T)}
        if 'kernel' in param:
            # Keras (input_dim, 4*units) -> PyTorch (4*hidden, input)
            return {f'{layer_name}.weight_ih_l0{suffix}': as_tensor(weight.T)}
        if 'bias' in param:
            # Keras keeps one bias; PyTorch adds bias_ih and bias_hh together,
            # so the second one must be zero or the bias is applied twice.
            bias = as_tensor(weight)
            return {
                f'{layer_name}.bias_ih_l0{suffix}': bias,
                f'{layer_name}.bias_hh_l0{suffix}': torch.zeros_like(bias),
            }
        return {f'{layer_name}.{param}': as_tensor(weight)}

    if 'kernel' in param:
        if 'conv2d' in path and weight.ndim == 4:
            # Keras (kh, kw, in, out) -> PyTorch (out, in, kh, kw)
            weight = np.transpose(weight, (3, 2, 0, 1))
        elif 'dense' in path:
            # Keras (in, out) -> PyTorch (out, in)
            weight = np.transpose(weight)
        torch_name = 'weight'
    elif 'gamma' in param:
        torch_name = 'weight'
    elif 'beta' in param:
        torch_name = 'bias'
    elif 'moving_mean' in param:
        torch_name = 'running_mean'
    elif 'moving_variance' in param:
        torch_name = 'running_var'
    else:
        torch_name = param

    return {f'{layer_name}.{torch_name}': as_tensor(weight)}


def convert_keras_to_pytorch(h5_path, output_path):
    """Convert the weights in a Keras HDF5 file to a PyTorch checkpoint.

    Every dataset below a layer group is visited, however deeply it is nested
    (a top-level ``model_weights`` group is looked through and
    ``optimizer_weights`` is skipped). Conv2D and Dense kernels are transposed
    to PyTorch layout, BatchNorm parameters are renamed, and LSTM weights are
    mapped to ``weight_ih_l0`` / ``weight_hh_l0`` / ``bias_*_l0``.

    Parameters
    ----------
    h5_path : str
        Path of the Keras ``.h5`` file to read.
    output_path : str
        Path the state dict is written to with ``torch.save``.

    Notes
    -----
    Keys are prefixed with the *Keras* layer names; rename them to the target
    module's attribute names before calling ``load_state_dict``.
    NOTE(review): if the Keras model flattens channels-last feature maps before
    the LSTM, the columns of ``weight_ih_l0`` may need re-ordering to match a
    channels-first flatten - verify against the source model.
    """
    # Dictionary that becomes the PyTorch state dict
    state_dict = {}

    def _collect(path, node):
        # Groups are only containers; the weights are the datasets inside them.
        if not isinstance(node, h5py.Dataset):
            return None
        parts = path.split('/')
        if parts[0] == 'optimizer_weights':
            return None
        if parts[0] == 'model_weights' and len(parts) > 1:
            parts = parts[1:]
        if len(parts) < 2:
            # Dataset sitting directly at the root: not attached to a layer.
            return None
        state_dict.update(
            _keras_param_to_torch(parts[0], path.lower(), parts[-1], node[()])
        )
        return None  # returning None keeps the traversal going

    # Load the Keras model weights
    with h5py.File(h5_path, 'r') as f:
        f.visititems(_collect)

    # Save the state dict as a PyTorch checkpoint
    torch.save(state_dict, output_path)
    print(f"PyTorch checkpoint saved to {output_path}")

# Convert the saved Keras weights; the checkpoint is written next to the
# .h5 file, with the extension swapped for .pth.
h5_path = output_root + model_name + '.h5'
output_path = os.path.splitext(h5_path)[0] + '.pth'
convert_keras_to_pytorch(h5_path=h5_path, output_path=output_path)