In [None]:
!pip install openai-whisper git+https://github.com/sooftware/conformer.git PyYAML gdown gradio -q
import torch
# Check that we have a GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [2]:
%%bash
# Create cfg.yaml with model parameters (adapted from the official repo)
cat > cfg.yaml << 'CFG'
# Data and model config
device: 'cuda:0'        # computation device
sampling_rate: 8000     # audio sampling rate
win_len: 256            # STFT window length (25ms)
hop: 80                 # STFT hop length (10ms)
lowfreq: 50.0           # mel filterbank low freq cutoff
highfreq: 2500.0        # mel filterbank high freq cutoff
max_record_time: 16     # max duration of each recording (s)
max_event_time: 3       # max duration of each respiratory event (s)
# Model hyperparameters
whisper_seq: 1500
whiper_dim: 384
encoder_dim: 256
num_encoder_layers: 16
num_attention_heads: 4
rnn_hid_dim: 512
rnn_layers: 2
bidirect: true
n_fc_layers: 2
fc_layer_dim: 1024
output_dim: 15
input_dropout: 0.1
feed_forward_dropout: 0.1
attention_dropout: 0.1
conv_dropout: 0.1
rtb_data_channels: 1
CFG

%%bash
# Create class-id.txt mapping 15 classes (Name|ID)
cat > class-id.txt << 'CLASSIDS'
Healthy|0
Bronchiectasis|1
Bronchiolitis|2
COPD|3
Asthma|4
LRTI|5
Pneumonia|6
URTI|7
Bronchitis|8
Lung Fibrosis|9
Asthma & Lung Fibrosis|10
Heart Failure & Lung Fibrosis|11
Heart Failure|12
Heart Failure & COPD|13
Pleural Effusion|14
CLASSIDS

%%bash
# Create cfg_parse.py to load the YAML config
cat > cfg_parse.py << 'PYCODE'
import yaml
cfg = yaml.safe_load(open('cfg.yaml'))
PYCODE

bash: line 31: fg: no job control
bash: line 51: fg: no job control


In [8]:
# Import the config and define model architecture classes
import math
import torch
import torch.nn as nn
from cfg_parse import cfg  # load the cfg dictionary from YAML
from conformer import Conformer

# Depthwise Separable Conv2D layer used in ReneTrialBlock
class DSConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size):
        super(DSConv2d, self).__init__()
        self.depth_conv = nn.Conv2d(
            in_channels=in_channels, out_channels=in_channels,
            kernel_size=(kernel_size, kernel_size),
            padding=(kernel_size // 2, kernel_size // 2), groups=in_channels
        )
        self.pointwise_conv = nn.Conv2d(
            in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1)
        )
    def forward(self, x):
        out = self.depth_conv(x)
        out = self.pointwise_conv(out)
        return out

# ReneTrialBlock: the final convolutional block that produces class logits
class ReneTrialBlock(nn.Module):
    def __init__(self, cfg, in_channels):
        super(ReneTrialBlock, self).__init__()
        self.cfg = cfg
        # Left convolution flow
        self.left_flow = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=(1,1)),
            nn.BatchNorm2d(in_channels),
            nn.GELU(),
            DSConv2d(in_channels, in_channels, kernel_size=3),
            nn.BatchNorm2d(in_channels),
            nn.GELU(),
            nn.Conv2d(in_channels, in_channels, kernel_size=(5,5), padding=(5//2, 5//2))
        )
        # Right convolution flow (mirror of left_flow with reversed conv order)
        self.right_flow = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=(5,5), padding=(5//2, 5//2)),
            nn.BatchNorm2d(in_channels),
            nn.GELU(),
            DSConv2d(in_channels, in_channels, kernel_size=3),
            nn.BatchNorm2d(in_channels),
            nn.GELU(),
            nn.Conv2d(in_channels, in_channels, kernel_size=(1,1))
        )
        # Final linear layer: maps concatenated features to output classes
        self.layer = nn.Linear(cfg['rnn_hid_dim'] * 2, cfg['output_dim'])
    def forward(self, input_data):
        # input_data: [batch, channels*feature_map] as a flat vector
        # Reshape to 2D feature maps (assume square)
        feature_size = int(math.sqrt(cfg['rnn_hid_dim'] * 2))
        x = input_data.reshape(input_data.size(0), cfg['rtb_data_channels'], feature_size, feature_size)
        # Convolution flows and residual
        out = self.left_flow(x) + self.right_flow(x) + x
        # Flatten and linear layer to class logits
        out = out.view(input_data.size(0), -1)
        return self.layer(out)

# Main RENE Model class
class Model(nn.Module):
    def __init__(self, cfg):
        super(Model, self).__init__()
        self.cfg = cfg
        # Conformer encoder (from the installed library)
        self.conformer = Conformer(
            num_classes=cfg['rnn_hid_dim'],      # output feature dim = rnn hidden dim
            input_dim=cfg['whiper_dim'],         # Whisper encoder feature dimension
            encoder_dim=cfg['encoder_dim'],
            num_encoder_layers=cfg['num_encoder_layers'],
            num_attention_heads=cfg['num_attention_heads'],
            input_dropout_p=cfg['input_dropout'],
            feed_forward_dropout_p=cfg['feed_forward_dropout'],
            attention_dropout_p=cfg['attention_dropout'],
            conv_dropout_p=cfg['conv_dropout']
        )
        # Bidirectional GRU
        self.gru = nn.GRU(
            input_size=cfg['rnn_hid_dim'], hidden_size=cfg['rnn_hid_dim'],
            num_layers=cfg['rnn_layers'], bidirectional=cfg['bidirect']
        )
        # ReneTrialBlock for final classification
        self.rene = ReneTrialBlock(cfg, in_channels=cfg['rtb_data_channels'])
    def forward(self, x, input_lengths):
        # x: [batch, time_frames, whisper_dim], input_lengths: length of each sequence
        encoder_out, output_lengths = self.conformer(x, input_lengths)  # [batch, T, rnn_hid_dim]
        # Transpose to shape [T, batch, features] for GRU
        encoder_out = encoder_out.permute(1, 0, 2)
        rnn_out, _ = self.gru(encoder_out)         # rnn_out: [T, batch, 2*rnn_hid_dim] (bi-GRU)
        last_timestep = rnn_out[-1]                # take the last time-step output of GRU for each batch
        logits = self.rene(last_timestep)    # [batch, output_dim] = class scores
        return logits

# Instantiate the model (we will load pretrained weights next)
model = Model(cfg)
print("Model instantiated with %d output classes." % model.cfg['output_dim'])

Model instantiated with 15 output classes.


In [4]:
# Download the RENE(S) pretrained checkpoint from Google Drive
import os
model_path = "Rene.pth"
if not os.path.exists(model_path):
    # Using gdown with the shared file ID
    !gdown --id 1NcGPIURY4mWtRr_KkwHAodssOexN-PbC -O Rene.pth
else:
    print("Model checkpoint already downloaded.")

Downloading...
From (original): https://drive.google.com/uc?id=1NcGPIURY4mWtRr_KkwHAodssOexN-PbC
From (redirected): https://drive.google.com/uc?id=1NcGPIURY4mWtRr_KkwHAodssOexN-PbC&confirm=t&uuid=25809376-b14b-4bd7-801d-80c11fd13a9c
To: /content/Rene.pth
100% 648M/648M [00:07<00:00, 88.5MB/s]


In [10]:
# Load the pretrained weights into the model
checkpoint = torch.load("Rene.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'], strict=False)  # <--- FIXED HERE
model.to(device).eval()
print("Pretrained RENE model loaded.")

Pretrained RENE model loaded.


In [11]:
# Download a sample lung sound WAV (from SPRSound open dataset)
sample_url = "https://raw.githubusercontent.com/SJTU-YONGFU-RESEARCH-GRP/SPRSound/main/example/65097128_5.6_1_p1_2242.wav"
!wget -q -O sample.wav $sample_url

In [13]:
import numpy as np
import whisper

# Load Whisper tiny model for feature extraction
whisper_model = whisper.load_model("tiny").to(device)
whisper_model.eval()

# Load and preprocess the audio
audio = whisper.load_audio("sample.wav")  # returns NumPy array in float32
# Whisper expects 16 kHz audio and pads/clips to 30 sec. We'll pad/trim to 16 sec (target of RENE)
MAX_SEC = 30
audio = whisper.pad_or_trim(audio, length=MAX_SEC * whisper.audio.SAMPLE_RATE)
mel = whisper.log_mel_spectrogram(audio).to(device)

# Use Whisper encoder to get audio features
with torch.no_grad():
    encoder_out = whisper_model.encoder(mel.unsqueeze(0).to(device))  # shape [1, n_frames, 384]
# Determine actual length in frames (to inform Conformer)
n_frames_total = encoder_out.shape[1]  # typically 1500 for 15s of audio after Whisper padding
# Estimate the number of frames corresponding to real (non-padded) audio content
orig_len_samples = min(len(audio), MAX_SEC * whisper.audio.SAMPLE_RATE)
orig_frames = math.floor(orig_len_samples / 160)  # 160-sample hop = 10ms frame step
input_length = torch.LongTensor([orig_frames // 2])  # //2 because Whisper encoder downsamples by 2x in time

# Run the RENE model to get class logits
encoder_out = encoder_out.to(device)
with torch.no_grad():
    logits = model(encoder_out, input_length.to(device))
    probs = torch.softmax(logits, dim=1)[0]  # probabilities for each of the 15 classes

# Load class names and print results
classes = [line.split('|')[0] for line in open('class-id.txt').read().splitlines()]
top_idx = int(torch.argmax(probs))
top_class = classes[top_idx]
top_conf = probs[top_idx].item()

print(f"Top predicted class: **{top_class}** ({top_conf*100:.1f}% confidence)")
print("\nClass probabilities:")
ranked = sorted(zip(classes, probs.cpu().numpy()), key=lambda x: x[1], reverse=True)
for cls, p in ranked:
    print(f"  {cls:25s}: {p*100:.2f}%")

Top predicted class: **Healthy** (14.9% confidence)

Class probabilities:
  Healthy                  : 14.90%
  URTI                     : 14.63%
  Asthma & Lung Fibrosis   : 10.61%
  Pneumonia                : 10.19%
  Heart Failure            : 6.68%
  Heart Failure & Lung Fibrosis: 5.62%
  Bronchiolitis            : 5.52%
  Pleural Effusion         : 5.16%
  Bronchitis               : 5.09%
  Lung Fibrosis            : 4.89%
  LRTI                     : 3.94%
  Asthma                   : 3.70%
  Heart Failure & COPD     : 3.38%
  COPD                     : 2.95%
  Bronchiectasis           : 2.73%


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [15]:
import gradio as gr

# Define the prediction function for Gradio
def classify_respiratory_sound(audio_file):
    # Load audio from the uploaded file
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio, length=MAX_SEC * whisper.audio.SAMPLE_RATE)
    mel = whisper.log_mel_spectrogram(audio).to(device)
    with torch.no_grad():
        enc_out = whisper_model.encoder(mel.unsqueeze(0).to(device))
    # Calculate original length in frames for masking
    orig_len = min(len(audio), MAX_SEC * whisper.audio.SAMPLE_RATE)
    orig_frames = math.floor(orig_len / 160)
    inp_len = torch.LongTensor([orig_frames // 2])
    with torch.no_grad():
        logits = model(enc_out.to(device), inp_len.to(device))
        probs = torch.softmax(logits, dim=1)[0].cpu().numpy()
    # Prepare outputs
    top_idx = int(probs.argmax())
    top_label = classes[top_idx]
    # Build dict of class confidences
    confidences = {cls: float(probs[i]) for i, cls in enumerate(classes)}
    return top_label, confidences

# Create Gradio interface
interface = gr.Interface(
    fn=classify_respiratory_sound,
    inputs=gr.Audio(type="filepath", label="Upload Lung Sound (.wav)"),
    outputs=[
        gr.Textbox(label="Top Predicted Disease"),
        gr.Label(num_top_classes=15, label="All Class Probabilities")
    ],
    title="RENE Respiratory Disease Classifier",
    description="Upload a lung sound recording (.wav) to get the predicted respiratory condition and confidence scores for all 15 classes."
)

# Launch the Gradio app (in Colab, this will display an inline interface or a shareable link)
interface.launch(debug=False, share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5e6051db410d4ddf9e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


