## GenAI Assignment 1
### Github https://github.com/itqan-abdullah/Audio-Assignment
## Overfitting a Model to Remove Piano Background Sound

In [2]:
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Installing and Loading Libraries

In [3]:
# Install all third-party dependencies in a single call. `%pip` (rather than
# `!pip`) guarantees the packages are installed into the environment that the
# running kernel actually imports from.
%pip install -q audiomentations librosa torch torchvision opencv-python tqdm

In [4]:
import soundfile as sf
from audiomentations import Compose, AddGaussianNoise, AddBackgroundNoise
import numpy as np
import librosa


import copy
import os
import random
import shutil
import zipfile
from math import atan2, cos, sin, sqrt, pi, log

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from numpy import linalg as LA
from torch import optim, nn
from torch.utils.data import DataLoader, random_split
from torch.utils.data.dataset import Dataset
from torchvision import transforms
from tqdm import tqdm
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [5]:
# Work from the project folder on Drive so the relative audio file names used in this notebook resolve.
%cd "/content/drive/MyDrive/Audio-Assignment-main"

/content/drive/MyDrive/Audio-Assignment-main


#### An audio loader function that ensures a 16000 Hz sampling rate

In [7]:
def load_audio(filename, target_samplerate = 16000):
    """Read an audio file and return its samples at ``target_samplerate``.

    Args:
        filename: Path of the audio file; passed straight to ``sf.read``.
        target_samplerate: Sampling rate (Hz) the returned samples should have.

    Returns:
        A ``(data, target_samplerate)`` tuple. ``data`` keeps the layout that
        ``sf.read`` produced (time along axis 0) and is resampled only when the
        file's own rate differs from ``target_samplerate``.
    """
    data, original_samplerate = sf.read(filename)

    if original_samplerate != target_samplerate:
        # soundfile lays multi-channel audio out as (frames, channels), whereas
        # librosa.resample treats the LAST axis as time. Transposing before and
        # after keeps time as the resampled axis; for 1-D (mono) data ``.T`` is
        # a no-op, so the mono path is unchanged.
        resampled = librosa.resample(
            data.T, orig_sr=original_samplerate, target_sr=target_samplerate
        )
        data = resampled.T

    return data, target_samplerate

#### A helper cell that overlays piano sound onto the audio Itqan has recorded


In [41]:
# Read the recorded audio together with the sampling rate stored in the file.
audio, sample_rate = sf.read("audio_1.mp3")

# One-step augmentation pipeline: always (p=1.0) mix piano.mp3 into the input,
# with the signal-to-noise ratio bounded by min_snr_db=10 and max_snr_db=30.
piano_overlay = AddBackgroundNoise(
    sounds_path="piano.mp3",
    min_snr_db=10,
    max_snr_db=30,
    p=1.0,
)
augment = Compose([piano_overlay])

# Mix the piano into the recording.
augmented_audio = augment(samples=audio, sample_rate=sample_rate)

# Persist the mixture; it is the noisy input the model is trained on.
sf.write("audio_with_piano.mp3", augmented_audio, sample_rate)

# Model Preparation
Now that we have an input and a label, we need a model for background sound removal. Taking inspiration from image-to-image models, we decided to adopt a U-Net that maps our mixed input audio to one with the background piano music removed. A U-Net progressively shrinks the spatial (here, temporal) size of the data while increasing the number of channels up to a certain extent, and then expands it again with the help of transposed convolutions and skip connections to map it to the dimensions of the output.
For this we need:
1. Double Convolutions
2. DownSampling Layers
3. UpSampling Layers


In [43]:
class DoubleConv(nn.Module):
    """Two ``Conv1d`` layers (kernel 3, padding 1), each followed by an in-place ReLU.

    The first convolution maps ``in_channels`` to ``out_channels``; the second
    keeps ``out_channels``. With kernel 3 and padding 1 the time dimension is
    left unchanged.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Build conv/ReLU pairs input-side first, so the layer indices inside
        # the Sequential (0..3) and the parameter creation order stay the same.
        stages = []
        stage_in = in_channels
        for _ in range(2):
            stages.append(nn.Conv1d(stage_in, out_channels, kernel_size=3, padding=1))
            stages.append(nn.ReLU(inplace=True))
            stage_in = out_channels
        self.conv_op = nn.Sequential(*stages)

    def forward(self, x):
        features = self.conv_op(x)
        return features

In [44]:
class DownSample(nn.Module):
    """Encoder stage: a ``DoubleConv`` followed by max pooling (kernel 2, stride 2).

    ``forward`` returns a pair ``(features, pooled)``: the convolution output
    before pooling, and the same output after pooling.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = DoubleConv(in_channels, out_channels)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

    def forward(self, x):
        features = self.conv(x)
        # The un-pooled features are handed back alongside the pooled ones.
        return features, self.pool(features)

In [45]:
class UpSample(nn.Module):
    """Decoder stage: transposed-conv upsampling, channel concatenation, ``DoubleConv``.

    Args:
        in_channels: Channels of the tensor to upsample. The transposed
            convolution halves this; the concatenated tensor fed to the
            ``DoubleConv`` is expected to have ``in_channels`` channels again.
        out_channels: Channels produced by the stage.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.ConvTranspose1d(in_channels, in_channels//2, kernel_size=2, stride=2)
        self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it to ``x2`` in time, concatenate and convolve.

        Args:
            x1: Tensor to upsample (its time length is doubled by ``self.up``).
            x2: Tensor concatenated with the upsampled ``x1`` along dim 1.

        Returns:
            Output of the ``DoubleConv`` applied to the concatenation; it has
            the same time length as ``x2``.
        """
        x1 = self.up(x1)

        # Stride-2 pooling floors odd lengths, so doubling can come out shorter
        # than x2. Zero-pad (or crop, for a negative gap) the last dimension so
        # torch.cat does not fail on a length mismatch. Equal lengths are
        # passed through untouched.
        length_gap = x2.size(-1) - x1.size(-1)
        if length_gap != 0:
            left = length_gap // 2
            x1 = F.pad(x1, (left, length_gap - left))

        x = torch.cat([x1, x2], 1)
        return self.conv(x)

## Consolidating the model

In [46]:
class UNet(nn.Module):
    """1-D U-Net built from ``DownSample``, ``DoubleConv`` and ``UpSample`` stages.

    Four encoder stages (64, 128, 256, 512 channels), a 1024-channel
    bottleneck, four decoder stages (512, 256, 128, 64 channels) and a final
    kernel-size-1 convolution that produces a single output channel.

    Args:
        in_channels: Number of channels of the input tensor.
    """

    def __init__(self, in_channels):
        super().__init__()

        # Encoder: channel count doubles at every stage.
        self.down_convolution_1 = DownSample(in_channels, 64)
        self.down_convolution_2 = DownSample(64, 128)
        self.down_convolution_3 = DownSample(128, 256)
        self.down_convolution_4 = DownSample(256, 512)

        self.bottle_neck = DoubleConv(512, 1024)

        # Decoder: mirrors the encoder, halving the channel count each stage.
        self.up_convolution_1 = UpSample(1024, 512)
        self.up_convolution_2 = UpSample(512, 256)
        self.up_convolution_3 = UpSample(256, 128)
        self.up_convolution_4 = UpSample(128, 64)

        # Per-time-step projection from 64 channels to one output channel.
        self.out = nn.Conv1d(in_channels=64, out_channels=1, kernel_size=1)

    def forward(self, x):
        encoders = (
            self.down_convolution_1,
            self.down_convolution_2,
            self.down_convolution_3,
            self.down_convolution_4,
        )
        decoders = (
            self.up_convolution_1,
            self.up_convolution_2,
            self.up_convolution_3,
            self.up_convolution_4,
        )

        # Walk down the encoder, remembering each stage's un-pooled output.
        skips = []
        features = x
        for encoder in encoders:
            skip, features = encoder(features)
            skips.append(skip)

        features = self.bottle_neck(features)

        # Walk back up: the first decoder is paired with the deepest skip.
        for decoder, skip in zip(decoders, reversed(skips)):
            features = decoder(features, skip)

        return self.out(features)

### We also need an audio dataset class and a training function

In [47]:

class AudioDataset(torch.utils.data.Dataset):
    """Paired noisy/clean waveform segments cut from two audio files.

    Both files are loaded with ``load_audio`` at ``sample_rate``, optionally
    truncated to ``max_samples``, trimmed to a common length and split into
    non-overlapping segments of ``duration`` seconds. Trailing samples that do
    not fill a whole segment are dropped.

    Args:
        noisy_audio_path: File with the mixture (model input).
        clean_audio_path: File with the target audio (label).
        sample_rate: Sampling rate (Hz) both files are loaded at.
        duration: Segment length in seconds.
        max_samples: Keep at most this many leading samples of each file;
            ``None`` keeps everything.

    Raises:
        ValueError: If the segment length is not positive, or the usable audio
            is shorter than one segment.
    """

    def __init__(self, noisy_audio_path = 'audio_with_piano.mp3', clean_audio_path = 'audio_1.mp3', sample_rate=16000, duration=2, max_samples=100000):
        self.sample_rate = sample_rate
        self.samples_per_segment = int(sample_rate * duration)  # e.g. 16000 Hz * 2 s = 32000 samples
        if self.samples_per_segment <= 0:
            raise ValueError(
                f"sample_rate * duration must be positive, got {self.samples_per_segment} samples per segment"
            )

        # Load at the requested rate: the segment length above is expressed in
        # samples at `sample_rate`, so the audio must use that same rate.
        noisy_audio = torch.tensor(
            load_audio(noisy_audio_path, target_samplerate=sample_rate)[0][:max_samples],
            dtype=torch.float,
        )
        clean_audio = torch.tensor(
            load_audio(clean_audio_path, target_samplerate=sample_rate)[0][:max_samples],
            dtype=torch.float,
        )

        # Ensure both audio tensors are the same length
        min_length = min(len(noisy_audio), len(clean_audio))
        if min_length < self.samples_per_segment:
            raise ValueError(
                f"audio has {min_length} samples but one segment needs {self.samples_per_segment}"
            )
        noisy_audio = noisy_audio[:min_length]
        clean_audio = clean_audio[:min_length]

        # Split into non-overlapping segments (window size == step).
        self.noisy_segments = noisy_audio.unfold(0, self.samples_per_segment, self.samples_per_segment)
        self.clean_segments = clean_audio.unfold(0, self.samples_per_segment, self.samples_per_segment)

    def __len__(self):
        return self.noisy_segments.size(0)

    def __getitem__(self, idx):
        # Add a leading channel axis: for 1-D (mono) audio each item has shape
        # (1, samples_per_segment). Any batch axis is added by the DataLoader.
        noisy_segment = self.noisy_segments[idx].unsqueeze(0)
        clean_segment = self.clean_segments[idx].unsqueeze(0)
        return noisy_segment, clean_segment





In [None]:
# Training loop function
def train_model(model, dataloader, criterion, optimizer, num_epochs, device):
    """Train ``model`` in place and print the mean per-sample loss of every epoch.

    Args:
        model: Network to optimise; it is moved to ``device``.
        dataloader: Iterable of ``(noisy, clean)`` batches exposing ``.dataset``.
        criterion: Loss called as ``criterion(prediction, clean)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the model and every batch are moved to.
    """
    model.to(device)

    for epoch_number in range(1, num_epochs + 1):
        model.train()
        weighted_loss_sum = 0.0

        for noisy_batch, clean_batch in dataloader:
            noisy_batch = noisy_batch.to(device)
            clean_batch = clean_batch.to(device)

            # Standard step: clear gradients, forward, loss, backward, update.
            optimizer.zero_grad()
            batch_loss = criterion(model(noisy_batch), clean_batch)
            batch_loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch is averaged correctly.
            weighted_loss_sum += batch_loss.item() * noisy_batch.size(0)

        mean_loss = weighted_loss_sum / len(dataloader.dataset)
        print(f"Epoch {epoch_number}/{num_epochs}, Loss: {mean_loss:.4f}")

In [48]:
# Build the paired noisy/clean segment dataset from the default audio files and
# serve it in shuffled mini-batches of two segments.
dataset = AudioDataset()
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [49]:
# U-Net taking a single input channel, trained with mean-squared error using Adam.
model = UNet(1)
criterion = nn.MSELoss()  # Loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # Optimizer

## Training

In [50]:
# Run 100 epochs over the same few segments (the overfitting is intentional); train_model moves the model to `device`.
train_model(model, dataloader, criterion, optimizer, num_epochs=100, device=device)

Epoch 1/100, Loss: 0.0037
Epoch 2/100, Loss: 0.0034
Epoch 3/100, Loss: 0.0033
Epoch 4/100, Loss: 0.0032
Epoch 5/100, Loss: 0.0030
Epoch 6/100, Loss: 0.0029
Epoch 7/100, Loss: 0.0026
Epoch 8/100, Loss: 0.0022
Epoch 9/100, Loss: 0.0019
Epoch 10/100, Loss: 0.0014
Epoch 11/100, Loss: 0.0010
Epoch 12/100, Loss: 0.0007
Epoch 13/100, Loss: 0.0006
Epoch 14/100, Loss: 0.0006
Epoch 15/100, Loss: 0.0006
Epoch 16/100, Loss: 0.0005
Epoch 17/100, Loss: 0.0003
Epoch 18/100, Loss: 0.0003
Epoch 19/100, Loss: 0.0002
Epoch 20/100, Loss: 0.0002
Epoch 21/100, Loss: 0.0002
Epoch 22/100, Loss: 0.0002
Epoch 23/100, Loss: 0.0002
Epoch 24/100, Loss: 0.0002
Epoch 25/100, Loss: 0.0002
Epoch 26/100, Loss: 0.0002
Epoch 27/100, Loss: 0.0002
Epoch 28/100, Loss: 0.0002
Epoch 29/100, Loss: 0.0002
Epoch 30/100, Loss: 0.0002
Epoch 31/100, Loss: 0.0002
Epoch 32/100, Loss: 0.0002
Epoch 33/100, Loss: 0.0002
Epoch 34/100, Loss: 0.0002
Epoch 35/100, Loss: 0.0002
Epoch 36/100, Loss: 0.0001
Epoch 37/100, Loss: 0.0001
Epoch 38/1

# Inference

In [57]:
# Put the trained network in evaluation mode before running it.
model.eval()

# Load the piano-mixed recording (load_audio defaults to 16000 Hz), keep its
# first 100000 samples and add leading batch and channel axes.
# NOTE(review): the two np.newaxis entries assume a 1-D (mono) signal.
noisy_audio = 'audio_with_piano.mp3'
noisy_waveform = load_audio(noisy_audio)[0][:100000]
input_data = torch.tensor(
    noisy_waveform[np.newaxis, np.newaxis, :],
    dtype=torch.float,
).to(device)

# No gradients are needed for inference.
with torch.no_grad():
    out2 = model(input_data)

# Bring the result to the CPU, drop the singleton axes and save it at 16000 Hz.
out2 = np.array(out2.cpu().squeeze())
sf.write('model_out_1.mp3', out2, 16000)

# Feeding the model's output back into the model

In [56]:
# Run the network twice in a row on `input_data` (the tensor prepared for the
# single-pass inference): the second pass receives the first pass's output.
with torch.no_grad():
    first_pass = model(input_data)
    out2 = model(first_pass)

# Bring the result to the CPU, drop the singleton axes and save it at 16000 Hz.
out2 = np.array(out2.cpu().squeeze())
sf.write('model_out_2.mp3', out2, 16000)

# Conclusion
The model successfully removes the piano from the data it was overfitted on. Additionally, it does not remove necessary content when given a sample with no background audio, as seen when the output of the model is fed back into it.