In [11]:
cd /content/drive/MyDrive/assignemet2_speech/

/content/drive/MyDrive/assignemet2_speech


In [12]:
!ls

datasets  data_test.txt  hindi_language_speeker_verification.ipynb  Untitled0.ipynb


In [16]:
import torch
import os
import pandas as pd
import torch.nn as nn
import torchaudio
from torch.nn.functional import cosine_similarity
from torch.utils.data import Dataset
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from scipy.interpolate import interp1d
from transformers import Wav2Vec2Model, Wav2Vec2Tokenizer, HubertModel
# Select the sox_io backend, but only on torchaudio builds that still expose
# the global-backend API; calling a missing attribute would abort the cell.
# NOTE(review): newer torchaudio releases appear to deprecate these calls (the
# cell output echoes them back as a warning) -- confirm against the installed
# version and drop this cell if it is no longer needed.
if hasattr(torchaudio, "get_audio_backend") and hasattr(torchaudio, "set_audio_backend"):
    if torchaudio.get_audio_backend() != 'sox_io':
        torchaudio.set_audio_backend("sox_io")


  if torchaudio.get_audio_backend() != 'sox_io':
  torchaudio.set_audio_backend("sox_io")


In [19]:
import os
import torchaudio
from torch.utils.data import Dataset

class KathbathDataset(Dataset):
    """Index of audio files stored as ``root_dir/split_name/<speaker>/<file>``.

    Every sub-directory of the split is treated as one speaker; its position
    in the sorted speaker list is the integer label of all files inside it.
    File names are expected to look like ``<audio_id>-<user_id>-<gender>.<ext>``
    (see ``parse_audio_filename``).

    Args:
        root_dir: Directory that contains the split folders.
        split_name: Name of the split folder to index.
        transform: Optional callable applied to each loaded waveform.

    Attributes (parallel lists, one entry per indexed file):
        audio_files, labels, audio_ids, user_ids, genders
    """

    def __init__(self, root_dir, split_name, transform=None):
        self.root_dir = root_dir
        self.split_name = split_name
        self.transform = transform

        split_dir = os.path.join(root_dir, split_name)
        # Only directories are speakers. A stray file in the split folder
        # would otherwise be listed as a speaker and crash os.listdir below.
        self.speakers = sorted(
            entry for entry in os.listdir(split_dir)
            if os.path.isdir(os.path.join(split_dir, entry))
        )
        self.audio_files = []
        self.labels = []
        self.audio_ids = []
        self.user_ids = []
        self.genders = []

        # enumerate() yields the same label as speakers.index(speaker) without
        # rescanning the speaker list for every file.
        for label, speaker in enumerate(self.speakers):
            speaker_dir = os.path.join(split_dir, speaker)
            for file in sorted(os.listdir(speaker_dir)):
                file_path = os.path.join(speaker_dir, file)
                if os.path.isdir(file_path):
                    # Nested folders cannot be loaded as audio; skip them.
                    continue
                audio_id, user_id, gender = self.parse_audio_filename(file)
                self.audio_files.append(file_path)
                self.labels.append(label)
                self.audio_ids.append(audio_id)
                self.user_ids.append(user_id)
                self.genders.append(gender)

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load the ``idx``-th file and return ``(waveform, label)``.

        The sample rate reported by ``torchaudio.load`` is discarded; the
        optional ``transform`` is applied to the waveform only.
        """
        audio_path = self.audio_files[idx]
        label = self.labels[idx]
        waveform, sample_rate = torchaudio.load(audio_path)
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label

    def parse_audio_filename(self, filename):
        """Split ``<audio_id>-<user_id>-<gender>.<ext>`` into its three fields.

        Raises:
            IndexError: If the name has fewer than three ``-`` separated parts.
        """
        parts = filename.split('-')
        audio_id = parts[0]
        user_id = parts[1]
        # Everything after the first '.' of the third field is dropped.
        gender = parts[2].split('.')[0]
        return audio_id, user_id, gender




In [20]:


# Step 2: Define your model architecture
class MyModel(nn.Module):
    """Scores the similarity of two audio files with a pre-trained wav2vec 2.0 encoder.

    ``forward`` takes two audio file paths, encodes each file to a sequence of
    frame embeddings (under ``torch.no_grad``, so no gradients reach the
    encoder), truncates both sequences to the shorter length and returns the
    mean frame-wise cosine similarity as a Python float.

    NOTE(review): frames at the same index of two different recordings are not
    aligned in time; pooling each sequence into one embedding before comparing
    may be more appropriate -- confirm the intended scoring.
    """

    # Sample rate every clip is resampled to before encoding.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self):
        super(MyModel, self).__init__()
        # Load the pre-trained encoder and put it in evaluation mode.
        model_name = "facebook/wav2vec2-large-xlsr-53"
        #self.model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").eval()
        self.model = Wav2Vec2Model.from_pretrained(model_name).eval()

    def forward(self, x1, x2):
        """Return the similarity score (float) between audio files ``x1`` and ``x2``.

        Args:
            x1: Path of the first audio file.
            x2: Path of the second audio file.
        """
        frames1 = self._embed(x1)
        frames2 = self._embed(x2)

        # Clips differ in length; compare only the frames both sequences have.
        min_length = min(frames1.shape[0], frames2.shape[0])
        return self.compute_similarity(frames1[:min_length], frames2[:min_length])

    def _embed(self, audio_path):
        """Encode one audio file into its sequence of last-layer hidden states."""
        feature = self.preprocess_audio(audio_path)
        with torch.no_grad():
            # unsqueeze adds the batch dimension for the encoder; squeeze drops it again.
            hidden = self.model(feature.unsqueeze(0)).last_hidden_state
        return hidden.squeeze(0)

    def preprocess_audio(self, audio_path):
        """Load ``audio_path`` as a peak-normalised mono waveform at ``TARGET_SAMPLE_RATE``.

        Assumes ``torchaudio.load`` returns a ``(channels, time)`` tensor, as
        the channel check below implies.
        """
        waveform, sample_rate = torchaudio.load(audio_path)

        # Resample if necessary
        if sample_rate != self.TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=self.TARGET_SAMPLE_RATE
            )
            waveform = resampler(waveform)

        # Mix multi-channel audio down to a single channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Drop the (now size-1) channel dimension.
        waveform = waveform.squeeze(0)

        return self._peak_normalize(waveform)

    @staticmethod
    def _peak_normalize(waveform, eps=1e-8):
        """Scale ``waveform`` so its largest absolute sample is 1.

        The peak is clamped to ``eps`` so an all-zero (silent) clip stays
        all-zero instead of turning into NaN through a division by zero.
        """
        peak = torch.clamp(torch.max(torch.abs(waveform)), min=eps)
        return waveform / peak

    def compute_similarity(self, output1, output2):
        """Return the mean row-wise cosine similarity of two equally shaped 2-D tensors.

        Cosine similarity is taken along ``dim=1`` for each row pair and the
        per-row values are averaged into one Python float.
        """
        similarity = cosine_similarity(output1, output2, dim=1)
        return similarity.mean().item()



In [None]:

# Assuming MyModel is defined as provided in the question
# Assuming Kathbath Dataset has a structure compatible with PyTorch Dataset

# Define a function to compute the EER
def compute_eer(scores, labels):
    """Return the equal error rate (EER) of ``scores`` against ``labels``, in percent.

    The EER is the false-positive rate at the ROC operating point where it
    equals the false-negative rate, i.e. the root of ``1 - x - tpr(x)`` on
    ``[0, 1]`` with ``tpr`` linearly interpolated over the ROC curve.

    Args:
        scores: Similarity scores, higher meaning "more likely positive".
        labels: Ground-truth labels, with ``1`` marking the positive class.

    Raises:
        ValueError: If ``labels`` does not contain both a positive (``1``)
            and a non-positive entry; the ROC curve is undefined in that case.
    """
    positives = sum(1 for label in labels if label == 1)
    if positives == 0 or positives == len(labels):
        raise ValueError(
            "compute_eer needs at least one positive (label 1) and one negative trial"
        )

    fpr, tpr, _ = roc_curve(labels, scores, pos_label=1)
    # Build the interpolator once instead of on every root-finder evaluation.
    roc = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - roc(x), 0., 1.)
    return eer * 100  # Convert to percentage

# Dataset location and loader settings (edit these two values as needed).
# The original template placeholders were not valid Python and stopped the
# cell with a SyntaxError.
kathbath_dataset_path = "datasets"
batch_size = 32

# Build the datasets and loaders.
# NOTE(review): the training set is built from the "valid_data" split, the
# same split used for validation -- confirm this is intentional.
# NOTE(review): KathbathDataset yields (waveform, label) items, while the
# training/evaluation loops unpack (audio1, audio2, labels); a pair dataset or
# collate function appears to be missing -- confirm.
train_dataset = KathbathDataset(root_dir=kathbath_dataset_path, split_name="valid_data")
val_dataset = KathbathDataset(root_dir=kathbath_dataset_path, split_name="valid_data")
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation order does not affect the EER, so the validation loader is not shuffled.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# Define the fine-tuning procedure
def fine_tune_model(model, train_loader, val_loader, optimizer, criterion, num_epochs,
                    checkpoint_path="best_model.pth"):
    """Train ``model`` for ``num_epochs`` and keep the checkpoint with the lowest validation EER.

    Args:
        model: Module called as ``model(audio1, audio2)``; must return a tensor.
        train_loader: Iterable of ``(audio1, audio2, labels)`` batches.
        val_loader: Loader passed to ``evaluate_model`` after every epoch.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss called as ``criterion(output, labels)``.
        num_epochs: Number of passes over ``train_loader``.
        checkpoint_path: Where the best ``state_dict`` is saved.

    Returns:
        The best (lowest) validation EER seen, or ``inf`` when ``num_epochs`` is 0.

    Raises:
        TypeError: If the model output is not a tensor (no loss can be
            back-propagated through a plain Python number).
    """
    best_eer = float('inf')
    for epoch in range(num_epochs):
        # evaluate_model() switches to eval mode, so re-enable training each epoch.
        model.train()
        for audio1, audio2, labels in train_loader:
            optimizer.zero_grad()
            output = model(audio1, audio2)
            if not torch.is_tensor(output):
                raise TypeError(
                    "model must return a tensor so the loss can be back-propagated, "
                    f"got {type(output).__name__}"
                )
            # Losses such as BCELoss need targets with the output's dtype
            # (and device); integer class labels would be rejected.
            labels = torch.as_tensor(labels).to(device=output.device, dtype=output.dtype)
            loss = criterion(output, labels)

            # Backward pass
            loss.backward()
            optimizer.step()

        # Evaluate on validation set
        eer = evaluate_model(model, val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation EER: {eer:.2f}%")

        # Save the model if it has the best EER so far
        if eer < best_eer:
            best_eer = eer
            torch.save(model.state_dict(), checkpoint_path)

    return best_eer

# Define a function to evaluate the model on the validation set
def evaluate_model(model, val_loader):
    """Score every batch of ``val_loader`` and return the EER in percent.

    Args:
        model: Module called as ``model(audio1, audio2)``. It may return a
            1-D tensor of scores, a 0-d tensor, or a plain number.
        val_loader: Iterable of ``(audio1, audio2, labels)`` batches.

    Returns:
        Whatever ``compute_eer`` returns for the collected scores and labels.
    """
    model.eval()
    scores = []
    labels = []
    with torch.no_grad():
        for audio1, audio2, batch_labels in val_loader:
            output = model(audio1, audio2)
            # Flatten before collecting: list.extend() cannot iterate a 0-d
            # array, and a plain float has no .cpu() at all.
            scores.extend(torch.as_tensor(output).detach().cpu().reshape(-1).tolist())
            labels.extend(torch.as_tensor(batch_labels).detach().cpu().reshape(-1).tolist())
    eer = compute_eer(scores, labels)
    return eer

# Fine-tune the model
# The model has to exist before an optimizer can be built over its parameters;
# it was previously referenced without ever being created (NameError).
model = MyModel()

# Optimizer, loss and schedule for fine-tuning.
# NOTE(review): BCELoss expects inputs in [0, 1]; if the model emits raw
# cosine similarities they can be negative -- confirm the score range.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()  # Assuming binary classification (similar/dissimilar)
num_epochs = 10  # Define the number of epochs for fine-tuning

# Fine-tune the model
best_eer = fine_tune_model(model, train_loader, val_loader, optimizer, criterion, num_epochs)

print(f"Best Validation EER: {best_eer:.2f}%")

# Once fine-tuning is done, evaluate the best checkpoint on the test split.
# Load the best model
best_model = MyModel()
best_model.load_state_dict(torch.load("best_model.pth"))


test_dataset = KathbathDataset(root_dir="datasets", split_name="test_data")
# Evaluation order does not affect the EER, so the test loader is not shuffled.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)
# Evaluate the best model on the test set and report the EER
test_eer = evaluate_model(best_model, test_loader)
print(f"Test EER: {test_eer:.2f}%")
