# Audio Classification: Speaker & Language

This notebook:
1. Uses a single **metadata CSV** to label each audio file with both a **speaker** and a **language**.
2. Extracts **MFCC features** from each audio file.
3. Performs **binary classification** separately for:
   - **Speaker** (Jeevan vs. Not_Jeevan)
   - **Language** (English vs. Not_English)
4. Uses **stratified k-fold cross-validation** to evaluate performance (accuracy, precision, recall, F1) and generate confusion matrices.

In [6]:
#Installing All Required Libraries
!pip install librosa scikit-learn pandas numpy



In [8]:
# Import all libraries
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

ModuleNotFoundError: No module named 'numpy'

In [9]:

def extract_features(file_path, sr=16000, n_mfcc=13):
    """
    Load an audio file and summarise it as one time-averaged MFCC vector.

    Args:
        file_path: Path of the audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from ``librosa.feature.mfcc``.

    Returns:
        The MFCC matrix averaged along axis 1 (one mean value per coefficient row).

    Raises:
        ValueError: If the file decodes to zero audio samples.
    """
    # Use fresh names instead of rebinding the `sr` parameter.
    signal, sample_rate = librosa.load(file_path, sr=sr)

    # Fail early and name the offending file. NOTE(review): what librosa does
    # with an empty signal (obscure error vs. features computed from padding
    # only) is version dependent and was not verified here.
    if np.size(signal) == 0:
        raise ValueError(f"No audio samples could be read from {file_path!r}")

    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfcc, axis=1)

def build_dataset(metadata_csv, audio_dir):
    """
    Build the feature matrix and both binary label vectors from a metadata CSV.

    The CSV must have at least these columns:
        - filename (joined onto ``audio_dir`` to locate the audio file)
        - speaker_label (e.g., "Jeevan" or "Not_Jeevan")
        - language_label (e.g., "English" or "Not_English")

    Labels are compared case-insensitively and with surrounding whitespace
    ignored, so "Jeevan", " jeevan" and "JEEVAN " all map to class 1; any
    other value maps to class 0. The same rule applies to "english".

    Returns:
        (X, y_speaker, y_language): ``X`` stacks one ``extract_features``
        result per CSV row; the label arrays hold 1/0 as described above.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If any row has a missing filename or label.
    """
    required = ["filename", "speaker_label", "language_label"]
    df = pd.read_csv(metadata_csv)

    # Validate the metadata before touching any audio, so a bad row is
    # reported immediately instead of after decoding the files before it.
    missing_cols = [col for col in required if col not in df.columns]
    if missing_cols:
        raise KeyError(f"{metadata_csv} is missing required column(s): {missing_cols}")

    incomplete = df[required].isna().any(axis=1)
    if incomplete.any():
        bad_rows = incomplete[incomplete].index.tolist()
        raise ValueError(
            f"{metadata_csv} has a missing filename or label at row index(es): {bad_rows}"
        )

    def _is_label(column, positive):
        # Normalise case and surrounding whitespace before comparing.
        normalised = df[column].astype(str).str.strip().str.lower()
        return normalised.eq(positive).to_numpy(dtype=int)

    y_speaker = _is_label("speaker_label", "jeevan")
    y_language = _is_label("language_label", "english")

    X = np.array(
        [extract_features(os.path.join(audio_dir, name)) for name in df["filename"]]
    )
    return X, y_speaker, y_language

def evaluate_classifier(X, y, n_splits=5):
    """
    Cross-validate a linear SVM and summarise its out-of-fold predictions.

    Args:
        X: Feature matrix, one row per sample.
        y: Binary labels aligned with the rows of ``X``.
        n_splits: Number of stratified folds.

    Returns:
        (report, cm): the text classification report (classes named
        "Class 0" / "Class 1") and the confusion matrix, both computed from
        the cross-validated predictions.
    """
    # probability=True is deliberately not set: cross_val_predict only calls
    # predict(), and enabling probability estimates makes every fit run an
    # additional internal cross-validation. SVC's random_state only affects
    # that probability step, so it is dropped as well.
    # NOTE(review): features reach the SVM unscaled; consider standardising
    # them inside a Pipeline — confirm against the actual feature ranges.
    clf = SVC(kernel="linear")

    # Shuffled, seeded folds keep the split (and hence the metrics) reproducible.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    y_pred = cross_val_predict(clf, X, y, cv=skf)

    report = classification_report(y, y_pred, target_names=["Class 0", "Class 1"])
    return report, confusion_matrix(y, y_pred)

In [10]:
# Input locations: the metadata CSV and the folder holding the audio files
metadata_csv_path = "data/metadata.csv"
audio_directory = "data/audio_files"

# Extract features and both label vectors for every file listed in the CSV
X, y_speaker, y_language = build_dataset(
    metadata_csv=metadata_csv_path,
    audio_dir=audio_directory,
)

NameError: name 'pd' is not defined

In [11]:
# Speaker task: cross-validate on the speaker labels, then print both summaries
speaker_report, speaker_cm = evaluate_classifier(X=X, y=y_speaker, n_splits=5)
print("=== Speaker Classification (Jeevan vs. Not_Jeevan) ===")
for _heading, _result in (
    ("Classification Report:\n", speaker_report),
    ("Confusion Matrix:\n", speaker_cm),
):
    print(_heading, _result)


NameError: name 'X' is not defined

In [None]:
# Language task: cross-validate on the language labels, then print both summaries
language_report, language_cm = evaluate_classifier(X=X, y=y_language, n_splits=5)
print("\n=== Language Classification (English vs. Not_English) ===")
for _heading, _result in (
    ("Classification Report:\n", language_report),
    ("Confusion Matrix:\n", language_cm),
):
    print(_heading, _result)
