<a href="https://colab.research.google.com/github/karankumar211/Native-Language-Identification-Project/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CELL 1: Install Pip Packages
print("Installing all Python libraries for Task 1...")
# We install torchaudio (the stable audio loader)
!pip install datasets transformers torch torchaudio librosa soundfile huggingface_hub requests beautifulsoup4 tqdm scikit-learn
print("All Python libraries installed successfully.")

In [None]:
# CELL 2: Download, Unzip, and Prepare Data
#
# Downloads the IndicAccentDb archive from the Hugging Face Hub, extracts it
# into `extract_dir`, and builds:
#   all_file_paths -- sorted list of every .wav file under `extract_dir`
#   label_names    -- sorted list of the distinct parent-folder names
#   label_to_int / int_to_label -- mappings between folder name and class id
from huggingface_hub import hf_hub_download
import zipfile
import os
import glob

print("Downloading dataset 'DarshanaS/IndicAccentDb' (3.2 GB)...")
zip_path = hf_hub_download(
    repo_id="DarshanaS/IndicAccentDb",
    filename="IndicAccentDB.zip",
    repo_type="dataset"
)
print(f"Dataset downloaded to: {zip_path}")

extract_dir = "data"
os.makedirs(extract_dir, exist_ok=True)
print(f"Extracting to '{extract_dir}' folder...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print("Extraction complete.")

# --- Setup file lists and labels ---
# glob returns files in arbitrary filesystem order; sorting makes the sample
# order (and therefore every seeded train/test split built from it)
# reproducible across runs and machines.
all_file_paths = sorted(
    glob.glob(os.path.join(extract_dir, "**", "*.wav"), recursive=True)
)
print(f"Found {len(all_file_paths)} audio files.")
if not all_file_paths:
    # Fail here with a clear message instead of much later inside sklearn.
    raise FileNotFoundError(
        f"No .wav files found under '{extract_dir}'; check the archive layout."
    )

# The class label of a file is the name of the folder that directly contains it.
label_names = sorted({os.path.basename(os.path.dirname(p)) for p in all_file_paths})
label_to_int = {name: i for i, name in enumerate(label_names)}
int_to_label = {i: name for i, name in enumerate(label_names)}
print(f"Label mapping: {label_to_int}")

In [None]:
# CELL 3: Load HuBERT Models
#
# Defines: device, target_sr, ssl_ckpt, feature_extractor, hubert_model and
# hubert_model_layers. Both model names refer to ONE set of weights on the
# device: the checkpoint is identical, so loading it twice only doubled the
# load time and the accelerator memory.
import torch
from transformers import AutoFeatureExtractor, HubertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
target_sr = 16000

# --- 1. Load HuBERT (Base) for feature extraction ---
ssl_ckpt = 'facebook/hubert-base-ls960'
print(f"Loading model: {ssl_ckpt} onto device...")
feature_extractor = AutoFeatureExtractor.from_pretrained(ssl_ckpt)
# output_hidden_states=True only adds the per-layer `hidden_states` tuple to
# the model output; `last_hidden_state` is unchanged, so code that reads just
# `last_hidden_state` from `hubert_model` keeps working.
hubert_model = HubertModel.from_pretrained(
    ssl_ckpt,
    output_hidden_states=True
).to(device)
# from_pretrained already returns the model in evaluation mode; the explicit
# call documents that inference relies on dropout being disabled.
hubert_model.eval()
print("Base HuBERT model loaded successfully.")

# --- 2. Load HuBERT (Layer-wise) ---
print("Loading HuBERT model for layer-wise extraction...")
# Same object under the second name: no second copy of the weights.
hubert_model_layers = hubert_model
print("Layer-wise HuBERT model loaded.")

In [None]:
# CELL 4: Run ALL Sentence-Level Feature Extraction
#
# For every file in `all_file_paths` this computes
#   * a 20-dim MFCC vector (mean over frames),
#   * a mean-pooled HuBERT embedding from the final layer,
#   * one mean-pooled embedding per HuBERT hidden state (layer-wise analysis),
# and stores them in row-aligned lists. A file is either added to ALL lists or
# to none, so the feature matrices and label vectors cannot drift out of sync.
import os
import librosa
import numpy as np
import torch
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Task 1 (Standard) lists
hubert_features_list = []
mfcc_features_list = []
labels_list = []

# Task 1 (Layer-wise) lists
# hidden_states holds the embedding output plus one entry per transformer
# layer, i.e. num_hidden_layers + 1 entries (13 for the base checkpoint).
num_layers = hubert_model_layers.config.num_hidden_layers + 1
layer_features_list = [[] for _ in range(num_layers)]
labels_list_layers = [] # Labels will be the same

skipped_paths = []

print(f"Starting all feature extraction for {len(all_file_paths)} files...")

for path in tqdm(all_file_paths, desc="Extracting All Sentence Features"):
    try:
        label_string = os.path.basename(os.path.dirname(path))
        label_int = label_to_int[label_string]
        waveform_raw, sr_raw = librosa.load(path, sr=None)

        # --- 1. Extract MFCCs ---
        # NOTE(review): MFCCs are computed at each file's native sample rate;
        # if the dataset mixes rates the features are not strictly comparable.
        # Consider computing them on the 16 kHz waveform -- confirm with owner.
        mfccs = librosa.feature.mfcc(y=waveform_raw, sr=sr_raw, n_mfcc=20)
        mfccs_mean = np.mean(mfccs, axis=1)

        # --- 2. Resample for HuBERT ---
        if sr_raw != target_sr:
            waveform_16k = librosa.resample(waveform_raw, orig_sr=sr_raw, target_sr=target_sr)
        else:
            waveform_16k = waveform_raw

        inputs = feature_extractor(waveform_16k, sampling_rate=target_sr, return_tensors='pt').to(device)

        # --- 3/4. One forward pass yields both the final-layer output and
        # every hidden state; the two models share the same checkpoint, so a
        # second pass through `hubert_model` would only repeat the computation.
        with torch.no_grad():
            outputs_layers = hubert_model_layers(inputs.input_values)

        # Mean over the time axis, drop the batch axis of size 1.
        embedding = outputs_layers.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
        layer_embeddings = [
            hidden.mean(dim=1).squeeze(0).cpu().numpy()
            for hidden in outputs_layers.hidden_states
        ]
        if len(layer_embeddings) != num_layers:
            raise ValueError(
                f"expected {num_layers} hidden states, got {len(layer_embeddings)}"
            )
    except Exception as e:
        # Best-effort: one unreadable or degenerate file must not abort a
        # multi-hour run. Nothing has been appended yet for this file.
        print(f"Skipping file {path} due to error: {e}")
        skipped_paths.append(path)
        continue

    # Commit all features for this file together (all-or-nothing).
    mfcc_features_list.append(mfccs_mean)
    hubert_features_list.append(embedding)
    labels_list.append(label_int)
    for layer_bucket, layer_embedding in zip(layer_features_list, layer_embeddings):
        layer_bucket.append(layer_embedding)
    labels_list_layers.append(label_int)

if skipped_paths:
    print(f"Skipped {len(skipped_paths)} of {len(all_file_paths)} files.")

# --- Convert Task 1 (Standard) to NumPy ---
X_hubert = np.array(hubert_features_list)
X_mfcc = np.array(mfcc_features_list)
y_labels = np.array(labels_list)
print(f"\nStandard Feature extraction complete.")
print(f"HuBERT shape: {X_hubert.shape}, MFCC shape: {X_mfcc.shape}")

# --- Convert Task 1 (Layer-wise) to NumPy ---
X_hubert_layers = [np.array(features) for features in layer_features_list]
y_labels_layers = np.array(labels_list_layers)
print(f"Layer-wise extraction complete. Created {len(X_hubert_layers)} feature matrices.")

In [None]:
# CELL 5: Mount Drive & Save All Features
#
# Writes the extracted feature matrices, the label vector and the label map
# to a folder on the mounted Google Drive.
import os
import numpy as np
from google.colab import drive
import json

print("Mounting Google Drive...")
drive.mount('/content/drive')

try:
    print("Saving processed data to your Google Drive...")
    save_path = '/content/drive/MyDrive/Colab_Project_Data/'
    os.makedirs(save_path, exist_ok=True)

    np.save(os.path.join(save_path, 'X_hubert.npy'), X_hubert)
    np.save(os.path.join(save_path, 'X_mfcc.npy'), X_mfcc)
    np.save(os.path.join(save_path, 'y_labels.npy'), y_labels)
    # Save the layer-wise data as one numeric array stacked along a new
    # leading "layer" axis, so `loaded[i]` is still the matrix for layer i.
    # np.array(..., dtype=object) on equal-shaped matrices would instead box
    # every scalar as a Python object (huge RAM use) and force a pickled file
    # that needs allow_pickle=True to read back.
    np.save(os.path.join(save_path, 'X_hubert_layers.npy'), np.stack(X_hubert_layers))

    with open(os.path.join(save_path, 'label_to_int.json'), 'w') as f:
        json.dump(label_to_int, f)

    print(f"--- SUCCESS --- All data saved to: {save_path}")
except Exception as e:
    # Saving is a convenience step; report the failure but keep the in-memory
    # features usable by the following cells.
    print(f"An error occurred while saving: {e}")

In [None]:
# CELL 6: Train & Compare SVMs (Task 1)
#
# Fits one SVM on standardized MFCC features and one on standardized HuBERT
# embeddings, using the same stratified 80/20 split, then prints both test
# accuracies.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

print("--- Task 1: MFCC vs. HuBERT (Sentence-Level) ---")
# A single call applies one set of shuffled, stratified row indices to both
# feature matrices and to the labels, so the two models see matching rows.
(
    X_mfcc_train, X_mfcc_test,
    X_hubert_train, X_hubert_test,
    y_train, y_test,
) = train_test_split(
    X_mfcc, X_hubert, y_labels,
    test_size=0.2, random_state=42, stratify=y_labels,
)

# Scalers are fitted on the training rows only, so no test statistics leak in.
mfcc_scaler = StandardScaler()
mfcc_scaler.fit(X_mfcc_train)
hubert_scaler = StandardScaler()
hubert_scaler.fit(X_hubert_train)

print("Training MFCC Model...")
mfcc_train_scaled = mfcc_scaler.transform(X_mfcc_train)
mfcc_test_scaled = mfcc_scaler.transform(X_mfcc_test)
mfcc_svm = SVC(random_state=42)
mfcc_svm.fit(mfcc_train_scaled, y_train)
y_pred_mfcc = mfcc_svm.predict(mfcc_test_scaled)
acc_mfcc = accuracy_score(y_test, y_pred_mfcc)

print("Training HuBERT Model...")
hubert_train_scaled = hubert_scaler.transform(X_hubert_train)
hubert_test_scaled = hubert_scaler.transform(X_hubert_test)
hubert_svm = SVC(random_state=42)
hubert_svm.fit(hubert_train_scaled, y_train)
y_pred_hubert = hubert_svm.predict(hubert_test_scaled)
acc_hubert = accuracy_score(y_test, y_pred_hubert)

print("\n--- COMPARISON (Task 1) ---")
print(f"MFCC Model Accuracy:   {acc_mfcc * 100:.2f}%")
print(f"HuBERT Model Accuracy: {acc_hubert * 100:.2f}%")

In [None]:
# CELL 7: Train & Plot Layer-wise SVMs (Task 1.3)
#
# Trains one SVM per HuBERT hidden state on an identical stratified 80/20
# split, plots test accuracy against layer index, and reports the best layer.
# Everything this cell calls is imported here, so it does not depend on the
# imports of other cells having been executed first.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

layer_accuracies = []
layer_indices = range(len(X_hubert_layers))

print("Starting layer-wise model training...")
for i in layer_indices:
    print(f"Training on Layer {i}...")
    X_layer = X_hubert_layers[i]
    # Use the layer-wise label vector directly; rebinding the notebook-global
    # `y_labels` on every iteration was loop-invariant and silently clobbered
    # the variable other cells read.
    X_train, X_test, y_train, y_test = train_test_split(
        X_layer, y_labels_layers, test_size=0.2, random_state=42, stratify=y_labels_layers
    )
    # Fit the scaler on training rows only so the test rows stay unseen.
    scaler = StandardScaler().fit(X_train)
    svm = SVC(random_state=42).fit(scaler.transform(X_train), y_train)
    y_pred = svm.predict(scaler.transform(X_test))
    acc = accuracy_score(y_test, y_pred)
    layer_accuracies.append(acc)
print("Layer-wise training complete.")

plt.figure(figsize=(12, 6))
plt.plot(layer_indices, layer_accuracies, marker='o', linestyle='-')
plt.title('HuBERT Layer-wise Analysis for Accent Identification')
plt.xlabel('HuBERT Layer (0=Input, 1-12=Transformer)')
plt.ylabel('Accent Classification Accuracy')
plt.xticks(layer_indices)
plt.grid(True)
plt.show()

# argmax returns the first layer that reaches the maximum accuracy.
best_layer_index = int(np.argmax(layer_accuracies))
best_accuracy = layer_accuracies[best_layer_index]
print(f"\n--- Task 1 (Part 3) Result ---")
print(f"Best performance found at Layer: {best_layer_index} with {best_accuracy * 100:.2f}% accuracy.")