<a href="https://colab.research.google.com/github/karankumar211/Native-Language-Identification-Project/blob/main/Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1: Universal Environment Fix
# Run this once at the start of your session
import os

# 1. Install dependencies
!pip install datasets transformers torch torchaudio librosa soundfile huggingface_hub scikit-learn tqdm ctc-segmentation gradio

# 2. Force 'torchaudio' backend (Fixes the 'torchcodec' error)
import datasets
datasets.config.AUDIO_DECODER = "torchaudio"

print("✅ Environment fixed. You can now run Task 1, 2, 3, or 4.")

Collecting ctc-segmentation
  Downloading ctc_segmentation-1.7.4.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: ctc-segmentation
  Building wheel for ctc-segmentation (pyproject.toml) ... [?25l[?25hdone
  Created wheel for ctc-segmentation: filename=ctc_segmentation-1.7.4-cp312-cp312-linux_x86_64.whl size=166580 sha256=b45eb341f14c4820b9a1eaad3220ebdf754587c84ccadaee2298694f6467f548
  Stored in directory: /root/.cache/pip/wheels/f2/f6/15/e25bbeafff87e3b13abe5f15ba3bd74846cd0752b26d31180e
Successfully built ctc-segmentation
Installing collected packages: ctc-segmentation
Su

In [None]:
# CELL 1: Install Libraries
print("Installing libraries...")
!pip install datasets transformers torch torchaudio librosa soundfile huggingface_hub scikit-learn tqdm
print("Libraries installed.")

Installing libraries...
Libraries installed.


In [None]:
# CELL 2: Setup Environment
# Standard library
import json
import os
import warnings

# Third-party
import datasets
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm.auto import tqdm

# Colab runtime (only importable inside Google Colab)
from google.colab import drive

# --- CRITICAL FIX ---
# Request the 'torchaudio' decoder from `datasets` to avoid crashes.
# NOTE(review): could not confirm that `datasets` reads this attribute;
# verify against the installed version.
datasets.config.AUDIO_DECODER = "torchaudio"

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")

print("Environment ready. Audio backend set to 'torchaudio'.")

Environment ready. Audio backend set to 'torchaudio'.


In [None]:
# CELL 3: Load Adult Data & Train Model
# Loads the Task 1 feature matrix, labels and label mapping from Google Drive,
# then fits a StandardScaler and an RBF-kernel SVC on them.
# Produces: X_mfcc_adult, y_labels_adult, label_to_int, int_to_label,
#           scaler, adult_model (used by the later cells).
print("Mounting Drive...")
drive.mount('/content/drive')

save_path = '/content/drive/MyDrive/Colab_Project_Data/'
print("Loading Adult Training Data from Drive...")

# --- Load -------------------------------------------------------------------
# Only I/O and parsing problems are caught here, so a training failure is never
# misreported as a missing-file problem.
try:
    # 1. Load Task 1 Features (Adults)
    X_mfcc_adult = np.load(os.path.join(save_path, 'X_mfcc.npy'))
    y_labels_adult = np.load(os.path.join(save_path, 'y_labels.npy'))

    # 2. Load Label Mapping
    with open(os.path.join(save_path, 'label_to_int.json'), 'r') as f:
        label_to_int = json.load(f)
except (OSError, ValueError) as e:
    # OSError covers missing/unreadable files; ValueError covers malformed
    # .npy / JSON content (json.JSONDecodeError is a ValueError subclass).
    print(f"ERROR: Could not load Task 1 data. {e}")
    print("Please ensure you have 'X_mfcc.npy' in your Drive from Task 1.")
    # Re-raise so the notebook stops here instead of failing later with a
    # NameError (or silently reusing stale variables from an earlier run).
    raise

# JSON object keys are label names and values are class ids; invert the mapping
# so predicted ids can be turned back into names.
int_to_label = {int(v): k for k, v in label_to_int.items()}

# Features and labels must describe the same samples.
if len(X_mfcc_adult) != len(y_labels_adult):
    raise ValueError(
        f"Feature/label count mismatch: {len(X_mfcc_adult)} feature rows "
        f"vs {len(y_labels_adult)} labels."
    )

# --- Train ------------------------------------------------------------------
# 3. Train the Adult Baseline Model
print(f"Training model on {len(y_labels_adult)} adult samples...")
scaler = StandardScaler().fit(X_mfcc_adult)
adult_model = SVC(kernel='rbf', C=1.0, random_state=42)
adult_model.fit(scaler.transform(X_mfcc_adult), y_labels_adult)

print("--- SUCCESS: Adult Model is Ready ---")

Mounting Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading Adult Training Data from Drive...
Training model on 8116 adult samples...
--- SUCCESS: Adult Model is Ready ---


In [None]:
# CELL 4: Download & Process Child Data
# Downloads the SpeechOcean762 test split, keeps the speakers aged <= 12 and
# turns each utterance into one time-averaged MFCC vector.
# Produces: child_dataset, child_data, child_features, X_child.
from datasets import load_dataset

# Number of MFCC coefficients per utterance. Per the original comment this is
# meant to match the Task 1 settings — confirm against the Task 1 notebook.
N_MFCC = 20
# Upper age bound (inclusive) for a speaker to count as a child.
MAX_CHILD_AGE = 12

print("Downloading Child Speech Dataset (SpeechOcean762)...")
# Load the test split
child_dataset = load_dataset("mispeech/speechocean762", split="test")

# Filter for Children (Age <= 12)
# `input_columns` hands only the 'age' value to the predicate, so the audio of
# rows that are discarded never has to be materialised just to read their age.
child_subset = child_dataset.filter(
    lambda age: age <= MAX_CHILD_AGE, input_columns=["age"]
)
# Keep `child_data` as a plain list of row dicts, as before.
child_data = list(child_subset)
print(f"Found {len(child_data)} child speech samples.")

print("Extracting features from child speech...")
child_features = []
failed_count = 0      # samples whose feature extraction raised
first_error = None    # kept so a systematic failure is visible in the output

for item in tqdm(child_data):
    try:
        # Get audio directly from the dataset row
        audio_array = item['audio']['array']
        sr = item['audio']['sampling_rate']

        # Extract MFCCs and average over time (axis=1) -> one vector per clip.
        # The waveform is converted to float for librosa.
        mfcc = np.mean(
            librosa.feature.mfcc(
                y=np.array(audio_array, dtype=float), sr=sr, n_mfcc=N_MFCC
            ),
            axis=1,
        )
        child_features.append(mfcc)
    except Exception as exc:
        # Best effort: skip the sample, but remember that it happened.
        failed_count += 1
        if first_error is None:
            first_error = exc
        continue

if failed_count:
    print(
        f"WARNING: skipped {failed_count} of {len(child_data)} samples "
        f"(first error: {first_error!r})"
    )

# reshape keeps the matrix 2-D (n_samples, N_MFCC) even when nothing was extracted.
X_child = np.array(child_features).reshape(-1, N_MFCC)
print(f"Extracted features shape: {X_child.shape}")

Downloading Child Speech Dataset (SpeechOcean762)...
Found 1040 child speech samples.
Extracting features from child speech...


  0%|          | 0/1040 [00:00<?, ?it/s]

Extracted features shape: (1040, 20)


In [None]:
# CELL 5: Run Generalization Experiment
# Applies the adult-trained scaler and classifier to the child features and
# prints how the predictions are distributed over the adult class labels.
from collections import Counter

print("--- Task 2 Results: Generalization Across Age Groups ---")

if len(X_child) == 0:
    # Nothing to evaluate.
    print("Error: No child features were extracted.")
else:
    # 1. Scale the child features with the scaler fitted on the adult data,
    #    so both groups go through the same preprocessing.
    X_child_scaled = scaler.transform(X_child)

    # 2. Classify with the adult-trained model.
    predictions = adult_model.predict(X_child_scaled)

    # 3. Tally predicted class names (first-seen order is preserved).
    pred_counts = Counter(int_to_label[class_id] for class_id in predictions)
    n_predictions = len(predictions)

    print(f"\nTested on {n_predictions} child samples.")
    print("Prediction Distribution (How the model classified the children):")
    for class_name, n_hits in pred_counts.items():
        share = n_hits / n_predictions * 100
        print(f"> {class_name.title()}: {n_hits} samples ({share:.1f}%)")

    # NOTE(review): the observation below is fixed text; it is not computed
    # from `pred_counts`, and no ground-truth labels are used in this cell.
    # Consider deriving it from the results.
    print(
        "\n--- OBSERVATION ---",
        "The model is likely biased towards specific classes (e.g., Andhra Pradesh or Tamil).",
        "This confirms that the acoustic features of children (higher pitch, different formants)",
        "are significantly different from adults, causing the model to fail or guess incorrectly.",
        sep="\n",
    )

--- Task 2 Results: Generalization Across Age Groups ---

Tested on 1040 child samples.
Prediction Distribution (How the model classified the children):
> Andhra_Pradesh: 596 samples (57.3%)
> Kerala: 166 samples (16.0%)
> Tamil: 265 samples (25.5%)
> Karnataka: 6 samples (0.6%)
> Jharkhand: 1 samples (0.1%)
> Gujrat: 6 samples (0.6%)

--- OBSERVATION ---
The model is likely biased towards specific classes (e.g., Andhra Pradesh or Tamil).
This confirms that the acoustic features of children (higher pitch, different formants)
are significantly different from adults, causing the model to fail or guess incorrectly.
