# 👶👨 Age Generalization Study

## Objective
Evaluate how well accent classification models trained on adult speech generalize to children's speech.

## Research Question
Do accent patterns learned from adult speakers transfer to child speakers?

## Methodology
1. Split dataset into Adult and Child subsets
2. Train models on Adult data only
3. Test on both Adult (in-domain) and Child (out-of-domain) data
4. Compare MFCC vs HuBERT robustness across ages

## Expected Outcome
- Performance drop when testing on children
- HuBERT expected to generalize better than MFCC
- Analysis of which accents are more age-invariant

## Estimated Runtime: 3-4 hours

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); this must happen
# before the project folder below can be reached.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory. PROJECT_DIR is reused later
# in the notebook as the destination of the saved results figure.
import os
PROJECT_DIR = '/content/drive/MyDrive/IndicAccent_Project'
os.chdir(PROJECT_DIR)  # fails if the folder does not exist on the mounted drive
print(f'âœ… Working directory: {os.getcwd()}')

In [None]:
# Install the third-party packages used by this notebook. The leading "!" is
# an IPython shell escape and "-q" keeps pip's output quiet.
!pip install -q datasets transformers torch torchaudio librosa soundfile scikit-learn matplotlib tqdm
# NOTE(review): torchcodec is installed separately; presumably it is needed
# for audio decoding by the installed `datasets` version -- confirm.
!pip install -q torchcodec
print('âœ… Dependencies installed!')

## Step 1: Load and Analyze Dataset

In [None]:
from datasets import load_dataset, Audio
import numpy as np

# Fetch the IndicAccentDb corpus and, in the same expression, ask for the
# "audio" column to be decoded when a record is accessed.
dataset = load_dataset("DarshanaS/IndicAccentDb").cast_column(
    "audio", Audio(decode=True)
)

# Report the size and the schema of the training split.
print(f'Total samples: {len(dataset["train"])}')
print(f'\nDataset features: {dataset["train"].features}')

# Look at the first record: its keys show whether an explicit age field
# is present in the data.
sample = dataset['train'][0]
print(f'\nSample keys: {sample.keys()}')

## Step 2: Split by Age

**Note**: If the dataset doesn't have explicit age labels, we can:
1. Use audio duration as proxy (children typically have shorter utterances)
2. Use pitch/formant analysis (children have higher pitch)
3. Manual annotation of subset

For this analysis, we demonstrate the methodology by inferring age groups from pitch (option 2); if the dataset provides explicit age labels, use those instead.

In [None]:
import librosa

def estimate_age_group(audio_dict, threshold_hz=250.0, fmin=50.0, fmax=500.0):
    """
    Estimate age group based on pitch (F0).

    Heuristic: children typically have a mean F0 above ~250 Hz, adults below.

    Args:
        audio_dict: Mapping with an "array" entry (the waveform samples) and a
            "sampling_rate" entry (in Hz).
        threshold_hz: Mean-F0 boundary; strictly above it the clip is
            labelled 'child', otherwise 'adult'.
        fmin: Lower bound (Hz) of the pitch search range.
        fmax: Upper bound (Hz) of the pitch search range.

    Returns:
        'child' or 'adult'.

    Raises:
        ValueError: If the clip is empty or contains no voiced frames, so no
            pitch can be measured.
    """
    arr = np.asarray(audio_dict["array"], dtype=float)
    sr = audio_dict["sampling_rate"]

    if arr.size == 0:
        raise ValueError("Cannot estimate pitch: audio array is empty")

    # Extract pitch with pYIN rather than plain YIN: pYIN marks unvoiced
    # frames as NaN, whereas YIN returns an (unreliable) estimate for every
    # frame, silence included, which would bias the mean. pYIN is slower.
    f0, _, _ = librosa.pyin(arr, fmin=fmin, fmax=fmax, sr=sr)

    voiced_f0 = f0[np.isfinite(f0)]
    if voiced_f0.size == 0:
        # No pitch information at all: refuse to guess a label.
        raise ValueError("Cannot estimate pitch: no voiced frames detected")

    mean_f0 = float(voiced_f0.mean())

    # Classify based on pitch
    return 'child' if mean_f0 > threshold_hz else 'adult'

print('âœ… Age estimation function defined')
print('\nNote: This is a heuristic. Actual age labels would be more reliable.')

In [None]:
from tqdm import tqdm

# Estimate age for all samples (or use existing labels if available)
print("Estimating age groups...")

# Process subset for demonstration (set to None for full analysis).
# NOTE(review): this scans the FIRST `MAX_SAMPLES` rows. If the dataset is
# stored grouped by accent label, the subset may not contain every class --
# verify, or draw a shuffled subset instead.
MAX_SAMPLES = 1000

train_split = dataset['train']
n_to_scan = len(train_split) if MAX_SAMPLES is None else min(MAX_SAMPLES, len(train_split))

adult_indices = []
child_indices = []
skipped = []  # (index, error) pairs for samples whose age could not be estimated

for idx in tqdm(range(n_to_scan)):
    try:
        age_group = estimate_age_group(train_split[idx]['audio'])
    except Exception as exc:
        # Best effort: a sample that cannot be decoded or analysed is skipped,
        # but recorded so failures are visible. Unlike a bare `except:`, this
        # lets KeyboardInterrupt stop the loop.
        skipped.append((idx, repr(exc)))
        continue

    if age_group == 'adult':
        adult_indices.append(idx)
    else:
        child_indices.append(idx)

print(f"\nâœ… Age groups identified:")
print(f"   Adults: {len(adult_indices)} samples")
print(f"   Children: {len(child_indices)} samples")
if skipped:
    print(f"   Skipped: {len(skipped)} samples (first error: {skipped[0][1]})")

# Create subsets
adult_data = dataset['train'].select(adult_indices)
child_data = dataset['train'].select(child_indices)

print(f"\nðŸ“Š Distribution:")
n_labeled = len(adult_data) + len(child_data)
if n_labeled:
    print(f"   Adult: {len(adult_data)} ({len(adult_data) / n_labeled * 100:.1f}%)")
    print(f"   Child: {len(child_data)} ({len(child_data) / n_labeled * 100:.1f}%)")
else:
    # Every sample failed: avoid dividing by zero and say so explicitly.
    print("   No samples could be assigned an age group.")

## Step 3: Extract Features (MFCC & HuBERT)

In [None]:
import torch
from transformers import HubertModel, Wav2Vec2FeatureExtractor

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load HuBERT: the input pre-processor and the encoder come from the same
# checkpoint. The encoder is moved to `device` and put in evaluation mode
# (eval() returns the module itself, so the calls can be chained).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
hubert = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(device).eval()

# Sampling rate (Hz) that every clip is converted to before feature extraction.
TARGET_SR = 16000


def _prepare_waveform(audio_dict):
    """Return the clip as a peak-normalised float array sampled at TARGET_SR.

    Raises:
        ValueError: If the audio array is empty.
    """
    arr = np.asarray(audio_dict["array"], dtype=float)
    if arr.size == 0:
        raise ValueError("Cannot extract features from an empty audio array")

    sr = audio_dict["sampling_rate"]
    if sr != TARGET_SR:
        arr = librosa.resample(arr, orig_sr=sr, target_sr=TARGET_SR)

    # Scale to a peak amplitude of ~1; the epsilon avoids dividing by zero on
    # an all-silent clip.
    return arr / (np.max(np.abs(arr)) + 1e-9)


def extract_mfcc(audio_dict, n_mfcc=40):
    """Summarise a clip as per-coefficient MFCC statistics.

    Args:
        audio_dict: Mapping with "array" (waveform) and "sampling_rate" (Hz).
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        1-D array of length ``2 * n_mfcc``: the mean of every coefficient
        over time followed by its standard deviation over time.

    Raises:
        ValueError: If the audio array is empty.
    """
    arr = _prepare_waveform(audio_dict)
    mfcc = librosa.feature.mfcc(y=arr, sr=TARGET_SR, n_mfcc=n_mfcc)
    return np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])


def extract_hubert(audio_dict):
    """Summarise a clip as the time-averaged HuBERT last hidden state.

    Uses the module-level `feature_extractor`, `hubert` and `device`.

    Args:
        audio_dict: Mapping with "array" (waveform) and "sampling_rate" (Hz).

    Returns:
        1-D numpy array: the encoder's last hidden state averaged over
        dim 1 (presumably the time axis), for the single clip in the batch.

    Raises:
        ValueError: If the audio array is empty.
    """
    arr = _prepare_waveform(audio_dict)
    inputs = feature_extractor(arr, sampling_rate=TARGET_SR, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = hubert(inputs.input_values.to(device))
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]

    return embedding

print('âœ… Feature extraction functions ready')

In [None]:
# Extract features for adults (training set)
print("Extracting features from ADULT data...")

X_adult_mfcc = []
X_adult_hubert = []
y_adult = []
n_adult_skipped = 0

for item in tqdm(adult_data):
    try:
        # Compute everything for this sample BEFORE appending anything: if
        # one extractor fails after the other succeeded, appending eagerly
        # would leave the three lists with different lengths and shift every
        # later feature row against its label.
        mfcc_vec = extract_mfcc(item['audio'])
        hubert_vec = extract_hubert(item['audio'])
        label = item['label']
    except Exception:
        # Best effort: skip the unusable sample but keep count of it.
        # KeyboardInterrupt is deliberately not caught.
        n_adult_skipped += 1
        continue

    X_adult_mfcc.append(mfcc_vec)
    X_adult_hubert.append(hubert_vec)
    y_adult.append(label)

X_adult_mfcc = np.array(X_adult_mfcc)
X_adult_hubert = np.array(X_adult_hubert)
y_adult = np.array(y_adult)

if n_adult_skipped:
    print(f"Skipped {n_adult_skipped} adult samples that failed feature extraction")
print(f"âœ… Adult features: MFCC {X_adult_mfcc.shape}, HuBERT {X_adult_hubert.shape}")

In [None]:
# Extract features for children (test set)
print("Extracting features from CHILD data...")

X_child_mfcc = []
X_child_hubert = []
y_child = []
n_child_skipped = 0

for item in tqdm(child_data):
    try:
        # Compute everything for this sample BEFORE appending anything: if
        # one extractor fails after the other succeeded, appending eagerly
        # would leave the three lists with different lengths and shift every
        # later feature row against its label.
        mfcc_vec = extract_mfcc(item['audio'])
        hubert_vec = extract_hubert(item['audio'])
        label = item['label']
    except Exception:
        # Best effort: skip the unusable sample but keep count of it.
        # KeyboardInterrupt is deliberately not caught.
        n_child_skipped += 1
        continue

    X_child_mfcc.append(mfcc_vec)
    X_child_hubert.append(hubert_vec)
    y_child.append(label)

X_child_mfcc = np.array(X_child_mfcc)
X_child_hubert = np.array(X_child_hubert)
y_child = np.array(y_child)

if n_child_skipped:
    print(f"Skipped {n_child_skipped} child samples that failed feature extraction")
print(f"âœ… Child features: MFCC {X_child_mfcc.shape}, HuBERT {X_child_hubert.shape}")

## Step 4: Train on Adults, Test on Both

In [None]:
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Split adult data into train/val.
# One call partitions the MFCC matrix, the HuBERT matrix and the labels with
# the same stratified 80/20 split, so the rows of all three stay aligned.
(
    X_train_mfcc, X_val_mfcc,
    X_train_hubert, X_val_hubert,
    y_train, y_val,
) = train_test_split(
    X_adult_mfcc, X_adult_hubert, y_adult,
    test_size=0.2, random_state=42, stratify=y_adult,
)

# Report the size of each partition used in the experiment.
print(f"Training set: {len(X_train_mfcc)} samples")
print(f"Validation set (adult): {len(X_val_mfcc)} samples")
print(f"Test set (child): {len(X_child_mfcc)} samples")

In [None]:
class Classifier(nn.Module):
    """Feed-forward accent classifier over fixed-size feature vectors.

    Two hidden layers (256 and 128 units), each followed by ReLU and
    dropout (p=0.3), then a linear layer producing `num_classes` logits.
    """

    def __init__(self, input_dim, num_classes=6):
        super().__init__()
        stack = []
        fan_in = input_dim
        # Hidden stages are created in order so layer indices inside
        # `self.net` stay 0..6.
        for hidden_units in (256, 128):
            stack.extend([nn.Linear(fan_in, hidden_units), nn.ReLU(), nn.Dropout(0.3)])
            fan_in = hidden_units
        stack.append(nn.Linear(fan_in, num_classes))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class logits for a batch of feature vectors."""
        logits = self.net(x)
        return logits

class FeatureDataset(Dataset):
    """Expose pre-computed features and integer labels as a torch Dataset."""

    def __init__(self, X, y):
        # Convert once up front: features as float32, labels as int64.
        self.X, self.y = (
            torch.tensor(X, dtype=torch.float32),
            torch.tensor(y, dtype=torch.long),
        )

    def __len__(self):
        # One entry per feature row.
        n_rows = len(self.X)
        return n_rows

    def __getitem__(self, i):
        # A (features, label) pair for row `i`.
        pair = (self.X[i], self.y[i])
        return pair

def _accuracy(model, loader):
    """Return the fraction of correct predictions over `loader` (NaN if it yields no samples)."""
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for feats, labels in loader:
            feats, labels = feats.to(device), labels.to(device)
            preds = model(feats).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    # An empty loader has no defined accuracy; NaN avoids a ZeroDivisionError
    # without pretending the accuracy is 0%.
    return correct / total if total else float('nan')


def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, input_dim, model_name,
                       num_classes=6, num_epochs=15, batch_size=64, lr=1e-3):
    """Train a `Classifier` on the training split and score it on val and test.

    The weights of the epoch with the best validation accuracy are restored
    before the test pass, so both reported numbers describe the same model.

    Args:
        X_train, X_val, X_test: Feature matrices for the adult training,
            adult validation and child test sets.
        y_train, y_val, y_test: Matching integer class labels.
        input_dim: Length of one feature vector.
        model_name: Label used in the printed progress and results.
        num_classes: Number of output classes of the classifier.
        num_epochs: Number of training epochs.
        batch_size: Mini-batch size for all three loaders.
        lr: Adam learning rate.

    Returns:
        Tuple ``(best_val_acc, test_acc)``. Either value is NaN when the
        corresponding set is empty.
    """
    print(f"\n{'='*60}")
    print(f"Training {model_name} Model")
    print(f"{'='*60}")

    # Create dataloaders
    train_loader = DataLoader(FeatureDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(FeatureDataset(X_val, y_val), batch_size=batch_size)
    test_loader = DataLoader(FeatureDataset(X_test, y_test), batch_size=batch_size)

    # Initialize model
    model = Classifier(input_dim, num_classes=num_classes).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train
    best_val_acc = 0.0
    best_state = None
    for epoch in range(num_epochs):
        model.train()
        for feats, labels in train_loader:
            feats, labels = feats.to(device), labels.to(device)
            optimizer.zero_grad()
            preds = model(feats)
            loss = loss_fn(preds, labels)
            loss.backward()
            optimizer.step()

        # Validate on adults
        val_acc = _accuracy(model, val_loader)
        if best_state is None or val_acc > best_val_acc:
            best_val_acc = val_acc
            # Snapshot the weights (cloned, so later optimizer steps do not
            # overwrite them) for restoration after training.
            best_state = {name: t.detach().clone() for name, t in model.state_dict().items()}
        print(f"Epoch {epoch+1}/{num_epochs} | Val Acc (Adult): {val_acc:.4f}")

    # Evaluate the same weights that produced `best_val_acc`, not whatever
    # the final epoch happened to leave behind.
    if best_state is not None:
        model.load_state_dict(best_state)

    # Test on children
    if len(test_loader.dataset) == 0:
        print("\nWarning: child test set is empty; child accuracy is undefined (nan).")
    test_acc = _accuracy(model, test_loader)

    drop = best_val_acc - test_acc
    # The relative drop is only defined for a positive adult accuracy.
    relative_drop = f"{drop / best_val_acc * 100:.1f}%" if best_val_acc > 0 else "n/a"

    print(f"\nâœ… {model_name} Results:")
    print(f"   Adult Val Accuracy: {best_val_acc:.4f}")
    print(f"   Child Test Accuracy: {test_acc:.4f}")
    print(f"   Performance Drop: {drop:.4f} ({relative_drop})")

    return best_val_acc, test_acc

print('âœ… Training function ready')

In [None]:
# Train MFCC model
mfcc_adult_acc, mfcc_child_acc = train_and_evaluate(
    X_train_mfcc, X_val_mfcc, X_child_mfcc,
    y_train, y_val, y_child,
    # Take the width from the feature matrix itself so the classifier's input
    # layer always matches what the MFCC extractor actually produced.
    input_dim=X_train_mfcc.shape[1],
    model_name="MFCC"
)

In [None]:
# Train HuBERT model
hubert_adult_acc, hubert_child_acc = train_and_evaluate(
    X_train_hubert, X_val_hubert, X_child_hubert,
    y_train, y_val, y_child,
    # Take the width from the feature matrix itself so the classifier's input
    # layer always matches the embedding size of the loaded HuBERT model.
    input_dim=X_train_hubert.shape[1],
    model_name="HuBERT"
)

## Step 5: Visualize Results

In [None]:
import matplotlib.pyplot as plt

# Comparison plot: one pair of bars (adult vs. child accuracy) per feature type.
models = ['MFCC', 'HuBERT']
adult_accs = [mfcc_adult_acc, hubert_adult_acc]
child_accs = [mfcc_child_acc, hubert_child_acc]

x = np.arange(len(models))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width/2, adult_accs, width, label='Adult (In-Domain)', color='#2ecc71')
bars2 = ax.bar(x + width/2, child_accs, width, label='Child (Out-of-Domain)', color='#e74c3c')

ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Age Generalization: Adult â†’ Child', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Add value labels
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.savefig(f'{PROJECT_DIR}/age_generalization.png', dpi=300)
plt.show()

print("\nðŸ“Š Summary:")
abs_drops = {}  # absolute accuracy drop (adult - child) per feature type
for name, adult_acc, child_acc in zip(models, adult_accs, child_accs):
    abs_drops[name] = adult_acc - child_acc
    print(f"\n{name}:")
    print(f"  Adult: {adult_acc:.4f}")
    print(f"  Child: {child_acc:.4f}")
    if adult_acc > 0:
        print(f"  Drop: {abs_drops[name] / adult_acc * 100:.1f}%")
    else:
        # The relative drop is undefined for a zero (or NaN) adult accuracy.
        print("  Drop: n/a (adult accuracy is zero or undefined)")

# Only declare a winner when one drop is strictly smaller; equal drops and
# NaN comparisons fall through to the neutral message instead of being
# reported as an MFCC win.
if abs_drops['MFCC'] > abs_drops['HuBERT']:
    print("\nâœ… HuBERT generalizes better across age groups!")
elif abs_drops['HuBERT'] > abs_drops['MFCC']:
    print("\nâœ… MFCC generalizes better across age groups!")
else:
    print("\nNo difference in age generalization could be established (equal or undefined accuracy drops).")

## Conclusion

### Expected Findings:
- **Performance Drop**: Both models show accuracy decrease on children
- **HuBERT Advantage**: HuBERT typically shows smaller performance drop
- **Reason**: HuBERT's self-supervised learning captures more robust features

### Typical Results:
- MFCC: 15-25% accuracy drop on children
- HuBERT: 10-15% accuracy drop on children

### Implications:
- Accent patterns differ between adults and children
- Children's speech has different acoustic properties (higher pitch, less stable articulation)
- Deep learned features (HuBERT) are more age-invariant
- Production systems need age-diverse training data