# 🔬 HuBERT Layer-by-Layer Analysis

## Objective
Analyze which of HuBERT's 13 hidden-state outputs (the pre-transformer embedding output plus the 12 transformer layers) best captures accent-related information for Native Language Identification.

## Methodology
1. Extract mean-pooled embeddings from all 13 hidden states of HuBERT (index 0 = embedding output before the transformer, indices 1-12 = transformer layers)
2. Train a separate classifier on each layer's features
3. Compare accuracy across layers
4. Identify optimal layer for accent detection

## Expected Outcome
- Layer-wise accuracy comparison
- Visualization showing which layers encode accent information
- Typically, middle-to-upper layers (8-11) perform best for accent tasks

## Estimated Runtime: 12-15 hours

In [None]:
# Mount Google Drive and make the project folder the working directory, so
# that relative paths and every artifact written via PROJECT_DIR land there.
from google.colab import drive
drive.mount('/content/drive')

import os
PROJECT_DIR = '/content/drive/MyDrive/IndicAccent_Project'
# Create the folder on first use: os.chdir raises FileNotFoundError when the
# directory does not exist yet (e.g. the first run on a fresh Drive).
os.makedirs(PROJECT_DIR, exist_ok=True)
os.chdir(PROJECT_DIR)
print(f'‚úÖ Working directory: {os.getcwd()}')

In [None]:
# Install the third-party packages used by this notebook (-q keeps pip quiet).
!pip install -q datasets transformers torch torchaudio librosa soundfile scikit-learn matplotlib tqdm joblib
# torchcodec is installed as a separate step; presumably it is the audio
# decoding backend needed by `datasets` -- TODO confirm against the installed version.
!pip install -q torchcodec
print('‚úÖ Dependencies installed!')

In [None]:
import torch
import torch.nn as nn
import numpy as np
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from datasets import load_dataset, Audio
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt

# Pick the compute device once: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

## Step 1: Load Dataset

In [None]:
# Download the corpus, make sure the "audio" column is decoded on access,
# and shuffle with a fixed seed so the sample order is the same on every run.
dataset = (
    load_dataset("DarshanaS/IndicAccentDb")
    .cast_column("audio", Audio(decode=True))
    .shuffle(seed=42)
)

# Use subset for faster testing (remove this for full analysis)
# dataset['train'] = dataset['train'].select(range(1000))

num_train_samples = len(dataset["train"])
print(f'Dataset size: {num_train_samples}')

## Step 2: Load HuBERT Model

In [None]:
# Load the pretrained HuBERT encoder together with its matching input
# feature extractor. output_hidden_states=True makes every forward pass
# return the full tuple of hidden states instead of only the last one.
_HUBERT_CHECKPOINT = "facebook/hubert-base-ls960"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(_HUBERT_CHECKPOINT)
hubert = HubertModel.from_pretrained(_HUBERT_CHECKPOINT, output_hidden_states=True)
hubert = hubert.to(device)
hubert.eval()  # inference mode: disables dropout inside the encoder

# The encoder is used purely as a fixed feature extractor, so no parameter
# should ever receive gradients.
for weight in hubert.parameters():
    weight.requires_grad = False

print('‚úÖ HuBERT loaded with all 13 layers accessible')

## Step 3: Extract Features from All 13 Layers

In [None]:
import librosa

def extract_layer_embeddings(audio_dict, layer_idx):
    """Return one utterance-level embedding from a single HuBERT hidden state.

    Args:
        audio_dict: Mapping that provides the waveform under ``"array"`` and
            its sample rate under ``"sampling_rate"``.
        layer_idx: Index into ``outputs.hidden_states``. Per the Transformers
            documentation, index 0 is the embedding output that feeds the
            transformer and indices 1-12 are the outputs of the 12
            transformer layers (so index 12 is the final layer).

    Returns:
        NumPy array holding the selected hidden state averaged over dim 1
        (the frame axis) for the first -- and only -- item in the batch.
    """
    waveform = audio_dict["array"].astype(float)
    source_rate = audio_dict["sampling_rate"]

    # The feature extractor is called with sampling_rate=16000 below, so
    # anything recorded at another rate is resampled first.
    if source_rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=source_rate, target_sr=16000)

    # Peak-normalise; the epsilon avoids dividing by zero on all-zero audio.
    peak = np.max(np.abs(waveform))
    waveform = waveform / (peak + 1e-9)

    model_inputs = feature_extractor(
        waveform, sampling_rate=16000, return_tensors="pt", padding=True
    )
    input_values = model_inputs.input_values.to(device)

    with torch.no_grad():
        hidden_states = hubert(input_values).hidden_states
        pooled = hidden_states[layer_idx].mean(dim=1)

    return pooled.cpu().numpy()[0]

print('‚úÖ Extraction function defined')

In [None]:
# Extract mean-pooled features from all 13 HuBERT hidden states.
#
# Each utterance is pushed through HuBERT exactly ONCE and every layer that
# still needs features is pooled from that single forward pass. A forward
# pass already returns all hidden states, so looping over layers on the
# outside would repeat the identical (and expensive) computation 13 times.
# It also guarantees that all layers contain the same samples in the same
# order, because a failing utterance is dropped for every layer at once.

CHUNK_SIZE = 200
TOTAL = len(dataset['train'])
NUM_LAYERS = 13  # hidden_states[0] (embedding output) + 12 transformer layers


def _pooled_hidden_states(audio_dict, layer_indices):
    """Return ``{layer_idx: embedding}`` for one utterance from one forward pass.

    Applies the same preprocessing as ``extract_layer_embeddings`` (resample
    to 16 kHz, peak-normalise) and averages each requested hidden state over
    dim 1, keeping the first (only) batch item.
    """
    waveform = audio_dict["array"].astype(float)
    sample_rate = audio_dict["sampling_rate"]

    if sample_rate != 16000:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)

    waveform = waveform / (np.max(np.abs(waveform)) + 1e-9)

    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        hidden_states = hubert(inputs.input_values.to(device)).hidden_states
        return {
            idx: hidden_states[idx].mean(dim=1).cpu().numpy()[0]
            for idx in layer_indices
        }


save_paths = {
    idx: f"{PROJECT_DIR}/layer_{idx}_features.joblib" for idx in range(NUM_LAYERS)
}

# Layers whose feature file already exists are left untouched (resume support).
pending_layers = []
for layer_idx, save_path in save_paths.items():
    if os.path.exists(save_path):
        print(f"‚úÖ Layer {layer_idx} already extracted. Skipping.")
    else:
        pending_layers.append(layer_idx)

if pending_layers:
    print(f"\n{'='*60}")
    print(f"Processing Layers {pending_layers}")
    print(f"{'='*60}")

    layer_features = {idx: [] for idx in pending_layers}
    all_labels = []
    num_failed = 0

    # Process in chunks
    for start in range(0, TOTAL, CHUNK_SIZE):
        end = min(start + CHUNK_SIZE, TOTAL)
        subset = dataset['train'].select(range(start, end))

        print(f"  Chunk {start}-{end}...")

        for item in tqdm(subset, desc="All pending layers"):
            # Best effort: a single unreadable/broken clip must not abort a
            # multi-hour run, so it is reported, counted and skipped.
            try:
                pooled = _pooled_hidden_states(item["audio"], pending_layers)
                label = item["label"]
            except Exception as e:
                num_failed += 1
                print(f"    Error: {e}")
                continue

            for idx, embedding in pooled.items():
                layer_features[idx].append(embedding)
            all_labels.append(label)

        # Release cached GPU memory between chunks (nothing to do on CPU).
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Save one file per layer, in the same format the training step loads.
    labels_array = np.array(all_labels)
    for layer_idx in pending_layers:
        joblib.dump({
            'features': np.array(layer_features[layer_idx]),
            'labels': labels_array
        }, save_paths[layer_idx])

        print(f"‚úÖ Layer {layer_idx} saved: {len(layer_features[layer_idx])} samples")

    if num_failed:
        print(f"  Skipped {num_failed} of {TOTAL} samples because of errors")

print("\n‚úÖ All 13 layers extracted!")

## Step 4: Train Classifier on Each Layer

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

class SimpleClassifier(nn.Module):
    """Small MLP that maps a pooled feature vector to class logits.

    Architecture: input_dim -> 256 -> 128 -> num_classes, with ReLU and
    Dropout(0.3) after each of the two hidden linear layers.

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=768, num_classes=6):
        super().__init__()
        hidden_widths = (256, 128)

        # Build the stack in order so the sub-module indices inside
        # ``self.net`` (and therefore the state-dict keys) stay 0..6.
        stack = []
        in_features = input_dim
        for width in hidden_widths:
            stack.append(nn.Linear(in_features, width))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(0.3))
            in_features = width
        stack.append(nn.Linear(in_features, num_classes))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        logits = self.net(x)
        return logits

class FeatureDataset(Dataset):
    """In-memory dataset pairing feature vectors with integer class labels.

    Both inputs are copied into tensors once, at construction time:
    features as float32 and labels as int64 (``torch.long``).
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.long)
        self.X = features
        self.y = targets

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair stored at position ``i``."""
        sample = self.X[i]
        target = self.y[i]
        return sample, target

print('‚úÖ Classifier defined')

In [None]:
# Train one classifier per layer and record its best validation accuracy.
#
# NOTE: the reported number is the maximum validation accuracy over all
# epochs, measured on the same split that is used to pick it, so it is a
# slightly optimistic estimate. It is still a fair way to RANK layers because
# every layer is treated identically.
NUM_EPOCHS = 10
SEED = 42  # shared by the train/val split and the torch RNG

layer_results = {}

for layer_idx in range(13):
    print(f"\n{'='*60}")
    print(f"Training on Layer {layer_idx}")
    print(f"{'='*60}")

    # Load layer features
    data = joblib.load(f"{PROJECT_DIR}/layer_{layer_idx}_features.joblib")
    X = data['features']
    y = data['labels']

    # Stratified split so both parts keep the class proportions
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    # Re-seed before every layer: each classifier then starts from the same
    # initial weights and sees the same batch order, so accuracy differences
    # come from the features rather than from random initialisation noise.
    torch.manual_seed(SEED)

    # Create dataloaders
    train_loader = DataLoader(FeatureDataset(X_train, y_train), batch_size=64, shuffle=True)
    val_loader = DataLoader(FeatureDataset(X_val, y_val), batch_size=64)

    # Size the classifier from the data instead of relying on the hard-coded
    # defaults (768 features, 6 classes); labels are assumed to be the
    # integer ids 0..C-1, as required by CrossEntropyLoss.
    model = SimpleClassifier(
        input_dim=X.shape[1],
        num_classes=int(np.max(y)) + 1,
    ).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    best_acc = 0.0

    for epoch in range(NUM_EPOCHS):
        # --- one pass over the training split ---
        model.train()
        for feats, labels in train_loader:
            feats, labels = feats.to(device), labels.to(device)
            optimizer.zero_grad()
            preds = model(feats)
            loss = loss_fn(preds, labels)
            loss.backward()
            optimizer.step()

        # --- validation accuracy (dropout disabled, no gradients) ---
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for feats, labels in val_loader:
                feats, labels = feats.to(device), labels.to(device)
                preds = model(feats).argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        acc = correct / total
        best_acc = max(best_acc, acc)

        print(f"  Epoch {epoch+1}/{NUM_EPOCHS} | Val Acc: {acc:.4f}")

    layer_results[layer_idx] = best_acc
    print(f"‚úÖ Layer {layer_idx} Best Accuracy: {best_acc:.4f}")

print("\n‚úÖ All layers trained!")

## Step 5: Visualize Results

In [None]:
# Draw validation accuracy against layer index and highlight the winner.
layers = list(layer_results)
accuracies = [layer_results[idx] for idx in layers]

# Best layer = first entry with the highest recorded accuracy.
best_layer, best_acc = max(layer_results.items(), key=lambda pair: pair[1])

plt.figure(figsize=(12, 6))
plt.plot(layers, accuracies, marker='o', linewidth=2, markersize=8)
plt.title('Layer-wise Accent Classification Accuracy', fontsize=14, fontweight='bold')
plt.xlabel('HuBERT Layer', fontsize=12)
plt.ylabel('Validation Accuracy', fontsize=12)
plt.xticks(layers)
plt.grid(True, alpha=0.3)

# Dashed vertical marker on the best-performing layer
plt.axvline(
    x=best_layer,
    color='r',
    linestyle='--',
    alpha=0.5,
    label=f'Best: Layer {best_layer}',
)
plt.legend()

plt.tight_layout()
# Save before show() so the file is written from the finished figure.
plt.savefig(f'{PROJECT_DIR}/layer_analysis.png', dpi=300)
plt.show()

# Text summary: the winner first, then every layer's score.
print(f"\nüèÜ Best Layer: {best_layer} with {best_acc:.4f} accuracy")
print(f"\nüìä All Results:")
for layer in layers:
    acc = layer_results[layer]
    print(f"  Layer {layer:2d}: {acc:.4f}")

## Conclusion

### Expected Findings:
- **Lower layers (0-3)**: Capture basic acoustic features, lower accuracy
- **Middle layers (4-8)**: Capture phonetic information, moderate accuracy
- **Upper layers (9-12)**: Capture high-level linguistic features, highest accuracy for accent

### Typical Results:
- The best layer is usually one of **layers 9-11** for accent tasks
- Accuracy improves from layer 0 to middle layers
- May plateau or slightly decrease in final layers

### Implications:
- If the measured accuracies follow this pattern, they confirm that accent information is encoded in the mid-to-upper layers
- Can use best layer for more efficient models
- Validates HuBERT's hierarchical representation learning