In [1]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
import gc
from pathlib import Path
import warnings
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
import os
warnings.filterwarnings('ignore')


Fixing label index

In [None]:
DATA_DIR = Path("/teamspace/studios/this_studio/speechSentimentAnalysis/processed_data")

# Label mapping
label_map = {
    0: 0,  # angry
    1: 1,  # calm
    2: 2,  # disgust
    3: 3,  # fear
    4: 4,  # happy
    6: 5,  # sad → 5
    7: 6   # surprise → 6
}

print("=== MEMORY-EFFICIENT LABEL MAPPING ===")
print(f"Label mapping: {label_map}")

# Process each dataset separately to minimize memory usage
datasets = ["train", "valid", "test"]
for dataset_name in datasets:
    print(f"\n--- Processing {dataset_name} dataset ---")
    
    # Load only one dataset at a time
    file_path = DATA_DIR / f"{dataset_name}_processed.parquet"
    print(f"Loading {file_path}...")
    df = pd.read_parquet(file_path)
    
    # Show original stats
    print(f"Original size: {len(df)} samples")
    print(f"Original labels range: {df['labels'].min()} to {df['labels'].max()}")
    
    # Apply label mapping
    original_labels = df['labels'].copy()
    df['labels'] = df['labels'].map(label_map)
    
    # Check for unmapped labels
    unmapped_mask = df['labels'].isna()
    if unmapped_mask.any():
        print(f"Warning: {unmapped_mask.sum()} unmapped labels found")
        print(f"Unmapped values: {original_labels[unmapped_mask].unique()}")
    
    # Remove unmapped rows and convert to int
    df = df.dropna(subset=['labels']).copy()
    df['labels'] = df['labels'].astype(int)
    
    # Show final stats
    print(f"Final size: {len(df)} samples")
    print(f"Final labels range: {df['labels'].min()} to {df['labels'].max()}")
    print(f"Label distribution: {dict(df['labels'].value_counts().sort_index())}")
    
    # Save back to file
    print(f"Saving {file_path}...")
    df.to_parquet(file_path, index=False)
    
    # Clean up memory
    del df, original_labels
    gc.collect()
    
    print(f"✓ {dataset_name} dataset processed and saved")

print("\n=== VERIFICATION ===")
# Quick verification without loading full datasets
for dataset_name in datasets:
    file_path = DATA_DIR / f"{dataset_name}_processed.parquet"
    # Read only labels column for verification
    labels = pd.read_parquet(file_path, columns=['labels'])['labels']
    print(f"{dataset_name}: {len(labels)} samples, labels range {labels.min()}-{labels.max()}")
    del labels
    gc.collect()

print("\nAll datasets processed successfully!")


=== MEMORY-EFFICIENT LABEL MAPPING ===
Label mapping: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 6: 5, 7: 6}

--- Processing train dataset ---
Loading /teamspace/studios/this_studio/speechSentimentAnalysis/processed_data/train_processed.parquet...


Original size: 9672 samples
Original labels range: 0 to 6
Unmapped values: [5]
Final size: 8138 samples
Final labels range: 0 to 5
Label distribution: {0: 1538, 1: 1515, 2: 1537, 3: 1537, 4: 1538, 5: 473}
Saving /teamspace/studios/this_studio/speechSentimentAnalysis/processed_data/train_processed.parquet...
✓ train dataset processed and saved

--- Processing valid dataset ---
Loading /teamspace/studios/this_studio/speechSentimentAnalysis/processed_data/valid_processed.parquet...
Original size: 1208 samples
Original labels range: 0 to 6
Unmapped values: [5]
Final size: 1016 samples
Final labels range: 0 to 5
Label distribution: {0: 191, 1: 189, 2: 192, 3: 192, 4: 193, 5: 59}
Saving /teamspace/studios/this_studio/speechSentimentAnalysis/processed_data/valid_processed.parquet...
✓ valid dataset processed and saved

--- Processing test dataset ---
Loading /teamspace/studios/this_studio/speechSentimentAnalysis/processed_data/test_processed.parquet...
Original size: 1208 samples
Original lab

In [4]:
DATA_DIR = Path("/teamspace/studios/this_studio/speechSentimentAnalysis/processed_data")

# Load parquet files
train_df = pd.read_parquet(DATA_DIR / "train_processed.parquet")
valid_df = pd.read_parquet(DATA_DIR / "valid_processed.parquet")
test_df  = pd.read_parquet(DATA_DIR / "test_processed.parquet")


In [5]:
# DEBUGGING: Check label distributions
print("=== DEBUGGING LABEL DISTRIBUTIONS ===")
print(f"Train labels: min={train_df['labels'].min()}, max={train_df['labels'].max()}")
print(f"Valid labels: min={valid_df['labels'].min()}, max={valid_df['labels'].max()}")
print(f"Test labels: min={test_df['labels'].min()}, max={test_df['labels'].max()}")

print("\nTrain label distribution:")
print(train_df['labels'].value_counts().sort_index())
print("\nValid label distribution:")
print(valid_df['labels'].value_counts().sort_index())
print("\nTest label distribution:")
print(test_df['labels'].value_counts().sort_index())

# Check for any NaN or invalid values
print(f"\nNaN values in train labels: {train_df['labels'].isna().sum()}")
print(f"NaN values in valid labels: {valid_df['labels'].isna().sum()}")
print(f"NaN values in test labels: {test_df['labels'].isna().sum()}")


=== DEBUGGING LABEL DISTRIBUTIONS ===
Train labels: min=0, max=5
Valid labels: min=0, max=5
Test labels: min=0, max=5

Train label distribution:
labels
0    1538
1    1515
2    1537
3    1537
4    1538
5     473
Name: count, dtype: int64

Valid label distribution:
labels
0    191
1    189
2    192
3    192
4    193
5     59
Name: count, dtype: int64

Test label distribution:
labels
0    192
1    188
2    192
3    193
4    192
5     59
Name: count, dtype: int64

NaN values in train labels: 0
NaN values in valid labels: 0
NaN values in test labels: 0


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Using device: {device}")

MODEL_NAME = "facebook/wav2vec2-large-960h-lv60-self"
NUM_LABELS = 7 

processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="single_label_classification"
).to(device)

🖥️ Using device: cuda


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from datasets import Dataset

def create_generator(df):
    for _, row in df.iterrows():
        yield row.to_dict()

train_dataset = Dataset.from_generator(lambda: create_generator(train_df))
valid_dataset = Dataset.from_generator(lambda: create_generator(valid_df))
test_dataset = Dataset.from_generator(lambda: create_generator(test_df))

datasets = {
    "train": train_dataset,
    "valid": valid_dataset,
    "test": test_dataset
}

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
"""
print("🔍 Checking test set labels...")
test_labels = [x["labels"] for x in train_dataset]
min_label = min(test_labels)
max_label = max(test_labels)
print(f"Test labels range from {min_label} to {max_label}")
assert 0 <= min_label and max_label <= 6, "❌ Label values out of expected range [0–6]!"
"""

'\nprint("🔍 Checking test set labels...")\ntest_labels = [x["labels"] for x in train_dataset]\nmin_label = min(test_labels)\nmax_label = max(test_labels)\nprint(f"Test labels range from {min_label} to {max_label}")\nassert 0 <= min_label and max_label <= 6, "❌ Label values out of expected range [0–6]!"\n'

In [9]:
def preprocess_example(example):
    audio = example["input_values"]
    audio = np.array(audio, dtype=np.float32)
    
    max_len = 16000 * 10  # 10 seconds
    if len(audio) > max_len:
        audio = audio[:max_len]
    
    # Return with the exact field names expected by Wav2Vec2
    return {
        "input_values": audio,
        "labels": example["labels"]
    }

# MODIFIED: Completely new data collator that handles the input format correctly
def collate_fn(batch):
    # Extract arrays
    input_values = [item["input_values"] for item in batch]
    labels = [item["labels"] for item in batch]
    
    # Convert to tensors and pad
    input_values = [torch.tensor(arr, dtype=torch.float32) for arr in input_values]
    max_len = max(len(arr) for arr in input_values)
    
    # Pad sequences
    padded_inputs = []
    for arr in input_values:
        if len(arr) < max_len:
            padded = torch.cat([arr, torch.zeros(max_len - len(arr))])
        else:
            padded = arr
        padded_inputs.append(padded)
    
    # Stack and return
    return {
        "input_values": torch.stack(padded_inputs),
        "labels": torch.tensor(labels, dtype=torch.long)
    }

# Apply preprocessing
print("Preprocessing datasets...")
train_dataset = train_dataset.map(preprocess_example, batched=False, load_from_cache_file=False)
valid_dataset = valid_dataset.map(preprocess_example, batched=False, load_from_cache_file=False)
test_dataset = test_dataset.map(preprocess_example, batched=False, load_from_cache_file=False)
gc.collect()

Preprocessing datasets...


Map:   0%|          | 0/8138 [00:00<?, ? examples/s]

KeyboardInterrupt: 

### BASELINE EVALUATION

In [8]:

"""
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Compute accuracy, precision, recall, and F1-score
    """
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    
    # Calculate macro metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def print_results(results):
    """
    """
    print("\nEvaluation Results:")
    print("-" * 40)
    
    # Get metrics with or without eval_ prefix
    accuracy = results.get('eval_accuracy', results.get('accuracy'))
    precision = results.get('eval_precision', results.get('precision'))
    recall = results.get('eval_recall', results.get('recall'))
    f1 = results.get('eval_f1', results.get('f1'))
    
    if accuracy is not None:
        print(f"Accuracy:  {accuracy:.4f}")
    if precision is not None:
        print(f"Precision: {precision:.4f}")
    if recall is not None:
        print(f"Recall:    {recall:.4f}")
    if f1 is not None:
        print(f"F1-Score:  {f1:.4f}")
    
    print("-" * 40)

# Training arguments for evaluation
eval_args = TrainingArguments(
    output_dir="./eval_results",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    remove_unused_columns=False,
    report_to=None
)

# Create trainer
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=test_dataset,
    data_collator=collator,
    compute_metrics=compute_metrics
)

results = trainer.evaluate()

print_results(results)

import json
with open("results.json", "w") as f:
    json.dump(results, f, indent=2)

print("Results saved to results.json")
"""

IndentationError: unexpected indent (3934799266.py, line 8)

###  FINE TUNNING WAVE2VEC2


In [None]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=[
        "q_proj",
        "k_proj", 
        "v_proj",
        "out_proj",
        "intermediate_dense",
        "output_dense"
    ]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,540,743 || all params: 319,243,662 || trainable%: 1.1091


In [24]:
model1 = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="single_label_classification"
).to(device)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

# Training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-emotion",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
)

# MODIFIED: Use standard Trainer without PEFT
trainer = Trainer(
    model=model1,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# Start training
print("Starting training...")
trainer.train()

# Save model
print("Saving model...")
model1.save_pretrained("./wav2vec2_finetuned")
print("Training completed!")

# Test the model
print("Testing model...")
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")

Starting training...


Epoch,Training Loss,Validation Loss


/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
