In [None]:
import torch
import os
import pandas as pd
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification,DataCollatorWithPadding, TrainingArguments, Trainer
import gc
import numpy as np
import psutil
from pathlib import Path
import warnings
import json
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')



ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
MODEL_NAME = "facebook/wav2vec2-base"
NUM_LABELS = 8  # Change this if needed

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🖥️ Using device: {device}")

# ===============================
# 2. Load Processor and Model
# ===============================
# NOTE(review): Wav2Vec2Processor bundles a feature extractor and a tokenizer;
# confirm that this checkpoint ships tokenizer files, otherwise this call may
# fail and a feature-extractor-only loader may be needed instead.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# Sequence-classification head with NUM_LABELS outputs on top of the
# pretrained encoder, configured for single-label classification.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="single_label_classification",
    num_labels=NUM_LABELS,
)
model = model.to(device)

Original DataFrame:
  Emotions                                               Path
0  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
1  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
2  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
3  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
4     calm  C:\Users\asus\Desktop\Speech sentiment\speechS...

Column names: ['Emotions', 'Path']

DataFrame shape before cleaning: (12161, 2)
DataFrame shape after removing nulls: (12161, 2)
Removed 0 rows with non-existent files

Found 8 unique labels:
  angry: 0
  calm: 1
  disgust: 2
  fear: 3
  happy: 4
  neutral: 5
  sad: 6
  surprise: 7

DataFrame after adding label_id:
                                                path    label  label_id
0  C:\Users\asus\Desktop\Speech sentiment\speechS...  neutral         5
1  C:\Users\asus\Desktop\Speech sentiment\speechS...  neutral         5
2  C:\Users\asus\Desktop\Speech sentiment\speechS...  neutral         5
3 

In [None]:
# Directory holding the pre-processed parquet splits. The location can be
# overridden with the SPEECH_DATA_DIR environment variable so the notebook is
# not tied to one machine's absolute path; the default is the original path.
DATA_DIR = Path(
    os.environ.get(
        "SPEECH_DATA_DIR",
        "/teamspace/studios/this_studio/speechSentimentAnalysis/processed_data",
    )
)

# Load parquet files
train_df = pd.read_parquet(DATA_DIR / "train_processed.parquet")
valid_df = pd.read_parquet(DATA_DIR / "valid_processed.parquet")
test_df  = pd.read_parquet(DATA_DIR / "test_processed.parquet")


In [None]:
# Wrap each pandas split in a Hugging Face Dataset, keyed by split name.
datasets = {
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ("train", train_df),
        ("valid", valid_df),
        ("test", test_df),
    )
}

# Per-split names used by the cells below; they refer to the same objects
# that are stored in `datasets`.
train_dataset = datasets["train"]
valid_dataset = datasets["valid"]
test_dataset = datasets["test"]


In [None]:
from typing import List, Dict, Any

# Step 1: Preprocessing function - DON'T pad here, just truncate
def preprocess_example(example, max_len=16000 * 10):
    """Convert one example's audio to float32 and cap its length.

    Padding is deliberately not done here; examples keep their own length
    (up to ``max_len``) so padding can be applied per batch later.

    Args:
        example: Mapping with an ``"input_values"`` entry holding the audio
            samples as a sequence of numbers.
        max_len: Maximum number of samples to keep. The default,
            ``16000 * 10``, is the original "10 seconds" limit. ``None``
            disables truncation.

    Returns:
        A dict ``{"input_values": <float32 numpy array>}`` containing at most
        ``max_len`` samples.
    """
    waveform = np.array(example["input_values"], dtype=np.float32)
    # Slicing is a no-op for inputs that are already short enough, so no
    # explicit length check is needed.
    return {"input_values": waveform[:max_len]}

# Step 2: Define memory-safe data collator - padding happens here during batch creation
class DataCollator:
    """Collate variable-length audio examples into one zero-padded batch.

    Every example is right-padded with zeros to the length of the longest
    example in the same batch, so short batches stay small.

    NOTE(review): no attention mask is produced for the padded positions;
    confirm that this is what the model checkpoint in use expects.
    """

    def __init__(self, processor):
        # Kept on the instance for callers; __call__ itself does not use it.
        self.processor = processor

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of examples.

        Args:
            features: Examples, each with an ``"input_values"`` 1-D sequence
                of audio samples and an integer ``"labels"`` entry.

        Returns:
            ``{"input_values": float32 tensor (batch, longest),
            "labels": int64 tensor (batch,)}``.

        Raises:
            ValueError: If ``features`` is empty.
        """
        if not features:
            raise ValueError("DataCollator received an empty batch")

        # as_tensor avoids an extra copy when the input is already an
        # array/tensor of the right dtype.
        waveforms = [
            torch.as_tensor(feature["input_values"], dtype=torch.float32)
            for feature in features
        ]
        labels = torch.tensor(
            [feature["labels"] for feature in features], dtype=torch.long
        )

        # Right-pad with zeros to the longest waveform and stack to (batch, time).
        batch_inputs = torch.nn.utils.rnn.pad_sequence(
            waveforms, batch_first=True, padding_value=0.0
        )
        return {"input_values": batch_inputs, "labels": labels}

collator = DataCollator(processor)


def _preprocess_split(split_dataset, split_name):
    """Apply preprocess_example to one split and return the mapped dataset.

    Prints a start and a completion message around the work, and runs the
    garbage collector once the mapping has finished.
    """
    print(f"Preprocessing {split_name}_dataset...")

    # Process example by example, bypass the on-disk cache, and flush to disk
    # every 100 rows (write to disk more frequently).
    processed = split_dataset.map(
        preprocess_example,
        batched=False,
        desc=f"Processing {split_name}",
        load_from_cache_file=False,
        writer_batch_size=100,
    )

    # Clear memory
    gc.collect()
    print(f"✅ Done with {split_name}_dataset")
    return processed


# %%
# The names are rebound to the mapped datasets; the `datasets` dict built
# earlier still refers to the datasets as they were before this step.
train_dataset = _preprocess_split(train_dataset, "train")


# %%
valid_dataset = _preprocess_split(valid_dataset, "valid")

test_dataset = _preprocess_split(test_dataset, "test")



SPLITTING DATASET
Train dataset size: 9728
Validation dataset size: 1216
Test dataset size: 1217

PREPROCESSING DATASETS
Processing training dataset...
Processing train example 0/9728
Processing train example 100/9728
Processing train example 200/9728
Processing train example 300/9728
Processing train example 400/9728
Processing train example 500/9728
Processing train example 600/9728
Processing train example 700/9728
Processing train example 800/9728
Processing train example 900/9728
Processing train example 1000/9728
Processing train example 1100/9728
Processing train example 1200/9728
Processing train example 1300/9728
Processing train example 1400/9728
Processing train example 1500/9728
Processing train example 1600/9728
Processing train example 1700/9728
Processing train example 1800/9728
Processing train example 1900/9728
Processing train example 2000/9728
Processing train example 2100/9728
Processing train example 2200/9728
Processing train example 2300/9728
Processing train ex

In [None]:
from transformers import TrainingArguments, Trainer

# Metric
def compute_metrics(pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example, or a tuple whose first element is that array) and
            ``label_ids`` (the true class ids).

    Returns:
        ``{"accuracy": <float in [0, 1]>}``.
    """
    scores = pred.predictions
    # When the model returns extra outputs, predictions arrives as a tuple;
    # presumably the class scores come first — verify for other models.
    if isinstance(scores, tuple):
        scores = scores[0]

    predicted_ids = np.argmax(scores, axis=-1)
    # Cast to a built-in float so the value serialises cleanly in logs/JSON.
    return {"accuracy": float((predicted_ids == pred.label_ids).mean())}

# Training args
# Evaluation and checkpointing both happen once per epoch so that the
# checkpoint with the highest validation accuracy can be reloaded at the end.
training_args = TrainingArguments(
    output_dir="./wav2vec2-emotion",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # --- evaluation / checkpoints ---
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # --- logging / data loading ---
    logging_steps=10,
    dataloader_num_workers=0,  # Reduce memory overhead
    # Keep every dataset column so the custom collator receives the raw
    # "input_values" and "labels" fields it reads.
    remove_unused_columns=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train
trainer.train()

Processing validation dataset...
Processing valid example 0/1216
Processing valid example 50/1216
Processing valid example 100/1216
Processing valid example 150/1216
Processing valid example 200/1216
Processing valid example 250/1216
Processing valid example 300/1216
Processing valid example 350/1216
Processing valid example 400/1216
Processing valid example 450/1216
Processing valid example 500/1216
Processing valid example 550/1216
Processing valid example 600/1216
Processing valid example 650/1216
Processing valid example 700/1216
Processing valid example 750/1216
Processing valid example 800/1216
Processing valid example 850/1216
Processing valid example 900/1216
Processing valid example 950/1216
Processing valid example 1000/1216
Processing valid example 1050/1216
Processing valid example 1100/1216
Processing valid example 1150/1216
Processing valid example 1200/1216
Successfully processed 1216/1216 validation examples
Processing test dataset...
Processing test example 0/1217
Proc