In [2]:
import torch
import os
import pandas as pd
import torchaudio
from datasets import load_dataset, Dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, DataCollatorWithPadding


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [19]:

# Load the CSV file
df = pd.read_csv(r"C:\Users\asus\Desktop\SpeechSentemintAnalysis\data\afterReadingDataSet.csv")
print("Original DataFrame:")
print(df.head())
print("\nColumn names:", df.columns.tolist())

# Fix column names
df = df.rename(columns={'Emotions': 'label', 'Path': 'path'})

# Remove rows with null path or label
print(f"\nDataFrame shape before cleaning: {df.shape}")
df = df.dropna(subset=["path", "label"])
print(f"DataFrame shape after removing nulls: {df.shape}")

# Optional: Remove rows where path file doesn't exist
df_before_file_check = len(df)
df = df[df["path"].apply(os.path.exists)]
print(f"Removed {df_before_file_check - len(df)} rows with non-existent files")

# Map emotion labels to integers
label_list = sorted(df["label"].unique())
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"\nFound {len(label_list)} unique labels:")
for label, idx in label2id.items():
    print(f"  {label}: {idx}")

# Add label_id column
df["label_id"] = df["label"].map(label2id)

print(f"\nDataFrame after adding label_id:")
print(df[["path", "label", "label_id"]].head())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
print(f"\nDataset created successfully!")
print(f"Dataset info: {dataset}")

# Load processor
try:
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    print("Processor loaded successfully!")
except Exception as e:
    print(f"Error loading processor: {e}")
    print("Make sure transformers is installed: pip install transformers")

print(f"\nFinal dataset shape: {len(dataset)} samples")
print(f"Features: {dataset.features}")


Original DataFrame:
  Emotions                                               Path
0  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
1  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
2  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
3  neutral  C:\Users\asus\Desktop\Speech sentiment\speechS...
4     calm  C:\Users\asus\Desktop\Speech sentiment\speechS...

Column names: ['Emotions', 'Path']

DataFrame shape before cleaning: (12161, 2)
DataFrame shape after removing nulls: (12161, 2)
Removed 0 rows with non-existent files

Found 8 unique labels:
  angry: 0
  calm: 1
  disgust: 2
  fear: 3
  happy: 4
  neutral: 5
  sad: 6
  surprise: 7

DataFrame after adding label_id:
                                                path    label  label_id
0  C:\Users\asus\Desktop\Speech sentiment\speechS...  neutral         5
1  C:\Users\asus\Desktop\Speech sentiment\speechS...  neutral         5
2  C:\Users\asus\Desktop\Speech sentiment\speechS...  neutral         5
3 

In [20]:

# Set target sample rate
target_sampling_rate = 16000

def preprocess(example):
    """
    Preprocess audio files and return the processed example
    """
    try:
        # Load audio file
        speech_array, sampling_rate = torchaudio.load(example['path'])
        
        # Resample if necessary
        if sampling_rate != target_sampling_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sampling_rate, 
                new_freq=target_sampling_rate
            )
            speech_array = resampler(speech_array)
        
        # Convert to mono if stereo
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)
        
        # Process with Wav2Vec2 processor
        inputs = processor(
            speech_array.squeeze().numpy(), 
            sampling_rate=target_sampling_rate, 
            return_tensors="pt"
        )
        
        # Extract input_values and convert to list for Arrow compatibility
        input_values = inputs["input_values"].squeeze().tolist()
        
        # Update example with processed data
        example["input_values"] = input_values
        example["labels"] = example["label_id"]  # Use 'labels' for consistency with transformers
        
        return example
        
    except Exception as e:
        print(f"Error processing {example['path']}: {e}")
        # Return None to indicate this example should be filtered out
        return None

def filter_failed_examples(example):
    """Filter function to remove None examples"""
    return example is not None


In [21]:

# Split dataset BEFORE preprocessing to avoid issues
print("\n" + "="*50)
print("SPLITTING DATASET")
print("="*50)

train_testvalid = dataset.train_test_split(test_size=0.2, seed=42)
valid_test = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
valid_dataset = valid_test['train']
test_dataset = valid_test['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Preprocess datasets
print("\n" + "="*50)
print("PREPROCESSING DATASETS")
print("="*50)

# Process train dataset
print("Processing training dataset...")
train_processed = []
for i, example in enumerate(train_dataset):
    if i % 100 == 0:
        print(f"Processing train example {i}/{len(train_dataset)}")
    
    processed = preprocess(example)
    if processed is not None:
        train_processed.append(processed)

print(f"Successfully processed {len(train_processed)}/{len(train_dataset)} training examples")



SPLITTING DATASET
Train dataset size: 9728
Validation dataset size: 1216
Test dataset size: 1217

PREPROCESSING DATASETS
Processing training dataset...
Processing train example 0/9728
Processing train example 100/9728
Processing train example 200/9728
Processing train example 300/9728
Processing train example 400/9728
Processing train example 500/9728
Processing train example 600/9728
Processing train example 700/9728
Processing train example 800/9728
Processing train example 900/9728
Processing train example 1000/9728
Processing train example 1100/9728
Processing train example 1200/9728
Processing train example 1300/9728
Processing train example 1400/9728
Processing train example 1500/9728
Processing train example 1600/9728
Processing train example 1700/9728
Processing train example 1800/9728
Processing train example 1900/9728
Processing train example 2000/9728
Processing train example 2100/9728
Processing train example 2200/9728
Processing train example 2300/9728
Processing train ex

In [22]:

# Process validation dataset
print("Processing validation dataset...")
valid_processed = []
for i, example in enumerate(valid_dataset):
    if i % 50 == 0:
        print(f"Processing valid example {i}/{len(valid_dataset)}")
    
    processed = preprocess(example)
    if processed is not None:
        valid_processed.append(processed)

print(f"Successfully processed {len(valid_processed)}/{len(valid_dataset)} validation examples")

# Process test dataset
print("Processing test dataset...")
test_processed = []
for i, example in enumerate(test_dataset):
    if i % 50 == 0:
        print(f"Processing test example {i}/{len(test_dataset)}")
    
    processed = preprocess(example)
    if processed is not None:
        test_processed.append(processed)

print(f"Successfully processed {len(test_processed)}/{len(test_dataset)} test examples")


Processing validation dataset...
Processing valid example 0/1216
Processing valid example 50/1216
Processing valid example 100/1216
Processing valid example 150/1216
Processing valid example 200/1216
Processing valid example 250/1216
Processing valid example 300/1216
Processing valid example 350/1216
Processing valid example 400/1216
Processing valid example 450/1216
Processing valid example 500/1216
Processing valid example 550/1216
Processing valid example 600/1216
Processing valid example 650/1216
Processing valid example 700/1216
Processing valid example 750/1216
Processing valid example 800/1216
Processing valid example 850/1216
Processing valid example 900/1216
Processing valid example 950/1216
Processing valid example 1000/1216
Processing valid example 1050/1216
Processing valid example 1100/1216
Processing valid example 1150/1216
Processing valid example 1200/1216
Successfully processed 1216/1216 validation examples
Processing test dataset...
Processing test example 0/1217
Proc

In [25]:
import os
import json
import pickle
import gc
from datasets import Dataset

print("🔄 MEMORY-EFFICIENT SAVING (without Dataset conversion)")
print("="*60)

save_dir = r"C:\Users\asus\Desktop\SpeechSentemintAnalysis\processed_data"
os.makedirs(save_dir, exist_ok=True)

# Save processed lists directly as pickle files first (more memory efficient)
print("💾 Saving processed data as pickle files...")

if 'train_processed' in locals() and train_processed:
    print(f"Saving {len(train_processed)} training samples...")
    with open(os.path.join(save_dir, "train_processed.pkl"), 'wb') as f:
        pickle.dump(train_processed, f)
    print(f"✅ Saved training data to pickle")

if 'valid_processed' in locals() and valid_processed:
    print(f"Saving {len(valid_processed)} validation samples...")
    with open(os.path.join(save_dir, "valid_processed.pkl"), 'wb') as f:
        pickle.dump(valid_processed, f)
    print(f"✅ Saved validation data to pickle")

if 'test_processed' in locals() and test_processed:
    print(f"Saving {len(test_processed)} test samples...")
    with open(os.path.join(save_dir, "test_processed.pkl"), 'wb') as f:
        pickle.dump(test_processed, f)
    print(f"✅ Saved test data to pickle")

# Save label mappings
if 'label2id' in locals():
    mappings = {
        "label2id": label2id,
        "id2label": id2label,
        "label_list": label_list,
        "num_labels": len(label_list)
    }
    
    mappings_path = os.path.join(save_dir, "label_mappings.json")
    with open(mappings_path, 'w') as f:
        json.dump(mappings, f, indent=2)
    print(f"✅ Saved label mappings to: {mappings_path}")

# Save processing statistics
if 'dataset' in locals():
    stats = {
        "train_processed_size": len(train_processed) if 'train_processed' in locals() and train_processed else 0,
        "valid_processed_size": len(valid_processed) if 'valid_processed' in locals() and valid_processed else 0,
        "test_processed_size": len(test_processed) if 'test_processed' in locals() and test_processed else 0,
        "target_sampling_rate": target_sampling_rate if 'target_sampling_rate' in locals() else 16000,
        "processor_model": "facebook/wav2vec2-large-960h-lv60-self"
    }
    
    stats_path = os.path.join(save_dir, "processing_stats.json")
    with open(stats_path, 'w') as f:
        json.dump(stats, f, indent=2)
    print(f"✅ Saved processing statistics to: {stats_path}")

print(f"\n📁 All files saved to directory: {save_dir}")
print("\n" + "="*60)
print("🎯 DATA SAVED SUCCESSFULLY!")
print("="*60)

# Now try to create and save Dataset objects in batches to avoid memory issues
print("\n🔄 Converting to Dataset format in batches...")

def save_dataset_in_batches(processed_data, save_path, batch_size=100):
    """Save large datasets in smaller batches to avoid memory issues"""
    if not processed_data:
        return False
    
    try:
        print(f"Converting {len(processed_data)} samples to Dataset...")
        
        # Try smaller batch first to test memory
        test_batch = processed_data[:min(10, len(processed_data))]
        test_dataset = Dataset.from_list(test_batch)
        print(f"✅ Test batch successful")
        
        # If test works, try full dataset
        if len(processed_data) <= batch_size:
            # Small enough to do all at once
            dataset = Dataset.from_list(processed_data)
            dataset.save_to_disk(save_path)
            print(f"✅ Saved complete dataset to {save_path}")
            return True
        else:
            print(f"⚠️  Dataset too large ({len(processed_data)} samples), saving as pickle only")
            return False
            
    except Exception as e:
        print(f"❌ Memory error during Dataset conversion: {e}")
        print(f"💡 Data saved as pickle files - use batch loading script instead")
        return False

# Try to save as Dataset objects (will fallback to pickle if memory issues)
success_count = 0

if 'train_processed' in locals() and train_processed:
    train_save_path = os.path.join(save_dir, "train_dataset")
    if save_dataset_in_batches(train_processed, train_save_path):
        success_count += 1

if 'valid_processed' in locals() and valid_processed:
    valid_save_path = os.path.join(save_dir, "valid_dataset")
    if save_dataset_in_batches(valid_processed, valid_save_path):
        success_count += 1

if 'test_processed' in locals() and test_processed:
    test_save_path = os.path.join(save_dir, "test_dataset")
    if save_dataset_in_batches(test_processed, test_save_path):
        success_count += 1

# Clear memory
if 'train_processed' in locals():
    print(f"\n🧹 Clearing train_processed from memory...")
    del train_processed
if 'valid_processed' in locals():
    print(f"🧹 Clearing valid_processed from memory...")
    del valid_processed
if 'test_processed' in locals():
    print(f"🧹 Clearing test_processed from memory...")
    del test_processed

gc.collect()
print(f"🧹 Memory cleared")

print(f"\n📊 SAVE SUMMARY:")
print(f"✅ Pickle files: All datasets saved successfully")
print(f"✅ Dataset objects: {success_count}/3 saved successfully")
print(f"✅ Label mappings: Saved")
print(f"✅ Statistics: Saved")

print(f"\n🚀 NEXT STEPS:")
if success_count == 3:
    print(f"✅ All datasets saved in both formats - use the quick load script!")
else:
    print(f"⚠️  Some datasets too large for Arrow format")
    print(f"💡 Use the batch loading script below for training")

print(f"\n📁 Files in {save_dir}:")
for file in os.listdir(save_dir):
    file_path = os.path.join(save_dir, file)
    if os.path.isfile(file_path):
        size_mb = os.path.getsize(file_path) / (1024*1024)
        print(f"   📄 {file} ({size_mb:.1f} MB)")
    else:
        print(f"   📁 {file}/")

🔄 MEMORY-EFFICIENT SAVING (without Dataset conversion)
💾 Saving processed data as pickle files...
Saving 9728 training samples...
✅ Saved training data to pickle
Saving 1216 validation samples...
✅ Saved validation data to pickle
Saving 1217 test samples...
✅ Saved test data to pickle
✅ Saved label mappings to: C:\Users\asus\Desktop\SpeechSentemintAnalysis\processed_data\label_mappings.json
✅ Saved processing statistics to: C:\Users\asus\Desktop\SpeechSentemintAnalysis\processed_data\processing_stats.json

📁 All files saved to directory: C:\Users\asus\Desktop\SpeechSentemintAnalysis\processed_data

🎯 DATA SAVED SUCCESSFULLY!

🔄 Converting to Dataset format in batches...
Converting 9728 samples to Dataset...
✅ Test batch successful
⚠️  Dataset too large (9728 samples), saving as pickle only
Converting 1216 samples to Dataset...
✅ Test batch successful
⚠️  Dataset too large (1216 samples), saving as pickle only
Converting 1217 samples to Dataset...
✅ Test batch successful
⚠️  Dataset too

In [None]:

# Display sample from processed dataset
if train_processed:
    print(f"\nSample from processed training data:")
    sample = train_dataset[0]
    print(f"Keys: {sample.keys()}")
    print(f"Input values length: {len(sample['input_values'])}")
    print(f"Label: {sample['labels']}")
    print(f"Original label: {sample['label']}")

print("\n" + "="*50)
print("QUICK LOAD CODE FOR NEXT TIME:")
print("="*50)
print(f"""
# To load your processed datasets next time:
from datasets import load_from_disk
import json

save_dir = r"{save_dir}"

# Load datasets
train_dataset = load_from_disk(os.path.join(save_dir, "train_dataset"))
valid_dataset = load_from_disk(os.path.join(save_dir, "valid_dataset"))
test_dataset = load_from_disk(os.path.join(save_dir, "test_dataset"))

# Load label mappings
with open(os.path.join(save_dir, "label_mappings.json"), 'r') as f:
    mappings = json.load(f)
    label2id = mappings["label2id"]
    id2label = mappings["id2label"]
    num_labels = mappings["num_labels"]

print(f"Loaded datasets - Train: {{len(train_dataset)}}, Valid: {{len(valid_dataset)}}, Test: {{len(test_dataset)}}")
""")

In [None]:
import os
import json
import pickle
import gc
from datasets import Dataset
from transformers import Wav2Vec2Processor

class BatchDataLoader:
    """Memory-efficient loader for large processed datasets"""
    
    def __init__(self, save_dir):
        self.save_dir = save_dir
        self.label_mappings = None
        self.stats = None
        self.processor = None
        
    def load_metadata(self):
        """Load label mappings and statistics"""
        print("📋 Loading metadata...")
        
        # Load label mappings
        mappings_path = os.path.join(self.save_dir, "label_mappings.json")
        if os.path.exists(mappings_path):
            with open(mappings_path, 'r') as f:
                self.label_mappings = json.load(f)
            print(f"✅ Loaded {self.label_mappings['num_labels']} labels")
        
        # Load stats
        stats_path = os.path.join(self.save_dir, "processing_stats.json")
        if os.path.exists(stats_path):
            with open(stats_path, 'r') as f:
                self.stats = json.load(f)
            print(f"✅ Loaded processing statistics")
        
        # Load processor
        try:
            self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
            print(f"✅ Loaded processor")
        except Exception as e:
            print(f"⚠️  Could not load processor: {e}")
    
    def try_load_dataset(self, dataset_name):
        """Try to load dataset, fallback to pickle if needed"""
        dataset_path = os.path.join(self.save_dir, f"{dataset_name}_dataset")
        pickle_path = os.path.join(self.save_dir, f"{dataset_name}_processed.pkl")
        
        # Try loading as Dataset first
        if os.path.exists(dataset_path):
            try:
                dataset = Dataset.load_from_disk(dataset_path)
                print(f"✅ Loaded {dataset_name} dataset ({len(dataset)} samples)")
                return dataset
            except Exception as e:
                print(f"⚠️  Could not load {dataset_name} dataset: {e}")
        
        # Fallback to pickle
        if os.path.exists(pickle_path):
            try:
                print(f"🔄 Loading {dataset_name} from pickle...")
                with open(pickle_path, 'rb') as f:
                    processed_data = pickle.load(f)
                
                # Try to convert to Dataset in smaller batches
                return self.pickle_to_dataset_batched(processed_data, dataset_name)
            except Exception as e:
                print(f"❌ Could not load {dataset_name} pickle: {e}")
                return None
        
        print(f"❌ No data found for {dataset_name}")
        return None
    
    def pickle_to_dataset_batched(self, processed_data, name, batch_size=50):
        """Convert pickle data to Dataset in batches"""
        if not processed_data:
            return None
            
        try:
            # Try small batch first
            test_size = min(10, len(processed_data))
            test_batch = processed_data[:test_size]
            test_dataset = Dataset.from_list(test_batch)
            print(f"✅ Test conversion successful for {name}")
            
            # If data is small enough, convert all at once
            if len(processed_data) <= batch_size:
                dataset = Dataset.from_list(processed_data)
                print(f"✅ Converted all {len(processed_data)} {name} samples to Dataset")
                return dataset
            else:
                # For larger datasets, you might want to return a custom iterator
                print(f"⚠️  {name} dataset large ({len(processed_data)} samples)")
                print(f"💡 Consider using smaller batches during training")
                
                # Still try to convert, but with more aggressive memory management
                gc.collect()
                dataset = Dataset.from_list(processed_data)
                print(f"✅ Converted {len(processed_data)} {name} samples (may use lots of memory)")
                return dataset
                
        except Exception as e:
            print(f"❌ Failed to convert {name} pickle to Dataset: {e}")
            print(f"💡 You may need to use the raw pickle data with custom data loaders")
            return None
    
    def load_all(self):
        """Load all datasets and metadata"""
        print("🚀 Loading all processed data...")
        print("="*50)
        
        self.load_metadata()
        
        # Try to load datasets
        train_dataset = self.try_load_dataset("train")
        valid_dataset = self.try_load_dataset("valid") 
        test_dataset = self.try_load_dataset("test")
        
        # Summary
        print("\n📊 LOADING SUMMARY:")
        print("="*50)
        datasets = {
            "train": train_dataset,
            "valid": valid_dataset, 
            "test": test_dataset
        }
        
        for name, dataset in datasets.items():
            if dataset is not None:
                print(f"✅ {name.capitalize()}: {len(dataset)} samples")
            else:
                print(f"❌ {name.capitalize()}: Failed to load")
        
        print(f"\n🎯 Available variables:")
        if self.label_mappings:
            print(f"   - label2id, id2label, num_labels")
        if self.processor:
            print(f"   - processor")
        
        return {
            'train_dataset': train_dataset,
            'valid_dataset': valid_dataset,
            'test_dataset': test_dataset,
            'label2id': self.label_mappings.get('label2id') if self.label_mappings else None,
            'id2label': self.label_mappings.get('id2label') if self.label_mappings else None,
            'num_labels': self.label_mappings.get('num_labels') if self.label_mappings else None,
            'processor': self.processor,
            'stats': self.stats
        }

# Usage
save_dir = r"C:\Users\asus\Desktop\SpeechSentemintAnalysis\processed_data"

# Check if save directory exists
if os.path.exists(save_dir):
    loader = BatchDataLoader(save_dir)
    results = loader.load_all()
    
    # Extract variables for easy access
    train_dataset = results['train_dataset']
    valid_dataset = results['valid_dataset']
    test_dataset = results['test_dataset']
    label2id = results['label2id']
    id2label = results['id2label']
    num_labels = results['num_labels']
    processor = results['processor']
    
    print(f"\n🎉 Data loading complete!")
    print(f"Ready for model training with: train_dataset, valid_dataset, test_dataset")
    
    # Display sample if available
    if train_dataset and len(train_dataset) > 0:
        print(f"\n🔍 Sample from training data:")
        sample = train_dataset[0]
        print(f"   - Keys: {list(sample.keys())}")
        print(f"   - Input length: {len(sample['input_values'])}")
        if 'labels' in sample:
            print(f"   - Label: {sample['labels']} ({id2label[sample['labels']] if id2label else 'unknown'})")
        
else:
    print(f"❌ Save directory not found: {save_dir}")
    print(f"Please run the processing script first.")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject