<a href="https://colab.research.google.com/github/jayshah18/Sentiment_Analysis/blob/main/sentiment_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so files stored on the Drive can be
# read through ordinary filesystem paths from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as  np

In [None]:
import os
import librosa

def review_dataset(dataset_path):
    """
    Print a per-actor summary of the audio files found under ``dataset_path``.

    Every immediate sub-directory of ``dataset_path`` is treated as one actor.
    Inside it, files whose names end in ".wav" or ".mp3" are counted, and each
    of them is loaded with librosa (``sr=None``) to measure its duration.

    Args:
        dataset_path (str): Root folder that contains one sub-folder per actor.

    Returns:
        None. The number of actor folders, the total number of audio files and
        one line per actor (file count plus the average duration rounded to two
        decimals, or 0 when the folder holds no audio files) are printed.
    """
    audio_suffixes = (".wav", ".mp3")
    summary = {}
    total_files = 0

    for folder_name in sorted(os.listdir(dataset_path)):
        folder_path = os.path.join(dataset_path, folder_name)
        if not os.path.isdir(folder_path):
            # Plain files sitting next to the actor folders are ignored.
            continue

        audio_names = [name for name in os.listdir(folder_path) if name.endswith(audio_suffixes)]
        total_files += len(audio_names)

        # Measure every clip; the duration is derived from the decoded samples
        # and the sample rate that librosa.load reports for the file.
        clip_seconds = []
        for name in audio_names:
            samples, rate = librosa.load(os.path.join(folder_path, name), sr=None)
            clip_seconds.append(librosa.get_duration(y=samples, sr=rate))

        if clip_seconds:
            mean_seconds = round(sum(clip_seconds) / len(clip_seconds), 2)
        else:
            # Avoid dividing by zero for folders without any audio file.
            mean_seconds = 0

        summary[folder_name] = {
            "file_count": len(audio_names),
            "avg_duration_sec": mean_seconds,
        }

    print(f"\n✅ Total Actors: {len(summary)}")
    print(f"✅ Total Audio Files: {total_files}\n")

    for actor, stats in summary.items():
        print(f"{actor}: {stats['file_count']} files | Avg Duration: {stats['avg_duration_sec']} sec")

# Print the per-actor file counts and average clip durations for the dataset
# folder stored on the mounted Drive.
review_dataset("/content/drive/MyDrive/DATASET")



✅ Total Actors: 24
✅ Total Audio Files: 1440

Actor_01: 60 files | Avg Duration: 3.75 sec
Actor_02: 60 files | Avg Duration: 3.79 sec
Actor_03: 60 files | Avg Duration: 3.76 sec
Actor_04: 60 files | Avg Duration: 3.63 sec
Actor_05: 60 files | Avg Duration: 3.74 sec
Actor_06: 60 files | Avg Duration: 3.79 sec
Actor_07: 60 files | Avg Duration: 3.75 sec
Actor_08: 60 files | Avg Duration: 3.73 sec
Actor_09: 60 files | Avg Duration: 3.49 sec
Actor_10: 60 files | Avg Duration: 3.75 sec
Actor_11: 60 files | Avg Duration: 3.44 sec
Actor_12: 60 files | Avg Duration: 3.75 sec
Actor_13: 60 files | Avg Duration: 3.33 sec
Actor_14: 60 files | Avg Duration: 3.68 sec
Actor_15: 60 files | Avg Duration: 3.5 sec
Actor_16: 60 files | Avg Duration: 3.73 sec
Actor_17: 60 files | Avg Duration: 3.67 sec
Actor_18: 60 files | Avg Duration: 3.75 sec
Actor_19: 60 files | Avg Duration: 3.87 sec
Actor_20: 60 files | Avg Duration: 3.73 sec
Actor_21: 60 files | Avg Duration: 3.92 sec
Actor_22: 60 files | Avg Durat

In [None]:
import os
import pandas as pd

def create_dataset_dataframe(dataset_path):
    """
    Parses the RAVDESS dataset filenames to create a structured DataFrame.

    The RAVDESS filename consists of a 7-part numerical identifier (e.g., 03-01-01-01-01-01-01.wav).
    The 3rd part identifies the emotion; the 7th part is stored as the actor id.

    Files that do not end in ".wav", or whose name does not split into exactly
    7 dash-separated parts, are skipped silently. Files with an emotion code
    that is not in the mapping are skipped with a printed warning.

    Args:
        dataset_path (str): The root path to the RAVDESS dataset (e.g., '/content/drive/MyDrive/DATASET').

    Returns:
        pandas.DataFrame or None: A DataFrame with columns for filepath, actor, and emotion,
        ordered by actor folder name and then by file name. None is returned, after
        printing troubleshooting hints, when ``dataset_path`` is not an existing
        directory or when no valid recording was found in it.
    """

    # Emotion labels mapping from the RAVDESS documentation
    emotion_map = {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised"
    }

    data = []

    print(f"Parsing dataset from: {dataset_path}")

    # os.listdir raises on a missing (or non-directory) root, which would hide
    # the "does the path exist?" hints printed below. Treat such a root as an
    # empty dataset so the function reaches its own error report instead.
    if os.path.isdir(dataset_path):
        actor_folders = sorted(os.listdir(dataset_path))
    else:
        actor_folders = []

    for actor_folder in actor_folders:
        actor_path = os.path.join(dataset_path, actor_folder)
        if not os.path.isdir(actor_path):
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order of the resulting DataFrame identical from run to run.
        for file_name in sorted(os.listdir(actor_path)):
            if not file_name.endswith(".wav"):
                continue

            parts = file_name.split(".")[0].split("-")

            # Ensure the filename has the correct number of parts
            if len(parts) != 7:
                continue

            emotion_code = parts[2]
            actor_id = parts[6]
            file_path = os.path.join(actor_path, file_name)

            # Get the emotion label from the map
            emotion_label = emotion_map.get(emotion_code)

            if emotion_label:
                data.append({
                    "filepath": file_path,
                    "actor": actor_id,
                    "emotion": emotion_label
                })
            else:
                print(f"Warning: Unknown emotion code '{emotion_code}' in file {file_name}")

    if not data:
        print("\nError: No data was loaded. Please check the following:")
        print(f"1. Does the path '{dataset_path}' exist?")
        print("2. Is it the correct root folder containing the 'Actor_01', 'Actor_02', etc. subfolders?")
        return None

    # Create a DataFrame
    df = pd.DataFrame(data)
    print(f"\n✅ Successfully created DataFrame with {len(df)} entries.")
    return df

# --- HOW TO RUN THIS SCRIPT ---
# 1. Make sure pandas is installed: pip install pandas
# 2. Update the dataset_path to your actual path
# 3. Run the script

if __name__ == '__main__':
    # Build the filename-derived metadata table for the dataset on the mounted Drive.
    dataset_path = "/content/drive/MyDrive/DATASET"
    ravdess_df = create_dataset_dataframe(dataset_path)

    # create_dataset_dataframe returns None when it loaded no data, in which
    # case there is nothing to preview.
    if ravdess_df is not None:
        # First rows of the table, printed under their heading.
        print("\n--- DataFrame Head ---", ravdess_df.head(), sep="\n")

        # Number of recordings per emotion label, printed under its heading.
        print("\n--- Emotion Distribution ---", ravdess_df['emotion'].value_counts(), sep="\n")


Parsing dataset from: /content/drive/MyDrive/DATASET

✅ Successfully created DataFrame with 1440 entries.

--- DataFrame Head ---
                                            filepath actor  emotion
0  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01  neutral
1  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01     calm
2  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01  neutral
3  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01     calm
4  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01     calm

--- Emotion Distribution ---
emotion
calm         193
happy        192
sad          192
fearful      192
angry        192
disgust      192
surprised    191
neutral       96
Name: count, dtype: int64


In [None]:
import os
import pandas as pd
import librosa
import numpy as np
from tqdm import tqdm

def create_dataset_dataframe(dataset_path):
    """
    Parses the RAVDESS dataset filenames to create a structured DataFrame.

    Serves the same purpose as the identically named function defined earlier
    in the notebook; once this cell has run, this later definition is the one
    in effect.

    A filename is expected to split into 7 dash-separated parts; the 3rd part
    is the emotion code and the 7th part is stored as the actor id. Files that
    do not end in ".wav" or do not have 7 parts are skipped silently, and files
    with an emotion code missing from the mapping are skipped with a warning.

    Args:
        dataset_path (str): Root folder containing one sub-folder per actor.

    Returns:
        pandas.DataFrame or None: A DataFrame with the columns filepath, actor
        and emotion, ordered by actor folder name and then by file name. None
        is returned, after printing an error message, when ``dataset_path`` is
        not an existing directory or no valid recording was found.
    """
    emotion_map = {
        "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
        "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
    }
    data = []
    print(f"Parsing dataset from: {dataset_path}")

    # A missing or non-directory root would make os.listdir raise; treat it as
    # an empty dataset so the caller gets the documented None result instead.
    if os.path.isdir(dataset_path):
        actor_folders = sorted(os.listdir(dataset_path))
    else:
        actor_folders = []

    for actor_folder in actor_folders:
        actor_path = os.path.join(dataset_path, actor_folder)
        if not os.path.isdir(actor_path):
            continue

        # Sorted so the row order does not depend on the arbitrary order in
        # which os.listdir happens to return the files.
        for file_name in sorted(os.listdir(actor_path)):
            if not file_name.endswith(".wav"):
                continue
            parts = file_name.split(".")[0].split("-")
            if len(parts) != 7:
                continue

            emotion_code = parts[2]
            actor_id = parts[6]
            file_path = os.path.join(actor_path, file_name)
            emotion_label = emotion_map.get(emotion_code)
            if emotion_label:
                data.append({
                    "filepath": file_path,
                    "actor": actor_id,
                    "emotion": emotion_label
                })
            else:
                # Report dropped files instead of discarding them silently.
                print(f"Warning: Unknown emotion code '{emotion_code}' in file {file_name}")

    if not data:
        print("\nError: No data was loaded.")
        return None
    df = pd.DataFrame(data)
    print(f"\n✅ Successfully created DataFrame with {len(df)} entries.")
    return df

def extract_features(file_path, sr=22050, n_mels=128, n_mfcc=40):
    """
    Extracts Mel Spectrogram and aggregated features (MFCC, Chroma, ZCR) from an audio file.

    Args:
        file_path (str): Path of the audio file to load.
        sr (int): Sample rate requested from librosa.load. Defaults to 22050,
            the value that was previously hard-coded.
        n_mels (int): ``n_mels`` passed to the Mel spectrogram. Defaults to 128.
        n_mfcc (int): ``n_mfcc`` passed to the MFCC extraction. Defaults to 40.

    Returns:
        tuple: ``(mel_spectrogram, aggregated_features)``.
        ``mel_spectrogram`` is the array returned by
        librosa.feature.melspectrogram (its time dimension depends on the clip
        length). ``aggregated_features`` is one flat vector made of the
        time-averaged MFCCs, followed by the time-averaged chroma values and
        the time-averaged zero-crossing rate.
        ``(None, None)`` is returned when anything in loading or feature
        extraction raises; the error is printed rather than propagated so one
        bad file does not abort a batch run.
    """
    try:
        # Load audio file at a consistent sample rate
        signal, sample_rate = librosa.load(file_path, sr=sr)

        # --- Features for Deep Learning Models (CNN, LSTM) ---
        # Mel Spectrogram (for CNNs)
        mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=n_mels)

        # --- Features for Traditional ML Models (SVM, Random Forest) ---
        # MFCCs, Chroma, Zero-Crossing Rate
        mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        chroma = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
        zcr = librosa.feature.zero_crossing_rate(y=signal)

        # Aggregate features by taking the mean across time (axis 1), then
        # combine them into a single feature vector in a fixed order.
        aggregated_features = np.hstack(
            [np.mean(feature, axis=1) for feature in (mfccs, chroma, zcr)]
        )

        return mel_spectrogram, aggregated_features

    except Exception as e:
        # Deliberate best-effort: report the failing file and let the caller
        # filter out the (None, None) result.
        print(f"Error processing {file_path}: {e}")
        return None, None


# --- HOW TO RUN THIS SCRIPT ---
# 1. Make sure pandas, librosa, numpy, and tqdm are installed.
#    pip install pandas librosa numpy tqdm
# 2. Update the dataset_path.
# 3. Run the script. It will save a 'features_dataframe.pkl' file.

if __name__ == '__main__':
    # Initialize tqdm to work with pandas .apply()
    # (this is what makes .progress_apply available below)
    tqdm.pandas()

    dataset_path = "/content/drive/MyDrive/DATASET"
    ravdess_df = create_dataset_dataframe(dataset_path)

    # create_dataset_dataframe returns None when it loaded no data; in that
    # case the whole extraction step is skipped.
    if ravdess_df is not None:
        print("\nExtracting features from audio files... (This may take a few minutes)")

        # Apply the feature extraction function to each file.
        # extract_features returns a 2-tuple; wrapping it in pd.Series makes
        # progress_apply produce a two-column result, which is assigned to the
        # two new columns in the same order as the tuple elements.
        ravdess_df[['mel_spectrogram', 'aggregated_features']] = ravdess_df['filepath'].progress_apply(
            lambda filepath: pd.Series(extract_features(filepath))
        )

        # Drop rows where feature extraction might have failed
        # (extract_features returns (None, None) on error). This mutates
        # ravdess_df in place, so the frame is shorter after this line.
        ravdess_df.dropna(inplace=True)

        # Save the feature-rich dataframe to a pickle file for quick loading later.
        # The path is relative, so the file is written to the current working directory.
        output_path = "features_dataframe.pkl"
        ravdess_df.to_pickle(output_path)

        print(f"\n✅ Feature extraction complete.")
        print(f"✅ DataFrame saved to '{output_path}'")
        print("\n--- DataFrame Head with New Features ---")
        print(ravdess_df.head())


Parsing dataset from: /content/drive/MyDrive/DATASET

✅ Successfully created DataFrame with 1440 entries.

Extracting features from audio files... (This may take a few minutes)


100%|██████████| 1440/1440 [01:41<00:00, 14.16it/s]



✅ Feature extraction complete.
✅ DataFrame saved to 'features_dataframe.pkl'

--- DataFrame Head with New Features ---
                                            filepath actor  emotion  \
0  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01  neutral   
1  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01     calm   
2  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01  neutral   
3  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01     calm   
4  /content/drive/MyDrive/DATASET/Actor_01/03-01-...    01     calm   

                                     mel_spectrogram  \
0  [[1.9769283e-07, 3.2483592e-07, 1.3091303e-06,...   
1  [[4.4146126e-10, 1.5210988e-08, 1.9769656e-08,...   
2  [[1.3324016e-07, 1.4781753e-07, 1.0845676e-07,...   
3  [[2.1944395e-09, 3.8604355e-08, 7.6826e-08, 1....   
4  [[4.6863455e-08, 1.553473e-08, 6.2439507e-09, ...   

                                 aggregated_features  
0  [-691.587890625, 58.024662017822266, 0.1594650...  
1  [-6