# Sleep pattern Emotion Analysis Model

In [1]:
!pip install pyedflib

import pyedflib
import os

base_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0'

# Define paths
psg_file_path = os.path.join(base_path, 'sleep-cassette', 'SC4001E0-PSG.edf')
hypnogram_file_path = os.path.join(base_path, 'sleep-cassette', 'SC4001EC-Hypnogram.edf')

# Load PSG file
with pyedflib.EdfReader(psg_file_path) as psg_file:
    n_signals = psg_file.signals_in_file
    signal_labels = psg_file.getSignalLabels()
    print("PSG File Signal Labels:", signal_labels)
    print("Sampling Frequency:", psg_file.getSampleFrequency(0))
    print("File Duration:", psg_file.file_duration)

# Load Hypnogram file
with pyedflib.EdfReader(hypnogram_file_path) as hyp_file:
    annotations = hyp_file.readAnnotations()
    print("Hypnogram Annotations (first 10):", annotations[:10])


PSG File Signal Labels: ['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'Resp oro-nasal', 'EMG submental', 'Temp rectal', 'Event marker']
Sampling Frequency: 100.0
File Duration: 79500.0
Hypnogram Annotations (first 10): (array([    0., 30630., 30750., 31140., 31170., 31200., 31350., 31380.,
       31440., 31500., 31530., 31650., 31680., 31800., 31830., 31890.,
       31950., 32070., 32100., 32130., 32250., 32460., 32490., 32550.,
       32670., 32700., 32850., 32910., 32940., 32970., 33000., 33120.,
       33270., 33300., 33330., 33390., 33420., 33510., 35400., 35430.,
       35640., 35700., 35790., 35940., 35970., 36840., 37020., 37260.,
       37290., 37410., 37500., 37530., 38460., 38490., 38520., 38580.,
       38610., 38640., 38670., 38700., 38730., 38760., 38910., 38970.,
       39060., 39120., 39150., 39180., 39240., 39300., 39480., 39540.,
       39570., 39600., 39870., 39900., 39960., 39990., 40200., 40230.,
       40290., 40320., 40500., 41370., 41400., 41460., 41490., 41610.,

In [2]:
!pip install mne

import pyedflib
import matplotlib.pyplot as plt
import numpy as np
import mne
import os

# Paths to sample PSG and Hypnogram files
psg_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/sleep-cassette/SC4001E0-PSG.edf'
hypnogram_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/sleep-cassette/SC4001EC-Hypnogram.edf'

# Load the PSG file using MNE for convenience
psg_data = mne.io.read_raw_edf(psg_path, preload=True, verbose=False)
print(psg_data)
sampling_rate = psg_data.info['sfreq']  # Sampling frequency
channels = psg_data.ch_names  # Channel names

print("PSG Channel Names:", channels)
print("Sampling Frequency:", sampling_rate)

# Load the Hypnogram file to get sleep stages
with pyedflib.EdfReader(hypnogram_path) as hypnogram:
    annotations = hypnogram.readAnnotations()

# Display the first 10 annotations to understand the format
print("First 10 Annotations:", annotations[:10])




  psg_data = mne.io.read_raw_edf(psg_path, preload=True, verbose=False)
  psg_data = mne.io.read_raw_edf(psg_path, preload=True, verbose=False)
  psg_data = mne.io.read_raw_edf(psg_path, preload=True, verbose=False)


<RawEDF | SC4001E0-PSG.edf, 7 x 7950000 (79500.0 s), ~424.6 MB, data loaded>
PSG Channel Names: ['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'Resp oro-nasal', 'EMG submental', 'Temp rectal', 'Event marker']
Sampling Frequency: 100.0
First 10 Annotations: (array([    0., 30630., 30750., 31140., 31170., 31200., 31350., 31380.,
       31440., 31500., 31530., 31650., 31680., 31800., 31830., 31890.,
       31950., 32070., 32100., 32130., 32250., 32460., 32490., 32550.,
       32670., 32700., 32850., 32910., 32940., 32970., 33000., 33120.,
       33270., 33300., 33330., 33390., 33420., 33510., 35400., 35430.,
       35640., 35700., 35790., 35940., 35970., 36840., 37020., 37260.,
       37290., 37410., 37500., 37530., 38460., 38490., 38520., 38580.,
       38610., 38640., 38670., 38700., 38730., 38760., 38910., 38970.,
       39060., 39120., 39150., 39180., 39240., 39300., 39480., 39540.,
       39570., 39600., 39870., 39900., 39960., 39990., 40200., 40230.,
       40290., 40320., 40500., 41

In [3]:
import os
import pyedflib
import numpy as np
from collections import Counter

# Path to the dataset
base_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0'

# Dictionary to store cumulative time spent in each stage
overall_stage_durations = Counter()

# List all hypnogram files
hypnogram_files = []
for root, dirs, files in os.walk(base_path):
    for file in files:
        if 'Hypnogram.edf' in file:
            hypnogram_files.append(os.path.join(root, file))

# Process each hypnogram file
for hypnogram_path in hypnogram_files:
    try:
        with pyedflib.EdfReader(hypnogram_path) as hypnogram:
            annotations = hypnogram.readAnnotations()

            # Calculate time spent in each stage
            stage_durations = Counter()
            for onset, duration, annotation in zip(*annotations):
                if "Sleep stage" in annotation:
                    stage = annotation.split()[-1]
                    stage_durations[stage] += duration

            # Add to the overall stage durations
            overall_stage_durations.update(stage_durations)

    except OSError as e:
        print(f"Could not read file {hypnogram_path}: {e}")
        continue

# Convert counts to percentages
total_duration = sum(overall_stage_durations.values())
stage_percentages = {stage: (duration / total_duration) * 100 for stage, duration in overall_stage_durations.items()}

# Display the results
print("Sleep Stage Distribution Across All Files (in %):")
for stage, percentage in stage_percentages.items():
    print(f"{stage}: {percentage:.2f}%")



Sleep Stage Distribution Across All Files (in %):
W: 60.70%
1: 5.16%
2: 18.11%
3: 2.45%
4: 1.43%
R: 6.91%
?: 5.25%


In [4]:
import pandas as pd

# Initialize a list to store feature data for each file
features_list = []

# Loop through each hypnogram file and extract features
for hypnogram_path in hypnogram_files:
    try:
        with pyedflib.EdfReader(hypnogram_path) as hypnogram:
            annotations = hypnogram.readAnnotations()

        # Calculate total time spent in each stage
        stage_times = {'W': 0, '1': 0, '2': 0, '3': 0, '4': 0, 'R': 0, '?': 0}
        total_time = 0

        for onset, duration, annotation in zip(*annotations):
            if "Sleep stage" in annotation:
                stage = annotation.split()[-1]
                if stage in stage_times:
                    stage_times[stage] += duration
                    total_time += duration

        # Calculate the proportion of time spent in each stage
        stage_proportions = {f"prop_{stage}": (time / total_time) * 100 for stage, time in stage_times.items()}
        stage_proportions['total_duration'] = total_time  # Total duration of sleep in seconds

        # Add file-specific features to the list
        file_features = {'file_name': os.path.basename(hypnogram_path)}
        file_features.update(stage_proportions)
        features_list.append(file_features)

    except Exception as e:
        print(f"Could not read file {hypnogram_path}: {e}")

# Convert the list of features to a DataFrame for analysis
features_df = pd.DataFrame(features_list)

# Display the extracted features
features_df.head()


Unnamed: 0,file_name,prop_W,prop_1,prop_2,prop_3,prop_4,prop_R,prop_?,total_duration
0,SC4001EC-Hypnogram.edf,69.340278,2.013889,8.680556,3.506944,4.131944,4.340278,7.986111,86400.0
1,SC4002EC-Hypnogram.edf,65.474123,2.049323,12.955887,3.265023,7.051059,7.467871,1.736714,86370.0
2,SC4011EH-Hypnogram.edf,64.444444,3.784722,19.513889,3.333333,0.3125,5.902778,2.708333,86400.0
3,SC4012EC-Hypnogram.edf,63.333333,3.194444,22.916667,2.777778,0.555556,6.111111,1.111111,86400.0
4,SC4021EH-Hypnogram.edf,66.215278,3.263889,18.923611,2.534722,0.763889,5.659722,2.638889,86400.0


In [5]:
import pyedflib
import os

# Define the base path and list for unreadable files
base_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0'
unreadable_files = []

# Function to identify and delete unreadable files
for root, _, files in os.walk(base_path):
    for file in files:
        if file.endswith(".edf"):  # Only target .edf files
            file_path = os.path.join(root, file)
            try:
                # Try opening the file to check if it is EDF-compliant
                with pyedflib.EdfReader(file_path) as edf_file:
                    pass  # File is readable
            except:
                # Log and remove unreadable files
                unreadable_files.append(file_path)
                os.remove(file_path)  # Delete the unreadable file

# Save the list of unreadable files to a text file for reference
unreadable_files_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/unreadable_files.txt'
with open(unreadable_files_path, 'w') as f:
    for file_path in unreadable_files:
        f.write(f"{file_path}\n")

print(f"Unreadable files have been removed. A list of unreadable files is saved to {unreadable_files_path}.")


Unreadable files have been removed. A list of unreadable files is saved to C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/unreadable_files.txt.


In [6]:
# Display the unreadable files log
with open('C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/unreadable_files.txt', 'r') as f:
    print(f.read())





In [7]:
# Load metadata files
metadata_sc = pd.read_excel('C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/SC-subjects.xls')
metadata_st = pd.read_excel('C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/ST-subjects.xls')


In [8]:
print("Columns in features_df:", features_df.columns)
print("Columns in metadata_sc:", metadata_sc.columns)
print("Columns in metadata_st:", metadata_st.columns)


Columns in features_df: Index(['file_name', 'prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R',
       'prop_?', 'total_duration'],
      dtype='object')
Columns in metadata_sc: Index(['subject', 'night', 'age', 'sex (F=1)', 'LightsOff'], dtype='object')
Columns in metadata_st: Index(['Subject - age - sex', 'Unnamed: 1', 'Unnamed: 2', 'Placebo night',
       'Unnamed: 4', 'Temazepam night', 'Unnamed: 6'],
      dtype='object')


In [9]:
features_df.columns = features_df.columns.str.strip()
metadata_sc.columns = metadata_sc.columns.str.strip()
metadata_st.columns = metadata_st.columns.str.strip()


In [10]:
print(features_df.head())
print(metadata_sc.head())
print(metadata_st.head())


                file_name     prop_W    prop_1     prop_2    prop_3    prop_4  \
0  SC4001EC-Hypnogram.edf  69.340278  2.013889   8.680556  3.506944  4.131944   
1  SC4002EC-Hypnogram.edf  65.474123  2.049323  12.955887  3.265023  7.051059   
2  SC4011EH-Hypnogram.edf  64.444444  3.784722  19.513889  3.333333  0.312500   
3  SC4012EC-Hypnogram.edf  63.333333  3.194444  22.916667  2.777778  0.555556   
4  SC4021EH-Hypnogram.edf  66.215278  3.263889  18.923611  2.534722  0.763889   

     prop_R    prop_?  total_duration  
0  4.340278  7.986111         86400.0  
1  7.467871  1.736714         86370.0  
2  5.902778  2.708333         86400.0  
3  6.111111  1.111111         86400.0  
4  5.659722  2.638889         86400.0  
   subject  night  age  sex (F=1) LightsOff
0        0      1   33          1  00:38:00
1        0      2   33          1  21:57:00
2        1      1   33          1  22:44:00
3        1      2   33          1  22:15:00
4        2      1   26          1  22:50:00
  Subject

In [11]:
if 'subject_id' not in features_df.columns:
    print("Missing 'subject_id' in features_df")
if 'subject_id' not in metadata_sc.columns:
    print("Missing 'subject_id' in metadata_sc")
if 'subject_id' not in metadata_st.columns:
    print("Missing 'subject_id' in metadata_st")


Missing 'subject_id' in features_df
Missing 'subject_id' in metadata_sc
Missing 'subject_id' in metadata_st


In [12]:
# For features_df, extract subject_id from 'file_name'
features_df['subject_id'] = features_df['file_name'].str.split('-').str[0]

# For metadata_sc, use 'subject' as 'subject_id'
metadata_sc.rename(columns={'subject': 'subject_id'}, inplace=True)

# For metadata_st, use 'Subject - age - sex' as 'subject_id'
metadata_st.rename(columns={'Subject - age - sex': 'subject_id'}, inplace=True)

# Ensure all subject_ids are strings
features_df['subject_id'] = features_df['subject_id'].astype(str)
metadata_sc['subject_id'] = metadata_sc['subject_id'].astype(str)
metadata_st['subject_id'] = metadata_st['subject_id'].astype(str)


In [13]:
print("Features DataFrame Columns:", features_df.columns)
print("Metadata SC Columns:", metadata_sc.columns)
print("Metadata ST Columns:", metadata_st.columns)

print(features_df.head())
print(metadata_sc.head())
print(metadata_st.head())


Features DataFrame Columns: Index(['file_name', 'prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R',
       'prop_?', 'total_duration', 'subject_id'],
      dtype='object')
Metadata SC Columns: Index(['subject_id', 'night', 'age', 'sex (F=1)', 'LightsOff'], dtype='object')
Metadata ST Columns: Index(['subject_id', 'Unnamed: 1', 'Unnamed: 2', 'Placebo night', 'Unnamed: 4',
       'Temazepam night', 'Unnamed: 6'],
      dtype='object')
                file_name     prop_W    prop_1     prop_2    prop_3    prop_4  \
0  SC4001EC-Hypnogram.edf  69.340278  2.013889   8.680556  3.506944  4.131944   
1  SC4002EC-Hypnogram.edf  65.474123  2.049323  12.955887  3.265023  7.051059   
2  SC4011EH-Hypnogram.edf  64.444444  3.784722  19.513889  3.333333  0.312500   
3  SC4012EC-Hypnogram.edf  63.333333  3.194444  22.916667  2.777778  0.555556   
4  SC4021EH-Hypnogram.edf  66.215278  3.263889  18.923611  2.534722  0.763889   

     prop_R    prop_?  total_duration subject_id  
0  4.340278  7.98

In [14]:
# Merge SC metadata with features
sc_merged_df = features_df.merge(metadata_sc, on='subject_id', how='left')

# Merge ST metadata with features
st_merged_df = features_df.merge(metadata_st, on='subject_id', how='left')

# Concatenate SC and ST merged DataFrames
aggregated_df = pd.concat([sc_merged_df, st_merged_df], ignore_index=True)

# Save the aggregated DataFrame
aggregated_df.to_csv('C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/aggregated_sleep_data.csv', index=False)
print("Aggregated data has been saved as 'aggregated_sleep_data.csv'.")


Aggregated data has been saved as 'aggregated_sleep_data.csv'.


In [15]:
# Rename the 'sex (F=1)' column to 'sex'
aggregated_df = aggregated_df.rename(columns={'sex (F=1)': 'sex'})

# Group by age and sex to calculate mean and variance of each sleep stage proportion
aggregated_features_by_age_sex = (
    aggregated_df.groupby(['age', 'sex'])
    .agg({
        'prop_W': ['mean', 'var'],
        'prop_1': ['mean', 'var'],
        'prop_2': ['mean', 'var'],
        'prop_3': ['mean', 'var'],
        'prop_4': ['mean', 'var'],
        'prop_R': ['mean', 'var'],
        'prop_?': ['mean', 'var']
    })
).reset_index()

# Flatten the column names for easier handling
aggregated_features_by_age_sex.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col for col in aggregated_features_by_age_sex.columns
]

print("Aggregated features by age and sex calculated:")
aggregated_features_by_age_sex.head()



Aggregated features by age and sex calculated:


Unnamed: 0,age_,sex_,prop_W_mean,prop_W_var,prop_1_mean,prop_1_var,prop_2_mean,prop_2_var,prop_3_mean,prop_3_var,prop_4_mean,prop_4_var,prop_R_mean,prop_R_var,prop_?_mean,prop_?_var


In [16]:
from collections import defaultdict

# Function to calculate transition frequencies
def calculate_transitions(annotations):
    transitions = defaultdict(int)
    previous_stage = None
    for onset, duration, annotation in zip(*annotations):
        if "Sleep stage" in annotation:
            current_stage = annotation.split()[-1]
            if previous_stage:
                transitions[(previous_stage, current_stage)] += 1
            previous_stage = current_stage
    return transitions

# Apply transition calculation to each hypnogram file
transition_features_list = []
for hypnogram_path in hypnogram_files:
    try:
        with pyedflib.EdfReader(hypnogram_path) as hypnogram:
            annotations = hypnogram.readAnnotations()
            transitions = calculate_transitions(annotations)

            # Flatten transition counts for the DataFrame
            transition_data = {f"trans_{k[0]}_to_{k[1]}": v for k, v in transitions.items()}
            transition_data['file_name'] = os.path.basename(hypnogram_path)
            transition_features_list.append(transition_data)

    except Exception as e:
        print(f"Could not process file {hypnogram_path}: {e}")

# Convert list of transitions to a DataFrame
transition_features_df = pd.DataFrame(transition_features_list)
print("Transition features calculated:")
transition_features_df.head()


Transition features calculated:


Unnamed: 0,trans_W_to_1,trans_1_to_2,trans_2_to_3,trans_3_to_2,trans_3_to_4,trans_4_to_3,trans_4_to_W,trans_W_to_3,trans_4_to_2,trans_3_to_1,...,trans_1_to_1,trans_4_to_R,trans_?_to_R,trans_1_to_?,trans_?_to_1,trans_3_to_?,trans_?_to_2,trans_2_to_?,trans_R_to_?,trans_W_to_W
0,10,14,28.0,23.0,22.0,18.0,1.0,1.0,2.0,2.0,...,,,,,,,,,,
1,22,20,16.0,12.0,16.0,14.0,,,1.0,,...,,,,,,,,,,
2,14,23,14.0,12.0,8.0,7.0,,,1.0,1.0,...,,,,,,,,,,
3,16,30,21.0,21.0,7.0,8.0,,,1.0,,...,,,,,,,,,,
4,6,18,31.0,27.0,13.0,12.0,,,3.0,1.0,...,,,,,,,,,,


In [17]:
# Calculate REM and deep sleep ratios
features_df['REM_ratio'] = features_df['prop_R'] / features_df['total_duration']
features_df['Deep_sleep_ratio'] = features_df['prop_4'] / features_df['total_duration']

# Display the first few rows to verify calculations
print("Sleep quality indicators (REM and deep sleep ratios) calculated:")
features_df[['file_name', 'REM_ratio', 'Deep_sleep_ratio']].head()


Sleep quality indicators (REM and deep sleep ratios) calculated:


Unnamed: 0,file_name,REM_ratio,Deep_sleep_ratio
0,SC4001EC-Hypnogram.edf,5e-05,4.8e-05
1,SC4002EC-Hypnogram.edf,8.6e-05,8.2e-05
2,SC4011EH-Hypnogram.edf,6.8e-05,4e-06
3,SC4012EC-Hypnogram.edf,7.1e-05,6e-06
4,SC4021EH-Hypnogram.edf,6.6e-05,9e-06


In [18]:
import pandas as pd

# Sample emotion mapping based on research (adjust as needed)
emotion_mapping = {
    'REM': {'contentment': 0.5, 'stress_relief': 0.5},
    'Deep': {'calm': 0.6, 'relaxation': 0.4},
    'Light': {'mild_alertness': 0.7, 'low_energy': 0.3}
}

# Function to calculate emotion scores based on sleep stage proportions
def calculate_emotion_scores(row, emotion_mapping):
    scores = {'contentment': 0, 'stress_relief': 0, 'calm': 0, 'relaxation': 0, 'mild_alertness': 0, 'low_energy': 0}

    # Calculate weighted emotion scores for each stage
    for stage, emotions in emotion_mapping.items():
        if stage == 'REM':
            for emotion, weight in emotions.items():
                scores[emotion] += row['prop_R'] * weight
        elif stage == 'Deep':
            for emotion, weight in emotions.items():
                scores[emotion] += (row['prop_3'] + row['prop_4']) * weight
        elif stage == 'Light':
            for emotion, weight in emotions.items():
                scores[emotion] += (row['prop_1'] + row['prop_2']) * weight

    return scores

# Apply emotion score calculation for each record in the DataFrame
aggregated_df['emotion_scores'] = aggregated_df.apply(lambda row: calculate_emotion_scores(row, emotion_mapping), axis=1)

# Expand the emotion scores into individual columns
emotion_scores_df = pd.DataFrame(aggregated_df['emotion_scores'].tolist())
aggregated_df = pd.concat([aggregated_df, emotion_scores_df], axis=1)

# Drop the temporary 'emotion_scores' column after expansion
aggregated_df.drop(columns=['emotion_scores'], inplace=True)

# Display the final DataFrame with emotion scores
print("Aggregated DataFrame with emotion scores:")
print(aggregated_df[['file_name', 'prop_R', 'prop_3', 'prop_4', 'prop_1', 'prop_2', 'contentment', 'stress_relief', 'calm', 'relaxation', 'mild_alertness', 'low_energy']].head())


Aggregated DataFrame with emotion scores:
                file_name    prop_R    prop_3    prop_4    prop_1     prop_2  \
0  SC4001EC-Hypnogram.edf  4.340278  3.506944  4.131944  2.013889   8.680556   
1  SC4002EC-Hypnogram.edf  7.467871  3.265023  7.051059  2.049323  12.955887   
2  SC4011EH-Hypnogram.edf  5.902778  3.333333  0.312500  3.784722  19.513889   
3  SC4012EC-Hypnogram.edf  6.111111  2.777778  0.555556  3.194444  22.916667   
4  SC4021EH-Hypnogram.edf  5.659722  2.534722  0.763889  3.263889  18.923611   

   contentment  stress_relief      calm  relaxation  mild_alertness  \
0     2.170139       2.170139  4.583333    3.055556        7.486111   
1     3.733935       3.733935  6.189649    4.126433       10.503647   
2     2.951389       2.951389  2.187500    1.458333       16.309028   
3     3.055556       3.055556  2.000000    1.333333       18.277778   
4     2.829861       2.829861  1.979167    1.319444       15.531250   

   low_energy  
0    3.208333  
1    4.501563  
2 

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# Define the features (sleep stage proportions and metadata)
feature_columns = ['prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R']
X = aggregated_df[feature_columns]

# Define target emotions (synthetic labels created based on sleep feature scores)
emotion_labels = ['happiness', 'sadness', 'anger', 'surprise', 'fear']
aggregated_df['happiness'] = np.clip(aggregated_df['contentment'] + aggregated_df['relaxation'] - aggregated_df['low_energy'], 0, 1)
aggregated_df['sadness'] = np.clip(aggregated_df['low_energy'] - aggregated_df['contentment'], 0, 1)
aggregated_df['anger'] = np.clip(aggregated_df['mild_alertness'] - aggregated_df['calm'], 0, 1)
aggregated_df['surprise'] = np.clip(aggregated_df['stress_relief'] - aggregated_df['relaxation'], 0, 1)
aggregated_df['fear'] = np.clip(aggregated_df['stress_relief'] + aggregated_df['low_energy'], 0, 1)

# Binarize the continuous labels by setting a threshold
y = (aggregated_df[emotion_labels] > 0.5).astype(int)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model with a classification report
print("Multi-label Classification Report:")
print(classification_report(y_test, y_pred, target_names=emotion_labels))



Multi-label Classification Report:
              precision    recall  f1-score   support

   happiness       0.88      1.00      0.93        14
     sadness       1.00      1.00      1.00        71
       anger       1.00      1.00      1.00        76
    surprise       0.97      1.00      0.99        68
        fear       1.00      1.00      1.00        76

   micro avg       0.99      1.00      0.99       305
   macro avg       0.97      1.00      0.98       305
weighted avg       0.99      1.00      0.99       305
 samples avg       0.99      1.00      0.99       305



In [20]:
import pandas as pd

# Predict probabilities for the test set
y_proba = model.predict_proba(X_test)

# Convert probabilities to percentages and create a DataFrame
y_proba_percent = pd.DataFrame(
    {
        emotion: [probs[1] * 100 if probs.shape[0] > 1 else probs[0] * 100 for probs in proba]  # Handle single-column case
        for emotion, proba in zip(emotion_labels, y_proba)
    },
    columns=emotion_labels
)

# Add the corresponding Person IDs or indices from the test set
y_proba_percent['Person_ID'] = X_test.index

# Rearrange columns for better readability
y_proba_percent = y_proba_percent[['Person_ID'] + emotion_labels]

# Display the predicted emotions with percentages
print("Predicted emotion probabilities (as percentages) for the test set:")
print(y_proba_percent)

# Save the predictions to a CSV for further analysis
output_path = "C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/test_set_emotion_predictions.csv"
y_proba_percent.to_csv(output_path, index=False)
print(f"\nPredicted emotion probabilities saved to {output_path}.")

Predicted emotion probabilities (as percentages) for the test set:
    Person_ID  happiness  sadness  anger  surprise   fear
0         266        4.0     96.0  100.0     100.0  100.0
1         261        3.0    100.0  100.0      81.0  100.0
2         265        0.0    100.0  100.0     100.0  100.0
3          39       15.0     99.0  100.0      92.0  100.0
4          33       96.0     17.0  100.0      92.0  100.0
..        ...        ...      ...    ...       ...    ...
71        316       47.0     91.0  100.0      97.0  100.0
72         63        7.0     97.0  100.0     100.0  100.0
73        229       15.0     99.0  100.0      92.0  100.0
74         82        5.0     97.0  100.0      21.0  100.0
75         94        0.0    100.0  100.0     100.0  100.0

[76 rows x 6 columns]

Predicted emotion probabilities saved to C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/test_set_emotion_predictions.csv.


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the dataset
file_path = "C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/aggregated_sleep_data.csv"
aggregated_df = pd.read_csv(file_path)
print("Dataset loaded successfully!")

# Select relevant features (sleep stage proportions)
feature_columns = ['prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R', 'prop_?']
X = aggregated_df[feature_columns]

# Generate synthetic emotion labels based on sleep stage proportions
# Adjust the logic here if you have specific mappings for emotions
emotion_labels = ['happiness', 'sadness', 'anger', 'surprise', 'fear']
aggregated_df['happiness'] = np.clip(aggregated_df['prop_R'] - aggregated_df['prop_1'], 0, 1)
aggregated_df['sadness'] = np.clip(aggregated_df['prop_?'] - aggregated_df['prop_R'], 0, 1)
aggregated_df['anger'] = np.clip(aggregated_df['prop_3'] - aggregated_df['prop_W'], 0, 1)
aggregated_df['surprise'] = np.clip(aggregated_df['prop_4'] - aggregated_df['prop_2'], 0, 1)
aggregated_df['fear'] = np.clip(aggregated_df['prop_?'] + aggregated_df['prop_1'], 0, 1)

# Binarize the labels (threshold 0.5)
y = (aggregated_df[emotion_labels] > 0.5).astype(int)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training and testing sets created.")

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='f1_micro')
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_
print(f"Best model parameters: {grid_search.best_params_}")

# Train the Random Forest model
model.fit(X_train, y_train)
print("Model training completed!")

# Predict probabilities for the test set
y_proba = model.predict_proba(X_test)

# Convert probabilities to percentages and create a DataFrame
# Adjusted to handle single-class probabilities
y_proba_percent = pd.DataFrame(
    {
        emotion: [
            probs[1] * 100 if probs.shape[0] > 1 else probs[0] * 100
            for probs in proba
        ]
        for emotion, proba in zip(emotion_labels, y_proba)
    }
)

# Add the corresponding indices from the test set
y_proba_percent['Index'] = aggregated_df.iloc[y_test.index].index

# Rearrange columns for better readability
y_proba_percent = y_proba_percent[['Index'] + emotion_labels]

# Display the predicted emotions with percentages
print("Predicted emotion probabilities (as percentages) for the test set:")
print(y_proba_percent.head())

# Save the predictions to a CSV for further analysis
output_path = "C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/aggregated_emotion_predictions.csv"
y_proba_percent.to_csv(output_path, index=False)
print(f"\nPredicted emotion probabilities saved to {output_path}.")


Dataset loaded successfully!
Training and testing sets created.
Best model parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Model training completed!
Predicted emotion probabilities (as percentages) for the test set:
   Index  happiness  sadness  anger  surprise   fear
0    266        5.0     94.0    0.0     100.0  100.0
1    261       80.0     80.0    0.0     100.0  100.0
2    265      100.0      0.0    0.0     100.0  100.0
3     39       84.0     12.0    0.0     100.0  100.0
4     33      100.0      0.0    1.0     100.0  100.0

Predicted emotion probabilities saved to C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/aggregated_emotion_predictions.csv.


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
dataset_path = "C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/aggregated_sleep_data.csv"
sleep_data_df = pd.read_csv(dataset_path)
print("Dataset loaded successfully!")

# Select relevant features
selected_features = ['prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R']
X = sleep_data_df[selected_features]

# Generate synthetic emotion labels for demonstration
emotion_labels = ['happiness', 'sadness', 'anger', 'surprise', 'fear']
sleep_data_df['happiness'] = np.clip(sleep_data_df['prop_R'] - sleep_data_df['prop_?'], 0, 1)
sleep_data_df['sadness'] = np.clip(sleep_data_df['prop_1'] - sleep_data_df['prop_4'], 0, 1)
sleep_data_df['anger'] = np.clip(sleep_data_df['prop_3'] - sleep_data_df['prop_2'], 0, 1)
sleep_data_df['surprise'] = np.clip(sleep_data_df['prop_2'] + sleep_data_df['prop_R'], 0, 1)
sleep_data_df['fear'] = np.clip(sleep_data_df['prop_W'] - sleep_data_df['prop_1'], 0, 1)

# Binarize the labels for multi-label classification
y = (sleep_data_df[emotion_labels] > 0.5).astype(int)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training and testing sets created.")

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='f1_micro')
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_
print(f"Best model parameters: {grid_search.best_params_}")

# Train the model
model.fit(X_train, y_train)
print("Model training completed!")

# Predict probabilities for the test set
y_proba = model.predict_proba(X_test)

# Normalize probabilities to sum up to 100% for each individual
normalized_proba = []
for i in range(len(X_test)):
    probabilities = []
    for proba in y_proba:
        # Handle cases with only one class in probabilities
        if len(proba[i]) > 1:
            probabilities.append(proba[i, 1])  # Class 1 probability
        else:
            probabilities.append(proba[i, 0])  # Single class probability
    total = sum(probabilities)
    normalized = (np.array(probabilities) / total) * 100 if total > 0 else probabilities
    normalized_proba.append(normalized)

# Convert to DataFrame
normalized_proba_df = pd.DataFrame(normalized_proba, columns=emotion_labels)

# Add Person IDs or indices
normalized_proba_df['Index'] = sleep_data_df.iloc[X_test.index].index

# Display the normalized probabilities
print("Normalized emotion probabilities (sum to 100%):")
print(normalized_proba_df)

# Save the predictions
output_path = "C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/normalized_emotion_predictions.csv"
normalized_proba_df.to_csv(output_path, index=False)
print(f"Normalized emotion probabilities saved to {output_path}.")



Dataset loaded successfully!
Training and testing sets created.
Best model parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Model training completed!
Normalized emotion probabilities (sum to 100%):
    happiness    sadness      anger   surprise       fear  Index
0    5.131787  23.717053  23.717053  23.717053  23.717053    266
1    2.662052  23.477174  24.620258  24.620258  24.620258    261
2   19.472706  20.132706  20.132706  20.132706  20.129174    265
3   17.069864  17.321675  21.869487  21.869487  21.869487     39
4   21.436076   1.852923  25.655853  25.655853  25.399295     33
..        ...        ...        ...        ...        ...    ...
71  14.438154  18.481309  22.360179  22.360179  22.360179    316
72  15.699989  21.075927  21.075927  21.075927  21.072230     63
73  17.069864  17.321675  21.869487  21.869487  21.869487    229
74   1.271583  24.655650  24.690923  24.690923  24.690923     82
75  18.023068  20.495132  20.495132  2

In [23]:
# Binarize predicted probabilities using a threshold
threshold = 50
y_pred_binarized = (normalized_proba_df.iloc[:, :-1].values >= threshold).astype(int)  # Exclude 'Index' column


In [24]:
print("Label distribution in training set:")
print(y_train.sum(axis=0))  # Sum of each label in training set
print("\nLabel distribution in testing set:")
print(y_test.sum(axis=0))  # Sum of each label in testing set


Label distribution in training set:
happiness    195
sadness      222
anger          0
surprise     304
fear         274
dtype: int64

Label distribution in testing set:
happiness    53
sadness      60
anger         0
surprise     76
fear         70
dtype: int64


In [25]:
# Verify the number of existing labels
print("Existing label shape:", y_train.shape)

# Add missing columns in y_train for alignment (if necessary)
for emotion in emotion_labels:
    if emotion not in y_train.columns:
        y_train[emotion] = 0  # Initialize with 0 for missing labels

# Generate synthetic features (if not already defined)
num_samples = 50  # Number of synthetic samples to generate
synthetic_features = pd.DataFrame(
    np.random.rand(num_samples, X_train.shape[1]),
    columns=X_train.columns
)

# Expand synthetic labels for all emotions
synthetic_labels_full = pd.DataFrame({
    'happiness': [0] * num_samples,
    'sadness': [0] * num_samples,
    'anger': [1] * num_samples,       # Anger is 1 for synthetic data
    'surprise': [0] * num_samples,
    'fear': [0] * num_samples
})

# Append synthetic data to training set
X_train = pd.concat([X_train, synthetic_features], ignore_index=True)
y_train = pd.concat([y_train, synthetic_labels_full], ignore_index=True)

# Verify updated training dataset shapes
print("Updated training dataset shape:")
print("Features:", X_train.shape)
print("Labels:", y_train.shape)



Existing label shape: (304, 5)
Updated training dataset shape:
Features: (354, 6)
Labels: (354, 5)


In [26]:
# Che# Split data again to ensure initial alignment if needed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Post-split dataset shapes:")
print("Features:", X_train.shape)
print("Labels:", y_train.shape)

# Check if we need to add synthetic data
print("Synthetic features shape:", synthetic_features.shape)
print("Synthetic labels shape:", synthetic_labels_full.shape)

# Append synthetic data if initial shapes align
if X_train.shape[0] == y_train.shape[0] and synthetic_features.shape[0] == synthetic_labels_full.shape[0]:
    X_train = pd.concat([X_train, synthetic_features], ignore_index=True)
    y_train = pd.concat([y_train, synthetic_labels_full], ignore_index=True)
else:
    print("Error: Initial mismatch in data. Verify the dataset generation and split process.")

# Final shape check
print("Updated dataset shape:")
print("Features:", X_train.shape)
print("Labels:", y_train.shape)


Post-split dataset shapes:
Features: (304, 6)
Labels: (304, 5)
Synthetic features shape: (50, 6)
Synthetic labels shape: (50, 5)
Updated dataset shape:
Features: (354, 6)
Labels: (354, 5)


In [27]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Combine original and synthetic data (already done in previous steps)
# Updated X_train and y_train with both original and synthetic data

# Split combined data into new training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model with hyperparameters from previous best GridSearch
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42
)
model.fit(X_train, y_train)

print("Model retraining completed!")

# Make predictions on the test set and get probability estimates
y_pred = model.predict(X_test)

# Calculate emotion percentages for the entire test set
y_pred_df = pd.DataFrame(y_pred, columns=emotion_labels)
emotion_percentages = (y_pred_df.sum() / len(y_pred_df)) * 100

print("\nEmotion Percentages for the Test Set:")
for emotion, percentage in emotion_percentages.items():
    print(f"{emotion}: {percentage:.2f}%")

# Save normalized probabilities to a CSV file
output_path = "C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/normalized_emotion_predictions_updated.csv"
y_pred_df.to_csv(output_path, index=False)
print(f"Normalized emotion predictions saved to {output_path}.")


Model retraining completed!

Emotion Percentages for the Test Set:
happiness: 60.56%
sadness: 54.93%
anger: 14.08%
surprise: 85.92%
fear: 80.28%
Normalized emotion predictions saved to C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/normalized_emotion_predictions_updated.csv.


In [28]:
# Label distribution in training and testing datasets
print("Label distribution in training set:")
print(y_train.sum(axis=0))  # Sum of each label in training set

print("\nLabel distribution in testing set:")
print(y_test.sum(axis=0))  # Sum of each label in testing set


Label distribution in training set:
happiness    152
sadness      182
anger         40
surprise     243
fear         217
dtype: int64

Label distribution in testing set:
happiness    43
sadness      40
anger        10
surprise     61
fear         57
dtype: int64


In [29]:
# Identify misclassified samples
misclassified_indices = (y_test.values != y_pred_binarized).any(axis=1)
misclassified_samples = X_test[misclassified_indices]

# Display some misclassified samples
print("Misclassified samples:")
print(misclassified_samples.head())


ValueError: operands could not be broadcast together with shapes (71,5) (76,5) 

In [30]:
# Check shapes of y_test and y_pred_binarized
print("y_test shape:", y_test.values.shape)
print("y_pred_binarized shape:", y_pred_binarized.shape)

# Adjust shapes if there's a mismatch
if len(y_test) > len(y_pred_binarized):
    y_test = y_test.iloc[:len(y_pred_binarized)]
elif len(y_pred_binarized) > len(y_test):
    y_pred_binarized = y_pred_binarized[:len(y_test)]

# Ensure the shapes are now consistent
print("Adjusted y_test shape:", y_test.values.shape)
print("Adjusted y_pred_binarized shape:", y_pred_binarized.shape)

# Identify misclassified samples
misclassified_indices = (y_test.values != y_pred_binarized).any(axis=1)
misclassified_samples = X_test.iloc[misclassified_indices]

# Display some misclassified samples
print("Misclassified samples:")
print(misclassified_samples.head())


y_test shape: (71, 5)
y_pred_binarized shape: (76, 5)
Adjusted y_test shape: (71, 5)
Adjusted y_pred_binarized shape: (71, 5)
Misclassified samples:
        prop_W    prop_1     prop_2    prop_3     prop_4     prop_R
220   9.734513  5.309735  55.044248  7.079646   3.716814  19.115044
42   66.516151  0.729420  11.392845  6.078499   1.389371   9.100382
286  65.138889  3.333333  16.631944  2.222222   0.034722   7.430556
181  67.291667  5.069444  18.055556  3.611111   0.416667   5.555556
56    5.612245  3.061224  26.326531  8.673469  21.326531  35.000000


In [31]:
from sklearn.model_selection import cross_val_score

# Cross-validation on the full dataset
cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1_micro')
print(f"Cross-Validation Scores (F1): {cv_scores}")
print(f"Mean CV Score: {np.mean(cv_scores):.4f}")


Cross-Validation Scores (F1): [0.99801193 0.99807322 1.         0.9943074  0.99796334]
Mean CV Score: 0.9977


In [32]:
# Train model without synthetic data
X_train_no_synthetic = X_train.iloc[:-num_samples]
y_train_no_synthetic = y_train.iloc[:-num_samples]

model_no_synthetic = RandomForestClassifier(
    n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=2, random_state=42
)
model_no_synthetic.fit(X_train_no_synthetic, y_train_no_synthetic)

# Evaluate the model
y_pred_no_synthetic = model_no_synthetic.predict(X_test)
print("Classification Report (No Synthetic Data):")
print(classification_report(y_test, y_pred_no_synthetic, target_names=emotion_labels))


Classification Report (No Synthetic Data):
              precision    recall  f1-score   support

   happiness       0.93      0.95      0.94        43
     sadness       0.97      0.97      0.97        40
       anger       1.00      1.00      1.00        10
    surprise       1.00      1.00      1.00        61
        fear       1.00      0.98      0.99        57

   micro avg       0.98      0.98      0.98       211
   macro avg       0.98      0.98      0.98       211
weighted avg       0.98      0.98      0.98       211
 samples avg       0.98      0.99      0.98       211



In [33]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Choose normalization method: Min-Max Scaling
scaler = MinMaxScaler()

# Apply scaler to feature columns
normalized_features = scaler.fit_transform(aggregated_df[feature_columns])

# Convert back to DataFrame
normalized_df = pd.DataFrame(normalized_features, columns=feature_columns)

# Add normalized features back to the main DataFrame
aggregated_df[feature_columns] = normalized_df

print("Normalized Features (first 5 rows):")
print(aggregated_df[feature_columns].head())


Normalized Features (first 5 rows):
     prop_W    prop_1    prop_2    prop_3    prop_4    prop_R    prop_?
0  0.881679  0.078217  0.020451  0.201858  0.193747  0.102642  0.273810
1  0.831202  0.079991  0.089197  0.187933  0.330624  0.194182  0.059544
2  0.817758  0.166872  0.194647  0.191865  0.014653  0.148374  0.092857
3  0.803251  0.137321  0.249363  0.159888  0.026050  0.154472  0.038095
4  0.840878  0.140797  0.185156  0.145897  0.035819  0.141260  0.090476


In [34]:
# Replace original features with log-transformed ones
aggregated_df['prop_W'] = aggregated_df['prop_W_log']
aggregated_df['prop_2'] = aggregated_df['prop_2_log']
aggregated_df['prop_R'] = aggregated_df['prop_R_log']

# Drop the log-transformed temporary columns
aggregated_df.drop(['prop_W_log', 'prop_2_log', 'prop_R_log'], axis=1, inplace=True)

# Prepare features (X) and labels (y)
feature_columns = ['prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R']
X = aggregated_df[feature_columns]

# Define labels (emotions)
emotion_labels = ['happiness', 'sadness', 'anger', 'surprise', 'fear']
y = (aggregated_df[emotion_labels] > 0.5).astype(int)

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Updated training and testing datasets are ready!")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


KeyError: 'prop_W_log'

In [35]:
# Check if log-transformed columns exist
required_columns = ['prop_W_log', 'prop_2_log', 'prop_R_log']
missing_columns = [col for col in required_columns if col not in aggregated_df.columns]

if missing_columns:
    print(f"Missing columns: {missing_columns}. Creating them using log transformation.")
    import numpy as np
    if 'prop_W' in aggregated_df.columns:
        aggregated_df['prop_W_log'] = np.log1p(aggregated_df['prop_W'])
    if 'prop_2' in aggregated_df.columns:
        aggregated_df['prop_2_log'] = np.log1p(aggregated_df['prop_2'])
    if 'prop_R' in aggregated_df.columns:
        aggregated_df['prop_R_log'] = np.log1p(aggregated_df['prop_R'])

# Replace original features with log-transformed ones
aggregated_df['prop_W'] = aggregated_df['prop_W_log']
aggregated_df['prop_2'] = aggregated_df['prop_2_log']
aggregated_df['prop_R'] = aggregated_df['prop_R_log']

# Drop the log-transformed temporary columns
aggregated_df.drop(['prop_W_log', 'prop_2_log', 'prop_R_log'], axis=1, inplace=True)

# Prepare features (X) and labels (y)
feature_columns = ['prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R']
X = aggregated_df[feature_columns]

# Define labels (emotions)
emotion_labels = ['happiness', 'sadness', 'anger', 'surprise', 'fear']
y = (aggregated_df[emotion_labels] > 0.5).astype(int)

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display dataset information
print("Updated training and testing datasets are ready!")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

Missing columns: ['prop_W_log', 'prop_2_log', 'prop_R_log']. Creating them using log transformation.
Updated training and testing datasets are ready!
X_train shape: (304, 6)
y_train shape: (304, 5)
X_test shape: (76, 6)
y_test shape: (76, 5)


In [36]:
# Percentage of each label in the dataset
label_distribution_train = y_train.sum(axis=0) / len(y_train)
label_distribution_test = y_test.sum(axis=0) / len(y_test)

print("Training Set Label Distribution (%):")
print(label_distribution_train)
print("\nTesting Set Label Distribution (%):")
print(label_distribution_test)


Training Set Label Distribution (%):
happiness    0.657895
sadness      0.263158
anger        0.085526
surprise     0.000000
fear         1.000000
dtype: float64

Testing Set Label Distribution (%):
happiness    0.684211
sadness      0.263158
anger        0.026316
surprise     0.000000
fear         1.000000
dtype: float64


In [37]:
print("Label distribution in training set (per label):")
print(y_train.sum(axis=0))  # This will show the count of 1s for each label



Label distribution in training set (per label):
happiness    200
sadness       80
anger         26
surprise       0
fear         304
dtype: int64


In [38]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Create balanced dataset for each label separately
X_train_balanced = X_train.copy()
y_train_balanced = pd.DataFrame()

for label in y_train.columns:
    # Check if the label has more than one class
    if len(y_train[label].unique()) > 1:  # Skip SMOTE for single-class labels
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train[label])
        X_train_balanced = pd.DataFrame(X_resampled, columns=X_train.columns)
        y_train_balanced[label] = y_resampled
    else:
        # Retain original values for single-class labels
        y_train_balanced[label] = y_train[label]

# Verify the new label distribution
print("Balanced Training Set Label Distribution:")
print(y_train_balanced.sum(axis=0))


Balanced Training Set Label Distribution:
happiness    200.0
sadness      176.0
anger        122.0
surprise       0.0
fear         304.0
dtype: float64


In [39]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Initialize placeholders for the balanced dataset
X_train_balanced = X_train.copy()
y_train_balanced = pd.DataFrame()

# Apply SMOTE for each label independently
for label in y_train.columns:
    # Check if the label has more than one class
    if len(y_train[label].unique()) > 1:  # Avoid errors for single-class labels
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train[label])
        X_train_balanced = pd.DataFrame(X_resampled, columns=X_train.columns)
        y_train_balanced[label] = y_resampled
    else:
        # Retain original values for single-class labels
        y_train_balanced[label] = y_train[label]

# Verify the new label distribution
print("Balanced Training Set Label Distribution:")
print(y_train_balanced.sum(axis=0))



Balanced Training Set Label Distribution:
happiness    200.0
sadness      176.0
anger        122.0
surprise       0.0
fear         304.0
dtype: float64


In [40]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Number of synthetic samples to create for 'surprise'
n_synthetic_samples = 50

# Generate random synthetic features for 'surprise'
synthetic_features = pd.DataFrame(
    np.random.rand(n_synthetic_samples, X_train.shape[1]),
    columns=X_train.columns
)

# Add synthetic 'surprise' label (1) for these samples
synthetic_labels = pd.DataFrame({
    'happiness': [0] * n_synthetic_samples,
    'sadness': [0] * n_synthetic_samples,
    'anger': [0] * n_synthetic_samples,
    'surprise': [1] * n_synthetic_samples,
    'fear': [0] * n_synthetic_samples
})

# Append synthetic data to training set
X_train_augmented = pd.concat([X_train, synthetic_features], ignore_index=True)
y_train_augmented = pd.concat([y_train, synthetic_labels], ignore_index=True)

# Verify the shapes are consistent
print("Augmented Dataset Shapes:")
print(f"X_train_augmented: {X_train_augmented.shape}")
print(f"y_train_augmented: {y_train_augmented.shape}")

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Placeholder for balanced datasets
X_train_balanced = []
y_train_balanced = []

# Apply SMOTE for each label independently
for label in y_train_augmented.columns:
    if len(y_train_augmented[label].unique()) > 1:  # Apply SMOTE only if more than one class
        X_resampled, y_resampled = smote.fit_resample(X_train_augmented, y_train_augmented[label])
        X_train_balanced.append(pd.DataFrame(X_resampled, columns=X_train.columns))
        y_train_balanced.append(pd.DataFrame({label: y_resampled}))
    else:
        # Retain original values for single-class labels
        X_train_balanced.append(X_train_augmented)
        y_train_balanced.append(pd.DataFrame({label: y_train_augmented[label]}))

# Concatenate results
X_train_balanced = pd.concat(X_train_balanced).reset_index(drop=True)
y_train_balanced = pd.concat(y_train_balanced, axis=1).reset_index(drop=True)

# Verify the new label distribution
print("\nBalanced Training Set Label Distribution:")
print(y_train_balanced.sum(axis=0))


Augmented Dataset Shapes:
X_train_augmented: (354, 6)
y_train_augmented: (354, 5)

Balanced Training Set Label Distribution:
happiness    200.0
sadness      274.0
anger        328.0
surprise     304.0
fear         304.0
dtype: float64


In [41]:
print("Checking for NaN values in y_train_balanced:")
print(y_train_balanced.isna().sum())



Checking for NaN values in y_train_balanced:
happiness    256
sadness      108
anger          0
surprise      48
fear          48
dtype: int64


In [42]:
# Replace NaN with 0 or appropriate default value
y_train_balanced.fillna(0, inplace=True)

# Verify no NaN values remain
print("Post-cleaning check for NaN values:")
print(y_train_balanced.isna().sum())


Post-cleaning check for NaN values:
happiness    0
sadness      0
anger        0
surprise     0
fear         0
dtype: int64


In [43]:
print("Shape of X_train_balanced:", X_train_balanced.shape)
print("Shape of y_train_balanced:", y_train_balanced.shape)



Shape of X_train_balanced: (2820, 6)
Shape of y_train_balanced: (656, 5)


In [44]:
if X_train_balanced.shape[0] != y_train_balanced.shape[0]:
    print("Mismatch found! Ensure alignment between features and labels.")
else:
    print("X_train_balanced and y_train_balanced are aligned.")


Mismatch found! Ensure alignment between features and labels.


In [45]:
print("y_train_augmented shape:", y_train_augmented.shape)
print("y_train_augmented dtypes:")
print(y_train_augmented.dtypes)



y_train_augmented shape: (354, 5)
y_train_augmented dtypes:
happiness    int64
sadness      int64
anger        int64
surprise     int64
fear         int64
dtype: object


In [46]:
# Create the combined label by avoiding index alignment issues
binary_weights = 2 ** np.arange(len(y_train_augmented.columns))
y_train_combined = y_train_augmented.dot(binary_weights)



In [47]:
print("Combined Labels (first 10):")
print(y_train_combined.head(10))


Combined Labels (first 10):
0    17
1    17
2    18
3    17
4    17
5    18
6    17
7    16
8    17
9    17
dtype: int64


In [48]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote_combined = smote.fit_resample(X_train_augmented, y_train_combined)


In [49]:
# Create a DataFrame for the decomposed labels
y_train_smote = pd.DataFrame(index=X_train_smote.index, columns=y_train_augmented.columns)
for i, col in enumerate(y_train_augmented.columns):
    y_train_smote[col] = (y_train_smote_combined // (2**i)) % 2


In [50]:
print("Balanced Dataset Shapes:")
print("X_train_smote:", X_train_smote.shape)
print("y_train_smote:", y_train_smote.shape)

print("Balanced Label Distribution:")
print(y_train_smote.sum(axis=0))


Balanced Dataset Shapes:
X_train_smote: (864, 6)
y_train_smote: (864, 5)
Balanced Label Distribution:
happiness    432
sadness      288
anger        144
surprise     144
fear         720
dtype: int64


In [51]:
# Retrain the model with the balanced dataset
model_balanced = RandomForestClassifier(n_estimators=100, random_state=42)
model_balanced.fit(X_train_smote, y_train_smote)

print("Model retraining completed!")


Model retraining completed!


In [52]:
from sklearn.metrics import classification_report, hamming_loss, accuracy_score

# Predict on the testing set
y_pred = model_balanced.predict(X_test)

# Evaluate the predictions
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=y_train_smote.columns))

print(f"Hamming Loss: {hamming_loss(y_test, y_pred):.4f}")
print(f"Exact Match Ratio: {accuracy_score(y_test, y_pred):.4f}")



Classification Report:
              precision    recall  f1-score   support

   happiness       0.96      1.00      0.98        52
     sadness       0.90      0.90      0.90        20
       anger       1.00      1.00      1.00         2
    surprise       0.00      0.00      0.00         0
        fear       1.00      1.00      1.00        76

   micro avg       0.97      0.99      0.98       150
   macro avg       0.77      0.78      0.78       150
weighted avg       0.97      0.99      0.98       150
 samples avg       0.98      0.99      0.98       150

Hamming Loss: 0.0158
Exact Match Ratio: 0.9211


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
# Feature importance calculation
for label in y_train.columns:
    print(f"Feature importance for {label}:")
    importances = model.feature_importances_
    for i, col in enumerate(X_train.columns):
        print(f"{col}: {importances[i]}")


Feature importance for happiness:
prop_W: 0.2805584614116852
prop_1: 0.14215132376955547
prop_2: 0.19628159211078847
prop_3: 0.05250339708654194
prop_4: 0.12494752167796215
prop_R: 0.20355770394346684
Feature importance for sadness:
prop_W: 0.2805584614116852
prop_1: 0.14215132376955547
prop_2: 0.19628159211078847
prop_3: 0.05250339708654194
prop_4: 0.12494752167796215
prop_R: 0.20355770394346684
Feature importance for anger:
prop_W: 0.2805584614116852
prop_1: 0.14215132376955547
prop_2: 0.19628159211078847
prop_3: 0.05250339708654194
prop_4: 0.12494752167796215
prop_R: 0.20355770394346684
Feature importance for surprise:
prop_W: 0.2805584614116852
prop_1: 0.14215132376955547
prop_2: 0.19628159211078847
prop_3: 0.05250339708654194
prop_4: 0.12494752167796215
prop_R: 0.20355770394346684
Feature importance for fear:
prop_W: 0.2805584614116852
prop_1: 0.14215132376955547
prop_2: 0.19628159211078847
prop_3: 0.05250339708654194
prop_4: 0.12494752167796215
prop_R: 0.20355770394346684


In [54]:
for label in y_train.columns:
    model = RandomForestClassifier(random_state=42)  # New model instance
    model.fit(X_train, y_train[label])
    print(f"Feature importance for {label}:")
    print(model.feature_importances_)


Feature importance for happiness:
[0.08621926 0.29064414 0.08284183 0.13191883 0.06020066 0.34817529]
Feature importance for sadness:
[0.20884938 0.09255665 0.16934973 0.10113268 0.06456743 0.36354414]
Feature importance for anger:
[0.3684925  0.076151   0.11795029 0.16170368 0.07146496 0.20423758]
Feature importance for surprise:
[0. 0. 0. 0. 0. 0.]
Feature importance for fear:
[0. 0. 0. 0. 0. 0.]


In [55]:
print(X_train.describe())  # Check for variability
print(y_train.sum(axis=0))  # Verify label distribution


           prop_W      prop_1      prop_2      prop_3      prop_4      prop_R
count  304.000000  304.000000  304.000000  304.000000  304.000000  304.000000
mean     0.494150    0.261811    0.199627    0.190642    0.102738    0.195474
std      0.209590    0.193246    0.165965    0.208045    0.188546    0.137298
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
25%      0.517283    0.129933    0.099878    0.036994    0.000000    0.107579
50%      0.594311    0.225976    0.137315    0.157200    0.013027    0.152900
75%      0.621568    0.352060    0.207886    0.247826    0.125365    0.214980
max      0.693147    1.000000    0.693147    1.000000    1.000000    0.693147
happiness    200
sadness       80
anger         26
surprise       0
fear         304
dtype: int64


In [56]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Initialize balanced dataset placeholders
X_train_balanced = pd.DataFrame()
y_train_balanced = pd.DataFrame()

for label in y_train_augmented.columns:
    # Check if the label has more than one class
    if len(y_train_augmented[label].unique()) > 1:
        X_resampled, y_resampled = smote.fit_resample(X_train_augmented, y_train_augmented[label])

        # Append only the newly generated rows using pd.concat
        if X_train_balanced.empty:
            X_train_balanced = pd.DataFrame(X_resampled, columns=X_train_augmented.columns)
        else:
            X_train_balanced = pd.concat([
                X_train_balanced.iloc[:len(X_train_augmented)],
                pd.DataFrame(X_resampled[len(X_train_augmented):], columns=X_train_augmented.columns)
            ], ignore_index=True)

        y_train_balanced[label] = pd.concat([
            y_train_augmented[label],
            pd.Series(y_resampled[len(y_train_augmented):])
        ], ignore_index=True)
    else:
        # Retain original label values if SMOTE is not applied
        if X_train_balanced.empty:
            X_train_balanced = X_train_augmented.copy()
        y_train_balanced[label] = y_train_augmented[label]

# Verify the fixed dataset
print("Corrected Balanced Training Set Label Distribution:")
print(y_train_balanced.sum(axis=0))
print("\nCorrected Dataset Shapes:")
print("X_train_balanced:", X_train_balanced.shape)
print("y_train_balanced:", y_train_balanced.shape)


Corrected Balanced Training Set Label Distribution:
happiness    200
sadness      126
anger         72
surprise      96
fear         304
dtype: int64

Corrected Dataset Shapes:
X_train_balanced: (608, 6)
y_train_balanced: (400, 5)


In [57]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Initialize balanced datasets
X_train_balanced = pd.DataFrame()
y_train_balanced = pd.DataFrame()

# Process each label independently
for label in y_train_augmented.columns:
    if len(y_train_augmented[label].unique()) > 1:  # Apply SMOTE if there are multiple classes
        X_resampled, y_resampled = smote.fit_resample(X_train_augmented, y_train_augmented[label])
        new_samples = len(X_resampled) - len(X_train_augmented)

        # Add resampled data
        if X_train_balanced.empty:
            X_train_balanced = X_train_augmented.copy()  # Start with original data
        X_train_balanced = pd.concat(
            [X_train_balanced, pd.DataFrame(X_resampled[-new_samples:], columns=X_train_augmented.columns)],
            ignore_index=True
        )

        # Update labels
        if y_train_balanced.empty:
            y_train_balanced = y_train_augmented.copy()  # Start with original data
        y_train_balanced[label] = pd.concat(
            [y_train_balanced[label], pd.Series(y_resampled[-new_samples:])],
            ignore_index=True
        )
    else:
        # Retain original data for single-class labels
        if X_train_balanced.empty:
            X_train_balanced = X_train_augmented
        if y_train_balanced.empty:
            y_train_balanced = y_train_augmented
        y_train_balanced[label] = y_train_augmented[label]

# Replace any NaN values with 0
y_train_balanced = y_train_balanced.fillna(0)

# Ensure alignment
X_train_balanced = X_train_balanced.iloc[:y_train_balanced.shape[0]]

# Verify the results
assert X_train_balanced.shape[0] == y_train_balanced.shape[0], (
    f"Mismatch in samples: X_train_balanced ({X_train_balanced.shape[0]}), y_train_balanced ({y_train_balanced.shape[0]})"
)

# Output the results
print("Final Balanced Dataset Shapes:")
print(f"X_train_balanced: {X_train_balanced.shape}")
print(f"y_train_balanced: {y_train_balanced.shape}")

print("\nBalanced Label Distribution:")
print(y_train_balanced.sum(axis=0))



Final Balanced Dataset Shapes:
X_train_balanced: (354, 6)
y_train_balanced: (354, 5)

Balanced Label Distribution:
happiness    200
sadness       80
anger         26
surprise      50
fear         304
dtype: int64


In [58]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling only to features
X_train_normalized = scaler.fit_transform(X_train_balanced)

# Convert back to DataFrame for better readability
X_train_normalized = pd.DataFrame(X_train_normalized, columns=X_train_balanced.columns)

# Verify the normalized data
print("Feature Means After Normalization (Should Be ~0):\n", X_train_normalized.mean())
print("\nFeature Standard Deviations After Normalization (Should Be ~1):\n", X_train_normalized.std())


Feature Means After Normalization (Should Be ~0):
 prop_W    4.817239e-16
prop_1   -2.007183e-17
prop_2   -8.028731e-17
prop_3    1.405028e-16
prop_4   -2.007183e-17
prop_R    1.605746e-16
dtype: float64

Feature Standard Deviations After Normalization (Should Be ~1):
 prop_W    1.001415
prop_1    1.001415
prop_2    1.001415
prop_3    1.001415
prop_4    1.001415
prop_R    1.001415
dtype: float64


In [59]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, hamming_loss

# Step 1: Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_train_normalized, y_train_balanced, test_size=0.2, random_state=42
)

# Step 2: Initialize the model
model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Step 3: Train the model
print("Training the model...")
model.fit(X_train, y_train)
print("Training completed!")

# Step 4: Evaluate the model
y_pred = model.predict(X_test)

# Generate the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=y_train.columns))

# Compute the Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)
print(f"Hamming Loss: {hamming_loss_value:.4f}")


Training the model...
Training completed!
Classification Report:
              precision    recall  f1-score   support

   happiness       0.98      1.00      0.99        41
     sadness       1.00      0.92      0.96        13
       anger       1.00      1.00      1.00         6
    surprise       1.00      0.90      0.95        10
        fear       0.98      1.00      0.99        61

   micro avg       0.98      0.98      0.98       131
   macro avg       0.99      0.96      0.98       131
weighted avg       0.99      0.98      0.98       131
 samples avg       0.99      0.98      0.98       131

Hamming Loss: 0.0113


In [60]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train_normalized, y_train_balanced, cv=5, scoring='f1_samples')
print("Cross-Validation F1-scores:", scores)
print("Mean CV F1-score:", scores.mean())


Cross-Validation F1-scores: [0.95023474 1.         0.96807512 0.95492958 0.28285714]
Mean CV F1-score: 0.8312193158953722


In [61]:
# Extract feature importances
feature_importances = model.feature_importances_

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(importance_df)


Feature Importance:
  Feature  Importance
0  prop_W    0.222886
5  prop_R    0.210152
1  prop_1    0.167590
4  prop_4    0.164978
2  prop_2    0.152173
3  prop_3    0.082220


In [62]:
# Iterate through each label and calculate feature importance
for label_idx, label in enumerate(y_train.columns):
    print(f"Feature importance for {label}:")
    # Access the individual estimator for the current label
    label_model = model.estimators_[label_idx]
    importances = label_model.feature_importances_

    # Print feature importance for each feature
    for i, col in enumerate(X_train.columns):
        print(f"  {col}: {importances[i]:.4f}")


Feature importance for happiness:
  prop_W: 0.2878
  prop_1: 0.2270
  prop_2: 0.1019
  prop_3: 0.0388
  prop_4: 0.1790
  prop_R: 0.1655
Feature importance for sadness:
  prop_W: 0.2259
  prop_1: 0.0951
  prop_2: 0.3061
  prop_3: 0.0680
  prop_4: 0.1440
  prop_R: 0.1609
Feature importance for anger:
  prop_W: 0.2381
  prop_1: 0.1114
  prop_2: 0.0401
  prop_3: 0.0962
  prop_4: 0.2955
  prop_R: 0.2188
Feature importance for surprise:
  prop_W: 0.2292
  prop_1: 0.0814
  prop_2: 0.1550
  prop_3: 0.0797
  prop_4: 0.1624
  prop_R: 0.2922
Feature importance for fear:
  prop_W: 0.1428
  prop_1: 0.2630
  prop_2: 0.1017
  prop_3: 0.1514
  prop_4: 0.0727
  prop_R: 0.2685


In [63]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train_normalized, y_train_balanced, cv=5, scoring='f1_micro')
print(f"Mean F1-score across folds: {scores.mean():.4f}")


Mean F1-score across folds: 0.8504


In [64]:
print("y_test dtypes:")
print(y_test.dtypes)

print("\ny_pred dtypes:")
print(pd.DataFrame(y_pred, columns=y_train.columns).dtypes)



y_test dtypes:
happiness    int64
sadness      int64
anger        int64
surprise     int64
fear         int64
dtype: object

y_pred dtypes:
happiness    int64
sadness      int64
anger        int64
surprise     int64
fear         int64
dtype: object


In [65]:
print(f"y_test shape: {y_test.shape}")
print(f"y_pred shape: {y_pred.shape}")


y_test shape: (71, 5)
y_pred shape: (71, 5)


In [66]:
print(f"Unique values in y_test:\n{y_test.apply(pd.Series.unique)}")
print(f"Unique values in y_pred:\n{pd.DataFrame(y_pred, columns=y_train.columns).apply(pd.Series.unique)}")


Unique values in y_test:
   happiness  sadness  anger  surprise  fear
0          1        0      0         0     1
1          0        1      1         1     0
Unique values in y_pred:
   happiness  sadness  anger  surprise  fear
0          1        0      0         0     1
1          0        1      1         1     0


In [67]:
from sklearn.metrics import multilabel_confusion_matrix

# Ensure y_pred is in the correct format (DataFrame for column alignment)
y_pred_df = pd.DataFrame(y_pred, columns=y_train.columns)

confusion_matrices = multilabel_confusion_matrix(y_test, y_pred_df)
for idx, label in enumerate(y_train.columns):
    print(f"Confusion Matrix for {label}:\n{confusion_matrices[idx]}")


Confusion Matrix for happiness:
[[29  1]
 [ 0 41]]
Confusion Matrix for sadness:
[[58  0]
 [ 1 12]]
Confusion Matrix for anger:
[[65  0]
 [ 0  6]]
Confusion Matrix for surprise:
[[61  0]
 [ 1  9]]
Confusion Matrix for fear:
[[ 9  1]
 [ 0 61]]


In [68]:
#from google.colab import drive
import joblib

# Mount Google Drive
#drive.mount('/content/drive')

# Define path in Google Drive
model_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/multi_label_model.joblib'

# Save the model
joblib.dump(model, model_path)
print(f"Model saved to Google Drive at {model_path}")


Model saved to Google Drive at C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/multi_label_model.joblib


In [69]:
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the saved model
model_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/multi_label_model.joblib'
model = joblib.load(model_path)

# Load the test dataset
# Ensure that you have the same normalization and feature set applied as during training
test_data_path = 'C:/Users/reeva/Desktop/690/sleep-edf-database-expanded-1.0.0-20241127T213628Z-001/sleep-edf-database-expanded-1.0.0/aggregated_sleep_data.csv'
test_data = pd.read_csv(test_data_path)

# Select relevant features for prediction (same as training features)
feature_columns = ['prop_W', 'prop_1', 'prop_2', 'prop_3', 'prop_4', 'prop_R']
X_test = test_data[feature_columns]

# Apply the same scaler as during the training
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_test)  # Fit to entire test data for demonstration purposes
X_test_normalized = scaler.transform(X_test)

# Pick one random record from the test data
random_index = random.randint(0, len(X_test_normalized) - 1)
random_sample = X_test_normalized[random_index].reshape(1, -1)


# Predict the emotion for this record
y_pred_random = model.predict(random_sample)

# If it's a multi-label model, ensure the output is binary (0/1) for all classes
y_pred_random_df = pd.DataFrame([y_pred_random[0]], columns=emotion_labels)

# Print the prediction result
print(f"Emotion prediction for the random record at index {random_index}:")
print(y_pred_random_df.T.rename(columns={0: 'Predicted'}))


Emotion prediction for the random record at index 97:
           Predicted
happiness          1
sadness            0
anger              0
surprise           0
fear               1




In [72]:
# If no emotion is predicted, set all to 0%
predicted_emotions = y_pred_random
if np.sum(predicted_emotions) == 0:
    emotion_percentages = [0 for _ in predicted_emotions[0]]
else:
    # Normalize each active emotion to sum to 100%
    total_active = np.sum(predicted_emotions[0])
    emotion_percentages = [(value / total_active) * 100 if value == 1 else 0 for value in predicted_emotions[0]]

# Prepare the output
emotion_percentages_df = pd.DataFrame({
    'Emotion': emotion_labels,
    'Percentage': emotion_percentages
})

# Print the prediction result as percentages
print("Emotion prediction for the random record (as percentages):")
for index, row in emotion_percentages_df.iterrows():
    if row['Percentage'] > 0:  # Only print emotions with non-zero percentages
        print(f"{row['Emotion']}: {row['Percentage']:.2f}%")


Emotion prediction for the random record (as percentages):
happiness: 50.00%
fear: 50.00%
