In [1]:
import os
import sys
import pytz
import numpy as np
import umap
import plotly.express as px

sys.path.insert(0, '..') 
from src.features.feature_generation_utils import *
from src.features.feature_generation import *

In [2]:
# Re-import changed modules automatically before each cell executes, so edits to
# the project code imported from src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Takeaways:
1. Maybe, in addition to focusing on accuracy, we can look into how well the features cluster into 4 categories (or maybe more or fewer?) using UMAP for dimensionality reduction: https://umap-learn.readthedocs.io/en/latest/basic_usage.html

# Features (these descriptions are slightly outdated)

## Heart Rate
- Heart Rate
- Heart rate 30 second mean
- Heart rate 30 second standard deviation
- Heart rate very low frequency (VLF) power (0.01 - 0.05 Hz)
- Heart rate VLF power standard deviation
## Movement & Pressure
- Pressure 30 second mean
- Pressure 30 second standard deviation
- Overall Dynamic Body Acceleration (ODBA) 30 second mean
- ODBA 30 second standard deviation
- GyroZ 30 second mean
- GyroZ 30 second standard deviation
## EEG Features
- EEG 30 second standard deviation
- EEG 30 second IQR
- EEG 30 second skew
- EEG 30 second kurtosis
- EEG 30 second zero crossings
- EEG 30 second Hjorth mobility
- EEG 30 second Hjorth complexity
- Slow Delta power (0.4 - 1 Hz)
- Fast Delta power (1 - 4 Hz)
- Theta power (4 - 8 Hz)
- Alpha power (8 - 12 Hz)
- Sigma power (12 - 16 Hz)
- Beta power (16 - 30 Hz)
- Delta / Theta power
- Delta / Sigma power
- Delta / Beta power
- Alpha / Theta power
- Absolute power (0.4 - 30 Hz)
#### Ones I don't really understand
- EEG permutation entropy
- EEG Higuchi fractal dimension (is this like the mirror dimension from Doctor Strange?)
- EEG Petrosian fractal dimension

## Preprocessing

### Load EDF and EEG data

In [3]:
# Processed EDF recording and the subset of channels this notebook needs.
path_to_edf = '../data/raw/01_edf_data/test12_Wednesday_05_ALL_PROCESSED.edf'
channels_to_load = ['ECG_Raw_Ch1', 'EEG_ICA5', 'Pressure', 'ODBA', 'GyrZ']

# preload=True reads the selected channels into memory right away.
raw = mne.io.read_raw_edf(
    path_to_edf,
    include=channels_to_load,
    preload=True,
)
info = raw.info

Extracting EDF parameters from /Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data/test12_Wednesday_05_ALL_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 158957499  =      0.000 ... 317914.998 secs...


In [4]:
# get_data returns one row per picked channel; take the single EEG row and copy
# it so later edits to eeg_data cannot touch the array returned by get_data.
eeg_channel_rows = raw.get_data(picks='EEG_ICA5')
eeg_data = eeg_channel_rows[0].copy()

### Load labels

In [5]:
# Hand-scored hypnogram (CSV of sleep labels) for the same recording.
file_path = '../data/raw/02_hypnogram_data/test12_Wednesday_06_Hypnogram_JKB_1Hz.csv'
df = pd.read_csv(file_path)

# Parse the timestamp column, then mark the parsed times as US Pacific time.
scored_times = pd.to_datetime(df['R.Time'])
df['R.Time'] = scored_times.dt.tz_localize('America/Los_Angeles')

# Fraction of scored rows carrying each sleep code.
df['Sleep.Code'].value_counts(normalize=True)

Sleep.Code
Active Waking         0.456175
Quiet Waking          0.143221
HV Slow Wave Sleep    0.124564
Drowsiness            0.075781
Certain REM Sleep     0.072228
LV Slow Wave Sleep    0.071997
Putative REM Sleep    0.034134
Unscorable            0.021901
Name: proportion, dtype: float64

### Get recording start time

In [6]:
sfreq = info['sfreq']
edf_start_time = info['meas_date']
# Time zone the recording start time should be expressed in (US Pacific).
pst_timezone = pytz.timezone('America/Los_Angeles')

# Build a Pacific-time-aware datetime for the start of the recording.
if isinstance(edf_start_time, datetime.datetime):
    # Keep the wall-clock reading and attach Pacific time to it.
    # localize() is used on purpose:
    #   * .replace(tzinfo=None).astimezone(...) treats the naive value as being in
    #     the *machine's* local zone, so the result changed with the computer the
    #     notebook ran on (it was only right on a machine set to Pacific time);
    #   * .replace(tzinfo=pst_timezone) attaches pytz's LMT offset for the zone
    #     instead of the PST/PDT offset.
    recording_start_datetime = pst_timezone.localize(edf_start_time.replace(tzinfo=None))
elif isinstance(edf_start_time, (int, float)):
    # Numeric timestamp: convert to a naive datetime, then attach Pacific time.
    # NOTE(review): fromtimestamp() converts using the machine's local zone, so this
    # branch is still machine-dependent -- confirm the intended interpretation.
    recording_start_datetime = pst_timezone.localize(datetime.datetime.fromtimestamp(edf_start_time))
else:
    # No usable measurement date: define the name explicitly so a stale value
    # from an earlier run of this cell cannot be picked up by later cells.
    recording_start_datetime = None

## Get features

In [7]:
# Precomputed v3 feature table with labels; the first CSV column is the index.
features_csv = '../data/processed/v3_features/Wednesday_used_features_with_labels_v3.csv'
model_data_v3 = pd.read_csv(features_csv, index_col=0)

# Rebuild the index as a DatetimeIndex in pst_timezone.
pacific_index = pd.DatetimeIndex(model_data_v3.index, tz=pst_timezone)
model_data_v3.index = pacific_index

In [8]:
# Discard every row that has at least one missing value.
model_data_v3 = model_data_v3.dropna(axis=0, how='any')

## UMAP dimensionality reduction, colored by label

In [13]:
# Split the table into the label column (y_all) and all remaining columns (X_all).
label_column = 'Simple.Sleep.Code'
y_all = model_data_v3[label_column]
X_all = model_data_v3.drop(columns=label_column)

In [None]:
# Fit UMAP on the feature matrix with a fixed seed and keep the embedding
# as a two-column DataFrame.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(X_all)
X_umap = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])

In [None]:
# Copy of the embedding with the labels attached. The labels' index is reset to
# 0..n-1 first so they line up with X_umap's rows by position.
positional_labels = y_all.reset_index(drop=True)
labeled_umap = X_umap.assign(**{'Simple.Sleep.Code': positional_labels})

In [None]:
# Scatter plot of the two UMAP components, one colour per sleep code.
px.scatter(
    data_frame=labeled_umap,
    x='UMAP1',
    y='UMAP2',
    color='Simple.Sleep.Code',
)

# Recursive Feature Elimination - sklearn

In [None]:
from sklearn.feature_selection import RFECV
from lightgbm import LGBMClassifier

In [None]:
# Recursive feature elimination with cross-validation, using a LightGBM
# classifier as the estimator.
# NOTE(review): these hyperparameters are hard-coded here -- presumably taken from
# an earlier tuning run; confirm their source.
best_params = {'learning_rate': 0.005, 'n_estimators': 400, 'num_leaves': 10}
lgbmodel = LGBMClassifier(**best_params)
rfe = RFECV(lgbmodel)

# y_all is already the 'Simple.Sleep.Code' Series, so it is passed as-is.
# Indexing it again with the column name is a label lookup on its index and
# raises KeyError.
rfe.fit(X_all, y_all)

In [None]:
# Display the fitted selector's per-feature ranking, in X_all's column order
# (per the scikit-learn docs, selected features are assigned rank 1).
rfe.ranking_

In [None]:
# Boolean mask from the fitted selector, used to split the feature names into
# the ones it kept and the ones it eliminated.
included = rfe.support_
kept_columns = X_all.columns[included]
dropped_columns = X_all.columns[~included]

# One item per line, with an empty line separating the two groups.
print('Included', kept_columns, '', 'Not Included', dropped_columns, sep='\n')