# Initial models features v1
This notebook builds some initial scikit-learn models using the v1 features

In [1]:
import mne
import os
import sys
import pytz
import datetime
import numpy as np
import pandas as pd
from sleepecg import detect_heartbeats

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
sys.path.insert(0, '..') 
import src.features.feature_generation_utils as seal_fe # feature extraction functions
import src.features.feature_generation as seal_fgen # feature generation wrapper

# Table of Contents
## [Wednesday Feature Extraction](#wednesday_feature_extraction)
# [SKlearn Initial Modelling](#sklearn)
### [SVM Classifier](#svm_classifier)
### [KNN Classifier](#k-nearest-neighbors)
### [Random Forest](#random_forest)
### [LightGBM](#light_gbm)

In [4]:
# Construct the relative path to the folder containing processed data
data_path = os.path.abspath(os.path.join("..", "data", "raw"))
process_data_path = os.path.abspath(os.path.join("..", "data", "raw", "01_edf_data"))
print(process_data_path)

# Read the header information to identify channels and their sampling frequencies
info = mne.io.read_raw_edf(f'{process_data_path}/test12_Wednesday_05_ALL_PROCESSED.edf',
                           preload=False).info

# Print the channel information
print(info)

# Identify channels and their corresponding sampling frequencies
channels_info = info['chs']
sampling_freq_map = {}

/Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data
Extracting EDF parameters from /Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data/test12_Wednesday_05_ALL_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
<Info | 8 non-empty values
 bads: []
 ch_names: ECG_Raw_Ch1, ECG_ICA2, LEOG_Pruned_Ch2, LEMG_Pruned_Ch4, ...
 chs: 16 EEG
 custom_ref_applied: False
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: 2019-10-25 08:21:02 UTC
 nchan: 16
 projs: []
 sfreq: 500.0 Hz
 subject_info: 1 item (dict)
>


In [5]:
# Load the EDF file, excluding the EOGs and EKG channels
raw = mne.io.read_raw_edf(f'{process_data_path}/test12_Wednesday_05_ALL_PROCESSED.edf', 
                          include=['ECG_Raw_Ch1', 'EEG_ICA5', 'GyrZ', 'MagZ',
                                   'ODBA', 'Pressure'], preload=True)
raw # Outputs summary data about file

# Inspect Data
print(raw.info)
print('The channels are:', raw.ch_names)
print('The sampling frequency is:', raw.info['sfreq'])

# Rename channels (replace spaces if any)
channel_renaming_dict = {name: name.replace(' ', '_') for name in raw.ch_names}
raw.rename_channels(channel_renaming_dict)

# Assuming 'raw' is your Raw object from MNE
channel_types = {}

for ch in raw.ch_names:
    if ch.startswith('ECG'):
        channel_types[ch] = 'ecg'
    elif ch.startswith('EEG'):
        channel_types[ch] = 'eeg'
    elif ch in ['pitch', 'roll', 'heading']:
        channel_types[ch] = 'resp'
    elif ch in ['GyrZ', 'MagZ', 'ODBA']:
        channel_types[ch] = 'syst'
    elif ch in ['Pressure']:
        channel_types[ch] = 'misc'
    elif ch == 'Heart_Rate':
        channel_types[ch] = 'bio'

# Now set the channel types
raw.set_channel_types(channel_types)

# Extract the measurement date (start time) from raw.info
start_time = raw.info['meas_date']
fs = raw.info['sfreq']

# Define the PST timezone
pst_timezone = pytz.timezone('America/Los_Angeles')

# Convert to datetime object in PST
if isinstance(start_time, datetime.datetime):
    # If it's already a datetime object, just replace the timezone
    recording_start_datetime = start_time.replace(tzinfo=None).astimezone(pst_timezone)
elif isinstance(start_time, (int, float)):
    # Convert timestamp to datetime in PST
    recording_start_datetime = pst_timezone.datetime.datetime.fromtimestamp(start_time, pst_timezone)
else:
    # Handle other formats if necessary
    pass

# Calculate the recording duration in seconds
recording_duration_seconds = len(raw) / fs

# Calculate the recording end datetime
recording_end_datetime = recording_start_datetime + datetime.timedelta(seconds=recording_duration_seconds)

# Calculate duration as a timedelta object
duration_timedelta = datetime.timedelta(seconds=recording_duration_seconds)

# Create a time index
#time_index = pd.date_range(recoring_start_datetime, recording_end_datetime)

# Format duration into days, hours, minutes, and seconds
days = duration_timedelta.days
hours, remainder = divmod(duration_timedelta.seconds, 3600)
minutes, seconds = divmod(remainder, 60)

print('The start time in PST (Los Angeles) is:', recording_start_datetime)
print('The end time in PST (Los Angeles) is:', recording_end_datetime)
print(f'Duration: {days} days, {hours} hours, {minutes} minutes, {seconds} seconds')


Extracting EDF parameters from /Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data/test12_Wednesday_05_ALL_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 158957499  =      0.000 ... 317914.998 secs...
<Info | 8 non-empty values
 bads: []
 ch_names: ECG_Raw_Ch1, EEG_ICA5, GyrZ, MagZ, ODBA, Pressure
 chs: 6 EEG
 custom_ref_applied: False
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: 2019-10-25 08:21:02 UTC
 nchan: 6
 projs: []
 sfreq: 500.0 Hz
 subject_info: 1 item (dict)
>
The channels are: ['ECG_Raw_Ch1', 'EEG_ICA5', 'GyrZ', 'MagZ', 'ODBA', 'Pressure']
The sampling frequency is: 500.0
The start time in PST (Los Angeles) is: 2019-10-25 08:21:02-07:00
The end time in PST (Los Angeles) is: 2019-10-29 00:39:37-07:00
Duration: 3 days, 16 hours, 18 minutes, 35 seconds


  raw.set_channel_types(channel_types)


In [6]:
# Load labeled data
# Path to CSV with scored data
file_path = f'{data_path}/02_hypnogram_data/test12_Wednesday_06_Hypnogram_JKB_1Hz.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)
df['R.Time'] = pd.to_datetime(df['R.Time']).dt.tz_localize('America/Los_Angeles')

print('The start time for labels in PST (Los Angeles) is:', df['R.Time'].min())
print('The end time for labels in PST (Los Angeles) is:', df['R.Time'].max())

df['Simple.Sleep.Code'].value_counts(normalize=True)

The start time for labels in PST (Los Angeles) is: 2019-10-25 14:45:22-07:00
The end time for labels in PST (Los Angeles) is: 2019-10-29 00:36:53-07:00


Simple.Sleep.Code
Active Waking    0.456175
SWS              0.196561
Quiet Waking     0.143221
REM              0.106362
Drowsiness       0.075781
Unscorable       0.021901
Name: proportion, dtype: float64

<a id='wednesday_feature_extraction'></a>

# Calculate Features for Wednesday the Seal

In [7]:
start_index = int((df['R.Time'].min() - recording_start_datetime).total_seconds() * 500)
end_index = int(((df['R.Time'].max() - recording_start_datetime).total_seconds() + 1) * 500)

In [8]:
eeg_subset = raw.copy().pick(['EEG_ICA5']).get_data()[0, start_index:end_index]
ecg_subset = raw.copy().pick(['ECG_Raw_Ch1']).get_data()[0, start_index:end_index] # TODO: Use ECG_Raw_Ch1

In [9]:
print(len(eeg_subset))
print(len(ecg_subset))

147346000
147346000


In [10]:
# heart rate for the subset time period
hr_subset = seal_fe.get_heart_rate(ecg_subset)

  heart_rates = [60 / ((rpeaks_corrected[i+1] - rpeaks_corrected[i]) / fs) for i in range(len(rpeaks_corrected) - 1)]


Filled: 0


In [11]:
delta_power = seal_fe.get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.5, 4), ref_power=1e-14,
                                           freq=500, window_sec=30, step_size=1)
print('done')
zero_crossings = seal_fe.get_rolling_zero_crossings(eeg_subset, 0, len(eeg_subset), window_sec=10)
print('done')
absolute_power = seal_fe.get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.4, 30),
                                                 ref_power=1e-14, freq=500, window_sec=30, step_size=1)
print('done')
hr_subset = seal_fe.get_heart_rate(ecg_subset)
hr_mean, hr_std = seal_fe.get_rolling_mean_std(hr_subset, 0, len(hr_subset), window_sec=30, freq=500)
print('done')
vlf_power = seal_fe.get_rolling_band_power_fourier_sum(hr_subset, 0, len(hr_subset), freq_range=(0.001, 0.05),
                                               window_sec=30, freq=500, ref_power=1)
print('done')
_, vlf_power_std = seal_fe.get_rolling_mean_std(vlf_power, 0, len(vlf_power), freq=1, window_sec=60)


done
done
done
Filled: 0
done


  delta_power = 10 * np.log(np.sum(power_spectrum[delta_freq_indices]) * freq_resolution / ref_power)


done


  x = asanyarray(arr - arrmean)


In [12]:
for myarr in [delta_power, zero_crossings, absolute_power, hr_subset, hr_mean, hr_std, vlf_power, vlf_power_std]:
    print(len(myarr))

294692
294692
294692
147346000
294692
294692
294692
294692


In [13]:
features_v1 = pd.DataFrame({
    'Delta Power': delta_power,
    'Rolling Zero Crossings': zero_crossings,
    'Rolling Absolute Power': absolute_power,
    'Heart Rate': [hr_subset[i] for i in range(0, len(hr_subset), 500)], # Downsample from 500 Hz to 1 Hz
    'Heart Rate Mean': hr_mean,
    'Heart Rate Std.Dev': hr_std,
    'Heart Rate Very Low Frequency Power': vlf_power,
    'Heart Rate VLF Power Std.Dev': vlf_power_std,
    'Simple.Sleep.Code': df['Simple.Sleep.Code']
})

In [14]:
features_v1.index = df['R.Time']

In [15]:
features_v1.to_csv('../data/processed/v1_features/Wednesday_features_v1.csv')

In [16]:
with open('../data/processed/v1_features/Wednesday_features_v1_meta.txt', 'w') as f: 
    f.write(
"""
delta_power = get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.5, 4), ref_power=1e-14,
                                           freq=500, window_sec=30, step_size=1)
zero_crossings = get_rolling_zero_crossings(eeg_subset, 0, len(eeg_subset), window_sec=10)
absolute_power = get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.4, 30),
                                                 ref_power=1e-14, freq=500, window_sec=30, step_size=1)
hr_subset = get_heart_rate(ecg_subset)
hr_mean, hr_std = get_rolling_mean_std(hr_subset, 0, len(hr_3subset), window_sec=30, freq=500)
vlf_power = get_rolling_band_power_fourier_sum(hr_subset, 0, len(hr_subset), freq_range=(0.001, 0.05),
                                               window_sec=30, freq=500, ref_power=1)
_, vlf_power_std = get_rolling_mean_std(vlf_power, 0, len(vlf_power), freq=1, window_sec=60)

features_v1 = pd.DataFrame({
    'Delta Power': delta_power,
    'Rolling Zero Crossings': zero_crossings,
    'Rolling Absolute Power': absolute_power,
    'Heart Rate': [hr_subset[i] for i in range(0, len(hr_subset), 500)], # Downsample from 500 Hz to 1 Hz
    'Heart Rate Mean': hr_mean,
    'Heart Rate Std.Dev': hr_std,
    'Heart Rate Very Low Frequency Power': vlf_power,
    'Heart Rate VLF Power Std.Dev': vlf_power_std,
    'Simple.Sleep.Code': df['Simple.Sleep.Code']
})
"""
)


<a id='sklearn'></a>

# Scikit-Learn on Wednesday

In [17]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
# hours in features_v1
(features_v1.index[-1] - features_v1.index[0]).total_seconds() / 60 / 60

81.8586111111111

In [19]:
model_train_data = features_v1.dropna()
train_X, test_X, train_y, test_y = train_test_split(model_train_data.drop('Simple.Sleep.Code', axis=1),
                                                    model_train_data['Simple.Sleep.Code'],
                                                    test_size=0.50, shuffle=False)

<a id='svm_classifier'></a>

## SVC (support vector machine classifier)

In [None]:
param_grid = {'C': [0.1,1, 10], 'gamma': [1,0.1,0.01,0.001,0.0001],'kernel': ['rbf']} # kernel: 'sigmoid', 'poly' excluded
grid = GridSearchCV(SVC(),param_grid,cv=2,refit=True,verbose=2,scoring='f1_weighted',n_jobs=8)
grid.fit(train_X,train_y)

In [22]:
print(grid.best_params_)

{'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}


In [None]:
best_params = {'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}

In [20]:
model_svc = SVC(**best_params, n_jobs=8)

In [23]:
def print_model_stats(model, data):
    train_X, test_X, train_y, test_y = train_test_split(data.drop('Simple.Sleep.Code', axis=1),
                                                        data['Simple.Sleep.Code'],
                                                        test_size=0.50, shuffle=False)
    from IPython.display import display, HTML
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    print('Accuracy:')
    print(np.mean(preds == test_y))
    print()
    
    print('Actual Target Dist')
    print(test_y.value_counts(normalize=True).sort_index())
    print()
    
    print('Prediction Target Dist')
    print(pd.Series(preds).value_counts(normalize=True).sort_index())
    print()
    
    labels=['Active Waking', 'Quiet Waking', 'Drowsiness', 'SWS', 'REM', 'Unscorable']
    display(pd.DataFrame(
        confusion_matrix(test_y, preds),
        index=[f'True_{x}' for x in labels],
        columns=[f'Predicted_{x}' for x in labels]
    ))

In [24]:
print_model_stats(model_svc, model_train_data)

Accuracy:
0.5756532252039313

Actual Target Dist
Simple.Sleep.Code
Active Waking    0.597775
Drowsiness       0.034062
Quiet Waking     0.123282
REM              0.051898
SWS              0.160428
Unscorable       0.032554
Name: proportion, dtype: float64

Prediction Target Dist
Active Waking    0.458422
Drowsiness       0.116389
Quiet Waking     0.070474
REM              0.163390
SWS              0.190558
Unscorable       0.000768
Name: proportion, dtype: float64



Unnamed: 0,Predicted_Active Waking,Predicted_Quiet Waking,Predicted_Drowsiness,Predicted_SWS,Predicted_REM,Predicted_Unscorable
True_Active Waking,61821,4723,2883,6012,12557,15
True_Quiet Waking,376,1892,1272,517,958,0
True_Drowsiness,914,2925,3651,10109,552,0
True_SWS,111,1317,1740,4284,189,0
True_REM,1144,6045,774,2649,13008,0
True_Unscorable,3128,234,56,485,792,98


In [25]:
model_svc.fit(train_X, train_y)

In [26]:
preds_svc_test = model_svc.predict(test_X)

In [27]:
model_svc.fit(test_X, test_y)

In [28]:
preds_svc_train = model_svc.predict(train_X)

<a id='k-nearest-neighbors'></a>

## K Nearest Neighbors

In [None]:
param_grid = {
    'n_neighbors': [10,20,40,100,200],
    'p': [1,2,3],
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'chebyshev')
}
grid = GridSearchCV(KNeighborsClassifier(),param_grid,cv=2,refit=True,verbose=2,n_jobs=8,scoring='f1_weighted')
grid.fit(train_X,train_y)

In [30]:
print(grid.best_params_)

{'metric': 'minkowski', 'n_neighbors': 200, 'p': 3, 'weights': 'distance'}


In [31]:
best_params = {'metric': 'minkowski', 'n_neighbors': 200, 'p': 3, 'weights': 'distance'}

In [32]:
model_knn = KNeighborsClassifier(**best_params, n_jobs=8)

In [33]:
print_model_stats(model_knn, model_train_data)

Accuracy:
0.568460446509227

Actual Target Dist
Simple.Sleep.Code
Active Waking    0.597775
Drowsiness       0.034062
Quiet Waking     0.123282
REM              0.051898
SWS              0.160428
Unscorable       0.032554
Name: proportion, dtype: float64

Prediction Target Dist
Active Waking    0.446204
Drowsiness       0.073504
Quiet Waking     0.113753
REM              0.171757
SWS              0.193988
Unscorable       0.000795
Name: proportion, dtype: float64



Unnamed: 0,Predicted_Active Waking,Predicted_Quiet Waking,Predicted_Drowsiness,Predicted_SWS,Predicted_REM,Predicted_Unscorable
True_Active Waking,59991,2328,7584,6197,11871,40
True_Quiet Waking,681,1765,811,557,1201,0
True_Drowsiness,1238,2247,3987,9766,913,0
True_SWS,49,896,2030,4340,326,0
True_REM,677,3464,2010,3934,13535,0
True_Unscorable,3059,122,326,494,715,77


In [34]:
model_knn.fit(train_X, train_y)

In [35]:
preds_knn_test = model_knn.predict(test_X)

In [36]:
model_knn.fit(test_X, test_y)

In [37]:
preds_knn_train = model_knn.predict(train_X)

<a id='random_forest'></a>

## Random Forest

In [None]:
param_grid = {
    'n_estimators': [25, 50, 100, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [50, 25, 10, 5]
}
grid = GridSearchCV(RandomForestClassifier(),param_grid,cv=2,refit=True,verbose=2,n_jobs=8,
                    scoring='f1_weighted')
grid.fit(train_X,train_y)

In [39]:
print(grid.best_params_)

{'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'n_estimators': 200}


In [40]:
best_params = {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'n_estimators': 200}

In [41]:
model_rfc = RandomForestClassifier(**best_params, n_jobs=8)

In [42]:
print_model_stats(model_rfc, model_train_data)

Accuracy:
0.510551446366594

Actual Target Dist
Simple.Sleep.Code
Active Waking    0.597775
Drowsiness       0.034062
Quiet Waking     0.123282
REM              0.051898
SWS              0.160428
Unscorable       0.032554
Name: proportion, dtype: float64

Prediction Target Dist
Active Waking    0.423803
Drowsiness       0.259300
Quiet Waking     0.048855
REM              0.114718
SWS              0.152054
Unscorable       0.001270
Name: proportion, dtype: float64



Unnamed: 0,Predicted_Active Waking,Predicted_Quiet Waking,Predicted_Drowsiness,Predicted_SWS,Predicted_REM,Predicted_Unscorable
True_Active Waking,57137,14607,1955,3436,10754,122
True_Quiet Waking,518,2449,1019,492,537,0
True_Drowsiness,1007,6022,2239,8114,769,0
True_SWS,94,2190,1422,3751,184,0
True_REM,764,11918,556,854,9528,0
True_Unscorable,2877,991,2,243,615,65


In [50]:
model_rfc.fit(train_X, train_y)

In [51]:
preds_rfc_test = model_rfc.predict(test_X)

In [52]:
model_rfc.fit(test_X, test_y)

In [53]:
preds_rfc_train = model_rfc.predict(train_X)

#### save model results

In [54]:
features_v1_with_preds = model_train_data.copy()
features_v1_with_preds['SVC Predictions'] = np.concatenate([preds_svc_train, preds_svc_test])
features_v1_with_preds['KNN Predictions'] = np.concatenate([preds_knn_train, preds_knn_test])
features_v1_with_preds['RFC Predictions'] = np.concatenate([preds_rfc_train, preds_rfc_test])
features_v1_with_preds.to_csv('../data/processed/v1_features/Wednesday_features_v1_with_predictions.csv')

<a id='light_gbm'></a>
# LightGBM
added later, but used for reference to see how lightgbm model performs with v1 features

In [55]:
from lightgbm import LGBMClassifier
best_params = {'learning_rate': 0.005, 'n_estimators': 400, 'num_leaves': 10}

In [58]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Initialize k-fold
n_splits = 5  # Define the number of splits for k-fold
skf = KFold(n_splits=n_splits, shuffle=False)

# Initialize arrays to store accuracies
class_accuracies = []
overall_accuracies = []

X, y = model_train_data.drop('Simple.Sleep.Code', axis=1), model_train_data['Simple.Sleep.Code']
kfold_preds = []
# Perform k-fold cross-validation
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f'Fold {fold + 1}/{n_splits}')
    kmodel = LGBMClassifier(**best_params, n_jobs=8)
    
    X_tr, X_val = X.iloc[train_index], X.iloc[val_index]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Train the model without unscorable so we can see what it predicts it as
    train_filter = y_tr != 'Unscorable'
    X_tr, y_tr = X_tr.loc[train_filter], y_tr.loc[train_filter]
    
    # Train the model
    kmodel.fit(X_tr, y_tr)
    
    # Predict on validation set
    y_pred = pd.Series(kmodel.predict(X_val), index=y_val.index)
    kfold_preds.append(y_pred)

    # Calculate accuracy per class
    class_accuracy = []
    for class_label in np.sort(np.unique(y_val)):
        class_accuracy.append(np.sum((y_pred == y_val) & (y_val == class_label)) / 
                              np.sum(y_val == class_label))

    class_accuracies.append(pd.Series(class_accuracy, index=np.unique(y_val)))
    overall_accuracies.append(np.mean(y_pred == y_val))

# Calculate mean accuracy per class across folds
mean_class_accuracies = pd.concat(class_accuracies, axis=1).mean(axis=1).round(4) * 100
print("Overall accuracy: ", round(np.mean(overall_accuracies) * 100, 2), '%', sep='')
print("Mean class accuracies across folds:")
print(mean_class_accuracies)


Fold 1/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 229393, number of used features: 8
[LightGBM] [Info] Start training from score -0.679246
[LightGBM] [Info] Start training from score -2.568106
[LightGBM] [Info] Start training from score -1.951830
[LightGBM] [Info] Start training from score -2.314085
[LightGBM] [Info] Start training from score -1.740401
Fold 2/5
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 230004, number of used features: 8
[Light