# Initial models features v1
This notebook extracts the v1 features for the seal "Wednesday" and uses them to fit some initial scikit-learn models (SVC, k-nearest neighbors and random forest).

In [1]:
import mne
import os
import sys
import pytz
import datetime
import numpy as np
import pandas as pd
from sleepecg import detect_heartbeats

In [2]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to the project's source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Put the project's src/features directory (resolved relative to the
# notebook's working directory) at the front of the import path.
current_path = os.getcwd()
src_path = os.path.abspath(os.path.join(current_path, '..', 'src', 'features'))
sys.path.insert(0, src_path) 
# Project-local modules; they only resolve after the sys.path change above.
import feature_extraction as seal_fe
import feature_generation as seal_fgen

# Table of Contents
## [Wednesday Feature Extraction](#wednesday_feature_extraction)
## [Scikit-Learn Initial Modelling](#sklearn)
### [SVM Classifier](#svm_classifier)
### [KNN Classifier](#k-nearest-neighbors)
### [Random Forest](#random_forest)

In [4]:
# Absolute locations of the raw-data folder and of its EDF sub-folder,
# resolved relative to the notebook's working directory
data_path = os.path.abspath(os.path.join("..", "data", "raw"))
process_data_path = os.path.join(data_path, "01_edf_data")
print(process_data_path)

# Open the recording without loading the signals (preload=False) and keep
# only its header information
edf_header = mne.io.read_raw_edf(
    f'{process_data_path}/test12_Wednesday_05_ALL_PROCESSED.edf',
    preload=False,
)
info = edf_header.info

# Show the header summary (channel names, channel count, sampling rate, ...)
print(info)

# Per-channel header entries, plus a channel -> sampling-frequency map that
# is created empty here and not filled in this cell
channels_info = info['chs']
sampling_freq_map = dict()

/Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data
Extracting EDF parameters from /Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data/test12_Wednesday_05_ALL_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
<Info | 8 non-empty values
 bads: []
 ch_names: ECG_Raw_Ch1, ECG_ICA2, LEOG_Pruned_Ch2, LEMG_Pruned_Ch4, ...
 chs: 16 EEG
 custom_ref_applied: False
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: 2019-10-25 08:21:02 UTC
 nchan: 16
 projs: []
 sfreq: 500.0 Hz
 subject_info: 1 item (dict)
>


In [5]:
# Load the EDF file, keeping only the channels used in this notebook
# (raw ECG, one EEG component, and the GyrZ / MagZ / ODBA / Pressure channels)
raw = mne.io.read_raw_edf(f'{process_data_path}/test12_Wednesday_05_ALL_PROCESSED.edf',
                          include=['ECG_Raw_Ch1', 'EEG_ICA5', 'GyrZ', 'MagZ',
                                   'ODBA', 'Pressure'], preload=True)

# Inspect Data
print(raw.info)
print('The channels are:', raw.ch_names)
print('The sampling frequency is:', raw.info['sfreq'])

# Rename channels (replace spaces if any)
channel_renaming_dict = {name: name.replace(' ', '_') for name in raw.ch_names}
raw.rename_channels(channel_renaming_dict)

# Map each channel name to the MNE channel type it should carry
channel_types = {}

for ch in raw.ch_names:
    if ch.startswith('ECG'):
        channel_types[ch] = 'ecg'
    elif ch.startswith('EEG'):
        channel_types[ch] = 'eeg'
    elif ch in ['pitch', 'roll', 'heading']:
        channel_types[ch] = 'resp'
    elif ch in ['GyrZ', 'MagZ', 'ODBA']:
        channel_types[ch] = 'syst'
    elif ch in ['Pressure']:
        channel_types[ch] = 'misc'
    elif ch == 'Heart_Rate':
        channel_types[ch] = 'bio'

# Now set the channel types
raw.set_channel_types(channel_types)

# Extract the measurement date (start time) and sampling frequency from raw.info
start_time = raw.info['meas_date']
fs = raw.info['sfreq']

# Define the PST timezone
pst_timezone = pytz.timezone('America/Los_Angeles')

# MNE reports meas_date as UTC, but this notebook treats the stored wall-clock
# reading as Los Angeles local time (the hypnogram labels are localized the
# same way). Strip the UTC tag and attach the Los Angeles zone explicitly:
# the previous `.replace(tzinfo=None).astimezone(...)` interpreted the naive
# value in the *machine's* local timezone, so the result depended on where
# the notebook was run.
if isinstance(start_time, datetime.datetime):
    start_wall_clock = start_time.replace(tzinfo=None)
elif isinstance(start_time, (int, float)):
    # POSIX timestamp: recover the same UTC wall-clock reading first so both
    # branches yield the same start time for the same recording
    start_wall_clock = datetime.datetime.fromtimestamp(
        start_time, datetime.timezone.utc).replace(tzinfo=None)
else:
    # Fail here with a clear message instead of a NameError further down
    raise TypeError(f'Unsupported meas_date type: {type(start_time).__name__}')
recording_start_datetime = pst_timezone.localize(start_wall_clock)

# Calculate the recording duration in seconds and as a timedelta object
recording_duration_seconds = len(raw) / fs
duration_timedelta = datetime.timedelta(seconds=recording_duration_seconds)

# Calculate the recording end datetime; normalize() corrects the UTC offset
# if the recording happens to span a daylight-saving change
recording_end_datetime = pst_timezone.normalize(recording_start_datetime + duration_timedelta)

# Format duration into days, hours, minutes, and seconds
days = duration_timedelta.days
hours, remainder = divmod(duration_timedelta.seconds, 3600)
minutes, seconds = divmod(remainder, 60)

print('The start time in PST (Los Angeles) is:', recording_start_datetime)
print('The end time in PST (Los Angeles) is:', recording_end_datetime)
print(f'Duration: {days} days, {hours} hours, {minutes} minutes, {seconds} seconds')


Extracting EDF parameters from /Users/michael/Desktop/capstone-seal-sleep/jessie-workshop/ecophys-ecoviz/data/raw/01_edf_data/test12_Wednesday_05_ALL_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 158957499  =      0.000 ... 317914.998 secs...
<Info | 8 non-empty values
 bads: []
 ch_names: ECG_Raw_Ch1, EEG_ICA5, GyrZ, MagZ, ODBA, Pressure
 chs: 6 EEG
 custom_ref_applied: False
 highpass: 0.0 Hz
 lowpass: 250.0 Hz
 meas_date: 2019-10-25 08:21:02 UTC
 nchan: 6
 projs: []
 sfreq: 500.0 Hz
 subject_info: 1 item (dict)
>
The channels are: ['ECG_Raw_Ch1', 'EEG_ICA5', 'GyrZ', 'MagZ', 'ODBA', 'Pressure']
The sampling frequency is: 500.0
The start time in PST (Los Angeles) is: 2019-10-25 08:21:02-07:00
The end time in PST (Los Angeles) is: 2019-10-29 00:39:37-07:00
Duration: 3 days, 16 hours, 18 minutes, 35 seconds


  raw.set_channel_types(channel_types)


In [6]:
# Load labeled data
# Path to CSV with scored data (the hypnogram file for this recording)
file_path = f'{data_path}/02_hypnogram_data/test12_Wednesday_06_Hypnogram_JKB_1Hz.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)
# Tag the label timestamps as Los Angeles local time so they can be compared
# with the timezone-aware recording start time
df['R.Time'] = pd.to_datetime(df['R.Time']).dt.tz_localize('America/Los_Angeles')

# Share of each sleep stage among the labels. A bare expression in the middle
# of a cell is evaluated but never displayed, so print it explicitly.
print(df['Sleep.Code'].value_counts(normalize=True))
print()

print('The start time for labels in PST (Los Angeles) is:', df['R.Time'].min())
print('The end time for labels in PST (Los Angeles) is:', df['R.Time'].max())


The start time for labels in PST (Los Angeles) is: 2019-10-25 14:45:22-07:00
The end time for labels in PST (Los Angeles) is: 2019-10-29 00:36:53-07:00


<a id='wednesday_feature_extraction'></a>

# Calculate Features for Wednesday the Seal

In [7]:
# Convert the labelled time span into sample indices of the recording.
# Use the recording's own sampling frequency (`fs`, read from raw.info) rather
# than a hard-coded 500 so the indices stay correct for other sampling rates.
label_start_offset = (df['R.Time'].min() - recording_start_datetime).total_seconds()
label_end_offset = (df['R.Time'].max() - recording_start_datetime).total_seconds()
start_index = int(label_start_offset * fs)
# + 1 second so the slice also covers the whole of the last labelled second
end_index = int((label_end_offset + 1) * fs)

# Slicing past the ends of an array truncates silently, which would misalign
# signals and labels; fail loudly instead.
if not 0 <= start_index < end_index <= len(raw):
    raise ValueError(
        f'Label span [{start_index}, {end_index}) falls outside the recording (0 to {len(raw)} samples)'
    )

In [8]:
# Extract only the labelled span of each signal directly from `raw`.
# Passing picks/start/stop to get_data avoids duplicating the whole
# multi-channel recording with raw.copy() once per signal.
eeg_subset = raw.get_data(picks=['EEG_ICA5'], start=start_index, stop=end_index)[0]
ecg_subset = raw.get_data(picks=['ECG_Raw_Ch1'], start=start_index, stop=end_index)[0]

In [9]:
# Both signals were sliced with the same indices; print each length to confirm
for signal_subset in (eeg_subset, ecg_subset):
    print(len(signal_subset))

147346000
147346000


In [10]:
# heart rate for the subset time period
# NOTE(review): get_heart_rate is a project helper; judging by the lengths
# printed further down it returns one value per ECG sample — confirm in its source.
hr_subset = seal_fe.get_heart_rate(ecg_subset)

  heart_rates = [60 / ((rpeaks_corrected[i+1] - rpeaks_corrected[i]) / fs) for i in range(len(rpeaks_corrected) - 1)]


Filled: 0


In [11]:
# Rolling EEG features over the labelled span
delta_power = seal_fe.get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.5, 4), ref_power=1e-14,
                                           freq=500, window_sec=30, step_size=1)
print('done')
zero_crossings = seal_fe.get_rolling_zero_crossings(eeg_subset, 0, len(eeg_subset), window_sec=10)
print('done')
absolute_power = seal_fe.get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.4, 30),
                                                 ref_power=1e-14, freq=500, window_sec=30, step_size=1)
print('done')

# Rolling heart-rate features. `hr_subset` was already computed from
# `ecg_subset` in the previous cell, so it is reused here rather than running
# get_heart_rate over the whole ECG subset a second time.
hr_mean, hr_std = seal_fe.get_rolling_mean_std(hr_subset, 0, len(hr_subset), window_sec=30, freq=500)
print('done')
vlf_power = seal_fe.get_rolling_band_power_fourier_sum(hr_subset, 0, len(hr_subset), freq_range=(0.001, 0.05),
                                               window_sec=30, freq=500, ref_power=1)
print('done')
# Only the rolling standard deviation of the VLF power is kept; freq=1 is
# passed because vlf_power is treated as a 1 Hz series here
_, vlf_power_std = seal_fe.get_rolling_mean_std(vlf_power, 0, len(vlf_power), freq=1, window_sec=60)


done
done
done
Filled: 0
done


  delta_power = 10 * np.log(np.sum(power_spectrum[delta_freq_indices]) * freq_resolution / ref_power)


done


  x = asanyarray(arr - arrmean)


In [12]:
# Boolean masks over the label rows, one per sleep stage of interest
# (Sleep.Num codes: 3 = drowsiness, 4 / 5 = LV / HV slow wave sleep, 7 = certain REM)
sleep_num = df['Sleep.Num']
sw1_filter = sleep_num.eq(4)
sw2_filter = sleep_num.eq(5)
rem_filter = sleep_num.eq(7)
drowsy_filter = sleep_num.eq(3)

In [13]:
# Sanity check: print the length of every feature array before they are combined
feature_arrays = (delta_power, zero_crossings, absolute_power, hr_subset,
                  hr_mean, hr_std, vlf_power, vlf_power_std)
for feature_array in feature_arrays:
    print(len(feature_array))

294692
294692
294692
147346000
294692
294692
294692
294692


In [14]:
# Heart rate has one value per ECG sample; keep every 500th value
# (downsample from 500 Hz to 1 Hz) so it lines up with the other columns
heart_rate_1hz = [hr_subset[idx] for idx in range(0, len(hr_subset), 500)]

# One column per feature, plus the sleep-stage label, in the saved column order
feature_columns = {
    'Delta Power': delta_power,
    'Rolling Zero Crossings': zero_crossings,
    'Rolling Absolute Power': absolute_power,
    'Heart Rate': heart_rate_1hz,
    'Heart Rate Mean': hr_mean,
    'Heart Rate Std.Dev': hr_std,
    'Heart Rate Very Low Frequency Power': vlf_power,
    'Heart Rate VLF Power Std.Dev': vlf_power_std,
    'Sleep.Num': df['Sleep.Num'],
}
features_v1 = pd.DataFrame(feature_columns)

In [15]:
# Index the feature table by the label timestamps (one row per hypnogram row)
features_v1.index = df['R.Time']

In [16]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a no-op when it is already there)
features_dir = '../data/processed/v1_features'
os.makedirs(features_dir, exist_ok=True)
features_v1.to_csv(f'{features_dir}/Wednesday_features_v1.csv')

In [17]:
# Save, next to the feature CSV, a plain-text record of the calls that produced
# it. The text must mirror the code that was actually run: the earlier version
# contained `len(hr_3subset)` and `subset['Sleep.Num']`, neither of which
# matches the feature cells in this notebook.
with open('../data/processed/v1_features/Wednesday_features_v1_meta.txt', 'w') as f:
    f.write(
"""
delta_power = get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.5, 4), ref_power=1e-14,
                                           freq=500, window_sec=30, step_size=1)
zero_crossings = get_rolling_zero_crossings(eeg_subset, 0, len(eeg_subset), window_sec=10)
absolute_power = get_rolling_band_power_welch(eeg_subset, 0, len(eeg_subset), freq_range=(0.4, 30),
                                                 ref_power=1e-14, freq=500, window_sec=30, step_size=1)
hr_subset = get_heart_rate(ecg_subset)
hr_mean, hr_std = get_rolling_mean_std(hr_subset, 0, len(hr_subset), window_sec=30, freq=500)
vlf_power = get_rolling_band_power_fourier_sum(hr_subset, 0, len(hr_subset), freq_range=(0.001, 0.05),
                                               window_sec=30, freq=500, ref_power=1)
_, vlf_power_std = get_rolling_mean_std(vlf_power, 0, len(vlf_power), freq=1, window_sec=60)

features_v1 = pd.DataFrame({
    'Delta Power': delta_power,
    'Rolling Zero Crossings': zero_crossings,
    'Rolling Absolute Power': absolute_power,
    'Heart Rate': [hr_subset[i] for i in range(0, len(hr_subset), 500)], # Downsample from 500 Hz to 1 Hz
    'Heart Rate Mean': hr_mean,
    'Heart Rate Std.Dev': hr_std,
    'Heart Rate Very Low Frequency Power': vlf_power,
    'Heart Rate VLF Power Std.Dev': vlf_power_std,
    'Sleep.Num': df['Sleep.Num']
})
"""
)


<a id='sklearn'></a>

# Scikit-Learn on Wednesday

In [18]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [19]:
# Time covered by features_v1 (first to last index timestamp), in hours
covered_span = features_v1.index[-1] - features_v1.index[0]
covered_span.total_seconds() / 60 / 60

81.8586111111111

In [20]:
# Keep only rows with every feature present, then cut the table in half.
# shuffle=False preserves row order: the first half of the rows becomes the
# training split and the second half the test split.
model_train_data = features_v1.dropna()
feature_table = model_train_data.drop('Sleep.Num', axis=1)
stage_labels = model_train_data['Sleep.Num']
train_X, test_X, train_y, test_y = train_test_split(
    feature_table, stage_labels, test_size=0.50, shuffle=False
)

<a id='svm_classifier'></a>

## SVC (support vector machine classifier)

In [None]:
# Hyper-parameter grid for the SVC: every combination of C, gamma and kernel
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['sigmoid', 'rbf']} # kernels searched: 'sigmoid' and 'rbf' ('poly' is not included)
# 2-fold cross-validated search scored by balanced accuracy; refit=True refits
# the best combination on all of train_X afterwards
grid = GridSearchCV(SVC(),param_grid,cv=2,refit=True,verbose=2,scoring='balanced_accuracy')
grid.fit(train_X,train_y)

In [53]:
# Best hyper-parameter combination found by the SVC grid search
print(grid.best_params_)

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}


In [21]:
# SVC configured with the hyper-parameters printed by the grid search
model_svc = SVC(C=1, gamma=0.001, kernel='rbf')

In [22]:
def print_model_stats(model, data):
    """Fit ``model`` on the first half of ``data`` and report results on the second half.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training half.
    data : pandas.DataFrame
        Feature table with a 'Sleep.Num' label column; every other column is
        used as a feature. Rows are split 50/50 without shuffling.

    Returns
    -------
    None
        Prints the accuracy and the actual / predicted class proportions, and
        displays the confusion matrix as a DataFrame.
    """
    from IPython.display import display

    train_X, test_X, train_y, test_y = train_test_split(data.drop('Sleep.Num', axis=1),
                                                        data['Sleep.Num'], test_size=0.50, shuffle=False)
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    print('Accuracy:')
    print(np.mean(preds == test_y))
    print()

    print('Actual Target Dist')
    print(test_y.value_counts(normalize=True).sort_index())
    print()

    print('Prediction Target Dist')
    print(pd.Series(preds).value_counts(normalize=True).sort_index())
    print()

    # Derive the matrix labels from the classes present in the data instead of
    # a hard-coded range(0, 8): the row/column names then always match the
    # matrix shape, even when a class is missing from this split.
    labels = np.unique(data['Sleep.Num'])
    display(pd.DataFrame(
        confusion_matrix(test_y, preds, labels=labels),
        index=[f'Actual {x}' for x in labels],
        columns=[f'Predicted {x}' for x in labels]
    ))

In [23]:
# Fit the SVC on the first half of the rows and report accuracy, class
# proportions and the confusion matrix on the second half
print_model_stats(model_svc, model_train_data)

Accuracy:
0.5908606203856525

Actual Target Dist
Sleep.Num
0    0.032554
1    0.597775
2    0.123282
3    0.034062
4    0.054248
5    0.106180
6    0.023473
7    0.028425
Name: proportion, dtype: float64

Prediction Target Dist
0    0.001963
1    0.613838
2    0.051579
3    0.104007
4    0.065244
5    0.065217
6    0.006860
7    0.091292
Name: proportion, dtype: float64



Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7
Actual 0,84,4001,18,143,322,57,5,163
Actual 1,204,73531,1672,3200,4420,2307,124,2553
Actual 2,0,5295,2318,2191,1767,227,432,5921
Actual 3,1,836,1092,2048,80,447,85,426
Actual 4,0,2222,780,1927,1093,688,72,1205
Actual 5,0,3347,194,4770,1333,5810,20,159
Actual 6,0,650,891,359,292,55,152,1057
Actual 7,0,494,629,675,299,11,120,1957


In [24]:
# Fit on the training split (first half of the rows)
model_svc.fit(train_X, train_y)

In [25]:
# Predictions for the test split from the model most recently fitted above
preds_svc_test = model_svc.predict(test_X)

In [26]:
# Refit the same estimator on the test split (the two halves swap roles) so
# that the training rows can also be predicted by a model that never saw them
model_svc.fit(test_X, test_y)

In [27]:
# Predictions for the training split from the model most recently fitted above
preds_svc_train = model_svc.predict(train_X)

<a id='k-nearest-neighbors'></a>

## K Nearest Neighbors

In [None]:
# Hyper-parameter search for k-nearest neighbors.
# `p` is the power of the Minkowski metric only; crossing it with 'chebyshev'
# fitted the same chebyshev model once per `p` value. Two sub-grids keep every
# distinct configuration while dropping those duplicate fits.
n_neighbors_options = [1,5,10,20,40,100,200,500]
weights_options = ['uniform', 'distance']
param_grid = [
    {
        'metric': ['minkowski'],
        'n_neighbors': n_neighbors_options,
        'p': [1,2,3,4,5],
        'weights': weights_options,
    },
    {
        'metric': ['chebyshev'],
        'n_neighbors': n_neighbors_options,
        'weights': weights_options,
    },
]
# 2-fold cross-validated search scored by balanced accuracy, run on 8 workers
grid = GridSearchCV(KNeighborsClassifier(),param_grid,cv=2,refit=True,verbose=2,n_jobs=8,scoring='balanced_accuracy')
grid.fit(train_X,train_y)

In [38]:
# Best hyper-parameter combination found by the KNN grid search
print(grid.best_params_)

{'metric': 'minkowski', 'n_neighbors': 100, 'p': 1, 'weights': 'distance'}


In [28]:
# KNN classifier configured with the hyper-parameters printed by the grid search
model_knn = KNeighborsClassifier(
    n_neighbors=100,
    weights='distance',
    p=1,
    metric='minkowski',
    n_jobs=8,
)

In [29]:
# Fit the KNN model on the first half of the rows and report accuracy, class
# proportions and the confusion matrix on the second half
print_model_stats(model_knn, model_train_data)

Accuracy:
0.5191841392098131

Actual Target Dist
Sleep.Num
0    0.032554
1    0.597775
2    0.123282
3    0.034062
4    0.054248
5    0.106180
6    0.023473
7    0.028425
Name: proportion, dtype: float64

Prediction Target Dist
0    0.000822
1    0.438033
2    0.087617
3    0.114229
4    0.022292
5    0.158805
6    0.023215
7    0.154988
Name: proportion, dtype: float64



Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7
Actual 0,98,2967,192,219,104,674,32,507
Actual 1,22,58874,5166,4268,1515,11217,663,6286
Actual 2,1,1199,3338,2652,382,443,1258,8878
Actual 3,0,655,892,2150,134,679,94,411
Actual 4,0,288,955,2163,388,1321,473,2399
Actual 5,0,427,450,4195,633,8877,197,854
Actual 6,0,31,1144,419,53,122,459,1228
Actual 7,0,51,763,752,73,48,242,2256


In [30]:
# Lookup table from the numeric sleep stage (Sleep.Num) to its name (Sleep.Code)
stage_pairs = df[['Sleep.Code', 'Sleep.Num']].drop_duplicates()
stage_pairs.set_index('Sleep.Num', drop=True).sort_index()

Unnamed: 0_level_0,Sleep.Code
Sleep.Num,Unnamed: 1_level_1
0,Unscorable
1,Active Waking
2,Quiet Waking
3,Drowsiness
4,LV Slow Wave Sleep
5,HV Slow Wave Sleep
6,Putative REM Sleep
7,Certain REM Sleep


In [31]:
# Fit on the training split (first half of the rows)
model_knn.fit(train_X, train_y)

In [32]:
# Predictions for the test split from the model most recently fitted above
preds_knn_test = model_knn.predict(test_X)

In [33]:
# Refit the same estimator on the test split (the two halves swap roles) so
# that the training rows can also be predicted by a model that never saw them
model_knn.fit(test_X, test_y)

In [34]:
# Predictions for the training split from the model most recently fitted above
preds_knn_train = model_knn.predict(train_X)

<a id='random_forest'></a>

## Random Forest

In [None]:
# Hyper-parameter search for the random forest.
# 'log_loss' is left out of `criterion`: scikit-learn documents it as the same
# Shannon information gain as 'entropy', so it only repeated identical fits.
param_grid = {
    'n_estimators': [25, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 100, 50, 25, 10, 5]
}
# A fixed random_state makes the forests, and therefore the selected
# parameters, reproducible from one run to the next.
grid = GridSearchCV(RandomForestClassifier(random_state=42),param_grid,cv=2,refit=True,verbose=2,n_jobs=8,
                    scoring='balanced_accuracy')
grid.fit(train_X,train_y)

In [61]:
# Best hyper-parameter combination found by the random forest grid search
print(grid.best_params_)

{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}


In [35]:
# Random forest configured with the hyper-parameters printed by the grid search.
# random_state is fixed so the reported metrics and the saved predictions are
# reproducible; without it every run grows a different forest.
model_rfc = RandomForestClassifier(
    criterion='gini',
    max_depth=10,
    max_features='log2',
    n_estimators=100,
    n_jobs=8,
    random_state=42,
)

In [36]:
# Fit the random forest on the first half of the rows and report accuracy,
# class proportions and the confusion matrix on the second half
print_model_stats(model_rfc, model_train_data)

Accuracy:
0.502747383363558

Actual Target Dist
Sleep.Num
0    0.032554
1    0.597775
2    0.123282
3    0.034062
4    0.054248
5    0.106180
6    0.023473
7    0.028425
Name: proportion, dtype: float64

Prediction Target Dist
0    0.000869
1    0.441762
2    0.050553
3    0.254512
4    0.020967
5    0.095639
6    0.004870
7    0.130828
Name: proportion, dtype: float64



Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7
Actual 0,104,3003,0,965,108,343,0,270
Actual 1,24,59837,1903,14503,1606,6403,62,3673
Actual 2,0,761,2456,4423,453,225,348,9485
Actual 3,0,448,1169,2583,6,382,96,331
Actual 4,0,469,444,4366,540,758,1,1409
Actual 5,0,453,44,8923,205,5873,0,135
Actual 6,0,56,867,650,122,69,180,1512
Actual 7,0,14,560,1059,47,28,30,2447


In [37]:
# Sleep.Num -> Sleep.Code reference table, for reading the confusion matrix
(
    df[['Sleep.Code', 'Sleep.Num']]
    .drop_duplicates()
    .set_index('Sleep.Num', drop=True)
    .sort_index()
)

Unnamed: 0_level_0,Sleep.Code
Sleep.Num,Unnamed: 1_level_1
0,Unscorable
1,Active Waking
2,Quiet Waking
3,Drowsiness
4,LV Slow Wave Sleep
5,HV Slow Wave Sleep
6,Putative REM Sleep
7,Certain REM Sleep


In [39]:
# Fit on the training split (first half of the rows)
model_rfc.fit(train_X, train_y)

In [45]:
# Predictions for the test split. The estimator defined in this notebook is
# `model_rfc`; `model_rf` is never defined and fails on a fresh kernel.
preds_rfc_test = model_rfc.predict(test_X)

In [41]:
# Refit the same estimator on the test split (the two halves swap roles) so
# that the training rows can also be predicted by a model that never saw them
model_rfc.fit(test_X, test_y)

In [46]:
# Predictions for the training split. The estimator defined in this notebook
# is `model_rfc`; `model_rf` is never defined and fails on a fresh kernel.
preds_rfc_train = model_rfc.predict(train_X)

#### Save model results

In [47]:
# Attach every model's predictions to a copy of the modelling table and save it.
# Each column is the training-split predictions followed by the test-split
# predictions, which matches the row order of the unshuffled split.
prediction_halves = {
    'SVC Predictions': (preds_svc_train, preds_svc_test),
    'KNN Predictions': (preds_knn_train, preds_knn_test),
    'RFC Predictions': (preds_rfc_train, preds_rfc_test),
}
features_v1_with_preds = model_train_data.copy()
for column_name, (train_half_preds, test_half_preds) in prediction_halves.items():
    features_v1_with_preds[column_name] = np.concatenate([train_half_preds, test_half_preds])
features_v1_with_preds.to_csv('../data/processed/v1_features/Wednesday_features_v1_with_predictions.csv')