In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local reader / features / training modules) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Global libs 
import logging 
from datetime import datetime 
from collections import defaultdict
import math
import os

# ML/DS libs
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, log_loss, precision_score, recall_score, roc_auc_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC

# Local libs 
import reader
import preprocessing
import features
import training 
import selection
import visualize

In [7]:
# Session labels to load: the PRE and POST sessions followed by the eight
# numbered test sessions TEST01 ... TEST08.
sessions = ['PRE', 'POST'] + [f'TEST{index:02d}' for index in range(1, 9)]

# Participant IDs to load. The stop value is exclusive, so this covers IDs 1-9.
# NOTE(review): the full cohort is reportedly P001 - P073 -- widen the stop
# value if every participant should be processed (TODO confirm).
participants = range(1, 10)

# raw_dir is read for the feature signals; extracted_dir is read for the
# annotation target and is also handed to the per-participant frame builder.
raw_dir = 'data/raw_data/'
extracted_dir = 'data/extracted_data_old/'

In [23]:
# Data stream that supplies the labels, the column inside it used as the
# target, and the threshold forwarded to the annotation formatter.
target_feature = 'annotation'
target_key = 'R1'
target_keys = [target_key]
target_threshold = 3


def target_function(df):
    """Format an annotation dataframe into the training target.

    Delegates to ``features.format_annotation`` with a window size of 20.0 and
    a stride of 5.0 (presumably seconds, given the 'Time (s)' time key -- TODO
    confirm), aggregating each window with a column-wise mean.

    Parameters
    ----------
    df : the annotation dataframe to format.

    Returns
    -------
    Whatever ``features.format_annotation`` returns for ``df``.
    """
    def window_mean(values):
        # Mean over axis 0 of the window.
        return np.mean(values, axis=0)

    return features.format_annotation(
        df,
        window_size=20.0,
        stride=5.0,
        window_fn=window_mean,
        threshold=target_threshold,
        time_key='Time (s)',
        target_keys=target_keys,
    )


In [17]:
# Each entry describes one feature-extraction job. Fields, by position:
#   [0] FROM feature: the raw data stream the job reads
#   [1] file format of that stream ('wav' / 'excel') -- presumably how the
#       reader decides which loader to use; verify against reader.get_pts_data
#   [2] TO feature: name of the extracted feature set
#   [3] extraction function, applied to the raw dataframe
#   [4] format function, applied to the extracted features
#   [5] whether or not to write the extraction
#   [6] use existing (extraction)
# The 20.0 / 10.0 and 5.0 arguments match the "<window>sec_<stride>sec" suffix
# of the feature names; the 4 and 64 passed to the EDA / HRV extractors are
# presumably the sampling rates in Hz -- TODO confirm.
features_to_extract = [

    [
        'audio',
        'wav',
        'eGeMAPSv02_20sec_5sec',
        lambda df : features.get_audio_features(df['Audio'].to_numpy(), df.sr, 20.0, 5.0, 'eGeMAPSv02', 'LLDs'),
        lambda df : features.format_extracted_features(
            df,
            time_key='Time (s)',
            shift_fn=lambda frame : preprocessing.shift_dataframe(frame, 4, False)
        ),
        False,
        False
    ],
    [
        'audio',
        'wav',
        'eGeMAPSv02_10sec_5sec',
        lambda df : features.get_audio_features(df['Audio'].to_numpy(), df.sr, 10.0, 5.0, 'eGeMAPSv02', 'LLDs'),
        lambda df : features.format_extracted_features(
            df,
            time_key='Time (s)',
            sampling_fn=lambda frame : preprocessing.downsample_dataframe(frame, 2, 'last'),
            shift_fn=lambda frame : preprocessing.shift_dataframe(frame, 4, False)
        ),
        False,
        False
    ],
    [
        'E4_EDA_PPT',
        'excel',
        'EDA_20sec_5sec',
        lambda df : features.get_EDA_features(df['EDA'].to_numpy(), 4, 20.0, 5.0, df['Time (s)'].to_numpy()),
        lambda df : features.format_extracted_features(
            df,
            time_key='Time (s)',
            shift_fn=lambda frame : preprocessing.shift_dataframe(frame, 4, False)
        ),
        False,
        False
    ],

    [
        'E4_BVP_PPT',
        'excel',
        'HRV_20sec_5sec',
        lambda df : features.get_HRV_features(df['BVP'].to_numpy(), 64, 20.0, 5.0, df['Time (s)'].to_numpy()),
        lambda df : features.format_extracted_features(
            df,
            time_key='Time (s)',
            shift_fn=lambda frame : preprocessing.shift_dataframe(frame, 4, False)
        ),
        False,
        False
    ]
]

# De-duplicate while preserving first-seen order. A plain set() would order
# the strings by their (per-process randomized) hash, so the lists could come
# out in a different order on every kernel restart.
data_features = list(dict.fromkeys(f[0] for f in features_to_extract))
data_formats = list(dict.fromkeys((f[0], f[1]) for f in features_to_extract))

In [12]:
# Load the raw feature streams for every requested participant / session.
# The second return value is kept so the session filter can consult it later.
(features_data, features_missing) = reader.get_pts_data(
    raw_dir,
    data_formats,
    participants,
    sessions,
)

# Load the annotation data (excel format) that provides the target.
(target_data, target_missing) = reader.get_pts_data(
    extracted_dir,
    [(target_feature, 'excel')],
    participants,
    sessions,
)

In [13]:
# Streams to look for when validating sessions: every feature source plus the
# target annotation (presumably a session is valid only when none of these is
# missing -- verify against training.get_valid_pts_sessions).
feature_search = [*data_features, target_feature]

missing_reports = [features_missing, target_missing]
valid_pts_sessions = training.get_valid_pts_sessions(
    participants,
    missing_reports,
    sessions,
    feature_search,
)

In [21]:
# Build the per-participant dataframes from the loaded features and targets,
# restricted to the valid sessions found above in this notebook's state.
pt_dfs = training.get_pt_dfs(
    features_data,
    target_data,
    valid_pts_sessions,
    target_feature,
    target_function,
    features_to_extract,
    extracted_dir,
)

Valid sessions for Participant 1: ['PRE', 'TEST01', 'TEST02', 'TEST03', 'TEST04']
Valid sessions for Participant 3: ['PRE']
Valid sessions for Participant 4: ['PRE', 'TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 5: ['PRE', 'POST', 'TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 6: ['PRE']
Valid sessions for Participant 8: ['PRE', 'POST', 'TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 9: ['PRE']


In [34]:
# Replace missing values with 0 in every participant frame, in place.
# Dropping the 't0' / 'tn' columns stays disabled: a previous run raised a
# KeyError because those columns were not present in the frames.
for pt_df in pt_dfs.values():
    pt_df.fillna(0, inplace=True)

KeyError: "['t0' 'tn'] not found in axis"

In [33]:
# Combine all participant frames into one dataframe.
df_all = pd.concat(pt_dfs.values(), axis=0, ignore_index=True)

# Holds normalized dataframes, keyed like pt_dfs.
normalized_pt_dfs = {}

# All keys except target key
feature_keys = [key for key in df_all.columns if key != target_key]

# Compute mean and std over all feature keys
df_mean = df_all[feature_keys].mean(axis=0)
df_std = df_all[feature_keys].std(axis=0)

# Drop any keys that are too correlated with each other (keeps one).
# -0.9 / 0.9 are the correlation bounds handed to the helper.
_, corr_dropped_keys, corr_kept_keys = selection.drop_within_correlations(df_all, -0.9, 0.9, feature_keys)

# Start from a *copy* of the correlation drops so that the appends below do
# not mutate corr_dropped_keys through a shared list object.
invalid_features = list(corr_dropped_keys)
already_invalid = set(invalid_features)

# Also drop (near-)constant features. math.isclose() defaults to abs_tol=0,
# in which case a comparison against 0.0 only succeeds for an exact 0.0; an
# explicit absolute tolerance is required to catch tiny non-zero stds that
# would otherwise blow up the division below.
ZERO_STD_ABS_TOL = 1e-12
for feature in feature_keys:
    if feature in already_invalid:
        continue  # already dropped; avoid duplicate labels
    if math.isclose(df_std[feature], 0.0, abs_tol=ZERO_STD_ABS_TOL):
        invalid_features.append(feature)
        already_invalid.add(feature)

# Keep list of features we kept
valid_features = [f for f in feature_keys if f not in already_invalid]

# Drop all the invalid features from our mean and std
df_mean = df_mean.drop(labels=invalid_features)
df_std = df_std.drop(labels=invalid_features)

# Normalize the keys we're keeping to 0 mean 1 std, using the statistics of
# the combined dataframe for every participant.
for pt, pt_df in pt_dfs.items():
    normalized = pt_df.drop(columns=invalid_features)
    normalized[valid_features] = normalized[valid_features].sub(df_mean, axis=1).div(df_std, axis=1)
    normalized_pt_dfs[pt] = normalized

# Do the same for the whole dataframe
df_all = df_all.drop(columns=invalid_features)
df_all[valid_features] = df_all[valid_features].sub(df_mean, axis=1).div(df_std, axis=1)

# Resample the combined dataframe to equalize distribution
df_all_resampled = training.eq_class_dist(df_all, target_key, [0,1], 'under')

# Apply LASSO to combined dataframe.
# NOTE(review): the normalization statistics and the LASSO selection are both
# computed over all participants, including data that is later held out during
# cross-validation -- possible information leakage; confirm this is intended.
dropped_keys, kept_keys = selection.select_by_LASSO(df_all_resampled, target_key)
print('Dropped:', dropped_keys)
print('\nKept:', kept_keys)

Loudness_sma3 alphaRatio_sma3


array([[       nan,        nan],
       [       nan,        nan],
       [       nan, 6.2723855 ],
       ...,
       [6.27255284, 6.27780145],
       [6.27373301, 6.27360075],
       [6.26931072, 6.27454259]])

array([[        nan,         nan],
       [        nan,         nan],
       [        nan, -5.0068742 ],
       ...,
       [-4.95775782, -4.94313005],
       [-4.91847036, -4.96212897],
       [-4.9237738 , -4.9046453 ]])

IndexError: index 148 is out of bounds for axis 0 with size 4

In [27]:
# Only keep the keys that made it through lasso and other filtering methods,
# plus the target column.
final_keys = [*kept_keys, target_key]

# Slice the *normalized* per-participant frames. The feature selection was run
# on normalized data, and normalized_pt_dfs was otherwise never used: slicing
# the raw pt_dfs fed un-scaled features to the models, which matters for the
# scale-sensitive candidates (SVC / logistic regression).
final_dfs = {pt: df[final_keys] for pt, df in normalized_pt_dfs.items()}

NameError: name 'kept_keys' is not defined

In [None]:
# Equalize class distribution again for easier viewing
df_plot = training.eq_class_dist(df_all, target_key, [0, 1], method='under')
print('Plotting final keys')

# pairplot is a figure-level function: it builds its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind
# and does not affect the plot size. Use pairplot's height= / aspect= to
# resize the grid if needed.
sns_plot = sns.pairplot(df_plot[final_keys], hue=target_key)

# Make sure the output folder exists before saving; savefig does not create it.
os.makedirs('plots', exist_ok=True)
sns_plot.savefig("plots/plot.png")

In [None]:
# Classifier under evaluation. Alternatives that were tried are kept below for
# quick swapping.
model = RandomForestClassifier(n_estimators=200, max_depth=100, random_state=500)
#model = LinearSVC(C=0.1, penalty='l2', dual=False, max_iter=10000)
#model = SVC(C=0.01, random_state=500, max_iter=100000) # Need to have max_iter otherwise this can take forever
#model = LogisticRegression(max_iter=10000, C=0.1)
#model = GradientBoostingClassifier(n_estimators=300, max_depth=2, learning_rate=0.1)

# Use the configured target_key rather than a hard-coded 'R1', so evaluation
# stays in sync with the column the features were selected against.
#metrics = training.LOOCV_subject(final_dfs.keys(), final_dfs, target_key, [0, 1], model, resample_method='under', show_confusion=True)
metrics = training.k_fold_CV(final_dfs.keys(), final_dfs, target_key, [0, 1], model, n_folds=10, resample_method='under', show_confusion=True)

# Report the mean of each collected metric.
for key in ['accuracy','recall','precision','f1']:
    print(f'Average {key} = {np.mean(metrics[key])}')
