In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to the local .py modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
# Global libs 
import logging 
from datetime import datetime 
from collections import defaultdict
import math
import os

# ML/DS libs
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, log_loss, precision_score, recall_score, roc_auc_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC

# Local libs 
import reader
import preprocessing
import features
import training 
import selection
import visualize

In [3]:
# Session identifiers 'TEST01' .. 'TEST08'.
sessions = [f'TEST{index:02d}' for index in range(1, 9)]

# Participant numbers 1 .. 73 (P001 - P073).
participants = range(1, 74)

# Directory paths handed to the loading / extraction calls.
raw_dir = 'data/raw_data/'
extracted_dir = 'data/extracted_data_old/'

In [4]:
# Target (label) configuration.
target_feature = 'annotation'
target_keys = ['R1', 'R2']
target_key = 'R1'
target_threshold = 3


def target_function(df):
    """Pass ``df`` through ``features.format_annotation`` with the target settings.

    Uses a 20.0 window size and 5.0 stride, reduces each window with a
    column-wise mean (``np.mean(..., axis=0)``), and forwards
    ``target_threshold`` and ``target_keys`` as read at call time.
    """
    return features.format_annotation(
        df,
        window_size=20.0,
        stride=5.0,
        window_fn=lambda window: np.mean(window, axis=0),
        threshold=target_threshold,
        time_key='Time (s)',
        target_keys=target_keys,
    )


In [5]:
# Each entry has seven fields, in this order:
#   [0] FROM feature (source feature name)
#   [1] file format of the source feature
#   [2] TO feature (name of the extracted feature)
#   [3] extraction function
#   [4] format function
#   [5] whether or not to write the extraction
#   [6] whether or not to use an existing extraction
features_to_extract = [
    [
        'E4_EDA_PPT',
        'excel',
        'EDA_20sec_5sec',
        # NOTE(review): the positional 4, 20.0, 5.0 presumably are sampling rate,
        # window size and stride (matching the 'EDA_20sec_5sec' name) - confirm
        # against features.get_EDA_features.
        lambda df: features.get_EDA_features(
            df['EDA'].to_numpy(), 4, 20.0, 5.0, df['Time (s)'].to_numpy()
        ),
        lambda df: features.format_extracted_features(
            df,
            time_key='Time (s)',
            # Inner parameter renamed so it no longer shadows the outer `df`.
            shift_fn=lambda frame: preprocessing.shift_dataframe(frame, 4, False)
        ),
        False,
        False
    ]
]

# De-duplicate while keeping first-seen order. dict preserves insertion order,
# whereas list(set(...)) can come out in a different order on every kernel start
# (string hash randomization), which makes downstream ordering irreproducible.
data_features = list(dict.fromkeys(entry[0] for entry in features_to_extract))
data_formats = list(dict.fromkeys((entry[0], entry[1]) for entry in features_to_extract))

In [6]:
# Read the input features from raw_dir.
features_data, features_missing = reader.get_pts_data(
    raw_dir,
    data_formats,
    participants,
    sessions,
)

# Read the target feature (stored as 'excel') from extracted_dir.
target_data, target_missing = reader.get_pts_data(
    extracted_dir,
    [(target_feature, 'excel')],
    participants,
    sessions,
)

In [7]:
# Names to look up: every input feature followed by the target feature.
feature_search = [*data_features, target_feature]

valid_pts_sessions = training.get_valid_pts_sessions(
    participants,
    [features_missing, target_missing],
    sessions,
    feature_search,
)

In [8]:
# Build the per-participant session frames; the loop below iterates this
# mapping with .items() and slices each value by session position.
pt_dfs = training.get_pt_dfs(
    features_data,
    target_data,
    valid_pts_sessions,
    target_feature,
    target_function,
    features_to_extract,
    extracted_dir,
)

Valid sessions for Participant 1: ['TEST01', 'TEST02', 'TEST03', 'TEST04']
Valid sessions for Participant 4: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 5: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 8: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 13: ['TEST01', 'TEST02', 'TEST03', 'TEST04']
Valid sessions for Participant 16: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 20: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 21: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 23: ['TEST01', 'TEST02', 'TEST03', 'TEST04', 'TEST05', 'TEST06', 'TEST07', 'TEST08']
Valid sessions for Participant 27: ['TES

In [11]:
# Per-participant evaluation: fit a logistic regression on the first sessions
# of a participant and score it on that participant's remaining sessions.
n_required_sessions = 8  # only participants with exactly this many session frames are evaluated
n_train_sessions = 6     # session_dfs[:6] are used for training, session_dfs[6:] for testing

for pt, session_dfs in pt_dfs.items():
    if len(session_dfs) != n_required_sessions:
        continue
    try:
        train_sessions = pd.concat(session_dfs[:n_train_sessions]).fillna(0)
        # Applied to the training split only; presumably equalizes the 0/1 class
        # counts by undersampling - confirm in training.eq_class_dist.
        train_sessions = training.eq_class_dist(train_sessions, target_key, [0, 1], method='under')
        test_sessions = pd.concat(session_dfs[n_train_sessions:]).fillna(0)

        # NOTE(review): only `target_key` is dropped from the feature matrix. If the
        # frames also carry the other annotation column(s) from target_keys, those
        # would be fed to the model as features (label leakage) - verify.
        x_train = train_sessions.drop(target_key, axis=1).to_numpy()
        y_train = train_sessions[target_key].to_numpy()
        x_test = test_sessions.drop(target_key, axis=1).to_numpy()
        y_test = test_sessions[target_key].to_numpy()

        model = LogisticRegression(solver='liblinear')
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        # NOTE(review): recall is undefined when y_test contains no positive
        # samples; a 0.000 recall may reflect that rather than a poor model.
        print(f'Participant {pt}: Recall:{recall_score(y_test, y_pred):.3f} Acc:{accuracy_score(y_test, y_pred):.3f}')
    except Exception as exc:
        # Still best-effort (the loop moves on to the next participant), but the
        # failure is reported instead of silently discarded. Unlike a bare
        # `except:`, this lets KeyboardInterrupt stop the loop.
        logging.warning('Participant %s skipped: %r', pt, exc)

Participant 5: 0.200 0.579
Participant 8: 1.000 0.689
Participant 16: 0.357 0.730
Participant 20: 0.000 0.790
Participant 21: 0.000 0.863
Participant 23: 0.000 0.766
Participant 32: 0.613 0.317
Participant 41: 0.000 0.722
Participant 42: 0.000 0.907
Participant 51: 0.385 0.880
Participant 53: 0.419 0.512
Participant 60: 0.731 0.725
Participant 62: 0.649 0.356
Participant 71: 0.722 0.649
Participant 73: 0.500 0.667


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
