In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# pickle5 is a backport of pickle protocol 5 for older interpreters and is
# typically not installable on recent Python versions, where the standard
# library `pickle` already understands that protocol. Fall back to the
# standard module so this cell does not fail on a modern kernel.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

In [94]:
'''
    Optional: Steps to prepare data.
    Alternatively, read in already processed file in cell below
'''

import preprocessor as prep

# Read the recordings and strip columns using the project's preprocessor helpers.
raw_data = prep.clear_columns(prep.read_data())

# Signal conditioning, applied in this order:
#   1. band-pass filter (original note: "limit 250Hz")
#   2. edge clean-up
#   3. normalization
for conditioning_step in (prep.apply_bp_filter, prep.clear_edges, prep.normalize_data):
    raw_data = conditioning_step(raw_data)

# Split the conditioned recording into picture blocks, then cut each block into
# samples. NOTE(review): the units of sample_length / sample_step are not
# visible here (presumably seconds) -- confirm in preprocessor.
picture_blocks = prep.extract_picture_blocks(raw_data)
data_samples = prep.extract_data_samples(
    picture_blocks,
    sample_length=2,
    sample_step=2,
)

# Build the feature table (original note: 15 features, defined in features.py).
final_data = prep.extract_features_for_all_data(data_samples)
final_data.head()

################################################################
Step 1 - Collecting data from .txt files
Collecting data from: aun_start16s.txt
Collecting data from: kas_start16s.txt
Collecting data from: ingrid_start7s.txt
Collecting data from: ele_start15s.txt
Collecting data from: kau_start23s.txt
Collecting data from: marie_start9s.txt
Collecting data from: alar_start24s.txt
Step 2 - Done
################################################################

Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracting features...
Extracti

Unnamed: 0,apen,fmean,fmed,fmode,intrange,mav,mavfd,mavsd,peak,rms,sampen,srange,std,var,zc,Emotion,Muscle,Person,Sample idx
0,0.348922,0.297715,0.300781,97.645657+65.859915j,0.73316,0.331975,0.539612,0.874312,0.565392,0.369964,0.375472,1.157271,0.369963,0.136873,299,happy,Zygomaticus Major,aun,0
1,0.226602,0.30007,0.300781,67.496357+95.333692j,0.698057,0.3278,0.533154,0.86099,0.566129,0.36678,0.245257,1.134112,0.36678,0.134527,300,happy,Zygomaticus Major,aun,1
2,0.324885,0.299863,0.300781,30.513549+23.107629j,0.753091,0.329021,0.535712,0.86804,0.586717,0.366916,0.336129,1.19874,0.366915,0.134627,299,happy,Zygomaticus Major,aun,2
3,0.292069,0.299166,0.300781,24.674829+26.778399j,0.713777,0.330327,0.536233,0.867013,0.618406,0.368607,0.314431,1.205682,0.368602,0.135868,300,happy,Zygomaticus Major,aun,3
4,0.539241,0.29676,0.300781,72.088227+80.914239j,0.662158,0.303459,0.491866,0.796976,0.55342,0.339462,0.603989,1.155964,0.339447,0.115225,300,happy,Zygomaticus Major,aun,4


In [95]:
'''
    Optional [Step 2]: Combine zygomaticus & corrugator readings for selected windows
'''

from sklearn.preprocessing import LabelEncoder

# Sort the unique values instead of iterating raw sets: the iteration order of
# a set of strings can change between interpreter sessions, which would
# silently swap which muscle receives the `_x` / `_y` suffix and reshuffle the
# row order (and therefore any seeded train/test split) from run to run.
participants = sorted(set(final_data.Person))
emotions = sorted(set(final_data.Emotion))
muscles = sorted(set(final_data.Muscle))

# Collect the per-(participant, emotion) merges and concatenate once at the end
# rather than growing the frame inside the loop.
merged_windows = []

for participant in participants:
    for emotion in emotions:
        selection = (final_data.Person == participant) & (final_data.Emotion == emotion)
        s1 = final_data[selection & (final_data.Muscle == muscles[0])]
        s2 = final_data[selection & (final_data.Muscle == muscles[1])]

        # Pair the two muscles' rows that share the same sample index; columns
        # from muscles[0] get the `_x` suffix, those from muscles[1] get `_y`.
        s = pd.merge(s1, s2, on=['Sample idx', 'Emotion', 'Person'])
        s = s.drop(columns=['Muscle_x', 'Muscle_y', 'Sample idx', 'Person'])
        merged_windows.append(s)

formatted_df = pd.concat(merged_windows)

# Replace the emotion names with integer class labels; `emotions_le` is kept so
# the labels can be mapped back with `inverse_transform`.
emotions_le = LabelEncoder()
formatted_df['Emotion'] = emotions_le.fit_transform(formatted_df['Emotion'])
formatted_df.head()

Unnamed: 0,apen_x,fmean_x,fmed_x,fmode_x,intrange_x,mav_x,mavfd_x,mavsd_x,peak_x,rms_x,...,mav_y,mavfd_y,mavsd_y,peak_y,rms_y,sampen_y,srange_y,std_y,var_y,zc_y
0,0.703716,0.297315,0.300781,3.857791-27.983526j,0.530114,0.239779,0.389784,0.628871,0.457745,0.269492,...,0.075735,0.122803,0.198492,0.138249,0.084725,0.589629,0.289563,0.084722,0.007178,300
1,0.675895,0.28647,0.300781,83.593420+8.784230j,0.521056,0.240644,0.387864,0.628781,0.486805,0.269554,...,0.086698,0.120913,0.195467,0.308097,0.104154,1.045735,0.587438,0.104154,0.010848,254
2,0.831419,0.298709,0.300781,3.922971-81.933789j,0.502573,0.238452,0.381597,0.616544,0.527613,0.26822,...,0.135024,0.125587,0.203155,0.695475,0.186991,0.537025,1.250999,0.186856,0.034915,222
3,0.741667,0.278089,0.300781,30.211786+1.083549j,0.483293,0.238713,0.380999,0.616876,0.544474,0.270065,...,0.106034,0.121779,0.197858,0.382165,0.135589,0.759954,0.828687,0.135437,0.018343,232
4,0.601913,0.293854,0.300781,14.326529-80.102364j,0.520299,0.237645,0.383255,0.619769,0.491606,0.265446,...,0.121316,0.122574,0.197079,0.53582,0.161968,0.718163,1.122203,0.161896,0.02621,212


In [None]:
'''
    Ensemble Classifier from TPOT
'''

from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: The outcome column in the data file must be labeled 'Emotion'; that is
# the column used as the target below.
tpot_data = pd.read_csv('preprocessed_data.csv')

# Use the same feature set as the TPOT search cell, which excludes the
# `fmode_x` / `fmode_y` columns. errors='ignore' makes the second drop a no-op
# when the file was saved without them.
features = (
    tpot_data
    .drop(columns='Emotion')
    .drop(columns=['fmode_x', 'fmode_y'], errors='ignore')
)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['Emotion'], random_state=42)

# Average CV score on the training set was: 0.7454545454545455
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.0001),
    RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.05, n_estimators=100), step=0.15000000000000002),
    StackingEstimator(estimator=LinearSVC(C=25.0, dual=True, loss="squared_hinge", penalty="l2", tol=1e-05)),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=4, max_features=0.6000000000000001, min_samples_leaf=1, min_samples_split=8, n_estimators=100, subsample=0.9500000000000001)
)

# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

# Mean accuracy on the held-out split
print('Mean accuracy: %r' % np.mean(results == testing_target.values))

In [5]:
# Load the pre-computed feature table. pandas is tried first; if that fails for
# any reason, fall back to the `pickle` module imported at the top of the
# notebook.
# NOTE(security): unpickling executes arbitrary code -- only load files from a
# trusted source.
try:
    df = pd.read_pickle("data_sample_len2_sample_step2.pkl")
except Exception:
    try:
        with open('data_sample_len2_sample_step2.pkl', 'rb') as fh:
            df = pickle.load(fh)
    except Exception:
        # Keep the original message, but re-raise: swallowing the error would
        # leave `df` undefined and only surface later as a confusing NameError.
        print('Cannot open pickle file')
        raise

In [51]:
# Cast the label-like columns to categoricals and discard the bookkeeping
# columns. errors="ignore" keeps the cell re-runnable: on a second execution
# "Person" / "Sample idx" are already gone and a plain drop would raise KeyError.
df = (
    df.astype({"Muscle": "category", "Emotion": "category"})
      .drop(columns=["Person", "Sample idx"], errors="ignore")
)

# Feature matrix: every column except the target and `fmode` (which shows up
# as complex-valued in the previews), with the categorical `Muscle` column
# expanded into indicator columns by get_dummies.
X = pd.get_dummies(df.drop(columns=["Emotion", "fmode"]))
# Target: the emotion label of each sample.
y = df["Emotion"]

In [9]:
# Preview the first three rows of the prepared frame.
df.head(3)

Unnamed: 0,apen,fmean,fmed,fmode,intrange,mav,mavfd,mavsd,peak,rms,sampen,srange,std,var,zc,Emotion,Muscle
0,1.307227,0.238228,0.253906,11.391353+7.735965j,0.296268,0.184651,0.253215,0.396857,1.000022,0.233531,1.939744,1.718309,0.233519,0.054531,238,happy,Zygomaticus Major
1,1.326351,0.244564,0.261719,14.179856+12.743834j,0.356517,0.205449,0.280853,0.439035,0.704597,0.257154,2.101206,1.537603,0.257139,0.066121,242,happy,Zygomaticus Major
2,1.252011,0.243792,0.269531,14.682492+0.998472j,0.342912,0.205583,0.298171,0.466317,0.912664,0.26291,1.887932,1.691022,0.262871,0.069101,258,happy,Zygomaticus Major
3,1.325152,0.076673,0.015625,10.933231-0.048197j,0.206222,0.136505,0.147402,0.23789,0.564521,0.181422,1.931197,1.221809,0.181422,0.032914,187,happy,Zygomaticus Major
4,1.279311,0.227714,0.246094,12.965370+10.648279j,0.210121,0.129111,0.186362,0.294306,0.442515,0.166861,1.988861,1.005221,0.166834,0.027834,258,happy,Zygomaticus Major
5,1.279078,0.24333,0.257812,20.661735-0.038338j,0.208477,0.127903,0.170512,0.271788,0.582802,0.163754,1.914128,1.02706,0.163351,0.026683,246,happy,Zygomaticus Major
6,1.289047,0.149394,0.152344,9.940519+0.146295j,0.3221,0.187468,0.214039,0.338752,0.862564,0.233311,1.924678,1.559338,0.233082,0.054327,202,happy,Zygomaticus Major
7,1.206356,0.117703,0.046875,10.952140+6.187237j,0.197548,0.108668,0.064651,0.103962,0.28862,0.130342,1.404475,0.637425,0.130329,0.016986,64,neutral,Zygomaticus Major
8,1.248492,0.087167,0.007812,13.232429+2.899074j,0.089472,0.054208,0.05905,0.094815,0.220781,0.068833,1.654984,0.358381,0.067974,0.00462,204,neutral,Zygomaticus Major
9,1.24838,0.199107,0.300781,7.806749+10.534022j,0.083308,0.051128,0.056432,0.090647,0.197932,0.066204,1.690715,0.457629,0.066022,0.004359,223,neutral,Zygomaticus Major


In [53]:
# Hold out 10% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [96]:
from sklearn.svm import SVC

# Baseline: RBF-kernel SVM, scored as the fraction of correct hold-out predictions.
clf = SVC(kernel="rbf").fit(X_train, y_train)
rbf_hits = clf.predict(X_test) == y_test
print(np.mean(rbf_hits))

0.2972972972972973


In [97]:
# Baseline: linear-kernel SVM, scored as the fraction of correct hold-out predictions.
clf = SVC(kernel="linear").fit(X_train, y_train)
linear_hits = clf.predict(X_test) == y_test
print(np.mean(linear_hits))

0.32432432432432434


In [98]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 3-nearest-neighbours, scored as the fraction of correct hold-out predictions.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_hits = clf.predict(X_test) == y_test
print(np.mean(knn_hits))

0.4594594594594595


In [99]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: 50-tree random forest with a fixed seed; the cell's value is the
# fraction of correct hold-out predictions.
clf = RandomForestClassifier(n_estimators=50, random_state=5).fit(X_train, y_train)
forest_hits = clf.predict(X_test) == y_test
np.mean(forest_hits)

0.7297297297297297

In [10]:
from sklearn.model_selection import KFold

# n_splits == len(y) puts exactly one sample in every test fold, i.e. this is
# leave-one-out cross-validation.
kf = KFold(n_splits=len(y), random_state=0, shuffle=True)
accuracies = []

# Hoisted out of the loop: the underlying array does not change between folds.
X_values = X.values

for train_index, test_index in kf.split(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    X_train, X_test = X_values[train_index], X_values[test_index]
    # KFold yields positional indices. Select the labels positionally as well
    # (`.iloc`); plain `y[...]` is label-based and only matches the feature
    # rows while the index happens to be the default 0..n-1 range.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf.fit(X_train, y_train)
    # With a single test sample per fold this is 1.0 for a hit and 0.0 for a miss.
    acc = np.mean(clf.predict(X_test) == y_test)
    accuracies.append(acc)

In [11]:
# Average of the per-fold accuracies collected by the cross-validation loop.
np.asarray(accuracies).mean()

0.5277777777777778

In [12]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

seed = 0
# Seed the forest as well as the splitters; without it the scores below change
# on every run even though both KFold objects are seeded.
rf = RandomForestClassifier(n_estimators=100, random_state=seed)
param_grid = {"min_samples_split": [2, 4, 8, 16, 32]}

# Inner CV: 10-fold splitter used for hyperparameter tuning
inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)

# Outer CV: n_splits == len(y), i.e. leave-one-out
outer_cv = KFold(n_splits=len(y), shuffle=True, random_state=seed)

# Non-nested parameter search: the grid is tuned and scored on the same
# 10-fold inner splits, so `best_score_` is an optimistic estimate.
clf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=inner_cv)
clf.fit(X, y)
non_nested_score = clf.best_score_

# Nested CV: the grid search is re-run inside every outer (leave-one-out) fold
# and scored on the sample that fold held out.
clf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=inner_cv)
nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv).mean()

In [13]:
# Best mean cross-validated score found by the non-nested grid search (`clf.best_score_`).
non_nested_score

0.528

In [14]:
# Mean score of the grid-searched model across the outer cross-validation folds.
nested_score

0.5119047619047619

In [15]:
# Number of labelled samples; the same value is used as `n_splits` for the leave-one-out splitters.
len(y)

252

In [None]:
!pip insall tpot

In [None]:
'''
    Optional: Build Model pipeline with tpot
'''

from tpot import TPOTClassifier

# Features are every merged column except the target and the two `fmode` columns.
X = formatted_df.drop(columns=["Emotion", "fmode_x", "fmode_y"])
y = formatted_df["Emotion"]

# Keep a quarter of the rows aside to score the pipeline TPOT settles on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Search settings gathered in one place so they are easy to tune.
search_settings = dict(
    generations=10,
    population_size=20,
    cv=5,
    random_state=42,
    verbosity=2,
    scoring='accuracy',
)
pipeline_optimizer = TPOTClassifier(**search_settings)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))

# Write the best pipeline found to a standalone Python script.
pipeline_optimizer.export('pipeline_.py')