In [1]:
# Model Template: Engagement -- P5, P7, P8, P9

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import metrics

# 1 -- Load, Choose Dataset

### Choose Datasets: Regular, Smoothed, Windowed

In [3]:
# Choose which version of the per-participant master files to load by
# setting DATASET to one of the keys below (instead of commenting and
# uncommenting blocks of paths):
#   'regular' -- Regular Data: All Frames, No Smoothing
#   'smooth'  -- Smoothed Data: All Frames, Smoothing of Open Face Features
#   'window'  -- Windowed Data: 1 Second Overlapping Windows,
#                Feature Median + Variance in Window
DATA_ROOT = '../../../../Google Drive File Stream/My Drive/USC Expeditions Year 5/Analysis/Engagement/Data'

# variant name -> (sub-folder under DATA_ROOT, file-name suffix)
DATASET_VARIANTS = {
    'regular': ('Master', ''),
    'smooth': ('Master_Smooth', '_smooth'),
    'window': ('Master_Window', '_window'),
}

DATASET = 'smooth'

variant_dir, variant_suffix = DATASET_VARIANTS[DATASET]
# e.g. <DATA_ROOT>/Master_Smooth/p5_master_smooth.csv
path_template = DATA_ROOT + '/' + variant_dir + '/p{}_master' + variant_suffix + '.csv'

p5_file = path_template.format(5)
p7_file = path_template.format(7)
p8_file = path_template.format(8)
p9_file = path_template.format(9)

data5 = pd.read_csv(p5_file)
data7 = pd.read_csv(p7_file)
data8 = pd.read_csv(p8_file)
data9 = pd.read_csv(p9_file)

In [4]:
# Per-participant frames and their participant ids, kept in the same order:
# data[k] holds the rows of participant part[k]. split() relies on this
# pairing when it looks a participant up with part.index(...).
data = [data5, data7, data8, data9]
part = [5, 7, 8, 9]

# 2 -- Choose Feature Set

Feature Dictionary: https://docs.google.com/spreadsheets/d/1ewoVPHwW68Ins0AOVZf-0lsl_wW0_ZzuByuDiNJETBY/edit?usp=sharing

### Data Overview

In [5]:
# Main Columns: every column of the first participant's frame whose name
# contains none of the feature-group markers ('of_', 'op_', 'p_', 'a_').
basic_cols = sorted(
    name for name in data[0].columns
    if not any(marker in name for marker in ('of_', 'op_', 'p_', 'a_'))
)

for name in basic_cols:
    print(name)

engagement
participant
session_num
timestamp


In [6]:
# Open Face Columns: names containing the 'of_' marker, listed alphabetically.

of_cols = sorted(col for col in data[0].columns if 'of_' in col)

for col in of_cols:
    print(col)

of_AU01_c
of_AU02_c
of_AU04_c
of_AU05_c
of_AU06_c
of_AU07_c
of_AU09_c
of_AU10_c
of_AU12_c
of_AU14_c
of_AU15_c
of_AU17_c
of_AU20_c
of_AU23_c
of_AU25_c
of_AU26_c
of_AU28_c
of_AU45_c
of_confidence
of_gaze_0_x
of_gaze_0_y
of_gaze_0_z
of_gaze_1_x
of_gaze_1_y
of_gaze_1_z
of_gaze_angle_x
of_gaze_angle_y
of_gaze_distance
of_pose_Rx
of_pose_Ry
of_pose_Rz
of_pose_Tx
of_pose_Ty
of_pose_Tz
of_pose_distance
of_success
of_ts_success


In [7]:
# Audio Columns: names containing the 'a_' marker, listed alphabetically.

a_cols = list(filter(lambda label: 'a_' in label, data[0].columns))
a_cols.sort()

for label in a_cols:
    print(label)

a_harmonicity
a_intensity
a_mfcc_0
a_mfcc_1
a_pitch_frequency
a_pitch_strength


In [8]:
# Performance Columns: names containing 'p_', excluding those that contain
# 'op_' (which would otherwise match because 'op_' itself contains 'p_').

p_cols = [field for field in data[0].columns
          if 'p_' in field and 'op_' not in field]
p_cols.sort()

for field in p_cols:
    print(field)

p_aptitude
p_diff_1
p_diff_2
p_diff_3
p_diff_4
p_diff_5
p_games_session
p_games_total
p_mistakes_game
p_mistakes_session
p_mistakes_total
p_no_game
p_skill_EM
p_skill_NC
p_skill_OS
p_ts_attempt
p_ts_game_played
p_ts_game_start
p_ts_robot_talked


In [9]:
# For Window Only:
# Among the non-basic columns, names containing 'change' or 'var' are treated
# as window features; everything else is a non-window feature (kept in the
# frame's column order, unsorted).
non_window_features = [
    col for col in data[0].columns
    if col not in basic_cols and not ('change' in col or 'var' in col)
]

window_features = sorted(
    col for col in data[0].columns
    if col not in basic_cols and ('change' in col or 'var' in col)
)

for feat in window_features:
    print(feat)

### Filter Feature Set (Optional)

In [10]:
# features_to_keep = data[0].columns

# always include basic_cols, add desired group of features 
# features_to_keep = basic_cols + of_cols + p_cols + a_cols

# for i,d in enumerate(data):
#    data[i] = d[features_to_keep]

# 3 -- Train-Test Split

In [11]:
# Helper Function: Split a Participant's Data Chronologically 

def part_split(split, split_size):
    """Split one participant's frame into an earlier and a later part.

    The rows are ordered by 'session_num' and then 'timestamp' (both
    ascending) and handed to train_test_split with shuffle=False, so the
    ordering is preserved and the test part is taken from the end.

    split: DataFrame with 'session_num' and 'timestamp' columns.
    split_size: forwarded to train_test_split as test_size.

    Returns (split_train, split_test).
    """
    ordered = split.sort_values(by=['session_num', 'timestamp'], ascending=True)
    earlier_rows, later_rows = train_test_split(ordered, test_size=split_size, shuffle=False)
    return earlier_rows, later_rows

In [12]:
# Function: Formulate Train-Test Split 

def split(train_part, test_part, split_size):
    """Assemble the training and test frames for one experiment.

    train_part: an array of participants (ids from the notebook-level `part`
        list) whose complete data goes into the training set.
    test_part: single participant to evaluate on.
    split_size: how much of test_part's data to use for testing. When it is
        below 1, part_split() divides that participant's data and the
        non-test portion is added to the training set; otherwise all of the
        participant's data is used for testing.

    Returns (train_data, test_data); train_data is one concatenated frame
    with a fresh index.

    Raises ValueError if test_part is also listed in train_part, or (from
    list.index) if a participant id is not in `part`.
    """
    # Guard against leakage: listing the test participant as a training
    # participant would put every test row into the training set as well.
    if test_part in train_part:
        raise ValueError(
            "test_part {!r} is also in train_part; its test rows would leak "
            "into the training data".format(test_part)
        )

    train_data = [data[part.index(p)] for p in train_part]

    test_data = data[part.index(test_part)]
    if split_size < 1:
        split_train, test_data = part_split(test_data, split_size)
        train_data.append(split_train)

    train_data = pd.concat(train_data, ignore_index=True, sort=True)

    return train_data, test_data

In [13]:
# Train-Test Split
#
# Exactly one split(...) call should be active. With perc >= 1, split() uses
# all of the test participant's data for testing; with perc < 1 it passes
# perc to part_split() as the test fraction and adds the remaining rows of
# that participant to the training data.

# Excluding Participant 8
perc = 1
# train, test = split(train_part=[5,7], test_part=9, split_size=perc)
train, test = split(train_part=[5,9], test_part=7, split_size=perc)
# train, test = split(train_part=[7,9], test_part=5, split_size=perc)

# Individualized (train and test on a single participant's own data)
# perc = 0.7
# train, test = split(train_part=[], test_part=7, split_size=perc)
# train, test = split(train_part=[], test_part=5, split_size=perc)
# train, test = split(train_part=[], test_part=9, split_size=perc)
# train, test = split(train_part=[], test_part=8, split_size=perc)

# 4 -- Scenario Based Modeling

- During Game / Outside of Game
- Open Face Success / Failure
- Robot Talking / Not Talking
- First 10 Minutes / After 10 Minutes

In [None]:
# Create Separate Models for Different Scenarios

# # Outside of Game
# train = train.loc[train['p_no_game']==1]
# test = test.loc[test['p_no_game']==1]

# # During Game
# train = train.loc[train['p_no_game']==0]
# test = test.loc[test['p_no_game']==0]

# # Open Face Success
# train = train.loc[train['of_success']==1]
# test = test.loc[test['of_success']==1]

# # Open Face Failure
# train = train.loc[train['of_success']==0]
# test = test.loc[test['of_success']==0]

# # Robot Talking
# train = train.loc[train['p_ts_robot_talked']==0]
# test = test.loc[test['p_ts_robot_talked']==0]

# # Robot Not Talking
# train = train.loc[train['p_ts_robot_talked']>0]
# test = test.loc[test['p_ts_robot_talked']>0]

# # First 10 Minutes of Session
# train = train.loc[train['timestamp']<=(10*60)]
# test = test.loc[test['timestamp']<=(10*60)]

# # After first 10 Minutes of Session
# train = train.loc[train['timestamp']>(10*60)]
# test = test.loc[test['timestamp']>(10*60)]

### Divide into X_train, y_train, X_test, y_test

In [14]:
# Labels are the 'engagement' column; features are everything that is not a
# basic (identifier/label) column.
y_train = train['engagement']
X_train = train.drop(columns=basic_cols)

y_test = test['engagement']
# split() builds the training frame with pd.concat(..., sort=True), which can
# reorder its columns, while the test frame keeps the column order of its
# source file. Select the test features in X_train's column order so both
# matrices line up column-for-column when they are handed to the model.
X_test = test.drop(columns=basic_cols)[list(X_train.columns)]

In [15]:
# Shuffle Training Data Together
#
# Apply one shared random permutation to X_train and y_train so every feature
# row stays paired with its label. A single seeded permutation replaces the
# five repeated train_test_split(..., test_size=0.0, shuffle=True) calls:
# repeating a uniform shuffle adds nothing, test_size=0.0 is rejected by
# current scikit-learn releases, and a fixed seed makes reruns reproducible.
SHUFFLE_SEED = 42

shuffled_order = np.random.RandomState(SHUFFLE_SEED).permutation(len(X_train))
X_train = X_train.iloc[shuffled_order]
y_train = y_train.iloc[shuffled_order]

# 5 -- Preprocessing
Standardization Recommended

### Min-Max Scaling
- x' = ( x - min(x) ) / ( max(x) - min(x) )
- => x' in [0, 1], fill NaN with -1

In [16]:
# for c in X_train.columns:
#     min_val = np.nanmin(X_train[c])
#     max_val = np.nanmax(X_train[c])
    
#     X_train[c] = (X_train[c]-min_val)/(max_val - min_val)
#     X_test[c] = (X_test[c]-min_val)/(max_val - min_val)
    
# X_train = X_train.fillna(-1)
# X_test = X_test.fillna(-1)

### Standardization (Recommended)
- x' = ( x - mean(x) ) / ( stdev(x) )
- => x' is a z-score; NaNs are filled with the column's minimum z-score from the training data (the maximum for `of_pose_distance`)

In [17]:
# Standardize every feature with statistics computed on the training data
# only, then fill missing values with an extreme of the standardized
# training column.
for c in X_train.columns:
    mean = np.nanmean(X_train[c])
    std = np.nanstd(X_train[c])

    # A constant training column has std == 0 and an all-NaN column has
    # mean/std == NaN (e.g. after a scenario filter such as of_success == 1).
    # Dividing by that would turn the whole column into NaN/inf, so fall
    # back to a neutral shift/scale instead.
    if not np.isfinite(mean):
        mean = 0.0
    if not np.isfinite(std) or std == 0:
        std = 1.0

    X_train[c] = (X_train[c]-mean)/(std)
    X_test[c] = (X_test[c]-mean)/(std)

    # Missing values take the smallest standardized training value, except
    # for 'of_pose_distance' columns, which take the largest.
    # NOTE(review): presumably a missing pose is treated as "far away" --
    # confirm the intent behind the max-fill.
    if 'of_pose_distance' not in c:
        fill_val = np.nanmin(X_train[c])
    else:
        fill_val = np.nanmax(X_train[c])

    # An all-NaN training column has no finite extreme; use 0 so that no NaN
    # reaches the model.
    if not np.isfinite(fill_val):
        fill_val = 0.0

    X_train[c] = X_train[c].fillna(fill_val)
    X_test[c] = X_test[c].fillna(fill_val)

# 6 -- Model!

In [18]:
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.utils import np_utils

Using TensorFlow backend.


In [19]:
# Training hyper-parameters
num_epochs = 5
size_batch = 96

# One dense unit followed by a sigmoid activation, taking one input per
# feature column of X_train.
nn_model = Sequential([
    Dense(units=1, input_dim=X_train.shape[1]),
    Activation('sigmoid'),
])

nn_model.summary()

nn_model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

nn_model.fit(X_train, y_train, batch_size=size_batch, epochs=num_epochs, verbose=1)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 64        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a48cd7588>

In [20]:
# Evaluate on the held-out data. The model was compiled with
# loss='binary_crossentropy' and metrics=['accuracy'], so the printed list is
# [loss, accuracy].
loss_and_metrics = nn_model.evaluate(X_test, y_test)
print(loss_and_metrics)

[0.3932184453870827, 0.8220476485076282]


In [21]:
# Sequential.predict_proba / predict_classes were deprecated and later
# removed from Keras; predict() plus an explicit threshold is the same
# computation and works on both old and new versions.

# Sigmoid output of the single dense unit: one probability per row,
# shape (n_samples, 1).
scores = nn_model.predict(X_test)
# Hard 0/1 labels: probability above 0.5 -> class 1.
pred = (scores > 0.5).astype('int32')

# 7 -- Evaluation

In [22]:
# y_test: actual y values for test data
# scores: predicted probabilistic y values for test data
# pred: predicted binary y values for test data

In [23]:
# roc_auc_score raises ValueError when y_test contains a single class (which
# can happen for scenario-filtered test sets). Catch only that error so that
# unrelated failures (e.g. a NameError or shape mismatch) are not hidden
# behind the "only 1 class" message.
try:
    print("AUC:",roc_auc_score(y_test, scores)) 
except ValueError:
    print("AUC undefinied, only 1 class in test data")
    
print("Accuracy:",accuracy_score(y_test, pred))
print(metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred))

AUC: 0.9014238316823009
Accuracy: 0.8220476485076282
[[145411  27778]
 [ 33202 136285]]
              precision    recall  f1-score   support

           0       0.81      0.84      0.83    173189
           1       0.83      0.80      0.82    169487

   micro avg       0.82      0.82      0.82    342676
   macro avg       0.82      0.82      0.82    342676
weighted avg       0.82      0.82      0.82    342676

