In [1]:
# Template Notebook for Engagement Classifcation Model

# ToDo/Models to run here: https://docs.google.com/document/d/18EJpFesMEl9Q7C1AZzDeq6uy7c8tDMvEJ_j58-MmiBI/edit?usp=sharing
# Use p8_data_processed.csv here: https://drive.google.com/drive/folders/19aJUAlkTMz7PcZE1q4hogFkjVtwYGcMT
# Upload code to help-seeking github: https://github.com/interaction-lab/help_seeking
# Record model results here: https://docs.google.com/spreadsheets/d/16ye54fSSEuAuDL_j56UIeDB-rIIrxq_kPbtyPRQOrVI/edit?usp=sharing

In [76]:
# Imports

import pandas as pd
import numpy as np
from pandas import *
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.layers import Dense, Activation
from keras.models import Sequential

Using TensorFlow backend.


In [3]:
# Load Data
# Warning: this will probably take some time
# Adjust file path based on your computer

file = '../../../Google Drive File Stream/My Drive/USC Expeditions Year 5/Analysis/Help-Seeking/Data/p8_data_processed.csv'
data = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
data.head()

Unnamed: 0,participant,session_num,session_date,timestamp,engagement,activity,skill,difficulty,aptitude,games_total,...,op_p2_pose_y9,skill_NC,skill_OS,skill_EM,diff_1,diff_2,diff_3,diff_4,diff_5,no_game
0,8.0,1.0,2018-06-07,0.0,1.0,,,,0.5,0.0,...,0.0,0,0,0,0,0,0,0,0,1
1,8.0,1.0,2018-06-07,0.033333,1.0,,,,0.5,0.0,...,0.0,0,0,0,0,0,0,0,0,1
2,8.0,1.0,2018-06-07,0.066667,1.0,,,,0.5,0.0,...,0.0,0,0,0,0,0,0,0,0,1
3,8.0,1.0,2018-06-07,0.1,1.0,,,,0.5,0.0,...,0.0,0,0,0,0,0,0,0,0,1
4,8.0,1.0,2018-06-07,0.133333,1.0,,,,0.5,0.0,...,0.0,0,0,0,0,0,0,0,0,1


In [5]:
# Drop Irrelevant Columns

# transcriptions
data = data.drop(columns=['transcript_spk_0', 'transcript_spk_1', 'transcript_spk_2'])

# raw ros messages
data = data.drop(columns=['ros_PARTICIPANT_STATE', 'ros_ROBOT_STATE'])

# more for counting purposes
data = data.drop(columns=['game_start', 'game_correct', 'game_incorrect', 'mistake_made'])

data = data.drop(columns=['participant', 'session_date'])

# use one-hot encoded difficulty and skill, don't use activity for now
data = data.drop(columns=['difficulty', 'skill', 'activity'])

In [6]:
# What Non Open-Face/Open-Pose Columns? 
# Feature Dictionary Documentation here: https://docs.google.com/document/d/1RSygoLwsM1PKEIOoqDOaEo3lRBZBLZJuzbMSY5K_6zA/edit?usp=sharing

only_ofop = []
for i in data.columns:
    # open face/open pose columns begin with of_ or op_
    if 'op_' not in i and 'of_' not in i:
        print(i)
    else:
        only_ofop.append(i)

session_num
timestamp
engagement
aptitude
games_total
games_session
mistakes_total
mistakes_session
mistakes_game
ts_robot_talked
ts_game_start
ts_attempt
skill_NC
skill_OS
skill_EM
diff_1
diff_2
diff_3
diff_4
diff_5
no_game


In [7]:
# Label Analysis

print(data['engagement'].value_counts())
print()

print(1,data['engagement'].value_counts()[1]/sum(data['engagement'].value_counts()))
print(0,data['engagement'].value_counts()[0]/sum(data['engagement'].value_counts()))

1.0    139463
0.0    120473
Name: engagement, dtype: int64

1 0.5365282223316509
0 0.4634717776683491


### Decision 1: What features to use?

In [8]:
# TODO: Drop Some OpenFace/OpenPose Columns 

# Run a model with ONLY open face/ open pose columns
#only_ofop.append('timestamp')
#only_ofop.append('session_num')
#only_ofop.append('engagement')
#data = data[only_ofop]

# Run a model with only critical open face/open pose columns as determined by Bala's feature analysis

hand_picked = ['confidence', 'success', 'gaze_0_x', 'gaze_0_y', 'gaze_0_z', 'gaze_1_x', 'gaze_1_y', 'gaze_1_z', 'gaze_angle_x', 'gaze_angle_y']
for i,c in enumerate(hand_picked):
    hand_picked[i] = 'of_' + c
hand_picked.append('op_Number of People')

hand_picked.append('timestamp')
hand_picked.append('session_num')
hand_picked.append('engagement')
data = data[hand_picked]

### Decision 2: Train Test Split?

In [9]:
# (Optional) Sort Data Chronologically
# Important for Sequential Models! 

# data = data.sort_values(['session_num', 'timestamp'], ascending=[True, True])

# shuffle data
data = data.sample(frac=1).reset_index(drop=True)

In [10]:
# Make sure engagement is first column

cols = list(data)
cols.insert(0, cols.pop(cols.index('engagement')))
data = data[cols]

In [11]:
# Train Test Split 2: Train on Earlier Sessions, Test on Later Sessions

session_threshold = 8 # train on sessions <= 8, test on sessions > 8

train = data[(data['session_num'] <= 8.0)] 
test = data[(data['session_num'] > 8)]

# drop columns needed for split
train = train.drop(columns=['session_num', 'timestamp'])
test = test.drop(columns=['session_num', 'timestamp'])

X_train2 = train.iloc[:,1:]
y_train2 = train.iloc[:, 0]

X_test2 = test.iloc[:, 1:]
y_test2 = test.iloc[:, 0]

In [12]:
# Train Test Split 1: Random 70-30 Split

# don't need these columns anymore
data = data.drop(columns=['session_num', 'timestamp'])

# Note: random_state: make sure we get same split every time 
X_train1, X_test1, y_train1, y_test1 = train_test_split(data.iloc[:,1:], data.iloc[:,0], test_size=0.3, random_state=42)

In [None]:
 ### Model Metrics

In [77]:
def precision0(y_true, y_pred):
    data_tuples = list(zip(y_true, y_pred))
    df = pd.DataFrame(data_tuples, columns=['true', 'pred'])
    df = df.round(0)
    
    true_pos = len(df[(df['true'] <= 0) & (df['pred'] <= 0)])
    pred_pos = len(df[(df['pred'] <= 0)])
    return float(true_pos) / float(pred_pos)

def recall0(y_true, y_pred):
    data_tuples = list(zip(y_true, y_pred))
    df = pd.DataFrame(data_tuples, columns=['true', 'pred'])
    df = df.round(0)

    true_pos = len(df[(df['true'] <= 0) & (df['pred'] <= 0)])
    real_pos = len(df[(df['true'] <= 0)])
    return float(true_pos) / float(real_pos)

def f1_0(y_true, y_pred):
    p = precision0(y_true, y_pred)
    r = precision0(y_true, y_pred)

    return (2*((p*r)/(p+r)))

In [78]:
def precision1(y_true, y_pred):
    data_tuples = list(zip(y_true, y_pred))
    df = pd.DataFrame(data_tuples, columns=['true', 'pred'])
    df = df.round(0)

    true_pos = len(df[(df['true'] >= 1) & (df['pred'] >= 1)])
    pred_pos = len(df[(df['pred'] >= 1)])
    return float(true_pos) / float(pred_pos)

def recall1(y_true, y_pred):
    data_tuples = list(zip(y_true, y_pred))
    df = pd.DataFrame(data_tuples, columns=['true', 'pred'])
    df = df.round(0)

    true_pos = len(df[(df['true'] >= 1) & (df['pred'] >= 1)])
    real_pos = len(df[(df['true'] >= 1)])
    return float(true_pos) / float(real_pos)

def f1_1(y_true, y_pred):
    p = precision1(y_true, y_pred)
    r = precision1(y_true, y_pred)
    
    return (2*((p*r)/(p+r)))

In [86]:
def auc_roc(y_true, y_pred):
    print(y_true)
    return roc_auc_score(y_true, y_pred)

### Model: NN, Random 70-30 Split, hand_picked features

In [13]:
X_train1 = X_train1.values
X_test1 = X_test1.values
y_train1 = y_train1.values
y_test1 = y_test1.values

In [82]:
print(X_train1.shape)
print(X_test1.shape)

(181955, 11)
(77981, 11)


In [84]:
# Initialize simple neural network model
model = Sequential()

model.add(Dense(units=32, input_dim=11)) # Dense = fully-connected
model.add(Activation('relu')) # Relu activation (avoid unstable gradient problem)

model.add(Dense(units=64))
model.add(Activation('relu'))

model.add(Dense(units=32))
model.add(Activation('relu'))

model.add(Dense(units=1))
model.add(Activation('sigmoid'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 32)                384       
_________________________________________________________________
activation_5 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                2112      
_________________________________________________________________
activation_6 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
activation_7 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 33        
__________

In [91]:
model.compile(loss='binary_crossentropy', # specify crossentropy cost function 
              optimizer='sgd', # stochastic gradient descent
              metrics=['accuracy'])

In [92]:
model.fit(X_train1, y_train1, epochs=20, batch_size=64, verbose=0, validation_split = .1)

<keras.callbacks.History at 0x1a2776ff60>

In [93]:
loss_and_metrics = model.evaluate(X_test1, y_test1, batch_size=128)
print(loss_and_metrics)

[0.4818522943521359, 0.7579795078250083]


In [109]:
def to_categorical(y, nb_classes):
    y = np.asarray(y, dtype='int32')
    if not nb_classes:
        nb_classes = np.max(y)+1
    Y = np.zeros((len(y), nb_classes))
    Y[np.arange(len(y)),y] = 1.
    return Y


In [110]:
y_train1 = to_categorical(y_train1, 2)
y_test1 = to_categorical(y_test1, 2)

In [112]:
# 2. Create Logistic Regression Model using TensorFlow
import tensorflow as tf
# Set up the TensorFlow variables (Add variables to TensorFlow's computational graph)

# this just helps with using tensorflow inside jupyter (reset/clear all tf variables)
tf.reset_default_graph()

# Input Parameter
x = tf.placeholder(tf.float32, [None, 11])

# Weights Variable (xavier initializer -- random values centered around zero)
W = tf.get_variable("W", shape=[11, 2], initializer = tf.contrib.layers.xavier_initializer())

# Biases variable: initialized to zero
b = tf.Variable(tf.zeros([2]))

# Define hypothesis fxn (y): represents probability of possible outputs given inputs
# Uses the softmax activation function: like sigmoid, but makes sure probabiltiies add to 1    
y = tf.nn.sigmoid(tf.matmul(x, W)+b)

# y_: actual labels
y_ = tf.placeholder(tf.float32, [None, 2])


In [113]:
def generate_batch(batch_size):
    indexes = np.random.randint(len(X_train1), size = (batch_size,))
    return X_train1[indexes], y_train1[indexes]


In [114]:
loss = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

In [115]:
train_step = tf.train.GradientDescentOptimizer(.01).minimize(loss) # ".01" specifies the learning rate
sess = tf.InteractiveSession() # create the sesion object
tf.global_variables_initializer().run() # initialize variables



In [116]:
for i in range(50000):
    # get the x training batch and y training batch
    batch_xs, batch_ys = generate_batch(500)
    
    # this evaluates the computational graph
    # passes batch_xs in for the x placeholder, batch_ys in for the y_ placeholder
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

In [117]:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [118]:
print(sess.run(accuracy, feed_dict={x: X_test1, y_: y_test1}))

0.65709597
