In [65]:
# Smoothing, Windowing, Data Cleanup

In [66]:
# Imports

import pandas as pd
import numpy as np


In [67]:
# Read the input CSV (p8_FS.csv) into a DataFrame.
# NOTE(review): the path is hard-coded and relative to the notebook's working
# directory; it has to be adjusted by hand to process a different file.
file = '../../../../Google Drive File Stream/My Drive/USC Expeditions Year 5/Analysis/Help-Seeking/Data/FS/p8_FS.csv'
data = pd.read_csv(file)

In [68]:
# Move session_num, timestamp and engagement to the first three columns;
# all other columns keep their original relative order.
remaining = data.columns.tolist()
for leading_col in ('timestamp', 'session_num', 'engagement'):
    # list.remove raises ValueError if an expected column is missing
    remaining.remove(leading_col)
cols = ['session_num', 'timestamp', 'engagement'] + remaining
data = data[cols]

In [69]:
# Order rows by session, then by timestamp within each session (both ascending).
# The index is renumbered 0..n-1 afterwards: the clean-up cells further down
# address rows through .loc with integer label arithmetic (e.g. `start - 1`),
# which only points at the neighbouring row when labels equal sorted positions.
data = (
    data
    .sort_values(['session_num', 'timestamp'], ascending=[True, True])
    .reset_index(drop=True)
)

In [70]:
# Row count before any clean-up; used below as the denominator when reporting
# the fraction of rows removed.
orig_len = len(data)

In [71]:
# Preview the first rows after the column reordering and sorting.
data.head()

Unnamed: 0,session_num,timestamp,engagement,of_confidence,of_success,of_gaze_0_x,of_gaze_0_y,of_gaze_0_z,of_gaze_1_x,of_gaze_1_y,...,aptitude,diff_1,diff_2,diff_3,diff_4,diff_5,skill_NC,skill_OS,skill_EM,no_game
0,1.0,34.866667,1.0,0.88,1.0,0.827797,0.084295,-0.554659,-0.237588,0.272544,...,0.7,0,1,0,0,0,1,0,0,0
1,1.0,34.9,1.0,0.88,1.0,0.85481,0.053164,-0.51621,-0.249434,0.246413,...,0.7,0,1,0,0,0,1,0,0,0
2,1.0,34.933333,1.0,0.88,1.0,0.864059,0.051314,-0.500768,-0.260329,0.271667,...,0.7,0,1,0,0,0,1,0,0,0
3,1.0,34.966667,1.0,0.88,1.0,0.857982,0.04545,-0.511665,-0.249022,0.249095,...,0.7,0,1,0,0,0,1,0,0,0
4,1.0,35.0,1.0,0.88,1.0,0.867188,0.053178,-0.495133,-0.290433,0.270397,...,0.7,0,1,0,0,0,1,0,0,0


In [72]:
# List every column name in its current order.
data.columns

Index(['session_num', 'timestamp', 'engagement', 'of_confidence', 'of_success',
       'of_gaze_0_x', 'of_gaze_0_y', 'of_gaze_0_z', 'of_gaze_1_x',
       'of_gaze_1_y', 'of_gaze_1_z', 'of_gaze_angle_x', 'of_gaze_angle_y',
       'of_gaze_distance', 'of_pose_Tx', 'of_pose_Ty', 'of_pose_Tz',
       'of_pose_Rx', 'of_pose_Ry', 'of_pose_Rz', 'of_pose_distance',
       'of_AU01_c', 'of_AU02_c', 'of_AU04_c', 'of_AU05_c', 'of_AU06_c',
       'of_AU07_c', 'of_AU09_c', 'of_AU10_c', 'of_AU12_c', 'of_AU14_c',
       'of_AU15_c', 'of_AU17_c', 'of_AU20_c', 'of_AU23_c', 'of_AU25_c',
       'of_AU26_c', 'of_AU28_c', 'of_AU45_c', 'op_Number of People',
       'participant', 'games_total', 'games_session', 'mistakes_total',
       'mistakes_session', 'mistakes_game', 'ts_robot_talked', 'ts_game_start',
       'ts_attempt', 'aptitude', 'diff_1', 'diff_2', 'diff_3', 'diff_4',
       'diff_5', 'skill_NC', 'skill_OS', 'skill_EM', 'no_game'],
      dtype='object')

# 1. Remove Beginning of Videos/Sessions

In [73]:
# Remove data @ beginning of each session before first game has been played
#
# This cell only locates the rows; the actual removal happens in a later cell.
# Each window is a tuple (session_start, first_game_start) of index labels:
#   session_start    - label of the first row of the session
#   first_game_start - label of the first row of that session with no_game == 0
# Equal entries mean the session opens with no_game == 0 (nothing to drop).
# A session that never reaches no_game == 0 produces no window at all.

last_session = -1
session_start = -1
track = False  # True while still searching for the current session's first game

# tuples: (session_start, first_game_start)
windows_to_remove = []

# Iterate over just the index and the two columns that are read.
# DataFrame.iterrows() would build a full Series for every row, which is
# needlessly slow here; the loop logic and its result are unchanged.
for i, sess, no_game in zip(data.index, data['session_num'], data['no_game']):
    # Check if new session started
    if last_session != sess:
        session_start = i
        last_session = sess
        track = True

    if track and no_game == 0:
        # found the first game start of this session
        windows_to_remove.append((session_start, i))
        track = False

In [74]:
# Show one (session_start, first_game_start) window per line.
for i in windows_to_remove:
    print(i)

(0, 0)
(29001, 29421)
(87184, 87814)
(105104, 105404)
(114446, 115736)
(134945, 135335)
(153798, 154308)
(175732, 176542)
(177969, 178449)


In [75]:
# Remove data at beginning
# For every (session_start, first_game_start) window, drop the rows from
# session_start up to, but not including, first_game_start.
# The windows hold index labels and the index is only reset after the loop,
# so the labels stay valid from one iteration to the next.
# Iterate backwards!

for rem in reversed(windows_to_remove):
    if rem[0] == rem[1]:
        # session starts with a game already running: nothing to drop
        continue

    # .loc slices by label and includes both end points
    seg1 = data.loc[:rem[0]-1]
    seg2 = data.loc[rem[1]:, :]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # sort=True is carried over unchanged from the original append call.
    data = pd.concat([seg1, seg2], sort=True)

# Renumber rows 0..n-1 now that the gaps have been cut out
data = data.reset_index(drop=True)

In [76]:
# Fraction of the original rows removed so far.
curr_len = len(data)
rows_removed = orig_len - curr_len
print(rows_removed / orig_len)

0.019898242939831504


# 2. Remove Ending of Videos/Sessions

In [77]:
# Remove data @ end of each session after last game has been played
#
# This cell only locates the rows; the actual removal happens in a later cell.
# The rows are scanned from last to first: the first row seen for a session is
# that session's final row, and the first row seen with no_game == 0 is the
# last row of its last game.
# Row positions double as index labels here, so `data` must carry the default
# 0..n-1 index (the preceding removal cell ends with reset_index(drop=True)).

# Iterate Backwards!

n = len(data)

last_session = -1
session_end = -1
track = False  # True while still searching for the current session's last game

# Extract the two columns once; a data.loc[i, col] lookup on every row is far
# slower than indexing a plain list and yields the same values.
session_col = data['session_num'].tolist()
no_game_col = data['no_game'].tolist()

# tuples: (last_game_end, session_end)
windows_to_remove = []

for i in range(n-1, -1, -1):
    sess = session_col[i]
    if last_session != sess:
        session_end = i
        last_session = sess
        track = True

    if track and no_game_col[i] == 0:
        # found the end of the last game of this session
        windows_to_remove.append((i, session_end))
        track = False

In [78]:
# Display the (last_game_end, session_end) windows, last session first.
windows_to_remove

[(228924, 237904),
 (173618, 173618),
 (171078, 172191),
 (146045, 150767),
 (124286, 132304),
 (110384, 113095),
 (102394, 104053),
 (84201, 86763),
 (24994, 29000)]

In [79]:
# Remove data at ending
# For every (last_game_end, session_end) window, keep everything up to and
# including last_game_end and everything after session_end.
# windows to remove has end sessions first, which is what we want to remove first

for rem in windows_to_remove:
    if rem[0] == rem[1]:
        # session ends on a game row: nothing to drop
        continue

    # .loc slices by label and includes both end points
    seg1 = data.loc[:rem[0]]
    seg2 = data.loc[rem[1]+1:, :]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # sort=True is carried over unchanged from the original append call.
    data = pd.concat([seg1, seg2], sort=True)

# Renumber rows 0..n-1 now that the gaps have been cut out
data = data.reset_index(drop=True)

In [80]:
# Fraction of the original rows removed so far.
curr_len = len(data)
rows_removed = orig_len - curr_len
print(rows_removed / orig_len)

0.15902527447628073


In [81]:
# Fraction of the original rows still present.
curr_len/orig_len

0.8409747255237193

# 3. Smoothing