In [6]:
import pandas as pd
import numpy as np
import h5py as h5
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

In [112]:
#input data
def input_data(folder):
    cv = pd.read_csv(folder+'/' + folder + '_photos_calib_result.csv')
    cv['pupil'] = pd.to_numeric(cv['pupil'], errors='coerce')
    cv['ratio'] = pd.to_numeric(cv['ratio'], errors='coerce')
    cv['x'] = pd.to_numeric(cv['x'], errors='coerce')
    cv['y'] = pd.to_numeric(cv['y'], errors='coerce')
    cv['ts'] = pd.to_numeric(cv['ts'], errors='coerce')
    cv['ts'] = np.around(cv['ts'], decimals=0)
    #seperate by steps
    cv = cv[cv['pupil'] > 0] #remove zeros
    
    s30_cv = cv[cv['step'] == 30]
    s31_cv = cv[cv['step'] == 31]
    s32_cv = cv[cv['step'] == 32]
    
    return cv, s30_cv, s31_cv, s32_cv

def input_eve_data(folder, step, step_num):
    left_pupil_db = pd.DataFrame(np.array(h5.File(folder+'/' + step + '_webcam_c.h5')['left_p']['data']))
    right_pupil_db = pd.DataFrame(np.array(h5.File(folder+'/' + step + '_webcam_c.h5')['right_p']['data']))
    px_2_mm = pd.DataFrame(np.array(h5.File(folder+'/' + step + '_webcam_c.h5')['pixels_per_millimeter']))
    gaze_right = pd.DataFrame(np.array(h5.File(folder+'/' + step + '_webcam_c.h5')['right_o']['data']))
    gaze_left = pd.DataFrame(np.array(h5.File(folder+'/' + step + '_webcam_c.h5')['left_o']['data']))
    gaze_left = gaze_left.rename(columns={0: 'x', 1: 'y', 2: 'z'})
    gaze_right = gaze_right.rename(columns={0: 'x', 1: 'y', 2: 'z'})
    scale = pd.DataFrame(np.array(h5.File(folder+'/' + step + '_webcam_c.h5')['pixels_per_millimeter']))
    scale = scale[0][0]
    
    #normalise input data
    #eve db
    left_pupil_db *= scale
    right_pupil_db *= scale
    
    #insert the timestamp information for the eve data
    ts_eve = attach_ts(step_num, folder)
    left_pupil_db['ts'] = ts_eve
    
    # integrate into one dataframe with the coordinate
    left_pupil_db['x'] = gaze_left['x']
    left_pupil_db['y'] = gaze_left['y']

    right_pupil_db['ts'] = ts_eve
    right_pupil_db['x'] = gaze_right['x']
    right_pupil_db['y'] = gaze_right['y']

    left_pupil_db = left_pupil_db.rename(columns={0: 'pupil'})
    right_pupil_db = right_pupil_db.rename(columns={0: 'pupil'})
    left_pupil_db = left_pupil_db[left_pupil_db['pupil'] != 0]
    right_pupil_db = right_pupil_db[right_pupil_db['pupil'] != 0]
    
    return left_pupil_db, right_pupil_db, ts_eve

def attach_ts(stepnum, folder):
    f = open(folder+'/step_' + stepnum +'_webcam_c.timestamps.txt','r')
    ts = []
    for line in f.readlines():
        ts.append(float(line[:-1]))

    start = ts[0]
    for i in range(len(ts)):
        ts[i] -= start
        ts[i] /= 1000000 #from ms to s
        ts[i] = np.around(ts[i], decimals=0)
    return ts


In [113]:
from statsmodels.stats.stattools import medcouple
import math

def get_quartiles(arr):
    arr = [i for i in arr if not np.isnan(i)]
    arr = np.sort(arr)
    mid = int(len(arr)/2)
    if(len(arr)%2 == 0):
        Q1 = np.median(arr[:mid])
        Q3 = np.median(arr[mid:])
    else:
        Q1 = np.median(arr[:mid])
        Q3 = np.median(arr[mid:])
    return Q1,Q3

def adjust_boxplot(values, param, bias=1.5):
    mc = np.around(medcouple(values), decimals=1)
    
    if mc == 0: #tukeys method
        q1,q3 = get_quartiles(values)
        iqr = q3 - q1
        lowerLimit = np.around(q1 - bias*iqr, decimals=2)
        upperLimit = np.around(q3 + bias*iqr, decimals=2)
    else:
        q1,q3 = get_quartiles(values)
        iqr = q3 - q1
        if mc > 0:
            lowerLimit = np.around(q1 - bias*math.exp(-3.5*mc)*iqr, decimals=2)
            upperLimit = np.around(q3 + bias*math.exp(4*mc)*iqr, decimals=2)
        else:
            lowerLimit = np.around(q1 - bias*math.exp(-4*mc)*iqr, decimals=2)
            upperLimit = np.around(q3 + bias*math.exp(3.5*mc)*iqr, decimals=2)
        
    result = []
    for v in values:
        if v < lowerLimit or v > upperLimit: 
            result.append(0)
        else:
            result.append(v)
    
    remove = np.around(1-(len([i for i in result if i != 0])/len(values)), decimals=2)

    return result

def remove_outliers(df, param, bias=1.5): 
    ad_value = adjust_boxplot(list(df[param]), param, bias)
    df[param] = ad_value
    df = df[df[param] != 0]
    
    return df


In [114]:
#merge function for cv db
def merge_both_side(ts, df_left, df_right):
    data = []
    
    for t in ts:
        left_series = df_left[df_left['ts'] == t]
        right_series = df_right[df_right['ts'] == t]
        
        if len(left_series) != 0:
            left = list(left_series['pupil'])[0]
            y_left = list(left_series['y'])[0]
            x_left = list(left_series['x'])[0]
        else:
            left, y_left, x_left = np.nan, np.nan, np.nan
        
        if len(right_series) != 0:
            right = list(right_series['pupil'])[0]
            y_right = list(right_series['y'])[0]
            x_right = list(right_series['x'])[0]
        else:
            right, y_right, x_right = np.nan, np.nan, np.nan
        
        if not np.isnan(left) and not np.isnan(right):
            data.append([t, np.mean([left, right])])
        elif not np.isnan(right):
            data.append([t, right])
        elif not np.isnan(left):
            data.append([t, left])
        else:
            data.append([t, np.nan])
        
    df = pd.DataFrame(data, columns=['ts', 'pupil'])
    df['pcps'] = df['pupil'].pct_change()
    df = df[df['pupil'] != np.nan]
    return df


In [115]:
### normalise into 1sec
def nor_1000(df, sec=1):
    video_length = np.around(list(df['ts'])[-1]/1000, decimals=0)

    times = np.arange(0, video_length*1000, (1000*sec)) #interval
    start = 0
    data = []
    for t in times[1:]:
        rows = df[ (df['ts'] >= start) & (df['ts'] <= t)]
        values = sorted([i for i in rows['pupil'] if i != 0])
        data.append([start, np.around(np.nanmean(values), decimals=3)])
        start = t
    
    return pd.DataFrame(data, columns=['ts', 'pupil'])

def preprocess(step_cv_df, folder, step, step_num):
    ts_cv = []
    for t in list(step_cv_df['ts']): #the whole ts list before removing any values
        if t not in ts_cv:
            ts_cv.append(t)
    
    #input eve data
    left_pupil_eve, right_pupil_eve, ts_eve = input_eve_data(folder, step, step_num)
    
    #clean cv data
    before_size = len(list(step_cv_df['pupil']))
    step_cv_df = remove_outliers(step_cv_df, 'pupil')
    step_cv_df = remove_outliers(step_cv_df, 'y', 3)
    step_cv_df = remove_outliers(step_cv_df, 'x', 3)
    
    print('** after clean cv data, remove ratio', 1-(len(list(step_cv_df['pupil']))/before_size), '\n')

    changes_cv_left = step_cv_df[step_cv_df['side'] == 'L'] #select individual side
    changes_cv_right = step_cv_df[step_cv_df['side'] == 'R']
    
    #clean eve data
    left_pupil_eve = remove_outliers(left_pupil_eve, 'pupil')
    right_pupil_eve = remove_outliers(right_pupil_eve, 'pupil')
    
    cv = merge_both_side(ts_cv, changes_cv_left, changes_cv_right)
    eve = merge_both_side(ts_eve, left_pupil_eve, right_pupil_eve) 
    
    video_length = np.around(list(eve['ts'])[-1]/1000, decimals=0)
    freq_eve = np.around(len(eve['pupil'])/video_length, decimals=0)
    freq_cv = np.around(len(cv['pupil'])/video_length, decimals=0)
    print('eve freq = ', freq_eve, ' | cv freq = ', freq_cv)
    return cv, eve

In [116]:
from scipy.stats import pearsonr      

def windowed_corr(df, window=3, overlap=0):
    #overall pearson correlation    
    df = df.dropna(subset=['pupil'])
        
    r, p = pearsonr(list(df['ts']), list(df['pupil']))
    print('overall pearson correlation (r, p):', np.around(r, decimals=3), np.around(p, decimals=3))
    
    #windowed pearson correlation
    print('\t-- with window --')
    length = list(df['ts'])[-1]

    idx = np.arange(0, length, (window-overlap)*1000)
    positive, negative, total = 0, 0, 0
    
    start = 0
    for t in idx[1:]:
        rows = df[(df['ts'] >= start) & (df['ts'] < t)]

        if len(rows) >= 2: 
            corr = pearsonr(list(rows['ts']), list(rows['pupil']))
        else:
            corr = [0,0]
        print('\t\tcorrelation (r, p) at window ', window, ':', np.around(corr, decimals=3))
        total += 1
        
        if corr[0] > 0.5:
            positive += 1
        if corr[0] < -0.5:
            negative += 1
        start = t
    
    print('position cor, negative cor, total:', positive, negative, total)


In [117]:
def run(step_num, step_cv_df, folder):
    step_num = str(step_num)
    step = 'step_' + step_num
    save_file = folder+'/'+step
    suffix = 'video' 
    cv, eve = preprocess(step_cv_df, folder, step, step_num)
    
    ##################### 1 sec ########################
    #merge into 1 sec timeframe
    print('-- 1 SEC Timeframe --')
    cv_new = nor_1000(cv)
    eve_new = nor_1000(eve)
    
    print('[cv]')
    windowed_corr(cv_new)
    print('[eve]')
    windowed_corr(eve_new)
    
    ##################### 2 sec ########################
    print('-- 2 SEC Timeframe --')
    cv_new = nor_1000(cv, 2)
    eve_new = nor_1000(eve, 2)
    
    print('[cv]')
    windowed_corr(cv_new)
    print('[eve]')
    windowed_corr(eve_new)

In [118]:
cv_36, s30_cv36, s31_cv36, s32_cv36 = input_data('train36')
run('30', s30_cv36, 'train36')
run('31', s31_cv36, 'train36')
run('32', s32_cv36, 'train36')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


** after clean cv data, remove ratio 0.04397705544933084 

eve freq =  30.0  | cv freq =  27.0
-- 1 SEC Timeframe --
[cv]
overall pearson correlation (r, p): 0.28 0.465
	-- with window --
		correlation (r, p) at window  3 : [-0.974  0.147]
		correlation (r, p) at window  3 : [0.213 0.864]
position cor, negative cor, total: 0 1 2
[eve]
overall pearson correlation (r, p): 0.147 0.706
	-- with window --
		correlation (r, p) at window  3 : [-0.978  0.135]
		correlation (r, p) at window  3 : [0.942 0.218]
position cor, negative cor, total: 1 1 2
-- 2 SEC Timeframe --
[cv]
overall pearson correlation (r, p): 0.39 0.61
	-- with window --
		correlation (r, p) at window  3 : [-1.  1.]
position cor, negative cor, total: 0 1 1
[eve]
overall pearson correlation (r, p): 0.041 0.959
	-- with window --
		correlation (r, p) at window  3 : [-1.  1.]
position cor, negative cor, total: 0 1 1
** after clean cv data, remove ratio 0.11759082217973227 

eve freq =  30.0  | cv freq =  17.0
-- 1 SEC Timeframe 

  # This is added back by InteractiveShellApp.init_path()


[cv]
overall pearson correlation (r, p): -0.074 0.615
	-- with window --
		correlation (r, p) at window  3 : [0.937 0.228]
		correlation (r, p) at window  3 : [0.886 0.307]
		correlation (r, p) at window  3 : [0.676 0.527]
		correlation (r, p) at window  3 : [-0.999  0.034]
		correlation (r, p) at window  3 : [-0.474  0.685]
		correlation (r, p) at window  3 : [0.519 0.653]
		correlation (r, p) at window  3 : [-0.856  0.346]
		correlation (r, p) at window  3 : [0.983 0.119]
		correlation (r, p) at window  3 : [-0.901  0.286]
		correlation (r, p) at window  3 : [-1.  1.]
		correlation (r, p) at window  3 : [0.464 0.693]
		correlation (r, p) at window  3 : [-0.255  0.836]
		correlation (r, p) at window  3 : [0.4   0.738]
		correlation (r, p) at window  3 : [-0.033  0.979]
		correlation (r, p) at window  3 : [-0.958  0.186]
position cor, negative cor, total: 5 5 15
[eve]
overall pearson correlation (r, p): 0.115 0.432
	-- with window --
		correlation (r, p) at window  3 : [0.772 0.438]
		

In [119]:
cv_38, s30_cv38, s31_cv38, s32_cv38 = input_data('train38')
run('30', s30_cv38, 'train38')
run('31', s31_cv38, 'train38')
run('32', s32_cv38, 'train38')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


** after clean cv data, remove ratio 0.15552855407047383 

eve freq =  30.0  | cv freq =  21.0
-- 1 SEC Timeframe --
[cv]
overall pearson correlation (r, p): -0.255 0.183
	-- with window --
		correlation (r, p) at window  3 : [0.649 0.55 ]
		correlation (r, p) at window  3 : [0.584 0.603]
		correlation (r, p) at window  3 : [0.175 0.888]
		correlation (r, p) at window  3 : [-0.681  0.523]
		correlation (r, p) at window  3 : [0.92  0.256]
		correlation (r, p) at window  3 : [-0.966  0.166]
		correlation (r, p) at window  3 : [1.    0.002]
		correlation (r, p) at window  3 : [0.721 0.487]
		correlation (r, p) at window  3 : [-0.831  0.376]
position cor, negative cor, total: 5 3 9
[eve]
overall pearson correlation (r, p): -0.028 0.885
	-- with window --
		correlation (r, p) at window  3 : [0.94  0.222]
		correlation (r, p) at window  3 : [-0.532  0.643]
		correlation (r, p) at window  3 : [0.976 0.139]
		correlation (r, p) at window  3 : [-0.667  0.535]
		correlation (r, p) at window  3 :

In [120]:
cv_1, s30_cv1, s31_cv1, s32_cv1 = input_data('train01')
run('30', s30_cv1, 'train01')
run('31', s31_cv1, 'train01')
run('32', s32_cv1, 'train01')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


** after clean cv data, remove ratio 0.09591836734693882 

eve freq =  30.0  | cv freq =  15.0
-- 1 SEC Timeframe --
[cv]
overall pearson correlation (r, p): -0.42 0.023
	-- with window --
		correlation (r, p) at window  3 : [0.996 0.057]
		correlation (r, p) at window  3 : [-0.187  0.88 ]
		correlation (r, p) at window  3 : [-0.692  0.513]
		correlation (r, p) at window  3 : [-0.971  0.153]
		correlation (r, p) at window  3 : [0.882 0.313]
		correlation (r, p) at window  3 : [-0.73   0.479]
		correlation (r, p) at window  3 : [-1.  0.]
		correlation (r, p) at window  3 : [0.404 0.736]
		correlation (r, p) at window  3 : [0.384 0.749]
position cor, negative cor, total: 2 4 9
[eve]
overall pearson correlation (r, p): -0.352 0.061
	-- with window --
		correlation (r, p) at window  3 : [-0.92   0.256]
		correlation (r, p) at window  3 : [0.591 0.597]
		correlation (r, p) at window  3 : [0.909 0.274]
		correlation (r, p) at window  3 : [-0.43   0.717]
		correlation (r, p) at window  3 : [-

		correlation (r, p) at window  3 : [-1.  1.]
position cor, negative cor, total: 11 8 37
** after clean cv data, remove ratio 0.043261231281197965 

eve freq =  30.0  | cv freq =  16.0
-- 1 SEC Timeframe --
[cv]
overall pearson correlation (r, p): 0.418 0.024
	-- with window --
		correlation (r, p) at window  3 : [0.745 0.465]
		correlation (r, p) at window  3 : [0.995 0.061]
		correlation (r, p) at window  3 : [-0.85   0.353]
		correlation (r, p) at window  3 : [1.    0.016]
		correlation (r, p) at window  3 : [-0.996  0.055]
		correlation (r, p) at window  3 : [-0.986  0.107]
		correlation (r, p) at window  3 : [-0.68   0.524]
		correlation (r, p) at window  3 : [-0.966  0.166]
		correlation (r, p) at window  3 : [0.121 0.922]
position cor, negative cor, total: 3 5 9
[eve]
overall pearson correlation (r, p): -0.201 0.304
	-- with window --
		correlation (r, p) at window  3 : [1. 1.]
		correlation (r, p) at window  3 : [0.931 0.238]
		correlation (r, p) at window  3 : [0.345 0.776]
		

  # This is added back by InteractiveShellApp.init_path()


 3 : [0.503 0.665]
		correlation (r, p) at window  3 : [0.953 0.197]
position cor, negative cor, total: 4 2 9
-- 2 SEC Timeframe --
[cv]
overall pearson correlation (r, p): 0.5 0.069
	-- with window --
		correlation (r, p) at window  3 : [1. 1.]
		correlation (r, p) at window  3 : [0 0]
		correlation (r, p) at window  3 : [-1.  1.]
		correlation (r, p) at window  3 : [0 0]
		correlation (r, p) at window  3 : [-1.  1.]
		correlation (r, p) at window  3 : [0 0]
		correlation (r, p) at window  3 : [-1.  1.]
		correlation (r, p) at window  3 : [0 0]
position cor, negative cor, total: 1 3 8
[eve]
overall pearson correlation (r, p): -0.114 0.698
	-- with window --
		correlation (r, p) at window  3 : [1. 1.]
		correlation (r, p) at window  3 : [0 0]
		correlation (r, p) at window  3 : [1. 1.]
		correlation (r, p) at window  3 : [0 0]
		correlation (r, p) at window  3 : [1. 1.]
		correlation (r, p) at window  3 : [0 0]
		correlation (r, p) at window  3 : [-1.  1.]
		correlation (r, p) at windo

In [121]:
cv_2, s30_cv2, s31_cv2, s32_cv2  = input_data('train02')
run('30', s30_cv2, 'train02')
run('31', s31_cv2, 'train02')
run('32', s32_cv2, 'train02')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


** after clean cv data, remove ratio 0.04347826086956519 

eve freq =  30.0  | cv freq =  6.0
-- 1 SEC Timeframe --
[cv]


  # This is added back by InteractiveShellApp.init_path()


overall pearson correlation (r, p): 0.323 0.1
	-- with window --
		correlation (r, p) at window  3 : [0.912 0.269]
		correlation (r, p) at window  3 : [0.148 0.905]
		correlation (r, p) at window  3 : [-0.995  0.061]
		correlation (r, p) at window  3 : [0.957 0.187]
		correlation (r, p) at window  3 : [0.766 0.445]
		correlation (r, p) at window  3 : [-0.423  0.722]
		correlation (r, p) at window  3 : [-1.  1.]
		correlation (r, p) at window  3 : [1. 1.]
		correlation (r, p) at window  3 : [0.992 0.078]
position cor, negative cor, total: 5 2 9
[eve]
overall pearson correlation (r, p): -0.704 0.0
	-- with window --
		correlation (r, p) at window  3 : [-0.559  0.623]
		correlation (r, p) at window  3 : [-0.977  0.137]
		correlation (r, p) at window  3 : [0.948 0.206]
		correlation (r, p) at window  3 : [-0.431  0.717]
		correlation (r, p) at window  3 : [-0.628  0.568]
		correlation (r, p) at window  3 : [-0.981  0.126]
		correlation (r, p) at window  3 : [-0.043  0.973]
		correlation (r