In [1]:
import os
import ast
import itertools
import glob

import numpy as np
import pandas as pd

import multiprocessing
import tqdm

import scipy
from scipy import interpolate, stats

### Retrieve all phone files and put them in two dicts: one for walking, one for sitting
### walking_dict & seated_dict

In [2]:
%%time
path = "./raw_data_phone/"
tasks = ["DRAG", "SCROLL", "QUESTION", "READ", "TAP", "WRITE"]
# Model identifier as it appears in the file name -> short phone label.
phones = {
    "GT-I8190": "S3",
    "GT-I9505": "S4",
    "A0001": "OPO",
    "Nexus 6": "N6",
}

### dict format: walking_dict = {P1..P21: {S3..N6: [File1..File7]}}; seated_dict has the same layout
file_list = []
walking_dict = {}
seated_dict = {}
for i in range(1, 22):
    # Every participant gets its own fresh list per phone label, in both dicts.
    walking_dict["P%d" % i] = {label: [] for label in phones.values()}
    seated_dict["P%d" % i] = {label: [] for label in phones.values()}

for root, subdirs, files in os.walk(path):
    for file in files:
        # Only "Study" files outside "questions" folders count; sensor logs are skipped.
        if "questions" in root or "Study" not in file or "Sensor" in file:
            continue
        # The participant id is the 4th "/"-separated component of the folder path,
        # the phone model the 2nd "_"-separated token of the file name.
        part = root.split("/")[3]
        phone_model = file.split("_")[1]
        phone = phones[phone_model]
        file_path = os.path.join(root, file)
        if "walking" in file:
            walking_dict[part][phone].append(file_path)
        elif "seated" in file:
            seated_dict[part][phone].append(file_path)

CPU times: user 1.31 ms, sys: 5.39 ms, total: 6.7 ms
Wall time: 5.53 ms


### return list containing all filepaths for participant in condition and phone

In [3]:
def getFilesList(pId, phone ,cond):
    """Return the list of file paths stored for one participant, phone and condition.

    Parameters
    ----------
    pId : str or int
        Participant number without the leading "P" (e.g. "3" or 3).
    phone : str
        Phone label used as key in the lookup dicts (e.g. "S3").
    cond : str
        "walking" reads from ``walking_dict``; every other value reads from
        ``seated_dict``.

    Returns
    -------
    list
        The list object stored in the lookup dict (not a copy).

    Raises
    ------
    KeyError
        If the participant or the phone label is not present in the lookup dict.
    """
    # str() allows the participant number to be passed as an int as well;
    # plain "P" + pId raised TypeError for ints.
    participant_key = "P" + str(pId)
    lookup = walking_dict if cond == "walking" else seated_dict
    return lookup[participant_key][phone]

### Return the first and the last value of the dataframe's `Timestamp` column

In [4]:
def getTimes(df):
    """Return the first and the last value of ``df.Timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Timestamp`` column.

    Returns
    -------
    tuple
        ``(firstTime, lastTime)``: the values in the first and the last row.

    Raises
    ------
    IndexError
        If the frame has no rows.
    """
    # Positional access: Timestamp[-1] is a *label* lookup and raises KeyError on
    # an integer index, and Timestamp[0] picks the wrong row once the index is
    # no longer 0..n-1 (e.g. after sorting or filtering).
    timestamps = df.Timestamp
    firstTime = timestamps.iloc[0]
    lastTime = timestamps.iloc[-1]
    return firstTime, lastTime

### Return the first element of a timestamp array (any non-array value is returned unchanged)

In [5]:
def getFirstTime(timestamp):
    """Return the first entry of a timestamp array.

    Anything whose type is not exactly ``numpy.ndarray`` (scalars, lists, ...)
    is handed back unchanged.
    """
    if type(timestamp) is not np.ndarray:
        return timestamp
    return timestamp[0]

### Return the last element of a timestamp array (any non-array value is returned unchanged)

In [6]:
def getLastTime(timestamp):
    """Return the last entry of a timestamp array.

    Anything whose type is not exactly ``numpy.ndarray`` (scalars, lists, ...)
    is handed back unchanged.
    """
    is_array = type(timestamp) is np.ndarray
    return timestamp[-1] if is_array else timestamp

### Return a list in which every timestamp array x is replaced by x + round(f(x)), cast to int

In [7]:
def applyChanges(f, np_arr):
    """Shift every element of ``np_arr`` by the rounded correction ``f(element)``.

    Parameters
    ----------
    f : callable
        Maps an element of ``np_arr`` to a correction of the same shape.
    np_arr : iterable
        Elements that support ``+`` and ``.astype`` (e.g. numpy arrays).

    Returns
    -------
    list
        One ``element + np.round(f(element))`` per input element, cast to int.
    """
    shifted = []
    for values in list(np_arr):
        shifted.append(values + np.round(f(values)))
    # The integer cast happens only after all corrections have been computed.
    return [item.astype(int) for item in shifted]

### return dataframe for S3 phone for given pid, cond containing interpolated timestamps

In [8]:
def syncS3Times(pid, c, df):
    """Shift the timestamps in ``df`` by an interpolated offset read from disk.

    The first ``*.txt`` file in ``./raw_data_phone/timestamp_adjusted/`` whose
    path contains both ``pid`` and ``c`` is used. Its first column is treated as
    the phone timestamp, its second column as the matching reference timestamp.
    The difference between the two is interpolated over the phone timestamps
    (``interp1d`` default interpolation) and added to every timestamp in ``df``.

    Parameters
    ----------
    pid : str
        Participant id; matched as a substring of the file path.
    c : str
        Condition; matched as a substring of the file path.
    df : pandas.DataFrame
        Frame with ``timestamp`` (array per row) and ``LastTime`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with a new ``interpol`` column and with ``FirstTime`` / ``LastTime``
        recomputed from it, or ``None`` when no offset file matches.
    """
    path = "./raw_data_phone/timestamp_adjusted/"
    all_files = glob.glob(os.path.join(path, "*.txt"))
    for file in all_files:
        # NOTE(review): substring match - "P1" would also match a file named for
        # "P10"; verify against the actual file naming scheme.
        if pid in file and c in file:
            # Same object as df: the columns added below also appear on the caller's frame.
            concatDf = df
            # File pairing phone timestamps with reference ("Motive") timestamps.
            matching_df = pd.read_csv(file, delimiter =",")
            # Name the first two columns.
            matching_df= matching_df.rename(index=str , columns = {matching_df.columns[0]:"Phonestamp",matching_df.columns[1]:"Motivestamp"})
            # Offset between the reference clock and the phone clock.
            matching_df["diff"] = matching_df["Motivestamp"] - matching_df["Phonestamp"]

            # Merge the offset table with the phone frame (Phonestamp x LastTime).
            # Only the Phonestamp and diff columns of the result are used below.
            merged_df = pd.merge_asof(matching_df, concatDf, left_on='Phonestamp',right_on='LastTime', direction = 'nearest')

            # X and Y axis for the 1D interpolation. Two border points repeat the
            # first and the last offset so that timestamps outside the measured
            # range (between 0 and 1600000000000) can still be evaluated.
            measured_df = pd.DataFrame(data={"interpolTime":merged_df["Phonestamp"],"diff": merged_df["diff"]})
            border_df = pd.DataFrame(data={
                "interpolTime":[0, 1600000000000],
                "diff":[merged_df["diff"].iloc[0], merged_df["diff"].iloc[-1]]})
            # pd.concat replaces the chained DataFrame.append calls, which no
            # longer exist in pandas 2.0.
            interpol_df = pd.concat([measured_df, border_df], ignore_index=True).sort_values(by=['interpolTime'])

            # Interpolate along the X (timestamps) and Y (diff) axes.
            f = interpolate.interp1d(interpol_df["interpolTime"], interpol_df ["diff"])
            # New column that holds the corrected timestamp arrays.
            concatDf["interpol"] = applyChanges(f, concatDf["timestamp"])
            concatDf['FirstTime'] = concatDf.apply(lambda row: getFirstTime(row['interpol']), axis=1) 
            concatDf['LastTime'] = concatDf.apply(lambda row: getLastTime(row['interpol']), axis=1)
            return concatDf
    # No offset file for this participant/condition.
    return None

### return three lists containing tasks, start and endtime for tasks for given dataframe

In [9]:
def getTaskIntervalNew(df):
    """Derive task labels with their start and end times from a time-ordered frame.

    ``df`` needs the columns ``Task``, ``FirstTime`` and ``LastTime``. The four
    pointing tasks (SCROLL_H, SCROLL_V, DRAG, TAP) are merged into the label
    "Fitts". A new task starts whenever the label differs from the label of the
    previous row; every QUESTION row closes the running task with the
    ``LastTime`` of the row before it.

    Returns
    -------
    tuple of three lists
        ``(task, task_begin, task_end)``.
    """
    frame = df.reset_index(drop=True)
    change = {name: "Fitts" for name in ("SCROLL_H", "SCROLL_V", "DRAG", "TAP")}
    change.update({"QUESTION": "QUESTION", "READ": "READ", "WRITE": "WRITE"})

    previous = change[frame.Task.loc[0]]
    task = [previous]
    task_begin = [frame.FirstTime.loc[0]]
    task_end = []
    for position, raw_name in enumerate(frame.Task):
        current = change[raw_name]
        if current == "QUESTION":
            # A question row marks the end of whatever ran before it.
            task_end.append(frame.LastTime.loc[position - 1])
        elif current != previous:
            task.append(current)
            task_begin.append(frame.FirstTime.loc[position])
        previous = current
    return task, task_begin, task_end

In [10]:
def getTaskInterval(df):
    """Return task labels, task start times and task end times for ``df``.

    ``df`` must provide ``Task``, ``FirstTime`` and ``LastTime`` columns and be
    ordered in time. SCROLL_H, SCROLL_V, DRAG and TAP all count as "Fitts".
    A label change opens a new task; a QUESTION row records the ``LastTime`` of
    the preceding row as the end of the running task.

    Returns
    -------
    tuple of three lists
        ``(task, task_begin, task_end)``.
    """
    df = df.reset_index(drop=True)
    change = dict(
        SCROLL_H="Fitts",
        SCROLL_V="Fitts",
        DRAG="Fitts",
        TAP="Fitts",
        QUESTION="QUESTION",
        READ="READ",
        WRITE="WRITE",
    )
    lastTask = change[df.Task[0]]
    task, task_begin, task_end = [lastTask], [df.FirstTime[0]], []
    for i, t in df.Task.items():
        currTask = change[t]
        is_question = currTask == "QUESTION"
        if is_question:
            task_end.append(df.LastTime[i-1])
        if not is_question and currTask != lastTask:
            task.append(currTask)
            task_begin.append(df.FirstTime[i])
        lastTask = currTask
    return task, task_begin, task_end

### Concatenate the phone files of a participant (pid) for the given condition (cond) and phone, ordered by time
### Return the task labels together with the start and end timestamps of the tasks

In [11]:
def concatDfsNew(pid, phone, cond):
    """Load all phone files of one participant/phone/condition and extract the task intervals.

    Every file is read with ``;`` as delimiter, reduced to its ``timestamp``
    column and tagged with the task taken from the file path. The rows of all
    files are concatenated, the ``timestamp`` strings are parsed into numpy
    arrays, and the frame is sorted by the last timestamp of each row.

    Parameters
    ----------
    pid : str
        Participant key of the lookup dicts, including the "P" prefix (e.g. "P3").
    phone : str
        Phone label (e.g. "S3").
    cond : str
        "walking" reads from ``walking_dict``; any other value reads from
        ``seated_dict``.

    Returns
    -------
    tuple
        Result of ``getTaskIntervalNew``: ``(task, task_begin, task_end)``.

    Raises
    ------
    KeyError
        If ``pid`` or ``phone`` is unknown.
    ValueError
        From ``pd.concat`` if no files are registered for this combination.
    """
    dfs = []
    if cond == "walking":
        file_list = walking_dict[pid][phone]
    else:
        file_list = seated_dict[pid][phone]
    for file in file_list:
        if "SCROLL_H" in file:
            task = "SCROLL_H"
        elif "SCROLL_V" in file:
            task = "SCROLL_V"
        else:
            # Task name = 4th "_" token of the 6th "/" path component, without
            # the extension (relies on the folder depth of the raw data).
            task = file.split("/")[5].split("_")[3].split(".")[0]
        df_temp = pd.read_csv(file, delimiter=";")
        # Keep only the timestamp column. The axis is passed by keyword because
        # the positional form drop(c, 1) raises TypeError on pandas 2.0.
        other_columns = [c for c in df_temp.columns if c != 'timestamp']
        df_temp = df_temp.drop(other_columns, axis=1)
        df_temp["Task"] = task
        dfs.append(df_temp)
    df = pd.concat(dfs,ignore_index=True)
    # Each cell holds the textual form of a list of timestamps.
    df.timestamp = df.timestamp.apply(lambda x: np.array(ast.literal_eval(x)))
    df['FirstTime'] = df.apply(lambda row: getFirstTime(row['timestamp']), axis=1) 
    df['LastTime'] = df.apply(lambda row: getLastTime(row['timestamp']), axis=1)
    df = df.sort_values(by=['LastTime'])
    return getTaskIntervalNew(df)

In [12]:
def concatDfs(pid, phone, cond):
    """Load all phone files of one participant/phone/condition and extract the task intervals.

    Works like ``concatDfsNew`` but additionally passes the frame through
    ``syncS3Times`` when ``phone`` contains "S3", so that the intervals of that
    phone are based on the corrected timestamps.

    Parameters
    ----------
    pid : str
        Participant key of the lookup dicts, including the "P" prefix (e.g. "P3").
    phone : str
        Phone label (e.g. "S3").
    cond : str
        "walking" reads from ``walking_dict``; any other value reads from
        ``seated_dict``.

    Returns
    -------
    tuple
        Result of ``getTaskInterval``: ``(task, task_begin, task_end)``.

    Raises
    ------
    KeyError
        If ``pid`` or ``phone`` is unknown.
    ValueError
        From ``pd.concat`` if no files are registered for this combination.
    """
    dfs = []
    if cond == "walking":
        file_list = walking_dict[pid][phone]
    else:
        file_list = seated_dict[pid][phone]
    for file in file_list:
        if "SCROLL_H" in file:
            task = "SCROLL_H"
        elif "SCROLL_V" in file:
            task = "SCROLL_V"
        else:
            # Task name = 4th "_" token of the 6th "/" path component, without
            # the extension (relies on the folder depth of the raw data).
            task = file.split("/")[5].split("_")[3].split(".")[0]
        df_temp = pd.read_csv(file, delimiter=";")
        # Keep only the timestamp column. The axis is passed by keyword because
        # the positional form drop(c, 1) raises TypeError on pandas 2.0.
        other_columns = [c for c in df_temp.columns if c != 'timestamp']
        df_temp = df_temp.drop(other_columns, axis=1)
        df_temp["Task"] = task
        dfs.append(df_temp)
    df = pd.concat(dfs,ignore_index=True)
    # Each cell holds the textual form of a list of timestamps.
    df.timestamp = df.timestamp.apply(lambda x: np.array(ast.literal_eval(x)))
    df['FirstTime'] = df.apply(lambda row: getFirstTime(row['timestamp']), axis=1) 
    df['LastTime'] = df.apply(lambda row: getLastTime(row['timestamp']), axis=1)
    df = df.sort_values(by=['LastTime'])
    if "S3" in phone:
        # NOTE(review): syncS3Times returns None when no offset file matches,
        # which makes getTaskInterval fail below.
        df = syncS3Times(pid, cond, df)
    return getTaskInterval(df)

# Main functionality - clean *.pkl files up from needless frames

In [13]:
path = "./Pickles/"
# Marker names: four segments for each of the five fingers, then the four
# rigid-body markers and the wrist.
joints = [
    "%s_%s" % (finger, segment)
    for finger in ('Thumb', 'Index', 'Middle', 'Ring', 'Little')
    for segment in ('Fn', 'DIP', 'PIP', 'MCP')
] + ['R_Shape_4', 'R_Shape_2', 'R_Shape_3', 'R_Shape_1', 'Wrist']

# Collect the transformed pickles; everything else in the folder is reported and skipped.
files = []
for file in os.listdir(path):
    if not (file.endswith(".pkl") and file.startswith("dfTransformed_")):
        print("Ignored: %s" %file)
        continue
    files.append(path+file)

In [14]:
def doJob(f):
    """Clean one transformed motion-capture pickle and return a one-row summary.

    Steps:
      1. Parse participant, phone and condition from the file name
         (e.g. ``dfTransformed_P3_S4_walking.pkl``).
      2. Get the task intervals from ``concatDfsNew`` and label every frame whose
         ``Time`` lies inside an interval; frames without a task are dropped.
      3. Compute the angle column from the phone rotation quaternion and drop
         frames whose angle is missing or not below mean + 2 * std.
      4. Per task and per joint, blank out positions that are not strictly within
         mean +/- 3 * std on any axis, then drop frames with a missing fingertip
         ("Fn") value.
      5. Write the cleaned frame to ``TransformedPickles/P<pid>_<phone>_<cond>.pkl``.

    Parameters
    ----------
    f : str
        Path of the pickle to process.

    Returns
    -------
    pandas.DataFrame or None
        One-row frame with the frame counts of every stage, or ``None`` if
        ``f`` does not contain "dfTransformed". The cleaning columns are missing
        when no frame survives the task cut, and partly missing when no frame
        survives the angle filter.
    """
    if "dfTransformed" not in f:
        return None

    df_info = pd.DataFrame()
    pid, phone, cond =  f.replace("_P", "_").replace(".pkl", "").split("_")[1:]
    pid = int(pid)
    if (cond == "sitting"):
        cond = "seated"

    task, firstTime, lastTime = concatDfsNew('P%i'%pid, phone, cond)
    # NOTE(review): unpickling executes code from the file - only load trusted pickles.
    df = pd.read_pickle(f)
    # Replace the phone-specific column prefixes by the neutral prefix "Phone".
    cols = []
    for c in df.columns:
        for replace in ["S3:", "S3_", "S4:", "S4_", "OPO:", "OPO_", "N6:", "N6_"]:
            c = c.replace(replace, "Phone")
        cols.append(c)
    df.columns = cols

    df["Phone"] = phone
    df["Condition"] = cond
    df["PId"] = pid

    df.Time = df.Time.astype(np.int64)
    df_info["Participant"] = [pid]
    df_info["Phone"] = [phone]
    df_info["Cond"] = [cond]
    df_info["Tasks"] = [task]
    df_info["FirstTimes"] = [firstTime]
    df_info["LastTimes"] = [lastTime]

    # Label every frame that falls into one of the task intervals.
    df["Task"] = None
    for i, (t, s, e) in enumerate(zip(task,firstTime,lastTime)):
        in_interval = (df.Time>=s) & (df.Time<=e)
        df.loc[in_interval, 'Task'] = t

        if not in_interval.any():
            # No frame of the recording lies inside this task interval.
            test = ", ".join([str(df.Time.min()), str(df.Time.max()), str(firstTime[i]), str(lastTime[i])])
            print("ERROR:", f, df.Time.min() > 1521936000000,i,  test)

    df_info["ms_before"] = (df["Time"].iloc[-1] - df["Time"].iloc[0])
    df_info["Frames_before"] = len(df)
    df = df.dropna(subset=['Task'])
    if len(df) == 0:
        return df_info

    df_info["Frames_after_t"] = len(df)
    # Fingertip columns.
    cc = [c for c in df.columns if "Fn" in c]

    x = df['PhoneX_Rotation']
    y = df['PhoneY_Rotation']
    z = df['PhoneZ_Rotation']
    w = df['PhoneW_Rotation']

    # Rotation matrix built from the quaternion, one 3x3 matrix per frame
    # (array shape: 3 x 3 x number of frames).
    rot_matrix = np.array([
        [1-2*y*y-2*z*z, 2*x*y+2*w*z, 2*x*z - 2*w*y],
        [2*x*y - 2*w*z, 1-2*x*x-2*z*z, 2*y*z+2*w*x],
        [2*x*z+2*w*y, 2*y*z-2*w*x, 1-2*x*x-2*y*y]])

    # Angle between the middle column of the matrix and the vector [0, 1, 0].
    df["Angle"] = np.degrees(np.arccos(np.dot((rot_matrix[:,1,:].T), [0, 1, 0])))
    df = df[np.logical_not(df["Angle"].isnull())]
    df = df[df["Angle"] < df["Angle"].mean() + df["Angle"].std()*2]

    df_info["Frames_after"] = len(df)
    if len(df) == 0:
        # Nothing left to clean; the ratios below would divide by zero.
        return df_info

    # Share of frames that would additionally become complete if gaps of up to
    # i frames were interpolated.
    markers = df[cc]
    n_frames = len(markers)
    n_complete = len(markers.dropna(subset=cc))
    for i in range (1,10):
        df_info["Filter"+str(i)] = len(markers.interpolate(limit=i).dropna(subset=cc))/n_frames-n_complete/n_frames

    # A label can occur several times in `task` (e.g. several "Fitts" blocks).
    # Each label is processed once, otherwise its frames would be concatenated
    # once per occurrence.
    unique_tasks = []
    for t in task:
        if t not in unique_tasks:
            unique_tasks.append(t)

    helplist = []
    for t in unique_tasks:
        help_df = df[df.Task == t].copy(deep = True)
        for j in joints:
            coords = ["%s_X"%j, "%s_Y"%j, "%s_Z"%j]
            inlier = None
            for col in coords:
                values = help_df[col]
                center = values.mean()
                spread = values.std()*3
                within = (values>center-spread)&(values<center+spread)
                inlier = within if inlier is None else (inlier & within)
            # A joint is blanked on all three axes if any axis is out of range.
            help_df.loc[~inlier, coords] = np.nan
        helplist.append(help_df)

    df = pd.concat(helplist)

    df = df.dropna(subset=cc)
    df_info["Frames_final"] = len(df)
    # presumably 0.24 frames per ms (240 fps capture) - TODO confirm
    df_info["ms_after_approx"] = round(len(df)/0.24)
    pickle_path = "TransformedPickles/"
    pickle_name = 'P%i'%pid
    pickle_path = pickle_path+pickle_name + ("_%s_%s.pkl"%(phone, cond))
    df.to_pickle(pickle_path)
    return df_info

In [15]:
# Process all pickles in parallel, leaving one CPU free (but using at least one
# worker). The context manager shuts the worker processes down once every
# result has been collected; previously the pool was never closed.
with multiprocessing.Pool(max(multiprocessing.cpu_count()-1, 1)) as pool:
    ret = list(tqdm.tqdm(pool.imap_unordered(doJob, files), total=len(files)))

100%|██████████| 120/120 [00:33<00:00,  3.55it/s]


In [17]:
# Stack the per-file summary rows, add the share of frames that survived the
# cleaning (relative to the frames left after the task cut) and store the table.
df_info_real = pd.concat(ret)
df_info_real["Percentage"] = df_info_real["Frames_final"] / df_info_real["Frames_after_t"]
df_info_real.to_csv("./out/info.csv", sep=";")