# Cleaning Data and Improving Matchings
This script performs the following data cleaning actions to remove all erroneous samples caused by the motion capture system (e.g., marker occlusion, reflections, etc.):
- Removing unused columns
- Synchronizing motion data and blobs in capacitive images as described in the paper
- Dropping rows with missing values (e.g. marker occlusion) and samples in which the full-touch smartphone was not properly tracked.

In [1]:
import numpy as np
import pandas as pd
import os    
from datetime import datetime
from IPython import display
import matplotlib.pyplot as plt
import re
from skimage import measure
import os

%matplotlib inline
%run py/labeling_names.py

In [2]:
# Directory of the per-participant pickles; the "raw_data_<participant>.pkl"
# inputs are read from here.
PICKLES_PATH = "./data/pickles/"
# Directory of the per-participant condition logs ("condition_<participant>.txt").
CONDITION_DATA_PATH = "./data/condition/"

In [3]:
def log(s):
    """Append a timestamped status line to the status file and echo it.

    The line is formatted once so that the entry written to
    "status_PY03_2_match_improvement.txt" and the one printed to stdout carry
    the exact same timestamp.

    Args:
        s: The message text (a string).
    """
    line = "[" + str(datetime.now()) + "] " + s
    with open("status_PY03_2_match_improvement.txt", "a") as myfile:
        myfile.write(line + "\n")
    print(line)

In [4]:
def get_participant_data(participant):
    """Load the raw data pickle and the condition log of one participant.

    The condition log is read as a header-less CSV with the columns
    Timestamp, Status, Grip, Finger and Movement. Rows containing missing
    values are dropped, and the textual Movement / Grip labels are replaced
    by integer codes.

    Args:
        participant: Participant identifier as used in the file names
            (e.g. "P5").

    Returns:
        A tuple ``(dfSynced, dfCondition)``: the unpickled raw data and the
        cleaned condition table with integer ``Movement`` and ``Grip`` columns.

    Raises:
        ValueError: If a Movement or Grip value is neither a known label nor
            convertible to an integer.
    """
    movement_codes = {
        'Free Movements': 0,
        'Free Placements + Thumb on screen': 1,
        'Swipe Gestures': 2,
    }
    grip_codes = {
        'Grip 1': 1,
        'Grip 2': 2,
        'Grip 3': 3,
        'Grip 4': 4,
        'Grip 5': 5,
    }

    # NOTE: unpickling executes arbitrary code; only load pickles produced by
    # the project's own preprocessing scripts.
    dfSynced = pd.read_pickle(PICKLES_PATH + "raw_data_" + participant + ".pkl")
    dfCondition = pd.read_csv(
        CONDITION_DATA_PATH + "condition_" + participant + ".txt",
        header=None,
        names=["Timestamp", "Status", "Grip", "Finger", "Movement"],
    )

    dfCondition = dfCondition.dropna()

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the equivalent
    # dtype.
    dfCondition["Movement"] = dfCondition["Movement"].replace(movement_codes).astype(int)
    dfCondition["Grip"] = dfCondition["Grip"].replace(grip_codes).astype(int)

    return dfSynced, dfCondition

In [5]:
def unify_column_names(df, participant):
    """Rename participant-specific column names to a common scheme.

    Columns containing the participant's rigid body name get that name
    replaced by "Phone"; columns containing the participant's hand markerset
    prefix get the "<prefix>:" part removed. Both rename maps are derived from
    the column names of the incoming frame and then applied one after another.

    Args:
        df: Data frame whose columns should be renamed.
        participant: Key into RIGID_BODY_NAMES and HAND_MARKERSET_PREFIX.

    Returns:
        A data frame with the renamed columns.
    """
    rigid_body = RIGID_BODY_NAMES[participant]
    hand_prefix = HAND_MARKERSET_PREFIX[participant]

    phone_renames = {
        name: name.replace(rigid_body, "Phone")
        for name in df.columns
        if rigid_body in name
    }
    hand_renames = {
        name: name.replace(hand_prefix + ":", "")
        for name in df.columns
        if hand_prefix in name
    }

    return df.rename(columns=phone_renames).rename(columns=hand_renames)

In [6]:
def set_moving_finger(df, dfCondition):
    """Label every frame with the finger that is moving in its condition.

    The condition table is read as alternating rows: even rows supply the
    start timestamp and the finger, odd rows the end timestamp. Frames whose
    ``OptiTrack_Timestamp`` lies in ``[start, end)`` receive the finger code
    (Thumb=0, Index=1, Middle=2, Ring=3, Little=4); all other frames keep -1.

    Note that the ``MovingFinger`` column is added to the passed-in frame
    itself before it is sorted.

    Args:
        df: Frame data with an ``OptiTrack_Timestamp`` column.
        dfCondition: Condition table with ``Timestamp`` and ``Finger`` columns.

    Returns:
        The frames sorted by ``OptiTrack_Timestamp`` with ``MovingFinger`` set.
    """
    finger_codes = {'Thumb':0, 'Index':1, 'Middle':2, 'Ring':3, 'Little':4}

    df['MovingFinger'] = np.full(len(df), -1, dtype=int)
    df = df.sort_values(by="OptiTrack_Timestamp")

    start_rows = dfCondition[::2]
    end_rows = dfCondition[1::2]
    starts = np.array(start_rows.Timestamp)
    ends = np.array(end_rows.Timestamp)
    fingers = np.array(start_rows.Finger.replace(finger_codes).astype(int))

    for idx, starttime in enumerate(starts):
        endtime = ends[idx]
        timestamps = df.OptiTrack_Timestamp
        in_interval = (timestamps >= starttime) & (timestamps < endtime)
        df.loc[in_interval, 'MovingFinger'] = fingers[idx]

    return df

In [7]:
def drop_nan_rows(df):
    """Drop every frame that lacks a required X coordinate.

    A frame is removed when the X value of any of the five fingernail columns
    or of the phone (``Phone_X``) is missing. Missing values in other columns
    do not matter.
    """
    required_columns = [
        "Thumb_Fn_X",
        "Index_Fn_X",
        "Middle_Fn_X",
        "Ring_Fn_X",
        "Little_Fn_X",
        "Phone_X",
    ]
    return df.dropna(axis=0, how="any", subset=required_columns)

In [8]:
def remove_errorneous_samples(df, participant):
    """Keep only frames whose fingernail X values lie near the device.

    For each of the five fingers the ``<Finger>_Fn_X`` value must be strictly
    between ``-MAX_OUTOFDEVICE_DISTANCE`` and
    ``NEXUS_5_SCREEN_WIDTH_MM + MAX_OUTOFDEVICE_DISTANCE``. Only the X
    coordinate is checked.

    Args:
        df: Frame data with the five ``*_Fn_X`` columns.
        participant: Not used by this function.

    Returns:
        The rows of ``df`` that satisfy the bounds for all five fingers.
    """
    lower = -MAX_OUTOFDEVICE_DISTANCE
    upper = NEXUS_5_SCREEN_WIDTH_MM + MAX_OUTOFDEVICE_DISTANCE

    keep = None
    for finger in ("Thumb", "Index", "Middle", "Ring", "Little"):
        xs = df[finger + "_Fn_X"]
        within = (xs < upper) & (xs > lower)
        keep = within if keep is None else keep & within

    return df[keep]

In [9]:
def errorneous_removal_heuristics(df):
    """Drop frames whose phone orientation is a statistical outlier.

    From the phone's quaternion columns the three entries at column index 1 of
    the rotation matrix are computed and dotted with ``[0, 1, 0]``; the arccos
    of that value (in degrees) is the per-frame angle. Frames with an undefined
    angle are discarded, as are frames whose angle is not below
    ``mean + 4 * std`` of the remaining angles.

    Args:
        df: Frame data with ``Phone_{X,Y,Z,W}_Rotation`` columns.

    Returns:
        The surviving rows of ``df``, selected by row position.
    """
    x = df["Phone_X_Rotation"]
    y = df["Phone_Y_Rotation"]
    z = df["Phone_Z_Rotation"]
    w = df["Phone_W_Rotation"]

    # Only the matrix entries that take part in the dot product are needed.
    # All three are kept so that a NaN in any of them still yields a NaN angle.
    column_entries = np.array([
        2*x*y + 2*w*z,
        1 - 2*x*x - 2*z*z,
        2*y*z - 2*w*x,
    ])
    cosines = np.dot(column_entries.T, [0, 1, 0])

    # A fresh frame gives the angles a 0..n-1 index, i.e. row positions in df.
    angles = pd.DataFrame(np.degrees(np.arccos(cosines)))
    defined = angles[angles[0].notnull()]
    limit = defined[0].mean() + defined[0].std() * 4
    kept_positions = defined[defined[0] < limit].index

    return df.iloc[kept_positions]

In [10]:
def remove_unnecessary_columns(df):
    """Return ``df`` without the columns listed in UNNECESSARY_COLUMNS.

    Dropping them reduces the size of the saved file.
    """
    return df.drop(columns=UNNECESSARY_COLUMNS)

In [11]:
def getContourBoundingBox(img):
    """Compute bounding boxes of the contours found in an image.

    Contours are extracted with ``measure.find_contours`` at level 35.
    Contours with five or fewer points are ignored. Column 1 of a contour is
    used as the x coordinate and column 0 as the y coordinate.

    Args:
        img: 2D image array passed to ``measure.find_contours``.

    Returns:
        Four parallel lists ``(min_x, max_x, min_y, max_y)`` holding one entry
        per retained contour.
    """
    min_x, max_x, min_y, max_y = [], [], [], []

    for contour in measure.find_contours(img, 35):
        if len(contour) <= 5:
            continue

        xs = contour[:, 1]
        ys = contour[:, 0]
        min_x.append(xs.min())
        max_x.append(xs.max())
        min_y.append(ys.min())
        max_y.append(ys.max())

    return min_x, max_x, min_y, max_y

In [12]:
def is_in_bounding_box(x, y, moving_finger, bb):
    """Check whether a position falls strictly inside any bounding box.

    The position is multiplied by 1000 (conversion to mm), then scaled by
    15/62 horizontally and 27/110 vertically (62 mm screen width, 110 mm
    screen height) and shifted by 2 in both directions. For the thumb the
    horizontal value is additionally shifted by 17 (one screen width of 15
    plus 1 on the left and 1 on the right side).

    Args:
        x: Horizontal position.
        y: Vertical position.
        moving_finger: Finger name; only "Thumb" changes the result.
        bb: Four parallel sequences ``(min_x, max_x, min_y, max_y)``.

    Returns:
        True if the transformed position lies strictly inside at least one
        box, otherwise False.
    """
    col = (15/62) * (x * 1000) + 2
    row = (27/110) * (y * 1000) + 2

    shift = 17 if moving_finger == "Thumb" else 0
    shifted_col = col + shift

    return any(
        left < shifted_col < right and top < row < bottom
        for left, right, top, bottom in zip(bb[0], bb[1], bb[2], bb[3])
    )

In [13]:
def improve_matching(df):
    """Find, for every frame, the earlier frame that best fits its contours.

    For frame ``idx`` the preceding 240 frames are scanned. Every earlier frame
    in which the moving finger's fingernail position lies inside one of the
    contour bounding boxes of frame ``idx`` is a candidate, and the middle
    candidate is taken as the match. A frame matches itself when it has fewer
    than 240 predecessors, no moving finger (-1), or no candidate.

    The result is stored in a new ``Matches`` column of the passed-in frame.
    The stored values are row positions within ``df``.

    Args:
        df: Frame data with the ``*_Fn_X`` / ``*_Fn_Y`` columns of all five
            fingers plus ``MovingFinger`` and ``ContourBoundingBoxes``.

    Returns:
        ``df`` with the added ``Matches`` column.
    """
    finger_name = {
        -1: None,
        0: "Thumb",
        1: "Index",
        2: "Middle",
        3: "Ring",
        4: "Little",
    }

    cols = [
        "Thumb_Fn_X",
        "Thumb_Fn_Y",
        "Index_Fn_X",
        "Index_Fn_Y",
        "Middle_Fn_X",
        "Middle_Fn_Y",
        "Ring_Fn_X",
        "Ring_Fn_Y",
        "Little_Fn_X",
        "Little_Fn_Y",
        "MovingFinger",
        "ContourBoundingBoxes",
    ]

    window = 240
    frames = np.array(df[cols])
    finger_col = cols.index("MovingFinger")
    boxes_col = cols.index("ContourBoundingBoxes")
    total = len(df)

    matches = []
    for idx in range(len(frames)):
        # Not enough history yet: the frame stays matched to itself.
        if idx < window:
            matches.append(idx)
            continue

        mov_finger = finger_name[frames[idx][finger_col]]
        boundingboxes = frames[idx][boxes_col]

        if idx % 1000 == 0:
            print(f"Progress: {idx} / {total} ({idx / total * 100:2.2f}%)", end='\r')

        if mov_finger is None:
            matches.append(idx)
            continue

        x_col = cols.index(mov_finger + "_Fn_X")
        y_col = cols.index(mov_finger + "_Fn_Y")
        candidates = [
            j
            for j in range(idx - window, idx)
            if is_in_bounding_box(frames[j][x_col], frames[j][y_col], mov_finger, boundingboxes)
        ]

        # Take the middle candidate; fall back to the frame itself.
        matches.append(candidates[len(candidates) // 2] if candidates else idx)

    df['Matches'] = matches
    return df

In [14]:
def apply_matching(df):
    """Pair each frame's ``MatrixMerged`` with the data of its matched frame.

    ``MatrixMerged`` stays in its own row, while all other columns are taken
    from the row referenced by ``Matches``. Both ``Matches`` columns produced
    by the merge are removed, and the last two remaining columns are moved to
    the front.

    NOTE(review): ``Matches`` is looked up against the index labels of ``df``
    (``right_index=True``); this presumably relies on the labels being equal
    to row positions -- confirm against the caller.

    Args:
        df: Frame data containing ``MatrixMerged`` and ``Matches``.

    Returns:
        The re-paired data frame.
    """
    matrices = df[["MatrixMerged", "Matches"]]
    tracking = df.drop(columns=["MatrixMerged"])

    merged = matrices.merge(tracking, how='left', left_on='Matches', right_index=True)
    merged = merged.drop(columns=["Matches_y", "Matches_x"])

    ordered = merged.columns.tolist()
    return merged[ordered[-2:] + ordered[:-2]]

In [15]:
# Process every raw participant pickle that has no corrected output yet.
for filename in os.listdir(PICKLES_PATH):
    # To start from full_data (script 1), uncomment all boxes except the errorneous_samples and drop_nan_rows.
    # The whole file name must match, with a literal ".pkl" suffix.
    match = re.fullmatch(r"raw_data_(P[0-9]+)\.pkl", filename)
    if match is None:
        continue

    current_participant = match.group(1)
    output_path = PICKLES_PATH + "corrected_data_with_bb_" + current_participant + ".pkl"

    if os.path.isfile(output_path):
        log(current_participant + " is already available. Skipped.")
        continue

    log("Start reading " + PICKLES_PATH + filename)
    dfTemp, dfCondition = get_participant_data(current_participant)

    # Make naming of columns consistent
    log(current_participant + ": Unify Column Names.")
    dfTemp = unify_column_names(dfTemp, current_participant)

    # Remove unused columns
    log(current_participant + ": Remove unused columns.")
    dfTemp = remove_unnecessary_columns(dfTemp)

    # Set currently moving finger by merging condition data with raw data
    log(current_participant + ": Add moving finger column.")
    dfTemp = set_moving_finger(dfTemp, dfCondition)

    # Create contours
    log(current_participant + ": Create Contour Bounding Boxes.")
    dfTemp['ContourBoundingBoxes'] = dfTemp.MatrixMerged.apply(getContourBoundingBox)

    # Perform a better matching based on blob detection
    log(current_participant + ": Find best matches to improve sync.")
    dfTemp = improve_matching(dfTemp)

    # Apply matching and remove temporary columns
    log(current_participant + ": Apply column matching.")
    dfTemp = apply_matching(dfTemp)

    # Drop rows with missing values
    dfTemp = drop_nan_rows(dfTemp)

    # Remove samples in which the full-touch smartphone is not properly tracked
    dfTemp = errorneous_removal_heuristics(dfTemp)

    # Set participant ID (numeric part of e.g. "P5"). `np.int` was removed in
    # NumPy 1.24, so the builtin `int` dtype is used.
    participant_id = int(current_participant[1:])
    dfTemp['Participant'] = np.full(len(dfTemp), participant_id, dtype=int)

    dfTemp.to_pickle(output_path)

[2018-07-13 15:06:13.828359] P5 is already available. Skipped.
[2018-07-13 15:06:13.828945] P21 is already available. Skipped.
[2018-07-13 15:06:13.829281] P20 is already available. Skipped.
[2018-07-13 15:06:13.829595] P11 is already available. Skipped.
[2018-07-13 15:06:13.829913] P3 is already available. Skipped.
[2018-07-13 15:06:13.830234] P6 is already available. Skipped.
[2018-07-13 15:06:13.830555] P19 is already available. Skipped.
[2018-07-13 15:06:13.830859] P16 is already available. Skipped.
[2018-07-13 15:06:13.831196] P14 is already available. Skipped.
[2018-07-13 15:06:13.831510] P13 is already available. Skipped.
[2018-07-13 15:06:13.832572] P8 is already available. Skipped.
[2018-07-13 15:06:13.832974] P9 is already available. Skipped.
[2018-07-13 15:06:13.833307] P18 is already available. Skipped.
[2018-07-13 15:06:13.833628] P2 is already available. Skipped.
[2018-07-13 15:06:13.833937] P10 is already available. Skipped.
[2018-07-13 15:06:13.834243] P22 is already av