In [1]:
import pandas as pd, numpy as np
import os, json

# Path components of the raw-data directory (relative to the working directory);
# they are unpacked into os.path.join(...) when the input CSV files are read.
data_path = ['..', 'data', 'raw']

# Data preparation
- calculating variables
- reformatting columns

In [2]:
# load the three raw tables (respondents, results, interactions);
# the first CSV column of each file becomes the DataFrame index
raw_dir = os.path.join(*data_path)

respondents = pd.read_csv(os.path.join(raw_dir, 'respondents.csv'), index_col=0)
results = pd.read_csv(os.path.join(raw_dir, 'results.csv'), index_col=0)
interactions = pd.read_csv(os.path.join(raw_dir, 'interactions.csv'), index_col=0)

## Respondents data

In [3]:
# calculate umux score
# Items umux1 and umux3 add to the score, umux2 and umux4 subtract from it;
# the constant 12 and the division by 24 rescale the result to a percentage,
# which is then rounded to two decimals.
# Computed column-wise in a single expression: unlike a row-by-row
# `.loc[index, 'umux'] = ...` loop this stays correct when index labels repeat
# and still creates the column when the table is empty.
umux_points = (
    12
    + respondents['umux1'] + respondents['umux3']
    - respondents['umux2'] - respondents['umux4']
)
respondents['umux'] = (umux_points / 24 * 100).round(2)

In [4]:
# format time
def _clock_to_minutes(stamp):
    """Convert a colon-separated 'a:b:c' string to a*60 + b + c/60."""
    parts = stamp.split(':')
    return int(parts[0]) * 60 + int(parts[1]) + int(parts[2]) / 60


# replace the textual duration with its numeric value, rounded to two decimals
respondents['time'] = respondents['time'].apply(_clock_to_minutes).round(2)

In [5]:
# reformat the hotspot options variable
# Full-sentence survey answers -> short codes; anything not listed becomes 'non-answer'.
hotspot_codes = {
    'I have noticed the blue flashing areas.': 'noticed',
    'The blue flashing areas helped me complete the task more easily.': 'helped',
    'I sometimes deliberately clicked unrelated areas of the screen to make the blue flashing areas appear.': 'used',
    'The blue flashing areas distracted me while navigating the prototype.': 'distracted',
    'The blue areas flash too quickly.': 'sped-up',
    'None of the above.': 'none-above',
}


def _encode_hotspot_answers(answer):
    """Translate a ', '-separated list of answers into short codes (None for missing values)."""
    if pd.isnull(answer):
        return None
    return ', '.join(hotspot_codes.get(option, 'non-answer') for option in answer.split(', '))


respondents['hotspotOptions'] = respondents['hotspotOptions'].apply(_encode_hotspot_answers)

# Respondent 15 of the non-interactive variant receives the most frequent answer
# combination among non-interactive respondents
# (presumably an imputation for a missing/unusable answer -- TODO confirm).
is_non_interactive = respondents.variant == 'non-interactive'
option_counts = respondents[is_non_interactive].groupby(['hotspotOptions']).size()
most_common_options = option_counts.sort_values(ascending=False).index[0]
respondents.loc[(respondents.respondent == 15) & is_non_interactive, 'hotspotOptions'] = most_common_options

In [6]:
# count the spam clicks and hotspot helps
def _count_listed(cell):
    """Number of ', '-separated entries in `cell`; 0 when the cell is missing."""
    if pd.isnull(cell):
        return 0
    return len(cell.split(', '))


# only rows of the non-interactive variant are filled in; other rows are left untouched
non_interactive_labels = respondents[respondents.variant == 'non-interactive'].index
for label in non_interactive_labels:
    respondents.loc[label, 'spamCount'] = _count_listed(respondents.loc[label, 'spamClicks'])
    respondents.loc[label, 'helpOccurences'] = _count_listed(respondents.loc[label, 'hotspotHelp'])

In [7]:
# drop umux columns
# the four item columns are removed; only the aggregated score column stays
umux_items = ["umux1", "umux2", "umux3", "umux4"]
respondents = respondents.drop(columns=umux_items)

## Interaction data

In [8]:
# assign interaction ids
# consecutive 1-based ids in the current row order
n_interactions = interactions.shape[0]
interactions['id'] = range(1, n_interactions + 1)

In [9]:
# calculate first clicks and last clicks
# Both flags start as False and are switched on for the lowest- and
# highest-'order' row of every (variant, respondent, task, screen) group.
interactions['first'] = False
interactions['last'] = False
ids_first, ids_last = [], []

screen_visit_keys = ['variant', 'respondent', 'task', 'screen']
for _, visit in interactions.groupby(screen_visit_keys):
    ordered_ids = visit.sort_values(['order'])['id']
    ids_first.append(ordered_ids.iloc[0])
    ids_last.append(ordered_ids.iloc[-1])

interactions.loc[interactions['id'].isin(ids_first), 'first'] = True
interactions.loc[interactions['id'].isin(ids_last), 'last'] = True

## Result data

In [10]:
# reformat the success variable
# True where the stored value equals 1, False for everything else (including missing values)
results['success'] = results['success'] == 1

In [11]:
# prototype screens
# Every raw path is a ' > '-separated sequence of steps; the screen name is the
# part of a step before its first space (the whole step when it has no space).
# The names are collected straight into a set: this avoids the former temporary
# list named `all`, which shadowed the builtin all() for the rest of the
# session, and also works when the results table has no rows.
screens = {
    step.partition(' ')[0]
    for raw_path in results['rawPath']
    for step in raw_path.split(' > ')
}

In [12]:
# calculate path lengths and clean the paths
# also calculate issue counts, click counts

for idx, record in results.iterrows():
    # keep only the tokens of the raw path that are known screen names
    visited = [token for token in record.rawPath.split(' ') if token in screens]
    results.loc[idx, 'path'] = ' > '.join(visited)
    results.loc[idx, 'pathLength'] = len(visited)

    # issues: a ', '-separated list where "ok" and "skip" do not count as issues
    issues = results.loc[idx, 'issues']
    has_answer = not pd.isnull(issues)
    if has_answer:
        real_issues = [entry for entry in issues.split(', ') if entry not in ("ok", "skip")]
    else:
        real_issues = []
    results.loc[idx, 'issuesCount'] = len(real_issues)
    results.loc[idx, 'issuesReported'] = has_answer and issues not in ("ok", "skip")

    # all interaction rows recorded for this variant / respondent / task
    task_clicks = interactions[
        (interactions.variant == record.variant)
        & (interactions.respondent == record.respondent)
        & (interactions.task == record.task)
    ]
    results.loc[idx, 'clicks'] = task_clicks.shape[0]

    # The JSON in the first matching row's `path` column is a list of items.
    # Items carrying a 'c' or 'h' key count as non-dead clicks: one per entry
    # of their 'i' value when present, otherwise exactly one.
    logged_items = json.loads(task_clicks.path.values[0])
    non_dead = sum(
        len(entry['i']) if 'i' in entry else 1
        for entry in logged_items
        if 'c' in entry or 'h' in entry
    )

    results.loc[idx, 'deadClicks'] = results.loc[idx, 'clicks'] - non_dead

In [13]:
# remove path column
# errors='ignore' keeps this a no-op when the column is already gone
interactions = interactions.drop(columns='path', errors='ignore')

## Funnel data

In [14]:
# tempFunnels[variant][taskName][screen][nextScreen] -> number of observed
# transitions from `screen` to `nextScreen`
tempFunnels = {
    variant: {taskName: {} for taskName in ("task1", "task2", "task3")}
    for variant in ("interactive", "non-interactive")
}

allScreens = []

# calculate funnels
for variant, task_funnels in tempFunnels.items():
    for task, taskName in enumerate(("task1", "task2", "task3"), start=1):
        transitions = task_funnels[taskName]
        task_results = results[(results.variant == variant) & (results.task == task)]
        for cleaned_path in task_results['path']:
            # skip BEGIN steps; PROFILE steps are cut at the first '-',
            # all other steps at the first '/'
            steps = [
                step.split("-")[0] if step.startswith('PROFILE') else step.split("/")[0]
                for step in cleaned_path.split(' > ')
                if not step.startswith('BEGIN')
            ]
            allScreens.extend(steps)
            for position, current in enumerate(steps):
                # every visited screen gets an entry, even without a successor
                successors = transitions.setdefault(current, {})
                if position + 1 < len(steps):
                    following = steps[position + 1]
                    successors[following] = successors.get(following, 0) + 1

In [15]:
# save funnels into dataframe
# One row per (task, screen, nextScreen) combination with the transition count
# of each variant (0 when the transition was never observed).
# The rows are collected in a list and the frame is built once, instead of
# being enlarged one `.loc` row at a time; the screens are iterated in sorted
# order so the row order no longer depends on set iteration order.
allScreens = set(allScreens)
ordered_screens = sorted(allScreens)

funnel_rows = [
    [
        task, screen, nextScreen,
        tempFunnels["interactive"][taskName].get(screen, {}).get(nextScreen, 0),
        tempFunnels["non-interactive"][taskName].get(screen, {}).get(nextScreen, 0),
    ]
    for task, taskName in [(1, "task1"), (2, "task2"), (3, "task3")]
    for screen in ordered_screens
    for nextScreen in ordered_screens
]

funnels = pd.DataFrame(
    funnel_rows,
    columns=["task", "screen", "nextScreen", "interactive", "nonInteractive"],
)

In [16]:
# calculate differences and sums
# keep transitions seen in at least one variant, add the absolute difference
# and the total of both counts, then order rows by task / screen / nextScreen
seen_in_any_variant = (funnels['interactive'] != 0) | (funnels['nonInteractive'] != 0)

funnels = (
    funnels[seen_in_any_variant]
    .reset_index(drop=True)
    .assign(**{
        'diff': lambda frame: np.abs(frame['interactive'] - frame['nonInteractive']),
        'sum': lambda frame: frame['interactive'] + frame['nonInteractive'],
    })
    .sort_values(by=["task", "screen", "nextScreen"])
    .reset_index(drop=True)
)

# Export

In [17]:
# export
# write every prepared table as CSV into the data directory one level above the raw files
export_dir = os.path.join('..', 'data')

for frame, filename in (
    (respondents, 'respondents.csv'),
    (results, 'results.csv'),
    (interactions, 'interactions.csv'),
    (funnels, 'funnels.csv'),
):
    frame.to_csv(os.path.join(export_dir, filename))