Build examples of user tasks for illustration purposes.

## Setup


In [37]:
%matplotlib inline
import pymongo
from pymongo import MongoClient
import numpy as np
import itertools
import urllib
import sys
sys.path.append('../variables/')
import UserActivity
import matplotlib.pylab as pylab
from bson.objectid import ObjectId
from scipy.stats import gaussian_kde, ks_2samp

# DB connections
client = MongoClient()
db = client.db_tasklog_clean
# Collections
User = db.user
Log = db.log_chrome
Labelled = db.data_labeled
UserTasks = db.user_tasks

# Task ids to be excluded (subtracted from the labelled task ids in labelled_tasks())
to_exclude = ['000', '001', '002', '003', '004']
# Pre-defined labels.
# NOTE(review): the original note said this list excludes "not sure (001)", but the
# list contains '001' and omits '000' -- confirm which id stands for "not sure".
pre_tasks = ['001', '002', '003', '004']
# Event names that mark a new query
query_events = ['tab-search-new', 'tab-search-verticle'] 
# Get user task activities
# Set dummy parameters: UserActivity requires them, but they are not used here
t_thresh = -1 # Look at first X minutes, not used
session_thresh = 30 # in minutes, threshold to consider users have left without closing the tab, not used
to_include = [] # Not used for this computation




In [38]:
def event_stream(data):
    """Flatten nested tab/url groups into one chronologically sorted event list.

    Parameters
    ----------
    data : iterable of dict
        Tab groups. Each item must provide a 'tab_group' key holding an
        iterable of url groups; each url group must provide a 'url_group'
        key holding an iterable of event dicts; each event must provide a
        'timestamp_bson' value that supports ordering.
        No other key (e.g. 'tabid') is required.

    Returns
    -------
    list
        All events, sorted in ascending 'timestamp_bson' order. The sort is
        stable, so events with equal timestamps keep their input order.
    """
    events = [
        event
        for tab in data                      # data consists of tab groups
        for url_entry in tab['tab_group']    # tab groups consist of url groups
        for event in url_entry['url_group']  # events sharing the same url
    ]
    events.sort(key=lambda event: event['timestamp_bson'])
    return events

def logical_session(data, max_gap_minutes=90):
    """Split a stream of events into logical sessions.

    A logical session is a maximal run of consecutive events such that:
      1. every event belongs to the same task ('taskid' is equal), and
      2. the pause between two consecutive events is shorter than
         ``max_gap_minutes`` (a longer pause is a session break; the user
         may have left).
    Sessions that last 0 seconds (first and last event share the same
    timestamp, which includes single-event sessions) are discarded.

    Parameters
    ----------
    data : iterable of (event, on) pairs
        ``event`` is a dict with 'taskid' and 'timestamp_bson' keys, where
        subtracting two 'timestamp_bson' values yields an object with
        ``total_seconds()``. ``on`` is ignored: the stream is expected to
        be already filtered to events where the user is "on".
    max_gap_minutes : number, optional
        Pause length, in minutes, at or above which a new session starts.
        Defaults to 90.

    Returns
    -------
    list of list
        The logical sessions, each a list of event dicts in stream order.
        An empty stream yields an empty list.
    """
    def _duration_seconds(session):
        # Time elapsed between the first and the last event of a session.
        return (session[-1]['timestamp_bson'] - session[0]['timestamp_bson']).total_seconds()

    sessions = []
    current = []
    for event, _on in data:
        if current:
            prev = current[-1]
            gap_minutes = (event['timestamp_bson'] - prev['timestamp_bson']).total_seconds() / 60.0
            # A task switch or a long pause closes the running session.
            if prev['taskid'] != event['taskid'] or gap_minutes >= max_gap_minutes:
                if _duration_seconds(current) > 0:
                    sessions.append(current)
                current = []
        current.append(event)
    # Add the last session; the emptiness check must come before indexing,
    # otherwise an empty stream raises IndexError.
    if current and _duration_seconds(current) > 0:
        sessions.append(current)
    return sessions
    

# Gather user activity data for analysis:
# UA collects one (userid, stream) pair per user found in the User collection.
UA = []
users = list(User.find({}))
for u in users:
    userid = u['userid']
    labelled_docs = list(Labelled.find({'userid': userid}))
    # Only the first labelled document of the user is considered
    data = labelled_docs[0]['data']
    # Flattened, time-sorted events (not used further in this cell)
    events = event_stream(data)
    ua = UserActivity.UserActivity(data, to_include, t_thresh, session_thresh)
    a_path, a_stream = ua.get_user_path()

    # Drop entries where the user was not on the tab (second field is not True)
    user_stream = list(entry for entry in a_stream if entry[1] == True)
    UA.append((userid, user_stream))

## Example task and sessions (physical, logical, task)

## Example tasks at different cognitive levels

## Task content categorisation w.r.t ODP

## Task with subtasks

In [49]:
def all_registered_tasks():
    """Return, as a list, every UserTasks document whose 'task_level' is 0."""
    top_level_query = {'task_level': 0}
    return list(UserTasks.find(top_level_query))

def labelled_tasks():
    """Collect the ids of the tasks that were used to label events.

    The ids are taken from the logical sessions of every user stream in UA,
    i.e. the user should be "on" the page and a logical session of a task
    should last more than 0 seconds. The 'None' id and the ids listed in
    to_exclude are left out.

    Returns a set of task ids.
    """
    collected = set()
    for _userid, stream in UA:
        # The first event of a logical session carries the session's task id
        first_taskids = set(session[0]['taskid'] for session in logical_session(stream))
        # filter out predefined and None tasks
        collected |= first_taskids - set(['None'] + to_exclude)
    return collected

def task_hierarchy(labelled_tasks):
    """Map labelled top-level tasks to the names of their subtasks.

    For every id in labelled_tasks, the task is looked up in UserTasks by
    '_id' (the id is converted with ObjectId). Ids without a matching
    document are skipped. Tasks for which at least one document has
    'parent_task' equal to the id are added to the result.

    Returns a dict: 'task' field of the parent -> list of the 'task'
    fields of its subtasks.
    """
    hierarchy = {}
    for taskid in labelled_tasks:
        matches = list(UserTasks.find({'_id': ObjectId(taskid)}))
        if not matches:
            continue
        children = list(UserTasks.find({'parent_task': taskid}))
        if children:
            parent_name = matches[0]['task']
            hierarchy[parent_name] = [child['task'] for child in children]
    return hierarchy


print "Number of tasks registered (top level): ", len(all_registered_tasks())
LT = labelled_tasks()
print "Number of tasks labelled (top level): ", len(LT)
# Among labelled tasks, those that have subtasks
task_hier = task_hierarchy(LT)
print "Number of labelled tasks that are registered with subtasks:", len(task_hier)
for task in task_hier:
    print task, task_hier[task]
    
#print "Number of subtasks that were used to label events: "

Number of tasks registered (top level):  305
Number of tasks labelled (top level):  289
Number of labelled tasks that are registered with subtasks: 17
Finding job [u'Checking Email']
Accoutability Hack [u'parse XML', u'prepare back-end']
plan vacation trip christmas [u'find out how to go from denmark to norway', u'find accommodation in oslo']
find brownie recipe [u'needs to use only ingredients I already have']
Check readings for Construction [u'download readings']
Prepare for Black Friday sales [u'Buy Stand Mixer', u'Buy Shaver']
Plan winter trip [u'Browse Train tickets']
Tasklog user study [u'UI/logger coding', u'Analysis', u'Recruiting users', u'study design']
Maritime arb readings [u'Ask vivien about printer, need to pay?']
Parents visa application [u'Translation', u'Application files', u'Visa info']
Finish logging hours for isd pay. [u'March ', u'September']
create trailer for software platform [u'find rights-free music', u'check current applications', u'adobe premiere troubleshoo