Create examples of user tasks for illustration purposes.

## Setup


In [199]:
%matplotlib inline
import pymongo
from pymongo import MongoClient
import numpy as np
import itertools
import urllib
import sys
sys.path.append('../variables/')
import UserActivity
import matplotlib.pylab as pylab
from bson.objectid import ObjectId
from scipy.stats import gaussian_kde, ks_2samp

# DB connections
# MongoClient() is called with no arguments, so pymongo's default connection settings apply.
client = MongoClient()
db = client.db_tasklog_clean
# Collections
User = db.user
Log = db.log_chrome
Labelled = db.data_labeled
UserTasks = db.user_tasks

# Tasks to be excluded
to_exclude = ['000', '001', '002', '003', '004']
# Pre-defined labels.
# NOTE(review): the original comment said this excludes "not sure (001)", but the
# list contains '001' and omits '000' - confirm which task id means "not sure".
pre_tasks = ['001', '002', '003', '004']
# Event names that mark a new query
query_events = ['tab-search-new', 'tab-search-verticle'] 
# Get user task activities
# Set dummy parameters
t_thresh = -1 # Look at first X minutes, not used
session_thresh = 30 # in minutes, threshold to consider users have left without closing the tab, not used
to_include = [] # Not used for this computation 

# Task property names: maps questionnaire field names to short codes.
# NOTE(review): 'task_sailence_subjective' looks like a misspelling of "salience", but it is
# used as a lookup key - presumably it matches the stored data; do not rename without checking.
TP = {
      'task_stage': 'STG',
      'task_collaboration': 'COL',
      'task_difficulty_subjective': 'DIF',
      'task_sailence_subjective': 'IMP',
      'task_frequency': 'FQ',
      'task_satisfaction': 'SAT',
      'task_knowledge_topic': 'KT',
      'task_complexity_objective': 'CL',
      'task_knowledge_procedure': 'KP',
      'task_complexity_subjective': 'COM',
      'task_length': 'TL',
      'task_urgency_subjective': 'UR'
 }
# Questionnaire fields that get_postQ() skips when renaming via TP
TP_exclude = ['task_goals_quantity']

## Functions for getting labelled tasks

In [200]:
def all_registered_tasks():
    """Return every UserTasks document whose 'task_level' is 0, as a list."""
    top_level_query = {'task_level': 0}
    return list(UserTasks.find(top_level_query))

def task_names():
    """Map str(_id) -> task name for every UserTasks document with 'task_level' 0."""
    top_level = UserTasks.find({'task_level': 0})
    return dict((str(doc['_id']), doc['task']) for doc in top_level)

def get_postQ():
    """Collect the post-questionnaire answers of all users.

    Returns a list of (task key, answers) pairs, one per task entry found under
    each user's ['postQ']['questionnaire']. In `answers`, the questionnaire field
    names are replaced by their short codes from TP; fields listed in TP_exclude
    are skipped. A field that is in neither TP nor TP_exclude raises KeyError.
    """
    task_answers = []
    for user in User.find({}):
        questionnaire = user['postQ']['questionnaire']
        for task_key in questionnaire:
            raw = questionnaire[task_key]
            coded = {}
            for field in raw:
                if field in TP_exclude:
                    continue
                coded[TP[field]] = raw[field]
            task_answers.append((task_key, coded))
    return task_answers


In [201]:
# All events
def event_stream(data):
    """Flatten the nested log structure into one list of events ordered by time.

    data: iterable of dicts, each with a 'tab_group' entry. A 'tab_group' is an
        iterable of dicts with a 'url_group' entry, and a 'url_group' is an
        iterable of events (a group of events sharing the same url).
    Returns a new list with every event, sorted by its 'timestamp_bson' value.
    The sort is stable, so events with equal timestamps keep their input order.

    Only 'tab_group' is read from each top-level item; items are not required
    to carry a 'tabid' key.
    """
    events = [
        event
        for tab in data
        for url_entry in tab['tab_group']
        for event in url_entry['url_group']
    ]
    events.sort(key=lambda x: x['timestamp_bson'])
    return events

# Get logical sessions
def logical_session(data, max_gap_minutes=90):
    """Split a stream of events into logical sessions.

    Criteria for a logical session:
    1. consecutive events belong to the same task (same 'taskid');
    2. each event follows the previous one by less than `max_gap_minutes`
       minutes - a pause of that length or longer makes a session break
       (user may have left);
    3. the session lasts more than 0 seconds (first to last event).

    data: iterable of (event, on) pairs. Each event is a dict with 'taskid' and
        'timestamp_bson'; subtracting two timestamps must give an object with
        total_seconds(). The `on` flag is not inspected here: data should
        already be filtered to events where the user is "on".
    max_gap_minutes: pause length, in minutes, that breaks a session (default 90).

    Returns a list of sessions, each a list of events. Empty input gives [].
    """
    def _close(session, sessions):
        # Keep the session only if it is non-empty and lasts more than 0 seconds.
        if not session:
            return
        duration = (session[-1]['timestamp_bson'] - session[0]['timestamp_bson']).total_seconds()
        if duration > 0:
            sessions.append(session)

    sessions = []
    current = []
    prev = None
    for event, _on in data:
        if prev is not None:
            gap = (event['timestamp_bson'] - prev['timestamp_bson']).total_seconds() / 60.0
            # A task switch or a long pause ends the current session.
            if prev['taskid'] != event['taskid'] or gap >= max_gap_minutes:
                _close(current, sessions)
                current = []
        current.append(event)
        prev = event
    # Add the last session (no-op when data was empty)
    _close(current, sessions)
    return sessions
    

# Gather user activity data for analysis.
# UA ends up holding one (userid, user_stream) pair per user document.
UA = []
users = list(User.find({}))
for u in users:
    userid = u['userid']
    # Uses the first Labelled document found for this user; the [0] raises
    # IndexError if the user has no labelled data.
    data = list(Labelled.find({'userid': u['userid']}))[0]['data']
    # NOTE(review): `events` is not read again in the visible part of this notebook -
    # confirm whether a later cell needs it before removing this call.
    events = event_stream(data)
    ua = UserActivity.UserActivity(data, to_include, t_thresh, session_thresh)
    # presumably a_stream is a sequence of (event, on) pairs - that is how it is
    # unpacked by logical_session(); verify against UserActivity.get_user_path
    a_path, a_stream = ua.get_user_path()
    
    # filter out events where user was not on the tab
    # (keeps the items whose second element equals True; itertools.ifilter is Python 2 only)
    user_stream = list(itertools.ifilter(lambda x: x[1] == True, a_stream))
    UA.append((userid, user_stream))

## Task content categorisation w.r.t ODP

## Tasks with subtasks

In [202]:
def labelled_tasks():
    """Return the set of task ids that were used to label events.

    Based on the logical sessions of each user stream in UA, i.e.
    - the user should be "on" the page (UA is already filtered that way), and
    - a logical session of a task should last more than 0 seconds.
    The task id of a session is taken from its first event. The string 'None'
    and the ids listed in to_exclude are left out.
    """
    skipped = set(['None'] + to_exclude)
    found = set()
    for _userid, stream in UA:
        for session in logical_session(stream):
            taskid = session[0]['taskid']
            if taskid not in skipped:
                found.add(taskid)
    return found

def task_hierarchy(labelled_tasks):
    """Find the labelled tasks that are registered with subtasks.

    labelled_tasks: iterable of task id strings (each is converted with ObjectId).
    Returns a dict mapping the 'task' name of each such task to the list of
    'task' names of the UserTasks documents whose 'parent_task' equals its id.
    Ids with no matching UserTasks document, and tasks without subtasks, are
    left out.
    """
    hierarchy = {}
    for taskid in labelled_tasks:
        parents = list(UserTasks.find({'_id': ObjectId(taskid)}))
        if not parents:
            continue
        child_names = [child['task'] for child in UserTasks.find({'parent_task': taskid})]
        if child_names:
            hierarchy[parents[0]['task']] = child_names
    return hierarchy


# Report how many top-level tasks are registered vs. actually used for labelling.
print "Number of tasks registered (top level): ", len(all_registered_tasks())
# LT: set of task ids returned by labelled_tasks()
LT = labelled_tasks()
print "Number of tasks labelled (top level): ", len(LT)
# Among labelled tasks, those that have subtasks
task_hier = task_hierarchy(LT)
print "Number of labelled tasks that are registered with subtasks:", len(task_hier)
# One line per parent task: its name followed by the list of its subtask names
for task in task_hier:
    print task, task_hier[task]
    
# TODO: also report the number of subtasks that were used to label events
#print "Number of subtasks that were used to label events: "

Number of tasks registered (top level):  305
Number of tasks labelled (top level):  289
Number of labelled tasks that are registered with subtasks: 17
Finding job [u'Checking Email']
Accoutability Hack [u'parse XML', u'prepare back-end']
plan vacation trip christmas [u'find out how to go from denmark to norway', u'find accommodation in oslo']
find brownie recipe [u'needs to use only ingredients I already have']
Check readings for Construction [u'download readings']
Prepare for Black Friday sales [u'Buy Stand Mixer', u'Buy Shaver']
Plan winter trip [u'Browse Train tickets']
Tasklog user study [u'UI/logger coding', u'Analysis', u'Recruiting users', u'study design']
Maritime arb readings [u'Ask vivien about printer, need to pay?']
Parents visa application [u'Translation', u'Application files', u'Visa info']
Finish logging hours for isd pay. [u'March ', u'September']
create trailer for software platform [u'find rights-free music', u'check current applications', u'adobe premiere troubleshoo

## Example tasks at different cognitive levels

- try to classify them into categories, e.g. the 7 categories from X

In [203]:
postQ = get_postQ()
T = task_names()
annotated_tasks = [t[0] for t in postQ]
#postQ.sort(key=lambda x: x[1]['CL'])
#for k, g in itertools.groupby(postQ, lambda x: x[1]['CL']):
#    g = list(g)
#    print k
#    for t in g:
#        print '\t %s \t%s'%(t[0], T[t[0]])

out = []
outfile = 'task_query_for_classify.txt'
taskids = []
for userid, ua in UA:
    task_events = [e for e, on in ua if e['taskid'] in annotated_tasks]
    task_events.sort(key=lambda x: x['taskid'])
    for k, g in itertools.groupby(task_events, lambda x: x['taskid']):
        g = list(g)
        queries = [e['details']['query'] for e in g if e['event'] in query_events]
        out.append('\t'.join([userid, k, T[k], '; '.join(queries)]))  
        taskids.append(k)
        
print "Logs that were annotated but not recorded in the log: "
for t in set(annotated_tasks) - set(taskids):
    print t, T[t]
    
o = open(outfile, 'w')
o.write('\n'.join(out))
o.close()


Logs that were annotated but not recorded in the log: 
562412b8ed9cac35200a8bb4 port ad feedback feature to java
562a5c550973e424691dec03 testing tasklog interface
563bc225e7557e4047734e51 learn about lambdarank
56571ce6e685c622118a5a5d apply for new born visa
563bc157e7557e4047734e47 relevance rating interface for logger


In [204]:
# Map the tasks to some higher level representation
def load_taskcontent(inputfile='task_content.txt'):
    """Load the task content annotations from a tab-separated file.

    inputfile: path of the file to read (default 'task_content.txt'). Each
        non-blank line must hold exactly four tab-separated fields after
        surrounding whitespace is stripped: cl, task name, task mask and task
        annotation. (cl is presumably the task's cognitive level - confirm
        against the annotation file.)

    Returns a list of (cl, taskname, taskmask, taskanno) tuples in file order,
    with taskanno lower-cased. Blank lines are skipped.
    Raises ValueError if a non-blank line does not split into four fields.
    """
    taskmap = []
    # The with-block closes the file even if a malformed line raises
    with open(inputfile) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            cl, taskname, taskmask, taskanno = line.split('\t')
            taskmap.append((cl, taskname, taskmask, taskanno.lower()))
    return taskmap

In [229]:
# Group tasks after masking
def group_task_by_mask(taskmap):
    """Group task tuples by their fourth field (index 3).

    taskmap: iterable of tuples as produced by load_taskcontent().
    Returns a list of (key, members, count) triples ordered by key, where
    members is the list of tuples sharing that fourth field and count is
    len(members). Members keep their relative input order.

    The input is not modified: a sorted copy is grouped rather than sorting
    the caller's list in place.
    """
    ordered = sorted(taskmap, key=lambda x: x[3])
    G = []
    for k, g in itertools.groupby(ordered, lambda x: x[3]):
        members = list(g)
        G.append((k, members, len(members)))
    return G

def get_topX_groups(taskmap, X, noprint=False):
    G = group_task_by_mask(taskmap)
    G.sort(key=lambda x: x[2], reverse=True)
    i = 0
    topX = 0
    topX_groups = []
    for g in G:
        i += 1
        topX += g[2]
        topX_groups.append((g[0], g[2]))
        if not noprint:
            print i, g[0], g[2], float(g[2])/len(taskmap)
            print [x[1] for x in g[1]]
            print
        if i >= X:
            break
    if not noprint:
        print
        print "Top", X, "task content sum up to"
        print topX, float(topX)/len(taskmap)
        print 'Other', 135 - topX, float(135-topX)/len(taskmap)
    return dict(topX_groups)

# Load the annotated task content and show the 8 largest task groups.
# The call is the cell's last expression, so its returned dict is displayed too.
taskmap = load_taskcontent()
get_topX_groups(taskmap, 8)

 1 shopping 18 0.133333333333
['Baby products', 'Shopping', 'Looking for a new bag', 'sort out christmas decoration lights', 'Husbands present', 'Buy a dress for dec visit', 'Buying contact lenses', 'Search for a Internet Plan', 'food shopping', 'shopping', 'shoes', 'shop for speakers', 'Prepare for Black Friday sales', 'Shopping', 'hair and skin-amazon', 'Check out graze-free box', 'Amazon-Heater', 'Hair things']

2 writing report/paper/presentation 11 0.0814814814815
['Complete Econometrics Tutorial', 'Complete IEP Presentation Slide', 'SIGIR mobile apps paper', 'finish writing related work', 'background chapter of report', 'writing academic article', 'Tort Essay', 'EU Law Eassy Research', 'Tort Law essay research', 'Paper writing', 'compile BX paper']

3 travel planning/booking 10 0.0740740740741
['Flight home (Christmas)', 'Book trip to Berlin', 'Plan trip for next reading week', 'Arrange some journeys afterwards', 'book tickets for wsdm', 'Plan for reading week', 'Weekend Travel',

{'do/manage project': 6,
 'job hunting': 7,
 'programming': 5,
 'research': 6,
 'shopping': 18,
 'travel planning/booking': 10,
 'watch x online': 5,
 'writing report/paper/presentation': 11}

In [233]:
# Examples of tasks at each level
# (taskmap is reloaded here, so this cell does not depend on the previous cell's copy)
taskmap = load_taskcontent()
# Get top X task groups
topX = 8
topx_G = get_topX_groups(taskmap, topX, noprint=True)
# groupby only merges adjacent items, so sort by the level field (index 0) first
taskmap.sort(key=lambda x: x[0])
print topx_G
for k, g in itertools.groupby(taskmap, lambda x: x[0]):
    g = list(g)
    print "======="
    print k, len(g)
    print "======="
    # Within this level, group tasks by their fourth field, largest group first
    LG = group_task_by_mask(g)
    LG.sort(key=lambda x: x[2], reverse=True)
    other = []
    for gg in LG:
        if gg[0] in topx_G:
            # Printed: group key, size within this level, that size as a fraction of the
            # group's overall size, and the task names
            print gg[0], gg[2], '%.2f'%(gg[2]/float(topx_G[gg[0]])), [x[1] for x in gg[1]]
        else:
            # Tasks whose group is not among the top X are pooled under "other"
            other += [x[1] for x in gg[1]]
    print "other:", len(other), other
    print

{'travel planning/booking': 10, 'shopping': 18, 'writing report/paper/presentation': 11, 'programming': 5, 'research': 6, 'do/manage project': 6, 'job hunting': 7, 'watch x online': 5}
analyse 18
programming 3 0.60 ['port ad feedback feature to java', 'complete ad feedback data join', 'Calculate features in client side logger']
shopping 3 0.17 ['Baby products', 'Shopping', 'Looking for a new bag']
watch x online 3 0.60 ['binge watch London Spy', 'Binge watch outlander', 'watch Dr Who']
do/manage project 1 0.17 ['Algorithms proj']
job hunting 1 0.14 ['Internship applications']
research 1 0.17 ['General research for study']
travel planning/booking 1 0.10 ['Flight home (Christmas)']
other: 5 ['learn about lambdarank', 'learn about good slides design in academic talks', 'Recruitment', 'Solving technical issues', 'Eu tutorial']

apply 28
research 3 0.50 ['Research', 'research', 'Research']
do/manage project 2 0.33 ['GIS project', 'Current software project']
shopping 2 0.11 ['sort out christ