# Exploring ways to propagate labels from anchor actions to all activities
## Results -- strategy for label propagation:
* Consecutive events sharing the same tab-url are assumed to belong to the same task.
* Exception: Multiple anchor events with different labels may be included in the same segment.

## Exception handling: 
* Loglikelihood testing for consecutive events: p(i, j)/p(i)p(j) ~ Chi square p-value < 0.5
* non-exchangable: p(i|j) > p(j|i)

In [89]:
%matplotlib inline
from pymongo import MongoClient
import pymongo
import numpy as np
import itertools
import matplotlib.pylab as pylab

# DB connections
client = MongoClient()
db = client.db_tasklog_clean
# Collections
User = db.user
Log = db.log_chrome

# users
allUsers = list(User.find({}))

## Log segmentation and annotation

### Setup

In [90]:
userLogs = {}
for u in allUsers:
    user_log = Log.aggregate([
        {'$match': {'userid': u['userid'], 'removed': False}},
        {'$project': {'affected_tab_id': 1, 
                     'event': 1, 
                     'url': 1,
                     'timestamp_bson': 1, 
                     'taskname': {'$ifNull': ['$annotation.task.name', 'NA']},
                     'taskid': {'$ifNull': ['$annotation.task.taskid', 'NA']},
                     'title': {'$ifNull': ['$details.current_tab.title', 'NA']},
                     }}, 
        {'$sort':{'timestamp_bson': pymongo.ASCENDING}},
        ])
    userLogs[u['userid']] = list(user_log)
        

### Group log by tab and urls:

* Assumption: consecutive same (tab, url) belong to the same task

#### Check: 
* Time distribution within a tab-url segment, see if there are exceptionally long pauses

#### Check result: 
* < 3% consecutive tabs with same URLs have a time gap > 1 min
* < 1% consecutive tabs with same URLs have a time gap > 5 mins
* Most of the consecutive tab with same URLs are due to the chrome event fire mechanism, e.g., click a link -> navigation_link -> tab_loaded. 
* We can assume that if the gap is less than 5 min, then they belong to the same task. It is not superising that a user spend 5 mins on the same page not doing anything. 

In [91]:
# Group by tab
tabs = []
for u in userLogs:
    group = []
    current_tab = 0
    current_url = '$'
    for l in userLogs[u]:
        # If there is different tab_id or url
        if not (current_tab == l['affected_tab_id'] or current_url == l['url']):
            if not current_tab == 0:
                tabs.append(group)
                group = []
        current_tab = l['affected_tab_id']
        current_url = l['url']
        l['userid'] = u
        group.append(l)
    tabs.append(group)

# Check time gap distribution
t_gap = []
len_groups = []
for tab in tabs:
    if len(tab) > 1:
        len_groups.append(len(tab))
        for i in range(len(tab)-1):
            j = i + 1
            gap_min = (tab[j]['timestamp_bson'] - tab[i]['timestamp_bson']).total_seconds()/60
            t_gap.append(gap_min)
#    if len(tab)> 100:
#        for t in tab:
#            print t['userid'], t['event'], t['affected_tab_id'], len(tab)
            
x_1 = list(itertools.ifilter(lambda x: x>1, t_gap))
x_5 = list(itertools.ifilter(lambda x: x>5, t_gap))
x_10 = list(itertools.ifilter(lambda x: x>10, t_gap))
x_30 = list(itertools.ifilter(lambda x: x>30, t_gap))
tot = float(len(t_gap))

print 'Distribution of time gap'
print len(x_1)/tot, len(x_5)/tot, len(x_10)/tot, len(x_30)/tot
mean = np.mean(t_gap)
std = np.std(t_gap)
print 'Mean:', mean, 'Std:', std

print

#pylab.hist(x_1, 100)
print 'Group length > 1:', len(len_groups)/float(len(tabs))
x_20 = list(itertools.ifilter(lambda x: x>20, len_groups))
print 'Group length > 20:', len(x_20)/float(len(tabs))
x_50 = list(itertools.ifilter(lambda x: x>50, len_groups))
print 'Group length > 50:', len(x_50)/float(len(tabs))
mean = np.mean([len(t) for t in tabs])
std = np.std([len(t) for t in tabs])
print 'group length: ', 'Mean:', mean, 'Std:', std
#pylab.hist(len_groups, 20)


Distribution of time gap
0.0248382693335 0.00520802421509 0.00289334678616 0.0012908777969
Mean: 0.440881743476 Std: 14.4789145226

Group length > 1: 0.465919160248
Group length > 20: 0.0187103810238
Group length > 50: 0.00374889237271
group length:  Mean: 3.29691227592 Std: 7.12319726981


### Problem of tab-url groups:
It is possible that consecutive events within a tab-url group are assigned to different task labels by users. 
It is hard to know where the boundary is. Most likely a group of events as the result of chrome event firing 
sequence should be grouped together. 

What we need is to check what are the common combination of events with tab-loaded and tab-search
The reason there are two different tasks is because there are two tab-loaded/tab-search events within the same group.



In [92]:
# Bigrams of tab-search/tab-loaded
keys = ['tab-loaded', 'tab-search']
pairs = []
singles = []
for group in tabs:
    prev = group[0]
    singles.append(prev['event'])
    for l in group[1:]:
        if (prev['event'] in keys) or (l['event'] in keys):
            time_gap = (l['timestamp_bson'] - prev['timestamp_bson']).total_seconds()
            pairs.append((prev['event'], l['event'], time_gap))
        singles.append(l['event'])
        prev = l

# Indications of non-random combination:
# - p(a_i, a_j) > p(a_i)p(a_j)
# - p(a_i|a_j) > p(a_j|a_i): not switchable due to chrome event firing sequence
            
# count the singles
singles.sort()
single_count = []
for k, g in itertools.groupby(singles):
    single_count.append((k, len(list(g))))
single_count = dict(single_count)

# compare to joint probability and switched conditional probability
pairs.sort(key=lambda x: (x[0], x[1]))
tot_single = float(len(singles))
stats = []
for k, g in itertools.groupby(pairs, lambda x: (x[0], x[1])):
    # Likelihood ratio
    g = list(g)
    p_pair = len(g)/float(len(pairs))
    p_single = single_count.get(k[0])/tot_single * single_count.get(k[1])/tot_single
    lr_joint = p_pair/p_single
#    time_gap = [x[2] for x in g]
    p_ij = len(g)/float(single_count[k[0]])
    p_ji = 0
    if k[0] in single_count:
        p_ji = len(g)/float(single_count[k[1]])
    if lr_joint > 1 and (p_ij > p_ji):
        stats.append((k, lr_joint, p_ij, p_ji))
        
stats.sort(key=lambda x: x[1], reverse=True)
for s in stats:
    print s



((u'tab-replace-switch', u'tab-search'), 53.901885248053304, 0.7805953693495039, 0.233201581027668)
((u'tab-open-in-new', u'tab-search'), 9.121481170165625, 0.13209530483531884, 0.12417654808959157)
((u'navigation-omni_query-forward_back', u'tab-loaded'), 6.940292510127106, 0.9942857142857143, 0.005793434108010921)
((u'navigation-form_submit', u'tab-loaded'), 6.618495912952857, 0.9481842338352524, 0.07128587600719184)
((u'navigation-link-forward_back', u'tab-loaded'), 6.5138310661276435, 0.9331896551724138, 0.028833988146766996)
((u'navigation-keyword', u'tab-loaded'), 6.204603776614272, 0.8888888888888888, 0.00026636478657521474)
((u'navigation-manual_subframe', u'tab-loaded'), 5.505590542649707, 0.788746298124383, 0.026603183059199575)
((u'navigation-link', u'tab-loaded'), 5.437433555959008, 0.7789819376026272, 0.23693147765865352)
((u'tab-search', u'navigation-link'), 4.958762088371499, 0.2160737812911726, 0.07181171319102353)
((u'navigation-reload', u'tab-loaded'), 4.93033774187775

### 2. Check transition patterns between events

#### Level-0: Pattern forms because of the way chrome API sending events


In [93]:
# Find sequences of events commonly appear in one tab-url group
pairs = []
singles = []
for group in tabs:
    prev = group[0]
    singles.append(prev['event'])
    for l in group[1:]:
        time_gap = (l['timestamp_bson'] - prev['timestamp_bson']).total_seconds()
        pairs.append((prev['event'], l['event'], time_gap))
        prev = l
        singles.append(l['event'])
      
# count the singles
singles.sort()
single_count = []
for k, g in itertools.groupby(singles):
    single_count.append((k, len(list(g))))
single_count = dict(single_count)

pairs.sort(key=lambda x: (x[0], x[1]))
tot_single = float(len(singles))
stats = []
for k, g in itertools.groupby(pairs, lambda x: (x[0], x[1])):
    # Likelihood ratio
    g = list(g)
    p_pair = len(g)/float(len(pairs))
    p_single = single_count.get(k[0])/tot_single * single_count.get(k[1])/tot_single
    lr = p_pair/p_single
    time_gap = [x[2] for x in g]
    stats.append((k, len(g), lr, np.mean(time_gap), np.std(time_gap)))
        
stats.sort(key=lambda x: x[2], reverse=True)
#for s in stats:
#    print s

In [94]:
# Check if a sequence of event is likely to be together by chance, in all data, not limited to tab-url groups
pairs = []
singles = []
for u in userLogs:
    prev = userLogs[u][0]
    singles.append(prev['event'])
    for l in userLogs[u][1:]:
        #also get time gap
        time_gap = (l['timestamp_bson'] - prev['timestamp_bson']).total_seconds()
        pairs.append((prev['event'], l['event'], time_gap))
        singles.append(l['event'])
        prev = l
        
# count the singles
singles.sort()
single_count = []
for k, g in itertools.groupby(singles):
    single_count.append((k, len(list(g))))
single_count = dict(single_count)

pairs.sort(key=lambda x: (x[0], x[1]))
tot_single = float(len(singles))
stats = []
for k, g in itertools.groupby(pairs, lambda x: (x[0], x[1])):
    # Likelihood ratio
    g = list(g)
    p_pair = len(g)/float(len(pairs))
    p_single = single_count.get(k[0])/tot_single * single_count.get(k[1])/tot_single
    lr = p_pair/p_single
    time_gap = [x[2] for x in g]
    stats.append((k, len(g), lr, np.mean(time_gap), np.std(time_gap)))
        
stats.sort(key=lambda x: x[2], reverse=True)
#for s in stats:
#    print s


There are two types of events:
* Active events: triggered by user doing something
* Passive events: triggered by the active events

Active events: 
* form_submit
* link_click
* navigation-link-forward_back: triggered by user clicking forward/backward
* navigation-omni_query: triggered by user typing in query in omnibox
* navigation-omni_query-forward_back: triggered by user clicking forward/backward to a SERP
* navigation-omni_url: triggered by user typing in a url in omnibox
* navigation-omni_url-forward_back: triggered by user clicking forward/backward 
* tab-close: triggered by user closing a tab
* tab-new: triggered by user open a new tab
* tab-open-in-new: triggered by user open a page in new tab
* tab-switch: triggered by user switching to a differen tab

Passive events:
* navigation-link: triggered by link_click, or form_submit, or other interactions with links
* navigation-manual_subframe: not sure how it is triggered
* navigation-reload: this can be triggered by user or automatic reloading
* navigation-start_page: not sure how it's triggered
* tab-close-switch: this is triggered by tab-clost
* tab-loaded: this can be triggered by any events that loading a page
* tab-new-switch: triggered by tab-new
* tab-replace: this is something special with Google pre-loading some SERP
* tab-replace-switch: this is something special with Google pre-loading some SERP
* tab-search: triggered by form_submit or omni-query

In [95]:
# See the time distribution between events to determine:
# - frequent occuring segments of events
# - time gap that are exceptionally long for an indication of split
for u in allUsers:
    user_log = Log.aggregate([{'$match': {'userid': u['userid'], 'removed': False}},
                              {'$project': {'event': 1, 'timestamp_bson': 1}},
                              {'$sort':{'timestamp_bson': pymongo.ASCENDING}},
                             ])
    user_log = list(user_log)
    tuples = []
    # Get the transition types
    for i in range(len(user_log)-1):
        j = i + 1
        # Time gap in terms of milliseconds
        time_gap = (user_log[j]['timestamp_bson'] - user_log[i]['timestamp_bson']).total_seconds()*1000
        tuples.append(((user_log[i]['event'], user_log[j]['event']), time_gap))
        
    # Aggregate the tuples
    tuples.sort(key=lambda x: x[0])
    top_tuples = []
    for k, g in itertools.groupby(tuples, lambda x: x[0]):
        time_gaps = [x[1] for x in list(g)]
        top_tuples.append((k, float(len(time_gaps))/len(tuples)))
        
    top_tuples.sort(key=lambda x: x[1], reverse=True)
    for t in top_tuples:
        print t

((u'tab-loaded', u'tab-loaded'), 0.24097331240188383)
((u'tab-loaded', u'link_click'), 0.14285714285714285)
((u'link_click', u'tab-loaded'), 0.09379905808477237)
((u'tab-switch', u'tab-switch'), 0.08634222919937205)
((u'navigation-link', u'tab-loaded'), 0.06789638932496075)
((u'link_click', u'navigation-link'), 0.05729984301412873)
((u'link_click', u'link_click'), 0.04277864992150707)
((u'tab-loaded', u'tab-switch'), 0.023547880690737835)
((u'tab-close', u'tab-close-switch'), 0.02315541601255887)
((u'tab-switch', u'link_click'), 0.018838304552590265)
((u'tab-new', u'tab-new-switch'), 0.012951334379905808)
((u'link_click', u'tab-open-in-new'), 0.01216640502354788)
((u'tab-open-in-new', u'navigation-link'), 0.01020408163265306)
((u'tab-switch', u'tab-loaded'), 0.00902668759811617)
((u'navigation-form_submit', u'tab-loaded'), 0.007849293563579277)
((u'navigation-link', u'link_click'), 0.007849293563579277)
((u'tab-loaded', u'tab-close'), 0.007849293563579277)
((u'tab-close-switch', u'tab-

### Heuristic rules for segmentations and task label assignment:
* R1: A time gap > 30 min makes a split (it doesn't matter if eventually two consecutive segments have the same task label)
* R2: Continuous operations on the same page can be merged unless they have different task labels

* R3: Events within the same segment are assigned to the same task label
* R4: events outside any labeled segments should be checked



In [96]:
###### Task sessions
for u in allUsers:
    userid = u['userid']
    user_log = Log.aggregate([
            {'$match': {'userid': userid, 'removed': False}},
            {'$project': {
                'userid': 1,
                'event': 1,
                'timestamp_bson': 1,
                'url': 1,
                'taskname': {'$ifNull': ['$annotation.task.name', 'NA']},
                'taskid': {'$ifNull': ['$annotation.task.taskid', 'NA']},
                'title': {'$ifNull': ['$details.current_tab.title', 'NA']}
                }},
            {'$sort':{'timestamp_bson': pymongo.ASCENDING}} 
            ])
    user_log = list(user_log)

    # Pass 1: split the data by time gap
    segments = []
    current_entry = user_log[0]
    s = []
    for l in user_log[1:]:
        # Time gap in minute, if gap > 30 mins, split
        gap = (l['timestamp_bson'] - current_entry['timestamp_bson']).total_seconds()/60
        # Same page check
        same_page = (l['url'] == current_entry['url'])
        if same_page == False or gap > 30:
            segments.append(s)
            s = []
        current_entry = l
        s.append(current_entry)
    segments.append(s)
   
    for s in segments:
        tasklabels = list(set([x['taskname'] for x in s]))
        if 'NA' in tasklabels:
            tasklabels.remove('NA')
        if len(tasklabels)>1:
            print tasklabels
            for x in s:
                print x['event'], x['taskname'], x['url']
        

[u'Not sure', u'Write SIGIR paper']
tab-search Write SIGIR paper https://www.google.co.uk/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=multiple%20lines%20inside%20braces%20late
navigation-omni_query-forward_back NA https://www.google.co.uk/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=multiple%20lines%20inside%20braces%20late
tab-loaded Not sure https://www.google.co.uk/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=multiple%20lines%20inside%20braces%20late
link_click NA https://www.google.co.uk/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=multiple%20lines%20inside%20braces%20late
[u'Not sure', u'Walthamstow Labour organising']
tab-search Not sure https://www.google.co.uk/url?sa=t&rct=j&q=&esrc=s&source=web&cd=3&ved=0CEIQygQwAmoVChMIsrPZwPTdyAIVhlgaCh0apgI6&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FDisability%23People-first_language&usg=AFQjCNHhrOutbQ-oHC8WcAFREqOSPkE3nQ&sig2=FkzNKEOjVahssmpfTNiN8A&bvm=bv.105841590,d.bGg&cad=rja
navigation-link NA htt