## Setup

In [1]:
import pymongo
from pymongo import MongoClient
import numpy as np
import itertools
import urllib

# Open a MongoDB client with the driver's default connection settings and
# select the task-log database.
client = MongoClient()
db = client['db_tasklog_clean']

# Collection handles used by the cells below.
User = db['user']
Log = db['log_chrome']
Labelled = db['data_labeled']

# Every document of the user_tasks collection, loaded into memory once.
UserTasks = list(db['user_tasks'].find({}))

## Descriptive stats of user demographics:

In [2]:
# Gender groups
print 'Gender groups:'
for d in User.aggregate([{'$project': {'info.gender': 1}},
        {'$group': {'_id': '$info.gender', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print
    
# Age groups
print 'Age groups:'
for d in User.aggregate([{'$project': {'info.age': 1}}, 
    {'$group': {'_id': '$info.age', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print

Gender groups:
female 10
male 13

Age groups:
18_24 11
25_34 12



In [3]:
allUsers = list(User.find({}))

# Experience computer
exp_comp = [int(a['info']['exp_comp']) for a in allUsers]
iqr = np.percentile(exp_comp, 75) - np.percentile(exp_comp, 25)
print 'User computer technology experience:', 'Median =', np.median(exp_comp), 'IQR =', iqr

# Experience Search engine
exp_se = [int(a['info']['exp_se']) for a in allUsers]
iqr = np.percentile(exp_se, 75) - np.percentile(exp_se, 25)
print 'User Search engine experience:', 'Median =', np.median(exp_se), 'IQR =', iqr

# Frequency of work related usage of computer
comp_work = [int(a['info']['comp_work']) for a in allUsers]
iqr = np.percentile(comp_work, 75) - np.percentile(comp_work, 25)
print 'Frequency of using the experiment laptop for work:', 'Median =', np.median(comp_work), 'IQR =', iqr


User computer technology experience: Median = 5.0 IQR = 1.0
User Search engine experience: Median = 5.0 IQR = 1.0
Frequency of using the experiment laptop for work: Median = 4.0 IQR = 1.0


In [4]:
total_users = float(len(list(allUsers)))

# Frequency of using the experiment laptop
comp_freq_labels = {'1': '<25%', '2':'25%-50%', '3': '50%-75%', '4': '75%-100%', '5': '100%'}
comp_freq = User.aggregate([{'$project': {'info.freq_comp': 1}}, 
                           {'$group': {'_id': '$info.freq_comp', 'count': {'$sum': 1}}}, 
                           {'$sort': {'_id': 1}}])

print 'Frequency of using the experiment laptop:'
for f in comp_freq:
    print comp_freq_labels[f['_id']], ':', f['count'], '(','%.2f'%(float(f['count'])/total_users), ')'
print

# Search device
search_device_label = {'1': 'always on computer', '2': 'mostly on computer', 
                      '3': 'half-half', '4': 'mostly on mobile', '5': 'always on mobile'}
search_device = User.aggregate([{'$project': {'info.search_device': 1}}, 
                                {'$group': {'_id': '$info.search_device', 'count': {'$sum': 1}}},
                                {'$sort': {'_id': 1}}
                               ])
print 'User search device preferences:'
for s in search_device:
    if s['_id'] != None:   # Need to update Marc's registration form
        print search_device_label[s['_id']], ':', s['count'], '(', '%.2f'%(float(s['count'])/total_users), ')'
 

Frequency of using the experiment laptop:
<25% : 1 ( 0.04 )
25%-50% : 2 ( 0.09 )
50%-75% : 1 ( 0.04 )
75%-100% : 7 ( 0.30 )
100% : 12 ( 0.52 )

User search device preferences:
always on computer : 3 ( 0.13 )
mostly on computer : 15 ( 0.65 )
half-half : 5 ( 0.22 )


In [5]:
#Total number of users
count_tot = float(len(list(allUsers)))

# Top types of information searched on Mobile
computer_search_for = [a['info']['computer_search_for'] for a in allUsers]
computer_search_for = [s for sublist in computer_search_for for s in sublist]
if a['info']['computer_search_for_other'] != '':
    computer_search_for.append(a['info']['computer_search_for_other'])
computer_search_for.sort()

tot = []
for k, g in itertools.groupby(computer_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print 'Top searched informaiton types with computers: '
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
print
    
# Top types of information searched on Computer
mobile_search_for = [a['info']['mobile_search_for'] for a in allUsers]
mobile_search_for = [s for sublist in mobile_search_for for s in sublist]
if a['info']['mobile_search_for_other'] != '':
    mobile_search_for.append(a['info']['mobile_search_for_other'])
mobile_search_for.sort()

tot = []
for k, g in itertools.groupby(mobile_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print "Top searched information types with mobiles: "
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
    


Top searched informaiton types with computers: 
entertainment 18 0.78
topic 18 0.78
social 17 0.74
fact 14 0.61
news 14 0.61
shopping 11 0.48
task 10 0.43
job 9 0.39
routes 7 0.30
travel 7 0.30
person 6 0.26
house 3 0.13

Top searched information types with mobiles: 
social 18 0.78
news 17 0.74
routes 15 0.65
entertainment 14 0.61
fact 14 0.61
topic 10 0.43
task 9 0.39
person 6 0.26
shopping 5 0.22
travel 2 0.09
job 1 0.04


## Descriptive stats of the log


### Prepare data

In [6]:
# Prepare data: flatten each user's labelled log into one list of events.
# Data_stream ends up with one (userid, events) pair per entry of allUsers.
Data_stream = []
for u in allUsers:
    # find_one returns the first matching document (or None) without
    # loading every match into a list first.
    D = Labelled.find_one({"userid": u['userid']})
    if D is None:
        raise LookupError('No labelled data found for user %s' % u['userid'])
    # Stored nesting: data -> tab_group -> url_group -> event
    D_stream = [e
                for d in D['data']
                for ug in d['tab_group']
                for e in ug['url_group']]
    Data_stream.append((u['userid'], D_stream))

## Number of tasks defined by users

In [7]:
main_tasks = list(itertools.ifilter(lambda x: x['task_level'] == 0, UserTasks))
main_tasks = [str(t['_id']) for t in main_tasks]
subtasks = list(itertools.ifilter(lambda x: x['task_level'] == 1, UserTasks))
parent_tasks = []
for s in subtasks:
    parent_tasks.append(s['parent_task'])
parent_tasks = list(set(parent_tasks))
print len(parent_tasks)

Filter = ['None', '000', '001', '002', '003', '004']
# Not all tasks have been used to annotate data
tasks = []
p_tasks = []
for u, data in Data_stream:
    for e in data:
        if e['taskid'] not in Filter:
            tasks.append(e['taskid'])
            if e['taskid'] in parent_tasks:
                p_tasks.append(e['taskid'])
                
print 'Number of user defined and annotated tasks:', len(set(tasks))
print 'Number of tasks having subtasks:', len(set(p_tasks))

20
Number of user defined and annotated tasks: 291
Number of tasks having subtasks: 17


## Number of tasks analyzed in postQ

In [8]:
# Total number of tasks being analyzed in postQ
chosen_tasks = []
for u in allUsers:
    postQ = u.get('postQ', {})
    tasklist = postQ.get('tasklist', [])
    for t in tasklist:
        if t['chosen'] == True:
            chosen_tasks.append(t)
print 'Number of tasks analysed in postQ:', len(chosen_tasks)

Number of tasks analysed in postQ: 135


## Number of days, queries issued and annotated, pages visited and annotated

In [9]:
# Number of days
days = []
for u, data in Data_stream:
    data.sort(key = lambda x: x['timestamp_bson'])
    # Find days that actually have data
    dates = [d['timestamp_bson'].date() for d in data]
    days.append(len(set(dates)))
print "Number of days in the experiment: ", 'min:', min(days), 'max:', max(days), 'mean:', np.median(days)

Number of days in the experiment:  min: 2 max: 11 mean: 6.0


In [10]:
# Number of queries
num_q = 0
# Number of queries annotated with user defined tasks
num_q_userdefine = 0
for u, data in Data_stream:
    q = itertools.ifilter(lambda x: x['event'] in ['tab-search-new', 'tab-search-verticle'] 
                          and not x['taskid'] == 'None', data)
    q1 = itertools.ifilter(lambda x: x['event'] in ['tab-search-new', 'tab-search-verticle'] 
                          and not x['taskid'] in ['None', '000', '001', '002', '003', '004'], data)
    num_q += len(list(q))
    num_q_userdefine += len(list(q1))
print 'Total number of queries annotated: ', num_q
print 'Total number of queries annotated with user defined tasks:', num_q_userdefine

Total number of queries annotated:  2626
Total number of queries annotated with user defined tasks: 1808


In [11]:
# Number of pages annotated - Does not consider how long the user actually stayed on the page, 
# only count loaded pages that have been annotated
# It also includes those propagated labeles
num_p = 0
num_p_userdefine = 0
for u, data in Data_stream:
    p = itertools.ifilter(lambda x: x['event'] == 'tab-loaded' and not x['taskid'] == 'None', data)
    num_p += len(list(p))
    p1 = itertools.ifilter(lambda x: x['event'] == 'tab-loaded' and not x['taskid'] in ['None', 
                                '000', '001', '002', '003', '004'], data)
    num_p_userdefine += len(list(p1))
print 'Total number of pages annotated:', num_p
print 'Total number of pages annotated with user defined tasks:', num_p_userdefine

Total number of pages annotated: 32902
Total number of pages annotated with user defined tasks: 17313


## Do not use the result of the following cell

In [12]:
X = Log.aggregate([{'$match': {'event': 'tab-search'}}, 
                         {'$project':{
                                'engine': '$details.engine',
                                'media': '$details.media',
                                'start': '$details.start',
                            }},
                         ])
X = list(X)
E = [x.get('engine', '') for x in X]
M = [x.get('media', '') for x in X]
S = [x.get('start', '') for x in X]
E.sort()
M.sort()
S.sort()
print 'Engines:'
for k, g in itertools.groupby(E):
    print k, len(list(g))
print
print 'Media:'
for k, g in itertools.groupby(M):
    print k, len(list(g))
print
print 'Page:'
for k, g in itertools.groupby(S):
    print k, len(list(g))


Engines:
 242
bing 27
google 3823
yahoo 3

Media:
 242
apps 1
flights 2
images 386
maps 21
news 40
shopping 1
videos 11
web 3391

Page:
0 3765
10 51
11 1
20 17
30 9
40 4
50 2
60 2
70 1
80 1
 242
