## Setup

In [248]:
from pymongo import MongoClient
import numpy as np
import itertools

# Connect to MongoDB using the driver's default connection settings.
client = MongoClient()
# Database that holds the cleaned task-log data.
db = client['db_tasklog_clean']
# Collection handles shared by the cells below:
# one for the user records, one for the Chrome log events.
User = db['user']
Log = db['log_chrome']

## Descriptive stats of user demographics:

In [249]:
# Gender groups
print 'Gender groups:'
for d in User.aggregate([{'$project': {'info.gender': 1}},
        {'$group': {'_id': '$info.gender', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print
    
# Age groups
print 'Age groups:'
for d in User.aggregate([{'$project': {'info.age': 1}}, 
    {'$group': {'_id': '$info.age', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print

Gender groups:
female 6
male 10

Age groups:
18_24 7
25_34 9



In [250]:
allUsers = list(User.find({}))

# Experience computer
exp_comp = [int(a['info']['exp_comp']) for a in allUsers]
iqr = np.percentile(exp_comp, 75) - np.percentile(exp_comp, 25)
print 'User computer technology experience:', 'Median =', np.median(exp_comp), 'IQR =', iqr

# Experience Search engine
exp_se = [int(a['info']['exp_se']) for a in allUsers]
iqr = np.percentile(exp_se, 75) - np.percentile(exp_se, 25)
print 'User Search engine experience:', 'Median =', np.median(exp_se), 'IQR =', iqr

# Frequency of work related usage of computer
comp_work = [int(a['info']['comp_work']) for a in allUsers]
iqr = np.percentile(comp_work, 75) - np.percentile(comp_work, 25)
print 'Frequency of using the experiment laptop for work:', 'Median =', np.median(comp_work), 'IQR =', iqr


User computer technology experience: Median = 5.0 IQR = 1.0
User Search engine experience: Median = 5.0 IQR = 1.0
Frequency of using the experiment laptop for work: Median = 3.5 IQR = 1.25


In [251]:
total_users = float(len(list(allUsers)))

# Frequency of using the experiment laptop
comp_freq_labels = {'1': '<25%', '2':'25%-50%', '3': '50%-75%', '4': '75%-100%', '5': '100%'}
comp_freq = User.aggregate([{'$project': {'info.freq_comp': 1}}, 
                           {'$group': {'_id': '$info.freq_comp', 'count': {'$sum': 1}}}, 
                           {'$sort': {'_id': 1}}])

print 'Frequency of using the experiment laptop:'
for f in comp_freq:
    print comp_freq_labels[f['_id']], ':', f['count'], '(','%.2f'%(float(f['count'])/total_users), ')'
print

# Search device
search_device_label = {'1': 'always on computer', '2': 'mostly on computer', 
                      '3': 'half-half', '4': 'mostly on mobile', '5': 'always on mobile'}
search_device = User.aggregate([{'$project': {'info.search_device': 1}}, 
                                {'$group': {'_id': '$info.search_device', 'count': {'$sum': 1}}},
                                {'$sort': {'_id': 1}}
                               ])
print 'User search device preferences:'
for s in search_device:
    if s['_id'] != None:   # Need to update Marc's registration form
        print search_device_label[s['_id']], ':', s['count'], '(', '%.2f'%(float(s['count'])/total_users), ')'
 

Frequency of using the experiment laptop:
<25% : 1 ( 0.06 )
25%-50% : 2 ( 0.12 )
75%-100% : 4 ( 0.25 )
100% : 9 ( 0.56 )

User search device preferences:
always on computer : 2 ( 0.12 )
mostly on computer : 12 ( 0.75 )
half-half : 2 ( 0.12 )


In [252]:
#Total number of users
count_tot = float(len(list(allUsers)))

# Top types of information searched on Mobile
computer_search_for = [a['info']['computer_search_for'] for a in allUsers]
computer_search_for = [s for sublist in computer_search_for for s in sublist]
if a['info']['computer_search_for_other'] != '':
    computer_search_for.append(a['info']['computer_search_for_other'])
computer_search_for.sort()

tot = []
for k, g in itertools.groupby(computer_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print 'Top searched informaiton types with computers: '
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
print
    
# Top types of information searched on Computer
mobile_search_for = [a['info']['mobile_search_for'] for a in allUsers]
mobile_search_for = [s for sublist in mobile_search_for for s in sublist]
if a['info']['mobile_search_for_other'] != '':
    mobile_search_for.append(a['info']['mobile_search_for_other'])
mobile_search_for.sort()

tot = []
for k, g in itertools.groupby(mobile_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print "Top searched information types with mobiles: "
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
    


Top searched informaiton types with computers: 
entertainment 12 0.75
social 11 0.69
topic 11 0.69
fact 9 0.56
news 9 0.56
shopping 7 0.44
job 6 0.38
task 6 0.38
routes 5 0.31
travel 5 0.31
house 2 0.12
person 2 0.12

Top searched information types with mobiles: 
news 13 0.81
routes 12 0.75
social 12 0.75
entertainment 9 0.56
fact 9 0.56
topic 7 0.44
task 6 0.38
person 4 0.25
shopping 3 0.19
travel 1 0.06


## Descriptive stats of the log

In [253]:
# Total number of queries annotated
queries = Log.aggregate([{'$match': {'to_annotate': True, 'event': 'tab-search', 'removed': False}},
               {'$project': {
                            'annotation.task': {'$ifNull': ['$annotation.task', 0]}}},
                ])
Q = list(queries)
count_tot = len(list(Q))
count_anno = count_tot - sum([int(q['annotation']['task'] == 0) for q in Q])
print 'Total number of annotated queries:', count_anno

# Total number of viewed pages annotated
pages = Log.aggregate([{'$match': {'to_annotate': True, 'event': 'tab-loaded', 'removed': False}},
               {'$project': {
                            'annotation.task': {'$ifNull': ['$annotation.task', 0]}}},
                ])
Q = list(pages)
count_tot = len(list(Q))
count_anno = count_tot - sum([int(q['annotation']['task'] == 0) for q in Q])
print 'Total number of annotated pages:', count_anno

# Total number of tasks annotated
tasks = Log.aggregate([{'$match': {'to_annotate': True, 'removed': False}}, 
                      {'$project': {'annotation.task': {'$ifNull': ['$annotation.task', {}]}}}])
T = [t['annotation']['task'].get('taskid', '000') for t in list(tasks)]
# Excluding pre-defined tasks, only include user defined ones
user_tasks = set(T) - set(['000', '001', '002', '003', '004'])
print 'Number of user defined tasks: ', len(user_tasks)

# Total number of tasks being analyzed in postQ
chosen_tasks = []
for u in allUsers:
    postQ = u.get('postQ', {})
    tasklist = postQ.get('tasklist', [])
    for t in tasklist:
        if t['chosen'] == True:
            chosen_tasks.append(t)
print 'Number of tasks analysed in postQ:', len(chosen_tasks)



Total number of annotated queries: 2565
Total number of annotated pages: 21193
Number of user defined tasks:  167
Number of tasks analysed in postQ: 79


In [254]:
num_days = []
num_pages = []
num_queries = []
num_tasks = []
for u in allUsers:
    items = Log.aggregate([{'$match': {'userid': u['userid'], 'to_annotate': True}}, 
                  {'$project': {
                    'event': 1,
                    'annotation.task': {'$ifNull': ['$annotation.task', 0]}, 
                    'year': {'$year': '$timestamp_bson'},
                    'day': {'$dayOfYear': '$timestamp_bson'},
                    }},
                  ])
    items = list(items)
    # Number of days covered per person (max, min, median)
    days = len(set([(x['year'], x['day']) for x in items]))
    num_days.append(days)
    
    # Number of queries per person (max, min, median)
    queries = [int(x['event'] == 'tab-search') for x in items]
    num_queries.append(sum(queries)/float(days))
    
    # Number of viewed pages per person (max, min, median)
    views = [int(x['event'] == 'tab-loaded')/float(days) for x in items]
    num_pages.append(sum(views))

    
    # Number of tasks per person (max, min, median)
    tasks = [x['annotation']['task'] for x in items]
    T = []
    for t in tasks:
        if t == 0:
            continue
        T.append(t['taskid'])
    num_tasks.append(len(set(T)))
    
print "Number of days covered p.p.: ", 'min:', min(num_days), 'max:', max(num_days), 'median:', np.median(num_days)
print "Number of queries issued p.d.: ", 'min:', '%.1f'%min(num_queries), 'max:', '%.1f'%max(num_queries), 'median:', '%.1f'%np.median(num_queries)
print "Number of pages viewed p.d.: ", 'min:', '%.1f'%min(num_pages), 'max:', '%.1f'%max(num_pages), 'median:', '%.1f'%np.median(num_pages)
print "Number of tasks performed p.p.", 'min:', min(num_tasks), 'max:', max(num_tasks), 'median:', np.median(num_tasks)
   
    

Number of days covered p.p.:  min: 2 max: 10 median: 6.0
Number of queries issued p.d.:  min: 3.6 max: 77.8 median: 17.9
Number of pages viewed p.d.:  min: 21.7 max: 684.7 median: 190.1
Number of tasks performed p.p. min: 4 max: 27 median: 12.0
