## Setup

In [164]:
from pymongo import MongoClient
import numpy as np
import itertools

# Connect to the MongoDB server using MongoClient's default arguments and
# select the cleaned task-log database.
client = MongoClient()
db = client['db_tasklog_clean']

# Collection handles used by the analysis cells below:
# `User` is the `user` collection, `Log` is the `log_chrome` collection.
User = db['user']
Log = db['log_chrome']

## Descriptive stats of user demographics:

In [165]:
# Gender groups
print 'Gender groups:'
for d in User.aggregate([{'$project': {'info.gender': 1}},
        {'$group': {'_id': '$info.gender', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print
    
# Age groups
print 'Age groups:'
for d in User.aggregate([{'$project': {'info.age': 1}}, 
    {'$group': {'_id': '$info.age', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print

Gender groups:
female 4
male 3

Age groups:
25_34 7



In [173]:
allUsers = list(User.find({}))

# Experience computer
exp_comp = [int(a['info']['exp_comp']) for a in allUsers]
iqr = np.percentile(exp_comp, 75) - np.percentile(exp_comp, 25)
print 'User computer technology experience:', 'Median =', np.median(exp_comp), 'IQR =', iqr

# Experience Search engine
exp_se = [int(a['info']['exp_se']) for a in allUsers]
iqr = np.percentile(exp_se, 75) - np.percentile(exp_se, 25)
print 'User Search engine experience:', 'Median =', np.median(exp_se), 'IQR =', iqr

# Frequency of work related usage of computer
comp_work = [int(a['info']['comp_work']) for a in allUsers]
iqr = np.percentile(comp_work, 75) - np.percentile(comp_work, 25)
print 'Frequency of using the experiment laptop for work:', 'Median =', np.median(comp_work), 'IQR =', iqr


User computer technology experience: Median = 5.0 IQR = 0.0
[5, 5, 4, 5, 5, 5, 5]
User Search engine experience: Median = 5.0 IQR = 0.0
Frequency of using the experiment laptop for work: Median = 4.0 IQR = 2.0


In [167]:
total_users = float(len(list(allUsers)))

# Frequency of using the experiment laptop
comp_freq_labels = {'1': '<25%', '2':'25%-50%', '3': '50%-75%', '4': '75%-100%', '5': '100%'}
comp_freq = User.aggregate([{'$project': {'info.freq_comp': 1}}, 
                           {'$group': {'_id': '$info.freq_comp', 'count': {'$sum': 1}}}, 
                           {'$sort': {'_id': 1}}])

print 'Frequency of using the experiment laptop:'
for f in comp_freq:
    print comp_freq_labels[f['_id']], ':', f['count'], '(','%.2f'%(float(f['count'])/total_users), ')'
print

# Search device
search_device_label = {'1': 'always on computer', '2': 'mostly on computer', 
                      '3': 'half-half', '4': 'mostly on mobile', '5': 'always on mobile'}
search_device = User.aggregate([{'$project': {'info.search_device': 1}}, 
                                {'$group': {'_id': '$info.search_device', 'count': {'$sum': 1}}},
                                {'$sort': {'_id': 1}}
                               ])
print 'User search device preferences:'
for s in search_device:
    if s['_id'] != None:   # Need to update Marc's registration form
        print search_device_label[s['_id']], ':', s['count'], '(', '%.2f'%(float(s['count'])/total_users), ')'
 

Frequency of using the experiment laptop:
<25% : 1 ( 0.14 )
25%-50% : 1 ( 0.14 )
75%-100% : 2 ( 0.29 )
100% : 3 ( 0.43 )

User search device preferences:
mostly on computer : 6 ( 0.86 )
half-half : 1 ( 0.14 )


In [168]:
#Total number of users
count_tot = float(len(list(allUsers)))

# Top types of information searched on Mobile
computer_search_for = [a['info']['computer_search_for'] for a in allUsers]
computer_search_for = [s for sublist in computer_search_for for s in sublist]
if a['info']['computer_search_for_other'] != '':
    computer_search_for.append(a['info']['computer_search_for_other'])
computer_search_for.sort()

tot = []
for k, g in itertools.groupby(computer_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print 'Top searched informaiton types with computers: '
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
print
    
# Top types of information searched on Computer
mobile_search_for = [a['info']['mobile_search_for'] for a in allUsers]
mobile_search_for = [s for sublist in mobile_search_for for s in sublist]
if a['info']['mobile_search_for_other'] != '':
    mobile_search_for.append(a['info']['mobile_search_for_other'])
mobile_search_for.sort()

tot = []
for k, g in itertools.groupby(mobile_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print "Top searched information types with mobiles: "
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
    


Top searched informaiton types with computers: 
social 5 0.71
topic 5 0.71
entertainment 4 0.57
fact 4 0.57
shopping 4 0.57
job 3 0.43
news 3 0.43
routes 3 0.43
house 2 0.29
task 2 0.29
travel 2 0.29

Top searched information types with mobiles: 
routes 7 1.00
fact 6 0.86
news 6 0.86
social 5 0.71
entertainment 4 0.57
task 3 0.43
person 1 0.14
topic 1 0.14


## Descriptive stats of the log

In [178]:
# Total number of queries annotated
queries = Log.aggregate([{'$match': {'to_annotate': True, 'event': 'tab-search', 'removed': False}},
               {'$project': {
                            'annotation.task': {'$ifNull': ['$annotation.task', 0]}}},
                ])
Q = list(queries)
count_tot = len(list(Q))
count_anno = count_tot - sum([int(q['annotation']['task'] == 0) for q in Q])
print 'Total number of annotated queries:', count_anno

# Total number of viewed pages annotated
pages = Log.aggregate([{'$match': {'to_annotate': True, 'event': 'tab-loaded', 'removed': False}},
               {'$project': {
                            'annotation.task': {'$ifNull': ['$annotation.task', 0]}}},
                ])
Q = list(pages)
count_tot = len(list(Q))
count_anno = count_tot - sum([int(q['annotation']['task'] == 0) for q in Q])
print 'Total number of annotated pages:', count_anno

# Total number of tasks annotated
tasks = Log.aggregate([{'$match': {'to_annotate': True, 'removed': False}}, 
                      {'$project': {'annotation.task': {'$ifNull': ['$annotation.task', {}]}}}])
T = [t['annotation']['task'].get('taskid', '000') for t in list(tasks)]
# Excluding pre-defined tasks, only include user defined ones
user_tasks = set(T) - set(['000', '001', '002', '003', '004'])
print 'Number of user defined tasks: ', len(user_tasks)

# Total number of tasks being analyzed in postQ
chosen_tasks = []
for u in allUsers:
    postQ = u.get('postQ', {})
    tasklist = postQ.get('tasklist', [])
    for t in tasklist:
        if t['chosen'] == True:
            chosen_tasks.append(t)
print 'Number of tasks analysed in postQ:', len(chosen_tasks)



Total number of annotated queries: 787
Total number of annotated pages: 4591
Number of user defined tasks:  68
Number of tasks analysed in postQ: 20


In [205]:
num_days = []
num_pages = []
num_queries = []
num_tasks = []
for u in allUsers:
    items = Log.aggregate([{'$match': {'userid': u['userid'], 'to_annotate': True}}, 
                  {'$project': {
                    'event': 1,
                    'annotation.task': {'$ifNull': ['$annotation.task', 0]}, 
                    'year': {'$year': '$timestamp_bson'},
                    'day': {'$dayOfYear': '$timestamp_bson'},
                    }},
                  ])
    items = list(items)
    # Number of days covered per person (max, min, median)
    days = len(set([(x['year'], x['day']) for x in items]))
    num_days.append(days)
    
    # Number of queries per person (max, min, median)
    queries = [int(x['event'] == 'tab-search') for x in items]
    num_queries.append(sum(queries)/float(days))
    
    # Number of viewed pages per person (max, min, median)
    views = [int(x['event'] == 'tab-loaded')/float(days) for x in items]
    num_pages.append(sum(views))

    
    # Number of tasks per person (max, min, median)
    tasks = [x['annotation']['task'] for x in items]
    T = []
    for t in tasks:
        if t == 0:
            continue
        T.append(t['taskid'])
    num_tasks.append(len(set(T)))
    
print "Number of days covered p.p.: ", 'min:', min(num_days), 'max:', max(num_days), 'median:', np.median(num_days)
print "Number of queries issued p.d.: ", 'min:', '%.1f'%min(num_queries), 'max:', '%.1f'%max(num_queries), 'median:', '%.1f'%np.median(num_queries)
print "Number of pages viewed p.d.: ", 'min:', '%.1f'%min(num_pages), 'max:', '%.1f'%max(num_pages), 'median:', '%.1f'%np.median(num_pages)
print "Number of tasks performed p.p.", 'min:', min(num_tasks), 'max:', max(num_tasks), 'median:', np.median(num_tasks)
   
    

Number of days covered p.p.:  min: 1 max: 12 median: 5.0
Number of queries issued p.d.:  min: 7.0 max: 77.8 median: 13.4
Number of pages viewed p.d.:  min: 28.2 max: 341.5 median: 103.3
Number of tasks p.p. min: 1 max: 25 median: 10.0
