## Setup

In [115]:
from pymongo import MongoClient
import numpy as np
import itertools

# DB connections
# MongoClient() is called without arguments, so pymongo's default connection
# settings apply (presumably a local mongod on the default port -- confirm
# before running elsewhere).
client = MongoClient()
db = client.db_tasklog_clean
# Collections
# `User` is the `user` collection, queried by the demographic cells below.
# `Log` is the `log_chrome` collection.
User = db.user
Log = db.log_chrome

## Descriptive stats of user demographics:

In [116]:
# Gender groups
print 'Gender groups:'
for d in User.aggregate([{'$project': {'info.gender': 1}},
        {'$group': {'_id': '$info.gender', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print
    
# Age groups
print 'Age groups:'
for d in User.aggregate([{'$project': {'info.age': 1}}, 
    {'$group': {'_id': '$info.age', 'count': {'$sum': 1}}}]):
    print d['_id'], d['count']
print

Gender groups:
female 3
male 3

Age groups:
25_34 6



In [117]:
allUsers = list(User.find({}))

# Experience computer
exp_comp = [int(a['info']['exp_comp']) for a in allUsers]
iqr = np.percentile(exp_comp, 75) - np.percentile(exp_comp, 25)
print 'User computer technology experience:', 'Median =', np.median(exp_comp), 'IQR =', iqr

# Experience Search engine
exp_se = [int(a['info']['exp_se']) for a in allUsers]
iqr = np.percentile(exp_se, 75) - np.percentile(exp_se, 25)
print 'User Search engine experience:', 'Median =', np.median(exp_se), 'IQR =', iqr

# Frequency of work related usage of computer
comp_work = [int(a['info']['comp_work']) for a in allUsers]
iqr = np.percentile(comp_work, 75) - np.percentile(comp_work, 25)
print 'Frequency of using the experiment laptop for work:', 'Median =', np.median(comp_work), 'IQR =', iqr


User computer technology experience: Median = 5.0 IQR = 0.0
User Search engine experience: Median = 5.0 IQR = 0.0
Frequency of using the experiment laptop for work: Median = 4.0 IQR = 1.5


In [118]:
total_users = float(len(list(allUsers)))

# Frequency of using the experiment laptop
comp_freq_labels = {'1': '<25%', '2':'25%-50%', '3': '50%-75%', '4': '75%-100%', '5': '100%'}
comp_freq = User.aggregate([{'$project': {'info.freq_comp': 1}}, 
                           {'$group': {'_id': '$info.freq_comp', 'count': {'$sum': 1}}}, 
                           {'$sort': {'_id': 1}}])

print 'Frequency of using the experiment laptop:'
for f in comp_freq:
    print comp_freq_labels[f['_id']], ':', f['count'], '(','%.2f'%(float(f['count'])/total_users), ')'
print

# Search device
search_device_label = {'1': 'always on computer', '2': 'mostly on computer', 
                      '3': 'half-half', '4': 'mostly on mobile', '5': 'always on mobile'}
search_device = User.aggregate([{'$project': {'info.search_device': 1}}, 
                                {'$group': {'_id': '$info.search_device', 'count': {'$sum': 1}}},
                                {'$sort': {'_id': 1}}
                               ])
print 'User search device preferences:'
for s in search_device:
    if s['_id'] != None:   # Need to update Marc's registration form
        print search_device_label[s['_id']], ':', s['count'], '(', '%.2f'%(float(s['count'])/total_users), ')'
 

Frequency of using the experiment laptop:
<25% : 1 ( 0.17 )
25%-50% : 1 ( 0.17 )
75%-100% : 2 ( 0.33 )
100% : 2 ( 0.33 )

User search device preferences:
mostly on computer : 5 ( 0.83 )
half-half : 1 ( 0.17 )


In [119]:
#Total number of users
count_tot = float(len(list(allUsers)))

# Top types of information searched on Mobile
computer_search_for = [a['info']['computer_search_for'] for a in allUsers]
computer_search_for = [s for sublist in computer_search_for for s in sublist]
if a['info']['computer_search_for_other'] != '':
    computer_search_for.append(a['info']['computer_search_for_other'])
computer_search_for.sort()

tot = []
for k, g in itertools.groupby(computer_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print 'Top searched informaiton types with computers: '
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
print
    
# Top types of information searched on Computer
mobile_search_for = [a['info']['mobile_search_for'] for a in allUsers]
mobile_search_for = [s for sublist in mobile_search_for for s in sublist]
if a['info']['mobile_search_for_other'] != '':
    mobile_search_for.append(a['info']['mobile_search_for_other'])
mobile_search_for.sort()

tot = []
for k, g in itertools.groupby(mobile_search_for):
    tot.append((k, len(list(g))))
tot.sort(key=lambda x: x[1], reverse=True)

print "Top searched information types with mobiles: "
for t in tot:
    print t[0], t[1], '%.2f'%(t[1]/count_tot)
    


Top searched informaiton types with computers: 
social 5 0.83
fact 4 0.67
topic 4 0.67
entertainment 3 0.50
job 3 0.50
shopping 3 0.50
house 2 0.33
news 2 0.33
routes 2 0.33
task 2 0.33
travel 2 0.33

Top searched information types with mobiles: 
fact 6 1.00
routes 6 1.00
news 5 0.83
social 5 0.83
entertainment 3 0.50
task 2 0.33
person 1 0.17


## Descriptive stats of the log

In [None]:
# Total number of queries

# Total number of viewed pages
# Total number of tasks

In [None]:
# Number of days covered per person (max, min, median)
# Number of queries per day per person (max, min, median)
# Number of viewed pages per day per person (max, min, median)
# Number of tasks per day per person (max, min, median)

In [None]:
# Number of tasks distributed over week days (median of subjects)
# Number of queries distributed over week days (median of subjects)
# Number of viewed pages distributed over week days (median of subjects)