In [1]:
import pathlib
import json

In [2]:
from textrec.paths import paths

In [3]:
import glob
import os
import json
import dateutil.parser
import datetime
import toolz

In [4]:
# Participant IDs whose sessions are filtered out of `log_files` before any
# further analysis in this notebook.
INVALID = (
    '3vf5fg', # already done as 9fmfm4
    '73qq5q', # already done as 77j4mf
    'ffhgxm', # something messed up in analysis; logfile is out of sync
)


In [5]:
def get_log_data(log_file, earliest):
    """Summarize a single JSON-lines session log.

    Scans ``log_file`` for records whose ``type`` is ``login``, ``next`` or
    ``finalData`` and condenses them into one metadata dict.

    Args:
        log_file: Path to a log file holding one JSON object per line.
        earliest: ``datetime.datetime`` cutoff. As soon as a ``login`` record
            with a timestamp before this value is seen, the log is rejected.

    Returns:
        ``None`` when the log contains no ``login`` record or when a login
        predates ``earliest``. Otherwise a dict with the keys ``timestamp``,
        ``config``, ``platform_id``, ``participant_id``, ``size`` (file size
        in bytes), ``complete`` (whether a ``finalData`` record followed the
        most recent login) and ``num_nexts`` (count of ``next`` records in
        the whole file).

    Notes:
        Every ``login`` record replaces the metadata collected so far, so the
        last login in the file wins and resets ``complete`` to ``False``.
        A ``finalData`` record that appears before any ``login`` is ignored;
        previously it raised ``TypeError`` because there was no metadata dict
        to update yet.
    """
    size = os.path.getsize(log_file)
    meta = None
    num_nexts = 0
    with open(log_file) as f:
        for raw_line in f:
            # Cheap substring pre-filter: avoid JSON-decoding lines that
            # cannot be one of the three record types of interest.
            if 'next' not in raw_line and 'login' not in raw_line and 'finalData' not in raw_line:
                continue
            record = json.loads(raw_line)
            record_type = record.get('type')
            if record_type == 'next':
                num_nexts += 1
            elif record_type == 'login':
                if 'jsTimestamp' in record:
                    # The value is divided by 1000 before conversion
                    # (presumably epoch milliseconds from JavaScript -- confirm).
                    timestamp = datetime.datetime.fromtimestamp(record['jsTimestamp'] / 1000)
                else:
                    timestamp = dateutil.parser.parse(record['timestamp'])
                if timestamp < earliest:
                    return None
                meta = dict(
                    timestamp=timestamp,
                    config=record['config'],
                    platform_id=record['platform_id'],
                    participant_id=record['participant_id'],
                    size=size,
                    complete=False)  # flipped to True by a later finalData record
            elif record_type == 'finalData' and meta is not None:
                # Only meaningful once a login has created the metadata dict.
                meta['complete'] = True
    if meta is None:
        return None
    return dict(meta, num_nexts=num_nexts)


def get_logs(log_path, earliest):
    """Collect session metadata for every ``*.jsonl`` file in a directory.

    Args:
        log_path: Directory object providing ``.glob`` (e.g. ``pathlib.Path``).
        earliest: Cutoff passed through unchanged to ``get_log_data``.

    Returns:
        A list of the non-``None`` results of ``get_log_data``, in the order
        the files are yielded by ``glob``.
    """
    summaries = (get_log_data(path, earliest) for path in log_path.glob('*.jsonl'))
    return [summary for summary in summaries if summary is not None]

In [6]:
# Summaries of all logs in logs-gcp1; logs with a login before 2018-05-02 are rejected.
log_files = get_logs(
    paths.top_level / 'logs-gcp1',
    earliest=datetime.datetime(2018, 5, 2),
)

In [7]:
# Drop sessions whose participant ID is listed in INVALID.
not_invalid = [
    session
    for session in log_files
    if session['participant_id'] not in INVALID
]

In [8]:
# Keep sessions with more than 4 'next' records (arbitrary cutoff).
did_some_work = [
    session
    for session in not_invalid
    if session['num_nexts'] > 4
]

In [9]:
# Sessions that did some work but were never marked complete.
incomplete = [
    session
    for session in did_some_work
    if not session['complete']
]

In [10]:
# Number of sessions past the work cutoff that are not marked complete.
len(incomplete)

7

In [13]:
from textrec import analysis_util

In [23]:
# Fetch the log analysis for the first incomplete session's participant.
an = analysis_util.get_log_analysis(incomplete[0]['participant_id'])

In [24]:
# List the top-level fields of the analysis result.
an.keys()

dict_keys(['participant_id', 'isComplete', 'byExpPage', 'pageSeq', 'screenTimes', 'allControlledInputs', 'git_rev'])

In [25]:
# Value of the 'isComplete' field for this participant.
an['isComplete']

False

In [26]:
# Show the 'allControlledInputs' field (displayed as [name, value] pairs).
an['allControlledInputs']

[['intro-use_predictive', 'Yes'],
 ['intro-I like to solve complex problems.', 1],
 ['intro-I have difficulty understanding abstract ideas.', 1],
 ['intro-I feel comfortable around people.', 2],
 ['intro-I have little to say.', 2]]

In [32]:
# Execute the textrec.batch_analysis module as a script from within the notebook.
# NOTE(review): `summarize`, called in the next cell, is not defined or imported
# anywhere else in this notebook, so it presumably comes from this run -- confirm.
%run -m textrec.batch_analysis

In [35]:
# Summarize every incomplete session, explicitly allowing incomplete logs.
summarize(
    [session['participant_id'] for session in incomplete],
    incomplete_ok=True,
)


3wjx7c
practice-0:general:a black cat napping on a sunny unpainted wood bench in front of a red wall 
final-0-0:general:there is a cat that is laying on top of a tablet in front of a cup of wine.? the cat is looking at its master. 
final-0-1:general:this is a small bathroom and there is someone in tne shower and the bathroom is also a modern toilet. 
final-0-2:general:

intro-use_predictive: Yes

Total time: 4.7m
ExperimentScreen: 245.0
IntroSurvey: 16.3
Welcome: 12.2
TaskDescription: 4.2
PostPractice: 1.9
StudyDesc: 1.8
Instructions: 1.1

5xc2f3
practice-0:specific:a black cat napping on a sunny unpainted wood bench in front of a red wall 
final-0-0:specific:a cat sitting in front of a glass of wine looking up at the camera. 
final-0-1:specific:a bathroom shower with dirty glass doors with a towel hanging on it 
final-0-2:specific:there is no image here 
practice-1:general:a man with black hair and glasses placing a large turkey into an upper oven

intro-use_predictive: Yes
postTask-

In [36]:
# Valid sessions marked complete, ordered by login timestamp.
complete = sorted(
    (session for session in not_invalid if session['complete']),
    key=lambda session: session['timestamp'],
)

In [37]:
# Space-separated participant IDs of the completed sessions, in `complete` order.
' '.join(entry['participant_id'] for entry in complete)

'h52x67 jvccx2 36x2r3 gg65g6 692c8j qmwvwv 77j4mf 4ggxj8 5c39rx fvwhpc 26w4jv 7g8xw8 533r6c 74v545 vxjcf7 9f5xwx 3267ww wf4c3m 7jcm37 cf9p8m phqcw9 5jj59g gw3w72 559x69 gvwqp6'

In [38]:
# For each experiment config, print how many sessions completed and their
# participant IDs ordered by login timestamp.
for config, group in toolz.groupby('config', complete).items():
    print()
    group = sorted(group, key=lambda session: session['timestamp'])
    print(f'{len(group)} completed in {config}')
    print(f'{config}:', ' '.join([session['participant_id'] for session in group]))


25 completed in cap
cap: h52x67 jvccx2 36x2r3 gg65g6 692c8j qmwvwv 77j4mf 4ggxj8 5c39rx fvwhpc 26w4jv 7g8xw8 533r6c 74v545 vxjcf7 9f5xwx 3267ww wf4c3m 7jcm37 cf9p8m phqcw9 5jj59g gw3w72 559x69 gvwqp6
