In [None]:
import pathlib
import json

In [None]:
from textrec.paths import paths

In [None]:
import glob
import os
import json
import dateutil.parser
import datetime
import toolz

import re

In [None]:
def get_invalid(invalid_file=None):
    """Load the set of identifiers listed in the exclusion file.

    The file holds one identifier per line. Everything from the first ``#``
    on a line is treated as a comment and dropped; surrounding whitespace is
    stripped and lines that end up empty are skipped.

    Args:
        invalid_file: Path of the file to read. Defaults to
            ``paths.data / 'invalid.txt'``.

    Returns:
        A ``set`` of the non-empty, comment-free line contents.
    """
    if invalid_file is None:
        invalid_file = paths.data / 'invalid.txt'
    invalid = set()
    with open(invalid_file) as f:
        for line in f:
            # '#.*' rather than '#.+': a '#' with nothing after it (a bare
            # "#" line, or "abc #") must be stripped too, otherwise the '#'
            # itself would end up inside an entry.
            entry = re.sub(r'#.*', '', line).strip()
            if entry:
                invalid.add(entry)
    return invalid
# Load the exclusion set once; the bare name on the last line displays it as the cell output.
INVALID = get_invalid()
INVALID

In [None]:
def get_log_data(log_file, earliest):
    """Summarize a single JSON-lines log file.

    Only lines containing the substrings ``next``, ``login`` or ``finalData``
    are parsed; of those, only events whose ``type`` is exactly one of the
    three are acted on:

    * ``next``      -- counted.
    * ``login``     -- (re)starts the metadata record; a later login in the
      same file replaces the record from an earlier one.
    * ``finalData`` -- marks the current record as complete.

    Args:
        log_file: Path of the ``.jsonl`` file to read.
        earliest: ``datetime.datetime`` cutoff; a file containing a login
            before this moment is skipped.

    Returns:
        ``None`` if the file has no login event or has a login earlier than
        ``earliest``. Otherwise a dict with keys ``timestamp``, ``batch``,
        ``config``, ``platform_id``, ``participant_id``, ``size`` (file size
        in bytes), ``complete`` and ``num_nexts``.

    Raises:
        KeyError: if a login event lacks ``config``, ``platform_id``,
            ``participant_id``, or both ``jsTimestamp`` and ``timestamp``.

    Note:
        ``jsTimestamp`` (milliseconds since the epoch) is converted to a naive
        local-time datetime. If the fallback ``timestamp`` string carries a UTC
        offset, dateutil yields an aware datetime and comparing it with a
        naive ``earliest`` raises TypeError.
    """
    size = os.path.getsize(log_file)
    meta = None
    num_nexts = 0
    with open(log_file) as f:
        for raw_line in f:
            # Cheap substring pre-filter: skip JSON decoding for lines that
            # cannot be one of the three event types of interest.
            if 'next' not in raw_line and 'login' not in raw_line and 'finalData' not in raw_line:
                continue
            event = json.loads(raw_line)
            event_type = event.get('type')
            if event_type == 'next':
                num_nexts += 1
            elif event_type == 'login':
                if 'jsTimestamp' in event:
                    timestamp = datetime.datetime.fromtimestamp(event['jsTimestamp'] / 1000)
                else:
                    timestamp = dateutil.parser.parse(event['timestamp'])
                if timestamp < earliest:
                    return None
                meta = dict(
                    timestamp=timestamp,
                    batch=event.get('batch'),
                    config=event['config'],
                    platform_id=event['platform_id'],
                    participant_id=event['participant_id'],
                    size=size,
                    complete=False)  # flipped to True by a later finalData event
            elif event_type == 'finalData':
                # A finalData event with no preceding login has no record to
                # mark; ignore it instead of failing on ``None['complete']``.
                if meta is not None:
                    meta['complete'] = True
    if meta:
        return dict(meta, num_nexts=num_nexts)
    return None


def get_logs(log_path, earliest):
    """Summarize every ``*.jsonl`` file directly inside ``log_path``.

    Args:
        log_path: Directory object exposing ``.glob`` (e.g. ``pathlib.Path``).
        earliest: Cutoff passed through unchanged to ``get_log_data``.

    Returns:
        List of the summaries returned by ``get_log_data``, in glob order,
        leaving out files for which it returned ``None``.
    """
    summaries = (
        get_log_data(jsonl_file, earliest)
        for jsonl_file in log_path.glob('*.jsonl')
    )
    return [summary for summary in summaries if summary is not None]

In [None]:
# Summarize every log under logs-gcp1, skipping files with a login before 2 May 2018.
log_files = get_logs(
    paths.top_level / 'logs-gcp1',
    earliest=datetime.datetime(2018, 5, 2),
)

In [None]:
# Keep only sessions whose participant is not on the exclusion list.
not_invalid = [
    session
    for session in log_files
    if session['participant_id'] not in INVALID
]

In [None]:
# Sessions that reached the finalData event, ordered by login time.
complete = sorted(
    (session for session in not_invalid if session['complete']),
    key=lambda session: session['timestamp'],
)

In [None]:
# Map each batch to the participant ids of its completed sessions, ordered by login time.
# Sessions whose login carried no 'batch' field are grouped under the key None.
complete_by_group = {
    batch: [
        session['participant_id']
        for session in sorted(sessions, key=lambda s: s['timestamp'])
    ]
    for batch, sessions in toolz.groupby('batch', complete).items()
}

In [None]:
# Dump a list of participant_ids: for each key of complete_by_group (the sessions' 'batch' value),
# print a blank line, the number of completed participants, then the ids on one space-separated line.
for config, group in complete_by_group.items():
    print()
    print(f'{len(group)} completed in {config}')
    print(f'{config}:',  ' '.join(group))

In [None]:
from textrec.quick_summary import summarize

In [None]:
import sys

In [None]:
# Pick up on-disk edits to the module that defines `summarize` without restarting the kernel.
# Python 3 has no builtin `reload`; it lives in importlib. Reloading re-executes the module
# but does not update names that were imported from it earlier, so `summarize` is rebound
# to the freshly loaded function.
import importlib
_reloaded_module = importlib.reload(sys.modules[summarize.__module__])
summarize = getattr(_reloaded_module, summarize.__name__)

In [None]:
# Summarize the completed participants of batch 'xs1'.
# Raises KeyError if no completed session has that batch value.
summarize(complete_by_group['xs1'])

In [None]:
# sys.exit raises SystemExit, so a top-to-bottom run stops at this cell.
# NOTE(review): presumably a deliberate stop marker so the cells below only run by hand -- confirm.
import sys
sys.exit(0)

In [None]:
# Sessions with more than 4 'next' events count as real work (the cutoff is arbitrary).
did_some_work = [
    session
    for session in not_invalid
    if session['num_nexts'] > 4
]

In [None]:
# Sessions that did real work but never logged a finalData event.
incomplete = [
    session
    for session in did_some_work
    if not session['complete']
]

In [None]:
# Number of sessions that passed the work cutoff but were never completed.
len(incomplete)

In [None]:
# Execute textrec.quick_summary as a script (like `python -m`) inside this notebook session.
# NOTE(review): presumably used to refresh `summarize` before the call in the next cell -- confirm.
%run -m textrec.quick_summary

In [None]:
# Summarize the participants who started but did not finish; incomplete_ok is passed as True.
summarize(
    [session['participant_id'] for session in incomplete],
    incomplete_ok=True,
)

In [None]:
# Space-separated participant ids of all completed sessions, in login-time order.
' '.join(
    session['participant_id']
    for session in complete
)

In [None]:
from textrec.counterbalancing import get_completion_data

In [None]:
import pandas as pd

In [None]:
# Completion records for 'gc1' as a DataFrame, with login_timestamp converted from
# epoch seconds to datetimes and rows ordered by login time.
completions = (
    pd.DataFrame(get_completion_data('gc1'))
    .assign(login_timestamp=lambda frame: pd.to_datetime(frame['login_timestamp'], unit='s'))
    .sort_values('login_timestamp')
)

In [None]:
# Login time of the fifth row from the end; with rows sorted ascending by login_timestamp
# this is the fifth most recent login. Raises IndexError if there are fewer than five rows.
completions.login_timestamp.iloc[-5]

In [None]:
# Count of each assignment value among completed rows whose login was before 29 June 2018.
completions.loc[
    completions['completed']
    & (completions['login_timestamp'] < pd.Timestamp(year=2018, month=6, day=29)),
    'assignment'
].value_counts()


In [None]:
# Display the completions table as the cell output.
# NOTE(review): this renders the whole frame; consider .head() if it grows large.
completions