# 1. Local links

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json

import ideafast_deviceselection as ifds

# All local data lives two levels above the package module, under local/keyboard.
source = Path(ifds.__file__).parent.parent.absolute().joinpath('local/keyboard')
users = source.joinpath('users')
config = source.joinpath('configs.json')
prompts = source.joinpath('prompts.json')

# raw per-participant exports, as downloaded
source_files = users.joinpath('raw')

# one output folder per data category, created on first run
imp_path, exp_path, oth_path = (
    users / category for category in ('implicit', 'explicit', 'other')
)
for folder in (imp_path, exp_path, oth_path):
    folder.mkdir(exist_ok=True)



# 2. Parse JSON - split in subcategories 

In [None]:
# Split every raw export into three files: implicit data, explicit (study)
# data, and other data (config + device).
# SKIP THIS IF YOU ALREADY HAVE SPLIT FILES (change False to True if you haven't)

if False:
    # NOTE(review): `study_mode` and `study_mode_june` are not defined anywhere
    # in the visible notebook. Define them (presumably the keys under
    # 'completedTasks' for the two study configurations - confirm against the
    # raw files) before enabling this cell, otherwise it raises NameError.
    for file in sorted(source_files.glob('*.json')):
        # the split files are written as UTF-8, so read the raw file the same way
        with open(file, encoding='utf-8') as r:
            data = json.load(r)

        completed = data['completedTasks']
        implicit = completed['implicit_mode']
        explicit = completed.get(study_mode) or completed.get(study_mode_june)
        other = {key: data[key] for key in ('config', 'device')}

        # store implicit data
        with open(imp_path / file.name, 'w', encoding='utf-8') as w:
            json.dump(implicit, w, ensure_ascii=False, indent=4)

        # store other data; written before the explicit check so that a
        # participant without study data still gets their config/device file
        with open(oth_path / file.name, 'w', encoding='utf-8') as w:
            json.dump(other, w, ensure_ascii=False, indent=4)

        # store explicit data
        if not explicit:
            print(f'No study data found for {file.stem}')
            continue

        with open(exp_path / file.name, 'w', encoding='utf-8') as w:
            json.dump(explicit, w, ensure_ascii=False, indent=4)

# 3. Converting to DataFrame

Below, the JSON data is transformed into dataframes, adding as many relevant metrics as possible. These cells need to be re-run if/when the source data is altered. Saving the dataframes into `.csv` files allows the other notebooks to open and visualise them.

## Implicit data

A few errors/discards can occur. In any case, each timestamp contains a `"phrases"` key:
```json
"timestamp": {
    "phrases": []
}
```
Known or found errors are stored as (listing examples - these don't occur at the same time):

1. `"error": "something went wrong with input buffer calculation"`
2. `"discarded": "edit box was not empty"`
3. `"discarded": "error calculating input buffer"`
4. `"words-per-minute": "invalid value"`
5. `"timestamp": 1234567890  # missing 'discarded' - assuming this is a discard`
6. Some values might not exist even though others were appropriately calculated

Calculating with `words-per-minute` also needs checking, as its value can be `"invalid value"`.

Based on the data structure (`"phrases"` being a list) we assume multiple phrases can exist.

Metrics calculation below is based on [André Santos' code](https://github.com/WildKey-Dev/ideafast-keyboard-study-creator) for the report generation.


In [None]:
# Study window (UTC) for each enrolment wave; index 0 is the first wave,
# index 1 the wave that started in June.
study_configs = [
    {'name': label, 'start': pd.to_datetime(first), 'end': pd.to_datetime(last)}
    for label, first, last in (
        ('Participants Study Config',
         "2021-05-21T00:00:00.000Z", "2021-06-04T00:00:00.000Z"),
        ('Participants Study Config June',
         "2021-06-01T00:00:00.000Z", "2021-06-15T00:00:00.000Z"),
    )
]
print('study_config variable stored')

study_config variable stored


In [None]:

# Metrics of interest
pre_calculated_metrics = [
    'action-count',
    'auto-correct',
    'corrected-error-rate',
    'correction-action-count',
    'cursor-changes',
    'entry-action-count',
    'error-correction-attempts',
    'flight-time',
    'holdtime-deviations',
    'insertions-error-rate',
    'language',
    'omission-error-rate',
    'select-suggestions',
    'substitutions-error-rate',
    'time-per-word',
    'total-changed-characters',
    'total-error-rate',
    'touch-offset-entered',
    'uncorrected-error-rate',
    'voice-input',
    'words-per-minute',
    'written-characters',
    'written-numbers',
    'written-special-characters',
    'mean-flight-time',
    'mean-holdtime-deviations',
]
# positions of the metrics that need post-processing after extraction
lang_pos = pre_calculated_metrics.index('language')
flight_pos = pre_calculated_metrics.index('flight-time')
holdtime_pos = pre_calculated_metrics.index('holdtime-deviations')
mean_flight_pos = pre_calculated_metrics.index('mean-flight-time')
mean_holdtime_pos = pre_calculated_metrics.index('mean-holdtime-deviations')

columns = ['participant', 'timestamp', 'day', 'day_relative', 'quality'] + pre_calculated_metrics

# participants joining in the second period
second_wave = ['M79iKh', 'VaH3bx', 'XtAs3f']
datapoints_outside_study = 0


def _phrase_quality(phrase):
    """Classify one implicit phrase record as 'error', 'discarded' or 'valid'.

    The case numbers refer to the list of known errors in the markdown cell
    above.
    """
    # case 1
    if 'error' in phrase:
        return 'error'
    # case 3
    if 'discarded' in phrase and 'error' in phrase['discarded']:
        return 'error'
    # case 4
    if 'words-per-minute' in phrase and phrase['words-per-minute'] == "invalid value":
        return 'error'
    # case 2 and 5
    if 'discarded' in phrase or 'words-per-minute' not in phrase:
        return 'discarded'
    return 'valid'


# one DataFrame per participant, concatenated once after the loop
participant_frames = []

for count, file in enumerate(sorted(imp_path.glob('*.json')), start=1):
    # the first six characters of the file name are compared with the second-wave ids
    wave = 1 if file.stem[:6] in second_wave else 0
    study_start = study_configs[wave]['start']
    study_end = study_configs[wave]['end']

    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    lst = []
    for raw_timestamp, value in data.items():
        # JSON keys are strings; cast to a number first because pandas
        # deprecated parsing strings together with `unit`. Converted once per
        # entry instead of once per phrase.
        timestamp = pd.to_datetime(pd.to_numeric(raw_timestamp), unit='ms', utc=True)

        # filter records outside of the study period (counted per phrase)
        if timestamp < study_start or timestamp > study_end:
            datapoints_outside_study += len(value['phrases'])
            continue

        for phrase in value['phrases']:
            # detecting/filtering errors - see list notebook cell above
            quality = _phrase_quality(phrase)

            if quality != 'valid':
                metrics = [np.nan] * len(pre_calculated_metrics)
            else:
                metrics = [phrase.get(m, np.nan) for m in pre_calculated_metrics]

                # case 6
                # a record whose 'language' entry is missing or not a dict is
                # classified as 'error'; other missing metrics stay NaN
                language = metrics[lang_pos]
                if not isinstance(language, dict):
                    quality = 'error'
                else:
                    # pull out iso language only (tag)
                    metrics[lang_pos] = language.get('tag', np.nan)
                    # calculate average flight_times, and hold_times
                    metrics[mean_flight_pos] = np.average(metrics[flight_pos])
                    metrics[mean_holdtime_pos] = np.average(metrics[holdtime_pos])

            lst.append([count, timestamp, timestamp.normalize(), None, quality] + metrics)

    # skip participants without any record inside the study window
    if lst:
        sub_df = pd.DataFrame(lst, columns=columns)
        # days elapsed since this participant's first recorded day
        sub_df['day_relative'] = (sub_df['day'] - sub_df['day'].min()) / np.timedelta64(1, 'D')
        participant_frames.append(sub_df)

if participant_frames:
    df = pd.concat(participant_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

df

Unnamed: 0,participant,timestamp,day,day_relative,quality,action-count,auto-correct,corrected-error-rate,correction-action-count,cursor-changes,entry-action-count,error-correction-attempts,flight-time,holdtime-deviations,insertions-error-rate,language,omission-error-rate,select-suggestions,substitutions-error-rate,time-per-word,total-changed-characters,total-error-rate,touch-offset-entered,uncorrected-error-rate,voice-input,words-per-minute,written-characters,written-numbers,written-special-characters,mean-flight-time,mean-holdtime-deviations
0,1,2021-05-21 07:37:07.930000+00:00,2021-05-21 00:00:00+00:00,0.0,valid,8.0,0.0,14.285715,1.0,0.0,7.0,1.0,"[184, 267, 401, 242, 125, 1795, 1169, 1010]","[66, 75, 83, 92, 117, 83, 83, -1, 58, 49]",0.0,en-GB,0.0,0.0,0.0,[5192],8.0,14.285715,"[{'t1': {'t1': 5, 't2': 16}, 't2': 148416373},...",0.0,0.0,13.864818,7.0,0.0,1.0,649.125,70.500000
1,1,2021-05-21 07:37:09.121000+00:00,2021-05-21 00:00:00+00:00,0.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,2021-05-21 07:37:09.122000+00:00,2021-05-21 00:00:00+00:00,0.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,2021-05-21 07:53:01.918000+00:00,2021-05-21 00:00:00+00:00,0.0,valid,5.0,0.0,25.000000,1.0,0.0,4.0,1.0,"[192, 250, 3032, 367]","[75, 75, 66, 33, -1, 50]",0.0,en-GB,0.0,0.0,0.0,[3836],5.0,25.000000,"[{'t1': {'t1': 25, 't2': 28}, 't2': 149370606}...",0.0,0.0,9.372559,4.0,0.0,0.0,960.250,49.666667
4,1,2021-05-21 07:53:02.057000+00:00,2021-05-21 00:00:00+00:00,0.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35767,20,2021-06-03 23:10:28.611000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
35768,20,2021-06-03 23:11:28.844000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
35769,20,2021-06-03 23:14:02.020000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
35770,20,2021-06-03 23:34:07.662000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Temporarily lift the column display limit. The fully-qualified option name
# is required: on recent pandas versions the bare 'max_columns' pattern matches
# more than one option and raises OptionError.
pd.set_option('display.max_columns', None)
# uncomment below to check all columns briefly
#print(df.head())
pd.reset_option('display.max_columns')

In [None]:
# Report how many records fell outside the study window, followed by the
# number of rows per quality label.
print(
    f'data points outside of study period: {datapoints_outside_study}',
    '\nOverview datapoints:',
    df["quality"].value_counts(),
    sep='\n',
)

data points outside of study period: 14502

Overview datapoints:
valid        19931
discarded     9340
error         6501
Name: quality, dtype: int64


#### Checking quality of filters / cleaning process

In [None]:
# any NaN in pre-calculated metrics?
# Show every column so the missing value is visible. The fully-qualified option
# name is required: on recent pandas versions the bare 'max_columns' pattern
# matches more than one option and raises OptionError.
pd.set_option('display.max_columns', None)
df[(df['quality'] == 'valid') & (df.isna().any(axis=1))]

Unnamed: 0,participant,timestamp,day,day_relative,quality,action-count,auto-correct,corrected-error-rate,correction-action-count,cursor-changes,entry-action-count,error-correction-attempts,flight-time,holdtime-deviations,insertions-error-rate,language,omission-error-rate,select-suggestions,substitutions-error-rate,time-per-word,total-changed-characters,total-error-rate,touch-offset-entered,uncorrected-error-rate,voice-input,words-per-minute,written-characters,written-numbers,written-special-characters,mean-flight-time,mean-holdtime-deviations
19645,9,2021-05-31 10:51:17.377000+00:00,2021-05-31 00:00:00+00:00,10.0,valid,18.0,1.0,23.076923,8.0,0.0,18.0,0.0,"[184, 92, 109, 169, 57, 117, 176, 100, 92]","[50, 50, 50, 67, 69, 57, 33, 50, 50, 75]",0.0,,0.0,0.0,0.0,"[553, 368]",16.0,23.076923,"[{'t1': {'t1': 7, 't2': 8}, 't2': 391504801}, ...",0.0,0.0,109.489051,9.0,0.0,0.0,121.777778,55.1


### Variables

Storing to `.csv`, choose which one is useful

In [None]:
# Write the implicit data to <source>/parsed: once with every row and once
# restricted to the rows classified as 'valid'.
outpath = source / 'parsed'
outpath.mkdir(exist_ok=True)

df.to_csv(outpath / 'implicit_raw.csv', index=False)
df[df['quality'] == 'valid'].to_csv(outpath / 'implicit_only_valid.csv', index=False)

# report the real destination instead of a hard-coded path
print(f'Output stored in {outpath}')

Output stored in /local/parsed


## Explicit data

First, we grab all metrics of interest and generate two .csv's, one with all fatigue measurements, and one with all explicit tasks and their metrics. The link between these is their timeframeID. Since the fatigue measurement doesn't include a timestamp, we will calculate that in retrospect with the completed `explicit.csv` using their timeframeIDs.

In [None]:
# Metrics of interest
phrases_pre_calculated_metrics = [
    'action-count',
    'auto-correct',
    'corrected-error-rate',
    'correction-action-count',
    'cursor-changes',
    'entry-action-count',
    'error-correction-attempts',
    'flight-time',
    'holdtime-deviations',
    'input-stream',
    'input-stream-og',
    'insertions-error-rate',
    'language',
    'omission-error-rate',
    'select-suggestions',
    'substitutions-error-rate',
    'target_phrase',
    'time-per-word',
    'total-changed-characters',
    'total-error-rate',
    'touch-offset-entered',
    'transcribe',
    'uncorrected-error-rate',
    'voice-input',
    'words-per-minute',
    'written-characters',
    'written-numbers',
    'written-special-characters',
    'mean-flight-time',
    'mean-holdtime-deviations',
]

# position of some specific metrics that require a bit more calculation
lang_pos = phrases_pre_calculated_metrics.index('language')
flight_pos = phrases_pre_calculated_metrics.index('flight-time')
holdtime_pos = phrases_pre_calculated_metrics.index('holdtime-deviations')
mean_flight_pos = phrases_pre_calculated_metrics.index('mean-flight-time')
mean_holdtime_pos = phrases_pre_calculated_metrics.index('mean-holdtime-deviations')

# headers in .csv
# the session_timestamp is the earliest init/end timestamp seen in a session,
# so it is consistent across that session's tasks
headers_shared = ['participant', 'session_id', 'session_timestamp', 'time_of_day', 'day_relative']
headers_fatigue = ['fatigue_score']
task_prompt_details = ['init', 'end', 'type']

columns_fatigue = headers_shared + headers_fatigue
columns_explicit = headers_shared + task_prompt_details + phrases_pre_calculated_metrics

# hardcode task id keys
fatigue_task_id = 'f1b46be-bdfb-dc63-b5a0-0a1d46c280'
fatigue_scale_question_id = 'd63cded-bf4-cbd6-0d23-00b1f631b2f7'
fatigue_scale_comment_field_id = '58fab5-0a86-534-4fec-bdc42024ae31'

# preload config and prompts
global_config = '2ecbe1e-d456-8fb-bf5d-db4324eb4a'  # the two used configs are identical (except timestamps)
with open(config, encoding='utf-8') as r:
    timeframes = json.load(r)[global_config]['timeFrames']
# index the timeframes by id so details can be accessed directly
pre_loaded_timeframes = {t.get('timeFrameID'): t for t in timeframes}
with open(prompts, encoding='utf-8') as r:
    pre_loaded_prompts = json.load(r)


def _earliest_timestamp(candidates):
    """Return the smallest non-NaN value in *candidates* as a float.

    Returns NaN when every candidate is missing. This gives the same result as
    ``np.nanmin`` without its "All-NaN slice encountered" RuntimeWarning.
    """
    present = [c for c in candidates if not pd.isna(c)]
    return float(min(present)) if present else np.nan


# one DataFrame per participant, concatenated once after the loop
fatigue_frames = []
explicit_frames = []

# iterate over participants (i.e. sorted files)
for participant, file in enumerate(sorted(exp_path.glob('*.json')), start=1):

    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    lst_explicit = []
    lst_fatigue = []
    dct_session_inits = {}  # collecting all init timestamps for all sessions

    for identifier, value in data.items():
        # identifier consists of [timeframeId]_[taskId]
        # we consider the timeframe_id as session_id; split on the first
        # underscore only so a task id carrying a suffix cannot break unpacking
        session_id, task_id = identifier.split('_', 1)

        # skip the generated phrases items
        if 'generated-target-phrase' in task_id:
            continue

        # skip the fatigue scale comment field (which is a composition task)
        if task_id == fatigue_scale_comment_field_id:
            continue

        # get timeframe details; an entry looks like (abridged):
        #   {"day": 31, "month": 5, "year": 2021,
        #    "start": "14:00", "end": "20:00",
        #    "notification": {"message": "...", "title": "New Tasks"},
        #    "tasks": ["f1b46be-...", "e6d0d86-...", ...],
        #    "timeFrameID": "15f8d6e0-b7c2-11eb-82ce-3908cd4e9308"}
        s = pre_loaded_timeframes[session_id]
        # double-quoted f-string so the single-quoted keys parse on Python < 3.12
        time_of_day = f"{s['start']}-{s['end']}"
        # TODO: derive the scheduled prompt timestamp from s['year'], s['month'],
        # s['day'] and s['start'] if it is needed downstream.

        # pick out fatigue tasks
        if task_id == fatigue_task_id:
            fatigue = value.get(fatigue_scale_question_id, {}).get('response', np.nan)
            lst_fatigue.append([participant, session_id, None, time_of_day, None, fatigue])

            # register the session even if it has no phrases, so the
            # session timestamp lookup below always finds an entry
            dct_session_inits.setdefault(session_id, []).append(np.nan)
            continue

        # convert all explicit tasks
        # shared details
        init_or_end = value.get('init', None) or value.get('end', None)
        phrases_task_prompt_details = [
            # interestingly, init can sometimes be absent
            value.get('init', np.nan),
            value.get('end', np.nan),
            pre_loaded_prompts[task_id]['type'],
        ]

        # collect all init timestamps per session to pick the earliest later
        dct_session_inits.setdefault(session_id, []).append(init_or_end or np.nan)

        for phrase in value.get('phrases') or []:
            # phrase is occasionally found to be None
            if not phrase:
                continue

            metrics = [phrase.get(m, np.nan) for m in phrases_pre_calculated_metrics]

            # pull out iso language only (tag)
            if isinstance(metrics[lang_pos], dict):
                metrics[lang_pos] = metrics[lang_pos].get('tag', np.nan)

            # calculate average flight_times, and hold_times
            metrics[mean_flight_pos] = np.average(metrics[flight_pos])
            metrics[mean_holdtime_pos] = np.average(metrics[holdtime_pos])

            # shared headers + task details + metrics
            lst_explicit.append(
                [participant, session_id, None, time_of_day, None]
                + phrases_task_prompt_details
                + metrics
            )

    # after getting all observations, use the earliest timestamp of a session
    # as the session timestamp for its fatigue measurement and explicit tasks
    session_start = {
        sid: _earliest_timestamp(inits) for sid, inits in dct_session_inits.items()
    }
    for row in lst_fatigue:
        row[2] = session_start[row[1]]
    for row in lst_explicit:
        row[2] = session_start[row[1]]

    # convert lists to dataframes, and collect them for the final concat
    if lst_fatigue:
        fatigue_frames.append(pd.DataFrame(lst_fatigue, columns=columns_fatigue))
    if lst_explicit:
        explicit_frames.append(pd.DataFrame(lst_explicit, columns=columns_explicit))

if fatigue_frames:
    df_fatigue = pd.concat(fatigue_frames, ignore_index=True)
else:
    df_fatigue = pd.DataFrame(columns=columns_fatigue)

if explicit_frames:
    df_explicit = pd.concat(explicit_frames, ignore_index=True)
else:
    df_explicit = pd.DataFrame(columns=columns_explicit)

df_fatigue


  copy[2] = np.nanmin(dct_session_inits[copy[1]])


Unnamed: 0,participant,session_id,session_timestamp,time_of_day,day_relative,fatigue_score
0,1,066fc9e0-b7c2-11eb-82ce-3908cd4e9308,1.622294e+12,14:00-20:00,,45
1,1,08df9160-b7c2-11eb-82ce-3908cd4e9308,1.622315e+12,20:00-23:00,,65
2,1,0b8276d0-b7c2-11eb-82ce-3908cd4e9308,1.622364e+12,9:00-14:00,,69
3,1,0dcefee0-b7c2-11eb-82ce-3908cd4e9308,1.622380e+12,14:00-20:00,,60
4,1,108e1ee0-b7c2-11eb-82ce-3908cd4e9308,1.622403e+12,20:00-23:00,,60
...,...,...,...,...,...,...
673,20,f408ca40-b7c1-11eb-82ce-3908cd4e9308,1.622107e+12,9:00-14:00,,11
674,20,f652ba40-b7c1-11eb-82ce-3908cd4e9308,1.622120e+12,14:00-20:00,,22
675,20,f8968fc0-b7c1-11eb-82ce-3908cd4e9308,1.622143e+12,20:00-23:00,,27
676,20,fb98d340-b7c1-11eb-82ce-3908cd4e9308,1.622190e+12,9:00-14:00,,16


In [None]:
# Display the combined explicit-task measurements for a visual check.
df_explicit

Unnamed: 0,participant,session_id,session_timestamp,time_of_day,day_relative,init,end,type,action-count,auto-correct,...,touch-offset-entered,transcribe,uncorrected-error-rate,voice-input,words-per-minute,written-characters,written-numbers,written-special-characters,mean-flight-time,mean-holdtime-deviations
0,1,01321550-b7c2-11eb-82ce-3908cd4e9308,1.622233e+12,20:00-23:00,,1.622233e+12,1622233347411,composition,20,0,...,"[{'t1': {'t1': 20, 't2': 35}, 't2': 456377245}...",meeting's,0.0,0,51.160587,18,0,0,211.100000,59.384615
1,1,01321550-b7c2-11eb-82ce-3908cd4e9308,1.622233e+12,20:00-23:00,,,1622233332529,transcription,63,2,...,"[{'t1': {'t1': 34, 't2': 13}, 't2': 456343033}...",in chaeo there is fertility,6.666667,0,39.657283,35,0,0,209.487179,61.787234
2,1,01321550-b7c2-11eb-82ce-3908cd4e9308,1.622233e+12,20:00-23:00,,,1622233332529,transcription,40,1,...,"[{'t1': {'t1': 32, 't2': 23}, 't2': 456353140}...",failure is success in progress,0.0,0,87.145969,26,0,0,142.448276,73.466667
3,1,01321550-b7c2-11eb-82ce-3908cd4e9308,1.622233e+12,20:00-23:00,,,1622233332529,transcription,45,2,...,"[{'t1': {'t1': 30, 't2': 2}, 't2': 456358206},...",the sjoetest answer is doing,5.882353,0,54.625264,27,0,0,175.742857,64.000000
4,1,039a89d0-b7c2-11eb-82ce-3908cd4e9308,1.622281e+12,9:00-14:00,,1.622281e+12,1622280744376,transcription,35,1,...,"[{'t1': {'t1': 11, 't2': 34}, 't2': 471312463}...",big egos have littkw ears,7.692308,0,96.092249,21,0,0,130.083333,68.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,20,fb98d340-b7c1-11eb-82ce-3908cd4e9308,1.622190e+12,9:00-14:00,,1.622191e+12,1622190602649,composition,148,4,...,"[{'t1': {'t1': 19, 't2': 32}, 't2': 47843486},...",Todat ill be goinf to uni to get some work don...,1.960784,0,39.496528,79,0,1,242.526316,89.222222
2599,20,fea772d0-b7c1-11eb-82ce-3908cd4e9308,1.622208e+12,14:00-20:00,,1.622208e+12,1622208283429,transcription,29,0,...,"[{'t1': {'t1': 36, 't2': 16}, 't2': 56000244},...",be gentle first with yourself,0.0,0,46.944557,25,0,0,264.750000,107.689655
2600,20,fea772d0-b7c1-11eb-82ce-3908cd4e9308,1.622208e+12,14:00-20:00,,1.622208e+12,1622208283429,transcription,39,1,...,"[{'t1': {'t1': 1, 't2': 26}, 't2': 56008859}, ...",make each day your masterpiece,0.0,0,42.283298,26,0,0,293.586207,100.000000
2601,20,fea772d0-b7c1-11eb-82ce-3908cd4e9308,1.622208e+12,14:00-20:00,,1.622208e+12,1622208283429,transcription,43,2,...,"[{'t1': {'t1': 35, 't2': 11}, 't2': 56018984},...",live life to the fullest,0.0,0,34.740651,24,0,0,251.212121,79.487179


Storing to `.csv`, choose which one is useful

In [None]:
# Write the fatigue scores and the explicit-task metrics to <source>/parsed.
outpath = source / 'parsed'
outpath.mkdir(exist_ok=True)

df_fatigue.to_csv(outpath / 'fatigue_measurements.csv', index=False)
df_explicit.to_csv(outpath / 'explicit_measurements.csv', index=False)

# report the real destination instead of a hard-coded path
print(f'Output stored in {outpath}')

Output stored in /local/parsed
