# 1. Local links

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json

import ideafast_deviceselection as ifds

# Keyboard data lives two levels above the ideafast_deviceselection module file
source = Path(ifds.__file__).parent.parent.absolute() / 'local/keyboard'
users = source / 'users'

# raw JSON exports that the cells below split up
source_files = users / 'raw'

# prep folders: one output folder per sub-category of the raw data
imp_path, exp_path, oth_path = (
    users / name for name in ('implicit', 'explicit', 'other')
)
for folder in (imp_path, exp_path, oth_path):
    folder.mkdir(exist_ok=True)



# 2. Parse JSON - split in subcategories 

In [None]:
# Split every raw export into three files: implicit typing data, explicit
# (study task) data, and the remaining config/device information.
#
# NOTE(review): `study_mode` and `study_mode_june` are not defined in the cells
# visible here; they must already hold the `completedTasks` keys of the two
# study configurations before this cell runs - confirm where they are set.

for file in sorted(source_files.glob('*.json')):
    # Read explicitly as UTF-8 (the outputs below are written as UTF-8 too) and
    # close the input before any output file is opened.
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    completed = data['completedTasks']
    implicit = completed['implicit_mode']
    # take the data of whichever study configuration is present and non-empty
    explicit = completed.get(study_mode) or completed.get(study_mode_june)
    other = {key: data[key] for key in ['config', 'device']}

    # store implicit data
    with open(users / 'implicit' / file.name, 'w', encoding='utf-8') as w:
        json.dump(implicit, w, ensure_ascii=False, indent=4)

    # store other data
    # Written before the explicit check so that a file without study data
    # still gets its config/device information stored.
    with open(users / 'other' / file.name, 'w', encoding='utf-8') as w:
        json.dump(other, w, ensure_ascii=False, indent=4)

    # store explicit data
    if not explicit:
        print(f'No study data found for {file.stem}')
        continue

    with open(users / 'explicit' / file.name, 'w', encoding='utf-8') as w:
        json.dump(explicit, w, ensure_ascii=False, indent=4)

# 3. Converting to DataFrame

Below, the JSON data is transformed into dataframes, adding as many relevant metrics as possible. These cells need to be re-run if/when the source data is altered. Saving the dataframes into `.csv` files allows the other notebooks to open and visualise them.

## Implicit data

A few errors/discards can occur. In any case, each timestamp contains a `"phrases"` key:
```json
"timestamp": {
    "phrases": []
}
```
Known or found errors are stored as (listing examples - these don't occur at the same time):

1. `"error": "something went wrong with input buffer calculation"`
2. `"discarded": "edit box was not empty"`
3. `"discarded": "error calculating input buffer"`
4. `"words-per-minute": "invalid value"`
5. `"timestamp": 1234567890  # missing 'discarded' - assuming this is a discard`
6. Some values might not exist even though others were appropriately calculated

Calculating with `words-per-minute` also needs checking, as its value can be `"invalid value"`.

Based on the data structure (`"phrases"` being a list) we assume multiple phrases can exist.

Metrics calculation below is based on [André Santos' code](https://github.com/WildKey-Dev/ideafast-keyboard-study-creator) for the report generation.


In [None]:
# One entry per study configuration: (name, first instant, last instant).
# The order matters: the parsing cells index this list with 0 or 1.
_study_periods = (
    ('Participants Study Config',
     "2021-05-21T00:00:00.000Z", "2021-06-04T00:00:00.000Z"),
    ('Participants Study Config June',
     "2021-06-01T00:00:00.000Z", "2021-06-15T00:00:00.000Z"),
)

study_configs = [
    {'name': name, 'start': pd.to_datetime(start), 'end': pd.to_datetime(end)}
    for name, start, end in _study_periods
]
print('study_config variable stored')

study_config variable stored


In [None]:

# Metrics of interest
# Keys that are looked up on every phrase record ...
_recorded_metrics = [
    'action-count',
    'auto-correct',
    'corrected-error-rate',
    'correction-action-count',
    'cursor-changes',
    'entry-action-count',
    'error-correction-attempts',
    'flight-time',
    'holdtime-deviations',
    'insertions-error-rate',
    'language',
    'omission-error-rate',
    'select-suggestions',
    'substitutions-error-rate',
    'time-per-word',
    'total-changed-characters',
    'total-error-rate',
    'touch-offset-entered',
    'uncorrected-error-rate',
    'voice-input',
    'words-per-minute',
    'written-characters',
    'written-numbers',
    'written-special-characters',
]
# ... followed by the two averages that the parsing loop fills in itself.
_derived_metrics = [
    'mean-flight-time',
    'mean-holdtime-deviations',
]
pre_calculated_metrics = _recorded_metrics + _derived_metrics

# Positions (within the metric list) of the entries that get post-processed
_metric_index = {name: pos for pos, name in enumerate(pre_calculated_metrics)}
lang_pos = _metric_index['language']
flight_pos = _metric_index['flight-time']
holdtime_pos = _metric_index['holdtime-deviations']
mean_flight_pos = _metric_index['mean-flight-time']
mean_holdtime_pos = _metric_index['mean-holdtime-deviations']

# Bookkeeping columns first, then one column per metric
columns = [
    'participant', 'timestamp', 'day', 'day_relative', 'quality',
    *pre_calculated_metrics,
]
df = pd.DataFrame(columns=columns)

# participants joining in the second period
second_wave = ['M79iKh', 'VaH3bx', 'XtAs3f']
datapoints_outside_study = 0


def _classify_phrase(phrase):
    """Return 'valid', 'error' or 'discarded' for one phrase record.

    The case numbers refer to the list of known errors in the markdown cell
    of the "Implicit data" section.
    """
    # case 1
    if 'error' in phrase:
        return 'error'
    # case 3
    if 'discarded' in phrase and 'error' in phrase['discarded']:
        return 'error'
    # case 4
    if phrase.get('words-per-minute') == "invalid value":
        return 'error'
    # case 2 and 5
    if 'discarded' in phrase or 'words-per-minute' not in phrase:
        return 'discarded'
    return 'valid'


# One dataframe per participant file; concatenated once after the loop instead
# of growing `df` on every iteration.
frames = []

for count, file in enumerate(sorted(imp_path.glob('*.json')), start=1):
    wave = 1 if file.stem[:6] in second_wave else 0
    study_start = study_configs[wave]['start']
    study_end = study_configs[wave]['end']

    # the split files were written as UTF-8, so read them back the same way
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    lst = []
    for raw_timestamp, value in data.items():
        phrases = value['phrases']

        # JSON keys are strings; cast to int explicitly because passing a
        # string together with `unit` to `to_datetime` is deprecated.
        # Converted once per key rather than once per phrase.
        timestamp = pd.to_datetime(int(raw_timestamp), unit='ms', utc=True)

        # filter records outside of the study period (counted per phrase)
        if timestamp < study_start or timestamp > study_end:
            datapoints_outside_study += len(phrases)
            continue

        day = timestamp.normalize()

        for phrase in phrases:
            # detecting/filtering errors - see list notebook cell above
            quality = _classify_phrase(phrase)

            if quality != 'valid':
                metrics = [np.nan] * len(pre_calculated_metrics)
            else:
                metrics = [phrase.get(m, np.nan) for m in pre_calculated_metrics]

                # case 6
                # A row only stays 'valid' when its language entry is a dict
                # that carries a 'tag'; otherwise a 'valid' row would end up
                # with a NaN language.
                language = metrics[lang_pos]
                if isinstance(language, dict):
                    # pull out iso language only (tag)
                    metrics[lang_pos] = language.get('tag', np.nan)
                    # calculate average flight_times, and hold_times
                    metrics[mean_flight_pos] = np.average(metrics[flight_pos])
                    metrics[mean_holdtime_pos] = np.average(metrics[holdtime_pos])
                if not isinstance(language, dict) or 'tag' not in language:
                    quality = 'error'

            # day_relative is filled in below, once the first day is known
            lst.append([count, timestamp, day, None, quality] + metrics)

    # nothing inside the study period for this file: no rows to add
    if not lst:
        continue

    sub_df = pd.DataFrame(lst, columns=columns)

    # days since this participant's first recorded day
    sub_df['day_relative'] = (sub_df['day'] - sub_df['day'].min())/np.timedelta64(1, 'D')

    frames.append(sub_df)

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

df

Unnamed: 0,participant,timestamp,day,day_relative,quality,action-count,auto-correct,corrected-error-rate,correction-action-count,cursor-changes,entry-action-count,error-correction-attempts,flight-time,holdtime-deviations,insertions-error-rate,language,omission-error-rate,select-suggestions,substitutions-error-rate,time-per-word,total-changed-characters,total-error-rate,touch-offset-entered,uncorrected-error-rate,voice-input,words-per-minute,written-characters,written-numbers,written-special-characters,mean-flight-time,mean-holdtime-deviations
0,1,2021-05-21 07:37:07.930000+00:00,2021-05-21 00:00:00+00:00,0.0,valid,8.0,0.0,14.285715,1.0,0.0,7.0,1.0,"[184, 267, 401, 242, 125, 1795, 1169, 1010]","[66, 75, 83, 92, 117, 83, 83, -1, 58, 49]",0.0,en-GB,0.0,0.0,0.0,[5192],8.0,14.285715,"[{'t1': {'t1': 5, 't2': 16}, 't2': 148416373},...",0.0,0.0,13.864818,7.0,0.0,1.0,649.125,70.500000
1,1,2021-05-21 07:37:09.121000+00:00,2021-05-21 00:00:00+00:00,0.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,2021-05-21 07:37:09.122000+00:00,2021-05-21 00:00:00+00:00,0.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,2021-05-21 07:53:01.918000+00:00,2021-05-21 00:00:00+00:00,0.0,valid,5.0,0.0,25.000000,1.0,0.0,4.0,1.0,"[192, 250, 3032, 367]","[75, 75, 66, 33, -1, 50]",0.0,en-GB,0.0,0.0,0.0,[3836],5.0,25.000000,"[{'t1': {'t1': 25, 't2': 28}, 't2': 149370606}...",0.0,0.0,9.372559,4.0,0.0,0.0,960.250,49.666667
4,1,2021-05-21 07:53:02.057000+00:00,2021-05-21 00:00:00+00:00,0.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35767,20,2021-06-03 23:10:28.611000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
35768,20,2021-06-03 23:11:28.844000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
35769,20,2021-06-03 23:14:02.020000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,
35770,20,2021-06-03 23:34:07.662000+00:00,2021-06-03 00:00:00+00:00,13.0,discarded,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Temporarily lift the column limit so that every column is printed.
# The fully qualified option name is used because the short pattern
# 'max_columns' matches more than one option in recent pandas versions
# (e.g. 'styler.render.max_columns') and then raises an OptionError.
pd.set_option('display.max_columns', None)
# uncomment below to check all columns briefly
#print(df.head())
pd.reset_option('display.max_columns')

In [None]:
# Report how many records fell outside the study period, then the number of
# rows per quality label.
quality_counts = df["quality"].value_counts()
print(
    f'data points outside of study period: {datapoints_outside_study}',
    '\nOverview datapoints:',
    quality_counts,
    sep='\n',
)

data points outside of study period: 14502

Overview datapoints:
valid        19931
discarded     9340
error         6501
Name: quality, dtype: int64


#### Checking quality of filters / cleaning process

In [None]:
# any NaN in pre-calculated metrics?
# Show all columns of the result. The fully qualified option name is used
# because the short pattern 'max_columns' matches more than one option in
# recent pandas versions and then raises an OptionError.
pd.set_option('display.max_columns', None)
# rows labelled 'valid' that still contain at least one missing value
df[(df['quality']=='valid') & (df.isna().any(axis=1))]

Unnamed: 0,participant,timestamp,day,day_relative,quality,action-count,auto-correct,corrected-error-rate,correction-action-count,cursor-changes,entry-action-count,error-correction-attempts,flight-time,holdtime-deviations,insertions-error-rate,language,omission-error-rate,select-suggestions,substitutions-error-rate,time-per-word,total-changed-characters,total-error-rate,touch-offset-entered,uncorrected-error-rate,voice-input,words-per-minute,written-characters,written-numbers,written-special-characters,mean-flight-time,mean-holdtime-deviations
19645,9,2021-05-31 10:51:17.377000+00:00,2021-05-31 00:00:00+00:00,10.0,valid,18.0,1.0,23.076923,8.0,0.0,18.0,0.0,"[184, 92, 109, 169, 57, 117, 176, 100, 92]","[50, 50, 50, 67, 69, 57, 33, 50, 50, 75]",0.0,,0.0,0.0,0.0,"[553, 368]",16.0,23.076923,"[{'t1': {'t1': 7, 't2': 8}, 't2': 391504801}, ...",0.0,0.0,109.489051,9.0,0.0,0.0,121.777778,55.1


### Variables

Storing to `.csv`, choose which one is useful

In [None]:
# Store the parsed implicit data as CSV in a 'parsed' folder below `source`.
outpath = source / 'parsed'
outpath.mkdir(parents=True, exist_ok=True)

# every row, including the ones labelled 'discarded' or 'error'
df.to_csv(outpath / 'implicit_raw.csv', index=False)
# only the rows labelled 'valid'
df[df['quality'] == 'valid'].to_csv(outpath / 'implicit_only_valid.csv', index=False)

# report the folder that was actually written to
print(f'Output stored in {outpath}')

Output stored in /local/parsed


## Explicit data

A few errors/discards can occur. In any case, each timestamp contains a `"phrases"` key:
```json
"timestamp": {
    "phrases": []
}
```
Known or found errors are stored as (listing examples - these don't occur at the same time):

1. -- (none that we are aware of atm)

Based on the data structure (`"phrases"` being a list) we assume multiple phrases can exist.

In [None]:
# Metrics of interest
# Keys that are looked up on every phrase record ...
_recorded_phrase_metrics = [
    'action-count',
    'auto-correct',
    'corrected-error-rate',
    'correction-action-count',
    'cursor-changes',
    'entry-action-count',
    'error-correction-attempts',
    'flight-time',
    'holdtime-deviations',
    'input-stream',
    'input-stream-og',
    'insertions-error-rate',
    'language',
    'omission-error-rate',
    'select-suggestions',
    'substitutions-error-rate',
    'target_phrase',
    'time-per-word',
    'total-changed-characters',
    'total-error-rate',
    'touch-offset-entered',
    'transcribe',
    'uncorrected-error-rate',
    'voice-input',
    'words-per-minute',
    'written-characters',
    'written-numbers',
    'written-special-characters',
]
# ... followed by the two averages that the parsing loop fills in itself.
_derived_phrase_metrics = [
    'mean-flight-time',
    'mean-holdtime-deviations',
]
phrases_pre_calculated_metrics = _recorded_phrase_metrics + _derived_phrase_metrics

# Positions (within the phrase metric list) of the post-processed entries.
# These names are shared with the implicit section and are overwritten here.
_phrase_metric_index = {
    name: pos for pos, name in enumerate(phrases_pre_calculated_metrics)
}
lang_pos = _phrase_metric_index['language']
flight_pos = _phrase_metric_index['flight-time']
holdtime_pos = _phrase_metric_index['holdtime-deviations']
mean_flight_pos = _phrase_metric_index['mean-flight-time']
mean_holdtime_pos = _phrase_metric_index['mean-holdtime-deviations']

# Additional column groups of the explicit dataframe
fatigue_details = ['fatigue_score']
task_details = ['end', 'finished', 'init']  # 'finished' is a boolean
prompt_details = ['promptId', 'subType', 'type']

# Bookkeeping columns first, then the detail groups, then the phrase metrics
columns = [
    'participant', 'sesssion_avg_timestamp', 'timestamp', 'day', 'day_relative', 'quality',
    *fatigue_details,
    *prompt_details,
    *task_details,
    *phrases_pre_calculated_metrics,
]

# hardcode fatigue measurement task
fatigue_task_id = 'f1b46be-bdfb-dc63-b5a0-0a1d46c280'
fatigue_scale_question_id = 'd63cded-bf4-cbd6-0d23-00b1f631b2f7'

# participants joining in the second period, and following the later-config
second_wave = ['M79iKh', 'VaH3bx', 'XtAs3f']

# Parse the explicit (study task) files into a dataframe with the layout
# given by `columns`.
#
# `df` is rebuilt from scratch here: the explicit rows have a different set of
# columns than the implicit dataframe and must not be appended onto it.


def _classify_phrase(phrase):
    """Return 'valid', 'error' or 'discarded' for one phrase record.

    Same checks as for the implicit data; the case numbers refer to the list
    of known errors in the "Implicit data" section.
    """
    # case 1
    if 'error' in phrase:
        return 'error'
    # case 3
    if 'discarded' in phrase and 'error' in phrase['discarded']:
        return 'error'
    # case 4
    if phrase.get('words-per-minute') == "invalid value":
        return 'error'
    # case 2 and 5
    if 'discarded' in phrase or 'words-per-minute' not in phrase:
        return 'discarded'
    return 'valid'


def _phrase_timestamp(phrase, task):
    """Return the UTC timestamp for a phrase, or None when none is found.

    NOTE(review): nothing visible in this notebook shows where an explicit
    record stores its time. This looks for a 'timestamp' entry on the phrase
    first and falls back to the task's 'init' entry, treating both as epoch
    milliseconds like the implicit data - confirm against a raw file.
    """
    for holder, key in ((phrase, 'timestamp'), (task, 'init')):
        raw = holder.get(key)
        if raw is None:
            continue
        try:
            return pd.to_datetime(int(raw), unit='ms', utc=True)
        except (TypeError, ValueError):
            # not an integer number of milliseconds; try the next candidate
            continue
    return None


# counters for this section
datapoints_outside_study = 0
phrases_without_timestamp = 0

# One dataframe per participant file; concatenated once after the loop.
frames = []

for count, file in enumerate(sorted(exp_path.glob('*.json')), start=1):
    wave = 1 if file.stem[:6] in second_wave else 0
    study_start = study_configs[wave]['start']
    study_end = study_configs[wave]['end']

    # the split files were written as UTF-8, so read them back the same way
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    # First pass: separate the fatigue answers from the typing tasks.
    # NOTE(review): the fatigue score is attached to all tasks that share its
    # timeframeId - presumably one fatigue measurement per time frame; confirm.
    fatigue_by_timeframe = {}
    typing_tasks = []
    for identifier, value in data.items():
        # identifier consists of [timeframeId]_[taskId]
        timeframe_id, task_id = identifier.split('_')

        if task_id == fatigue_task_id:
            fatigue_by_timeframe[timeframe_id] = (
                value.get(fatigue_scale_question_id, {}).get('response', np.nan)
            )
        else:
            typing_tasks.append((timeframe_id, value))

    # Second pass: one row per phrase.
    lst = []
    for timeframe_id, value in typing_tasks:
        fatigue = fatigue_by_timeframe.get(timeframe_id, np.nan)
        task_values = [value.get(key, np.nan) for key in task_details]

        for phrase in value['phrases']:
            timestamp = _phrase_timestamp(phrase, value)
            if timestamp is None:
                # without a time the study-period filter cannot be applied
                phrases_without_timestamp += 1
                continue

            # filter records outside of the study period
            if timestamp < study_start or timestamp > study_end:
                datapoints_outside_study += 1
                continue

            # detecting/filtering errors - same checks as for the implicit data
            quality = _classify_phrase(phrase)

            if quality != 'valid':
                metrics = [np.nan] * len(phrases_pre_calculated_metrics)
            else:
                metrics = [phrase.get(m, np.nan) for m in phrases_pre_calculated_metrics]

                # case 6
                # A row only stays 'valid' when its language entry is a dict
                # that carries a 'tag'.
                language = metrics[lang_pos]
                if isinstance(language, dict):
                    # pull out iso language only (tag)
                    metrics[lang_pos] = language.get('tag', np.nan)
                    # calculate average flight_times, and hold_times
                    metrics[mean_flight_pos] = np.average(metrics[flight_pos])
                    metrics[mean_holdtime_pos] = np.average(metrics[holdtime_pos])
                if not isinstance(language, dict) or 'tag' not in language:
                    quality = 'error'

            # NOTE(review): prompt fields are looked up on the phrase first and
            # on the task record second - confirm which level stores them.
            prompt_values = [
                phrase.get(key, value.get(key, np.nan)) for key in prompt_details
            ]

            # Row layout follows `columns`:
            # bookkeeping (6) + fatigue (1) + prompt + task + phrase metrics.
            # TODO(review): 'sesssion_avg_timestamp' is left empty (NaT); its
            # definition is not given anywhere in this notebook.
            lst.append(
                [count, pd.NaT, timestamp, timestamp.normalize(), None, quality, fatigue]
                + prompt_values + task_values + metrics
            )

    # nothing usable for this file: no rows to add
    if not lst:
        continue

    # built once per file (not once per task) so that rows are not duplicated
    sub_df = pd.DataFrame(lst, columns=columns)

    # days since this participant's first recorded day
    sub_df['day_relative'] = (sub_df['day'] - sub_df['day'].min())/np.timedelta64(1, 'D')

    frames.append(sub_df)

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

if phrases_without_timestamp:
    print(f'phrases skipped because no timestamp was found: {phrases_without_timestamp}')

df

NameError: name 'timestamp' is not defined