# 1. Local links

In [None]:
from pathlib import Path
import pandas as pd
import altair as alt
import numpy as np
import json

import ideafast_deviceselection as ifds

# Directory that holds the `ideafast_deviceselection` module file.
_package_dir = Path(ifds.__file__).parent

# Data root: one level above the package directory, then `local/keyboard`.
source = _package_dir.parent.absolute() / 'local/keyboard'

# Sub-folder under which the raw and split participant files are kept.
users = source / 'users'

### Variables

In [None]:
# One entry per study configuration: its name plus the start and end of its
# window. The trailing "Z" in each literal makes the parsed timestamps
# timezone-aware (UTC).
study_configs = [
    dict(
        name='Participants Study Config',
        start=pd.to_datetime("2021-05-21T00:00:00.000Z"),
        end=pd.to_datetime("2021-06-04T00:00:00.000Z"),
    ),
    dict(
        name='Participants Study Config June',
        start=pd.to_datetime("2021-06-01T00:00:00.000Z"),
        end=pd.to_datetime("2021-06-16T00:00:00.000Z"),
    ),
]

# 2. Parse JSON - split in subcategories

In [None]:
source_files = users / 'raw'
# Regular files in the raw folder (sub-directories are skipped).
files = [file for file in source_files.iterdir() if file.is_file()]

# prep folders

imp_path = users / 'implicit'
exp_path = users / 'explicit'
oth_path = users / 'other'
imp_path.mkdir(exist_ok=True)
exp_path.mkdir(exist_ok=True)
oth_path.mkdir(exist_ok=True)

# Keys under `completedTasks` that may hold the explicit study data, in order
# of preference.
# NOTE(review): this cell used `study_mode` / `study_mode_june`, which are not
# defined in any visible cell. The keys are presumed to be the study
# configuration names from the Variables cell -- confirm against the raw JSON.
study_task_keys = [config['name'] for config in study_configs]


def _write_json(path, payload):
    """Write *payload* to *path* as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as w:
        json.dump(payload, w, ensure_ascii=False, indent=4)


# split all files

for file in sorted(source_files.glob('*.json')):
    # Read explicitly as UTF-8 so decoding matches how the split files are
    # written, independent of the platform default encoding. The handle is
    # closed before any output is written.
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    tasks = data['completedTasks']
    implicit = tasks['implicit_mode']
    # First study key that holds non-empty data, otherwise None.
    explicit = next((tasks[key] for key in study_task_keys if tasks.get(key)), None)
    other = {key: data[key] for key in ['config', 'device']}

    # store implicit data
    _write_json(imp_path / file.name, implicit)

    # store explicit data
    if not explicit:
        print(f'No study data found for {file.stem}')
        # NOTE(review): as before, this also skips writing the `other` file
        # for this participant -- confirm that is intended.
        continue

    _write_json(exp_path / file.name, explicit)

    # store other data
    _write_json(oth_path / file.name, other)

# 3. Converting to DataFrame

Below, the JSON data is transformed into dataframes, adding as many relevant metrics as possible. These cells need to be re-run if/when the source data is altered. Saving the dataframes to `.csv` files allows the other notebooks to open and visualise them.

## Implicit data

A few errors/discards can occur. In any case, each timestamp contains a `"phrases"` key:
```json
"timestamp": {
    "phrases": []
}
```
Known errors are stored as (listing examples - these don't occur at the same time):
```json
{
    "error": "something went wrong with input buffer calculation"
},
{
    "discarded": "edit box was not empty"
},
{
    "discarded": "error calculating input buffer"
},
{
    "words-per-minute": "invalid value"
}

```
Calculations with `words-per-minute` also need checking, as its value can be `"invalid value"`.

Based on the data structure (`"phrases"` being a list), we assume multiple phrases can exist.

Metrics calculation below is based on [André Santos' code](https://github.com/WildKey-Dev/ideafast-keyboard-study-creator) for the report generation.


In [None]:
# Metrics of interest
pre_calculated_metrics = [
    'action-count',
    'auto-correct',
    'corrected-error-rate',
    'correction-action-count',
    'cursor-changes',
    'entry-action-count',
    'error-correction-attempts',
    'insertions-error-rate',
    'omission-error-rate',
    'select-suggestions',
    'substitutions-error-rate',
    'total-changed-characters',
    'total-error-rate',
    'uncorrected-error-rate',
    'voice-input',
    'words-per-minute',
    'written-characters',
    'written-numbers',
    'written-special-characters',
]

columns = ['participant', 'timestamp', 'day', 'day_relative', 'quality'] + pre_calculated_metrics


def _phrase_quality(phrase):
    """Label one phrase record as 'valid', 'error' or 'discarded'.

    A record is an 'error' when it has an "error" key, when its "discarded"
    message contains "error", or when "words-per-minute" is "invalid value".
    Any other record with a "discarded" key is 'discarded'; the rest are
    'valid'.
    """
    if 'error' in phrase:
        return 'error'
    if 'discarded' in phrase and 'error' in phrase['discarded']:
        return 'error'
    if phrase.get('words-per-minute') == "invalid value":
        return 'error'
    if 'discarded' in phrase:
        return 'discarded'
    return 'valid'


# One frame per participant file; concatenated once after the loop instead of
# growing `df` on every iteration.
frames = []

for count, file in enumerate(sorted(imp_path.glob('*.json')), start=1):
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    rows = []
    for raw_timestamp, value in data.items():
        # JSON object keys are strings. Convert to a number first so the
        # value is interpreted as epoch milliseconds rather than relying on
        # string handling inside `to_datetime`. Done once per key, not once
        # per phrase.
        timestamp = pd.to_datetime(pd.to_numeric(raw_timestamp), unit='ms')
        day = timestamp.normalize()

        for phrase in value['phrases']:
            quality = _phrase_quality(phrase)

            if quality == 'valid':
                metrics = [phrase.get(m, np.nan) for m in pre_calculated_metrics]
            else:
                # Metrics of non-valid records are not used.
                metrics = [np.nan] * len(pre_calculated_metrics)

            # `day_relative` is filled in below, once the first day is known.
            rows.append([count, timestamp, day, None, quality] + metrics)

    if not rows:
        # Nothing recorded in this file; there is no first day to compare to.
        continue

    sub_df = pd.DataFrame(rows, columns=columns)

    # Whole days elapsed since this participant's first recorded day.
    sub_df['day_relative'] = (sub_df['day'] - sub_df['day'].min()) / np.timedelta64(1, 'D')

    frames.append(sub_df)

# `pd.concat` raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

df

Unnamed: 0,participant,timestamp,day,day_relative,quality,action-count,auto-correct,corrected-error-rate,correction-action-count,cursor-changes,...,select-suggestions,substitutions-error-rate,total-changed-characters,total-error-rate,uncorrected-error-rate,voice-input,words-per-minute,written-characters,written-numbers,written-special-characters
0,1,2021-05-20 10:37:03.121,2021-05-20,0.0,discarded,,,,,,...,,,,,,,,,,
1,1,2021-05-20 10:43:21.293,2021-05-20,0.0,error,,,,,,...,,,,,,,,,,
2,1,2021-05-20 10:43:21.760,2021-05-20,0.0,error,,,,,,...,,,,,,,,,,
3,1,2021-05-20 10:43:22.329,2021-05-20,0.0,error,,,,,,...,,,,,,,,,,
4,1,2021-05-20 10:43:22.770,2021-05-20,0.0,error,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50269,20,2021-06-04 10:08:02.425,2021-06-04,16.0,discarded,,,,,,...,,,,,,,,,,
50270,20,2021-06-04 10:11:28.941,2021-06-04,16.0,valid,10.0,0.0,0.00,0.0,0.0,...,0.0,0.0,10.0,0.00,0.0,0.0,35.471475,8.0,2.0,0.0
50271,20,2021-06-04 10:12:01.624,2021-06-04,16.0,discarded,,,,,,...,,,,,,,,,,
50272,20,2021-06-04 10:13:15.257,2021-06-04,16.0,valid,58.0,3.0,6.25,17.0,0.0,...,1.0,0.0,34.0,6.25,0.0,0.0,56.022409,35.0,0.0,0.0


Filtering data to only include days from within the study participation

In [None]:
# Walk the implicit files again in the same sorted order, so each position
# matches the participant number produced by the same enumeration.

second_wave_participants = ['M79iKh', 'VaH3bx', 'XtAs3f']

# A file belongs to the second wave when the first six characters of its
# stem are one of the listed identifiers.
second_wave_participants_num = [
    number
    for number, json_file in enumerate(sorted(imp_path.glob('*.json')), start=1)
    if json_file.stem[:6] in second_wave_participants
]

# TODO: filter based on timeframes (see top of notebook) and participant number

print(second_wave_participants_num)

[8, 11, 12]


Storing to `.csv`; choose whichever output file is useful.

In [None]:
# Output folder under the data root; created on first use.
outpath = source / 'output'
outpath.mkdir(exist_ok=True)

# File name -> frame to write: the full table, and only the rows whose
# quality label is 'valid'.
exports = {
    'implicit_raw.csv': df,
    'implicit_only_valid.csv': df[df['quality'] == 'valid'],
}
for filename, frame in exports.items():
    frame.to_csv(outpath / filename, index=False)