# 1. Local links

In [None]:
from pathlib import Path
import pandas as pd

import ideafast_deviceselection as ifds

# Keyboard data lives in 'local/keyboard' next to the package directory
# (the parent of the folder that holds the ideafast_deviceselection module file).
source = Path(ifds.__file__).parent.parent.absolute().joinpath('local', 'keyboard')
# Per-participant data sits in the 'users' sub-folder.
users = source.joinpath('users')

# 2. Parse JSON - split in subcategories

In [None]:
import json

# Keys under "completedTasks" under which the study data may be stored.
study_mode = 'Participants Study Config'
study_mode_june = 'Participants Study Config June'

# Gather every raw JSON export, including those in nested folders.
source_files = users.joinpath('raw')
files = [entry for entry in source_files.glob('**/*.json') if entry.is_file()]

# prep folders: one output directory per subcategory

imp_path = users.joinpath('implicit')
exp_path = users.joinpath('explicit')
oth_path = users.joinpath('other')
for folder in (imp_path, exp_path, oth_path):
    folder.mkdir(exist_ok=True)


# split all files: every raw export is divided into its implicit, explicit
# and other (config/device) parts, each written to its own folder under the
# original file name.

for file in files:
    # Read as UTF-8 to match the encoding used for the outputs below,
    # instead of relying on the platform default encoding.
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    completed = data['completedTasks']
    implicit = completed['implicit_mode']
    # the study data is stored under one of two possible keys
    explicit = completed.get(study_mode) or completed.get(study_mode_june)
    other = {key: data[key] for key in ['config', 'device']}

    # store implicit data
    with open(imp_path / file.name, 'w', encoding='utf-8') as w:
        json.dump(implicit, w, ensure_ascii=False, indent=4)

    # store other data; this does not depend on study data being present,
    # so it is written before the explicit-data check below
    with open(oth_path / file.name, 'w', encoding='utf-8') as w:
        json.dump(other, w, ensure_ascii=False, indent=4)

    # store explicit data (may be missing or empty for a participant)
    if not explicit:
        print(f'No study data found for {file.stem}')
        continue

    with open(exp_path / file.name, 'w', encoding='utf-8') as w:
        json.dump(explicit, w, ensure_ascii=False, indent=4)

# 3. Preliminary metrics

## Implicit data

A few errors/discards can occur. In any case, each timestamp contains a `"phrases"` key:
```json
"timestamp": {
    "phrases": []
}
```
Known errors are stored as (listing examples - these don't occur at the same time):
```json
{
    "error": "something went wrong with input buffer calculation"
},
{
    "discarded": "edit box was not empty",
},
{
    "discarded": "error calculating input buffer",
}

```
Calculating with `words-per-minute` also needs checking, as its value can be `"invalid value"`.

Based on the data structure (`"phrases"` being a list) we assume multiple phrases can exist.

Metrics calculation below is based on [André Santos' code](https://github.com/WildKey-Dev/ideafast-keyboard-study-creator) for the report generation.


In [None]:
error_keys = ['error', 'discarded']
discarded_values = ['edit box was not empty', 'error calculating input buffer']


def _phrase_validity(phrase):
    """Classify a phrase as 'yes', 'discarded' or 'no'.

    A phrase with an "error" key, or a "discarded" reason that mentions an
    error, is 'no'; any other discarded phrase is 'discarded'; everything
    else is 'yes'.
    """
    if 'error' in phrase:
        return 'no'
    if 'discarded' in phrase:
        return 'no' if 'error' in phrase['discarded'] else 'discarded'
    return 'yes'


def _phrase_wpm(phrase):
    """Return the numeric "words-per-minute" of a phrase, or None."""
    wpm = phrase.get('words-per-minute')
    # Exact type check: a missing value, a string such as "invalid value",
    # or a boolean all yield None.
    return wpm if type(wpm) in (int, float) else None


files = [file for file in imp_path.glob('**/*.json') if file.is_file()]

for file in files:

    # The implicit files were written as UTF-8, so read them the same way
    # instead of relying on the platform default encoding.
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    lst = []
    for timestamp, value in data.items():
        # JSON keys are strings. Cast to a number before converting, because
        # passing strings together with `unit` to pd.to_datetime is
        # deprecated. The conversion is the same for every phrase of a
        # timestamp, so it is done once per timestamp.
        moment = pd.to_datetime(pd.to_numeric(timestamp), unit='ms')

        for phrase in value['phrases']:
            lst.append([moment, _phrase_validity(phrase), _phrase_wpm(phrase)])

    # valid can be 'yes', 'discarded', or 'no'
    df = pd.DataFrame(lst, columns=['datetime', 'valid', 'wpm'])

    total_error = len(df[df['valid'] == 'no'])
    total_discarded = len(df[df['valid'] == 'discarded'])

    print(total_error)

    # just do one participant for the moment
    break

        
            