# 1. Local links

In [None]:
from pathlib import Path
import pandas as pd

import ideafast_deviceselection as ifds

# Keyboard data lives in `local/keyboard`, next to the directory that
# contains the `ideafast_deviceselection` module file.
source = Path(ifds.__file__).parent.parent.absolute().joinpath('local', 'keyboard')
# Per-user data sits in the `users` sub-folder.
users = source.joinpath('users')

# 2. Parse JSON - split in subcategories

In [None]:
import json

# Keys under `completedTasks` that may hold the explicit (study) data.
study_mode = 'Participants Study Config'
study_mode_june = 'Participants Study Config June'
source_files = users / 'raw'
# Sorted so files are processed (and logged) in a reproducible order.
files = sorted(file for file in source_files.glob('**/*.json') if file.is_file())

# prep folders

imp_path = users / 'implicit'
exp_path = users / 'explicit'
oth_path = users / 'other'
for folder in (imp_path, exp_path, oth_path):
    folder.mkdir(exist_ok=True)


def _dump_json(payload, target):
    """Write *payload* to the path *target* as indented, UTF-8 encoded JSON."""
    with open(target, 'w', encoding='utf-8') as w:
        json.dump(payload, w, ensure_ascii=False, indent=4)


# split all files
# NOTE: raw files are found recursively but written out by bare file name,
# so two raw files with the same name in different sub-folders overwrite
# each other.

for file in files:
    # Read as UTF-8 to match the encoding used for the files written below;
    # the handle is closed before any output is produced.
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    completed = data['completedTasks']
    implicit = completed['implicit_mode']
    # Study data is stored under one of two keys; an empty value under the
    # first key falls through to the second.
    explicit = completed.get(study_mode) or completed.get(study_mode_june)
    other = {key: data[key] for key in ['config', 'device']}

    # store implicit data
    _dump_json(implicit, imp_path / file.name)

    # store other data -- written regardless of whether study data exists,
    # since config/device information does not depend on it
    _dump_json(other, oth_path / file.name)

    # store explicit data
    if explicit:
        _dump_json(explicit, exp_path / file.name)
    else:
        print(f'No study data found for {file.stem}')

# 3. Preliminary metrics

## Implicit data

A few errors/discards can occur. In any case, each timestamp contains a `"phrases"` key:
```json
"timestamp": {
    "phrases": []
}
```
Known errors are stored as (listing examples - these don't occur at the same time):
```json
{
    "error": "something went wrong with input buffer calculation"
},
{
    "discarded": "edit box was not empty",
},
{
    "discarded": "error calculating input buffer",
}

```
Calculating with `words-per-minute` also needs checking, as its value can be `"invalid value"`.

Based on the data structure (`"phrases"` being a list) we assume multiple phrases can exist.

Metrics calculation below is based on [André Santos' code](https://github.com/WildKey-Dev/ideafast-keyboard-study-creator) for the report generation.


In [None]:
import numpy as np

error_keys = ['error', 'discarded']
discarded_values = ['edit box was not empty', 'error calculating input buffer']

# Sorted so participant numbers are assigned reproducibly; the order in which
# glob yields files depends on the filesystem.
files = sorted(file for file in imp_path.glob('**/*.json') if file.is_file())

columns = ['participant', 'datetime', 'date_norm', 'date_rel', 'valid', 'wpm', 'error_count']


def _phrase_validity(phrase):
    """Classify a phrase record as 'yes', 'discarded' or 'no'.

    'no'        -- the record has an 'error' key, or a 'discarded' reason
                   whose text contains 'error'
    'discarded' -- the record has any other 'discarded' reason
    'yes'       -- everything else
    """
    if 'error' in phrase:
        return 'no'
    if 'discarded' in phrase:
        return 'no' if 'error' in phrase['discarded'] else 'discarded'
    return 'yes'


def _number_or_none(value):
    """Return *value* when it is an int or float, otherwise None.

    Filters out non-numeric placeholders such as "invalid value". Booleans
    are rejected explicitly because `bool` is a subclass of `int`.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    return None


# One DataFrame per participant file; concatenated once after the loop
# instead of growing `df` on every iteration.
frames = []

for count, file in enumerate(files, start=1):

    # The split step wrote these files as UTF-8, so read them the same way.
    with open(file, encoding='utf-8') as r:
        data = json.load(r)

    rows = []

    for timestamp, value in data.items():
        # Keys are converted as millisecond epoch values; done once per
        # timestamp rather than once per phrase.
        moment = pd.to_datetime(timestamp, unit='ms')

        for phrase in value['phrases']:
            rows.append([
                count,
                moment,
                moment.normalize(),
                None,  # date_rel, filled in per participant below
                _phrase_validity(phrase),
                _number_or_none(phrase.get('words-per-minute')),
                phrase.get('correction-action-count'),
            ])

    if not rows:
        # Nothing recorded for this participant; no rows to contribute.
        continue

    sub_df = pd.DataFrame(rows, columns=columns)
    # Days elapsed since this participant's first recorded day.
    first_day = sub_df['date_norm'].min()
    sub_df['date_rel'] = (sub_df['date_norm'] - first_day) / np.timedelta64(1, 'D')
    frames.append(sub_df)

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)


# valid can be 'yes', 'discarded', or 'no'

# useful metrics
# total_error = len(df[df['valid']=='no'])
# total_discarded = len(df[df['valid']=='discarded'])
# total_valid = len(df[df['valid']=='yes'])

df

Unnamed: 0,participant,datetime,date_norm,date_rel,valid,wpm,error_count
0,1,2021-05-17 15:12:19.009,2021-05-17,0.0,yes,29.005525,15.0
1,1,2021-05-17 15:12:19.373,2021-05-17,0.0,discarded,,
2,1,2021-05-17 15:12:19.989,2021-05-17,0.0,discarded,,
3,1,2021-05-17 15:23:13.282,2021-05-17,0.0,yes,36.564830,30.0
4,1,2021-05-17 15:58:21.077,2021-05-17,0.0,yes,35.268185,0.0
...,...,...,...,...,...,...,...
50269,20,2021-06-05 08:58:32.678,2021-06-05,15.0,yes,41.702867,1.0
50270,20,2021-06-05 09:54:38.630,2021-06-05,15.0,yes,61.261261,0.0
50271,20,2021-06-05 09:54:41.022,2021-06-05,15.0,yes,82.144894,8.0
50272,20,2021-06-05 09:54:42.234,2021-06-05,15.0,discarded,,


In [None]:
# Per participant and study day: mean/min/max of words-per-minute and of the
# correction count, computed over valid phrases only.
valid_phrases = df.loc[df['valid'] == 'yes']
overview = valid_phrases.groupby(['participant', 'date_rel']).agg(
    wpm_mean=('wpm', 'mean'),
    wpm_min=('wpm', 'min'),
    wpm_max=('wpm', 'max'),
    error_mean=('error_count', 'mean'),
    error_min=('error_count', 'min'),
    error_max=('error_count', 'max'),
)

overview


Unnamed: 0_level_0,Unnamed: 1_level_0,wpm_mean,wpm_min,wpm_max,error_mean,error_min,error_max
participant,date_rel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,60.403619,4.359745,149.152542,11.526316,0.0,59.0
1,1.0,63.507001,3.963448,221.198157,11.528926,0.0,105.0
1,2.0,56.986372,3.150226,187.500000,12.390244,0.0,262.0
1,3.0,56.073064,8.500590,143.712575,4.718750,0.0,29.0
1,4.0,82.163179,3.860072,406.779661,11.400000,0.0,67.0
...,...,...,...,...,...,...,...
20,11.0,116.754155,71.352729,150.943396,2.800000,0.0,7.0
20,12.0,65.089155,23.555805,188.976378,4.650000,0.0,24.0
20,13.0,74.708171,74.708171,74.708171,0.000000,0.0,0.0
20,14.0,55.121958,24.339178,76.800000,8.500000,0.0,48.0


In [None]:
import altair as alt

# Line chart of the daily mean words-per-minute, one line per participant.
# The selection is bound to the legend: selected participants are drawn at
# full opacity, all others at 0.05.
selection = alt.selection_multi(fields=['participant'], bind='legend')
base = alt.Chart(overview.reset_index())

days_axis = alt.X('date_rel:O', axis=alt.Axis(title='Days into the study'))
wpm_axis = alt.Y('wpm_mean:Q', axis=alt.Axis(title='Average words-per-minute'))
highlight = alt.condition(selection, alt.value(1.0), alt.value(0.05))

wpm_lines = base.mark_line().encode(
    days_axis,
    wpm_axis,
    color='participant:N',
    opacity=highlight,
)

wpm_lines.add_selection(selection).properties(title='Average words-per-minute, per day')


In [None]:
import altair as alt



# Disabled alternative: one scatter row per participant.
# alt.Chart(overview.reset_index()).mark_point().encode(
#     x='date_rel:O',
#     y='wpm_mean',
#     row='participant:N',
# ).properties(
#     height=150
# )

# Print the first rows of the daily overview for each participant
# (groups are formed on the first index level).
per_participant = overview.groupby(level=0)
for participant, sub_df in per_participant:
    preview = sub_df.head()
    print(preview)

                       wpm_mean   wpm_min     wpm_max  error_mean  error_min  \
participant date_rel                                                           
1           0.0       60.403619  4.359745  149.152542   11.526316        0.0   
            1.0       63.507001  3.963448  221.198157   11.528926        0.0   
            2.0       56.986372  3.150226  187.500000   12.390244        0.0   
            3.0       56.073064  8.500590  143.712575    4.718750        0.0   
            4.0       82.163179  3.860072  406.779661   11.400000        0.0   

                      error_max  
participant date_rel             
1           0.0            59.0  
            1.0           105.0  
            2.0           262.0  
            3.0            29.0  
            4.0            67.0  
                       wpm_mean    wpm_min     wpm_max  error_mean  error_min  \
participant date_rel                                                            
2           0.0       44.356677  14.931