In [None]:
# default_exp gbe.wm.data_provider

# Working memory task
Here, we load the working memory task data into a dataframe for further processing.

In [None]:
%load_ext autoreload
%autoreload 2
from IPython.display import Video

In [None]:
#export
import os
from bs4 import BeautifulSoup
from fastcore.foundation import patch
from trr265.gbe.data_provider import GBEProvider
from trr265.data_provider import get_efficiently
import pandas as pd
import numpy as np
import xmltodict
import collections

## Getting the data

In [None]:
#export
class WMDataProvider(GBEProvider):
    '''Data provider for the working memory task, layered on top of GBEProvider.

    Construction is delegated unchanged to the parent class; the task-specific
    methods are attached to this class elsewhere in the notebook.
    '''
    def __init__(self, data_folder_path):
        # Nothing task-specific to set up here: hand the data folder to the parent.
        super().__init__(data_folder_path)

In [None]:
# The default below is a machine-specific path; set the TRR265_DATA_DIR environment
# variable to point at the data folder when running this notebook elsewhere.
dp = WMDataProvider(os.environ.get('TRR265_DATA_DIR', '/Users/hilmarzech/Projects/trr265/trr265/data/'))

### Parsing output strings

In [None]:
#export
@patch
def decode_wm_strings(self:WMDataProvider, gbe_data):
    '''Decode the 'WorkingMemoryGame' entries of `gbe_data` and cast `success` to int.

    The decoding itself is done by `decode_gbe_strings`; this method only
    converts the `success` column of the resulting dataframe to integers.
    '''
    decoded = self.decode_gbe_strings(gbe_data, 'WorkingMemoryGame')
    # Assign in place on the decoded frame so the same object is returned.
    decoded['success'] = decoded['success'].astype(int)
    return decoded

In [None]:
gbe_data = dp.get_gbe_data()
df = dp.decode_wm_strings(gbe_data)

> The raw data gives us trial ids.  However, the different trial types (no distractor, encoding distractor, delayed distractor) and levels (2 to 10) are not yet shown. 

In [None]:
df[['gbe_index','trial_number','timestarted','timesubmitted','timetaken','trialid','trialrot','success']]

Unnamed: 0,gbe_index,trial_number,timestarted,timesubmitted,timetaken,trialid,trialrot,success
0,b001_001,1,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1.506,420,1,1
1,b001_001,2,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1.471,379,1,1
2,b001_001,3,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1.154,343,1,1
3,b001_001,4,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,2.004,463,0,1
4,b001_001,5,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1.589,616,3,1
...,...,...,...,...,...,...,...,...
82226,m271_010,28,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,0.613,1646,1,0
82227,m271_010,29,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,2.297,2499,0,0
82228,m271_010,30,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,2.573,1644,3,0
82229,m271_010,31,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,3.638,2586,2,1


### Adding trial types and levels
> Here, we add trial types based on the app specifications.  We also double-check that everything was specified correctly by comparing the trial types with the app resources.

In [None]:
#export
def _as_list(node):
    '''Wrap a single xmltodict node in a list; repeated elements already arrive as lists.'''
    return node if isinstance(node, list) else [node]


@patch
@get_efficiently
def get_wm_trial_types(self:WMDataProvider):
    '''Parse `types.xml` (read from `self.external`) into one row per trial.

    Trial types with a difficulty above 10 are skipped.  Every remaining trial
    yields a row with the following columns:

    * `trialid` - the trial's `id` attribute, as read from the XML
    * `trial_type` - the type attribute; the codes '0' to '3' are replaced by
      readable labels, any other code is left unchanged
    * `level` - the difficulty attribute as an integer
    * `trial_boards` - the board texts that were scanned
    * `number_of_boards` - how many board texts were scanned
    * `has_distractor` - 1 if a scanned board text contains 'D', else 0
    * `is_delayed` - 1 if more than one board was counted, else 0

    Returns:
        pandas.DataFrame: the trial specifications described above.
    '''
    # Read through a context manager so the file handle is always closed.
    with open(os.path.join(self.external, "types.xml"), "r", encoding='UTF-8') as types_file:
        types = types_file.read().encode('utf-8')
    types_dict = xmltodict.parse(types)

    rows = []
    # xmltodict only produces a list for repeated elements; a lone element arrives
    # as a single mapping (OrderedDict or plain dict, depending on the xmltodict
    # version), so every level is normalised to a list before iterating.
    for t in _as_list(types_dict['trials']['trialtype']):
        # Getting the type and difficulty
        trial_type = t['@type']
        level = int(t['@difficulty'])
        if level > 10:
            continue
        # Getting individual trial specifications for each type
        for trial in _as_list(t['trial']):
            trial_dict = {'trialid': trial['@id'],
                          'trial_type': trial_type,
                          'level': level,
                          'trial_boards': [],
                          'number_of_boards': 0}
            board = _as_list(trial['board'])
            has_distractor = False
            for b in board:
                if not isinstance(b, dict):
                    # Best effort: report an unexpected board entry and keep going
                    # rather than aborting the whole parse.
                    print(trial_dict, board)
                    continue
                if '#text' not in b:
                    continue
                text = b['#text']
                trial_dict['trial_boards'].append(text)
                trial_dict['number_of_boards'] += 1
                if 'D' in text:
                    has_distractor = True
                    # NOTE(review): the scan stops at the first board containing 'D',
                    # so any boards after it are neither stored nor counted.  Confirm
                    # this is intended, since `is_delayed` depends on the count.
                    break
            trial_dict['has_distractor'] = int(has_distractor)
            trial_dict['is_delayed'] = int(trial_dict['number_of_boards'] > 1)
            rows.append(trial_dict)
    type_df = pd.DataFrame(rows)
    # Assign the result back instead of calling an in-place method on the
    # attribute-accessed column, which is not guaranteed to update `type_df`.
    type_df['trial_type'] = type_df['trial_type'].replace({'0':'no_distractor_1',
                                                           '1':'encoding_distractor',
                                                           '2':'delayed_distractor',
                                                           '3':'no_distractor_2'})
    return type_df

> Checking if each trial type is specified correctly:

In [None]:
type_df = dp.get_wm_trial_types()
type_df.groupby(['trial_type'])[['has_distractor','is_delayed']].mean().astype(bool)

Unnamed: 0_level_0,has_distractor,is_delayed
trial_type,Unnamed: 1_level_1,Unnamed: 2_level_1
4,False,True
5,True,True
delayed_distractor,True,True
encoding_distractor,True,False
no_distractor_1,False,False
no_distractor_2,False,False


> Adding trial types and levels to dataframe.

In [None]:
#export
@patch
def add_trial_types(self:WMDataProvider, df):
    '''Attach `trial_type` and `level` to each row of `df` by matching on `trialid`.

    The lookup table comes from `get_wm_trial_types`.  The merge is a left join,
    so every row of `df` is kept, and it is validated as many-to-one, so pandas
    raises if a `trialid` occurs more than once in the lookup table.

    If `df` already carries `trial_type` or `level` (e.g. because this method was
    applied before), those columns are dropped first so that the merge replaces
    them instead of producing suffixed duplicates.

    Returns:
        pandas.DataFrame: a new dataframe with the two columns added.
    '''
    type_df = self.get_wm_trial_types()
    stale_columns = [col for col in ('trial_type', 'level') if col in df.columns]
    if stale_columns:
        df = df.drop(columns=stale_columns)
    df = df.merge(type_df[['trialid','trial_type','level']], on = 'trialid', how = 'left', validate = 'many_to_one')
    return df
    

In [None]:
df = dp.add_trial_types(df)

#### Checking trial types in dataset

In [None]:
df.trial_type.value_counts()

no_distractor_1        20554
encoding_distractor    20510
no_distractor_2        20493
delayed_distractor     20284
5                        195
4                        195
Name: trial_type, dtype: int64

> Warning: Note that some participants completed old versions of the task (as some trials were associated with trial types 4 and 5).

### Getting the data

In [None]:
#export
@patch
@get_efficiently
def get_wm_data(self:WMDataProvider):
    '''Return the decoded working memory trials with trial type and level attached.

    Chains the three steps defined on this class: fetch the GBE data, decode the
    working memory strings, and merge in the trial specifications.
    '''
    decoded = self.decode_wm_strings(self.get_gbe_data())
    return self.add_trial_types(decoded)

In [None]:
df = dp.get_wm_data()
df

Unnamed: 0,gbe_index,trial_number,score,timestarted,timesubmitted,success,timetaken,trialid,trialrot,trial_type,level
0,b001_001,1,600,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1,1.506,420,1,delayed_distractor,3
1,b001_001,2,600,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1,1.471,379,1,encoding_distractor,3
2,b001_001,3,600,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1,1.154,343,1,no_distractor_1,3
3,b001_001,4,600,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1,2.004,463,0,no_distractor_2,3
4,b001_001,5,600,2020-02-24 20:48:52.812,2020-02-24 20:52:50.167,1,1.589,616,3,no_distractor_1,4
...,...,...,...,...,...,...,...,...,...,...,...
82226,m271_010,28,533,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,0,0.613,1646,1,delayed_distractor,7
82227,m271_010,29,533,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,0,2.297,2499,0,encoding_distractor,10
82228,m271_010,30,533,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,0,2.573,1644,3,delayed_distractor,7
82229,m271_010,31,533,2021-07-21 10:25:34.373,2021-07-21 10:29:45.451,1,3.638,2586,2,no_distractor_2,10


## Filters

### Sessions that were completed with old version of the task
These sessions were completed with an old version of the task due to an experimenter error.  The experimenter used the wrong link to install the GBE app when setting up participants' phones.

In [None]:
#export
@patch
def filter_old_app_sessions(self:WMDataProvider, df):
    '''Drop every session that contains a trial of type '4' or '5'.

    A session is identified by `gbe_index`.  A one-line summary (number of
    affected participants, number and percentage of removed sessions) is printed.

    Returns:
        pandas.DataFrame: the rows of `df` that belong to unaffected sessions.
    '''
    # Identify the affected sessions once and reuse the result below.
    sessions_with_old_app = df[df.trial_type.isin(['4','5'])].gbe_index.unique()
    # NOTE(review): this lookup assumes the GBE data is indexed by gbe_index - confirm.
    participants_with_old_app = self.get_gbe_data().loc[sessions_with_old_app].participant.unique()
    total_sessions = len(df.gbe_index.unique())
    # An empty input has no sessions; report 0% instead of dividing by zero.
    perc_removed = (len(sessions_with_old_app)/total_sessions)*100 if total_sessions else 0.0
    print("%d participants used an old version of the task in some of their sessions.  %d sessions (%.2f%%) were removed from the dataset."%(len(participants_with_old_app), len(sessions_with_old_app), perc_removed))
    return df[~df.gbe_index.isin(sessions_with_old_app)]

In [None]:
# Sanity check: after filtering, no trial of the old-version types '4' or '5' may remain.
test = dp.filter_old_app_sessions(df)
assert not test.trial_type.isin(['4','5']).any(), "Did not succeed filtering old app sessions."

9 participants used an old version of the task in some of their sessions.  32 sessions (1.23%) are removed from the dataset.


### Sessions with level two failures
"Data were excluded from participants who failed a “load 2” trial in any condition." (McNab, 2015, p. 6515)

In [None]:
#export
@patch
def filter_level_two_failures(self:WMDataProvider, df):
    '''Drop every session in which at least one level-2 trial was failed.

    A session is identified by `gbe_index`; a failed trial has `success == 0`.
    A one-line summary (number and percentage of removed sessions) is printed.

    Returns:
        pandas.DataFrame: the rows of `df` that belong to the remaining sessions.
    '''
    filtered_sessions = df.query("(level==2) and (success==0)").gbe_index.unique()
    total_sessions = len(df.gbe_index.unique())
    # An empty input has no sessions; report 0% instead of dividing by zero.
    perc_removed = (len(filtered_sessions)/total_sessions)*100 if total_sessions else 0.0
    print("%d sessions (%.2f%%) were removed because participants failed a level two trial."%(len(filtered_sessions), perc_removed))
    return df[~df.gbe_index.isin(filtered_sessions)]

In [None]:
dp.filter_level_two_failures(df)

28 sessions (1.07%) are removed because participants failed a level two trial.


## Checks

### Visual check

Here we show a screen capture of the working memory task with the associated data below.

In [None]:
Video("images/wm_task_video.webm", width=200)

In [None]:
# Decode the data that accompanies the screen capture and attach trial types,
# so the table can be compared with the video.
raw_demo = pd.read_csv(os.path.join(dp.raw, 'mov_data_test.csv'), sep=';')
test = dp.decode_gbe_strings(raw_demo, 'WorkingMemoryGame')
show = dp.add_trial_types(test)[['trial_type','level','success']]
show

Unnamed: 0,trial_type,level,success
0,delayed_distractor,3,1
1,no_distractor_1,3,1
2,encoding_distractor,3,1
3,no_distractor_2,3,1
4,encoding_distractor,4,1
5,no_distractor_1,4,0
6,delayed_distractor,4,1
7,no_distractor_2,4,1
8,encoding_distractor,5,1
9,no_distractor_1,4,1
