In [1]:
import pandas as pd
import numpy as np
import json
import os

# Background

Exporting data from SQL:

```sh
heroku pg:psql
```

```sql
\copy (SELECT * FROM walkdata_js) TO walkdata_test.csv CSV DELIMITER ',' HEADER;
\copy (SELECT * FROM participants_js) TO participants_test.csv CSV DELIMITER ',' HEADER;
\copy (SELECT * FROM exp_js) TO experiment_test.csv CSV DELIMITER ',' HEADER;
```

```sh
gzip walkdata_test.csv
gzip participants_test.csv
gzip experiment_test.csv
```

# Preprocessing

First let's preprocess our data.

Correct column names.

Get rid of invalid data.

`Participants` contains info like start and end time, browser, etc.

It also contains all saved data, json-encoded in the `datastring` field.

In [2]:
participants = pd.read_csv('../experiment/data/raw/participants_1.csv.gz')
participants.head(n=10)

Unnamed: 0,uniqueid,assignmentid,workerid,hitid,ipaddress,browser,platform,language,cond,counterbalance,codeversion,beginhit,beginexp,endhit,bonus,status,mode,datastring
0,A3MMEYJIBE5KAF:3GA6AFUKOPP5670S76HE4UE1IW73HT,3GA6AFUKOPP5670S76HE4UE1IW73HT,A3MMEYJIBE5KAF,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.754726,,,0,3,,
1,A3S0INU046UTYK:3BDCF01OGYV886G6NCP2MG3L8EAYL0,3BDCF01OGYV886G6NCP2MG3L8EAYL0,A3S0INU046UTYK,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.754825,,,0,3,,
2,ACNPJ7WXOGX5M:3CN4LGXD5YPCAY9LHKQL0Z1AMSFY47,3CN4LGXD5YPCAY9LHKQL0Z1AMSFY47,ACNPJ7WXOGX5M,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.754894,,,0,3,,
3,A1NIK1THWWLKPY:3S4AW7T80CJSAZ2MY51V6RZEDF3L4Y,3S4AW7T80CJSAZ2MY51V6RZEDF3L4Y,A1NIK1THWWLKPY,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.75496,,,0,3,,
4,A2VX8B1CJXRN2:3VJ40NV2QJOKX5K9QYIV6GF0OH4OTP,3VJ40NV2QJOKX5K9QYIV6GF0OH4OTP,A2VX8B1CJXRN2,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.755025,,,0,3,,
5,ACVXBOS4CZGVQ:3UXUOQ9OKFY8X3T13CAC0P3DA337AR,3UXUOQ9OKFY8X3T13CAC0P3DA337AR,ACVXBOS4CZGVQ,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.755089,,,0,3,,
6,A274NFT93QPBGL:317HQ483I8TC664MV7JEDKDMNQFINJ,317HQ483I8TC664MV7JEDKDMNQFINJ,A274NFT93QPBGL,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.755152,,,0,3,,
7,A110JYKWR9T6H8:31LM9EDVOMT81HOSDP4EPLSMGE4JNU,31LM9EDVOMT81HOSDP4EPLSMGE4JNU,A110JYKWR9T6H8,33EEIIWHK8808P3KK45C8J2Z8JEVQO,,,,,,,dummy,2019-04-18 14:12:44.755215,,,0,3,,
8,A1A3TGZ7DKJWRW:3A9AA95ATXN0X3LK42DCS0ECIJ85PQ,3A9AA95ATXN0X3LK42DCS0ECIJ85PQ,A1A3TGZ7DKJWRW,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.755278,,,0,3,,
9,A1G06ZVFHCPVOE:3ZV9H2YQQE8N3XVH253E5XOOXLS3W3,3ZV9H2YQQE8N3XVH253E5XOOXLS3W3,A1G06ZVFHCPVOE,304QEQWKZQLHUCPO2UPJYLH9VN70OF,,,,,,,dummy,2019-04-18 14:12:44.75534,,,0,3,,


`Experiment` contains the custom model for each subject, listing their walk index.

In [3]:
experiment = pd.read_csv('../experiment/data/raw/experiment_1.csv.gz')
experiment.head(n=10)

Unnamed: 0,id,uniqueId,finger_mapping,walk_id,bonus_info
0,1,debugc8dG4:debug8JjiQ,"[[false, false, true, false, false, false, fal...",1,
1,2,debugFB8GGF:debug9NXMQX,"[[false, false, false, false, false, false, tr...",2,
2,3,debugH74ty:debugVdE2r,"[[false, false, false, false, false, false, fa...",3,
3,4,debug8URQXJ:debugCRSICZ,"[[false, true, false, false, false, false, fal...",4,"{""walk_one_bonus"": 0.0, ""total_perf"": null, ""w..."
4,5,A2EFENZUAL6Z9V:3I3WADAZ9R5IC39JP2V14CMZL8G5O9,"[[false, false, false, false, false, false, tr...",5,
5,6,A2EFENZUAL6Z9V:3PQMUDRV7S7KS5TVYR1Q9OUIELNII1,"[[false, false, false, false, false, false, tr...",6,"{""walk_two_bonus"": 0.0, ""total_perf"": null, ""w..."
6,7,A2EFENZUAL6Z9V:34J10VATJGZX93HEBZ6ZIQL45V4IQ4,"[[false, false, false, false, false, true, fal...",7,"{""total_perf"": 0.866, ""total_bonus"": 0.0, ""wal..."
7,9,A1HWB810RJBV2K:3Q5C1WP23N2XGB2IJ02MR4WKYX6155,"[[false, false, false, true, false, false, fal...",9,
8,15,A3AS3TZQF4X6ZK:3IO1LGZLKAYBAFMHN4L6YKXS8YN68A,"[[false, false, false, false, false, false, fa...",15,
9,14,AZGNU1HRKLGNW:3180JW2OT5D4BSIM9P8SV4S6SE5J5Q,"[[false, false, true, false, false, false, fal...",14,"{""walk_one_perf"": 0.956, ""total_bonus"": 0.0, ""..."


`Walkdata` contains the walks used for each subject

In [4]:
walkdata = pd.read_csv('../experiment/data/raw/walkdata_1.csv.gz')
walkdata.head(n=10)

Unnamed: 0,walk_id,demo,walk_one,is_crosscluster,is_lattice,walk_two,walk_three,walk_four,nback_queries
0,1,"[3, 2, 1, 0, 9, 8, 7, 6, 5, 4]","[8, 9, 6, 7, 8, 7, 6, 8, 9, 6, 7, 8, 5, 6, 5, ...","[false, false, false, false, false, false, fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 8, 9, 7, 6, 8, 6, 5, 4, 1, 0, 1, 0, 2, 1, ...","[8, 9, 7, 5, 7, 5, 4, 1, 0, 3, 2, 0, 3, 0, 3, ...","[8, 6, 7, 6, 8, 6, 5, 6, 7, 6, 8, 7, 8, 7, 9, ...",
1,2,"[1, 3, 4, 5, 7, 6, 8, 9, 0, 2]","[8, 7, 9, 0, 2, 4, 3, 1, 9, 0, 2, 3, 4, 3, 4, ...","[false, false, false, true, false, false, fals...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7, 6, 5, 6, 7, 5, 6, 4, 5, 7, 8, 6, 5, 6, 7, ...","[8, 0, 8, 9, 7, 5, 3, 1, 0, 9, 0, 8, 6, 8, 0, ...","[7, 5, 6, 8, 7, 6, 7, 6, 7, 6, 8, 9, 1, 2, 0, ...",
2,3,"[2, 1, 3, 4, 5, 6, 7, 8, 9, 0]","[7, 5, 6, 8, 7, 8, 7, 6, 9, 0, 2, 1, 3, 1, 3, ...","[false, false, false, false, false, false, fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 3, 0, 1, 4, 1, 4, 2, 1, 4, 2, 1, 3, 1, 4, ...","[1, 3, 0, 2, 1, 2, 0, 9, 6, 8, 6, 5, 4, 5, 4, ...","[8, 7, 6, 5, 8, 5, 4, 1, 3, 4, 3, 4, 2, 4, 1, ...",
3,4,"[4, 5, 7, 8, 6, 9, 0, 1, 3, 2]","[2, 0, 8, 0, 1, 3, 1, 9, 8, 6, 4, 2, 3, 4, 6, ...","[false, false, true, true, false, false, false...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[8, 9, 0, 1, 0, 8, 0, 1, 9, 7, 8, 9, 0, 8, 7, ...","[7, 5, 4, 3, 1, 3, 4, 2, 0, 1, 3, 2, 4, 3, 4, ...","[2, 1, 2, 3, 5, 7, 5, 6, 8, 0, 9, 7, 8, 6, 4, ...",
4,5,"[2, 0, 9, 6, 7, 8, 5, 4, 3, 1]","[5, 4, 3, 1, 0, 2, 4, 5, 7, 8, 9, 6, 9, 0, 2, ...","[false, true, false, false, false, false, fals...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 3, 2, 3, 2, 4, 1, 2, 3, 2, 4, 1, 0, 2, 0, ...","[2, 3, 2, 0, 2, 3, 2, 1, 0, 3, 4, 2, 0, 9, 7, ...","[3, 2, 3, 1, 2, 0, 3, 2, 3, 4, 2, 4, 5, 6, 8, ...",
5,6,"[3, 2, 1, 4, 5, 6, 7, 8, 9, 0]","[1, 0, 8, 7, 8, 7, 9, 0, 8, 7, 5, 3, 4, 5, 7, ...","[false, false, true, false, false, false, fals...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5, 4, 3, 5, 4, 5, 7, 9, 0, 2, 3, 4, 3, 1, 3, ...","[6, 7, 9, 0, 8, 0, 8, 9, 0, 1, 3, 2, 0, 2, 4, ...","[9, 7, 6, 8, 7, 8, 6, 4, 5, 3, 1, 0, 1, 2, 0, ...",
6,7,"[3, 2, 1, 4, 5, 8, 6, 7, 9, 0]","[1, 4, 2, 3, 1, 4, 5, 8, 9, 6, 5, 7, 6, 5, 8, ...","[false, false, false, false, false, false, tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 4, 2, 0, 3, 1, 0, 1, 2, 4, 1, 4, 1, 4, 3, ...","[1, 4, 3, 0, 2, 0, 1, 3, 4, 3, 1, 2, 0, 3, 0, ...","[2, 3, 2, 1, 3, 1, 3, 1, 4, 2, 4, 3, 0, 2, 1, ...",
7,8,"[9, 8, 7, 6, 5, 4, 2, 1, 3, 0]","[4, 3, 2, 4, 6, 5, 6, 4, 3, 4, 3, 5, 6, 4, 5, ...","[false, false, false, false, true, false, fals...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 5, 6, 7, 9, 7, 5, 3, 5, 3, 5, 6, 7, 8, 6, ...","[0, 9, 7, 9, 7, 9, 7, 6, 5, 4, 2, 1, 0, 8, 6, ...","[8, 0, 1, 3, 4, 6, 7, 9, 8, 6, 5, 7, 6, 8, 6, ...",
8,9,"[0, 9, 6, 8, 7, 5, 4, 3, 1, 2]","[1, 2, 4, 3, 0, 2, 0, 9, 7, 6, 5, 4, 2, 0, 3, ...","[false, false, false, false, false, false, fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 4, 3, 0, 2, 0, 3, 4, 1, 0, 2, 0, 3, 2, 1, ...","[0, 9, 8, 5, 8, 9, 0, 3, 1, 0, 3, 1, 2, 3, 4, ...","[7, 6, 7, 5, 7, 8, 6, 8, 5, 6, 7, 6, 8, 6, 9, ...",
9,10,"[2, 1, 3, 0, 9, 6, 7, 8, 5, 4]","[5, 3, 4, 6, 4, 3, 2, 0, 2, 4, 6, 7, 8, 0, 9, ...","[false, true, false, true, true, false, false,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6, 7, 6, 7, 5, 7, 8, 9, 8, 9, 7, 8, 0, 9, 0, ...","[5, 7, 9, 8, 7, 6, 5, 7, 9, 1, 3, 2, 4, 6, 5, ...","[1, 0, 8, 9, 8, 9, 1, 0, 8, 6, 4, 3, 5, 6, 4, ...",


In [5]:
walkdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
walk_id            500 non-null int64
demo               500 non-null object
walk_one           500 non-null object
is_crosscluster    500 non-null object
is_lattice         500 non-null object
walk_two           500 non-null object
walk_three         500 non-null object
walk_four          500 non-null object
nback_queries      0 non-null float64
dtypes: float64(1), int64(1), object(7)
memory usage: 35.2+ KB


# Export a new subject exclusion list

In [6]:
exclusion_list = participants[['workerid','hitid','assignmentid']]
# exclusion_list.to_csv('../data/subjects.csv.gz', index=False, compression='gzip')

# Datastring 

In [7]:
datastring = \
    participants.\
    loc[~participants.datastring.isnull()].\
    datastring.\
    apply(json.loads).\
    apply(pd.Series)

# N-Back Data

questiondata records the n-back data as a single entry for each subject.

We want to filter the rows that list the stage as 'n-back', and then we
can combine them into a single dataframe.

The different segments all seem to have different data, so we're separating the trial types out.

In [8]:
import zlib
import base64
from json import loads
def decompress_pako(datastring):
    """
    Decompress json data that we compressed in the browser with paco.
    Assumes data was then base64-encoded:
    btoa(pako.deflate(JSON.stringify(data), { to: 'string' }));
    Parameters
    ----------
    datastring : string
        base64-encoded json data to decompress
    Returns
    -------
    dict
        JSON-decoded and decompressed data
    """
    data = loads(zlib.decompress(base64.b64decode(datastring)))
    return data

In [9]:
# nback_data_list = []
# for row in datastring.itertuples(index=False):
#     if 'n-back' in row.questiondata:
#         data = pd.DataFrame(decompress_pako(row.questiondata['n-back']))
#         data['workerid'] = row.workerId
#         data['hitid'] = row.hitId
#         data['uniqueid'] = data.workerid + ':' + data.hitid
#         nback_data_list.append(data)
# nback_data = pd.concat(nback_data_list).reset_index(drop=True)

In [10]:
# def get_trial_type(trial_name):
#     return nback_data \
#         .loc[nback_data.trial_type == trial_name] \
#         .dropna(axis=1, how='all') \
#         .reset_index(drop=True)

# nback_text = get_trial_type('poldrack-text')
# nback_instructions = get_trial_type('poldrack-instructions')
# nback_categorize = get_trial_type('poldrack-categorize')
# nback_single_stim = get_trial_type('poldrack-single-stim')
# nback_survey = get_trial_type('survey-text')

#os.makedirs('../data/preprocessed/')
# nback_text.to_csv('preprocessed/nback_experiment_2/nback_text.csv.gz', compression='gzip')
# nback_instructions.to_csv('preprocessed/nback_experiment_2/nback_instructions.csv.gz', compression='gzip')
# nback_categorize.to_csv('preprocessed/nback_experiment_2/nback_categorize.csv.gz', compression='gzip')
# nback_single_stim.to_csv('preprocessed/nback_experiment_2/nback_single_stim.csv.gz', compression='gzip')
# nback_survey.to_csv('preprocessed/nback_experiment_2/nback_survey.csv.gz', compression='gzip')
exclusion_list.to_csv('../data/preprocessed/all_prior_subjects.csv.gz', index=False, compression='gzip')

# Participant/Walk Data

Need to join the experiment and walkdata tables

In [11]:
experiment_walk = pd.merge(experiment, walkdata, how='left', on='walk_id')
experiment_walk.to_json('../data/preprocessed/walkdata.json.gz', compression='gzip')
experiment_walk['workerid'] = experiment_walk['uniqueId']
experiment_walk['workerid'] = experiment_walk['workerid'].apply(lambda x: x.split(':')[0])
experiment_walk.head(n=10)

Unnamed: 0,id,uniqueId,finger_mapping,walk_id,bonus_info,demo,walk_one,is_crosscluster,is_lattice,walk_two,walk_three,walk_four,nback_queries,workerid
0,1,debugc8dG4:debug8JjiQ,"[[false, false, true, false, false, false, fal...",1,,"[3, 2, 1, 0, 9, 8, 7, 6, 5, 4]","[8, 9, 6, 7, 8, 7, 6, 8, 9, 6, 7, 8, 5, 6, 5, ...","[false, false, false, false, false, false, fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 8, 9, 7, 6, 8, 6, 5, 4, 1, 0, 1, 0, 2, 1, ...","[8, 9, 7, 5, 7, 5, 4, 1, 0, 3, 2, 0, 3, 0, 3, ...","[8, 6, 7, 6, 8, 6, 5, 6, 7, 6, 8, 7, 8, 7, 9, ...",,debugc8dG4
1,2,debugFB8GGF:debug9NXMQX,"[[false, false, false, false, false, false, tr...",2,,"[1, 3, 4, 5, 7, 6, 8, 9, 0, 2]","[8, 7, 9, 0, 2, 4, 3, 1, 9, 0, 2, 3, 4, 3, 4, ...","[false, false, false, true, false, false, fals...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[7, 6, 5, 6, 7, 5, 6, 4, 5, 7, 8, 6, 5, 6, 7, ...","[8, 0, 8, 9, 7, 5, 3, 1, 0, 9, 0, 8, 6, 8, 0, ...","[7, 5, 6, 8, 7, 6, 7, 6, 7, 6, 8, 9, 1, 2, 0, ...",,debugFB8GGF
2,3,debugH74ty:debugVdE2r,"[[false, false, false, false, false, false, fa...",3,,"[2, 1, 3, 4, 5, 6, 7, 8, 9, 0]","[7, 5, 6, 8, 7, 8, 7, 6, 9, 0, 2, 1, 3, 1, 3, ...","[false, false, false, false, false, false, fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 3, 0, 1, 4, 1, 4, 2, 1, 4, 2, 1, 3, 1, 4, ...","[1, 3, 0, 2, 1, 2, 0, 9, 6, 8, 6, 5, 4, 5, 4, ...","[8, 7, 6, 5, 8, 5, 4, 1, 3, 4, 3, 4, 2, 4, 1, ...",,debugH74ty
3,4,debug8URQXJ:debugCRSICZ,"[[false, true, false, false, false, false, fal...",4,"{""walk_one_bonus"": 0.0, ""total_perf"": null, ""w...","[4, 5, 7, 8, 6, 9, 0, 1, 3, 2]","[2, 0, 8, 0, 1, 3, 1, 9, 8, 6, 4, 2, 3, 4, 6, ...","[false, false, true, true, false, false, false...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[8, 9, 0, 1, 0, 8, 0, 1, 9, 7, 8, 9, 0, 8, 7, ...","[7, 5, 4, 3, 1, 3, 4, 2, 0, 1, 3, 2, 4, 3, 4, ...","[2, 1, 2, 3, 5, 7, 5, 6, 8, 0, 9, 7, 8, 6, 4, ...",,debug8URQXJ
4,5,A2EFENZUAL6Z9V:3I3WADAZ9R5IC39JP2V14CMZL8G5O9,"[[false, false, false, false, false, false, tr...",5,,"[2, 0, 9, 6, 7, 8, 5, 4, 3, 1]","[5, 4, 3, 1, 0, 2, 4, 5, 7, 8, 9, 6, 9, 0, 2, ...","[false, true, false, false, false, false, fals...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 3, 2, 3, 2, 4, 1, 2, 3, 2, 4, 1, 0, 2, 0, ...","[2, 3, 2, 0, 2, 3, 2, 1, 0, 3, 4, 2, 0, 9, 7, ...","[3, 2, 3, 1, 2, 0, 3, 2, 3, 4, 2, 4, 5, 6, 8, ...",,A2EFENZUAL6Z9V
5,6,A2EFENZUAL6Z9V:3PQMUDRV7S7KS5TVYR1Q9OUIELNII1,"[[false, false, false, false, false, false, tr...",6,"{""walk_two_bonus"": 0.0, ""total_perf"": null, ""w...","[3, 2, 1, 4, 5, 6, 7, 8, 9, 0]","[1, 0, 8, 7, 8, 7, 9, 0, 8, 7, 5, 3, 4, 5, 7, ...","[false, false, true, false, false, false, fals...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5, 4, 3, 5, 4, 5, 7, 9, 0, 2, 3, 4, 3, 1, 3, ...","[6, 7, 9, 0, 8, 0, 8, 9, 0, 1, 3, 2, 0, 2, 4, ...","[9, 7, 6, 8, 7, 8, 6, 4, 5, 3, 1, 0, 1, 2, 0, ...",,A2EFENZUAL6Z9V
6,7,A2EFENZUAL6Z9V:34J10VATJGZX93HEBZ6ZIQL45V4IQ4,"[[false, false, false, false, false, true, fal...",7,"{""total_perf"": 0.866, ""total_bonus"": 0.0, ""wal...","[3, 2, 1, 4, 5, 8, 6, 7, 9, 0]","[1, 4, 2, 3, 1, 4, 5, 8, 9, 6, 5, 7, 6, 5, 8, ...","[false, false, false, false, false, false, tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 4, 2, 0, 3, 1, 0, 1, 2, 4, 1, 4, 1, 4, 3, ...","[1, 4, 3, 0, 2, 0, 1, 3, 4, 3, 1, 2, 0, 3, 0, ...","[2, 3, 2, 1, 3, 1, 3, 1, 4, 2, 4, 3, 0, 2, 1, ...",,A2EFENZUAL6Z9V
7,9,A1HWB810RJBV2K:3Q5C1WP23N2XGB2IJ02MR4WKYX6155,"[[false, false, false, true, false, false, fal...",9,,"[0, 9, 6, 8, 7, 5, 4, 3, 1, 2]","[1, 2, 4, 3, 0, 2, 0, 9, 7, 6, 5, 4, 2, 0, 3, ...","[false, false, false, false, false, false, fal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 4, 3, 0, 2, 0, 3, 4, 1, 0, 2, 0, 3, 2, 1, ...","[0, 9, 8, 5, 8, 9, 0, 3, 1, 0, 3, 1, 2, 3, 4, ...","[7, 6, 7, 5, 7, 8, 6, 8, 5, 6, 7, 6, 8, 6, 9, ...",,A1HWB810RJBV2K
8,15,A3AS3TZQF4X6ZK:3IO1LGZLKAYBAFMHN4L6YKXS8YN68A,"[[false, false, false, false, false, false, fa...",15,,"[3, 1, 4, 5, 8, 6, 7, 9, 0, 2]","[2, 1, 4, 5, 8, 7, 5, 4, 3, 2, 4, 3, 0, 1, 0, ...","[false, false, false, true, false, false, fals...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 4, 1, 4, 1, 3, 0, 3, 2, 1, 2, 4, 1, 0, ...","[7, 9, 0, 9, 8, 6, 5, 6, 7, 8, 7, 8, 7, 5, 6, ...","[1, 3, 1, 0, 2, 4, 5, 4, 1, 3, 4, 5, 7, 5, 6, ...",,A3AS3TZQF4X6ZK
9,14,AZGNU1HRKLGNW:3180JW2OT5D4BSIM9P8SV4S6SE5J5Q,"[[false, false, true, false, false, false, fal...",14,"{""walk_one_perf"": 0.956, ""total_bonus"": 0.0, ""...","[7, 8, 6, 5, 4, 3, 2, 1, 0, 9]","[7, 8, 7, 9, 7, 9, 7, 8, 7, 5, 7, 8, 7, 8, 9, ...","[false, false, false, false, false, false, fal...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 1, 9, 8, 0, 8, 7, 5, 3, 2, 4, 6, 8, 7, 5, ...","[8, 7, 6, 7, 8, 6, 5, 3, 5, 3, 4, 2, 3, 4, 3, ...","[0, 8, 0, 9, 7, 5, 4, 3, 1, 0, 2, 4, 2, 1, 2, ...",,AZGNU1HRKLGNW


# Trial Data

Now separate out the quiz data and the trial data

In [12]:
taskdata_list = []
for row in datastring.itertuples(index=False):
    if 'compressed_task_data' in row.questiondata:
        data = pd.DataFrame(decompress_pako(row.questiondata['compressed_task_data']))
        data['workerid'] = row.workerId
        data['assignmentid'] = row.assignmentId
        data['uniqueid'] = data.workerid + ':' + data.assignmentid
        taskdata_list.append(data)
taskdata = pd.concat(taskdata_list).reset_index(drop=True)
taskdata.tail(n=10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


Unnamed: 0,assignmentid,correct,event,keyCode,nTries,node,phase,query,response,rt,stage,target,trial,uniqueid,workerid
56922,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,66.0,1.0,0.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",459.0,walk_four,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",240.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56923,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,82.0,1.0,2.0,task,0.0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",501.0,walk_four,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",241.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56924,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,86.0,1.0,1.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",492.0,walk_four,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",242.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56925,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,73.0,1.0,4.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",553.0,walk_four,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",243.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56926,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,86.0,1.0,1.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",416.0,walk_four,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",244.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56927,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,66.0,1.0,0.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",586.0,walk_four,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",245.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56928,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,86.0,1.0,1.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",448.0,walk_four,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",246.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56929,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,81.0,1.0,3.0,task,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",469.0,walk_four,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",247.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56930,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,66.0,1.0,0.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",566.0,walk_four,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",248.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK
56931,31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,True,correct,86.0,1.0,1.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",413.0,walk_four,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",249.0,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,AY6FIVEJJAWMK


In [13]:
any(experiment_walk['uniqueId'].apply(lambda x: "AK4WZEW584BR9" in x))

True

In [14]:
quizdata_list = []
for row in datastring.itertuples(index=False):
    if 'compressed_quiz_1_data' in row.questiondata:
        data = pd.DataFrame(decompress_pako(row.questiondata['compressed_quiz_1_data']))
        data['workerid'] = row.workerId
        data['hitid'] = row.hitId
        data['uniqueid'] = data.workerid + ':' + data.hitid
        quizdata_list.append(data)
quizdata = pd.concat(quizdata_list).reset_index(drop=True)
quizdata.head(n=10)

Unnamed: 0,phase,qid,response,rt,workerid,hitid,uniqueid
0,quiz,1,1,5159,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
1,quiz,1,2,5424,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
2,quiz,1,1,2867,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
3,quiz,1,1,4150,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
4,quiz,1,3,4190,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
5,quiz,1,2,1073,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
6,quiz,1,3,3021,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
7,quiz,1,3,2782,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
8,quiz,1,1,2445,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN
9,quiz,1,3,4602,A13JQTLPWXZD6L,379OL9DBSTFBKKH38DHATNH67SU9YN,A13JQTLPWXZD6L:379OL9DBSTFBKKH38DHATNH67SU9YN


# Add Walk Info to taskdata

In [15]:
expdata = experiment_walk.drop(["bonus_info", 'is_crosscluster', "demo", "walk_one", "walk_two", "walk_three", "walk_four", "nback_queries"], axis=1)
expdata.rename(columns={'uniqueId':'uniqueid'}, inplace=True)
expdata.tail()

Unnamed: 0,id,uniqueid,finger_mapping,walk_id,is_lattice,workerid
80,80,A1W3QW2GKDBHWE:3RWE2M8QWIB1ZBN6RHU87GDWKQ6N0K,"[[false, false, false, false, true, false, fal...",80,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",A1W3QW2GKDBHWE
81,83,A15NRFWNN4HMFZ:3C5W7UE9CGR3EJ8HIS4F3FB7P8GXMQ,"[[false, false, true, false, false, false, fal...",83,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A15NRFWNN4HMFZ
82,82,A288RBRSOE2J9E:36H9ULYP63VWDPBBF1L1HGJJH9NFJB,"[[false, false, true, false, false, false, fal...",82,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",A288RBRSOE2J9E
83,84,A143S9PO1285XX:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,"[[true, false, false, false, false, false, fal...",84,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",A143S9PO1285XX
84,85,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,"[[false, false, false, false, false, true, fal...",85,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",AY6FIVEJJAWMK


Summarise graph type, and change finger to categorical

In [16]:
expdata['is_lattice'] = expdata['is_lattice'].apply(lambda x: x[1])
expdata.tail()

Unnamed: 0,id,uniqueid,finger_mapping,walk_id,is_lattice,workerid
80,80,A1W3QW2GKDBHWE:3RWE2M8QWIB1ZBN6RHU87GDWKQ6N0K,"[[false, false, false, false, true, false, fal...",80,1,A1W3QW2GKDBHWE
81,83,A15NRFWNN4HMFZ:3C5W7UE9CGR3EJ8HIS4F3FB7P8GXMQ,"[[false, false, true, false, false, false, fal...",83,0,A15NRFWNN4HMFZ
82,82,A288RBRSOE2J9E:36H9ULYP63VWDPBBF1L1HGJJH9NFJB,"[[false, false, true, false, false, false, fal...",82,1,A288RBRSOE2J9E
83,84,A143S9PO1285XX:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,"[[true, false, false, false, false, false, fal...",84,1,A143S9PO1285XX
84,85,AY6FIVEJJAWMK:31HQ4X3T3TBBCK4XO6XD4UFQAP7LSP,"[[false, false, false, false, false, true, fal...",85,0,AY6FIVEJJAWMK


Now let's figure out who finished the task

In [17]:
finished_subjects = taskdata[(taskdata.trial == 249) & (taskdata.stage == "walk_four")].uniqueid.unique()

In [18]:
print("{} subjects complete".format(len(finished_subjects)))

47 subjects complete


In [19]:
taskdata[taskdata['workerid'] == "A3I8HU3ZCHOROC"]

Unnamed: 0,assignmentid,correct,event,keyCode,nTries,node,phase,query,response,rt,stage,target,trial,uniqueid,workerid
34055,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,80.0,1.0,3.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",2350.0,demo,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",0.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34056,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,69.0,1.0,1.0,task,0.0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",2243.0,demo,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",1.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34057,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,85.0,1.0,0.0,task,0.0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1055.0,demo,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",2.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34058,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,81.0,1.0,9.0,task,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1667.0,demo,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",3.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34059,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,73.0,1.0,6.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",1348.0,demo,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",4.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34060,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,82.0,1.0,8.0,task,0.0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",1215.0,demo,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",5.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34061,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,66.0,1.0,7.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",3172.0,demo,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",6.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34062,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,False,incorrect,69.0,1.0,5.0,task,0.0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",1331.0,demo,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",7.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34063,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,87.0,2.0,5.0,task,0.0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",2660.0,demo,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",7.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC
34064,3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,True,correct,86.0,1.0,4.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",1245.0,demo,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",8.0,A3I8HU3ZCHOROC:3FTOP5WARGP5GLWLCJ2GE9WFB6UJ05,A3I8HU3ZCHOROC


Save list of finished and not finished hits so that you can review the HIT

In [20]:
approved_unique = taskdata[(taskdata.trial == 249) & (taskdata.stage == "walk_four")].uniqueid.unique()
approved = [x.split(':')[1] for x in approved_unique]

not_approved_unique = set(taskdata.uniqueid.unique()) - set(approved_unique)
not_approved = [x.split(':')[1] for x in not_approved_unique]

approved

[u'3SBEHTYCWO46I58029SPU0BZA4KYI9',
 u'3Q5C1WP23N2XGB2IJ02MR4WKYX6155',
 u'3JAOYWH7VJ5T758YF0XMNZA0IZU9LA',
 u'3ZQIG0FLQFH7F6X0MF9KOSGD7YGVW4',
 u'39OWYR0EPLSM8E6A11M4R3Y7UCSYFI',
 u'3180JW2OT5D4BSIM9P8SV4S6SE5J5Q',
 u'35BLDD71I7YB9125X2TRNWAVO0KVZU',
 u'31Q0U3WYDQGC3FG17THJPOAGP3117K',
 u'3MRNMEIQW675AVLFRG2O9VTQ9K8LD2',
 u'3AAJC4I4FHT2I62BGDF8K8Z1TDFZJF',
 u'34YB12FSQZPSSXDRTI4BFTBCMHEGM9',
 u'3MMN5BL1W05RYL13Y3XRWZM3ICA3MP',
 u'39PAAFCODN1FNPT6RA49189E34YVTE',
 u'3TPZPLC3M1DXJOYGQJPJD3GYNHMP38',
 u'33LKR6A5KFLZ1DD8KWKKUK2YE0ST1O',
 u'3LO69W1SU4E8MFV6T63WFGWNMCOGL7',
 u'39GHHAVOMGSMF9EMVKPIQCMSSIAJ4L',
 u'30H4UDGLT3JY5A6QW1UY92P4N9OMP8',
 u'3X3OR7WPZ01TT0POA8R8N3FPYTBL8W',
 u'3HFNH7HEMIFJDCBJLBBSW2VHWIFGQ0',
 u'3NQL1CS15S9B4BS01GR89P1TI6GVY3',
 u'3AMW0RGHOE34NSRQ829YMHE4SYBNPK',
 u'35DR22AR5EL2SX07CAZ07G21PGX3XC',
 u'3KOPY89HM931XDVIV6KZG7UMEIIJ34',
 u'30JNVC0ORALXD86U5HTAH27IBIOQH5',
 u'3QHK8ZVMINJCVYNVDKBIE55Q6ZWLBV',
 u'3Z4GS9HPNWBQE1V3W9YSQY1TSP677S',
 u'3M1CVSFP616I3LYGZBJI0GKEY

In [21]:
finished_taskdata = taskdata[taskdata.uniqueid.isin(finished_subjects)]
finished_quizdata = quizdata[quizdata.uniqueid.isin(finished_subjects)]

Add info by uniqueid

In [22]:
expanded_walkdata = pd.DataFrame()
for s in finished_taskdata['uniqueid'].unique():
    nTrial = sum(finished_taskdata['uniqueid'] == s)
    
    if any(expdata['uniqueid'] == s):
        curr = expdata[expdata['uniqueid'] == s]
        for i in range(nTrial):
            expanded_walkdata = expanded_walkdata.append(curr)
    else:
        print('Hmmm...there are subjects in your taskdata that aren\'t in your walkdata. Did you load the correct dataset?')
    
expanded_walkdata.head()
    

Unnamed: 0,id,uniqueid,finger_mapping,walk_id,is_lattice,workerid
28,8,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,"[[false, false, true, false, false, false, fal...",8,1,A13JQTLPWXZD6L
28,8,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,"[[false, false, true, false, false, false, fal...",8,1,A13JQTLPWXZD6L
28,8,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,"[[false, false, true, false, false, false, fal...",8,1,A13JQTLPWXZD6L
28,8,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,"[[false, false, true, false, false, false, fal...",8,1,A13JQTLPWXZD6L
28,8,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,"[[false, false, true, false, false, false, fal...",8,1,A13JQTLPWXZD6L


In [23]:
expanded_walkdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52005 entries, 28 to 84
Data columns (total 6 columns):
id                52005 non-null int64
uniqueid          52005 non-null object
finger_mapping    52005 non-null object
walk_id           52005 non-null int64
is_lattice        52005 non-null object
workerid          52005 non-null object
dtypes: int64(2), object(4)
memory usage: 2.8+ MB


In [24]:
finished_taskdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52005 entries, 0 to 56931
Data columns (total 15 columns):
assignmentid    52005 non-null object
correct         52005 non-null object
event           52005 non-null object
keyCode         52005 non-null float64
nTries          52005 non-null float64
node            52005 non-null float64
phase           52005 non-null object
query           52005 non-null float64
response        52005 non-null object
rt              52005 non-null float64
stage           52005 non-null object
target          52005 non-null object
trial           52005 non-null float64
uniqueid        52005 non-null object
workerid        52005 non-null object
dtypes: float64(6), object(9)
memory usage: 6.3+ MB


Combine task and walkdata

In [25]:
for field in expdata.drop('uniqueid', axis=1).columns:
    finished_taskdata[field] = ''


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
for s in finished_taskdata['uniqueid'].unique():
    for field in expdata.drop('uniqueid', axis=1).columns:
        curr = expdata.loc[expdata['uniqueid'] == s,field]
        finished_taskdata.loc[finished_taskdata['uniqueid'] == s,field] = curr.values
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [27]:
finished_taskdata.head()

Unnamed: 0,assignmentid,correct,event,keyCode,nTries,node,phase,query,response,rt,stage,target,trial,uniqueid,workerid,id,finger_mapping,walk_id,is_lattice
0,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,86.0,1.0,9.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",2830.0,demo,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",0.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1
1,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,73.0,1.0,8.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",2175.0,demo,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",1.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1
2,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,81.0,1.0,7.0,task,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1320.0,demo,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1
3,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,66.0,1.0,6.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",2294.0,demo,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",3.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1
4,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,85.0,1.0,5.0,task,0.0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1268.0,demo,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",4.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1


In [28]:
finished_taskdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52005 entries, 0 to 56931
Data columns (total 19 columns):
assignmentid      52005 non-null object
correct           52005 non-null object
event             52005 non-null object
keyCode           52005 non-null float64
nTries            52005 non-null float64
node              52005 non-null float64
phase             52005 non-null object
query             52005 non-null float64
response          52005 non-null object
rt                52005 non-null float64
stage             52005 non-null object
target            52005 non-null object
trial             52005 non-null float64
uniqueid          52005 non-null object
workerid          52005 non-null object
id                52005 non-null int64
finger_mapping    52005 non-null object
walk_id           52005 non-null int64
is_lattice        52005 non-null object
dtypes: float64(6), int64(2), object(11)
memory usage: 7.9+ MB


# Add additional fields

Is the trial cross cluster

In [29]:
finished_taskdata['is_crosscluster'] = np.nan
for index, row in finished_taskdata.iterrows():
    if row['nTries'] == 1:
        if row['trial'] == 0:
            finished_taskdata.loc[index,'is_crosscluster'] = False
        else:
            prev = finished_taskdata.loc[index-1,'node']
            curr = row['node']
            if (prev in [4,5] and curr in [4,5]) or (prev in [0,9] and curr in [0,9]) and (prev != curr):
                finished_taskdata.loc[index,'is_crosscluster'] = True
            else:
                finished_taskdata.loc[index,'is_crosscluster'] = False
    else:
        finished_taskdata.loc[index,'is_crosscluster'] = finished_taskdata.loc[index-1,'is_crosscluster']

finished_taskdata.head()      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,assignmentid,correct,event,keyCode,nTries,node,phase,query,response,rt,stage,target,trial,uniqueid,workerid,id,finger_mapping,walk_id,is_lattice,is_crosscluster
0,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,86.0,1.0,9.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",2830.0,demo,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",0.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
1,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,73.0,1.0,8.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",2175.0,demo,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",1.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
2,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,81.0,1.0,7.0,task,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1320.0,demo,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
3,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,66.0,1.0,6.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",2294.0,demo,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",3.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
4,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,85.0,1.0,5.0,task,0.0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1268.0,demo,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",4.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False


In [30]:
finished_taskdata.head()

Unnamed: 0,assignmentid,correct,event,keyCode,nTries,node,phase,query,response,rt,stage,target,trial,uniqueid,workerid,id,finger_mapping,walk_id,is_lattice,is_crosscluster
0,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,86.0,1.0,9.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",2830.0,demo,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",0.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
1,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,73.0,1.0,8.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",2175.0,demo,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",1.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
2,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,81.0,1.0,7.0,task,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1320.0,demo,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
3,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,66.0,1.0,6.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",2294.0,demo,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",3.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False
4,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,85.0,1.0,5.0,task,0.0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1268.0,demo,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",4.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False


Which hand was used

In [31]:
code_map = {81:'left',
            87:'left',
            69:'left',
            82:'left',
            86:'left',
            66:'right',
            85:'right',
            73:'right',
            79:'right',
            80:'right',}

finished_taskdata['hand'] = finished_taskdata['keyCode'].apply(lambda x: code_map.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [32]:
finished_taskdata.head()

Unnamed: 0,assignmentid,correct,event,keyCode,nTries,node,phase,query,response,rt,...,target,trial,uniqueid,workerid,id,finger_mapping,walk_id,is_lattice,is_crosscluster,hand
0,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,86.0,1.0,9.0,task,0.0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",2830.0,...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",0.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False,left
1,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,73.0,1.0,8.0,task,0.0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",2175.0,...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",1.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False,right
2,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,81.0,1.0,7.0,task,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1320.0,...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False,left
3,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,66.0,1.0,6.0,task,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",2294.0,...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",3.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False,right
4,3SBEHTYCWO46I58029SPU0BZA4KYI9,True,correct,85.0,1.0,5.0,task,0.0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",1268.0,...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",4.0,A13JQTLPWXZD6L:3SBEHTYCWO46I58029SPU0BZA4KYI9,A13JQTLPWXZD6L,8,"[[false, false, true, false, false, false, fal...",8,1,False,right


Was this a hand transition?

In [33]:
finished_taskdata['hand_transition'] = np.nan
for index, row in finished_taskdata.iterrows():
    if np.isna(row['hand']):
        finished_taskdata.loc[index,'hand_transition'] = np.nan
    if row['nTries'] == 1:
        if row['trial'] == 0:
            finished_taskdata.loc[index,'hand_transition'] = False
        else:
            prev = finished_taskdata.loc[index-1,'hand']
            curr = row['hand']
            if (prev != curr):
                finished_taskdata.loc[index,'hand_transition'] = True
            else:
                finished_taskdata.loc[index,'hand_transition'] = False
    else:
        finished_taskdata.loc[index,'hand_transition'] = finished_taskdata.loc[index-1,'hand_transition']

finished_taskdata.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


AttributeError: 'module' object has no attribute 'isna'

Lag and recency

In [None]:
# lag - how many times has this shownup in the last 10 trials?
# recency - how long since last presentation?
prev_display = {0: np.nan,
            1: np.nan,
            2: np.nan,
            3: np.nan,
            4: np.nan,
            5: np.nan,
            6: np.nan,
            7: np.nan,
            8: np.nan,
            9: np.nan}
lag10 = [np.nan]*10

finished_taskdata['lag10'] = np.nan
finished_taskdata['recency'] = np.nan
for index, row in finished_taskdata.iterrows():
    if row['nTries'] == 1:
        # what node are we on?
        curr_node = finished_taskdata.loc[index,'node']
        
        if row['trial'] == 0:
            # reset stack and dictionary for new stages
            prev_display = {0: np.nan,
                            1: np.nan,
                            2: np.nan,
                            3: np.nan,
                            4: np.nan,
                            5: np.nan,
                            6: np.nan,
                            7: np.nan,
                            8: np.nan,
                            9: np.nan}
            lag10 = [np.nan]*10
            
            
            finished_taskdata.loc[index,'lag10'] = 0
            finished_taskdata.loc[index,'recency'] = np.nan
            
        else:
            # recency
            # add recency of previous node
            finished_taskdata.loc[index,'recency'] = finished_taskdata.loc[index,'trial'] - prev_display[curr_node]
            
            # lag10
            finished_taskdata.loc[index,'lag10'] = sum(curr_node == lag10)
        
        # add this to the last 10 nodes stack
        lag10.pop(0)
        lag10.append(curr_node)
        
        # log trial number of current node
        prev_display[curr_node] = finished_taskdata.loc[index,'trial']
            
    else:
        finished_taskdata.loc[index,'lag10'] = finished_taskdata.loc[index-1,'lag10']
        finished_taskdata.loc[index,'recency'] = finished_taskdata.loc[index-1,'recency']

finished_taskdata.head() 

Communicability

In [None]:
import scipy.linalg as sp
import matplotlib.pyplot as plt

mod = np.array([[0,1,1,1,0,0,0,0,0,1],
              [1,0,1,1,1,0,0,0,0,0],
              [1,1,0,1,1,0,0,0,0,0],
              [1,1,1,0,1,0,0,0,0,0],
              [0,1,1,1,0,1,0,0,0,0],
              [0,0,0,0,1,0,1,1,1,0],
              [0,0,0,0,0,1,0,1,1,1],
              [0,0,0,0,0,1,1,0,1,1],
              [0,0,0,0,0,1,1,1,0,1],
              [1,0,0,0,0,0,1,1,1,0]])

lat = np.array([[0,1,1,0,0,0,0,0,1,1],
              [1,0,1,1,0,0,0,0,0,1],
              [1,1,0,1,1,0,0,0,0,0],
              [0,1,1,0,1,1,0,0,0,0],
              [0,0,1,1,0,1,1,0,0,0],
              [0,0,0,1,1,0,1,1,0,0],
              [0,0,0,0,1,1,0,1,1,0],
              [0,0,0,0,0,1,1,0,1,1],
              [1,0,0,0,0,0,1,1,0,1],
              [1,1,0,0,0,0,0,1,1,0]])

g_mod = np.round(sp.expm(mod), decimals=3)
g_lat = np.round(sp.expm(lat), decimals = 3)

f, axes = plt.subplots(1, 2, figsize = (16,6))
sns.heatmap(g_mod, ax=axes[0])
sns.heatmap(g_lat, ax=axes[1])


In [None]:
finished_taskdata['communicability'] = np.nan
for index, row in finished_taskdata.iterrows():
    if row['nTries'] == 1:
        if row['trial'] == 0:
            finished_taskdata.loc[index,'communicability'] = np.nan
        else:
            prev = finished_taskdata.loc[index-1,'node']
            curr = row['node']
            graph_type = row['is_lattice']
            if (graph_type):
                finished_taskdata.loc[index,'communicability'] = g_lat[int(prev),int(curr)]
            else:
                finished_taskdata.loc[index,'communicability'] = g_mod[int(prev),int(curr)]
    else:
        finished_taskdata.loc[index,'communicability'] = finished_taskdata.loc[index-1,'communicability']

finished_taskdata.head() 

In [None]:
finished_taskdata.drop(['assignmentid','uniqueid', 'finger_mapping','response'],axis=1)

Save the final data

In [None]:
finished_taskdata.to_csv('../data/preprocessed/taskdata.csv.gz', compression='gzip')
finished_quizdata.to_csv('../data/preprocessed/quizdata.csv.gz', compression='gzip')

# Check feedback

In [None]:
[x['free_response_question'] for x in datastring.questiondata if 'free_response_question' in x]

## EDA

This is mostly just a sanity check

In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
finished_taskdata = pd.read_csv('../data/preprocessed/taskdata.csv.gz')

In [None]:
finished_taskdata.info()

In [None]:
finished_taskdata.describe()

In [None]:
finished_taskdata[finished_taskdata['rt'] < 2000]['rt'].plot()

In [None]:
sns.distplot(finished_taskdata[finished_taskdata['rt'] < 2000]['rt'])

In [None]:
sns.violinplot(data=finished_taskdata, x='is_crosscluster', y="rt")

In [None]:
sns.violinplot(data=finished_taskdata, x='correct', y="rt")

In [None]:
sns.violinplot(data=finished_taskdata, x='stage', y="rt")

In [None]:
sns.countplot(data=finished_taskdata, x='stage')

In [None]:
finished_taskdata[finished_taskdata['stage'] == "walk_two"]["trial"].unique()
correct_data = finished_taskdata[finished_taskdata["event"] == "correct"]
# it is important to note that here, a correct trial is just when they pushed the right key, 
# even if they pushed the wrong key first. To make this comparable to ECoG data, you will have to remove all trials
# where nTries != 1

In [None]:
sns.countplot(data=correct_data, x='stage')

In [None]:
sns.pairplot(data=correct_data.drop(['assignmentid', 'event','query','response','target','uniqueid', 'workerid','walk_id'],axis=1), hue='is_crosscluster')

In [None]:
sns.violinplot(data=correct_data[finished_taskdata['rt'] < 2000], x='keyCode', y="rt")

In [None]:
correct_data[finished_taskdata['keyCode'] == 32].drop(['uniqueid','assignmentid','workerid'],axis=1)
