In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from textrec.paths import paths

In [40]:
# Load the per-experiment trial tables and stack them into one frame.
experiments = ['gc1', 'spec1']
frames = [
    pd.read_csv(paths.analyzed / f'trial_withmanual_{exp}.csv')
    for exp in experiments
]

# Union of all column names in first-seen order, so the combined frame keeps
# the original column layout instead of whatever order concat would choose.
# Approach based on https://stackoverflow.com/a/48064892/69707
columns_ordered = list(dict.fromkeys(
    col for frame in frames for col in frame.columns))

# `keys=experiments` labels each frame's rows with its experiment name; that
# index level is then turned into an 'experiment' column and the per-frame
# row index is discarded in favor of a fresh 0..n-1 index.
all_data = (
    pd.concat(frames, keys=experiments, names=('experiment', '_idx'))[columns_ordered]
    .reset_index(level=0)
    .reset_index(drop=True)
)

In [41]:
# Display the combined trial table (the cell's last expression is rendered as its output).
all_data

Unnamed: 0,experiment,participant,age,english_proficiency,gender,helpfulRank-accurate-least-condition,helpfulRank-accurate-least-idx,helpfulRank-accurate-most-condition,helpfulRank-accurate-most-idx,helpfulRank-quick-least-condition,...,corrected_tapstotype_general,corrected_idealrecuse_general,corrected_tapstotype_specific,corrected_idealrecuse_specific,corrected_tapstotype_gated,corrected_idealrecuse_gated,corrected_tapstotype_always,corrected_idealrecuse_always,corrected_tapstotype_cond,corrected_efficiency
0,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,20,13,17,13,23,13,20,13,57,0.311475
1,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,23,13,23,13,27,13,23,13,73,0.820225
2,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,18,13,18,13,22,13,18,13,69,0.896104
3,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,18,12,19,12,22,12,18,12,72,1.000000
4,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,17,9,15,9,19,9,17,9,19,0.475000
5,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,30,11,28,11,35,11,30,11,35,0.426829
6,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,38,18,35,18,42,18,38,18,42,0.365217
7,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,25,14,27,14,29,14,25,14,29,0.439394
8,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,20,8,17,8,20,8,20,8,20,0.571429
9,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,22,10,23,10,23,10,22,10,22,0.550000


In [42]:
# Per-participant baseline: mean characters_per_sec over that participant's
# trials in the 'norecs' condition.
norecs_speed = (
    all_data.loc[all_data['condition'] == 'norecs']
    .groupby('participant')['characters_per_sec']
    .mean()
)

# Attach the baseline to every trial row of the same participant.
# NOTE: this is pandas' default inner merge, so any participant without a
# 'norecs' trial is dropped from the result.
with_norecs_speed = all_data.merge(
    norecs_speed.to_frame('chars_per_sec_norecs_mean'),
    left_on='participant',
    right_index=True,
)
with_norecs_speed

Unnamed: 0,experiment,participant,age,english_proficiency,gender,helpfulRank-accurate-least-condition,helpfulRank-accurate-least-idx,helpfulRank-accurate-most-condition,helpfulRank-accurate-most-idx,helpfulRank-quick-least-condition,...,corrected_idealrecuse_general,corrected_tapstotype_specific,corrected_idealrecuse_specific,corrected_tapstotype_gated,corrected_idealrecuse_gated,corrected_tapstotype_always,corrected_idealrecuse_always,corrected_tapstotype_cond,corrected_efficiency,chars_per_sec_norecs_mean
0,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,13,17,13,23,13,20,13,57,0.311475,2.118500
1,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,13,23,13,27,13,23,13,73,0.820225,2.118500
2,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,13,18,13,22,13,18,13,69,0.896104,2.118500
3,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,12,19,12,22,12,18,12,72,1.000000,2.118500
4,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,9,15,9,19,9,17,9,19,0.475000,2.118500
5,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,11,28,11,35,11,30,11,35,0.426829,2.118500
6,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,18,35,18,42,18,38,18,42,0.365217,2.118500
7,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,14,27,14,29,14,25,14,29,0.439394,2.118500
8,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,8,17,8,20,8,20,8,20,0.571429,2.118500
9,gc1,26f62q,33.0,Native or bilingual,male,norecs,0,always,2,norecs,...,10,23,10,23,10,22,10,22,0.550000,2.118500


In [43]:
# Trial speed expressed relative to the participant's mean 'norecs' speed
# (1.0 means the trial was typed at the participant's baseline speed).
with_norecs_speed['chars_per_sec_ratio_to_norecs'] = (
    with_norecs_speed['characters_per_sec']
    .div(with_norecs_speed['chars_per_sec_norecs_mean'])
)

In [44]:
# Natural log of the speed ratio, so that equal speed-ups and slow-downs are
# symmetric around 0.
with_norecs_speed['chars_per_sec_ratio_to_norecs_log'] = np.log(
    with_norecs_speed['chars_per_sec_ratio_to_norecs'])

In [45]:
import scipy.stats

In [46]:
# Box-Cox transform of the speed ratio; `boxcox_lambda` is the fitted
# transformation parameter.
#
# scipy.stats.boxcox does not skip NaNs: a single missing ratio would corrupt
# the fitted lambda (or raise, depending on the SciPy version). So the
# transform is fitted on the non-missing ratios only and the missing rows stay
# NaN. When there are no missing ratios the result is identical to calling
# boxcox on the whole column. Non-positive ratios still raise ValueError, as
# before, because Box-Cox is only defined for positive data.
_ratio = np.asarray(with_norecs_speed['chars_per_sec_ratio_to_norecs'], dtype=float)
_present = ~np.isnan(_ratio)
_transformed, boxcox_lambda = scipy.stats.boxcox(_ratio[_present])
_boxcox = np.full(_ratio.shape, np.nan)
_boxcox[_present] = _transformed
# Positional assignment (plain ndarray), so row alignment does not depend on the index.
with_norecs_speed['chars_per_sec_ratio_to_norecs_boxcox'] = _boxcox

In [47]:
# Persist the combined trial table, including the derived speed columns added
# above; the DataFrame index is not written to the file.
combined_traits_path = paths.analyzed / 'combined_traits.csv'
with_norecs_speed.to_csv(combined_traits_path, index=False)