## Experimental data
By item:
- every item has an entropy score calculated using all participants responses over that item

By participant 
- every participant has a score using all responses across items by this pariticpant

In [47]:
import pandas as pd
import scipy.stats

def ent(data):
    """Calculates entropy of the passed `pd.Series`
    from https://stackoverflow.com/questions/15450192/fastest-way-to-compute-entropy-in-python
    """
    p_data = data.value_counts()           # counts occurrence of each value
    entropy = scipy.stats.entropy(p_data)  # get entropy from counts
    return entropy


def by_participants(df):
    '''Returns by participant entropy for the passed dataframe
    '''
    part_entropies = {}
    subj = set(df['Exp_Subject_Id'])
    for subject in subj:
        s = df[df['Exp_Subject_Id'] == subject]['Production_R'] # creates a subset of the dataframe for the current participant, then accesses only the productionR column of that df
        entropy = ent(s)
        part_entropies[subject] = entropy
    return part_entropies

def by_items(df):
    '''Returns by item entropy for the passed dataframe
    '''
    item_entropies = {}
    items = set(df['Word_romanization'])
    for item in items:
        s = df[df['Word_romanization'] == item]['Production_R'] # creates a subset of the dataframe for the current item, then accesses only the productionR column of that df
        entropy = ent(s)
        item_entropies[item] = entropy
    return item_entropies

In [48]:
dev = pd.read_csv('c:/Users/astei/Downloads/dev.csv')
test = pd.read_csv('c:/Users/astei/Downloads/test.csv')
train = pd.read_csv('c:/Users/astei/Downloads/train.csv')
print(by_participants(dev), by_participants(train), by_participants(dev))
print(by_items(dev), by_items(train), by_items(dev))

{575760: 5.0930903208996305, 578698: 5.164151822134836, 592117: 5.123642132834217, 592166: 5.103284836410856} {581952: 5.1272663563154595, 556033: 5.133099245669346, 589028: 5.100537226815552, 578085: 5.129156703383325, 597515: 5.131343768173773, 563118: 5.139045403513327, 556014: 5.139045403513327, 556505: 5.069725789161007, 594939: 5.162150309531992, 585660: 5.171896483370142, 559838: 5.125428515958302, 565631: 5.100537226815552} {575760: 5.0930903208996305, 578698: 5.164151822134836, 592117: 5.123642132834217, 592166: 5.103284836410856}
{'yvTv': 0.0, 'nvlpv': 0.0, 'p*opna': 0.5623351446188083, 'yaTna': 0.6931471805599453, 'cvkna': 0.6931471805599453, 'kxlkna': 1.0397207708399179, 'nxlkna': 1.0397207708399179, 'cipv': 0.0, 'Cakna': 0.6931471805599453, 'pulkna': 0.5623351446188083, 'ipta': 0.6931471805599453, 'cvkv': 0.0, 'cvpta': 0.0, 'cilkna': 1.3862943611198906, 'Caka': 0.0, 'pulkta': 1.0397207708399179, 'cvpna': 0.5623351446188083, 'ipna': 0.6931471805599453, 'cipna': 0.5623351446