In [1]:
from main import read_in_blicks, BOUNDARY
import scorers
import datasets
import informants

In [2]:
import pandas as pd

In [3]:
# Name of the feature set. Later cells interpolate it into the lexicon,
# phoneme-feature, n-gram-weight, and eval-set file paths, and pass it to the scorers.
feature_type = 'atr_four'

## Load dataset, scorers, and oracle


In [12]:
# Input files for the selected feature set. Edit these paths to point the
# notebook at a different lexicon / phoneme-feature table.
lexicon_path = f'data/hw/{feature_type}_lexicon.txt'
phoneme_feature_path = f'data/hw/{feature_type}_features.txt'
# NOTE(review): ngram_feature_path is assigned here but not passed to any call
# in this cell, so editing it has no effect as written. Presumably the HW scorer
# derives the same default location from feature_type -- confirm, and pass it
# explicitly if the scorer accepts such an argument.
ngram_feature_path = f'data/hw/{feature_type}_feature_weights.txt'

print(f'Loading lexicon from:\t{lexicon_path}')
# Only lexicon entries with between 2 and 5 segments are requested.
dataset = datasets.load_lexicon(lexicon_path, min_length=2, max_length=5)

# Scorer whose _featurize() is used below to turn encoded items into feature vectors.
mf_scorer = scorers.MeanFieldScorer(
    dataset,
    feature_type=feature_type,
    phoneme_feature_file=phoneme_feature_path,
)

# Scorer that backs the oracle.
hw_scorer = scorers.HWScorer(
    dataset,
    feature_type=feature_type,
    phoneme_feature_file=phoneme_feature_path,
)

# Oracle that labels encoded items via informant.judge().
informant = informants.HWInformant(dataset, hw_scorer)

Loading lexicon from:	data/hw/atr_four_lexicon.txt
Loading lexicon with min_length=2, max_length=5...
Reading phoneme features from: data/hw/atr_four_features.txt
# features:  2744
feature type:  atr_four
Reading phoneme features from: data/hw/atr_four_features.txt
Loading ngram features from: data/hw/atr_four_feature_weights.txt


## Mean number of nonzero features per label across the full eval dataset

In [17]:
# Read the evaluation items that will be labelled and featurized below.
# Change this path to use a different eval dataset.
# NOTE(review): unlike the lexicon/feature files (under data/hw/), this path has
# no directory component, so it is resolved relative to the current working
# directory -- confirm the notebook is run from where the test set lives.
eval_dataset_path = f'{feature_type}_test_set.txt'
print(f'Reading eval items from:\t{eval_dataset_path}')
items = read_in_blicks(eval_dataset_path)

Reading eval items from:	atr_four_test_set.txt


In [18]:
# Pad every item with a boundary symbol on both sides.
phonemes = [[BOUNDARY] + blick + [BOUNDARY] for blick in items]
# Map the padded symbol sequences to their encoded form via the dataset vocab.
encoded_items = list(map(dataset.vocab.encode, phonemes))
# Ask the HW oracle for a judgement on each encoded item.
labels = list(map(informant.judge, encoded_items))
# Keep only the indices of the nonzero features of each item.
featurized_items = [mf_scorer._featurize(enc).nonzero()[0] for enc in encoded_items]
# Number of nonzero features per item.
num_features = list(map(len, featurized_items))

# Collect everything into one table, one row per eval item.
eval_dataset = pd.DataFrame(
    dict(
        item=items,
        label=labels,
        encoded=encoded_items,
        featurized=featurized_items,
        num_features=num_features,
    )
)

In [19]:
# Show the assembled table: one row per eval item with its oracle label,
# encoded form, nonzero feature indices, and nonzero feature count.
display(eval_dataset)

Unnamed: 0,item,label,encoded,featurized,num_features
0,"[g, A, I, i]",True,"(0, 1, 3, 9, 7, 0)","[222, 250, 278, 292, 309, 311, 313, 314, 317, ...",114
1,"[H, A, A, H, A, f]",True,"(0, 5, 3, 3, 5, 3, 8, 0)","[308, 311, 313, 315, 316, 317, 319, 700, 703, ...",66
2,"[F, f, F, f]",True,"(0, 6, 8, 6, 8, 0)","[0, 3, 5, 7, 9, 10, 11, 12, 42, 45, 47, 49, 51...",379
3,"[A, f, A, f]",True,"(0, 3, 8, 3, 8, 0)","[112, 115, 117, 119, 121, 123, 700, 703, 705, ...",54
4,"[A, F, A, i, A]",True,"(0, 3, 6, 3, 7, 3, 0)","[113, 115, 117, 118, 121, 123, 320, 701, 703, ...",57
...,...,...,...,...,...
1779,"[A, A, A, g, f]",False,"(0, 3, 3, 3, 1, 8, 0)","[208, 250, 278, 306, 334, 362, 404, 446, 474, ...",80
1780,"[G, g, H, h]",False,"(0, 2, 1, 5, 4, 0)","[211, 213, 214, 217, 219, 220, 221, 222, 225, ...",429
1781,"[i, G, i, A]",False,"(0, 7, 2, 7, 3, 0)","[211, 213, 215, 216, 218, 219, 221, 225, 227, ...",294
1782,"[A, F, h, H, F, G]",False,"(0, 3, 6, 4, 5, 6, 2, 0)","[15, 17, 18, 21, 23, 24, 26, 40, 43, 45, 46, 4...",565


In [20]:
# Report the average nonzero-feature count separately for items the oracle
# labelled True and for those it labelled False.
print('Mean num features by label:')
for label in (True, False):
    has_label = eval_dataset['label'] == label
    mean_num_features = eval_dataset.loc[has_label, 'num_features'].mean()
    print(f'{label}:\t{mean_num_features}')

Mean num features by label:
True:	117.55381165919283
False:	372.87668161434976


## Number of nonzero features in an individual sequence

In [21]:
def get_num_features_in_seq(seq, scorer, dataset):
    """Count the nonzero features that `scorer` assigns to a single sequence.

    Args:
        seq: Whitespace-separated symbols, e.g. ``'g A I i'``.
        scorer: Object exposing ``_featurize(encoded)``; the returned value must
            support ``.nonzero()``, whose first element holds the nonzero indices.
        dataset: Object whose ``vocab.encode`` converts the list of symbols
            (with boundary symbols added) into the form ``scorer`` expects.

    Returns:
        int: The number of nonzero entries in the sequence's feature vector.
    """
    # split() with no argument ignores leading/trailing whitespace and collapses
    # repeated spaces, which split(' ') would turn into empty-string symbols.
    symbols = seq.split()
    phonemes = [BOUNDARY] + symbols + [BOUNDARY]
    encoded = dataset.vocab.encode(phonemes)
    # Featurize with the scorer that was passed in rather than a notebook global,
    # so the function honours its argument and works with any scorer.
    nonzero_indices = scorer._featurize(encoded).nonzero()[0]
    return len(nonzero_indices)

In [22]:
# Example: count the nonzero features for one space-separated sequence. This is
# the same item as row 0 of the eval table above, so the counts should agree.
get_num_features_in_seq('g A I i', mf_scorer, dataset)

114