In [51]:
from main import read_in_blicks, BOUNDARY
import scorers
import datasets
import informants
import random

In [52]:
import pandas as pd
import numpy as np

In [92]:
feature_type = 'atr_harmony'

## Load dataset, scorers, and oracle


In [93]:
# Change these paths if you want to specify a different set of features
lexicon_path = f'data/hw/{feature_type}_lexicon.txt'
phoneme_feature_path = f'data/hw/{feature_type}_features.txt'
ngram_feature_path = f'data/hw/{feature_type}_feature_weights.txt'

print(f'Loading lexicon from:\t{lexicon_path}')
dataset = datasets.load_lexicon(lexicon_path, min_length=2, max_length=5)

mf_scorer = scorers.MeanFieldScorer(dataset, 
                                    feature_type=feature_type, 
                                    phoneme_feature_file=phoneme_feature_path,
                                   )
hw_scorer = scorers.HWScorer(dataset, 
                                    feature_type=feature_type, 
                                    phoneme_feature_file=phoneme_feature_path,
                            )

# Load oracle
informant = informants.HWInformant(dataset, hw_scorer)



Loading lexicon from:	data/hw/atr_harmony_lexicon.txt
Loading lexicon with min_length=2, max_length=5...
Reading phoneme features from: data/hw/atr_harmony_features.txt
# features:  512
feature type:  atr_harmony
Reading phoneme features from: data/hw/atr_harmony_features.txt
Loading ngram features from: data/hw\atr_harmony_feature_weights.txt


# Generate data

Here's a function to generate a random string of syllables

In [163]:


def make_words(syllables,num_words,list_to_exclude=[]):
    word_list = []
    while len(word_list) < num_words:
        syllable_list = []
        length = np.random.poisson(2)
        for _ in range(length) if length > 0 else range(1):
            next_syllable = random.choice(syllables)
            syllable_list.append(next_syllable)
        word = " ".join(syllable_list)
        if word not in word_list and word not in list_to_exclude:
            word_list.append(word)
    temp_out = open("atr_harmony_TEMP.txt","w")
    for item in word_list:
        temp_out.write(item+"\n")
    temp_out.close()
    eval_dataset_path = f'{feature_type}_TEMP.txt'
    print(f'Reading eval items from:\t{eval_dataset_path}')
    items = read_in_blicks(eval_dataset_path)
    print(items)
    return items # returns a list of lists of syllables, ex.:[['ti'], ['qe', 'ka', 'qE'], ['ki', 'qI', 'qi', 'qa', 'qi', 'ke', 'ta'], ...]

def classify_word(list_of_forbidden_features,word):
    phonemes = [BOUNDARY] + word + [BOUNDARY]
    # Encode items
    encoded_item = dataset.vocab.encode(phonemes)
# Get labels with HW oracle
    print(encoded_item)
    features = mf_scorer._featurize(encoded_item).nonzero()[0]
    print(features)
    #assert False
    print(set.intersection(set(features), set(list_of_forbidden_features)))
    #assert False
    if len(set.intersection(set(features), set(list_of_forbidden_features))) != 0:
        return False
    else:
        return True

def make_a_train_and_test_set(num_words,list_of_forbidden_features,syllables):
    words = make_words(syllables, num_words)
    print("in make train adn test, here are words",words)
    goods = []
    bads = []
    for word in words: # word is a list like ['ti'] or  ['qe', 'ka', 'qE'], etc.
        if classify_word(list_of_forbidden_features,word):
            goods.append((" ".join(word),True))
        else:
            bads.append((" ".join(word),False))
    print(goods)
    #assert False
    random.shuffle(goods)
    train = goods[:len(goods)//2]
    test_goods = goods[len(goods)//2:]
    test_bads = random.sample(bads,len(test_goods))
    return train, test_goods+test_bads




def make_a_language(numerical_features,syllables,num_words,num_bad_features):
    print(numerical_features)
    bad_features = random.sample(numerical_features,num_bad_features)
    print(bad_features)
    
    train, test = make_a_train_and_test_set(num_words, bad_features,syllables)
    return train, test, bad_features

def write_out_a_language(train, test, bad_features, seed):
    t = open("./ProcGenLgs/atr_lg_with_seed_"+str(seed)+".csv","w")
    t.write("Word,Set,Status,BadFeatures,Seed\n")
    for item in train:
        t.write(str(item[0])+',Train,True,'+str(bad_features)+","+str(seed)+'\n')
    for item in test:
        print(item)
        print(type(item))
        #assert False
        t.write(str(item[0])+',Test,'+str(item[1])+","+str(bad_features)+","+str(seed)+'\n')
    t.close()
    return



def main(num_words, num_bad_features,numerical_features,syllables,num_languages):
    for i in range(num_languages):
        seed = i
        random.seed(seed)
        train, test, bad_features = make_a_language(numerical_features,syllables, num_words, num_bad_features)
        print(train)
        write_out_a_language(train, test, bad_features, seed)
        

numerical_features = [i for i in mf_scorer.ngram_features.values()]
syllables = [
    'pa',
'ta',
'ka',
'pi',
'ti',
'ki',
'pe',
'te',
'ke',
'pE',
'tE',
'kE',
'pI',
'tI',
'kI',
'qI',
'qi',
'qe',
'qE',
'qa'

]


num_words = 10
num_bad_features = 18

num_languages = 10

main(num_words,num_bad_features,numerical_features,syllables,num_languages)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

ValueError: Sample larger than population or is negative

## Getting mean features by labels across full eval dataset

In [95]:
# Read in items to featurize
# Change this path if you want to specify a different eval dataset
eval_dataset_path = f'{feature_type}_test_set.txt'
print(f'Reading eval items from:\t{eval_dataset_path}')
items = read_in_blicks(eval_dataset_path)

Reading eval items from:	atr_harmony_test_set.txt


FileNotFoundError: [Errno 2] No such file or directory: 'atr_harmony_test_set.txt'

In [107]:
# Get phonemes
phonemes = [[BOUNDARY] + item + [BOUNDARY] for item in items]
# Encode items
encoded_items = [dataset.vocab.encode(phon) for phon in phonemes]
# Get labels with HW oracle
labels = [informant.judge(encod) for encod in encoded_items]
# Featurize items
featurized_items = [mf_scorer._featurize(encod).nonzero()[0] for encod in encoded_items]
# Get num features
num_features = [len(f) for f in featurized_items]

# Get dataframe of results
eval_dataset = pd.DataFrame({
    'item': items,
    'label': labels,
    'encoded': encoded_items,
    'featurized': featurized_items,
    'num_features': num_features,
})

KeyError: 'g'

In [15]:
display(eval_dataset)

Unnamed: 0,item,label,encoded,featurized,num_features
0,"[g, A, I, i]",True,"(0, 1, 3, 9, 7, 0)","[222, 250, 278, 292, 309, 311, 313, 314, 317, ...",114
1,"[H, A, A, H, A, f]",True,"(0, 5, 3, 3, 5, 3, 8, 0)","[308, 311, 313, 315, 316, 317, 319, 700, 703, ...",66
2,"[F, f, F, f]",True,"(0, 6, 8, 6, 8, 0)","[0, 3, 5, 7, 9, 10, 11, 12, 42, 45, 47, 49, 51...",379
3,"[A, f, A, f]",True,"(0, 3, 8, 3, 8, 0)","[112, 115, 117, 119, 121, 123, 700, 703, 705, ...",54
4,"[A, F, A, i, A]",True,"(0, 3, 6, 3, 7, 3, 0)","[113, 115, 117, 118, 121, 123, 320, 701, 703, ...",57
...,...,...,...,...,...
1779,"[A, A, A, g, f]",False,"(0, 3, 3, 3, 1, 8, 0)","[208, 250, 278, 306, 334, 362, 404, 446, 474, ...",80
1780,"[G, g, H, h]",False,"(0, 2, 1, 5, 4, 0)","[211, 213, 214, 217, 219, 220, 221, 222, 225, ...",429
1781,"[i, G, i, A]",False,"(0, 7, 2, 7, 3, 0)","[211, 213, 215, 216, 218, 219, 221, 225, 227, ...",294
1782,"[A, F, h, H, F, G]",False,"(0, 3, 6, 4, 5, 6, 2, 0)","[15, 17, 18, 21, 23, 24, 26, 40, 43, 45, 46, 4...",565


In [19]:
print('Mean num features by label:')
for label in [True, False]:
    temp = eval_dataset[eval_dataset['label']==label]
    mean_num_features = temp['num_features'].mean()
    print(f'{label}:\t{mean_num_features}')

Mean num features by label:
True:	117.55381165919283
False:	372.87668161434976


## Getting num features in individual sequence

In [10]:
def get_num_features_in_seq(seq, scorer, dataset):
    item = seq.strip().split(' ')
    phonemes = [BOUNDARY] + item + [BOUNDARY]
    encoded = dataset.vocab.encode(phonemes)
    features = mf_scorer._featurize(encoded).nonzero()[0]
    return len(features)

In [None]:
get_num_features_in_seq('g A I i', mf_scorer, dataset)