In [4]:
from sympy.solvers import solve
from sympy import Symbol, log, exp
import sympy as sp

In [5]:
def get_prior(target_palloff, num_features=500):
    x = Symbol('x')
    eq = sp.Eq(num_features*log(1-x), log(target_palloff))
    sol = solve(eq, x)
    assert len(sol) == 1
    return sol[0]

In [6]:
prior = get_prior(0.4)

In [7]:
print(prior)

0.00183090331161104


In [8]:
import math
def sigmoid(x):
    return 1 / (1 + exp(-x))

def derive_params(prior_palloff_calibrate, target_palloff, label=True, num_features_observe=500, num_features_calibrate=500):
    # calculate prior prob of thetas
    prior = get_prior(prior_palloff_calibrate, num_features=num_features_calibrate)
    
#     print(prior)
    
    # calculate posterior prob of thetas based on target p_all_off_observes
    target_posterior = get_prior(target_palloff, num_features=num_features_observe)
    
    # calculate prior p_all_off_observe, used in updates
    prior_palloff_observe = math.pow((1-prior), num_features_observe)
#     prior_palloff_observe = math.pow(prior_palloff_calibrate, num_features_observe-num_features_calibrate)
#     print("prior p all off observe:", prior_palloff_observe)
#     print("prior:", prior)
#     print("posterior p all off observe:", target_palloff, (1-target_posterior)**num_features_observe)
        
    log_alpha = Symbol('log_alpha')
    # if label is True, subtract; else, add
    if label:
        term = -1*prior_palloff_observe*log_alpha
    else:
        term = prior_palloff_observe*log_alpha
        
#     print("term:", term)
#     print(log(sigmoid(log(prior)-log(1-prior)+term)))
#     print(log(target_posterior))

#     print(target_posterior)
    
#     eq = sp.Eq(sigmoid(log(prior)-log(1-prior)+term), target_posterior)
    eq = sp.Eq(log(sigmoid(log(prior)-log(1-prior)+term)), log(target_posterior))
    sol = solve(eq, log_alpha)
    
#     assert len(sol) == 1, sol
    sol = sol[0]
    
    lla = log(sol)
    
    return prior, lla

## Get params for English

In [29]:
import pandas as pd
words_with_feats_eng = pd.read_csv('words_with_feats_english.csv')
avg_num_feats = words_with_feats_eng['NumFeatures'].mean()
print(f'avg # feats: {avg_num_feats}')

avg # feats: 1797.6219567177639


**Get hyperparams for positive updates (unif, eig_train_model)**

In [9]:
num_features = 500 # our estimate for num features in a random sequence (TODO: because that's what's in the AUC?)
for prior_palloff in [0.5, 0.3, 0.1]:
    print("")
    print(f"prior palloff: {prior_palloff}")
    for posterior_palloff in [0.4, 0.3, 0.2, 0.1, 0.05, 0.01]:
        if posterior_palloff >= prior_palloff:
            continue
        
        prior, lla = derive_params(prior_palloff, posterior_palloff, label=False, 
                                   num_features_observe=num_features,
                                   num_features_calibrate=500,
                                  )

        print(f'prior={prior}\tlla={lla}\t\t(posterior_palloff={posterior_palloff})')


prior palloff: 0.5
prior=0.00138533389897108	lla=-0.582269524436029		(posterior_palloff=0.4)
prior=0.00138533389897108	lla=0.100117997493526		(posterior_palloff=0.3)
prior=0.00138533389897108	lla=0.522731931474557		(posterior_palloff=0.2)
prior=0.00138533389897108	lla=0.877263465295860		(posterior_palloff=0.1)
prior=0.00138533389897108	lla=1.07568861873667		(posterior_palloff=0.05)
prior=0.00138533389897108	lla=1.33374133882702		(posterior_palloff=0.01)

prior palloff: 0.3
prior=0.00240504883318384	lla=-0.0316148863778716		(posterior_palloff=0.2)
prior=0.00240504883318384	lla=0.772427968403264		(posterior_palloff=0.1)
prior=0.00240504883318384	lla=1.11334210926109		(posterior_palloff=0.05)
prior=0.00240504883318384	lla=1.50033511229104		(posterior_palloff=0.01)

prior palloff: 0.1
prior=0.00459458264847305	lla=0.970210628375159		(posterior_palloff=0.05)
prior=0.00459458264847305	lla=1.93939240693089		(posterior_palloff=0.01)


**Train CALIBRATED: get hyperparams for positive updates (train) -- using 500 as num features to calculate prior**

In [11]:
num_features = 1798 # our estimate for num features in a train sequence
for prior_palloff in [0.5, 0.3, 0.1]:
    print("")
    print(f"prior palloff: {prior_palloff}")
    for posterior_palloff in [0.8, 0.9, 0.95, 0.99]:
        if posterior_palloff <= prior_palloff:
            continue
        
        prior, lla = derive_params(prior_palloff, posterior_palloff, label=True, 
                                   num_features_observe=num_features,
                                   num_features_calibrate=500,
                                  )

        print(f'prior={prior}\tlla={lla}\t\t(posterior_palloff={posterior_palloff})')


prior palloff: 0.5
prior=0.00138533389897108	lla=3.37379282024428		(posterior_palloff=0.8)
prior=0.00138533389897108	lla=3.64450190689220		(posterior_palloff=0.9)
prior=0.00138533389897108	lla=3.84947003210775		(posterior_palloff=0.95)
prior=0.00138533389897108	lla=4.19987456939477		(posterior_palloff=0.99)

prior palloff: 0.3
prior=0.00240504883318384	lla=5.41687946870128		(posterior_palloff=0.8)
prior=0.00240504883318384	lla=5.64240070601870		(posterior_palloff=0.9)
prior=0.00240504883318384	lla=5.81942728026585		(posterior_palloff=0.95)
prior=0.00240504883318384	lla=6.13231740872503		(posterior_palloff=0.99)

prior palloff: 0.1


KeyboardInterrupt: 

**Train NOT CALIBRATED: get hyperparams for positive updates (train) -- using 1798 as num features to calculate prior**

In [12]:
num_features = 1798 # our estimate for num features in a train sequence
for prior_palloff in [0.5, 0.3, 0.1]:
    print("")
    print(f"prior palloff: {prior_palloff}")
    for posterior_palloff in [0.8, 0.9, 0.95, 0.99]:
        if posterior_palloff <= prior_palloff:
            continue
        
        prior, lla = derive_params(prior_palloff, posterior_palloff, label=True, 
                                   num_features_observe=num_features,
                                   num_features_calibrate=num_features,
                                  )

        print(f'prior={prior}\tlla={lla}\t\t(posterior_palloff={posterior_palloff})')


prior palloff: 0.5
prior=0.000385435812071119	lla=0.818508339363158		(posterior_palloff=0.8)
prior=0.000385435812071119	lla=1.32655384099565		(posterior_palloff=0.9)
prior=0.000385435812071119	lla=1.65014245712180		(posterior_palloff=0.95)
prior=0.000385435812071119	lla=2.13625332091839		(posterior_palloff=0.99)

prior palloff: 0.3
prior=0.000669393655992854	lla=1.72623647509002		(posterior_palloff=0.8)
prior=0.000669393655992854	lla=2.09445314096582		(posterior_palloff=0.9)
prior=0.000669393655992854	lla=2.35332338102534		(posterior_palloff=0.95)
prior=0.000669393655992854	lla=2.76969036320956		(posterior_palloff=0.99)

prior palloff: 0.1
prior=0.00127981720494661	lla=3.15040457368557		(posterior_palloff=0.8)
prior=0.00127981720494661	lla=3.42914027054695		(posterior_palloff=0.9)
prior=0.00127981720494661	lla=3.63886266492048		(posterior_palloff=0.95)
prior=0.00127981720494661	lla=3.99541135890715		(posterior_palloff=0.99)


## Get params for atr_harmony

**Get num features in atr train sequences**

In [47]:
import pandas as pd
import datasets
import scorers
import numpy as np

# get num features in atr lexicon, avg

# with open('data/hw/atr_harmony_lexicon.txt', 'r') as file:
#     lines = file.read().splitlines()
    
dataset = datasets.load_lexicon('data/hw/atr_harmony_lexicon.txt', min_length=2, max_length=5)
mf_scorer = scorers.MeanFieldScorer(dataset, feature_type='atr_harmony')

num_features_lexicon = [len(mf_scorer._featurize(seq).nonzero()[0]) for seq in dataset.data]
avg_num_features_lexicon = np.mean(num_features_lexicon)
print("avg # features in word in lexicon", avg_num_features_lexicon)


Loading lexicon with min_length=2, max_length=5...
# features:  512
avg # features in word in lexicon 29.286624203821656


In [48]:
# get num features in atr auc test set, avg

with open('test_set.csv', 'r') as file:
    lines = file.read().splitlines()
        
featurized = [mf_scorer._featurize(dataset.vocab.encode(['$'] + item.strip().split() + ['$'])).nonzero()[0] for item in lines]
num_features_eval = [len(feat) for feat in featurized]
avg_num_features_eval = np.mean(num_features_eval)
print("avg # features in word in eval set", avg_num_features_eval)

avg # features in word in eval set 40.68217821782178


In [49]:
# sanity checking that loading it as dataset gets same answer

# dataset = datasets.load_lexicon('test_set.csv', min_length=2, max_length=5)
# mf_scorer = scorers.MeanFieldScorer(dataset, feature_type='atr_harmony')

# num_features_lexicon = [len(mf_scorer._featurize(seq).nonzero()[0]) for seq in dataset.data]
# avg_num_features_lexicon = np.mean(num_features_lexicon)
# print("avg # features in word in lexicon", avg_num_features_lexicon)


**Get hyperparams for positive updates (unif, eig_train_model)**

In [54]:
num_features = 41 # our estimate for num features in a random sequence in the AUC eval set AND for observed sequences
# TODO: for eig_train_model, will see some lexicon num features too -- use average?

for prior_palloff in [0.5, 0.3, 0.1]:
    print("")
    print(f"prior palloff: {prior_palloff}")
    for posterior_palloff in [0.4, 0.3, 0.2, 0.1, 0.05, 0.01]:
        if posterior_palloff >= prior_palloff:
            continue
        
        prior, lla = derive_params(prior_palloff, posterior_palloff, label=False, 
                                   num_features_observe=num_features,
                                   num_features_calibrate=num_features,
                                  )

        print(f'prior={prior}\tlla={lla}\t\t(posterior_palloff={posterior_palloff})')


prior palloff: 0.5
prior=0.0167639238265123	lla=-0.573334159110551		(posterior_palloff=0.4)
prior=0.0167639238265123	lla=0.110455420091745		(posterior_palloff=0.3)
prior=0.0167639238265123	lla=0.534883289670680		(posterior_palloff=0.2)
prior=0.0167639238265123	lla=0.892237487991269		(posterior_palloff=0.1)
prior=0.0167639238265123	lla=1.09325962564821		(posterior_palloff=0.05)
prior=0.0167639238265123	lla=1.35682118741570		(posterior_palloff=0.01)

prior palloff: 0.3
prior=0.0289382226833754	lla=-0.0160236704849707		(posterior_palloff=0.2)
prior=0.0289382226833754	lla=0.791330322672277		(posterior_palloff=0.1)
prior=0.0289382226833754	lla=1.13526472219270		(posterior_palloff=0.05)
prior=0.0289382226833754	lla=1.52860419116721		(posterior_palloff=0.01)

prior palloff: 0.1
prior=0.0546127168692061	lla=0.999529409052411		(posterior_palloff=0.05)
prior=0.0546127168692061	lla=1.97633083575154		(posterior_palloff=0.01)


**Get hyperparams for train (calibrated)**

In [53]:
num_features = 41 # our estimate for num features in a random sequence in the AUC eval set AND for observed sequences
# TODO: for eig_train_model, will see some lexicon num features too -- use average?
num_features_lexicon = 29

for prior_palloff in [0.5, 0.3, 0.1]:
    print("")
    print(f"prior palloff: {prior_palloff}")
    for posterior_palloff in [0.4, 0.3, 0.2, 0.1, 0.05, 0.01]:
        if posterior_palloff >= prior_palloff:
            continue
        
        prior, lla = derive_params(prior_palloff, posterior_palloff, label=False, 
                                   num_features_observe=num_features_lexicon,
                                   num_features_calibrate=num_features,
                                  )

        print(f'prior={prior}\tlla={lla}\t\t(posterior_palloff={posterior_palloff})')


prior palloff: 0.5
prior=0.0167639238265123	lla=0.0325829043493283		(posterior_palloff=0.4)
prior=0.0167639238265123	lla=0.396821985827971		(posterior_palloff=0.3)
prior=0.0167639238265123	lla=0.679312529188839		(posterior_palloff=0.2)
prior=0.0167639238265123	lla=0.946635246593759		(posterior_palloff=0.1)
prior=0.0167639238265123	lla=1.10740888893872		(posterior_palloff=0.05)
prior=0.0167639238265123	lla=1.32836781944116		(posterior_palloff=0.01)

prior palloff: 0.3
prior=0.0289382226833754	lla=0.420335505817348		(posterior_palloff=0.2)
prior=0.0289382226833754	lla=0.871320405440002		(posterior_palloff=0.1)
prior=0.0289382226833754	lla=1.11026707059686		(posterior_palloff=0.05)
prior=0.0289382226833754	lla=1.41323884523369		(posterior_palloff=0.01)

prior palloff: 0.1


KeyboardInterrupt: 