In [1]:
from nltk.corpus import stopwords
import numpy as np
from nltk.corpus import words as nltk_english_words

from lib_utils.preprocessing import TextPreProcessor
from lib_utils.expectation_maximization import EM_SSL



# Set tokens to remove for all text preprocessing
remove_zero_vocab_docs = True
english_vocab = set(nltk_english_words.words())
english_vocab = None
_tokens_to_remove = stopwords.words('english')

In [2]:
# Fix static preprocessed data
# original article suggests 10k fixed unlabeled samples
processor = TextPreProcessor(n_labeled_train_samples=100,
    n_unlabeled_train_samples=500,
                                    tokens_to_remove=_tokens_to_remove,
                                    remove_zero_vocab_docs=remove_zero_vocab_docs,
                                    english_vocab=english_vocab)
# Initialize raw
processor.set_static_full_train_raw_data()
processor.set_static_raw_unlabeled_data()
processor.set_static_raw_test_data()

full train raw data shape: (11314,)
min, max full train data label vals = 0, 19
unlabeled train data shape: (1000,)
num avail train indices complement to unlabeled: 10314


In [3]:
processor.set_sample_raw_train_data()
sample_sent = processor.labeled_train_data_sample[0]
print('sample labeled trained sentence\n', sample_sent)
processed_sent = processor.process_documents_text(sample_sent)
print('processed sentence\n', processed_sent)

doc_array_sample = processor.labeled_train_data_sample[:2]
processed_docs = processor.process_documents_text(doc_array_sample)  # List[str]


num train samples to select: 100
at set_sample_raw_train_data
size: (100,)
sample labeled trained sentence
 
Another dodge.  Oh well.  I'm no match for your amazing repertoire
of red herrings and smoke screens.  

You asked for an apology.  I'm not going to apologize for pointing out
that your straw-man argument was a straw-man argument.  Nor for saying
that your list of "bible contradictions" shows such low standards of
scholarship that it should be an embarrassment to anti-inerrantists,
just as Josh McDowell should be an embarrassment to the fundies.  Nor
for objecting various times to your taking quotes out of context.  Nor
for pointing out that "they do it too" is not an excuse. Nor for calling
your red herrings and smoke screens what they are.

I'm still not sure why you think I'm a hypocrite.  It's true that I
haven't responded to any of Robert Weiss' articles, which may be due in
part to the fact that I almost never read his articles.  But I have
responded to both you and Frank 

In [4]:
# doc-to-vect based on train sample's count vectorizer
processor.set_labeled_train_sample_count_data()
print('labeled_train_sample_count_data shape:', processor.labeled_train_sample_count_data.shape)
processor.set_unlabeled_count_data()
print('unlabeld train count_data shape:', processor.unlabeled_count_data.shape)
processor.set_test_count_data()



labeled train sample count data before zero doc removal shape (100, 5400)
Removing zero vocab docs from labeled train sample.
After zero count doc removal:
 Kept 95 samples from original 11314 train
labeled_train_sample_count_data shape: (95, 5400)
got data=unlabeled, shape= (1000,)
unlabeled count data shape (1000, 5400)
unlabeld train count_data shape: (1000, 5400)
test count data shape (7532, 5400)


In [5]:
# CHECK TRAIN SET VOCAB
print(len(processor.vocab))
print(processor.vocab[:10])
print(processor.vocab[-10:])
short_vocab_text = [v for v in processor.vocab if len(v) <=2]
print(short_vocab_text)

5400
['aau', 'abc', 'abcdefghijklmnopqrstuvwxyz', 'ability', 'able', 'abortion', 'absolute', 'absolutism', 'abuse', 'abusers']
['yourhost', 'yourname', 'ysu', 'za', 'zabriskie', 'zeus', 'zia', 'zimbabve', 'zogwarg', 'zone']
['ac', 'ad', 'ah', 'ai', 'al', 'au', 'aw', 'ax', 'bc', 'ca', 'cc', 'cd', 'cf', 'ch', 'cm', 'co', 'cs', 'cu', 'cv', 'cz', 'db', 'dc', 'de', 'dg', 'di', 'dk', 'dr', 'ds', 'du', 'dx', 'ec', 'ed', 'ee', 'eg', 'eq', 'er', 'et', 'fd', 'fi', 'fr', 'ft', 'gc', 'ge', 'gm', 'go', 'hh', 'hi', 'hl', 'hp', 'hz', 'id', 'im', 'io', 'jz', 'la', 'ld', 'le', 'lp', 'mh', 'mr', 'ms', 'nd', 'ne', 'nl', 'ns', 'nt', 'ob', 'oh', 'oj', 'ok', 'os', 'pa', 'pm', 'ps', 'rh', 'rs', 'sa', 'sf', 'sh', 'sq', 'st', 'sw', 'sy', 'th', 'tm', 'tn', 'tr', 'tv', 'tx', 'ty', 'uh', 'ui', 'uk', 'un', 'us', 'uu', 'va', 'vm', 'vs', 'wc', 'wd', 'wu', 'xa', 'xc', 'xl', 'xt', 'xv', 'xz', 'ya', 'za']


In [6]:
# LABELED TRAIN COUNT DATA STATS
processor.get_train_doc_lengths()
print(processor.med_doc_len)
print(processor.max_doc_len)


42.0
4550


In [7]:

# scale count data to trains' unif doc len
scaled_labeled_train_sample_data = processor.make_uniform_doc_lens(word_count_data=processor.labeled_train_sample_count_data,
                                                                  strategy='max')
scaled_unlabeled_data = processor.make_uniform_doc_lens(word_count_data=processor.unlabeled_count_data,
                                                        strategy='max')
scaled_test_data = processor.make_uniform_doc_lens(word_count_data=processor.test_count_data,
                                                   strategy='max')



In [8]:
print(scaled_labeled_train_sample_data.shape)
print(np.sum(scaled_labeled_train_sample_data, axis=processor.vocab_axis))
print(scaled_unlabeled_data.shape)
print(np.sum(scaled_unlabeled_data, axis=processor.vocab_axis))

(95, 5400)
[4510.77586207 4282.35294118 4265.625      4514.453125   4412.12121212
 4381.48148148 4387.5        4510.0877193  4510.43478261 4427.02702703
 4095.         4333.33333333 4265.625      4265.625      4310.52631579
 4439.02439024 4387.5        4548.83780332 4265.625      3791.66666667
 3640.         4225.         4310.52631579 4459.         4548.70775348
 4472.88135593 4455.20833333 4527.5862069  4527.47524752 4044.44444444
 4481.06060606 4044.44444444 4542.26190476 4529.86725664 3981.25
 4468.75       4536.33633634 4530.63829787 4439.02439024 4412.12121212
 4478.90625    4282.35294118 4477.77777778 4496.47058824 4451.08695652
 4474.16666667 4368.         4246.66666667 4200.         4503.09278351
 4352.17391304 3981.25       4494.51219512 4549.00021973 4490.13157895
 4477.77777778 4475.40983607 4398.33333333 4487.67123288 4464.1509434
 4482.08955224 4457.14285714 4444.18604651 4333.33333333 4483.08823529
 4368.         4412.12121212 4465.74074074 4441.66666667 4453.19148936
 4

In [9]:
# train
model = EM_SSL(labeled_count_data=scaled_labeled_train_sample_data,
               label_vals=processor.train_sample_label_vals,
               unlabeled_count_data=scaled_unlabeled_data,
               max_em_iters=2,
               min_em_loss_delta=2e-4)

#model.fit()
model.initialize_EM()  # only runs M step (compute thetas) on labeled train samples


labeled train sample has 20 unique labels
Checking initail M step on only labeled train data...
Congrats, initial M step assertions passed.


In [10]:
print(model.label_set)
print(model.word_counts_per_class[0].mean())

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
5.6870573871153685


In [11]:
print(model.total_word_count_per_class.shape)

(20,)


In [39]:
# problem to solve, when computing: labeled loss log (P(yi = cj|θ) .P(xi|yi = cj; θ)) ,
# P(xi|yi = cj; θ) -> 0 => log() -> nan
# and the reason P(xi|yi = cj; θ) -> 0 is bc theta_j_vocab -> 0 => theta_j_vocab^w_t -> 0 for w_t > 0
# Recall  P(xi|yi = cj; θ) = product_t \theta_j_t^ w_t
## So log(P(xi|yi = cj; θ) ) = sum_t (w_t * log(\theta_j_t())
### So just need theta_j_t > 0, likelihood improved with smaller vocab size or smaller unif. doc length

# TODO: implement above log trick for theta_j_vocab

print('currently only labeled data:', model.only_labeled_data)
single_doc = model.labeled_count_data[2]
this_true_label =  model.label_vals[2]
print('true label of single doc:', this_true_label)
# Denominators for single doc per class proba
print('Denominator terms\n')
print('total word count for class % d' % this_true_label, model.total_word_count_per_class[this_true_label])
# compute minimum proba: for words not appearing in class
min_expected_proba = (1 / (model.vocab_size + model.total_word_count_per_class[this_true_label]))
print('min expected proba:', min_expected_proba)
print('log of min expected proba', np.log(min_expected_proba))
print('Numerator terms\n')
# Numerators for single doc per class proba
print('labeled max word counts for class %d:' % this_true_label, model.labeled_word_counts_per_class[this_true_label].max())
print('single doc word count min, avg:', single_doc.min(), single_doc.mean())
print('theta_j_vocab min for class %d:' % this_true_label, model.theta_j_vocab_per_class[this_true_label].min())


currently only labeled data: False
true label of single doc: 2
Denominator terms

total word count for class  2 29460.3029520076
min expected proba: 2.8685923968495235e-05
log of min expected proba -10.45910400957003
Numerator terms

labeled max word counts for class 2: 1516.6666666666667
single doc word count min, avg: 0.0 0.7899305555555556
theta_j_vocab min for class 2: 2.8685923968495235e-05


# TESTS ON SINGLE TRAIN LABEL DOCUMENT

##  Check P(c_j | theta)

In [40]:
theta_j = model.theta_j_per_class  # per class: (n_docs_in_class + 1) / (self.n_docs + self.n_labels)
print('theta j shape', theta_j.shape)
print('sum theta j == 1: ', np.isclose(np.sum(theta_j, axis=0), 1.0))

theta j shape (20,)
sum theta j == 1:  True


##  FIX: P(x_i | c_j, theta)

In [41]:
# FIX: P(x_i | c_j, theta) -> {theta}_{tj} =: theta_j_vocab -> non-nan computations
X_log = np.log(model.theta_j_vocab_per_class)
print('shape of log_prob(w_t | cj):', X_log.shape)
print('log_prob(w_t | cj) min, mean, max')
print(X_log.min(), X_log.mean(), X_log.max())
doc_log_proba_per_class = np.array([np.sum(single_doc * np.log(model.theta_j_vocab_per_class[j]), axis=0)
                                            for j in model.ordered_labels_list])
test_doc_log_proba_per_class = model.compute_doc_log_proba_per_class(single_doc)
print('doc LOG probas per class of single do avg, min:', doc_log_proba_per_class.mean(), doc_log_proba_per_class.min())
print('TEST doc LOG probas per class of single do avg, min:', test_doc_log_proba_per_class.mean(), test_doc_log_proba_per_class.min())
print('argmax LOG proba class', np.argmax(doc_log_proba_per_class))
print('TEST argmax LOG proba class', np.argmax(test_doc_log_proba_per_class))

# fixed_doc_proba_per_class = np.exp(doc_log_proba_per_class)
# print('FIXED doc probas per class of single do avg, min:', fixed_doc_proba_per_class.mean(), fixed_doc_proba_per_class.min())



shape of log_prob(w_t | cj): (20, 5400)
log_prob(w_t | cj) min, mean, max
-10.71499899360082 -9.893558274134527 -3.1341746629033578
doc LOG probas per class of single do avg, min: -39054.46717747797 -41867.52672056465
TEST doc LOG probas per class of single do avg, min: -39054.46717747797 -41867.52672056465
argmax LOG proba class 2
TEST argmax LOG proba class 2


In [68]:
print('shape of test doc log probas per class', test_doc_log_proba_per_class.shape)
#print('log sum', test_doc_log_proba_per_class.sum())
#def compute_sum_of_logs_from_log_prbas(log_probas):
#"""For single doc x, compute denom: log P(x|theta), without yet using P(c_j|theta)"""
log_probas = np.copy(test_doc_log_proba_per_class)
# log sum exp trick: see Murphy
max_log = np.max(log_probas)
summand = np.sum([np.exp(a - max_log) for a in log_probas])
log_of_sums = np.log(summand) + max_log  #+ min_expected_proba # log(denom)
print('log of sums', log_of_sums)
test_log_probas_normalized  = log_probas - log_of_sums
print('test log probas', test_log_probas_normalized)
print('dtype test log probas', test_log_probas_normalized.dtype)
test_probas_normalized = np.exp(test_log_probas_normalized)
print('test probas normalized', test_probas_normalized)
print('test probas norm idx 3', test_probas_normalized[3])
print('argmax: ', np.argmax(test_probas_normalized))


    
    

shape of test doc log probas per class (20,)
log of sums -19644.13403510143
test log probas [-22223.39268546 -21698.87314649      0.         -18432.42137385
 -21043.11890256 -17700.11815543 -21195.29590106 -22123.27663283
 -21300.09491257 -22100.01083794 -18503.69576939 -20573.46892123
 -19641.990833   -20060.16343001 -22192.23808851 -18671.49836286
 -21513.0614009  -19055.94256372 -19592.68489489 -20585.31603481]
dtype test log probas float64
test probas normalized [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
test probas norm idx 3 0.0
argmax:  2


In [64]:
# TAKE CLOSER LOOK AT FIRST THETAS ON LABELED DATA

In [44]:
model.labeled_count_data.shape

(95, 5400)

In [45]:
np.sum(model.unlabeled_this_class_probas)

0.0

In [46]:
for j in model.ordered_labels_list:
    print(model.n_labeled_docs_per_class[j])

7.0
7.0
7.0
7.0
5.0
3.0
3.0
5.0
6.0
3.0
2.0
6.0
2.0
4.0
3.0
9.0
7.0
4.0
1.0
4.0


In [47]:
model.class_mask

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [48]:
np.sum(model.class_mask)

4

In [49]:
for j in model.label_set:
    print(model.n_docs_per_class[j])

7.0
7.0
7.0
7.0
5.0
3.0
3.0
5.0
6.0
3.0
2.0
6.0
2.0
4.0
3.0
9.0
7.0
4.0
1.0
4.0


In [50]:
print(1 / model.vocab_size)
print(model.theta_j_vocab_per_class.shape)

print("word count avg, theta_j, theta_jt_avg, theta_jt_max")
for j in model.label_set:
    print("%0.2f, %0.2f, %0.2f, %0.2f" % 
          (model.labeled_word_counts_per_class[j].mean(), model.theta_j_per_class[j], 
           model.theta_j_vocab_per_class[j].mean(), model.theta_j_vocab_per_class[j].max()))
# TODO: theta_jt near zero => log(theta_jt) -> nan

0.00018518518518518518
(20, 5400)
word count avg, theta_j, theta_jt_avg, theta_jt_max
5.69, 0.07, 0.00, 0.02
5.44, 0.07, 0.00, 0.03
5.46, 0.07, 0.00, 0.04
5.76, 0.07, 0.00, 0.01
4.11, 0.05, 0.00, 0.03
2.50, 0.03, 0.00, 0.02
2.49, 0.03, 0.00, 0.03
4.07, 0.05, 0.00, 0.03
4.87, 0.06, 0.00, 0.03
2.29, 0.03, 0.00, 0.04
1.65, 0.03, 0.00, 0.03
4.94, 0.06, 0.00, 0.02
1.65, 0.03, 0.00, 0.03
3.29, 0.04, 0.00, 0.02
2.37, 0.03, 0.00, 0.04
7.34, 0.09, 0.00, 0.03
5.63, 0.07, 0.00, 0.02
3.35, 0.04, 0.00, 0.01
0.83, 0.02, 0.00, 0.04
3.15, 0.04, 0.00, 0.04


In [51]:
model.theta_j_per_class[j]

0.043478260869565216

In [52]:
print(model.word_counts_per_class[j].min(), model.word_counts_per_class[j].mean(), 
      model.word_counts_per_class[j].min(), model.word_counts_per_class[j].max(), 
    model.word_counts_per_class[j].sum())

0.0 3.154225376249928 0.0 949.2241379310345 17032.81703174961


In [53]:
model.total_word_count_per_class[j]

17032.81703174961

In [54]:
model.theta_j_vocab_per_class[j].shape

(5400,)

In [55]:
model.theta_j_vocab_per_class[j].mean(), model.theta_j_vocab_per_class[j].max()

(0.00018518518518518518, 0.042358663050929515)

In [56]:
# don't want to compute np.log(model.theta_j_vocab_per_class) directly since theta_j_vocab is sparse => underflow
#np.log(model.theta_j_vocab_per_class)

In [57]:
# out-of-sammple inference: test
pct_test_correct_preds = model.evaluate_on_data(count_data=scaled_test_data,
                                            label_vals=processor.full_test_label_vals)
print(pct_test_correct_preds)

AttributeError: 'EM_SSL' object has no attribute 'compute_doc_proba_per_class'

In [None]:
print(model.preds.dtype)
print(processor.full_test_label_vals.dtype)

In [None]:
m = model.preds
p =  processor.full_test_label_vals
print(m.shape == p.shape)
corr = m == p
print(np.sum(corr))