In [1]:
from nltk.corpus import stopwords
import numpy as np
from nltk.corpus import words as nltk_english_words

from lib_utils.preprocessing import TextPreProcessor
from lib_utils.expectation_maximization import EM_SSL



# Text-preprocessing configuration shared by the cells below.
# Forwarded to TextPreProcessor as `remove_zero_vocab_docs`.
remove_zero_vocab_docs = True
# Optional vocabulary restriction to NLTK's English word list.
# The word list used to be loaded and then immediately overwritten with None,
# so the corpus read was wasted work. The restriction stays disabled (None),
# but is now controlled by an explicit flag: set it to True to enable it.
use_english_vocab = False
english_vocab = set(nltk_english_words.words()) if use_english_vocab else None
# Tokens removed from every document: NLTK's English stop-word list.
_tokens_to_remove = stopwords.words('english')

In [2]:
# Build the preprocessor over a fixed (static) data split.
# The original article suggests 10k fixed unlabeled samples; this run uses 500.
processor = TextPreProcessor(
    n_labeled_train_samples=200,
    n_unlabeled_train_samples=500,
    tokens_to_remove=_tokens_to_remove,
    remove_zero_vocab_docs=remove_zero_vocab_docs,
    english_vocab=english_vocab,
)

# Load the raw text: full train data, unlabeled data, then test data.
processor.set_static_full_train_raw_data()
processor.set_static_raw_unlabeled_data()
processor.set_static_raw_test_data()

full train raw data shape: (11314,)
min, max full train data label vals = 0, 19
unlabeled train data shape: (500,)
num avail train indices complement to unlabeled: 10814


In [3]:
# Draw the labeled training sample, then preview text preprocessing on it.
processor.set_sample_raw_train_data()
sample_sent = processor.labeled_train_data_sample[0]
print('sample labeled trained sentence\n', sample_sent)
# A single raw document goes in; the printed result is a one-element list holding the cleaned text.
processed_sent = processor.process_documents_text(sample_sent)
print('processed sentence\n', processed_sent)

# Same call on a slice of the first two documents.
doc_array_sample = processor.labeled_train_data_sample[:2]
processed_docs = processor.process_documents_text(doc_array_sample)  # List[str]


num train samples to select: 200
at set_sample_raw_train_data
size: (200,)
sample labeled trained sentence
 


I've even seen pictures of an installation where the ham pulled a little 
trailer behind his car with a 4KW generator, and ran the full legal limit 
while mobile. I don't know what his gas mileage was like, though, or 
where he found resonators able to stand the gaff.

processed sentence
 ['even seen pictures installation ham pulled little trailer behind car kw generator ran full legal limit mobile know gas mileage like though found resonators able stand gaff']


In [4]:
# doc-to-vect based on train sample's count vectorizer
# The labeled sample is vectorized first; the unlabeled and test matrices reported below
# have the same number of columns, presumably because they reuse that vocabulary — TODO confirm.
processor.set_labeled_train_sample_count_data()
print('labeled_train_sample_count_data shape:', processor.labeled_train_sample_count_data.shape)
processor.set_unlabeled_count_data()
print('unlabeld train count_data shape:', processor.unlabeled_count_data.shape)
processor.set_test_count_data()



labeled train sample count data before zero doc removal shape (200, 5250)
Removing zero vocab docs from labeled train sample.
After zero count doc removal:
 Kept 198 samples from original 11314 train
labeled_train_sample_count_data shape: (198, 5250)
got data=unlabeled, shape= (500,)
unlabeled count data shape (500, 5250)
unlabeld train count_data shape: (500, 5250)
test count data shape (7532, 5250)


In [5]:
# CHECK TRAIN SET VOCAB
# Vocabulary size, followed by its first and last ten entries.
print(len(processor.vocab))
print(processor.vocab[:10])
print(processor.vocab[-10:])
# Collect the very short tokens (one or two characters) for a quick visual check.
short_vocab_text = [token for token in processor.vocab if len(token) < 3]
print(short_vocab_text)

5250
['abc', 'ability', 'able', 'abo', 'abort', 'abraham', 'absence', 'absolute', 'absolutely', 'absorb']
['zaven', 'zenit', 'zezel', 'zing', 'zip', 'zone', 'zoo', 'zooid', 'zoom', 'zx']
['ac', 'ae', 'ah', 'al', 'ao', 'ar', 'ba', 'bt', 'ca', 'cd', 'ce', 'cf', 'cg', 'cl', 'co', 'cs', 'cv', 'dc', 'de', 'dr', 'ds', 'du', 'dx', 'ea', 'ed', 'eg', 'eh', 'em', 'eq', 'er', 'et', 'fd', 'fg', 'fi', 'fm', 'gc', 'gd', 'gi', 'gm', 'go', 'gp', 'hd', 'hi', 'hp', 'hq', 'hr', 'ht', 'id', 'ie', 'ig', 'ii', 'io', 'iv', 'jn', 'jr', 'kb', 'kp', 'ku', 'kw', 'la', 'le', 'lh', 'li', 'lo', 'ls', 'mb', 'mc', 'md', 'mg', 'mi', 'mm', 'mn', 'mo', 'mr', 'ms', 'mt', 'mx', 'nd', 'ne', 'nj', 'nl', 'ns', 'nt', 'ny', 'oh', 'ok', 'op', 'os', 'ou', 'pa', 'pb', 'pc', 'pd', 'ph', 'pp', 'ps', 'pt', 'qc', 'rc', 'rd', 'rf', 'ri', 'rm', 'rs', 'rw', 'sa', 'se', 'sg', 'sj', 'sk', 'st', 'sx', 'tb', 'tc', 'td', 'th', 'tt', 'tv', 'un', 'us', 'va', 'vc', 'wb', 'xv', 'ya', 'ye', 'yr', 'zx']


In [6]:
# LABELED TRAIN COUNT DATA STATS
# get_train_doc_lengths() is called for its side effect only; the two attributes printed
# next are presumably populated by it — TODO confirm.
processor.get_train_doc_lengths()
print(processor.med_doc_len)
print(processor.max_doc_len)


45.0
1013


In [7]:

# Rescale each count matrix to the train set's uniform document length.
# All three splits use the same 'max' strategy.
scaled_labeled_train_sample_data = processor.make_uniform_doc_lens(
    word_count_data=processor.labeled_train_sample_count_data,
    strategy='max',
)
scaled_unlabeled_data = processor.make_uniform_doc_lens(
    word_count_data=processor.unlabeled_count_data,
    strategy='max',
)
scaled_test_data = processor.make_uniform_doc_lens(
    word_count_data=processor.test_count_data,
    strategy='max',
)



In [8]:
# Check the scaled matrices: shape, then each document's total count (sum over the vocab axis).
# NOTE(review): the labeled row sums printed below are close to, but not exactly, the max doc
# length printed earlier (1013) — confirm this is the intended result of strategy='max'.
print(scaled_labeled_train_sample_data.shape)
print(np.sum(scaled_labeled_train_sample_data, axis=processor.vocab_axis))
print(scaled_unlabeled_data.shape)
print(np.sum(scaled_unlabeled_data, axis=processor.vocab_axis))

(198, 5250)
[ 976.82142857  999.12328767 1012.00098619 1001.98913043  868.28571429
  966.95454545  998.31884058  962.35        959.68421053 1006.0137931
  993.51923077  982.3030303   962.35        979.23333333  984.05714286
  970.79166667  810.4         975.48148148 1000.49382716  997.171875
 1007.77835052  981.34375     985.62162162  990.48888889 1000.17721519
 1000.64634146  980.32258065  972.48        983.20588235 1000.01282051
  993.1372549   991.89583333  998.31884058  940.64285714  940.64285714
 1006.86060606 1005.91608392  986.34210526 1001.74444444  959.68421053
 1004.41525424 1010.39588689  995.83050847  928.58333333  996.66129032
 1004.83064516  997.171875    844.16666667  959.68421053  940.64285714
 1003.87387387  979.23333333  995.53448276  928.58333333  988.88095238
  940.64285714  987.675       992.32653061  964.76190476  996.11666667
 1001.61797753  868.28571429  935.07692308  994.91071429  953.41176471
  992.32653061  940.64285714 1000.79518072  996.39344262 1001.617977

In [9]:
# train
# Build the semi-supervised EM model from the length-scaled labeled and unlabeled count matrices.
model = EM_SSL(labeled_count_data=scaled_labeled_train_sample_data,
               label_vals=processor.train_sample_label_vals,
               unlabeled_count_data=scaled_unlabeled_data,
               max_em_iters=2,
               min_em_loss_delta=2e-4)

# The full fit is left disabled in this notebook; only the initialization step is run.
#model.fit()
model.initialize_EM()  # only runs M step (compute thetas) on labeled train samples


labeled train sample has 20 unique labels
Checking initail M step on only labeled train data...
Congrats, initial M step assertions passed.


In [10]:
# Number of distinct labels seen by the model, then the mean per-word count for class 0.
print('num classs in labeled train', len(model.label_set))
print(model.word_counts_per_class[0].mean())

num classs in labeled train 20
1.8632873729477415


In [11]:
# One total word count per class is expected, i.e. a 1-D array with one entry per label.
print(model.total_word_count_per_class.shape)

(20,)


In [12]:
# Problem to solve: when computing the labeled loss term
#   log( P(y_i = c_j | θ) · P(x_i | y_i = c_j; θ) ),
# the likelihood P(x_i | y_i = c_j; θ) underflows to 0, so its log is -inf and later arithmetic turns into nan.
# Cause: P(x_i | y_i = c_j; θ) = prod_t θ_{jt}^{w_t}, and each θ_{jt} (theta_j_vocab) is small,
# so θ_{jt}^{w_t} -> 0 for w_t > 0 and the product vanishes.
# Fix: work in log space, log P(x_i | y_i = c_j; θ) = sum_t w_t · log(θ_{jt}),
# which only needs every θ_{jt} > 0.
# The likelihood also improves with a smaller vocab size or a smaller uniform doc length.

# TODO: implement above log trick for theta_j_vocab

print('currently only labeled data:', model.only_labeled_data)
# Use the third labeled document (index 2) and its true label as the worked example.
single_doc = model.labeled_count_data[2]
this_true_label =  model.label_vals[2]
print('true label of single doc:', this_true_label)
# Denominators for single doc per class proba
print('Denominator terms\n')
print('total word count for class % d' % this_true_label, model.total_word_count_per_class[this_true_label])
# compute minimum proba: for words not appearing in class
# 1 / (vocab size + total word count of the class); the printed theta_j_vocab min below equals this
# value, which is consistent with add-one smoothing — TODO confirm against EM_SSL.
min_expected_proba = (1 / (model.vocab_size + model.total_word_count_per_class[this_true_label]))
print('min expected proba:', min_expected_proba)
print('log of min expected proba', np.log(min_expected_proba))
print('Numerator terms\n')
# Numerators for single doc per class proba
print('labeled max word counts for class %d:' % this_true_label, model.labeled_word_counts_per_class[this_true_label].max())
print('single doc word count min, avg:', single_doc.min(), single_doc.mean())
print('theta_j_vocab min for class %d:' % this_true_label, model.theta_j_vocab_per_class[this_true_label].min())


currently only labeled data: False
true label of single doc: 18
Denominator terms

total word count for class  18 8938.085593139323
min expected proba: 7.048167234651812e-05
log of min expected proba -9.56015784865931
Numerator terms

labeled max word counts for class 18: 180.89285714285717
single doc word count min, avg: 0.0 0.19276209260824648
theta_j_vocab min for class 18: 7.048167234651812e-05


# TESTS ON SINGLE TRAIN LABEL DOCUMENT

##  Check P(c_j | theta)

In [13]:
# Class priors P(c_j | theta): one value per class, expected to sum to 1 across classes.
theta_j = model.theta_j_per_class  # per class: (n_docs_in_class + 1) / (self.n_docs + self.n_labels)
print('theta j shape', theta_j.shape)
print('sum theta j == 1: ', np.isclose(np.sum(theta_j, axis=0), 1.0))

theta j shape (20,)
sum theta j == 1:  True


##  FIX: P(x_i | c_j, theta)

In [14]:
# FIX: P(x_i | c_j, theta) -> {theta}_{tj} =: theta_j_vocab -> non-nan computations
# Summary statistics of the per-class word log-probabilities.
X_log = np.log(model.theta_j_vocab_per_class)
print('shape of log_prob(w_t | cj):', X_log.shape)
print('log_prob(w_t | cj) min, mean, max')
print(X_log.min(), X_log.mean(), X_log.max())

# COMPUTE UNNORMALIZED LOG PROBAS
# Per class j: log(theta_j) + sum_t w_t * log(theta_jt), computed by hand here ...
doc_log_proba_per_class = np.log(model.theta_j_per_class) +  np.array([np.sum(single_doc * np.log(model.theta_j_vocab_per_class[j]), axis=0)
                                            for j in model.ordered_labels_list])
# ... and by the model's own method, so the two results can be compared in the prints below.
test_doc_log_proba_per_class = model.compute_unnormalized_class_log_probas_doc(single_doc)
print('doc unnorm LOG probas per class of single do avg, min:', doc_log_proba_per_class.mean(), doc_log_proba_per_class.min())
print('TEST doc unnorm LOG probas per class of single do avg, min:', test_doc_log_proba_per_class.mean(), test_doc_log_proba_per_class.min())
print('argmax unnorm LOG proba class', np.argmax(doc_log_proba_per_class))
print('TEST argmax unnorm LOG proba class', np.argmax(test_doc_log_proba_per_class))

# Exponentiating the unnormalized log probas directly is left disabled.
# fixed_doc_proba_per_class = np.exp(doc_log_proba_per_class)
# print('FIXED doc probas per class of single do avg, min:', fixed_doc_proba_per_class.mean(), fixed_doc_proba_per_class.min())



shape of log_prob(w_t | cj): (20, 5250)
log_prob(w_t | cj) min, mean, max
-9.85741110514188 -9.365035524016424 -4.0178290863837915
doc unnorm LOG probas per class of single do avg, min: -9282.708699945166 -9636.60654707081
TEST doc unnorm LOG probas per class of single do avg, min: -9282.708699945166 -9636.60654707081
argmax unnorm LOG proba class 18
TEST argmax unnorm LOG proba class 18


In [15]:
# Step-by-step normalization of the single document's class log probas via log-sum-exp.
print('shape of test doc log probas per class', test_doc_log_proba_per_class.shape)
#print('log sum', test_doc_log_proba_per_class.sum())
#def compute_sum_of_logs_from_log_prbas(log_probas):
#"""For single doc x, compute denom: log P(x|theta), without yet using P(c_j|theta)"""
# Work on a copy so the array returned by the model is left untouched.
log_probas = np.copy(test_doc_log_proba_per_class)
print('given unnormalized log probas prior to logSumExp', log_probas)
# log sum exp trick: see Murphy
# Subtracting the max before exponentiating makes the largest term exp(0) = 1.
max_log = np.max(log_probas)
summand = [np.exp(a - max_log) for a in log_probas]  # still have underflow...
print('summand', summand)
# Non-max terms may underflow to 0, but the sum is at least 1, so the log below stays finite.
log_of_sums = np.log(np.sum(summand)) + max_log  #+ min_expected_proba # log(denom)
print('log of sums', log_of_sums)
# Normalized log probas: subtract the log of the normalizing constant.
test_log_probas_normalized  = log_probas - log_of_sums
print('test log probas', test_log_probas_normalized)
print('dtype test log probas', test_log_probas_normalized.dtype)
test_probas_normalized = np.exp(test_log_probas_normalized)
print('test probas normalized', test_probas_normalized)
print('test probas norm idx 3', test_probas_normalized[3])
print('argmax: ', np.argmax(test_probas_normalized))


    
    

shape of test doc log probas per class (20,)
given unnormalized log probas prior to logSumExp [-9292.468323   -9471.71868692 -9329.26109874 -9333.25223776
 -9636.60654707 -9449.38210979 -9404.00320765 -9521.44397521
 -9527.51914768 -9437.64775702 -9189.48547816 -9220.05069891
 -9360.95182688 -9249.30744261 -9331.70213934 -9448.99069339
 -9153.420478   -9055.80120098 -7921.77217105 -9319.38877874]
summand [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
log of sums -7921.7721710469705
test log probas [-1370.69615195 -1549.94651587 -1407.48892769 -1411.48006671
 -1714.83437602 -1527.60993875 -1482.23103661 -1599.67180417
 -1605.74697664 -1515.87558597 -1267.71330711 -1298.27852786
 -1439.17965583 -1327.53527156 -1409.9299683  -1527.21852234
 -1231.64830695 -1134.02902994     0.         -1397.6166077 ]
dtype test log probas float64
test probas normalized [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
test probas norm idx 3

In [16]:
# Vectorized form of the shifted sum of exponentials; reuses `log_probas` and `max_log`
# defined in the previous cell.
X = np.sum(np.exp(log_probas - max_log))

print(X)

1.0


In [17]:
# formalize as methods

def compute_unnormalized_class_log_probas(doc):
    """Return the unnormalized per-class log probabilities of one document.

    Reads the notebook-level ``model``. For each label ``j`` in
    ``model.ordered_labels_list`` the result is
    ``log(theta_j) + sum_t doc[t] * log(theta_jt)``.

    Args:
        doc: word-count vector of a single document (assumed to have one
            entry per vocabulary word, matching a row of
            ``model.theta_j_vocab_per_class`` — TODO confirm).

    Returns:
        Array with one unnormalized log probability per class.
    """
    log_class_priors = np.log(model.theta_j_per_class)
    log_likelihoods = np.array([
        np.sum(doc * np.log(model.theta_j_vocab_per_class[label]), axis=0)
        for label in model.ordered_labels_list
    ])
    return log_class_priors + log_likelihoods


    
def compute_doc_class_probas(u_log_probas):
    """Normalize unnormalized class log probabilities into class probabilities.

    Uses the log-sum-exp trick: the maximum log value is subtracted before
    exponentiating, so the largest term is exp(0) = 1 and the normalizing
    sum cannot underflow to zero.

    The previous version read the notebook-global ``log_probas`` instead of
    the ``u_log_probas`` argument, so the result did not depend on the input
    (and raised NameError on a kernel where that global was not defined).

    Args:
        u_log_probas: 1-D array-like of unnormalized log probabilities, one
            per class.

    Returns:
        numpy array of the same length holding probabilities that sum to 1.
    """
    u_log_probas = np.asarray(u_log_probas, dtype=float)
    max_log = np.max(u_log_probas)
    # Shifted sum of exponentials; always >= 1 because the max term contributes exp(0).
    sum_of_exps = np.sum(np.exp(u_log_probas - max_log))
    log_of_sums = np.log(sum_of_exps) + max_log  # log of the normalizing constant
    log_probas_normalized = u_log_probas - log_of_sums
    return np.exp(log_probas_normalized)

# Sanity check on the same single document used in the earlier cells.
u_log_probas = compute_unnormalized_class_log_probas(single_doc)
print('u_log_probas', u_log_probas)






u_log_probas [-9292.468323   -9471.71868692 -9329.26109874 -9333.25223776
 -9636.60654707 -9449.38210979 -9404.00320765 -9521.44397521
 -9527.51914768 -9437.64775702 -9189.48547816 -9220.05069891
 -9360.95182688 -9249.30744261 -9331.70213934 -9448.99069339
 -9153.420478   -9055.80120098 -7921.77217105 -9319.38877874]


In [18]:
# TAKE CLOSER LOOK AT FIRST THETAS ON LABELED DATA

In [19]:
# Shape of the labeled count matrix held by the model.
model.labeled_count_data.shape

(198, 5250)

In [20]:
# Total of the model's unlabeled class probabilities; presumably 0 at this point because
# only initialize_EM() has been run — TODO confirm.
np.sum(model.unlabeled_this_class_probas)

0.0

In [21]:
# Number of labeled documents per class, in the model's label order.
# Note: `j` stays defined at notebook level after this loop finishes.
for j in model.ordered_labels_list:
    print(model.n_labeled_docs_per_class[j])

10.0
12.0
7.0
8.0
12.0
9.0
9.0
11.0
13.0
13.0
8.0
10.0
13.0
8.0
9.0
14.0
6.0
11.0
9.0
6.0


In [22]:
# Boolean mask stored on the model; presumably it flags the labeled documents of the
# last class processed during initialization — TODO confirm.
model.class_mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [23]:
# Number of True entries in the mask.
np.sum(model.class_mask)

6

In [24]:
# Document count per class as tracked by the model, iterating over its label set.
for j in model.label_set:
    print(model.n_docs_per_class[j])

10.0
12.0
7.0
8.0
12.0
9.0
9.0
11.0
13.0
13.0
8.0
10.0
13.0
8.0
9.0
14.0
6.0
11.0
9.0
6.0


In [25]:
# Reference value 1 / vocab_size: the mean of a theta_j_vocab row equals this when the row sums to 1.
print(1 / model.vocab_size)
print(model.theta_j_vocab_per_class.shape)

# One summary row per class: mean labeled word count, class prior, mean and max word probability.
print("word count avg, theta_j, theta_jt_avg, theta_jt_max")
for j in model.label_set:
    print("%0.2f, %0.2f, %0.2f, %0.2f" % 
          (model.labeled_word_counts_per_class[j].mean(), model.theta_j_per_class[j], 
           model.theta_j_vocab_per_class[j].mean(), model.theta_j_vocab_per_class[j].max()))
# TODO: theta_jt near zero => log(theta_jt) -> nan

0.00019047619047619048
(20, 5250)
word count avg, theta_j, theta_jt_avg, theta_jt_max
1.86, 0.05, 0.00, 0.01
2.24, 0.06, 0.00, 0.01
1.33, 0.04, 0.00, 0.01
1.49, 0.04, 0.00, 0.01
2.24, 0.06, 0.00, 0.02
1.68, 0.05, 0.00, 0.01
1.67, 0.05, 0.00, 0.01
2.00, 0.06, 0.00, 0.01
2.38, 0.06, 0.00, 0.01
2.40, 0.06, 0.00, 0.01
1.49, 0.04, 0.00, 0.02
1.88, 0.05, 0.00, 0.01
2.44, 0.06, 0.00, 0.01
1.52, 0.04, 0.00, 0.01
1.68, 0.05, 0.00, 0.01
2.64, 0.07, 0.00, 0.01
1.12, 0.03, 0.00, 0.01
2.07, 0.06, 0.00, 0.01
1.70, 0.05, 0.00, 0.01
1.10, 0.03, 0.00, 0.02


In [26]:
# NOTE(review): `j` is not defined in this cell; it is the loop variable left over from the
# `for j in model.label_set` loop in the previous cell, i.e. the last label iterated.
model.theta_j_per_class[j]

0.03211009174311927

In [27]:
# Word-count statistics for class `j` (the loop variable left over from an earlier cell).
# NOTE(review): .min() is printed twice (1st and 3rd values); one of them may have been
# meant to be a different statistic — confirm.
print(model.word_counts_per_class[j].min(), model.word_counts_per_class[j].mean(), 
      model.word_counts_per_class[j].min(), model.word_counts_per_class[j].max(), 
    model.word_counts_per_class[j].sum())

0.0 1.1038429994154453 0.0 197.72459893048128 5795.175746931088


In [28]:
# Total word count for class `j` (loop variable left over from an earlier cell).
model.total_word_count_per_class[j]

5795.175746931088

In [29]:
# Shape of the word-probability vector for class `j` (loop variable left over from an earlier cell).
model.theta_j_vocab_per_class[j].shape

(5250,)

In [30]:
# Mean and max word probability for class `j` (loop variable left over from an earlier cell).
model.theta_j_vocab_per_class[j].mean(), model.theta_j_vocab_per_class[j].max()

(0.0001904761904761905, 0.017991981611129828)

In [31]:
# don't want to compute np.log(model.theta_j_vocab_per_class) directly since theta_j_vocab is sparse => underflow
#np.log(model.theta_j_vocab_per_class)

# Model Evals

In [32]:
# Train fit: sanity check
# Evaluate on the same labeled sample the thetas were computed from.
# evaluate_on_data also prints a count before the returned fraction (presumably the number
# of correct predictions — TODO confirm).
pct_train_correct_preds = model.evaluate_on_data(count_data=scaled_labeled_train_sample_data,
                                            label_vals=processor.train_sample_label_vals)
print(pct_train_correct_preds)  # without a full EM loop, should be equal to 1.0

198
1.0


In [33]:
# Out-of-sample inference: evaluate on the scaled test set.
pct_test_correct_preds = model.evaluate_on_data(count_data=scaled_test_data,
                                            label_vals=processor.full_test_label_vals)
print(pct_test_correct_preds)

2506
0.33271375464684017


In [34]:
# Compare the dtypes of the predictions and the true test labels before comparing them elementwise.
print(model.preds.dtype)
print(processor.full_test_label_vals.dtype)

int64
int32


In [35]:
# Recompute the test accuracy by hand to cross-check the value reported by evaluate_on_data.
m = model.preds
p =  processor.full_test_label_vals
# Shapes must match for the elementwise comparison below to be meaningful.
print(m.shape == p.shape)
corr = m == p
# Fraction of positions where prediction and label agree.
print(np.sum(corr) / len(m))

True
0.33271375464684017
