In [1]:
from nltk.corpus import stopwords
import numpy as np
from nltk.corpus import words as nltk_english_words

from lib_utils.preprocessing import TextPreProcessor
from lib_utils.expectation_maximization import EM_SSL



# Shared configuration for all text preprocessing in this notebook.
remove_zero_vocab_docs = True
# Vocabulary passed to TextPreProcessor. This notebook runs with None.
# (Previously the NLTK English word list was loaded into a set and then
# immediately overwritten with None, so the corpus load was pure dead work.)
# To pass the NLTK English word list instead, use:
#     english_vocab = set(nltk_english_words.words())
english_vocab = None
# Tokens stripped from every document: the NLTK English stop-word list.
_tokens_to_remove = stopwords.words('english')

In [2]:
# Build the preprocessor with fixed labeled / unlabeled sample sizes.
# Note: the original article suggests 10k fixed unlabeled samples.
processor = TextPreProcessor(
    n_labeled_train_samples=200,
    n_unlabeled_train_samples=500,
    tokens_to_remove=_tokens_to_remove,
    remove_zero_vocab_docs=remove_zero_vocab_docs,
    english_vocab=english_vocab,
)

# Initialize the static raw splits: full train, unlabeled subset, then test.
processor.set_static_full_train_raw_data()
processor.set_static_raw_unlabeled_data()
processor.set_static_raw_test_data()

full train raw data shape: (11314,)
min, max full train data label vals = 0, 19
unlabeled train data shape: (500,)
num avail train indices complement to unlabeled: 10814


In [3]:
# Draw the labeled train sample, then eyeball the text cleaning on one document.
processor.set_sample_raw_train_data()
sample_sent = processor.labeled_train_data_sample[0]
print('sample labeled trained sentence\n', sample_sent)
# Passing a single string yields a one-element list holding the cleaned text
# (see the printed output below).
processed_sent = processor.process_documents_text(sample_sent)
print('processed sentence\n', processed_sent)

# Same call on a slice of two documents; the result is not used by later cells
# visible in this notebook.
doc_array_sample = processor.labeled_train_data_sample[:2]
processed_docs = processor.process_documents_text(doc_array_sample)  # List[str]


num train samples to select: 200
at set_sample_raw_train_data
size: (200,)
sample labeled trained sentence
 Stolen from Pasadena between 4:30 and 6:30 pm on 4/15.

Blue and white Honda CBR900RR california plate KG CBR.   Serial number
JH2SC281XPM100187, engine number 2101240.

No turn signals or mirrors, lights taped over for track riders session
at Willow Springs tomorrow.  Guess I'll miss it.  :-(((

Help me find my baby!!!
processed sentence
 ['stolen pasadena pm blue white honda cbr rr california plate kg cbr serial number jh sc xpm engine number turn signals mirrors lights taped track riders session willow springs tomorrow guess miss help find baby']


In [4]:
# doc-to-vect based on train sample's count vectorizer
# The printed shapes show all three matrices share the same vocabulary width,
# and that zero-vocab documents are dropped from the labeled sample.
processor.set_labeled_train_sample_count_data()
print('labeled_train_sample_count_data shape:', processor.labeled_train_sample_count_data.shape)
processor.set_unlabeled_count_data()
print('unlabeld train count_data shape:', processor.unlabeled_count_data.shape)
processor.set_test_count_data()



labeled train sample count data before zero doc removal shape (200, 5632)
Removing zero vocab docs from labeled train sample.
After zero count doc removal:
 Kept 194 samples from original 11314 train
labeled_train_sample_count_data shape: (194, 5632)
got data=unlabeled, shape= (500,)
unlabeled count data shape (500, 5632)
unlabeld train count_data shape: (500, 5632)
test count data shape (7532, 5632)


In [5]:
# CHECK TRAIN SET VOCAB
print(len(processor.vocab))
print(processor.vocab[:10])
print(processor.vocab[-10:])
short_vocab_text = [v for v in processor.vocab if len(v) <=2]
print(short_vocab_text)

5632
['ab', 'abandon', 'abandond', 'abbie', 'abbreviation', 'abd', 'abide', 'abilities', 'ability', 'able']
['zeit', 'zero', 'zeus', 'zip', 'zipguns', 'zlin', 'zone', 'zones', 'zupcic', 'zx']
['ab', 'ac', 'ad', 'ah', 'al', 'au', 'av', 'ba', 'bb', 'bd', 'bg', 'bi', 'bj', 'br', 'bw', 'ca', 'cd', 'cg', 'co', 'cs', 'ct', 'cy', 'db', 'dc', 'de', 'dl', 'dr', 'dx', 'ed', 'ee', 'em', 'er', 'es', 'et', 'ex', 'fd', 'fg', 'fi', 'fm', 'gb', 'gl', 'gm', 'go', 'gs', 'ha', 'hb', 'hd', 'hi', 'hl', 'hp', 'hr', 'ht', 'hv', 'hz', 'ie', 'ig', 'ii', 'ip', 'jd', 'jh', 'kb', 'kg', 'kw', 'la', 'lb', 'lc', 'ld', 'li', 'mb', 'md', 'mi', 'mm', 'mo', 'mr', 'mt', 'mx', 'nb', 'nd', 'nj', 'nl', 'nm', 'nr', 'ns', 'ob', 'oh', 'ok', 'om', 'os', 'ot', 'oz', 'pc', 'ph', 'pm', 'po', 'pp', 'ps', 'pt', 'rj', 'rm', 'rn', 'rr', 'rt', 'rw', 'sb', 'sc', 'se', 'sp', 'st', 'su', 'tg', 'th', 'tt', 'tv', 'tx', 'uk', 'us', 'uw', 'vs', 'wb', 'wd', 'wf', 'wp', 'wu', 'ww', 'xt', 'ya', 'ye', 'yr', 'zx']


In [6]:
# LABELED TRAIN COUNT DATA STATS
processor.get_train_doc_lengths()
print(processor.med_doc_len)
print(processor.max_doc_len)


49.0
1157


In [7]:

# Rescale each count matrix to a uniform document length; the same 'max'
# strategy is applied to the labeled, unlabeled and test splits.
scaled_labeled_train_sample_data = processor.make_uniform_doc_lens(
    word_count_data=processor.labeled_train_sample_count_data,
    strategy='max',
)
scaled_unlabeled_data = processor.make_uniform_doc_lens(
    word_count_data=processor.unlabeled_count_data,
    strategy='max',
)
scaled_test_data = processor.make_uniform_doc_lens(
    word_count_data=processor.test_count_data,
    strategy='max',
)



In [8]:
print(scaled_labeled_train_sample_data.shape)
print(np.sum(scaled_labeled_train_sample_data, axis=processor.vocab_axis))
print(scaled_unlabeled_data.shape)
print(np.sum(scaled_unlabeled_data, axis=processor.vocab_axis))

(194, 5632)
[1124.86111111 1128.075      1153.66570605 1079.86666667 1108.79166667
 1079.86666667 1138.33870968 1131.84782609 1101.9047619  1128.7804878
 1135.16981132 1145.07216495 1084.6875     1028.44444444 1139.46969697
 1106.69565217 1138.03278689 1146.93913043 1124.86111111 1146.08490566
 1051.81818182 1112.5        1088.94117647 1146.18691589 1128.7804878
 1149.63057325 1150.11309524 1147.59349593 1121.93939394 1137.38983051
 1088.94117647 1133.3877551  1153.83879781 1108.79166667 1114.14814815
 1155.39305556 1099.15       1051.81818182 1150.42613636 1144.82105263
 1104.40909091 1106.69565217  964.16666667 1135.16981132 1150.77956989
 1146.66964286 1149.72327044 1147.59349593 1120.84375    1139.2
 1120.84375    1143.54651163 1136.70175439 1108.79166667 1139.2
 1132.89583333 1115.67857143 1096.10526316 1140.93055556 1068.
 1153.63662791  867.75       1139.2        1123.94285714 1151.06666667
 1104.40909091 1135.16981132 1151.99134199 1132.38297872 1121.93939394
 1134.75       113

# Initialize EM (only first M step)

In [9]:
# train
model = EM_SSL(labeled_count_data=scaled_labeled_train_sample_data,
               label_vals=processor.train_sample_label_vals,
               unlabeled_count_data=scaled_unlabeled_data,
               max_em_iters=2,
               min_em_loss_delta=2e-4)

model.initialize_EM()  # only runs M step (compute thetas) on labeled train samples


labeled train sample has 20 unique labels
Checking initail M step on only labeled train data...
Congrats, initial M step assertions passed.


In [10]:
print('unlabeled data class probas min: %0.2f, avg: %0.2f, max %0.2f' %
     ( model.unlabeled_data_class_probas.min(), 
      model.unlabeled_data_class_probas.mean(), 
      model.unlabeled_data_class_probas.max()))

unlabeled data class probas min: 0.00, avg: 0.00, max 0.00


In [11]:
print('num classs in labeled train', len(model.label_set))
print(model.word_counts_per_class[0].mean())

num classs in labeled train 20
1.5841880856960797


In [12]:
print(model.total_word_count_per_class.shape)

(20,)


In [13]:
# Problem to solve: the labeled loss needs log( P(y_i = c_j | θ) * P(x_i | y_i = c_j; θ) ).
# Computed naively, P(x_i | y_i = c_j; θ) underflows to 0, so its log is -inf
# (and turns into nan once combined with other terms).
# The product underflows because P(x_i | y_i = c_j; θ) = prod_t θ_{j,t}^{w_t} multiplies
# many small factors: θ_{j,t} is small, so θ_{j,t}^{w_t} -> 0 for w_t > 0.
# Log-space form:  log P(x_i | y_i = c_j; θ) = sum_t ( w_t * log(θ_{j,t}) )
# which only requires θ_{j,t} > 0. The likelihood is larger with a smaller vocab size
# or a smaller uniform doc length.

# TODO: implement above log trick for theta_j_vocab
# NOTE(review): later cells call model.compute_unnormalized_class_log_probas_doc, which
# appears to implement this already — confirm and drop the TODO if so.

print('currently only labeled data:', model.only_labeled_data)
# Document at index 2 of the labeled data is the running example for the cells below.
single_doc = model.labeled_count_data[2]
this_true_label =  model.label_vals[2]
print('true label of single doc:', this_true_label)
# Denominators for single doc per class proba
print('Denominator terms\n')
print('total word count for class % d: ' % this_true_label, model.total_word_count_per_class[this_true_label])
# Smallest probability expected for a word that never appears in the class:
# 1 / (vocab_size + total word count of the class). The printed
# theta_j_vocab minimum below equals this value.
min_expected_proba = (1 / (model.vocab_size + model.total_word_count_per_class[this_true_label]))
print('min expected proba:', min_expected_proba)
print('log of min expected proba', np.log(min_expected_proba))
print('Numerator terms\n')
# Numerators for single doc per class proba
print('labeled max word counts for class %d:' % this_true_label, model.labeled_word_counts_per_class[this_true_label].max())
print('single doc word count min, avg:', single_doc.min(), single_doc.mean())
print('theta_j_vocab min for class %d:' % this_true_label, model.theta_j_vocab_per_class[this_true_label].min())


currently only labeled data: False
true label of single doc: 19
Denominator terms

total word count for class  19:  6586.426147113544
min expected proba: 8.184360145567832e-05
log of min expected proba -9.410700431232465
Numerator terms

labeled max word counts for class 19: 267.0
single doc word count min, avg: 0.0 0.20484121201205135
theta_j_vocab min for class 19: 8.184360145567832e-05


## TESTS ON SINGLE TRAIN LABEL DOCUMENT

###  Check P(c_j | theta)

In [14]:
theta_j = model.theta_j_per_class  # per class: (n_docs_in_class + 1) / (self.n_docs + self.n_labels)
print('theta j shape', theta_j.shape)
print('sum theta j == 1: ', np.isclose(np.sum(theta_j, axis=0), 1.0))

theta j shape (20,)
sum theta j == 1:  True


###  FIX: P(x_i | c_j, theta): debug + logSumExp trick

In [15]:
# FIX: work with log(theta_j_vocab) (log P(w_t | c_j)) so the per-document
# likelihood is a finite sum of logs instead of an underflowing product.
X_log = np.log(model.theta_j_vocab_per_class)
print('shape of log_prob(w_t | cj):', X_log.shape)
print('log_prob(w_t | cj) min, mean, max')
print(X_log.min(), X_log.mean(), X_log.max())

# COMPUTE UNNORMALIZED LOG PROBAS
# Manual version: log P(c_j) + sum_t w_t * log P(w_t | c_j), one entry per class.
doc_log_proba_per_class = np.log(model.theta_j_per_class) +  np.array([np.sum(single_doc * np.log(model.theta_j_vocab_per_class[j]), axis=0)
                                            for j in model.ordered_labels_list])
# Model's own implementation, printed next to the manual values for comparison.
test_doc_log_proba_per_class = model.compute_unnormalized_class_log_probas_doc(single_doc)
print('doc unnorm LOG probas per class of single do avg, min:', doc_log_proba_per_class.mean(), doc_log_proba_per_class.min())
print('TEST doc unnorm LOG probas per class of single do avg, min:', test_doc_log_proba_per_class.mean(), test_doc_log_proba_per_class.min())
print('argmax unnorm LOG proba class', np.argmax(doc_log_proba_per_class))
print('TEST argmax unnorm LOG proba class', np.argmax(test_doc_log_proba_per_class))

# Exponentiating the unnormalized log probas directly is left disabled.
# fixed_doc_proba_per_class = np.exp(doc_log_proba_per_class)
# print('FIXED doc probas per class of single do avg, min:', fixed_doc_proba_per_class.mean(), fixed_doc_proba_per_class.min())



shape of log_prob(w_t | cj): (20, 5632)
log_prob(w_t | cj) min, mean, max
-9.97693863157124 -9.456397612265821 -3.7976641575335726
doc unnorm LOG probas per class of single do avg, min: -10173.024557322897 -10848.653344136104
TEST doc unnorm LOG probas per class of single do avg, min: -10173.024557322897 -10848.653344136104
argmax unnorm LOG proba class 19
TEST argmax unnorm LOG proba class 19


### Start LogSumExp trick

In [16]:
# Step-by-step log-sum-exp normalization of one document's per-class log probas.
print('shape of test doc log probas per class', test_doc_log_proba_per_class.shape)
#print('log sum', test_doc_log_proba_per_class.sum())
#def compute_sum_of_logs_from_log_prbas(log_probas):
#"""For single doc x, compute denom: log P(x|theta), without yet using P(c_j|theta)"""
# Work on a copy so the model's array is left untouched.
log_probas = np.copy(test_doc_log_proba_per_class)
print('given unnormalized log probas prior to logSumExp', log_probas)
# log sum exp trick: see Murphy
max_log = np.max(log_probas)
# Subtracting the max makes the largest term exp(0) = 1; the other terms can still
# underflow to 0.0 (as the printed summand shows), but the sum stays >= 1 so its log is finite.
summand = [np.exp(a - max_log) for a in log_probas]  # still have underflow...
print('summand', summand)
# log of the normalizing constant: log(sum_j exp(a_j)) = max + log(sum_j exp(a_j - max))
log_of_sums = np.log(np.sum(summand)) + max_log  #+ min_expected_proba # log(denom)
print('log of sums', log_of_sums)
test_log_probas_normalized  = log_probas - log_of_sums
print('test log probas', test_log_probas_normalized)
print('dtype test log probas', test_log_probas_normalized.dtype)
# Back to probability space; entries far below the max become exactly 0.
test_probas_normalized = np.exp(test_log_probas_normalized)
print('test probas normalized', test_probas_normalized)
print('test probas norm idx 3', test_probas_normalized[3])
print('argmax: ', np.argmax(test_probas_normalized))


    
    

shape of test doc log probas per class (20,)
given unnormalized log probas prior to logSumExp [ -9841.49258555 -10543.12898415 -10465.86560467 -10848.65334414
 -10442.89939051 -10659.34738913 -10241.41139516 -10254.84416874
 -10346.79989939 -10537.52837456 -10373.32594466 -10002.10649368
 -10286.09976215 -10079.15406636 -10131.42183657 -10085.95208996
 -10013.85346456 -10045.00224081 -10040.62872427  -8220.97538746]
summand [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
log of sums -8220.975387463148
test log probas [-1620.51719809 -2322.15359668 -2244.8902172  -2627.67795667
 -2221.92400305 -2438.37200167 -2020.43600769 -2033.86878128
 -2125.82451192 -2316.5529871  -2152.3505572  -1781.13110622
 -2065.12437469 -1858.17867889 -1910.4464491  -1864.97670249
 -1792.87807709 -1824.02685334 -1819.6533368      0.        ]
dtype test log probas float64
test probas normalized [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
tes

In [17]:
X = np.sum(np.exp(log_probas - max_log))

print(X)

1.0


In [18]:
# formalize as methods

def compute_unnormalized_class_log_probas(doc):
    """Return the unnormalized per-class log probabilities of one document.

    For each label j in ``model.ordered_labels_list`` the value is
    ``log(theta_j) + sum_t doc[t] * log(theta_j_vocab[j][t])``.
    Reads the notebook-level ``model``; ``doc`` is a word-count vector.
    """
    log_class_priors = np.log(model.theta_j_per_class)
    doc_log_likelihoods = []
    for label in model.ordered_labels_list:
        log_word_probas = np.log(model.theta_j_vocab_per_class[label])
        doc_log_likelihoods.append(np.sum(doc * log_word_probas, axis=0))
    return log_class_priors + np.array(doc_log_likelihoods)


    
def compute_doc_class_probas(u_log_probas):
    """Turn unnormalized per-class log probabilities into class probabilities.

    Uses the log-sum-exp trick so that very negative log probabilities do not
    make the normalizing constant underflow to zero.

    Parameters
    ----------
    u_log_probas : array-like of float
        Unnormalized log probabilities for one document, one entry per class.

    Returns
    -------
    numpy.ndarray
        Normalized class probabilities (they sum to 1).
    """
    # Fix: operate on the argument. The previous body read the notebook-level
    # `log_probas` variable, so it ignored the input after computing the max.
    u_log_probas = np.asarray(u_log_probas, dtype=float)
    # Shift by the max so the largest term is exp(0) = 1 and the sum is >= 1.
    max_log = np.max(u_log_probas)
    summand = np.sum(np.exp(u_log_probas - max_log))
    log_of_sums = np.log(summand) + max_log  # log of the normalizing constant
    log_probas_normalized = u_log_probas - log_of_sums
    class_probas_normalized = np.exp(log_probas_normalized)
    return class_probas_normalized

# Smoke-test the unnormalized helper on the running example document; the printed
# values can be compared with the log probas printed for the same document earlier.
# Note: compute_doc_class_probas is defined above but not called in this cell.
u_log_probas = compute_unnormalized_class_log_probas(single_doc)
print('u_log_probas', u_log_probas)






u_log_probas [ -9841.49258555 -10543.12898415 -10465.86560467 -10848.65334414
 -10442.89939051 -10659.34738913 -10241.41139516 -10254.84416874
 -10346.79989939 -10537.52837456 -10373.32594466 -10002.10649368
 -10286.09976215 -10079.15406636 -10131.42183657 -10085.95208996
 -10013.85346456 -10045.00224081 -10040.62872427  -8220.97538746]


In [19]:
# TAKE CLOSER LOOK AT FIRST THETAS ON LABELED DATA

In [20]:
model.labeled_count_data.shape

(194, 5632)

In [21]:
np.sum(model.unlabeled_this_class_probas)

0.0

In [22]:
for j in model.ordered_labels_list:
    print(model.n_labeled_docs_per_class[j])

8.0
8.0
12.0
12.0
10.0
12.0
10.0
10.0
14.0
12.0
13.0
5.0
7.0
8.0
8.0
11.0
14.0
8.0
6.0
6.0


In [23]:
model.class_mask

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [24]:
np.sum(model.class_mask)

6

In [25]:
for j in model.label_set:
    print(model.n_docs_per_class[j])

8.0
8.0
12.0
12.0
10.0
12.0
10.0
10.0
14.0
12.0
13.0
5.0
7.0
8.0
8.0
11.0
14.0
8.0
6.0
6.0


In [26]:
# Uniform word probability 1 / vocab_size, printed as a reference value.
print(1 / model.vocab_size)
print(model.theta_j_vocab_per_class.shape)

# One row per class. NOTE: '%0.2f' is too coarse for theta_jt, whose mean prints as 0.00.
print("word count avg, theta_j, theta_jt_avg, theta_jt_max")
for j in model.label_set:
    print("%0.2f, %0.2f, %0.2f, %0.2f" % 
          (model.labeled_word_counts_per_class[j].mean(), model.theta_j_per_class[j], 
           model.theta_j_vocab_per_class[j].mean(), model.theta_j_vocab_per_class[j].max()))
# TODO: theta_jt near zero => log(theta_jt) -> -inf (nan only once combined with other terms)

0.0001775568181818182
(20, 5632)
word count avg, theta_j, theta_jt_avg, theta_jt_max
1.58, 0.04, 0.00, 0.01
1.59, 0.04, 0.00, 0.01
2.40, 0.06, 0.00, 0.02
2.36, 0.06, 0.00, 0.01
1.99, 0.05, 0.00, 0.02
2.38, 0.06, 0.00, 0.01
1.99, 0.05, 0.00, 0.01
2.01, 0.05, 0.00, 0.01
2.82, 0.07, 0.00, 0.01
2.30, 0.06, 0.00, 0.02
2.60, 0.07, 0.00, 0.01
0.99, 0.03, 0.00, 0.02
1.40, 0.04, 0.00, 0.01
1.56, 0.04, 0.00, 0.02
1.61, 0.04, 0.00, 0.01
2.22, 0.06, 0.00, 0.01
2.79, 0.07, 0.00, 0.01
1.62, 0.04, 0.00, 0.01
1.22, 0.03, 0.00, 0.01
1.17, 0.03, 0.00, 0.02


In [27]:
# NOTE: `j` is not defined in this cell. It is the loop variable left over from the
# `for j in model.label_set` loop in the previous cell, i.e. the last label iterated.
# Running this cell on a fresh kernel without that loop raises NameError.
model.theta_j_per_class[j]

0.03271028037383177

In [28]:
# Summary statistics of the per-word counts for class `j`
# (`j` is the loop variable left over from an earlier `for j in ...` cell).
# Printed order: min, mean, min, max, sum.
# NOTE(review): `.min()` is printed twice — possibly a different statistic was intended.
print(model.word_counts_per_class[j].min(), model.word_counts_per_class[j].mean(), 
      model.word_counts_per_class[j].min(), model.word_counts_per_class[j].max(), 
    model.word_counts_per_class[j].sum())

0.0 1.1694648698710128 0.0 267.0 6586.426147113544


In [29]:
model.total_word_count_per_class[j]

6586.426147113544

In [30]:
model.theta_j_vocab_per_class[j].shape

(5632,)

In [31]:
model.theta_j_vocab_per_class[j].mean(), model.theta_j_vocab_per_class[j].max()

(0.0001775568181818182, 0.02193408519012179)

In [32]:
# don't want to compute np.log(model.theta_j_vocab_per_class) directly since theta_j_vocab is sparse => underflow
#np.log(model.theta_j_vocab_per_class)

# Model Evals

In [33]:
# Train fit: sanity check
pct_train_correct_preds = model.evaluate_on_data(count_data=scaled_labeled_train_sample_data,
                                            label_vals=processor.train_sample_label_vals)
print(pct_train_correct_preds)  # without a full EM loop, should be equal to 1.0

194
1.0


In [34]:
# out-of-sample inference: test
pct_test_correct_preds = model.evaluate_on_data(count_data=scaled_test_data,
                                            label_vals=processor.full_test_label_vals)
print(pct_test_correct_preds)

2499
0.33178438661710036


In [35]:
print(model.preds.dtype)
print(processor.full_test_label_vals.dtype)

int64
int32


In [36]:
# Recompute test accuracy by hand to cross-check model.evaluate_on_data:
# compare predictions with the true test labels element-wise.
m = model.preds
p =  processor.full_test_label_vals
# Shapes must match for the element-wise comparison below to be meaningful.
print(m.shape == p.shape)
corr = m == p
# Fraction of correct predictions.
print(np.sum(corr) / len(m))

True
0.33178438661710036


# RUN EM Loop

##  E-step

In [37]:
model.E_step()

In [38]:
theta_j = model.theta_j_per_class  # per class: (n_docs_in_class + 1) / (self.n_docs + self.n_labels)
print('theta j shape', theta_j.shape)
print('sum theta j == 1: ', np.isclose(np.sum(theta_j, axis=0), 1.0))

theta j shape (20,)
sum theta j == 1:  True


In [39]:
print('unlabeled data class probas min: %0.2f, avg: %0.2f, max %0.2f' %
     ( model.unlabeled_data_class_probas.min(), 
      model.unlabeled_data_class_probas.mean(), 
      model.unlabeled_data_class_probas.max()))

unlabeled data class probas min: 0.00, avg: 0.05, max 1.00


In [40]:
print('total word count for class % d: ' % this_true_label, model.total_word_count_per_class[this_true_label])


total word count for class  19:  6586.426147113544


In [41]:
u_joint_factors = np.apply_along_axis(func1d=model.compute_unnormalized_class_log_probas_doc,
                                            axis=model.vocab_axis,
                                            arr=model.unlabeled_count_data)
print(u_joint_factors.shape)

(500, 20)


In [42]:
u_log_of_sums = np.apply_along_axis(func1d=model.compute_log_of_sums,
                                    axis=1,
                                    arr=u_joint_factors)
print(u_log_of_sums.shape)

(500,)


In [43]:
print(np.sum(u_log_of_sums))

-4302732.242912833


## M-step

In [44]:
model.M_step()

In [45]:
print('total word count for class % d: ' % this_true_label, model.total_word_count_per_class[this_true_label])


total word count for class  19:  18695.778969314593


In [46]:
print('total loss = %0.2f' % model.compute_total_loss())

total loss = 5930965.02


## E-step

In [47]:
model.E_step()

In [48]:
print(np.log(model.theta_j_per_class))

[-3.20974294 -3.70210723 -2.49900142 -3.1478135  -2.53547739 -2.94604664
 -3.11878968 -2.97951963 -2.76022969 -2.99731838 -2.91453634 -3.33964579
 -3.2970942  -3.65581391 -3.10110156 -2.53432959 -2.55377146 -3.03140861
 -3.18741854 -3.66071884]


## M-step

In [49]:
# Second M-step, then inspect the class word count and the loss components.
model.M_step()
print('total word count for class % d: ' % this_true_label, model.total_word_count_per_class[this_true_label])
labeled_loss = model.compute_labeled_loss()
unlabeled_loss = model.compute_unlabeled_loss()
print(labeled_loss, unlabeled_loss, labeled_loss + unlabeled_loss)
# NOTE(review): in the recorded output, labeled + unlabeled is negative while
# compute_total_loss() is positive with a different magnitude, so the total is not
# simply their sum — confirm the sign convention and any extra terms in EM_SSL.
print('total loss = %0.2f' % model.compute_total_loss())

total word count for class  19:  18695.780761045135
-1428176.3915571177 -3362907.908171191 -4791084.299728309
total loss = 5930169.61
