In [1]:
from nltk.corpus import stopwords
import numpy as np
from nltk.corpus import words as nltk_english_words

from lib_utils.preprocessing import TextPreProcessor
from lib_utils.expectation_maximization import EM_SSL



# Text-preprocessing configuration shared by the cells that follow.
# Passed to TextPreProcessor; the recorded run logs "Removing zero vocab docs"
# when this is True.
remove_zero_vocab_docs = True
# Optional English-word set handed to TextPreProcessor as `english_vocab`.
# The notebook runs with it disabled (None). Previously the NLTK word list was
# loaded into a set and immediately overwritten with None, which paid for the
# corpus load without using it. Flip the flag to enable the word set.
use_english_vocab_filter = False
english_vocab = set(nltk_english_words.words()) if use_english_vocab_filter else None
# Tokens removed during all text preprocessing: NLTK's English stopword list.
_tokens_to_remove = stopwords.words('english')

In [2]:
# Fix the static preprocessed data.
# The original article suggests 10k fixed unlabeled samples.
processor = TextPreProcessor(
    n_unlabeled_train_samples=10000,
    tokens_to_remove=_tokens_to_remove,
    remove_zero_vocab_docs=remove_zero_vocab_docs,
    english_vocab=english_vocab,
)

# Initialize the raw data: full train set, unlabeled pool, then test set.
processor.set_static_full_train_raw_data()
processor.set_static_raw_unlabeled_data()
processor.set_static_raw_test_data()

full train raw data shape: (11314,)
min, max full train data label vals = 0, 19
unlabeled train data shape: (10000,)
num avail train indices complement to unlabeled: 1314


In [3]:
# Draw the labeled training sample, then preview text preprocessing on it.
processor.set_sample_raw_train_data()
# First raw document of the labeled sample, printed before and after processing.
sample_sent = processor.labeled_train_data_sample[0]
print('sample labeled trained sentence\n', sample_sent)
# NOTE(review): process_documents_text receives a single element here but a
# two-element slice further down — confirm it accepts both input forms.
processed_sent = processor.process_documents_text(sample_sent)
print('processed sentence\n', processed_sent)

# Same preprocessing applied to the first two documents of the sample.
doc_array_sample = processor.labeled_train_data_sample[:2]
processed_docs = processor.process_documents_text(doc_array_sample)  # List[str] per the original note; not read again in the visible cells


num train samples to select: 20
at set_sample_raw_train_data
size: (20,)
sample labeled trained sentence
 
I've got 7 episodes left on *Beta* for Sale at US$8 each (neg.), or
for Trade 1-for-1 for movie on Beta or a used CD; or, a package deal
for $50 or whatever you care to propose in trade -- e.g., all for a
set of good stereo headphones (e.g. Sony V6 or V7), an Apple IWII
sheet feeder, a good used FM/Cassette stereo "walkman" or a hotel
coupon(s) for free stays FOB New York City (guests coming!)).  The
remaining collection is as follows:

         8 - Charlie X
        11 - Dagger of the Mind
        12 - Miri
        17 - Shore Leave
        20 - The Alternative Factor
        29 - Operation-Annihilate!
        33 - Who Mourns for Adonais?

Numbers indicate episode numbering on the tape boxes, for those who
are keeping track of what episodes they're missing in that manner.

RSVP for summaries, if necessary.

The tapes are all in excellent condition in the original packaging.
All ha

In [4]:
# doc-to-vect based on train sample's count vectorizer
# The labeled sample is vectorized first, then the unlabeled pool and the test
# set; results are read back from attributes on `processor`.
# NOTE(review): the recorded shapes all have 1015 columns, so the unlabeled and
# test matrices presumably reuse the labeled sample's vocabulary — confirm.
processor.set_labeled_train_sample_count_data()
print('labeled_train_sample_count_data shape:', processor.labeled_train_sample_count_data.shape)
processor.set_unlabeled_count_data()
print('unlabeld train count_data shape:', processor.unlabeled_count_data.shape)
processor.set_test_count_data()



labeled train sample count data before zero doc removal shape (20, 1015)
Removing zero vocab docs from labeled train sample.
After zero count doc removal:
 Kept 20 samples from original 11314 train
labeled_train_sample_count_data shape: (20, 1015)
got data=unlabeled, shape= (10000,)
unlabeled count data shape (10000, 1015)
unlabeld train count_data shape: (10000, 1015)
test count data shape (7532, 1015)


In [5]:
# CHECK TRAIN SET VOCAB
# Print the vocabulary size, its first and last ten entries, and every entry of
# at most two characters (a quick look for noise tokens).
train_vocab = processor.vocab
print(len(train_vocab))
print(train_vocab[:10])
print(train_vocab[-10:])
short_vocab_text = [token for token in train_vocab if len(token) <= 2]
print(short_vocab_text)

1015
['able', 'abstract', 'acceptable', 'accident', 'action', 'active', 'actual', 'adam', 'address', 'administering']
['world', 'worth', 'would', 'wrist', 'years', 'yearwood', 'yes', 'yo', 'york', 'zone']
['al', 'cd', 'cm', 'dl', 'ea', 'ed', 'et', 'fm', 'go', 'hm', 'hp', 'kg', 'mr', 'nd', 'pc', 'pp', 'ps', 'rd', 'sf', 'st', 'th', 'us', 'wc', 'yo']


In [6]:
# LABELED TRAIN COUNT DATA STATS
# The return value of get_train_doc_lengths() is discarded; the statistics are
# read from `processor` afterwards.
# NOTE(review): presumably this call is what populates med_doc_len / max_doc_len — confirm.
processor.get_train_doc_lengths()
print(processor.med_doc_len)
print(processor.max_doc_len)


30.5
604


In [7]:

# Scale the count data of every split to the train set's uniform doc length.
# NOTE(review): the recorded run shows two warnings pointing at the division
# inside make_uniform_doc_lens — possibly documents whose counts sum to zero;
# verify the scaled unlabeled/test matrices contain no NaN/inf.
def _scale_to_uniform_len(count_data):
    """Apply ``processor.make_uniform_doc_lens`` to ``count_data`` with ``strategy='max'``."""
    return processor.make_uniform_doc_lens(word_count_data=count_data, strategy='max')


scaled_labeled_train_sample_data = _scale_to_uniform_len(processor.labeled_train_sample_count_data)
scaled_unlabeled_data = _scale_to_uniform_len(processor.unlabeled_count_data)
scaled_test_data = _scale_to_uniform_len(processor.test_count_data)



  scaled_word_count_data = (static_doc_len / reshaped_sums) * word_count_data
  scaled_word_count_data = (static_doc_len / reshaped_sums) * word_count_data


In [8]:
# Sanity check: print each scaled matrix's shape and its sums along the
# vocabulary axis (the recorded run shows 604 for the displayed documents).
for scaled_data in (scaled_labeled_train_sample_data, scaled_unlabeled_data):
    print(scaled_data.shape)
    print(np.sum(scaled_data, axis=processor.vocab_axis))

(20, 1015)
[604. 604. 604. 604. 604. 604. 604. 604. 604. 604. 604. 604. 604. 604.
 604. 604. 604. 604. 604. 604.]
(10000, 1015)
[604. 604. 604. ... 604. 604. 604.]


In [9]:
# train
# Build the semi-supervised EM model from the scaled labeled sample, its
# labels, and the scaled unlabeled pool.
# max_em_iters / min_em_loss_delta are passed through to EM_SSL.
# NOTE(review): presumably an iteration cap and a loss-delta stopping
# threshold — confirm in EM_SSL.
model = EM_SSL(labeled_count_data=scaled_labeled_train_sample_data,
               label_vals=processor.train_sample_label_vals,
               unlabeled_count_data=scaled_unlabeled_data,
               max_em_iters=2,
               min_em_loss_delta=2e-4)

# The full fit is left disabled here; only the initialization step is run so
# the initial parameters can be inspected in the following cells.
# model.fit()
model.initialize_EM()  # only runs M step (compute thetas) on labeled train samples


labeled train sample has 12 unique labels
Checking initail M step on only labeled train data...
Congrats, initial M step assertions passed.


In [10]:
# Take a closer look at the initial thetas estimated from the labeled data only

In [11]:
# Shape of the labeled count matrix held by the model (recorded: (20, 1015)).
model.labeled_count_data.shape

(20, 1015)

In [12]:
# Total of the model's unlabeled-data class probabilities; the recorded value
# is 0.0 after initialize_EM(), which only uses the labeled samples.
np.sum(model.unlabeled_this_class_probas)

0.0

In [15]:
# Print the labeled-document count for every label in model.ordered_labels_list.
# The loop variable `j` stays defined after this cell finishes.
for j in model.ordered_labels_list:
    print(model.n_labeled_docs_per_class[j])

0.0
0.0
1.0
0.0
1.0
1.0
1.0
3.0
0.0
3.0
2.0
3.0
0.0
0.0
2.0
0.0
1.0
1.0
0.0
1.0


In [16]:
# Display the model's boolean class mask.
# NOTE(review): its meaning is defined inside EM_SSL — confirm what the single
# True entry in the recorded output selects.
model.class_mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False])

In [17]:
# Number of True entries in the class mask (recorded: 1).
np.sum(model.class_mask)

1

In [18]:
# Print the per-class document count for each label in model.label_set
# (the recorded run shows 12 labels). The loop variable `j` stays defined
# after this cell finishes.
for j in model.label_set:
    print(model.n_docs_per_class[j])

1.0
1.0
1.0
1.0
3.0
3.0
2.0
3.0
2.0
1.0
1.0
1.0


In [33]:
# Reference value: 1 / vocab_size, followed by the shape of the per-class
# word-probability table.
print(1 / model.vocab_size)
print(model.theta_jt_per_class.shape)

# One summary row per label in model.label_set: mean labeled word count,
# class prior theta_j, and the mean and max of that class's theta_jt values.
print("word count avg, theta_j, theta_jt_avg, theta_jt_max")
row_format = "%0.2f, %0.2f, %0.2f, %0.2f"
for j in model.label_set:
    class_word_probs = model.theta_jt_per_class[j]
    row_values = (
        model.labeled_word_counts_per_class[j].mean(),
        model.theta_j_per_class[j],
        class_word_probs.mean(),
        class_word_probs.max(),
    )
    print(row_format % row_values)
# TODO: theta_jt near zero => log(theta_jt) -> nan
# NOTE(review): log(0) itself is -inf; a nan would come from a later operation
# such as 0 * -inf — confirm where it arises.

0.0009852216748768472
(20, 1015)
word count avg, theta_j, theta_jt_avg, theta_jt_max
0.60, 0.05, 0.00, 0.02
0.60, 0.05, 0.00, 0.03
0.60, 0.05, 0.00, 0.04
0.60, 0.05, 0.00, 0.02
1.79, 0.10, 0.00, 0.04
1.79, 0.10, 0.00, 0.02
1.19, 0.07, 0.00, 0.01
1.79, 0.10, 0.00, 0.01
1.19, 0.07, 0.00, 0.03
0.60, 0.05, 0.00, 0.03
0.60, 0.05, 0.00, 0.04
0.60, 0.05, 0.00, 0.09


In [28]:
# Inspect the class prior (theta_j) of a single labeled class.
# The label is selected explicitly instead of reusing a loop variable `j` left
# over from another cell, so this cell no longer depends on which cell ran
# last. The last label in iteration order is the value `j` holds after a loop
# over model.label_set, so a top-to-bottom run shows the same class as before.
inspected_label = list(model.label_set)[-1]
model.theta_j_per_class[inspected_label]

0.05

In [None]:
# Out-of-sample inference: evaluate the model on the scaled test set and print
# the value returned by evaluate_on_data.
pct_test_correct_preds = model.evaluate_on_data(
    count_data=scaled_test_data,
    label_vals=processor.full_test_label_vals,
)
print(pct_test_correct_preds)

In [None]:
# Compare the dtypes of the predictions and the test labels before they are
# checked for equality.
# NOTE(review): model.preds is presumably populated by evaluate_on_data — confirm.
print(model.preds.dtype)
print(processor.full_test_label_vals.dtype)

In [None]:
# Manual accuracy check: compare predictions with the test labels directly.
m = model.preds
p =  processor.full_test_label_vals
# Shapes are expected to match.
# NOTE(review): if this prints False, the comparison below is not a
# position-by-position match and the count is not meaningful.
print(m.shape == p.shape)
corr = m == p
# Number of positions where the prediction equals the label.
print(np.sum(corr))