# Preprocessor object
Can we develop a preprocessor object to carry all that information?

### Raw inputs
The raw input is a list of raw document strings (one string per newsgroup post, headers included), as shown below.

In [90]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pickle
import random
from scipy import sparse
import itertools
from scipy.io import savemat, loadmat
import re
import string

# Load both 20 Newsgroups splits through scikit-learn (train first, then test).
train_data, test_data = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Peek at the first two raw training posts.
train_data.data[:2]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

### Desired outputs

In [97]:
import torch

import sys, os
sys.path.append('..')
import data

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## get data
# 1. vocabulary
vocab, train, valid, test = data.get_data(os.path.join('../data/20ng'))
vocab_size = len(vocab)

# 2. tokens and counts for the train, dev and test sets
#    (the test dict additionally carries two extra pairs, suffixed _1 and _2)
train_tokens, train_counts = train['tokens'], train['counts']
valid_tokens, valid_counts = valid['tokens'], valid['counts']
test_tokens, test_counts = test['tokens'], test['counts']
test_1_tokens, test_1_counts = test['tokens_1'], test['counts_1']
test_2_tokens, test_2_counts = test['tokens_2'], test['counts_2']

# number of documents in each split
num_docs_train = len(train_tokens)
num_docs_valid = len(valid_tokens)
num_docs_test = len(test_tokens)
num_docs_test_1 = len(test_1_tokens)
num_docs_test_2 = len(test_2_tokens)

# Show a few vocabulary entries plus the token ids and counts of the first training document.
display(
    vocab[:5],
    train_tokens[:2][0][0],
    train_counts[:2][0][0],
)

# running accumulators (initialised here, not updated in this cell)
acc_loss, acc_kl_theta_loss, cnt = 0, 0, 0

batch_size = 1000

# Shuffle the training document indices and cut them into chunks of `batch_size`;
# only the first chunk is used below.
indices = torch.split(torch.randperm(num_docs_train), batch_size)
idx = 0
ind = indices[0]

# presumably a (batch, vocab_size) bag-of-words tensor — TODO confirm against data.get_batch
data_batch = data.get_batch(train_tokens, train_counts, ind, vocab_size, device)
# Per-row totals, kept as a column so the division below broadcasts row-wise.
sums = data_batch.sum(1).unsqueeze(1)
normalized_data_batch = data_batch / sums

['ii', 'plate', 'duke', 'greatly', 'holds']

array([  94,  100,  233,  327,  357,  504,  530,  597,  662,  720,  805,
        859,  889,  897,  898,  987,  996, 1024, 1098, 1177, 1178, 1532,
       1642, 1658, 1706, 1728, 1732, 1808, 1822, 1857, 1858, 1890, 1895,
       1957, 2013, 2059, 2091, 2118, 2174, 2236, 2386, 2478, 2522, 2539,
       2566, 2569, 2662, 2673, 2790, 2812, 2887, 2934, 3018, 3048, 3071],
      dtype=int32)

array([1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1,
       1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1])

### `preprocessor`

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle
import random
import numbers
from scipy import sparse
import itertools
from scipy.io import savemat, loadmat
import re
import string
from stops import stops

class Preprocessor(CountVectorizer):
    """CountVectorizer that cleans every document before counting tokens.

    Each raw document is first passed through :meth:`clean_doc`, which drops
    tokens containing punctuation or digits as well as one-letter tokens and
    lower-cases the rest.  The cleaned text is then handed to the regular
    ``CountVectorizer`` machinery (stop-word removal with ``stops`` and
    ``min_df`` / ``max_df`` pruning).

    Parameters
    ----------
    min_df : float or int, default=0.005
        Forwarded to ``CountVectorizer``.
    max_df : float or int, default=0.7
        Forwarded to ``CountVectorizer``.
    """

    def __init__(self, min_df=0.005, max_df=0.7):
        super().__init__(min_df=min_df, max_df=max_df, stop_words=stops)

    # ------------------------------------------------------------------
    # CLEANING METHODS
    # ------------------------------------------------------------------
    def clean_doc(self, doc):
        """Remove words with punctuation / numbers and one-letter words.

        Parameters
        ----------
        doc : str
            A single raw document.

        Returns
        -------
        str
            The surviving tokens, lower-cased and joined by single spaces.
        """
        # Split into word-like tokens (letters, digits, underscore, apostrophe)
        # and stand-alone punctuation characters.
        list_of_tokens = re.findall(r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]''', doc)

        def contains_punctuation(w):
            return any(char in string.punctuation for char in w)

        def contains_numeric(w):
            return any(char.isdigit() for char in w)

        # filter out words with punctuation, eg "where's", and lower-case the rest
        lowered = (token.lower() for token in list_of_tokens
                   if not contains_punctuation(token))
        # filter out words with numbers, eg "rac3", and one letter words
        # (both checks run on the lower-cased token)
        kept = [token for token in lowered
                if not contains_numeric(token) and len(token) > 1]

        return ' '.join(kept)

    def clean_corpus(self, corpus):
        """Clean an iterable of string documents.

        Parameters
        ----------
        corpus : iterable of str
            Raw documents.

        Returns
        -------
        list of str
            One cleaned document per input document, in the same order.

        Raises
        ------
        ValueError
            If ``corpus`` is a single string rather than an iterable of
            documents.
        """
        # Reject a bare string up front: iterating it would silently treat
        # every character as its own document.
        if isinstance(corpus, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")
        # clean_doc is a method, so it has to be reached through self.
        return [self.clean_doc(doc) for doc in corpus]

    # ------------------------------------------------------------------
    # FIT AND PREPROCESS
    # ------------------------------------------------------------------
    def fit_transform(self, raw_documents, y=None):
        """Clean the documents, learn the vocabulary and return the counts.

        Parameters
        ----------
        raw_documents : iterable of str
            Raw documents; each one is cleaned with :meth:`clean_doc` first.
        y : None
            Ignored; forwarded to ``CountVectorizer.fit_transform``.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Document-term matrix.

        Raises
        ------
        ValueError
            If ``raw_documents`` is a single string.
        """
        # Delegate to the parent instead of re-implementing its private
        # helpers, which change between scikit-learn releases.
        return super().fit_transform(self.clean_corpus(raw_documents), y)

    def _transform(self, raw_documents):
        """Transform already-cleaned documents to a document-term matrix.

        Parameters
        ----------
        raw_documents : iterable of str
            Documents that are counted as-is (no cleaning is applied here).

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Document-term matrix.
        """
        return super().transform(raw_documents)

    def transform(self, raw_documents, y=None):
        """Clean the documents and count tokens with the fitted vocabulary.

        Parameters
        ----------
        raw_documents : iterable of str
            Raw documents; each one is cleaned with :meth:`clean_doc` first.
        y : None
            Ignored.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Document-term matrix.

        Raises
        ------
        ValueError
            If ``raw_documents`` is a single string.
        """
        return self._transform(self.clean_corpus(raw_documents))
    

In [99]:
# # how to get vocab from the stuff
# # https://stackoverflow.com/questions/28894756/countvectorizer-does-not-print-vocabulary/44320484
# from sklearn.feature_extraction.text import CountVectorizer
# train_set = ("The sky is blue.", "The sun is bright.")
# test_set = ("The sun in the sky is bright.", 
#     "We can see the shining sun, the bright sun.")

# vectorizer = CountVectorizer(stop_words='english')
# document_term_matrix = vectorizer.fit_transform(train_set)
# print( vectorizer.get_feature_names() )

# print( vectorizer.transform(['blue blue blue']).toarray() )

# Fit the preprocessor on the raw training posts.
asd = Preprocessor()
cvz = asd.fit_transform(train_data.data)

# For the first and the last five feature names, print them and then the counts
# produced for a tiny document built from words in that range.
for vocab_slice, sample_doc in (
    (slice(None, 5), 'aaron aaron aaron ab'),
    (slice(-5, None), 'zuma zuma zuma'),
):
    print( asd.get_feature_names()[vocab_slice] )
    print( asd.transform([sample_doc]).toarray() )

['aaron', 'ab', 'ability', 'abortion', 'absolute']
[[3 1 0 ... 0 0 0]]
['zealand', 'zip', 'zone', 'zoo', 'zuma']
[[0 0 0 ... 0 0 3]]


Inspecting this, the right words came out, i.e. they are the same words as in the authors' output:  
```
array(['addition', 'body', 'brought', 'called', 'car', 'college', 'day',
       'door', 'doors', 'early', 'engine', 'front', 'history', 'host',
       'il', 'info', 'late', 'looked', 'made', 'mail', 'maryland',
       'model', 'nntp', 'park', 'posting', 'production', 'rest',
       'separate', 'small', 'specs', 'sports', 'thing', 'umd',
       'university', 'wondering', 'years'])
```

In [106]:
# Densify row 0 of the document-term matrix into a 1-D count vector.
row = cvz[0].toarray()[0]
# Mask of the features that occur at least once in that row.
present = row > 0
np.array(asd.get_feature_names())[present]

array(['addition', 'body', 'brought', 'called', 'car', 'college', 'day',
       'door', 'doors', 'early', 'engine', 'front', 'history', 'host',
       'il', 'info', 'late', 'looked', 'made', 'mail', 'maryland',
       'model', 'nntp', 'park', 'posting', 'production', 'rest',
       'separate', 'small', 'specs', 'sports', 'thing', 'umd',
       'university', 'wondering', 'years'], dtype='<U15')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pickle
import random
from scipy import sparse
import itertools
from scipy.io import savemat, loadmat
import re
import string


"""
1. Read data
"""

print('reading data...')
train_data = fetch_20newsgroups(subset='train') # from sklearn
test_data = fetch_20newsgroups(subset='test')



"""
2. Filter out words with numerics and punctuations
"""

# One pattern for both splits: word-like runs (including apostrophes) or a
# single punctuation character.
token_pattern = r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]'''
init_docs_tr = [re.findall(token_pattern, doc) for doc in train_data.data]
init_docs_ts = [re.findall(token_pattern, doc) for doc in test_data.data]

def contains_punctuation(w):
    """Return True if at least one character of `w` is in string.punctuation."""
    for char in w:
        if char in string.punctuation:
            return True
    return False

def contains_numeric(w):
    """Return True if at least one character of `w` is a digit (per str.isdigit)."""
    for char in w:
        if char.isdigit():
            return True
    return False
    
# Pool the train and test documents, then keep only the punctuation-free
# tokens of each document, lower-cased (so eg "where's" is dropped).
init_docs = [
    [w.lower() for w in tokens if not contains_punctuation(w)]
    for tokens in init_docs_tr + init_docs_ts
]
# filter out words with numbers, eg "rac3"
# init_docs = [[w for w in init_docs[doc] if not contains_numeric(w)] for doc in range(len(init_docs))]
# # remove one letter words
# init_docs = [[w for w in init_docs[doc] if len(w)>1] for doc in range(len(init_docs))]
# # Join the words back together into whole string documents
# init_docs = [" ".join(init_docs[doc]) for doc in range(len(init_docs))]

# Show the token list of the first pooled document.
init_docs[0]

reading data...


['From',
 ':',
 'lerxst',
 '@',
 'wam',
 '.',
 'umd',
 '.',
 'edu',
 '(',
 "where's",
 'my',
 'thing',
 ')',
 'Subject',
 ':',
 'WHAT',
 'car',
 'is',
 'this',
 '!',
 '?',
 'Nntp',
 'Posting',
 'Host',
 ':',
 'rac3',
 '.',
 'wam',
 '.',
 'umd',
 '.',
 'edu',
 'Organization',
 ':',
 'University',
 'of',
 'Maryland',
 ',',
 'College',
 'Park',
 'Lines',
 ':',
 '15',
 'I',
 'was',
 'wondering',
 'if',
 'anyone',
 'out',
 'there',
 'could',
 'enlighten',
 'me',
 'on',
 'this',
 'car',
 'I',
 'saw',
 'the',
 'other',
 'day',
 '.',
 'It',
 'was',
 'a',
 '2',
 'door',
 'sports',
 'car',
 ',',
 'looked',
 'to',
 'be',
 'from',
 'the',
 'late',
 '60s',
 '/',
 'early',
 '70s',
 '.',
 'It',
 'was',
 'called',
 'a',
 'Bricklin',
 '.',
 'The',
 'doors',
 'were',
 'really',
 'small',
 '.',
 'In',
 'addition',
 ',',
 'the',
 'front',
 'bumper',
 'was',
 'separate',
 'from',
 'the',
 'rest',
 'of',
 'the',
 'body',
 '.',
 'This',
 'is',
 'all',
 'I',
 'know',
 '.',
 'If',
 'anyone',
 'can',
 'tellme',