### Understanding the preprocessing steps
The authors preprocess the 20 Newsgroups dataset.   
Importantly, this is a bag-of-words model, so attention should be paid to how __stop words__ are handled and how each document in the corpus is __parsed into its tokens.__

In [1]:
import itertools
import os
import pickle
import random
import re
import string

import numpy as np
from scipy import sparse
from scipy.io import savemat, loadmat
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


##### 1. Reading and parsing the data into its tokens
The authors download the data.   
One document from the raw data looks like:
```
"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"
```

The authors then parse the documents into words using `re`. This step is not meant as the final tokenization; its purpose is to filter out the words they do not need, namely words containing punctuation or digits.

Notably, even though the train and test datasets are downloaded separately, they are merged into a single corpus of 18846 documents, of which 11214 end up in the training set, 7532 in the test set and 100 in the validation set.

In [10]:
"""
1. Read data
"""

print('reading data...')
# Load both official splits. The tokenization step that follows reads
# train_data.data and test_data.data, so these names must be defined here;
# with the calls commented out a fresh run fails with a NameError.
train_data = fetch_20newsgroups(subset='train') # from sklearn
test_data = fetch_20newsgroups(subset='test')



"""
2. Filter out words with numerics and punctuations
"""

# One pattern shared by both splits: a token is either a run of word
# characters / apostrophes, or a single character from the punctuation class.
# Note that `;-~` inside the class is a character range, not three literals.
token_pattern = re.compile(r'''[\w']+|[.,!?;-~{}`´_<=>:/@*()&'$%#"]''')
init_docs_tr = [token_pattern.findall(raw_doc) for raw_doc in train_data.data]
init_docs_ts = [token_pattern.findall(raw_doc) for raw_doc in test_data.data]

def contains_punctuation(w):
    """Return True if any character of `w` is in `string.punctuation`."""
    for char in w:
        if char in string.punctuation:
            return True
    return False

def contains_numeric(w):
    """Return True if any character of `w` is a digit (per `str.isdigit`)."""
    for char in w:
        if char.isdigit():
            return True
    return False
    
# Concatenate the splits: training documents first, test documents after.
init_docs = init_docs_tr + init_docs_ts
# Drop every token that contains punctuation (eg "where's") and lowercase the rest.
init_docs = [[w.lower() for w in doc if not contains_punctuation(w)] for doc in init_docs]
# Drop every token that contains a digit (eg "rac3").
init_docs = [[w for w in doc if not contains_numeric(w)] for doc in init_docs]
# Drop one-letter tokens.
init_docs = [[w for w in doc if len(w) > 1] for doc in init_docs]
# Re-assemble each document as a single space-separated string.
init_docs = [" ".join(doc) for doc in init_docs]

init_docs[0]

reading data...


'from lerxst wam umd edu my thing subject what car is this nntp posting host wam umd edu organization university of maryland college park lines was wondering if anyone out there could enlighten me on this car saw the other day it was door sports car looked to be from the late early it was called bricklin the doors were really small in addition the front bumper was separate from the rest of the body this is all know if anyone can tellme model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please mail thanks il brought to you by your neighborhood lerxst'

##### 2. Count vectorize the corpus and get the vocab

`type(cvz) == scipy.sparse.csr.csr_matrix`
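Notably, `sign()` is used, meaning the values of this sparse matrix are binary: they indicate whether a word is present in a document but ignore how many times it occurs. <font color=red> (why should the authors do this?) </font> Summing the binarized matrix over documents, as the code below does with `cvz.sum(axis=0)`, then gives each word's document frequency rather than its total count.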
```
>>> sparse.csr.csr_matrix([[1,2,3,0,-1]]).sign().toarray()
array([[ 1,  1,  1,  0, -1]], dtype=int64)
```

`cvz` is of shape (18846, 19148), which is (no of docs, no of vocab).  
Needless to say, this is after the 70% document count filtering.  

##### Document frequency & stop words 
The authors write: _"We preprocess the corpus by filtering stop words, words with document frequency above 70% and tokenizing."_  
`min_df` and `max_df` filter out words at the left-tail and right-tail of document frequency.   
Stop words are also listed and removed. 
```
['a',
 'able',
 'about',
 .
 .
 .
 'yourself', 
 'yourselves',
 'z',
 'zero',
 '']
```

`CountVectorizer`  
`max_df` : float in range [0.0, 1.0] or int, default=1.0
    When building the vocabulary ignore terms that have a document
    frequency strictly higher than the given threshold (corpus-specific
    stop words).
    If float, the parameter represents a proportion of documents, integer
    absolute counts.
    This parameter is ignored if vocabulary is not None.

`min_df` : float in range [0.0, 1.0] or int, default=1
    When building the vocabulary ignore terms that have a document
    frequency strictly lower than the given threshold. This value is also
    called cut-off in the literature.
    If float, the parameter represents a proportion of documents, integer
    absolute counts.
    This parameter is ignored if vocabulary is not None.

In [54]:
# Maximum / minimum document frequency
# Both thresholds are floats, so CountVectorizer treats them as proportions of
# documents: terms in more than 70% of documents, or in fewer than 0.5% of
# documents, are left out of the vocabulary.
max_df = 0.7
min_df = 0.005  # choose desired value for min_df

# Read stopwords
# The file is split on newlines (presumably one stop word per line); a trailing
# newline in the file produces a final empty-string entry in the list.
with open('../scripts/stops.txt', 'r') as f:
    stops = f.read().split('\n')


# Create count vectorizer
print('counting document frequency of words...')
cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=stops)
# .sign() binarizes the counts: each entry records presence of a term in a
# document rather than the number of occurrences.
cvz = cvectorizer.fit_transform(init_docs).sign()

# Get vocabulary
print('building the vocabulary...')
# Column sums of the binarized matrix: one entry per vocabulary term.
sum_counts = cvz.sum(axis=0)
v_size = sum_counts.shape[1]
# Copy the 1 x v_size result into a flat integer array of length v_size.
sum_counts_np = np.array(sum_counts, dtype=int).reshape(v_size)


counting document frequency of words...
building the vocabulary...


In [66]:
init_docs[0]

'from lerxst wam umd edu my thing subject what car is this nntp posting host wam umd edu organization university of maryland college park lines was wondering if anyone out there could enlighten me on this car saw the other day it was door sports car looked to be from the late early it was called bricklin the doors were really small in addition the front bumper was separate from the rest of the body this is all know if anyone can tellme model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please mail thanks il brought to you by your neighborhood lerxst'

In [73]:
(cvz[0].toarray()>0)[0]

array([False, False, False, ..., False, False, False])

In [74]:
# List the vocabulary terms present in document 0 by masking the feature names
# with the non-zero columns of that document's row.
# NOTE(review): newer scikit-learn releases replace `get_feature_names` with
# `get_feature_names_out` — confirm against the installed version.
np.array(cvectorizer.get_feature_names())[(cvz[0].toarray()>0)[0]]

array(['addition', 'body', 'brought', 'called', 'car', 'college', 'day',
       'door', 'doors', 'early', 'engine', 'front', 'history', 'host',
       'il', 'info', 'late', 'looked', 'made', 'mail', 'maryland',
       'model', 'nntp', 'park', 'posting', 'production', 'rest',
       'separate', 'small', 'specs', 'sports', 'thing', 'umd',
       'university', 'wondering', 'years'], dtype='<U14')

The point of this code snippet is to get `word2id` and `id2word` in an order from infrequent to common words.

In [20]:
# Snapshot the fitted vectorizer's term <-> column mapping in both directions.
word2id = {w: idx for w, idx in cvectorizer.vocabulary_.items()}
id2word = {idx: w for w, idx in cvectorizer.vocabulary_.items()}
# del cvectorizer
print('  initial vocabulary size: {}'.format(v_size))

# Order the vocabulary by ascending count, i.e. from infrequent to common words.
idx_sort = np.argsort(sum_counts_np)
vocab = [id2word[col] for col in idx_sort]

# Rebuild both lookups so that ids follow the sorted vocabulary order.
word2id = {w: j for j, w in enumerate(vocab)}
id2word = dict(enumerate(vocab))

  initial vocabulary size: 3275


In [26]:
sum_counts_np

array([118, 197, 349, ..., 146, 135, 133])

In [21]:
idx_sort

array([ 275, 2702,  920, ..., 2198,  169, 3250])

##### 4. Train Val Test split
Put simply, the authors split the corpus into train, validation and test sets. One additional thing they do is restrict the vocabulary to words that occur in the training set.

In [28]:
"""
SPLIT DOCS INTO TRAIN-TEST-VALID
"""
# Split in train/test/valid
# init_docs holds the original training documents first and the test documents
# after them, so positions below num_docs_tr are training documents and test
# documents start at offset num_docs_tr.
print('tokenizing documents and splitting into train/test/valid...')
num_docs_tr = len(init_docs_tr)
trSize = num_docs_tr-100
tsSize = len(init_docs_ts)
vaSize = 100
# Shuffle the training positions: the first trSize entries become the training
# set and the remaining vaSize entries become the validation set.
# NOTE(review): the permutation is not seeded in this cell, so the split
# differs from run to run unless a seed is set elsewhere.
idx_permute = np.random.permutation(num_docs_tr).astype(int)

# Remove words not in train_data
# Only words already present in word2id are kept. Going through set() makes the
# resulting order arbitrary, so the infrequent-to-common ordering of the
# previous vocabulary is not preserved here.
vocab = list(set([w for idx_d in range(trSize) for w in init_docs[idx_permute[idx_d]].split() if w in word2id]))
word2id = dict([(w, j) for j, w in enumerate(vocab)])
id2word = dict([(j, w) for j, w in enumerate(vocab)])
print('  vocabulary after removing words not in train: {}'.format(len(vocab)))

# Split in train/test/valid
# Each document becomes a list of word ids; out-of-vocabulary words are dropped.
docs_tr = [[word2id[w] for w in init_docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)]
docs_va = [[word2id[w] for w in init_docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(vaSize)]
docs_ts = [[word2id[w] for w in init_docs[idx_d+num_docs_tr].split() if w in word2id] for idx_d in range(tsSize)]

print('  number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize))
print('  number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize))
print('  number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize))



"""
REMOVE EMPTY OR SINGLE WORD DOCUMENTS
"""
# Remove empty documents
print('removing empty documents...')

def remove_empty(in_docs):
    """Return the documents of `in_docs` that are not the empty list, in order."""
    kept = []
    for doc in in_docs:
        if doc != []:
            kept.append(doc)
    return kept

# Drop documents that ended up with no in-vocabulary tokens, in every split.
docs_tr, docs_ts, docs_va = (remove_empty(split) for split in (docs_tr, docs_ts, docs_va))

# Keep only test documents with at least two tokens: a one-token document would
# leave its first half empty when the test set is later split in two.
docs_ts = [doc for doc in docs_ts if len(doc) >= 2]

tokenizing documents and splitting into train/test/valid...
  vocabulary after removing words not in train: 3275
  number of documents (train): 11214 [this should be equal to 11214]
  number of documents (test): 7532 [this should be equal to 7532]
  number of documents (valid): 100 [this should be equal to 100]
removing empty documents...


The index (into `init_docs`) of the first training document is 2551

In [34]:
len(docs_tr[2])

30

The authors split the test set into 2 halves. 

In [36]:
# Split test set in 2 halves
print('splitting test documents in 2 halves...')
# The first half takes the leading len(doc)//2 tokens and the second half takes
# the rest, so the second half is one token longer when the length is odd.
docs_ts_h1 = [doc[:len(doc) // 2] for doc in docs_ts]
docs_ts_h2 = [doc[len(doc) // 2:] for doc in docs_ts]

splitting test documents in 2 halves...


##### 5. Get bag of word matrices
In order to get the bag of word matrices, the authors go through quite a number of steps. 

In [37]:
# Getting lists of words and doc_indices
print('creating lists of words...')

def create_list_words(in_docs):
    """Concatenate the token lists of all documents into one flat list."""
    flat = []
    for doc in in_docs:
        flat.extend(doc)
    return flat

# Flatten every split into one long list of word ids. These lists are later
# passed to create_bow as the column indices of the sparse matrices.
words_tr = create_list_words(docs_tr)
words_ts = create_list_words(docs_ts)
words_ts_h1 = create_list_words(docs_ts_h1)
words_ts_h2 = create_list_words(docs_ts_h2)
words_va = create_list_words(docs_va)

# Total number of tokens per split.
print('  len(words_tr): ', len(words_tr))
print('  len(words_ts): ', len(words_ts))
print('  len(words_ts_h1): ', len(words_ts_h1))
print('  len(words_ts_h2): ', len(words_ts_h2))
print('  len(words_va): ', len(words_va))

creating lists of words...
  len(words_tr):  934666
  len(words_ts):  606887
  len(words_ts_h1):  301564
  len(words_ts_h2):  305323
  len(words_va):  12953


In [38]:
# Get doc indices
print('getting doc indices...')

def create_doc_indices(in_docs):
    """Return, for every token of every document, the position of its document.

    The result is a flat list in which document number j (its position in
    `in_docs`) is repeated len(in_docs[j]) times, so it lines up element by
    element with the flattened token list of the same documents.
    """
    doc_indices = []
    for position, doc in enumerate(in_docs):
        doc_indices.extend([position] * len(doc))
    return doc_indices

# One document index per token, aligned with the flattened word lists; these
# become the row indices of the sparse matrices.
doc_indices_tr = create_doc_indices(docs_tr)
doc_indices_ts = create_doc_indices(docs_ts)
doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
doc_indices_va = create_doc_indices(docs_va)

# Sanity check: a document with zero tokens contributes no entries to its
# index list, so the number of distinct indices matches the number of
# documents only when no document in the split is empty.
print('  len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr)))
print('  len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts)))
print('  len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1)))
print('  len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2)))
print('  len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va)))

# Number of documents in each set
# These are used as the row counts of the bag-of-words matrices.
n_docs_tr = len(docs_tr)
n_docs_ts = len(docs_ts)
n_docs_ts_h1 = len(docs_ts_h1)
n_docs_ts_h2 = len(docs_ts_h2)
n_docs_va = len(docs_va)

# # Remove unused variables
# del docs_tr
# del docs_ts
# del docs_ts_h1
# del docs_ts_h2
# del docs_va


getting doc indices...
  len(np.unique(doc_indices_tr)): 11214 [this should be 11214]
  len(np.unique(doc_indices_ts)): 7531 [this should be 7531]
  len(np.unique(doc_indices_ts_h1)): 7531 [this should be 7531]
  len(np.unique(doc_indices_ts_h2)): 7531 [this should be 7531]
  len(np.unique(doc_indices_va)): 100 [this should be 100]


In [39]:
# Create bow representation
print('creating bow representation...')

def create_bow(doc_indices, words, n_docs, vocab_size):
    """Build a CSR bag-of-words matrix of shape (n_docs, vocab_size).

    `doc_indices` and `words` are parallel sequences giving the row (document)
    and column (word id) of each token. Every token contributes a 1, and
    repeated (row, column) pairs add up when the matrix is converted to CSR,
    so an entry holds how often that word occurs in that document.
    """
    ones = [1] * len(doc_indices)
    token_entries = sparse.coo_matrix((ones, (doc_indices, words)), shape=(n_docs, vocab_size))
    return token_entries.tocsr()

# One matrix per split, each with len(vocab) columns and one row per document
# in that split. The two test halves are built from docs_ts in the same
# document order, so their rows correspond.
bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))
bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1, len(vocab))
bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2, len(vocab))
bow_va = create_bow(doc_indices_va, words_va, n_docs_va, len(vocab))

# del words_tr
# del words_ts
# del words_ts_h1
# del words_ts_h2
# del words_va
# del doc_indices_tr
# del doc_indices_ts
# del doc_indices_ts_h1
# del doc_indices_ts_h2
# del doc_indices_va

creating bow representation...


The `bow_tr` is a sparse matrix of shape (number of documents x vocabulary)

In [42]:
bow_tr

<11214x3275 sparse matrix of type '<class 'numpy.int64'>'
	with 614491 stored elements in Compressed Sparse Row format>

In [58]:
# NOTE(review): `fit_transform` re-fits the vectorizer on this one-document
# corpus. With a single document, every term it contains has a document
# frequency of 100%, which is above max_df=0.7, so all terms are pruned and
# the ValueError recorded below is raised. To vectorize one document with the
# already-fitted vocabulary, `cvectorizer.transform(...)` is presumably what
# was intended — confirm.
cvectorizer.fit_transform([init_docs[2551]])

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [63]:
# Show the entries of the first training row that are greater than 1, i.e. the
# counts of the words that occur more than once in that document.
bow_tr.getrow(0).toarray()[(bow_tr.getrow(0).toarray()>1)]

array([2, 2, 4, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 4, 2])

In [14]:
len(vocab)

19097

coo_matrix:  
https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html

##### 6. Save the bag of words
The authors take each `bow` matrix and save it as two separate files: one for the tokens and one for the counts.

In [None]:
# Write the vocabulary to a file
# The output directory is named after the min_df value used for this run.
path_save = './min_df_' + str(min_df) + '/'
# Create the directory (and any missing parents) without spawning a shell;
# exist_ok=True makes re-running the cell a no-op when it already exists.
os.makedirs(path_save, exist_ok=True)

with open(path_save + 'vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
# The vocabulary is deleted once it has been written to disk.
del vocab

# Split bow into token/value pairs
print('splitting bow intro token/value pairs and saving to disk...')

def split_bow(bow_in, n_docs):
    """Unpack the first `n_docs` rows of a sparse matrix into ids and counts.

    Returns two parallel lists with one inner list per row: the column indices
    stored for that row, and the values stored at those columns.
    """
    indices = []
    counts = []
    for doc in range(n_docs):
        row = bow_in[doc, :]
        indices.append(list(row.indices))
        counts.append(list(row.data))
    return indices, counts

# For each split: unpack the matrix into per-document token ids and counts,
# write both to compressed files under path_save, then delete the matrix and
# the temporaries (presumably to free memory before the next split).
# NOTE(review): documents hold different numbers of distinct tokens, so these
# are ragged nested lists — confirm the installed NumPy/SciPy versions accept
# ragged input in savemat.

# Training set
bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr)
savemat(path_save + 'bow_tr_tokens', {'tokens': bow_tr_tokens}, do_compression=True)
savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True)
del bow_tr
del bow_tr_tokens
del bow_tr_counts

# Full test set
bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts)
savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True)
del bow_ts
del bow_ts_tokens
del bow_ts_counts

# First half of each test document
bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1)
savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True)
del bow_ts_h1
del bow_ts_h1_tokens
del bow_ts_h1_counts

# Second half of each test document
bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2)
savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True)
del bow_ts_h2
del bow_ts_h2_tokens
del bow_ts_h2_counts

# Validation set
bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va)
savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True)
savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True)
del bow_va
del bow_va_tokens
del bow_va_counts

print('Data ready !!')
print('*************')