# Performing PCA on book data
We will use TFIDF with english stopwords to process the book data, then collapse the matrices using TruncatedSVD; the model would then be physically saved for later conversion task

In [1]:
from nltk import word_tokenize, sent_tokenize

In [3]:
! ls /workspace/khoai23/opennmt/data/monolingual/Gutenberg/author_compiled
#! head /workspace/khoai23/opennmt/data/monolingual/Gutenberg/author_compiled/Abraham_Lincoln.txt

Abraham_Lincoln.txt		     John_Ruskin.txt
Agatha_Christie.txt		     John_Stuart_Mill.txt
Albert_Einstein.txt		     Jonathan_Swift.txt
Aldous_Huxley.txt		     Joseph_Conrad.txt
Alexander_Pope.txt		     Leigh_Hunt.txt
Alfred_Russel_Wallace.txt	     Lewis_Carroll.txt
Ambrose_Bierce.txt		     Lord_Byron.txt
Andrew_Lang.txt			     Lord_Tennyson.txt
Anthony_Trollope.txt		     Louisa_May_Alcott.txt
Arnold_Joseph_Toynbee.txt	     Lucy_Maud_Montgomery.txt
Baronness_Orczy.txt		     Lyman_Frank_Baum.txt
Beatrix_Potter.txt		     Mark_Twain.txt
Benjamin_Disraeli.txt		     Mary_Shelley.txt
Benjamin_Franklin.txt		     Mary_Stewart_Daggett.txt
Bertrand_Russell.txt		     Michael_Faraday.txt
Bram_Stoker.txt			     Nathaniel_Hawthorne.txt
Bret_Harte.txt			     O_Henry.txt
Charles_Darwin.txt		     Oscar_Wilde.txt
Charles_Dickens.txt		     P_B_Shelley.txt
Charles_Kingsley.txt		     Percival_Lowell.txt
Charlotte_Bronte.txt		     P_G_Wodehouse.txt
Charlotte_Mary_Yonge.txt	     Philip_Kin

In [3]:
import io, os
data_location = "/workspace/khoai23/opennmt/data/monolingual/Gutenberg/author_compiled"
all_sentences = []
for root, dirs, files in os.walk(data_location):
    for f in files:
        true_file_path = os.path.join(root, f)
        with io.open(true_file_path, "r", encoding="utf-8") as infile:
            for paragraph in infile.readlines():
                all_sentences.extend(sent_tokenize(paragraph))
print("Sentences read: ", len(all_sentences))

Sentences read:  11401766


## Convert data using default library
We will use `TfidfVectorizer` and `TruncatedSVD` to convert data to sparse and downsize the data to a lower dimension respectively. Some arguments:
- For TfidfVectorizer: `stop_word="english", min_df=10, max_df=0.98, ngram_range=(1, 3)`
- ~~For TruncatedSVD: `n_components=vector_size, algorithm='arpack', n_iter=20, random_state=100`. The algorithm 'arpack' is crucial in stopping the `MemoryError` from being thrown.~~ TruncatedSVD had caused `MemoryError` in both format, currently unusable
- For IncrementalPCA: `n_components=vector_size, batch_size=128`

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf
vectorizer = TfidfVectorizer(stop_words="english", min_df=10, max_df=0.98, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(all_sentences)

In [None]:
# svd. Currently not working!
from sklearn.decomposition import TruncatedSVD
vector_size = 512
svd_model = TruncatedSVD(n_components=vector_size, algorithm="arpack", n_iter=20, random_state=100)
reduced_matrix = svd_model.fit_transform(tfidf_matrix)

In [5]:
# An alternative: IncrementalPCA
from sklearn.decomposition import IncrementalPCA
vector_size = 512
pca_model = IncrementalPCA(n_components=vector_size, batch_size=128)
reduced_matrix = pca_model.fit_transform(tfidf_matrix)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

Now we can either export the data in a pickle/json object. Note that the values in json might be very large due to its readable nature, compare to pickle's binary serialization.

In [None]:
import pickle
pickle_path = "/workspace/khoai23/opennmt/data/monolingual/Gutenberg/svd_vector.pickle"
with io.open(pickle_path, "wb") as picklefile:
    pickle.dump({"sentences": all_sentences, "vector": reduced_matrix}, picklefile)

In [None]:
import json
json_path = "/workspace/khoai23/opennmt/data/monolingual/Gutenberg/svd_vector.json"
with io.open(json_path, "w") as jsonfile:
    json.dump({"sentences": all_sentences, "vector": reduced_matrix}, jsonfile)

## Alternative version
As we intend to load the data into a keras model anyway, it might be better that we save the data with an vocabulary and convert the sentences into their indexed format instead. This would possibly decrease the physical storage size at the expense of unknown words.

In [None]:
from collections import Counter
unk_threshold = 10

counter = Counter()
word_ver_sentences = []
for sent in all_sentences:
    words = sent.strip().split()
    counter.update(words)
    word_ver_sentences.append(words)
valid_words = {k for k, v in counter.items() if v >= unk_threshold}
idx_dict = {k: i for i, k in enumerate(["unk"] + list(valid_words))}
idx_ver_sentences = [[idx_dict.get(w, 0) for w in words] for words in word_ver_sentences]

In [None]:
import pickle
pickle_path = "/workspace/khoai23/opennmt/data/monolingual/Gutenberg/svd_with_vocab.pickle"
with io.open(pickle_path, "wb") as picklefile:
    pickle.dump({"idx_sents": idx_ver_sentences, 
                 "vector": reduced_matrix, 
                 "vocab"=idx_dict}, picklefile)

In [None]:
import json
json_path = "/workspace/khoai23/opennmt/data/monolingual/Gutenberg/svd_with_vocab.json"
with io.open(json_path, "w") as jsonfile:
    json.dump({"idx_sents": idx_ver_sentences, 
                 "vector": reduced_matrix, 
                 "vocab"=idx_dict}, jsonfile)