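This notebook accompanies the Stack Overflow question [MiniBatchSparsePCA on Text Data](https://stackoverflow.com/questions/48034724/minibatchsparsepca-on-text-data#comment83167351_48034724): it fits `MiniBatchSparsePCA` to TF-IDF features of the 20 Newsgroups corpus and lists the tokens that carry nonzero weight in each sparse component.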

In [1]:
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the predefined train and test splits of the 20 Newsgroups corpus,
# in that order.
dataset_train, dataset_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

In [3]:
# Target labels of each split, taken from the 'target' entry of the datasets.
y_train, y_test = dataset_train['target'], dataset_test['target']

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over purely alphabetic tokens ('[a-zA-Z]+'), with English
# stop words removed and terms seen in fewer than 5 documents discarded.
vectorizer = TfidfVectorizer(
    stop_words='english',
    token_pattern='[a-zA-Z]+',
    min_df=5,
)

# The vocabulary is learned on the training split only; the test split is
# projected onto that same vocabulary.
X_train = vectorizer.fit_transform(dataset_train['data'])
X_test = vectorizer.transform(dataset_test['data'])

print('dictionary has ', len(vectorizer.vocabulary_), 'entries')

# Dense copy of the training matrix: this is what MiniBatchSparsePCA is
# fitted on. It holds one column per vocabulary entry, so it is memory hungry.
X_train_dense = X_train.toarray()

dictionary has  22955 entries


In [5]:
n_components = 5

spca = MiniBatchSparsePCA(n_components=n_components, alpha=0.04,
                          batch_size=3, n_iter=100, random_state=0)

%time X_train_reduced = spca.fit_transform(X_train_dense)

CPU times: user 27.8 s, sys: 7.58 s, total: 35.3 s
Wall time: 27.5 s


In [6]:
# Report how sparse each learned component is.
for i in range(n_components):
    # np.count_nonzero counts in vectorized code; the builtin sum() would
    # walk the boolean mask one element at a time in Python.
    nonzero_count = np.count_nonzero(spca.components_[i, :])
    print('component', i, 'has', nonzero_count, 'nonzero entries')

component 0 has 31 nonzero entries
component 1 has 20 nonzero entries
component 2 has 28 nonzero entries
component 3 has 33 nonzero entries
component 4 has 34 nonzero entries


Retrieve the indices of the nonzero entries of each component.

In [7]:
# One index array per component: the column positions whose weight in that
# component is nonzero.
component_idxs = [np.flatnonzero(component) for component in spca.components_]

Invert the vocabulary (a token -> int mapping) to obtain an int -> token mapping.

In [8]:
# Reverse lookup table: column index -> token, built by flipping the
# vectorizer's token -> index vocabulary.
idx_to_words = {
    column: token for token, column in vectorizer.vocabulary_.items()
}

In [9]:
# Print the vocabulary tokens that carry nonzero weight in each component.
# Iterating over component_idxs itself, rather than a hard-coded range(5),
# keeps this cell in step with however many components were extracted.
for i, token_idxs in enumerate(component_idxs):
    print('Tokens for', i, 'component:')
    # The inner loop uses its own name so it does not shadow the component
    # counter `i`.
    print([idx_to_words[token_idx] for token_idx in token_idxs])
    print()

Tokens for 0 component:
['address', 'ai', 'andrew', 'berkeley', 'ca', 'cable', 'card', 'chip', 'columbia', 'cubs', 'digex', 'dma', 'dos', 'doug', 'edu', 'ftp', 'g', 'game', 'james', 'marc', 'motorola', 'o', 'object', 'rose', 's', 'sandvik', 'space', 'state', 'su', 'uga', 'x']

Tokens for 1 component:
['acsu', 'behanna', 'buffalo', 'ca', 'cs', 'csc', 'data', 'ericsson', 'freenet', 'government', 'l', 'nasa', 'p', 'problem', 'thanks', 'usc', 'utexas', 'uucp', 'v', 'video']

Tokens for 2 component:
['andrew', 'au', 'berkeley', 'bus', 'c', 'cmu', 'color', 'cs', 'data', 'david', 'digital', 'dog', 'don', 'fpu', 'gun', 'll', 'lost', 'nasa', 'netcom', 'problem', 's', 'scsi', 'se', 'small', 'unit', 've', 'version', 'vram']

Tokens for 3 component:
['att', 'chip', 'christ', 'computer', 'cs', 'data', 'drive', 'fbi', 'file', 'files', 'game', 'god', 'hp', 'ibm', 'insurance', 'k', 'legal', 'lehigh', 'm', 'mouse', 'polygon', 'price', 'running', 's', 'season', 'simms', 'temple', 've', 'w', 'way', 'win'