In [1]:
from time import time

from  sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Fetch the 20-newsgroups corpus with headers, footers and quoted replies
# stripped, then keep only the first `num_data` posts as the working sample.
# The shuffle is seeded so the selected sample is the same on every run.
print("Loading dataset...")
t0 = time()
num_data = 2000
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
data_samples = dataset.data[:num_data]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 2.576s.


In [3]:
# Peek at the first raw document to sanity-check what the loader returned.
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [4]:
# Bag-of-words term counts for LDA: word-level tokens, English stop words
# removed, terms kept only if they occur in at least 2 documents and in at
# most 95% of them, vocabulary capped at 50000 terms.
tf_vectorizer = CountVectorizer(
    analyzer="word",
    stop_words='english',
    max_features=50000,
    min_df=2,
    max_df=0.95,
)

# Only the fit/transform step is timed, not the vectorizer construction.
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 0.499s.


In [5]:
import numpy as np

# Largest term count in the first document. Slice the sparse row first so the
# whole document-term matrix is never converted to a dense array.
tf[0].max()

4

In [6]:

# Fit a 10-topic LDA on the term-count matrix with online variational Bayes;
# the seed is fixed so the topics are reproducible.
# NOTE(review): `n_topics` matches the scikit-learn release this notebook was
# run with; newer releases call this argument `n_components` - confirm before
# upgrading.
lda = LatentDirichletAllocation(
    n_topics=10,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 3.953s.


In [7]:
# Project the first document's count row through the fitted LDA and check the
# shape of the result: one row for the document, one column per topic.
lda.transform(tf[0]).shape

(1, 10)

In [8]:
# List every hyper-parameter of the LDA estimator, including the defaults that
# were not set explicitly when it was constructed.
lda.get_params()

{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 50.0,
 'max_doc_update_iter': 100,
 'max_iter': 5,
 'mean_change_tol': 0.001,
 'n_jobs': 1,
 'n_topics': 10,
 'perp_tol': 0.1,
 'random_state': 0,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}

In [12]:
from  sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LinearRegression

class TopicModeling(LatentDirichletAllocation):
    """LDA over raw text with a small supervised head that names the topics.

    The estimator owns a ``CountVectorizer`` so that ``fit`` / ``transform``
    accept raw documents (strings) rather than a pre-computed count matrix.
    After the unsupervised fit, ``fit_label`` trains a linear regression
    (no intercept) from the LDA representation of a few labelled documents to
    one-hot label vectors; ``predict_topic`` / ``predict_topic_proba`` then map
    new documents to one of ``num_topics`` labels.

    Parameters
    ----------
    num_topics : int
        Number of target labels predicted by the label head.
    num_latent : int, optional
        Number of latent LDA topics. Defaults to ``num_topics``.
    *args, **kwargs
        Forwarded unchanged to ``LatentDirichletAllocation.__init__``.
    """

    def __init__(self, num_topics, num_latent=None, *args, **kwargs):
        if num_latent is None:
            num_latent = num_topics
        self.num_topics = num_topics
        # NOTE(review): `n_topics` is the argument name in the scikit-learn
        # release this notebook ran on; newer releases use `n_components`.
        # NOTE(review): scikit-learn asks estimators to avoid varargs in
        # __init__; get_params()/clone()/repr() may not work on this class -
        # confirm before relying on them.
        super().__init__(n_topics=num_latent, *args, **kwargs)
        self.label_model = LinearRegression(fit_intercept=False)
        self.vectorizer = CountVectorizer(
            max_df=0.95,
            min_df=2,
            max_features=50000,
            stop_words='english',
            analyzer="word",
        )

    def fit_label(self, X, y):
        """Train the label head on labelled raw documents.

        ``X`` is an iterable of raw documents and ``y`` the matching integer
        labels in ``range(num_topics)``. The model must already be fitted
        with ``fit``. Returns None.
        """
        features = self.transform(X)
        targets = self._get_onehot(y)
        self.label_model.fit(features, targets)

    def fit(self, X, *args, **kwargs):
        """Learn the vocabulary from raw documents ``X`` and fit the LDA.

        Extra arguments are forwarded to ``CountVectorizer.fit_transform``.
        Returns ``self`` (the scikit-learn ``fit`` contract) so calls can be
        chained.
        """
        counts = self.vectorizer.fit_transform(X, *args, **kwargs)
        super().fit(counts)
        return self

    def transform(self, X, *args, **kwargs):
        """Vectorize raw documents ``X`` and return their LDA representation."""
        counts = self.vectorizer.transform(X)
        return super().transform(counts, *args, **kwargs)

    def fit_transform(self, X, *args, **kwargs):
        """Fit on raw documents ``X`` and return their LDA representation.

        Extra arguments go to ``fit`` only: the underlying LDA ``transform``
        takes just the data, so forwarding them (e.g. a ``y`` passed by a
        pipeline) would raise a TypeError.
        """
        self.fit(X, *args, **kwargs)
        return self.transform(X)

    def _label_scores(self, X):
        """Raw regression outputs of the label head, one row per document."""
        return self.label_model.predict(self.transform(X))

    def predict_topic_proba(self, X):
        """Return a per-document distribution over the ``num_topics`` labels.

        Linear-regression outputs are unbounded, so negative scores are
        clipped to zero before each row is normalized to sum to one. A row
        with no positive score falls back to the uniform distribution instead
        of dividing by zero.
        """
        scores = np.clip(self._label_scores(X), 0.0, None)
        totals = scores.sum(axis=1, keepdims=True)
        has_mass = totals > 0
        # Substitute a harmless divisor for empty rows; their quotient is
        # discarded by the outer np.where.
        safe_totals = np.where(has_mass, totals, 1.0)
        return np.where(has_mass, scores / safe_totals, 1.0 / self.num_topics)

    def predict_topic(self, X):
        """Return the index of the best-scoring label for each document.

        The argmax is taken over the raw regression scores: dividing by a
        negative row sum (as a naive normalization would) reverses the
        ordering and would select the worst label.
        """
        return np.argmax(self._label_scores(X), axis=1)

    def _get_onehot(self, y):
        """Encode integer labels ``y`` as rows of a ``num_topics`` identity."""
        return np.eye(self.num_topics)[np.asarray(y)]

In [13]:
# Unsupervised fit on the whole sample, then train the label head on a toy
# supervision set: the first ten documents are given labels 0..9.
model = TopicModeling(num_topics=10)
model.fit(X=data_samples)
model.fit_label(X=data_samples[:10], y=np.arange(10))



In [14]:
# Predict a label index for every document in the sample using the label head
# trained in the previous cell.
model.predict_topic(data_samples)

array([0, 1, 2, ..., 9, 3, 8])