In [25]:
import pandas as pd
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups

# Load the train and test splits of 20 Newsgroups with identical settings:
# shuffled, and with headers, footers and quoted replies removed from each post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset = split_name,
        shuffle = True,
        remove = ('headers', 'footers', 'quotes'),
    )
    for split_name in ('train', 'test')
)

In [54]:
# spaCy objects: the small English model (used by the tokenizer for lemmas)
# and a bare `English` language object.
nlp = spacy.load("en_core_web_sm")
parser = English()

# Filter vocabularies consulted by the tokenizer.
stop_words = STOP_WORDS
punctuations = string.punctuation

def spacy_tokenizer(article):
    """Tokenize ``article`` with spaCy and return its filtered lemmas.

    The text is run through the module-level ``nlp`` pipeline and every token
    is replaced by its lemma. A lemma is discarded when it is:

    * empty or whitespace-only (single newlines, runs of newlines, spaces, tabs),
    * a member of ``stop_words``, or
    * made up solely of characters found in ``punctuations``.

    Args:
        article: Raw document text.

    Returns:
        list[str]: The surviving lemmas, in document order.
    """
    # `lemma in punctuations` against the raw string is a *substring* test, so
    # multi-character punctuation such as "--" or "..." would slip through.
    # Checking character by character against a set avoids that.
    punctuation_chars = set(punctuations)

    my_tokens = []
    for token in nlp(article):
        lemma = token.lemma_
        # Whitespace-only lemmas of any length, not just a lone "\n".
        if not lemma.strip():
            continue
        if lemma in stop_words:
            continue
        if all(char in punctuation_chars for char in lemma):
            continue
        my_tokens.append(lemma)
    return my_tokens

In [55]:
# Fit the LDA topic model with fixed parameters (no grid search is run in this cell)
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the training articles, tokenized by `spacy_tokenizer`.
# token_pattern=None: the default regex is ignored whenever a custom tokenizer
# is supplied, and leaving it at its default makes scikit-learn emit a
# UserWarning on every fit.
cv = CountVectorizer(tokenizer = spacy_tokenizer, token_pattern = None)
data_fit = cv.fit_transform(newsgroups_train.data)

# 20-component LDA; random_state is pinned so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components = 20, random_state = 213)
# Output of fit_transform on the count matrix (one row per training article).
results = lda.fit_transform(data_fit)

In [57]:
# One row of weights per topic; each row is paired with the vectorizer's
# vocabulary below to rank the terms.
lda_components = lda.components_

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its
# replacement and fall back only on older releases that lack it.
if hasattr(cv, "get_feature_names_out"):
    terms = cv.get_feature_names_out()
else:
    terms = cv.get_feature_names()

# Print the 7 highest-weighted terms of every topic.
for index, component in enumerate(lda_components):
    top_terms_key = sorted(zip(terms, component), key = lambda t: t[1], reverse=True)[:7]
    # str(): `get_feature_names_out()` yields NumPy string scalars, which would
    # otherwise print as np.str_('...') under NumPy 2.
    top_terms_list = [str(term) for term, _ in top_terms_key]
    print("Topic "+str(index)+": ",top_terms_list)

'lda_components = lda.components_\n\nterms = cv.get_feature_names()\n\nfor index, component in enumerate(lda_components):\n    zipped = zip(terms, component)\n    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]\n    top_terms_list=list(dict(top_terms_key).keys())\n    print("Topic "+str(index)+": ",top_terms_list)'