# Identifying topics

Latent Dirichlet allocation algorithm

Topic identification is the process of discovering the topics that are present in a set of input documents. A topic can consist of multiple words that are characteristic of a given text.

In [1]:
from nltk.tokenize import RegexpTokenizer

In [2]:
from nltk.corpus import stopwords

In [4]:
from gensim import corpora, models



In [5]:
import nltk

In [6]:
import feedparser

In [7]:
class IdentifyingTopicExample:
    """Fetch RSS entry summaries, tokenize them, and print LDA topics.

    The pipeline is ``getDocuments`` -> ``cleanDocuments`` -> ``doLDA``;
    ``run`` executes the three steps in that order.

    Attributes are documented where they are assigned in ``__init__``.
    """

    DEFAULT_URL = 'https://sports.yahoo.com/mlb/rss.xml'

    def __init__(self, url: str = DEFAULT_URL, max_documents: int = 5,
                 num_topics: int = 2, num_words: int = 4) -> None:
        """Store the pipeline settings and start with empty document lists.

        Args:
            url: RSS feed to read summaries from.
            max_documents: Number of leading feed entries to consider.
            num_topics: Number of topics the LDA model is asked for.
            num_words: Number of words printed per topic.
        """
        self.url = url
        self.max_documents = max_documents
        self.num_topics = num_topics
        self.num_words = num_words
        # Initialised here so every step can run (as a no-op) even when an
        # earlier step was skipped, instead of raising AttributeError.
        self.documents = []  # raw summary strings kept by getDocuments
        self.cleaned = []    # one list of tokens per document

    def getDocuments(self) -> None:
        """Download the feed and keep the summaries of its first entries.

        Replaces ``self.documents`` and prints each summary that is kept.
        """
        feed = feedparser.parse(self.url)
        self.documents = []
        for entry in feed['entries'][:self.max_documents]:
            # Entries without a summary are skipped rather than raising
            # KeyError.
            text = entry.get('summary')
            if not text:
                continue
            # NOTE(review): this drops every summary containing the
            # substring 'ex' (e.g. "next", "expected"). The intent is not
            # visible from this file; kept as-is -- confirm it is wanted.
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(self.url))

    def cleanDocuments(self) -> None:
        """Lower-case, tokenize and stop-word-filter ``self.documents``.

        Replaces ``self.cleaned`` with one token list per document. Tokens
        are runs of ASCII letters; English stop words are removed.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = [
            [token for token in tokenizer.tokenize(doc.lower())
             if token not in en_stop]
            for doc in self.documents
        ]
        print("INFO: Cleaning {} documents completed".format(len(self.documents)))

    def doLDA(self) -> None:
        """Train an LDA model on ``self.cleaned`` and print its topics.

        When there are no tokens at all (no documents, or every document
        was reduced to nothing) the step is skipped with a message, because
        there is no vocabulary to build a model from.
        """
        if not any(self.cleaned):
            print("INFO: No tokens available, skipping LDA")
            return
        dictionary = corpora.Dictionary(self.cleaned)
        corpus = [dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned]
        ldamodel = models.ldamodel.LdaModel(
            corpus, num_topics=self.num_topics, id2word=dictionary)
        print(ldamodel.print_topics(num_topics=self.num_topics,
                                    num_words=self.num_words))

    def run(self) -> None:
        """Run the whole pipeline: fetch, clean, then model."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()

In [8]:
# Script entry point: build the example and run it (IdentifyingTopicExample.run
# calls getDocuments, cleanDocuments and doLDA in that order).
if __name__ == '__main__':
    topicExample = IdentifyingTopicExample()
    topicExample.run()

-- With the 2018 regular season winding down, free agency is a month from being the biggest story in baseball. Here is a comprehensive primer on what this offseason holds.
-- You know you're getting old when your boys of summer start applying for managerial jobs. Give you a for-instance: According to Bleacher Report, former A's third baseman Eric Chavez is the leading candidate to become manager of the Los Angeles Angels of Anaheim. As we speak, Mike Scioscia is the
-- Christian Yelich's numbers have added up to a MVP worthy season. If we're being honest, there have been better offensive seasons in Brewers history, not that this fact diminishes from the MVP-caliber campaign that Brewers outfielder Christian Yelich has undertaken. It falls in the top-10, though
-- The 2018 MLB regular season ends Sunday, meaning the postseason is now less than a week away. Even so, there are still plenty of postseason races that have to be decided between now and then. With that in mind, let's update th