In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora, models
import nltk
import feedparser
# !pip install gensim

In [6]:
class IdentifyingTopicExample:
    """Fetch RSS summaries, tokenize them, and print LDA topics.

    The pipeline has three stages, run in order by :meth:`run`:
    ``getDocuments`` -> ``cleanDocuments`` -> ``doLDA``.

    Args:
        url: RSS feed to read summaries from.
        max_entries: Only the first ``max_entries`` feed entries are considered.
        num_topics: Number of LDA topics to fit and print.
        num_words: Number of words shown per printed topic.
        random_state: Forwarded to gensim's ``LdaModel``. Pass an int to get
            the same topics on every run; ``None`` leaves seeding to gensim.

    Attributes:
        documents: Raw summary strings collected by ``getDocuments``.
        cleaned: One list of lowercase, stopword-free tokens per document,
            produced by ``cleanDocuments``.
    """

    def __init__(self, url='https://sports.yahoo.com/mlb/rss.xml', max_entries=5,
                 num_topics=2, num_words=4, random_state=None):
        self.url = url
        self.max_entries = max_entries
        self.num_topics = num_topics
        self.num_words = num_words
        self.random_state = random_state
        # Initialised here so the stages fail softly (empty input) rather than
        # with AttributeError when a cell is run out of order.
        self.documents = []
        self.cleaned = []

    def getDocuments(self):
        """Download the feed and store the kept entry summaries in ``self.documents``.

        Each kept summary is echoed to stdout. Entries without a ``summary``
        field are skipped.
        """
        feed = feedparser.parse(self.url)
        self.documents = []
        for entry in feed['entries'][:self.max_entries]:
            text = entry.get('summary')
            if text is None:
                # Not every feed entry is guaranteed to carry a summary.
                continue
            # NOTE(review): this is a plain substring test, so any summary
            # containing the letters "ex" anywhere (e.g. "next", "Texas") is
            # dropped. Kept as-is; confirm the intended filter.
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(self.url))

    def cleanDocuments(self):
        """Tokenize ``self.documents`` into ``self.cleaned``.

        Each document is lowercased, split into runs of ASCII letters, and
        stripped of English stopwords.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        try:
            en_stop = set(stopwords.words('english'))  # English stopword list
        except LookupError:
            # The stopwords corpus is not bundled with nltk; fetch it once so
            # the notebook also runs on a fresh environment.
            nltk.download('stopwords')
            en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            lowercase_doc = doc.lower()  # normalise case before stopword lookup
            words = tokenizer.tokenize(lowercase_doc)
            non_stopped_words = [word for word in words if word not in en_stop]
            self.cleaned.append(non_stopped_words)
        print("INFO: Cleaning {} documents completed".format(len(self.documents)))

    def doLDA(self):
        """Fit an LDA model on ``self.cleaned`` and print its topics.

        Prints a notice and returns without fitting when there are no tokens
        at all, since a model cannot be trained on an empty vocabulary.
        """
        if not any(self.cleaned):
            print("INFO: No tokens available; skipping LDA")
            return
        dictionary = corpora.Dictionary(self.cleaned)  # token -> integer id mapping
        corpus = [dictionary.doc2bow(tokens) for tokens in self.cleaned]
        ldamodel = models.ldamodel.LdaModel(
            corpus,
            num_topics=self.num_topics,
            id2word=dictionary,
            random_state=self.random_state,
        )
        print(ldamodel.print_topics(num_topics=self.num_topics, num_words=self.num_words))

    def run(self):
        """Run the full pipeline: fetch, clean, then model."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()

In [7]:
if __name__ == '__main__':
    # Build the example object and run its pipeline via run(); the instance
    # is bound to a module-level name, so it remains available after the
    # run completes.
    topicExample = IdentifyingTopicExample()
    topicExample.run()

-- Should signing DJ LeMahieu be the top priority for New York? Or would a blockbuster trade put the Yankees on a clearer path to a World Series championship?
-- The Yankees added four players to their 40-man roster on Friday, protecting the players from selection in the Dec. 10 Rule 5 Draft.
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Clearning 2 documents completed
[(0, '0.050*"yankees" + 0.044*"new" + 0.042*"clearer" + 0.040*"dj"'), (1, '0.062*"players" + 0.053*"yankees" + 0.042*"roster" + 0.042*"protecting"')]
