# Document Classify

Ejemplo extradido del libro: Natural Language Processing with Python Cookbook

In [1]:
import nltk

In [2]:
import random

In [4]:
import feedparser

In [5]:
urls = {
    'mlb': 'https://sports.yahoo.com/mlb/rss.xml',
    'nfl': 'https://sports.yahoo.com/nfl/rss.xml',
}

In [6]:
feedmap = {}

In [7]:
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
def featureExtractor(words):
    features = {}
    for word in words:
        if word not in stopwords:
            features["word({})".format(word)] = True
    return features

In [9]:
sentences = []

In [10]:
for category in urls.keys():
    feedmap[category] = feedparser.parse(urls[category])
    print("downloading {}".format(urls[category]))
    for entry in feedmap[category]['entries']:
        data = entry['summary']
        words = data.split()
        sentences.append((category, words))

downloading https://sports.yahoo.com/nfl/rss.xml
downloading https://sports.yahoo.com/mlb/rss.xml


In [11]:
featuresets = [(featureExtractor(words), category) for category, words in sentences]

In [12]:
random.shuffle(featuresets)

In [13]:
total = len(featuresets)

In [14]:
off = int(total/2)

In [15]:
trainset = featuresets[off:]

In [16]:
testset = featuresets[:off]

In [17]:
classifier = nltk.NaiveBayesClassifier.train(trainset)

In [18]:
print(nltk.classify.accuracy(classifier, testset))

0.8235294117647058


In [19]:
classifier.show_most_informative_features(5)

Most Informative Features
             word(three) = True              nfl : mlb    =      3.5 : 1.0
                 word(A) = True              mlb : nfl    =      3.3 : 1.0
                word(--) = True              mlb : nfl    =      2.5 : 1.0
            word(Monday) = True              mlb : nfl    =      2.3 : 1.0
              word(take) = True              mlb : nfl    =      2.3 : 1.0


In [20]:
for (i, entry) in enumerate(feedmap['nfl']['entries']):
    if i < 4:
        features = featureExtractor(entry['title'].split())
        category = classifier.classify(features)
        print('{} -> {}'.format(category, entry['summary']))

nfl -> The former NFL MVP looks like his "old" self three games into the season, but he's fed up with the narratives forming around his strong start.
nfl -> The Titans run much more than they throw, which should play into the hands of the Eagles' defense. By Reuben Frank
nfl -> Pittsburgh Steelers rise nine spots in Week 4 power rankings, now rankings at No. 7 on Elliot Harrison's list.
nfl -> Jimmy Garoppolo’s season-ending ACL injury was one of the biggest Week 3 storylines. Now, the 49ers will have to try and salvage their season without their franchise quarterback. C.J. Beathard isn’t exactly Aaron Rodgers, so it’s no surprise why our Yahoo Fantasy Football experts have the Los Angeles
