## Use Python NLTK

In [1]:
import nltk
from nltk.corpus import brown

## Use the Brown news corpus as the training data

In [2]:
# Tagged sentences from the "news" category of the Brown corpus; they are the
# training data for the unigram and bigram taggers built below.
brown_train = brown.tagged_sents(categories='news')

# Pattern-based tagger used as the last link of the backoff chain. The patterns
# are tried in the order listed, so the catch-all '.*' rule must stay last and
# the capitalised-word rule must stay ahead of the suffix rules.
regexp_tagger = nltk.RegexpTagger(
    [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),     # integers and decimals; the dot is escaped so
                                           # that only a literal "." separates the digits
     (r'(-|:|;)$', ':'),                   # dash, colon or semicolon
     (r'\'*$', 'MD'),                      # tokens made up solely of apostrophes
     (r'(The|the|A|a|An|an)$', 'AT'),      # articles
     (r'.*able$', 'JJ'),                   # "-able" words
     (r'^[A-Z].*$', 'NNP'),                # anything starting with an ASCII capital
     (r'.*ness$', 'NN'),                   # "-ness" words
     (r'.*ly$', 'RB'),                     # "-ly" words
     (r'.*s$', 'NNS'),                     # words ending in "s"
     (r'.*ing$', 'VBG'),                   # "-ing" words
     (r'.*ed$', 'VBD'),                    # "-ed" words
     (r'.*', 'NN')                         # default: everything else is tagged NN
])

# Backoff chain: bigram tagger -> unigram tagger -> regular-expression tagger.
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)

# Merge grammar consulted by NPExtractor.extract(). Each key is the pair of tags
# of two neighbouring tokens joined by "+"; the value is the tag given to the
# phrase formed by merging them. "NNI" is a label used only by this notebook for
# merged noun phrases.
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}

## Create an extraction class

In [3]:

class NPExtractor(object):
    """Pull candidate topic phrases out of a piece of text.

    The text is tokenized, tagged with the module-level ``bigram_tagger``,
    and neighbouring tokens are then merged into longer phrases according to
    the module-level ``cfg`` rules. Phrases that end up tagged "NNP" or "NNI"
    are reported.
    """

    def __init__(self, sentence):
        """Remember the text that ``extract`` will analyse."""
        self.sentence = sentence

    def tokenize_sentence(self, sentence):
        """Split ``sentence`` into single words/tokens with ``nltk.word_tokenize``."""
        return nltk.word_tokenize(sentence)

    def normalize_tags(self, tagged):
        """Simplify the tags of a list of ``(word, tag)`` pairs.

        Exactly one rule is applied per pair, the first that matches:

        * "NP" or "NP-TL" becomes "NNP";
        * a trailing "-TL" is removed;
        * a trailing "S" is removed (e.g. "NNS" becomes "NN").

        Any other tag is kept as is. Returns a new list of ``(word, tag)``
        tuples; the input is not modified.
        """
        normalized = []
        for pair in tagged:
            word, tag = pair[0], pair[1]
            if tag in ("NP-TL", "NP"):
                tag = "NNP"
            elif tag.endswith("-TL"):
                tag = tag[:-3]
            elif tag.endswith("S"):
                tag = tag[:-1]
            normalized.append((word, tag))
        return normalized

    def extract(self):
        """Return the topic phrases found in ``self.sentence``.

        The result is a list of strings in order of appearance; repeated
        phrases are not de-duplicated.
        """
        words = self.tokenize_sentence(self.sentence)
        tagged = self.normalize_tags(bigram_tagger.tag(words))

        # Repeatedly look for the first adjacent pair whose tags form a rule
        # in cfg, replace the pair with one merged entry, and rescan from the
        # start. Stop once a full scan finds nothing left to merge.
        merged_something = True
        while merged_something:
            merged_something = False
            for idx in range(len(tagged) - 1):
                left, right = tagged[idx], tagged[idx + 1]
                rule = "%s+%s" % (left[1], right[1])
                merged_tag = cfg.get(rule, '')
                if not merged_tag:
                    continue
                phrase = "%s %s" % (left[0], right[0])
                tagged[idx:idx + 2] = [(phrase, merged_tag)]
                merged_something = True
                break

        # Only proper nouns (NNP) and merged noun phrases (NNI) are reported;
        # tokens left with a plain "NN" tag are not.
        return [item[0] for item in tagged if item[1] in ("NNP", "NNI")]

## Example:
Today's Apple event is likely to emphasize the power of polish. Patient folks who have waited for the refined updates to Apple's latest hardware are likely to be rewarded. Refined updates of the iPhone, Apple TV, and iPad, each with a bit more power and a handful of novel features, have been rumored to make an appearance.

Read next: The iPhone 6s review.

We should learn about the iPhone 6S and iPhone 6S Plus. They're likely to look the same as their predecessors, but with faster processors, an improved camera, and Force Touch, the haptic feedback already part of newer MacBook trackpads and the Apple Watch.

We're hesitant to say anything, but this will probably be the year Apple updates the Apple TV. The Apple TV has been rumored for previous Apple events, only to be a no-show, but all signs point to the new Apple TV, now with a Siri integrated remote, showing up on stage. The rumored price tag is somewhere between $149 and $199, a significant bump above the $69 asking price for the current model. Perhaps Apple will show the long-rumored Apple TV video game controller.



In [4]:
# Demo input: the article text shown above, kept verbatim (including the
# surrounding blank lines) inside a triple-quoted string.
sentence = """
Today's Apple event is likely to emphasize the power of polish. Patient folks who have waited for the refined updates to Apple's latest hardware are likely to be rewarded. Refined updates of the iPhone, Apple TV, and iPad, each with a bit more power and a handful of novel features, have been rumored to make an appearance.

Read next: The iPhone 6s review.

We should learn about the iPhone 6S and iPhone 6S Plus. They're likely to look the same as their predecessors, but with faster processors, an improved camera, and Force Touch, the haptic feedback already part of newer MacBook trackpads and the Apple Watch.

We're hesitant to say anything, but this will probably be the year Apple updates the Apple TV. The Apple TV has been rumored for previous Apple events, only to be a no-show, but all signs point to the new Apple TV, now with a Siri integrated remote, showing up on stage. The rumored price tag is somewhere between $149 and $199, a significant bump above the $69 asking price for the current model. Perhaps Apple will show the long-rumored Apple TV video game controller.


"""

# Run the extractor over the whole text.
np_extractor = NPExtractor(sentence)
result = np_extractor.extract()

# The first phrase found is reported as the main topic, the rest as alternatives.
main_topic = result[0]
other_topics = result[1:]
print("This sentence is about: {0}".format(main_topic))
print("Other possible topics are: {0} ".format(other_topics))

This sentence is about: Apple
Other possible topics are: ['Patient', 'Apple', 'Refined', 'Apple', 'Read', 'iPhone 6s review', 'iPhone 6S', 'iPhone 6S', 'Touch', 'haptic feedback', 'MacBook', 'Apple Watch', "'re hesitant", 'Apple', 'Apple', 'Apple', 'Apple', 'Apple', 'Siri', 'price tag', 'significant bump', 'current model', 'Apple', 'Apple', 'TV video game controller'] 
