In [7]:
import pandas as pd
import nltk, re, pprint
from nltk.corpus import stopwords

# NOTE: this rebinds the imported `stopwords` name to the English stopword
# list it returns; acceptable_word() below reads this module-level name.
stopwords = stopwords.words('english')

# Sample tweet that is tokenized and POS-tagged in the next cell.
tweet = "I subscribe to Luca Mezzalira take on state management. And just for you to know.. white beards are class!! Now let me observe your talk."



In [8]:
def preprocess(tweet):
    """Tokenize ``tweet`` with NLTK and return its part-of-speech tagged tokens.

    Args:
        tweet: Raw text to process.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word tokens of ``tweet``.
    """
    return nltk.pos_tag(nltk.word_tokenize(tweet))

# Tokenize and POS-tag the sample tweet, then print the tagged result.
preprocessed_tweet = preprocess(tweet)

print(preprocessed_tweet)


[('I', 'PRP'), ('subscribe', 'VBP'), ('to', 'TO'), ('Luca', 'NNP'), ('Mezzalira', 'NNP'), ('take', 'VB'), ('on', 'IN'), ('state', 'NN'), ('management', 'NN'), ('.', '.'), ('And', 'CC'), ('just', 'RB'), ('for', 'IN'), ('you', 'PRP'), ('to', 'TO'), ('know..', 'VB'), ('white', 'JJ'), ('beards', 'NNS'), ('are', 'VBP'), ('class', 'NN'), ('!', '.'), ('!', '.'), ('Now', 'RB'), ('let', 'VB'), ('me', 'PRP'), ('observe', 'VB'), ('your', 'PRP$'), ('talk', 'NN'), ('.', '.')]


In [12]:

# Chunking: group the POS-tagged tokens into noun phrases (NP) using a
# regular-expression grammar over the tags.

# This grammar is described in the paper by S. N. Kim,
# T. Baldwin, and M.-Y. Kan.
# Evaluating n-gram based evaluation metrics for automatic
# keyphrase extraction.
# Technical report, University of Melbourne, Melbourne 2010.
#
# Rule order inside the NP stage matters: rules are applied one after another
# and a chunk rule only matches tokens that are not already inside a chunk.
# The compound rule <NBAR><IN><NBAR> must therefore come BEFORE the bare
# <NBAR> rule; otherwise every NBAR is already wrapped in its own NP and the
# compound rule can never match.
grammar = r"""
    NBAR:
        # Nouns and Adjectives, terminated with Nouns
        {<NN.*|JJ>*<NN.*>}

    NP:
        # Two NBARs connected with in/of/etc...
        {<NBAR><IN><NBAR>}
        # Any remaining NBAR is a noun phrase on its own
        {<NBAR>}
"""
chunker = nltk.RegexpParser(grammar)
tree = chunker.parse(preprocessed_tweet)

# Prints the tagged input (not the parse tree); `tree` is consumed by the
# keyword-extraction cell that follows.
print(preprocessed_tweet)


[('I', 'PRP'), ('subscribe', 'VBP'), ('to', 'TO'), ('Luca', 'NNP'), ('Mezzalira', 'NNP'), ('take', 'VB'), ('on', 'IN'), ('state', 'NN'), ('management', 'NN'), ('.', '.'), ('And', 'CC'), ('just', 'RB'), ('for', 'IN'), ('you', 'PRP'), ('to', 'TO'), ('know..', 'VB'), ('white', 'JJ'), ('beards', 'NNS'), ('are', 'VBP'), ('class', 'NN'), ('!', '.'), ('!', '.'), ('Now', 'RB'), ('let', 'VB'), ('me', 'PRP'), ('observe', 'VB'), ('your', 'PRP$'), ('talk', 'NN'), ('.', '.')]


In [11]:
# Helper that walks the chunk tree and pulls out the NP subtrees
def leaves(tree):
    """Yield the leaf list of every subtree labelled 'NP' in a chunk tree."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for noun_phrase in tree.subtrees(filter=is_noun_phrase):
        yield noun_phrase.leaves()

def acceptable_word(word, stop_words=None):
    """Return True when ``word`` is not a stopword, ignoring letter case.

    Args:
        word: Token text to check.
        stop_words: Optional collection of lower-case stopwords to check
            against. When omitted, the module-level ``stopwords`` is used.

    Returns:
        bool: False if the lower-cased ``word`` is in the stopword
        collection, True otherwise.
    """
    if stop_words is None:
        stop_words = stopwords
    # NLTK's English stopwords are lower-case, so compare on the lower-cased
    # token; a case-sensitive check lets "The", "And", ... slip through.
    return word.lower() not in stop_words

# Walking the tree and Normalisation
"""

Normalisation may consist of lower-casing words, removing stop-words which appear in many documents 
(e.g. if, the, a…), stemming (e.g. cars → car), and lemmatizing (e.g. drove, drives, driving → drive). 

"""

def get_terms(tree):
    """Yield, per noun phrase in ``tree``, its acceptable words lower-cased.

    Each yielded item is a list; words rejected by ``acceptable_word`` are
    dropped, so a list may be empty.
    """
    for noun_phrase in leaves(tree):
        kept = []
        for token, _pos in noun_phrase:
            if acceptable_word(token):
                kept.append(token.lower())
        yield kept

terms = get_terms(tree)


# Flatten the per-phrase term lists into one keyword list, keeping order.
extracted_keywords = []
for term in terms:
    extracted_keywords += term
print(extracted_keywords)

['luca', 'mezzalira', 'state', 'management', 'white', 'beards', 'class', 'talk']
