In [1]:
import pandas as pd
import nltk, re, pprint
from nltk.corpus import stopwords
import functools

stopwords = stopwords.words('english')

In [2]:
tweets = [
    "We are making great progress with healthcare. ObamaCare is imploding and will only get worse. Republicans coming together to get job done!",
    "'U.S. Consumer Comfort Just Reached Its Highest Level in a Decade' ??https://t.co/S8nZgmeMMV https://t.co/xC0piRa6eP",
    "For eight years Russia 'ran over' President Obama, got stronger and stronger, picked-off Crimea and added missiles. Weak! @foxandfriends",
    "122 vicious prisoners, released by the Obama Administration from Gitmo, have returned to the battlefield. Just another terrible decision!",
    "MAKE AMERICA GREAT AGAIN!",
    "North Korea just stated that it is in the final stages of developing a nuclear weapon capable of reaching parts of the U.S. It won't happen!",
    "Russia has more warheads than ever, N Korea is testing nukes, and Iran got a sweetheart deal to keep theirs. Thanks, @HillaryClinton.",
    "In Bangladesh, hostages were immediately killed by ISIS terrorists if they were unable to cite a verse from the Koran. 20 were killed!"
]

In [3]:
def preprocess(tweet):
    tokenized_words = nltk.word_tokenize(tweet)
    pos_tagged = nltk.pos_tag(tokenized_words)
    return pos_tagged

In [4]:
preprocessed_tweets = [preprocess(tweet) for tweet in tweets]

In [5]:
"""
Chunking:

"""

# This grammar is described in the paper by S. N. Kim,
# T. Baldwin, and M.-Y. Kan.
# Evaluating n-gram based evaluation metrics for automatic
# keyphrase extraction.
# Technical report, University of Melbourne, Melbourne 2010.
grammar = r"""
    NBAR:
        # Nouns and Adjectives, terminated with Nouns
        {<NN.*|JJ>*<NN.*>}

    NP:
        # Noun Phrase Chunking
        {<NBAR>}
        # Above, connected with in/of/etc...
        {<NBAR><IN><NBAR>}
"""
trees = []
chunker = nltk.RegexpParser(grammar)
for index, tweet in enumerate(preprocessed_tweets):
    trees.append(chunker.parse(tweet))
#trees[0].draw()

In [6]:
# Helper function to traverse the graph and extract the NP
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    #bool(2 <= len(word) <= 40 and word.lower() not in stopwords)
    return word not in stopwords

def get_terms(tree):
    for leaf in leaves(tree):
        term = [ word.lower() for word, tag in leaf if acceptable_word(word) ]
        yield term

terms = []
for index, tree in enumerate(trees):
    terms.append(get_terms(tree))

In [7]:
def convert2Dinto1D(terms):
    return functools.reduce(lambda x,y :x+y ,[word for word in terms])


extracted_keywords = []
for term in terms:
    extracted_keywords.append(convert2Dinto1D(term))


[['great', 'progress', 'healthcare', 'obamacare', 'republicans', 'job'], ['consumer', 'comfort', 'just', 'highest', 'level', 'decade', 'https', '//t.co/s8nzgmemmv', 'https', '//t.co/xc0pira6ep'], ['years', 'russia', 'president', 'obama', 'picked-off', 'crimea', 'missiles', 'foxandfriends'], ['vicious', 'prisoners', 'obama', 'administration', 'gitmo', 'battlefield', 'just', 'terrible', 'decision'], ['make', 'america', 'great', 'again'], ['north', 'korea', 'final', 'stages', 'nuclear', 'weapon', 'parts', 'u.s'], ['russia', 'warheads', 'n', 'korea', 'nukes', 'iran', 'sweetheart', 'deal', 'thanks', '@', 'hillaryclinton'], ['bangladesh', 'hostages', 'isis', 'terrorists', 'verse', 'koran']]


In [8]:
# Printing Results

for keywords in extracted_keywords:
    print(keywords)

['great', 'progress', 'healthcare', 'obamacare', 'republicans', 'job']
['consumer', 'comfort', 'just', 'highest', 'level', 'decade', 'https', '//t.co/s8nzgmemmv', 'https', '//t.co/xc0pira6ep']
['years', 'russia', 'president', 'obama', 'picked-off', 'crimea', 'missiles', 'foxandfriends']
['vicious', 'prisoners', 'obama', 'administration', 'gitmo', 'battlefield', 'just', 'terrible', 'decision']
['make', 'america', 'great', 'again']
['north', 'korea', 'final', 'stages', 'nuclear', 'weapon', 'parts', 'u.s']
['russia', 'warheads', 'n', 'korea', 'nukes', 'iran', 'sweetheart', 'deal', 'thanks', '@', 'hillaryclinton']
['bangladesh', 'hostages', 'isis', 'terrorists', 'verse', 'koran']
