In [11]:
import nltk
# Fetch the NLTK data packages this script requests: the POS-tagger model and
# the WordNet corpus, in that order.
for _resource in ('averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_resource)
# Sample input: an unpunctuated transcript excerpt that is fed to the tokenizer
# below.  NOTE(review): the odd wording ("should car operator assistance", ...)
# looks like speech-to-text output; it is runtime data, so it is left untouched.
text = """good day ladies and gentleman and welcome to the alphabet second quarter 2018 earnings call
at this time all participants are in a listen-only mode later we will conduct question-and-answer session
and it will be given at that time if anyone should car operator assistance to spell star and then 0 on 
your touch-tone telephone and now it's in the conference call over to Ellen West had an investor relations
please go ahead thank you good afternoon everyone and welcome to alphabet second quarter 2018 earnings 
conference call with us today our Ruth Pratt and Sundar pichai now I'll quickly cover the Safe Harbor 
some of the statements that we make today may be considered forward-looking including statements regarding
our future Investments are long-term growth and Innovation the expected performance of our businesses"""

# Verbose-mode token pattern handed to nltk.regexp_tokenize() further down.
# The alternatives are tried left to right at each position.
# NOTE(review): because the `\w+` word rule comes before the currency rule, an
# input such as "82%" is presumably tokenized as "82" with the "%" dropped;
# confirm before relying on percentage or decimal tokens.
sentence_re = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

# Shared normalisers used by normalise(): a WordNet lemmatizer and a Porter
# stemmer, created once at module level.
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Chunk grammar taken from the Su Nam Kim paper: NBAR groups nouns/adjectives
# ending in a noun, and NP is built from NBAR chunks.
# NOTE(review): the NP stage lists `{<NBAR>}` before `{<NBAR><IN><NBAR>}`; if
# rules are applied in order, every NBAR is already wrapped by the first rule
# and the second may never match -- verify against nltk.RegexpParser semantics.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
# Parser built from the grammar above; its parse() is applied to the tagged
# tokens further down.
chunker = nltk.RegexpParser(grammar)

# Tokenize the transcript with the verbose pattern, then POS-tag the tokens.
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

# Show the (token, tag) pairs for inspection.
print(postoks)

# Chunk the tagged tokens into the tree that is later passed to get_terms().
tree = chunker.parse(postoks)

# Load the English stopword list used by acceptable_word().
# The corpus is downloaded first (like the tagger and WordNet data at the top
# of the file) so a machine without it does not fail with LookupError, and the
# corpus reader is imported under an alias so the `stopwords` list no longer
# rebinds the name of the reader it was loaded from.
from nltk.corpus import stopwords as stopwords_corpus
nltk.download('stopwords')
stopwords = stopwords_corpus.words('english')


def leaves(tree):
    """Yield the leaf list of each subtree of *tree* whose label is 'NP'."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for noun_phrase in tree.subtrees(filter=is_noun_phrase):
        yield noun_phrase.leaves()

def normalise(word):
    """Return *word* lowercased, then stemmed, then lemmatized.

    Uses the module-level ``stemmer`` and ``lemmatizer`` objects, applied in
    that order.
    """
    lowered = word.lower()
    return lemmatizer.lemmatize(stemmer.stem(lowered))

def acceptable_word(word):
    """Return True when *word* is 2-40 characters long and not a stopword.

    The stopword check is case-insensitive (the word is lowercased first) and
    is only performed when the length check passes.
    """
    if not 2 <= len(word) <= 40:
        return False
    return word.lower() not in stopwords


def get_terms(tree):
    """Yield one list of normalised words per NP chunk found in *tree*.

    Words rejected by acceptable_word() are skipped, so a yielded list may be
    empty.
    """
    for phrase in leaves(tree):
        kept = []
        for token, _tag in phrase:
            if acceptable_word(token):
                kept.append(normalise(token))
        yield kept

# Print each extracted noun phrase on its own line, words separated by spaces.
# `print(word),` followed by a bare `print` only does this with the Python 2
# print statement; joining first gives the same output under the Python 3
# print() function as well.
terms = get_terms(tree)

for term in terms:
    print(' '.join(term))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kasun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/kasun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[('good', 'JJ'), ('day', 'NN'), ('ladies', 'NNS'), ('and', 'CC'), ('gentleman', 'NN'), ('and', 'CC'), ('welcome', 'NN'), ('to', 'TO'), ('the', 'DT'), ('alphabet', 'JJ'), ('second', 'JJ'), ('quarter', 'NN'), ('2018', 'CD'), ('earnings', 'NNS'), ('call', 'NN'), ('at', 'IN'), ('this', 'DT'), ('time', 'NN'), ('all', 'DT'), ('participants', 'NNS'), ('are', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('listen-only', 'JJ'), ('mode', 'NN'), ('later', 'RBR'), ('we', 'PRP'), ('will', 'MD'), ('conduct', 'VB'), ('question-and-answer', 'JJR'), ('session', 'NN'), ('and', 'CC'), ('it', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('given', 'VBN'), ('at', 'IN'), ('that', 'DT'), ('time', 'NN'), ('if', 'IN'), 