<a href="https://colab.research.google.com/github/hjj31/AI/blob/master/CS50/Language/J020_Language_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import sys
import re
import nltk
# Tokenizer data used by nltk.word_tokenize (called in preprocess).
# Recent NLTK releases (3.9+) load the Punkt tables from the "punkt_tab"
# resource rather than the pickled "punkt" models, so fetch both to keep the
# notebook runnable on a fresh kernel with either old or new NLTK versions.
# The original call stays last so the cell still displays its result.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Lexical rules: each line maps a part-of-speech symbol (Adj, Adv, Conj, Det,
# N, P, V) to the quoted lower-case words it may expand to. A symbol may be
# listed on several lines (N and V are); the alternatives simply accumulate.
TERMINALS = """
Adj -> "country" | "dreadful" | "enigmatical" | "little" | "moist" | "red"
Adv -> "down" | "here" | "never"
Conj -> "and" | "until"
Det -> "a" | "an" | "his" | "my" | "the"
N -> "armchair" | "companion" | "day" | "door" | "hand" | "he" | "himself"
N -> "holmes" | "home" | "i" | "mess" | "paint" | "palm" | "pipe" | "she"
N -> "smile" | "thursday" | "walk" | "we" | "word"
P -> "at" | "before" | "in" | "of" | "on" | "to"
V -> "arrived" | "came" | "chuckled" | "had" | "lit" | "said" | "sat"
V -> "smiled" | "tell" | "were"
"""
# Phrase-structure rules built on top of the part-of-speech symbols above:
#   S    - sentence, including sentences joined by a conjunction
#   NP   - noun phrase (bare noun, determiner + noun, with adjectives, or
#          followed by a prepositional phrase)
#   VP   - verb phrase (verb with optional object, prepositional phrase or
#          adverb before/after)
#   PP   - prepositional phrase
#   AdjP - one or more adjectives
NONTERMINALS = """
S -> NP VP | VP NP | S Conj S
NP -> N | Det N | NP PP | Det AdjP N
VP -> V | V NP | V PP | Adv VP | VP Adv
PP -> P NP
AdjP -> Adj | Adj AdjP
"""

In [3]:
# Build one context-free grammar from the phrase rules followed by the
# lexical rules, then create the chart parser that main() uses.
# NOTE(review): NLTK appears to take the start symbol from the first rule in
# the string (here S), so the NONTERMINALS-first order likely matters —
# confirm before reordering.
grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
parser = nltk.ChartParser(grammar)

In [4]:
def main():
    """
    Read a sentence, parse it with the module-level `parser`, and print every
    parse tree followed by its noun phrase chunks.

    The sentence is read from the file named by the single command-line
    argument when exactly one is given; otherwise the user is prompted for it.
    A ValueError raised while parsing is printed and ends the run, as does a
    sentence for which the parser yields no tree.
    """
    # Exactly one extra argv entry means "read the sentence from this file";
    # anything else falls back to an interactive prompt.
    if len(sys.argv) != 2:
        raw_text = input("Sentence: ")
    else:
        with open(sys.argv[1]) as handle:
            raw_text = handle.read()

    # Turn the raw text into the list of word tokens the parser expects
    tokens = preprocess(raw_text)

    # Materialise all parses inside the try so that errors raised lazily by
    # the parser are caught here as well
    try:
        parses = list(parser.parse(tokens))
    except ValueError as err:
        print(err)
        return

    if len(parses) == 0:
        print("Could not parse sentence.")
        return

    # Show each tree, then the words of each of its noun phrase chunks
    for parse in parses:
        parse.pretty_print()

        print("Noun Phrase Chunks")
        for chunk in np_chunk(parse):
            print(" ".join(chunk.flatten()))


In [5]:
def preprocess(sentence):
    """
    Convert `sentence` into a list of lower-cased word tokens.

    The sentence is lower-cased and split with nltk.word_tokenize; tokens that
    contain no letter in the range a-z (for example "." or "28") are dropped.

    Args:
        sentence: the raw sentence text.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    # Bound matcher: truthy only for tokens holding at least one a-z letter
    has_letter = re.compile(".*[a-z].*").match

    tokens = nltk.word_tokenize(sentence.lower())

    # Keep only the tokens the matcher accepts
    return list(filter(has_letter, tokens))

In [6]:
def np_chunk(tree):
    """
    Return a list of all noun phrase chunks in the sentence tree.

    A noun phrase chunk is a subtree labelled "NP" that does not itself
    contain any other subtree labelled "NP".

    Selecting on the "NP" label directly (rather than taking the parent of
    every "N" node) means the result never contains None for a root-level
    "N", never contains a non-NP tree, and never lists the same phrase twice.

    Args:
        tree: an nltk tree for a parsed sentence.

    Returns:
        list: the matching subtrees of the ParentedTree copy of `tree`, in
        the order `subtrees()` yields them.
    """
    parented_tree = nltk.tree.ParentedTree.convert(tree)

    def is_np(node):
        return node.label() == "NP"

    def is_chunk(node):
        # subtrees() includes `node` itself, so exclude it by identity
        # (not equality, which would compare tree contents).
        return is_np(node) and not any(
            is_np(inner) for inner in node.subtrees() if inner is not node
        )

    return list(parented_tree.subtrees(is_chunk))

In [7]:
# Run the parser once; the captured output below shows one example session.
main()

Sentence: Holmes sat in the red armchair and he chuckled.
                        S                                 
             ___________|______________________            
            S                         |        |          
   _________|___                      |        |           
  |             VP                    |        |          
  |      _______|___                  |        |           
  |     |           PP                |        |          
  |     |    _______|___              |        |           
  |     |   |           NP            |        S          
  |     |   |    _______|______       |     ___|_____      
  NP    |   |   |      AdjP    |      |    NP        VP   
  |     |   |   |       |      |      |    |         |     
  N     V   P  Det     Adj     N     Conj  N         V    
  |     |   |   |       |      |      |    |         |     
holmes sat  in the     red  armchair and   he     chuckled

Noun Phrase Chunks
holmes
the red armchair
he
