## N Gram tagging 

In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

In [26]:
## bi-gram tagging on brown corpus 
import nltk
from nltk.corpus import brown
from nltk.util import ngrams
from nltk.corpus import treebank

nltk.download('brown')

tagged_words = brown.tagged_words()
tagged_bigrams = list(ngrams(tagged_words, 2))
for bg in tagged_bigrams[:5]:
    print(bg)


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\kaush\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


(('The', 'AT'), ('Fulton', 'NP-TL'))
(('Fulton', 'NP-TL'), ('County', 'NN-TL'))
(('County', 'NN-TL'), ('Grand', 'JJ-TL'))
(('Grand', 'JJ-TL'), ('Jury', 'NN-TL'))
(('Jury', 'NN-TL'), ('said', 'VBD'))


In [23]:
## Combining unigram , bigram and default tagger 
import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import treebank

nltk.download('treebank')
nltk.download('punkt')

train_sents = treebank.tagged_sents()[:3000]

default_tagger = DefaultTagger('NN')  # tag everything as noun by default
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)

custom_sent = "The quick brown fox jumped over the lazy dog."
tokens = nltk.word_tokenize(custom_sent)
tagged = bigram_tagger.tag(tokens)

print(tagged)



[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\kaush\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'RP'), ('the', 'DT'), ('lazy', 'NN'), ('dog', 'NN'), ('.', '.')]


In [24]:
test_sents = treebank.tagged_sents()[3000:]
accuracy = bigram_tagger.evaluate(test_sents)
print(accuracy)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = bigram_tagger.evaluate(test_sents)


0.8810274120440319


## Grammar AND Parsing

In [None]:
import nltk
from nltk import CFG

grammar = CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP
  PP -> P NP
  NP -> Det N | Det N PP | 'I'
  Det -> 'a' | 'the'
  N -> 'man' | 'park' | 'dog' | 'book'
  V -> 'saw' | 'walked'
  P -> 'in' | 'with'
""")

sentence = "I saw a man with a book".split()
parser = nltk.ChartParser(grammar)

In [14]:
grammar

<Grammar with 17 productions>

In [16]:
grammar.start() ## will tell from where it starts 

S

In [15]:
print (grammar) 

Grammar with 17 productions (start state = S)
    S -> NP VP
    VP -> V NP
    VP -> V NP PP
    PP -> P NP
    NP -> Det N
    NP -> Det N PP
    NP -> 'I'
    Det -> 'a'
    Det -> 'the'
    N -> 'man'
    N -> 'park'
    N -> 'dog'
    N -> 'book'
    V -> 'saw'
    V -> 'walked'
    P -> 'in'
    P -> 'with'


In [17]:
grammar.productions()

[S -> NP VP,
 VP -> V NP,
 VP -> V NP PP,
 PP -> P NP,
 NP -> Det N,
 NP -> Det N PP,
 NP -> 'I',
 Det -> 'a',
 Det -> 'the',
 N -> 'man',
 N -> 'park',
 N -> 'dog',
 N -> 'book',
 V -> 'saw',
 V -> 'walked',
 P -> 'in',
 P -> 'with']

In [None]:
## printing in tree format 
for tree in parser.parse(sentence):
    tree.pretty_print()

     S                               
  ___|___________                     
 |               VP                  
 |    ___________|________            
 |   |       |            PP         
 |   |       |        ____|___        
 |   |       NP      |        NP     
 |   |    ___|___    |     ___|___    
 NP  V  Det      N   P   Det      N  
 |   |   |       |   |    |       |   
 I  saw  a      man with  a      book

     S                           
  ___|_______                     
 |           VP                  
 |    _______|___                 
 |   |           NP              
 |   |    _______|____            
 |   |   |   |        PP         
 |   |   |   |    ____|___        
 |   |   |   |   |        NP     
 |   |   |   |   |     ___|___    
 NP  V  Det  N   P   Det      N  
 |   |   |   |   |    |       |   
 I  saw  a  man with  a      book



In [18]:
## top-down parsing 
from nltk.parse import RecursiveDescentParser
rd_parser = RecursiveDescentParser(grammar)
sentence = "I saw a man with a book".split()
for tree in rd_parser.parse(sentence):
    tree.pretty_print()
## to use this we have to first make a grammar then only we can do the parsing 

     S                           
  ___|_______                     
 |           VP                  
 |    _______|___                 
 |   |           NP              
 |   |    _______|____            
 |   |   |   |        PP         
 |   |   |   |    ____|___        
 |   |   |   |   |        NP     
 |   |   |   |   |     ___|___    
 NP  V  Det  N   P   Det      N  
 |   |   |   |   |    |       |   
 I  saw  a  man with  a      book

     S                               
  ___|___________                     
 |               VP                  
 |    ___________|________            
 |   |       |            PP         
 |   |       |        ____|___        
 |   |       NP      |        NP     
 |   |    ___|___    |     ___|___    
 NP  V  Det      N   P   Det      N  
 |   |   |       |   |    |       |   
 I  saw  a      man with  a      book



In [19]:
## bottom up parsing 
import nltk
from nltk import CFG
from nltk.parse import ShiftReduceParser

grammar = CFG.fromstring("""
  S -> NP VP
  VP -> V VP
  VP -> TO V
  NP -> 'he'
  V -> 'loves' | 'eat'
  TO -> 'to'
""")

sr_parser = ShiftReduceParser(grammar)
sent = 'he loves to eat'.split()

for t in sr_parser.parse(sent):
    print(t)
    t.pretty_print()


(S (NP he) (VP (V loves) (VP (TO to) (V eat))))
           S             
  _________|___           
 |             VP        
 |     ________|___       
 |    |            VP    
 |    |         ___|___   
 NP   V        TO      V 
 |    |        |       |  
 he loves      to     eat



In [None]:
## 