# CH.5 Part-of-speech tagging

In [1]:
from nltk import pos_tag ,word_tokenize
import nltk

In [2]:
# Split the example sentence into word and punctuation tokens.
Sent = word_tokenize("I always lie down to tell a lie.")
Sent

['I', 'always', 'lie', 'down', 'to', 'tell', 'a', 'lie', '.']

In [3]:
# Tag every token with its part of speech. The two occurrences of "lie"
# receive different tags (verb vs. noun) because tagging depends on context.
pos_tag(Sent)

[('I', 'PRP'),
 ('always', 'RB'),
 ('lie', 'VBP'),
 ('down', 'RP'),
 ('to', 'TO'),
 ('tell', 'VB'),
 ('a', 'DT'),
 ('lie', 'NN'),
 ('.', '.')]

In [4]:
 nltk.help.upenn_tagset('DT')

DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those


In [5]:
# NN: common noun, singular or mass.
nltk.help.upenn_tagset("NN")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [6]:
# POS: genitive marker (' and 's).
nltk.help.upenn_tagset("POS")

POS: genitive marker
    ' 's


In [7]:
# NNP: proper noun, singular.
nltk.help.upenn_tagset("NNP")

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [8]:
# NNPS: proper noun, plural.
nltk.help.upenn_tagset("NNPS")

NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...


In [9]:
# JJ: adjective or ordinal numeral.
nltk.help.upenn_tagset("JJ")

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [10]:
# IN: preposition or subordinating conjunction.
nltk.help.upenn_tagset("IN")

IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...


# CH.7 Extracting information from text

## Example of a simple regular expression–based NP (noun phrase) chunker.

In [11]:
 # Example phrase: the market for system-management software for Digital’s hardware
# Hand-tagged alternative that skips the tokenizer and tagger:
# sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]

# Tokenize the raw sentence, then tag it; S holds the (word, tag) pairs
# that the chunker consumes.
sentence = word_tokenize("the little yellow dog barked at the cat")
S = pos_tag(sentence)
S

[('the', 'DT'),
 ('little', 'JJ'),
 ('yellow', 'JJ'),
 ('dog', 'NN'),
 ('barked', 'VBD'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('cat', 'NN')]

In [12]:
# Chunk grammar with a single regular-expression rule over POS tags.
# An NP chunk is formed from:
#   <DT>?  an optional determiner,
#   <JJ>*  followed by any number of adjectives,
#   <NN>   and then a noun.
grammar = r"NP: {<DT>?<JJ>*<NN>}"

In [13]:
# Create the chunk parser from the NP grammar.
cp = nltk.RegexpParser(grammar)
# Run the parser on the tagged sentence; the printed tree groups each
# matched noun phrase under an NP node.
result = cp.parse(S)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [14]:
# Show the chunk tree graphically.
# NOTE(review): NLTK opens this in a separate GUI window rather than inline — confirm a display is available.
result.draw()

# CH.8 Analyzing Sentence Structure
 

## A well-known example of ambiguity is shown in (2), from the Groucho Marx movie, Animal Crackers (1930):
## (2) While hunting in Africa, I shot an elephant in my pajamas.

In [15]:
# First, define a small context-free grammar that covers the example sentence.
# The PP can attach either to a VP (VP -> VP PP) or to an NP (NP -> Det N PP),
# which is what makes the sentence ambiguous.
groucho_grammar = nltk.CFG.fromstring(""" 
S -> NP VP            
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
# Symbol legend:
# S   - sentence
# PP  - prepositional phrase
# NP  - noun phrase
# VP  - verb phrase
# Det - determiner
# Create a chart parser for the grammar.
parser = nltk.ChartParser(groucho_grammar)

In [16]:
# parser

In [17]:
# Tokenized sentence to parse (already split into words).
sent = "I shot an elephant in my pajamas".split()

In [18]:
# The sentence is ambiguous, so the parser yields one tree per reading:
# "in my pajamas" attaches either to the verb phrase or to "an elephant".
for tree in parser.parse(sent):
    print(tree)
    # NOTE(review): draw() opens a separate GUI window for each tree — confirm a display is available.
    tree.draw()

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [19]:
#parser

In [20]:
#cp

## Practice Named entity recognition (NER)

In [None]:
# TODO: named entity recognition practice — in progress