-
Notifications
You must be signed in to change notification settings - Fork 0
/
6_Chinking.py
36 lines (30 loc) · 1.47 KB
/
6_Chinking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Chinking builds on chunking: once a chunk has been formed, we "chink" (remove) selected tokens from it.
import nltk
from nltk.corpus import state_union
''' PunktSentenceTokenizer is unsupervised ml sentence tokenizer
It comes with pretraining and we can also further train it '''
from nltk import PunktSentenceTokenizer
# Raw text of the "2005-GWBush.txt" file from the state_union corpus; used only to train the tokenizer.
train=state_union.raw("2005-GWBush.txt")
# Raw text of the "2006-GWBush.txt" file; this is the text that gets split, tagged and chunked.
text=state_union.raw("2006-GWBush.txt")
# Passing text to the constructor trains the Punkt sentence-boundary model on that text.
SentenceTokenizer= PunktSentenceTokenizer(train)
# Sentences of `text`; process() iterates over this module-level name.
tokenized=SentenceTokenizer.tokenize(text)
def process():
    """Chunk each sentence in ``tokenized``, chink out selected tags, and show the tree.

    For every sentence in the module-level ``tokenized`` sequence, the
    sentence is word-tokenized and POS-tagged, parsed with a chunk/chink
    grammar, printed, and drawn in a window.

    Takes no arguments and returns ``None``. Any exception raised along the
    way is caught and its message printed (best-effort behaviour), which
    also ends the loop at the failing sentence.
    """
    # Grammar for nltk.RegexpParser, one rule per line:
    #   Chunk: {<.*>+}       chunk rule -- {...} groups matching tags; <.*>+
    #                        matches one or more tags of any kind, so the
    #                        whole sentence becomes a single "Chunk".
    #   }<VB.?| IN| DT>+{    chink rule -- }...{ removes matching tags from
    #                        the chunk: verbs (VB plus at most one more
    #                        character, e.g. VBD/VBG), prepositions (IN) and
    #                        determiners (DT); "|" means "or".
    # Tag-pattern notes: <...> delimits one POS tag, "." matches any single
    # character of a tag name, and NLTK removes whitespace from tag patterns,
    # so the spaces after "|" are harmless. "Chunk" is an arbitrary label.
    # The r prefix makes these raw strings; there are no backslashes here,
    # so it is a regex-writing convention rather than a necessity.
    chunk_grammar = (
        r"Chunk: {<.*>+}" "\n"
        r"}<VB.?| IN| DT>+{"
    )
    try:
        # The grammar never changes, so build the parser once instead of
        # rebuilding it for every sentence.
        chunk_parser = nltk.RegexpParser(chunk_grammar)
        for sentence in tokenized:
            # pos_tag yields (word, tag) pairs, which is what the parser expects.
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            chunked = chunk_parser.parse(tagged)
            print(chunked)
            # Opens NLTK's Tkinter-based tree viewer for this sentence.
            chunked.draw()
    except Exception as e:
        # Deliberate best-effort: report the problem instead of crashing.
        print(str(e))
# Run the chinking demo; this call executes whenever the module is loaded.
process()