In [38]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [39]:
text = 'Natural language processing (NLP) is a field of computer science, artificial intelligence and computational \
linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, \
concerned with programming computers to fruitfully process large natural language corpora.’, \
‘Challenges in natural language processing frequently involve natural language understanding, natural language generation \
(frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer\
dialog systems, or some combination thereof.'

In [40]:
def preprocess(text):
    text_token = nltk.tokenize.word_tokenize(text)
    remove_punctu = [w for w in text_token if w not in string.punctuation]
    stop_words = stopwords.words('english')
    remove_stop_word = [w for w in remove_punctu if w not in stop_words]
    lower_text = [w.lower() for w in remove_stop_word if len(w)>2]
    lemmaed_text = [WordNetLemmatizer().lemmatize(w) for w in lower_text]
    return ' '.join(lemmaed_text)

In [74]:
processed_txt = preprocess(text)

In [75]:
tagged_txt = nltk.tag.pos_tag(nltk.tokenize.word_tokenize(processed_txt))

In [76]:
print(tagged_txt)

[('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('nlp', 'JJ'), ('field', 'NN'), ('computer', 'NN'), ('science', 'NN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('computational', 'JJ'), ('linguistics', 'NNS'), ('concerned', 'JJ'), ('interaction', 'NN'), ('computer', 'NN'), ('human', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('particular', 'JJ'), ('concerned', 'VBD'), ('programming', 'VBG'), ('computer', 'NN'), ('fruitfully', 'RB'), ('process', 'JJ'), ('large', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('corpora', 'NN'), ('.', '.'), ('challenge', 'VB'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('frequently', 'RB'), ('involve', 'VBP'), ('natural', 'JJ'), ('language', 'NN'), ('understanding', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('generation', 'NN'), ('frequently', 'RB'), ('formal', 'JJ'), ('machine-readable', 'JJ'), ('logical', 'JJ'), ('form', 'NN'), ('connecting', 'VBG'), ('language', 'NN'), ('machine', 'NN'), ('perception', 'NN'), 

## Chunking

#### One of the main goals of chunking is to group into what are known as "noun phrases." These are phrases of one or more words that contain a noun, maybe some descriptive words, maybe a verb, and maybe something like an adverb. The idea is to group nouns with the words that are in relation to them.

In [77]:
chunkgram = r''' Chunk : {<RB.?>*<VB.?>*<JJ.?>*<NN.?>*}'''

In [78]:
chunkparser = nltk.RegexpParser(chunkgram)

In [79]:
chunked  = chunkparser.parse(tagged_txt)

In [80]:
print(chunked)

(S
  (Chunk natural/JJ language/NN processing/NN)
  (Chunk nlp/JJ field/NN computer/NN science/NN)
  (Chunk artificial/JJ intelligence/NN)
  (Chunk computational/JJ linguistics/NNS)
  (Chunk concerned/JJ interaction/NN computer/NN)
  (Chunk human/JJ natural/JJ language/NN)
  (Chunk particular/JJ)
  (Chunk concerned/VBD programming/VBG computer/NN)
  (Chunk
    fruitfully/RB
    process/JJ
    large/JJ
    natural/JJ
    language/NN
    corpora/NN)
  ./.
  (Chunk challenge/VB natural/JJ language/NN processing/NN)
  (Chunk frequently/RB involve/VBP natural/JJ language/NN)
  (Chunk understanding/JJ natural/JJ language/NN generation/NN)
  (Chunk
    frequently/RB
    formal/JJ
    machine-readable/JJ
    logical/JJ
    form/NN)
  (Chunk connecting/VBG language/NN machine/NN perception/NN)
  (Chunk
    managing/VBG
    human-computerdialog/JJ
    system/NN
    combination/NN
    thereof/NN))


## Chinking 

#### You may find that, after a lot of chunking, you have some words in your chunk you still do not want, but you have no idea how to get rid of them by chunking. You may find that chinking is your solution.

#### Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

In [90]:
chunkgrammer = r''' Chunk : {<.*>+}
                           }<VB.?>*<NN>{ '''

In [91]:
chunkparser = nltk.RegexpParser(chunkgrammer)
chunked  = chunkparser.parse(tagged_txt)

In [92]:
print(chunked)

(S
  (Chunk natural/JJ)
  language/NN
  processing/NN
  (Chunk nlp/JJ)
  field/NN
  computer/NN
  science/NN
  (Chunk artificial/JJ)
  intelligence/NN
  (Chunk computational/JJ linguistics/NNS concerned/JJ)
  interaction/NN
  computer/NN
  (Chunk human/JJ natural/JJ)
  language/NN
  (Chunk particular/JJ)
  concerned/VBD
  programming/VBG
  computer/NN
  (Chunk fruitfully/RB process/JJ large/JJ natural/JJ)
  language/NN
  corpora/NN
  (Chunk ./. challenge/VB natural/JJ)
  language/NN
  processing/NN
  (Chunk frequently/RB involve/VBP natural/JJ)
  language/NN
  (Chunk understanding/JJ natural/JJ)
  language/NN
  generation/NN
  (Chunk frequently/RB formal/JJ machine-readable/JJ logical/JJ)
  form/NN
  connecting/VBG
  language/NN
  machine/NN
  perception/NN
  (Chunk managing/VBG human-computerdialog/JJ)
  system/NN
  combination/NN
  thereof/NN)
