### Basic Operations of NLP

Source: https://github.com/luchux/ipython-notebook-nltk/blob/master/NLP%20-%20MelbDjango.ipynb

In [10]:
import nltk  # dirty (whole-package import), but we will use it later
from nltk import sent_tokenize, word_tokenize

In [11]:
# Split a short sample paragraph into sentences. The text is written as
# adjacent string literals, which Python joins into one string at compile time.
sentences = sent_tokenize(
    "The world is huge. All human populations possess language! This includes populations, "
    "such as the Tasmanians and the Andamanese, who may have been isolated from the Old World continents for as long as 40,000 years. "
    "All the pugs speak a language that we can not understand."
)

sentences

['The world is huge.',
 'All human populations possess language!',
 'This includes populations, such as the Tasmanians and the Andamanese, who may have been isolated from the Old World continents for as long as 40,000 years.',
 'All the pugs speak a language that we can not understand.']

In [22]:
# Tokenise the third sentence (index 2) into individual words and punctuation marks.
tokens = word_tokenize(sentences[2])

In [23]:
tokens

['This',
 'includes',
 'populations',
 ',',
 'such',
 'as',
 'the',
 'Tasmanians',
 'and',
 'the',
 'Andamanese',
 ',',
 'who',
 'may',
 'have',
 'been',
 'isolated',
 'from',
 'the',
 'Old',
 'World',
 'continents',
 'for',
 'as',
 'long',
 'as',
 '40,000',
 'years',
 '.']

### POS tagging

In [24]:
from nltk import pos_tag
# pos_tag is a classifier: given a token, it assigns a class (a part-of-speech tag).
# It is already defined in the library, but we could also train our own.

In [25]:
# Tag every token with its part of speech; the result is a list of (token, tag) pairs.
# The tagger is not perfect: in the output below 'continents' comes back as VBZ
# (a verb tag) although it is used as a plural noun in this sentence.
tags = pos_tag(tokens)
tags

[('This', 'DT'),
 ('includes', 'VBZ'),
 ('populations', 'NNS'),
 (',', ','),
 ('such', 'JJ'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('Tasmanians', 'NNPS'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('Andamanese', 'NNP'),
 (',', ','),
 ('who', 'WP'),
 ('may', 'MD'),
 ('have', 'VB'),
 ('been', 'VBN'),
 ('isolated', 'VBN'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('Old', 'NNP'),
 ('World', 'NNP'),
 ('continents', 'VBZ'),
 ('for', 'IN'),
 ('as', 'RB'),
 ('long', 'RB'),
 ('as', 'IN'),
 ('40,000', 'CD'),
 ('years', 'NNS'),
 ('.', '.')]

### Word senses

In [27]:
from nltk.corpus import wordnet as wn

# List every WordNet sense (synset) recorded for the word 'human'.
wn.synsets('human')

[Synset('homo.n.02'),
 Synset('human.a.01'),
 Synset('human.a.02'),
 Synset('human.a.03')]

In [28]:
# `definition` is a method in NLTK 3, so it has to be called to obtain the gloss text.
wn.synsets('human')[0].definition()

'any living or extinct member of the family Hominidae characterized by superior intelligence, articulate speech, and erect carriage'

In [29]:
# Call the method: the bare attribute only displays the bound-method repr, not the gloss.
wn.synsets('human')[1].definition()

'characteristic of humanity'

In [30]:
# Restrict the lookup to noun senses and keep the first one returned.
human = wn.synsets('Human', pos=wn.NOUN)[0]
human

Synset('homo.n.02')

In [32]:
# Hyponyms are the more specific senses that sit directly below this one in WordNet.
human.hyponyms()

[Synset('homo_erectus.n.01'),
 Synset('homo_habilis.n.01'),
 Synset('homo_sapiens.n.01'),
 Synset('homo_soloensis.n.01'),
 Synset('neandertal_man.n.01'),
 Synset('rhodesian_man.n.01'),
 Synset('world.n.08')]

In [33]:
# No part-of-speech filter here: take the first sense listed for 'bicycle'.
bike = wn.synsets('bicycle')[0]
bike

Synset('bicycle.n.01')

In [34]:
# No part-of-speech filter here: take the first sense listed for 'girl'.
girl = wn.synsets('girl')[0]
girl

Synset('girl.n.01')

In [35]:
# Wu-Palmer similarity between the two senses. NLTK documents the score as based on
# the depth of both senses in the taxonomy and of their most specific common ancestor.
bike.wup_similarity(human)

0.34782608695652173

In [36]:
# Wu-Palmer similarity between the 'girl' sense and the 'human' noun sense.
girl.wup_similarity(human)

0.5

### Chunks

In [37]:
from nltk import word_tokenize, pos_tag
from nltk.chunk import RegexpParser

# Regular-expression chunk grammar for noun phrases (NP), applied rule by rule:
#   {...}  chunk rule: a determiner, a noun tag, any run of tags, then a final noun tag
#   }...{  chink rule: remove verb tags (VB*) from the chunks built so far
# The `<.*>*` part is greedy, so the chunk rule can span most of a sentence;
# the chink rule then breaks that span apart at every verb.
chunker = RegexpParser(r'''
NP:
{<DT><NN.*><.*>*<NN.*>}
}<VB.*>{
''')

In [38]:
# Show the flat list of tagged tokens, then the chunk tree the grammar builds from it.
print(tags)
print(chunker.parse(tags))

[('This', 'DT'), ('includes', 'VBZ'), ('populations', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('the', 'DT'), ('Tasmanians', 'NNPS'), ('and', 'CC'), ('the', 'DT'), ('Andamanese', 'NNP'), (',', ','), ('who', 'WP'), ('may', 'MD'), ('have', 'VB'), ('been', 'VBN'), ('isolated', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('Old', 'NNP'), ('World', 'NNP'), ('continents', 'VBZ'), ('for', 'IN'), ('as', 'RB'), ('long', 'RB'), ('as', 'IN'), ('40,000', 'CD'), ('years', 'NNS'), ('.', '.')]
(S
  This/DT
  includes/VBZ
  populations/NNS
  ,/,
  such/JJ
  as/IN
  (NP
    the/DT
    Tasmanians/NNPS
    and/CC
    the/DT
    Andamanese/NNP
    ,/,
    who/WP
    may/MD)
  have/VB
  been/VBN
  isolated/VBN
  (NP from/IN the/DT Old/NNP World/NNP)
  continents/VBZ
  (NP for/IN as/RB long/RB as/IN 40,000/CD years/NNS)
  ./.)


### Entity Recognition - Chunking

In [39]:
from nltk.chunk import ne_chunk

In [42]:
# Tokenise and POS-tag a sample sentence; the tagged tokens feed the entity chunker below.
sentence = "Daryl A. is the head of the coworking place Commoncode Corp. from where many people work in Melbourne, Australia."
pos_tags = pos_tag(word_tokenize(sentence))
pos_tags

[('Daryl', 'NNP'),
 ('A.', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('head', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('coworking', 'VBG'),
 ('place', 'NN'),
 ('Commoncode', 'NNP'),
 ('Corp.', 'NNP'),
 ('from', 'IN'),
 ('where', 'WRB'),
 ('many', 'JJ'),
 ('people', 'NNS'),
 ('work', 'VBP'),
 ('in', 'IN'),
 ('Melbourne', 'NNP'),
 (',', ','),
 ('Australia', 'NNP'),
 ('.', '.')]

In [45]:
ne_chunk(pos_tags)

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('GPE', [('Daryl', 'NNP')]), ('A.', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('head', 'NN'), ('of', 'IN'), ('the', 'DT'), ('coworking', 'VBG'), ('place', 'NN'), Tree('ORGANIZATION', [('Commoncode', 'NNP')]), ('Corp.', 'NNP'), ('from', 'IN'), ('where', 'WRB'), ('many', 'JJ'), ('people', 'NNS'), ('work', 'VBP'), ('in', 'IN'), Tree('GPE', [('Melbourne', 'NNP')]), (',', ','), Tree('GPE', [('Australia', 'NNP')]), ('.', '.')])

### Exercise A: Frequency Distribution of Words, Language Models

In [46]:
from camplight import Request, Campfire
from settings import CAMPFIRE_BOT

ModuleNotFoundError: No module named 'camplight'