# NPL - First steps with Natural Language Took Kit (NLTK)

In [1]:
%pylab
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

import numpy as np

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


### Import NLTK

At the beginning, we are going to import NLTK and download all its packages:

In [2]:
import nltk
#nltk.download() #Execute if it is not downloaded

### 1. Create a input text for our NLP chain

In [3]:
# 1. Create a input text for our NLP chain
text = "I didn't notice my animals were uglier than yours! I'm sorry..."
print ("1. Text:",text)

1. Text: I didn't notice my animals were uglier than yours! I'm sorry...


### 2. Split the text on sentences (optional)

In [4]:
# 2. Split the text on sentences (optional)
sentences = nltk.tokenize.sent_tokenize(text)
print ("2. Sentences:",sentences)

2. Sentences: ["I didn't notice my animals were uglier than yours!", "I'm sorry..."]


### 3. Tokenization: Tokenize the text, i.e: split the text on tokens

In [5]:
# 3. Tokenization: Tokenize the text, i.e: split the text on tokens
tokens = nltk.word_tokenize(text)
print ("3. Tokens:",tokens)

3. Tokens: ['I', 'did', "n't", 'notice', 'my', 'animals', 'were', 'uglier', 'than', 'yours', '!', 'I', "'m", 'sorry', '...']


### 4. Morphology/Lexical analysis: using Penn Tree Bank POS tags

First, we load the Penn Tree Bank POS tags in a `csv` file, because like this we can know what it means each tag:

In [6]:
import pandas as pd

peen_tree_bank = pd.read_csv("data/ref_nlp_penn_treebank2_pos_tags.csv", sep = ",")
peen_tree_bank

Unnamed: 0,penn_treebank_pos_locator,penn_treebank_tag,tag_description,example_text
0,1,CC,"conjunction, coordinating","and, or, but"
1,2,CD,cardinal number,"five, three, 13%"
2,3,DT,determiner,"the, a, these"
3,4,EX,existential there,there were six boys
4,5,FW,foreign word,mais
5,6,IN,"conjunction, subordinating or preposition","of, on, before, unless"
6,7,JJ,adjective,"nice, easy"
7,8,JJR,"adjective, comparative","nicer, easier"
8,9,JJS,"adjective, superlative","nicest, easiest"
9,10,LS,list item marker,


In [7]:
# 4. Morphology/Lexical analysis: set a morphology tag for each token
tagged = nltk.pos_tag(tokens)
print ("4. Analisis Morfologico:",tagged)

4. Analisis Morfologico: [('I', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('notice', 'VB'), ('my', 'PRP$'), ('animals', 'NNS'), ('were', 'VBD'), ('uglier', 'JJR'), ('than', 'IN'), ('yours', 'JJR'), ('!', '.'), ('I', 'PRP'), ("'m", 'VBP'), ('sorry', 'JJ'), ('...', ':')]


### 5. Stemming: get the stem of each token

In [8]:
# 5. Stemming: get the stem of each token. For Stemming, the morphology analysis is not necessary.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print ("5. Stems: ")
for tok in tokens:
    print (stemmer.stem(tok.lower()))

5. Stems: 
i
did
n't
notic
my
anim
were
uglier
than
your
!
i
'm
sorri
...


### 6. Lemmatization: get the lemma of each token

In [9]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

# Create the lemmatizer
lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer only knows 4 POS tags: a (adjetive), r (adverb), n (noun) and v (verb)
# For that, we should convert Penn Tree Bank format to WordNet format 
# (e.g: N->n, J->a, R->r, V->V, ...)

wnTags = {'N':wordnet.NOUN, 'J':wordnet.ADJ, 'V':wordnet.VERB, 'R':wordnet.ADV} 

print ("6. Lemmas: ")
# For each token and its tag:
for (tok,tag) in tagged:
    # WordNet has not the short forms: 'm, n't, so we should introduce them for the good lemmatization
    if tok=='\'m':
        tok = 'am'
    if tok=='\'s':
        tok = 'is'
    if tok=='n\'t':
        tok = 'not'
        
    # We only get the first char of the tag because we use it to convert it to WordNet format
    tag = tag[:1]
    
    # Lemmatize the tokens
    lemma = lemmatizer.lemmatize(tok.lower(), wnTags.get(tag, wordnet.NOUN))

    # Other alternative for get the lemma can be use the wordnet.morphy() function 
    #lemma = wordnet.morphy(tok.lower(), wnTags.get(tag, wordnet.NOUN))
    
    
    if lemma is None: #Si wordnet no contiene la palabra, supondremos que el lema es igual al token
       lemma = tok.lower() 
    print (lemma)


6. Lemmas: 
i
do
not
notice
my
animal
be
ugly
than
yours
!
i
be
sorry
...


### 7. Syntactic analysis 

In [10]:
# Let's start with a sentence of a Groucho Marx's known text, with ambiguity
# "While hunting in Africa, I shot an elephant in my pijamas. How he got into my pijamas, I don't know."
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pijamas']

# Create our CFG
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pijamas'
V -> 'shot' | 'did'
P -> 'in'
""")


# Generate a syntactic parser be able to recognize the grammar
parser = nltk.ChartParser(grammar, trace=1)
print ('7. Syntactic analysis:\n')
for tree in parser.parse(sent):
    print(tree,'\n')
    tree.draw()

7. Syntactic analysis:

|.  I  . shot.  an .eleph.  in .  my .pijam.|
|[-----]     .     .     .     .     .     .| [0:1] 'I'
|.     [-----]     .     .     .     .     .| [1:2] 'shot'
|.     .     [-----]     .     .     .     .| [2:3] 'an'
|.     .     .     [-----]     .     .     .| [3:4] 'elephant'
|.     .     .     .     [-----]     .     .| [4:5] 'in'
|.     .     .     .     .     [-----]     .| [5:6] 'my'
|.     .     .     .     .     .     [-----]| [6:7] 'pijamas'
|[-----]     .     .     .     .     .     .| [0:1] NP -> 'I' *
|[----->     .     .     .     .     .     .| [0:1] S  -> NP * VP
|.     [-----]     .     .     .     .     .| [1:2] V  -> 'shot' *
|.     [----->     .     .     .     .     .| [1:2] VP -> V * NP
|.     .     [-----]     .     .     .     .| [2:3] Det -> 'an' *
|.     .     [----->     .     .     .     .| [2:3] NP -> Det * N
|.     .     [----->     .     .     .     .| [2:3] NP -> Det * N PP
|.     .     .     [-----]     .     .     .| [3:4] N  -