# NLP (Natural Languaue Processing)

In [3]:
import nltk

In [4]:
import sys
import numpy as np

In [5]:
print(f"Python: {sys.version}")
print(f"nltk: {nltk.__version__}")

Python: 3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
nltk: 3.5


In [6]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Corpus
- Body of text, singular.
- Corpora is the plural of this
- e.g. a collection of medical journals

## Lexicon
- words and their meanings
- e.g. dictionary

## Token
- Each `entity` that is a part of whatever was split up based on rules
- e.g. each word is a token when a sentence is tokenized into words.
- e.g. each sentence can also be a token when a peragraph is tokenized into sentences,

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
text = "Hello students! How are you doing today? You look great today. I am your new teacher. C++ is faster but Python is awesome."

print(sent_tokenize(text))

['Hello students!', 'How are you doing today?', 'You look great today.', 'I am your new teacher.', 'C++ is faster but Python is awesome.']


In [9]:
print(word_tokenize(text))

['Hello', 'students', '!', 'How', 'are', 'you', 'doing', 'today', '?', 'You', 'look', 'great', 'today', '.', 'I', 'am', 'your', 'new', 'teacher', '.', 'C++', 'is', 'faster', 'but', 'Python', 'is', 'awesome', '.']


In [10]:
print(text.split())

['Hello', 'students!', 'How', 'are', 'you', 'doing', 'today?', 'You', 'look', 'great', 'today.', 'I', 'am', 'your', 'new', 'teacher.', 'C++', 'is', 'faster', 'but', 'Python', 'is', 'awesome.']


## stop words
Stopwords are the most common words in any natural language. For the purpose of analyzing text data and building NLP models, these stopwords might not add much value to the meaning of the document. Generally, the most common words used in a text are “the”, “is”, “in”, “for”, “where”, “when”, “to”, “at” etc.

In [11]:
# removing `stop words` = useless data
from nltk.corpus import stopwords

In [12]:
print(set(stopwords.words("english")))

{'a', "mustn't", 'when', 'as', 'mustn', 'his', 'again', "won't", 'had', 'yourself', 'i', 'shan', 'above', 'aren', 'an', 'hasn', 'isn', 'ma', 'the', 'yours', 'through', 'yourselves', "that'll", 'after', 'were', "it's", 'same', "shouldn't", 'about', 'does', 'him', 'will', 'y', 'hers', 'from', 'so', 'further', 'weren', "wasn't", 'to', "isn't", 'being', 'nor', 'they', 'some', "you've", 'is', 'we', 'was', 'each', 'only', 'too', 't', 'doing', 'their', 'haven', 'ours', 'do', 'no', 'up', 'myself', 'ain', 'few', "weren't", 'of', 'this', 'itself', 'm', "she's", 'between', 'been', 'because', "doesn't", "couldn't", 'those', 'o', 'can', 'am', 'you', 'and', 'but', "should've", 'these', "you're", "you'd", 'herself', 'doesn', "shan't", 'before', 'why', 's', 'wasn', 'than', 'more', 'once', 'until', 'on', 'me', 'under', 'what', 'own', 'if', 'at', 'there', "don't", 'them', 'didn', 'such', "haven't", 'off', 'very', 'who', 'that', "aren't", "didn't", 'my', 'where', 'not', 'be', 'won', 'below', 'ourselves',

In [13]:
example = "This is some sample text, showing off the stop words filtration"

stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(example)

filtered_sentence = [w for w in word_tokens if w not in stop_words]
filtered_sentence

['This', 'sample', 'text', ',', 'showing', 'stop', 'words', 'filtration']

## `stemming` words in NLP

1. I was taking a ride on my horse.
2. I was riding my horse.

Both have same meaning

In [14]:
from nltk.stem import PorterStemmer

In [15]:
ps = PorterStemmer()

example_words = ["ride", "riding", "rider", "rides"]

for w in example_words:
    print(ps.stem(w))

ride
ride
rider
ride


In [16]:
# stemming an entire sentence
new_text = "When riders are riding their horses, they often think of how cowboys rode horses."

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

when
rider
are
ride
their
hors
,
they
often
think
of
how
cowboy
rode
hors
.


## Part of speech tagging

labelling words as nouns, verbs, adjectives or whatever.

In [17]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [18]:
from nltk.corpus import udhr

In [19]:
print(udhr.raw("English-Latin1"))

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, 

Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people, 

Whereas it is essential, if man is not to be compelled to have recourse, as a last resort, to rebellion against tyranny and oppression, that human rights should be protected by the rule of law, 

Whereas it is essential to promote the development of friendly relations between nations, 

Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights, in the dignity and worth of the human person and in

In [20]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [21]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [22]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [23]:
# tokenize the sample text
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [24]:
type(tokenized), len(tokenized)

(list, 346)

In [25]:
# define a function that will tag each tokenized word with a part of speech

def process_content():
    try:
        for w in tokenized[:5]:
            words = nltk.word_tokenize(w)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [26]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [27]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [None]:
'''
+ = match 1 or more
? = match 0 or 1 repetitions
* = match 0 or more repitions
. = any character except a new line
'''

## chunking

Phrase chunking is a phase of natural language processing that separates and segments a sentence into its subconstituents, such as noun, verb, and prepositional phrases, abbreviated as NP, VP, and PP, respectively. Typically, each subconstituent or chunk is denoted by brackets.

In [29]:
# Chunking with NLTK
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for w in tokenized[:2]:
            words = nltk.word_tokenize(w)
            tagged = nltk.pos_tag(words)
            
            # combine the part of each tag with regex
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>*<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            # draw the chunks with nltk
            chunked.draw()
            
    except Exception as e:
        print(str(e))
        
process_content()

In [None]:
'''
<RB.?> = "O or more of any tense of adverb," followed by:
<VB.?> = "O or more of any tense of verb," followed by:
<NNP> = "1 or more proper nouns," followed by:
<NN> = "0 or 1 singular noun"
'''

In [None]:
# Chunking with NLTK
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for w in tokenized[:2]:
            words = nltk.word_tokenize(w)
            tagged = nltk.pos_tag(words)
            
            # combine the part of each tag with regex
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>*<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            # print the nltk tree
            for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
                print(subtree)
            # draw the chunks with nltk
            # chunked.draw()
            
    except Exception as e:
        print(str(e))
        
process_content()