In [0]:
#The purpose of this script is to go through a few different techniques in NLP.
#Specifically, to work through a few examples in NLTK (Natural Language Toolkit)
#
#I will be using Google Colab/Colaboratory for this video, so some features of
#local (on your local computer) Python capabilities don't work quite the same.
#I will note those as necessary.
#
#Code is by JB at jamdatajam and posted on Github for reference purposes.
#Walkthrough Video @ https://youtu.be/laHLAcO7VGI

In [0]:
#First we've gotta get those libraries:
import numpy as np
import nltk
nltk.download("popular") #fetches NLTK's "popular" data collection (cmudict,
                         #gutenberg, movie_reviews, ...) in case you don't have
                         #it yet; NLTK itself must already be installed for
                         #the import above to work
from future.utils import iteritems #some of the functions I used when writing
                                   #this up originally were python2/3
                                   #Here I am using Python 3, just for reference
                                   #NOTE(review): np and iteritems are not used
                                   #in the cells shown here - confirm before
                                   #relying on or removing them
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

#that should be good, but we might add some more later

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [0]:
#Once it finishes downloading ...

In [0]:
#Let's assign some pos tags (parts of speech)
nltk.pos_tag("Pug dawgs are great!".split())

[('Pug', 'NNP'), ('dawgs', 'NN'), ('are', 'VBP'), ('great!', 'NN')]

In [0]:
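#So what this does is split the sentence up and then assign each word a part of speech
#Here we have Pug being treated as a proper noun, dawgs is a noun, etc.
#Note that .split() only splits on whitespace, so the "!" stays attached and
#'great!' gets tagged as one token (here as NN). A tokenizer such as
#nltk.word_tokenize would separate the punctuation before tagging.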

In [0]:
###############

In [0]:
#Now if we want to stem some words, perhaps for a "bag of words" type of approach
new_porter_stemmer = PorterStemmer()

In [0]:
new_porter_stemmer.stem('pugdogs')

'pugdog'

In [0]:
#and it stems the word by giving us the base form of the word
#In this case, it removes the plural
#Let's test some other cases
new_porter_stemmer.stem('knaves')

'knave'

In [0]:
#looks good so far

In [0]:
#Let's try something a bit more confusing
new_porter_stemmer.stem('Ponies')

'poni'

In [0]:
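#Well that's not quite right! 'poni' is not a real word.
#(A stemmer just strips suffixes by rule, so the stem it returns is not
#guaranteed to be a dictionary word.)
#How about lowercase versions: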
new_porter_stemmer.stem('ponies')

'poni'

In [0]:
#Same result

In [0]:
#Makes me wonder if there is another way to do this that will work correctly...

In [0]:
#How about lemmatizer?
new_lemma = WordNetLemmatizer()
new_lemma.lemmatize('Ponies')

'Ponies'

In [0]:
#Huh. That still isn't quite right . . .
#How to fix?

In [0]:
# . . . ?!?

In [0]:
#Let's try the lowercase!
#For example, remember the sentence from earlier in the video?
nltk.pos_tag("Pug dawgs are great!".split())

[('Pug', 'NNP'), ('dawgs', 'NN'), ('are', 'VBP'), ('great!', 'NN')]

In [0]:
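#'Pug' here was assigned NNP rather than NN like 'dawgs'
#Maybe there is something to this: capital letters!
#Looking in the documentation, it seems like NLTK's pos_tag treats
#capitalized nouns as, quote, "NNP" or "Noun, Proper, Singular".
#The lemmatizer does not use those tags itself (its pos argument just defaults
#to noun), but its WordNet lookup appears to be case-sensitive, so the capital
#letter still gets in the way: 'Ponies' is not found and comes back unchanged.
#
#So let's see what happens if we use the lowercase form, like a "NN" or "Noun, Singular"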
new_lemma.lemmatize("ponies") #So lowercase for non-proper

'pony'

In [0]:
#Perfect!

In [0]:
###################

In [0]:
#Let's do some more word tagging
#We need a sentence:
sentence = 'Ricardo Montalban was known for his elegance and charm.'
#*Source: IMDB

In [0]:
#let's tag this sentence using pos (Parts of Speech)
tag_it = nltk.pos_tag(sentence.split())

In [0]:
#let's review what we've made
tag_it

[('Ricardo', 'NNP'),
 ('Montalban', 'NNP'),
 ('was', 'VBD'),
 ('known', 'VBN'),
 ('for', 'IN'),
 ('his', 'PRP$'),
 ('elegance', 'NN'),
 ('and', 'CC'),
 ('charm.', 'NN')]

In [0]:
#Awesome! We have tagged all the words using NLTK's built in tagging system

In [0]:
#We can also chunk this thing up into more of a tree form: ne_chunk groups
#named entities (such as PERSON) into their own labeled chunks
tree_stuff = nltk.ne_chunk(tag_it)

In [0]:
print(tree_stuff) #let's check it out

(S
  (PERSON Ricardo/NNP)
  (PERSON Montalban/NNP)
  was/VBD
  known/VBN
  for/IN
  his/PRP$
  elegance/NN
  and/CC
  charm./NN)


In [0]:
#Cool, so now we have the S for sentence (the root of the tree), and each name
#has been clustered inside its own parentheses with a PERSON label

In [0]:
#And we can view this chunked version in various ways
tree_stuff.leaves()

[('Ricardo', 'NNP'),
 ('Montalban', 'NNP'),
 ('was', 'VBD'),
 ('known', 'VBN'),
 ('for', 'IN'),
 ('his', 'PRP$'),
 ('elegance', 'NN'),
 ('and', 'CC'),
 ('charm.', 'NN')]

In [0]:
#Leaves are the (word, tag) pairs at the very ends of the branches, listed in
#sentence order with the chunk labels (S, PERSON) dropped.

In [0]:
#We can also traverse the tree using a code snippet from NLTK's documentation:
def traverse(t):
  """Print a bracketed, single-line rendering of a tree to stdout.

  Anything that exposes a ``label()`` method is treated as a subtree: its
  label is printed after an opening '(' and each child is rendered
  recursively, followed by a closing ')'. Anything without ``label()`` is
  treated as a leaf and printed as-is. Every piece is followed by one space
  and no newline is emitted.

  Args:
    t: a tree node (iterable over its children, with ``label()``) or a leaf.

  Returns:
    None. The rendering is written with ``print``.
  """
  try:
    node_label = t.label()
  except AttributeError:
    # No label() method: this is a leaf, so print it and stop descending.
    print(t, end=" ")
    return

  # Labeled node: open the bracket, render the children, close the bracket.
  print('(', node_label, end=" ")
  for subtree in t:
    traverse(subtree)
  print(')', end=" ")

In [0]:
#Here's the traverse command in action:
traverse(tree_stuff)

( S ( PERSON ('Ricardo', 'NNP') ) ( PERSON ('Montalban', 'NNP') ) ('was', 'VBD') ('known', 'VBN') ('for', 'IN') ('his', 'PRP$') ('elegance', 'NN') ('and', 'CC') ('charm.', 'NN') ) 

In [0]:
#And now we have a nicely formatted structure that is easy to copy for use in lists, etc.

In [0]:
#Now normally at this point, it would be easiest to simply use the built in
#functions of NLTK to view a sentence diagram, but Jupyter Notebook and Colab
#both throw errors relating to the display command and tkinter, because NLTK
#requires the use of external popup windows.
#NOTE(review): presumably this only fails where no desktop display is
#available (e.g. a hosted runtime like Colab) - confirm for local Jupyter.

#So to plot such a diagram (assuming you are not in Jupyter or Colaboratory),
#you would use the following command:
#(this repeats the ne_chunk(tag_it) call that produced tree_stuff earlier,
#then asks the resulting tree to draw itself)
nltk.ne_chunk(tag_it).draw()

TclError: ignored

In [0]:
#but as you can see, in my case (I'm using Colab), it throws an error.
#However, I have a few examples of what those diagrams look like!

In [0]:
#So as you can see, it's a very useful tool

In [0]:
#Anyways, I hope some of these techniques are helpful to you in your approach to
#learning data science and NLP (Natural Language Processing).
#
#Thanks for watching!