In [1]:
import nltk
import string

from nltk.corpus import gutenberg as g #import gutenberg corpus


from nltk.corpus import stopwords
from nltk import word_tokenize  # word tokenizer
from nltk import sent_tokenize # sentence tokenizer

from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer

from nltk.stem.snowball import SnowballStemmer # Porter 2
from nltk.stem import LancasterStemmer

from nltk.stem import RegexpStemmer

In [2]:
g.fileids()


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'child-voice.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Per-text statistics for every file in the Gutenberg corpus, printed as:
#   characters per word token - word tokens per sentence - lexical diversity - file id
# The character count comes from the raw text, so it includes whitespace.
# Lexical diversity here = average number of times each (lower-cased) vocabulary
# item appears in the text. All three figures use integer division.

for textid in g.fileids():
    words = g.words(textid)
    char_count = len(g.raw(textid))
    word_count = len(words)
    sent_count = len(g.sents(textid))
    vocab_size = len({word.lower() for word in words})
    print(char_count // word_count, word_count // sent_count, word_count // vocab_size, textid)

4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
5 27 9 child-voice.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt


**Note 1.** From the statistics above, the average word length in English appears to be about 4 for every text (the character count includes spaces, so this figure is really characters per token). The average sentence length and the lexical diversity, however, are particular to the author.

**Note 2.** g.sents(textid): sents() function divides the text up into its sentences, where each sentence is a list of words

In [4]:
# Length of the raw text of 'shakespeare-caesar.txt' (the entry at g.fileids()[-4]).
# len() on the raw string counts every character: letters, punctuation and whitespace.
# The file is named explicitly instead of by position, and the raw text is loaded
# once only (a bare g.raw(...) expression before the last line is never displayed).
len(g.raw('shakespeare-caesar.txt'))

112310

# Basic Text Operations

In [5]:
#lower()

text = "They are not LONG, the Weeping and the Laughter, Love and desire and hate; I think they have no portion in us after We pass the gate."
lower_text = text.lower()
print (lower_text)


they are not long, the weeping and the laughter, love and desire and hate; i think they have no portion in us after we pass the gate.


### Word Tokenization

In [6]:
text = "They are Not LONG, the days of Wine And Roses, Out of a misty DREAM, Our Path Emerges for a While, then closes Within a Dream. - Ernest Downson"
word_tokens = nltk.word_tokenize(text)
print(text)
print (word_tokens)

They are Not LONG, the days of Wine And Roses, Out of a misty DREAM, Our Path Emerges for a While, then closes Within a Dream. - Ernest Downson
['They', 'are', 'Not', 'LONG', ',', 'the', 'days', 'of', 'Wine', 'And', 'Roses', ',', 'Out', 'of', 'a', 'misty', 'DREAM', ',', 'Our', 'Path', 'Emerges', 'for', 'a', 'While', ',', 'then', 'closes', 'Within', 'a', 'Dream', '.', '-', 'Ernest', 'Downson']


In [7]:

text = "They are Not LONG, the days of Wine And Roses, Out of a misty DREAM, Our Path Emerges for a While, then closes Within a Dream. - Ernest Downson"

#after the above import you use word_tokenize directly
word_tokens = word_tokenize(text)
print (word_tokens)

['They', 'are', 'Not', 'LONG', ',', 'the', 'days', 'of', 'Wine', 'And', 'Roses', ',', 'Out', 'of', 'a', 'misty', 'DREAM', ',', 'Our', 'Path', 'Emerges', 'for', 'a', 'While', ',', 'then', 'closes', 'Within', 'a', 'Dream', '.', '-', 'Ernest', 'Downson']


### Stop Words


**Stop Words**: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine ignores. 

We would not want these words to take up space in our database or to use up valuable processing time. 

NLTK makes it easy to remove them: it stores lists of words that are considered *stop words*. NLTK has stop word lists for 16 different languages. They are found in the nltk_data directory.

In [8]:
sample = "Sometimes a thousand twangling instruments Will hum about mine ears; and sometime voices, That, if I then had waked after long sleep, Will make me sleep again:"

# Tokenize the sample, then keep only the tokens that are not English stop words.
# The membership test is case-sensitive, so capitalised tokens such as
# 'Will', 'That' and 'I' are kept, and so are punctuation tokens.
word_tokens = word_tokenize(sample)
stop_words = set(stopwords.words('english'))

new_sent = []  # tokens that survive the filter, in their original order

for token in word_tokens:
    if token in stop_words:
        continue  # stop word: leave it out
    new_sent.append(token)

print(new_sent)

['Sometimes', 'thousand', 'twangling', 'instruments', 'Will', 'hum', 'mine', 'ears', ';', 'sometime', 'voices', ',', 'That', ',', 'I', 'waked', 'long', 'sleep', ',', 'Will', 'make', 'sleep', ':']


In [9]:
 # Using List Comprehension
    
# Same filter as the explicit loop, written as a single list comprehension.
new_sentence = [token for token in word_tokens if token not in stop_words]
print(new_sentence)


['Sometimes', 'thousand', 'twangling', 'instruments', 'Will', 'hum', 'mine', 'ears', ';', 'sometime', 'voices', ',', 'That', ',', 'I', 'waked', 'long', 'sleep', ',', 'Will', 'make', 'sleep', ':']


In [10]:
# stop words stored in file english.txt inside library 'corpus' of NLTK package


# print(stopwords.words('french')) 

w = stopwords.words('english')

res = [True for d in w if (d =='again')]

res

[True]

In [11]:
# The stop word list is an ordinary Python list, so single words can be
# appended to it and removed from it again.

stop_words = stopwords.words('english')

# add one custom stop word and show the new size of the list
stop_words.append('hello')
print(len(stop_words))

# collect the entries equal to the newly added word
res = [entry for entry in stop_words if entry == 'hello']

# remove the custom word again and show the size after removal
stop_words.remove('hello')
print(len(stop_words))

print(stop_words)

180
179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', '

In [12]:
# Several custom stop words can be added in one go by extending the list in place.

newstop_words = ['Sunny','Side', 'Up', 'Attitude']

stop_words = stopwords.words('english')
stop_words += newstop_words  # in-place extension of the list

print(len(stop_words))
print(stop_words)

183
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [13]:
# Remove a previously added list of stop words.

stop_words = stopwords.words('english')
newstop_words = ['Sunny','Side', 'Up', 'Attitude']

# Add the custom words first, so that there is actually something to remove.
stop_words.extend(newstop_words)

# Rebuild the list without the custom words and keep the filtered result:
# it is the filtered list that has to be printed, not the unfiltered one.
stop_words = [word for word in stop_words if word not in newstop_words]

print(len(stop_words))
print(stop_words)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [14]:
# remove stop words without using a tokenizer (the sentence is simply split on spaces)

def mystopwords(sent, stopword_list=None):
    """Remove stop words from a sentence using a plain split on spaces.

    No tokenizer is involved: the sentence is split on single space characters,
    so punctuation stays attached to its word ("there!" does not match the stop
    word "there" and is kept). Matching is case-sensitive.

    The list of split tokens is printed before filtering.

    Parameters
    ----------
    sent : str
        The sentence to filter.
    stopword_list : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words``
        variable is used, as before; passing the words explicitly makes the
        result independent of whichever cell last rebound ``stop_words``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopword_list is None:
        # Backward-compatible default: fall back to the notebook-level variable.
        stopword_list = stop_words
    excluded = set(stopword_list)  # set membership instead of scanning a list per token

    tokens = sent.split(" ")
    print(tokens)
    return " ".join(token for token in tokens if token not in excluded)


In [15]:
mystopwords("hello there! my old friends! where have you all been. Hadn't seen you yet.")

['hello', 'there!', 'my', 'old', 'friends!', 'where', 'have', 'you', 'all', 'been.', "Hadn't", 'seen', 'you', 'yet.']


"hello there! old friends! been. Hadn't seen yet."

### Sentence Tokenization

In [16]:

# text = "Be Not Afeard, the Isle is full of Noises. Sounds and Sweet Airs that Give Delight and Hurt Not."

text = "They are Not LONG, the days of Wine And Roses. Out of a misty DREAM! Our Path Emerges for a While, then closes Within a Dream."
sent_token = sent_tokenize(text)
print (sent_token)

['They are Not LONG, the days of Wine And Roses.', 'Out of a misty DREAM!', 'Our Path Emerges for a While, then closes Within a Dream.']


In [17]:
# Tokenization _ sentence and word _ together

poem = sent_tokenize(text)
print(poem)



['They are Not LONG, the days of Wine And Roses.', 'Out of a misty DREAM!', 'Our Path Emerges for a While, then closes Within a Dream.']


In [18]:
print("\n")

Tokens = [word_tokenize(t) for t in poem]
print(Tokens)




[['They', 'are', 'Not', 'LONG', ',', 'the', 'days', 'of', 'Wine', 'And', 'Roses', '.'], ['Out', 'of', 'a', 'misty', 'DREAM', '!'], ['Our', 'Path', 'Emerges', 'for', 'a', 'While', ',', 'then', 'closes', 'Within', 'a', 'Dream', '.']]


In [19]:
# Drop punctuation tokens from every tokenized sentence.
# A token is dropped when it consists solely of punctuation characters.
# The check is done per character: `token in string.punctuation` would be a
# substring test against one long string, which misses tokens such as '...'
# and only matches character runs that happen to appear in that string.
punct_chars = set(string.punctuation)

no_punc_tok = [
    [token for token in T if not all(ch in punct_chars for ch in token)]
    for T in Tokens
]

print(no_punc_tok)

[['They', 'are', 'Not', 'LONG', 'the', 'days', 'of', 'Wine', 'And', 'Roses'], ['Out', 'of', 'a', 'misty', 'DREAM'], ['Our', 'Path', 'Emerges', 'for', 'a', 'While', 'then', 'closes', 'Within', 'a', 'Dream']]


In [20]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


## Stemming

**Stemming**: Chopping off letters from the end of a word until the **stem** is reached.

In [21]:
# Using Porter Stemmer algorithm
 # Porter1 - Other stemming algorithms are snowball and lancaster

porter = PorterStemmer()

stemmed_tokens = [porter.stem(token) for seq in no_punc_tok for token in seq]
print(stemmed_tokens)

['they', 'are', 'not', 'long', 'the', 'day', 'of', 'wine', 'and', 'rose', 'out', 'of', 'a', 'misti', 'dream', 'our', 'path', 'emerg', 'for', 'a', 'while', 'then', 'close', 'within', 'a', 'dream']


### Overstemming

In [22]:
# university, universal, universities, and universe

print(porter.stem("university"))
print(porter.stem("universal"))
print(porter.stem("universities"))
print(porter.stem("universe"))

univers
univers
univers
univers


### Porter vs Snowball vs Lancaster Stemming Algorithms

In [23]:
porter = PorterStemmer() # object of PorterStemmer

snow = SnowballStemmer('english') #object of SnowballStemmer

lan_stem = LancasterStemmer() #object of LancasterStemmer


print(porter.stem('writing'))
print(snow.stem("writing"))
print(lan_stem.stem("writing"))

print(porter.stem('generously'))
print(snow.stem('generously'))
print(lan_stem.stem("generously"))

write
write
writ
gener
generous
gen


In [24]:
SnowballStemmer.languages


('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

**Plurals** Test the stemmers on plurals and on other inflected or derived word forms

In [25]:
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned', 'humbled', 'sized','meeting', 
           'stating', 'siezing', 'itemization', 'sensational', 'traditional', 'reference', 'colonizer','plotted']

# using porter 1

print([porter.stem(p) for p in plurals])

print("\n")

print([snow.stem(p) for p in plurals])

print("\n")

print([lan_stem.stem(p) for p in plurals])





['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'plot']


['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'plot']


['caress', 'fli', 'die', 'mul', 'deny', 'died', 'agree', 'own', 'humbl', 'siz', 'meet', 'stat', 'siez', 'item', 'sens', 'tradit', 'ref', 'colon', 'plot']


### Lemmatization

**Lemma**: the base (dictionary) form of a word, e.g. the lemma of *feet* is *foot*

In [26]:
 # Compare the Porter stemmer with the WordNet lemmatizer on the same tokens.
 # WordNetLemmatizer is a separate class from PorterStemmer: the first print
 # shows the Porter stems, the second the lemmas.
# from nltk.corpus import wordnet

lm = WordNetLemmatizer() # lemmatizer object; lemmatize() is called below without a pos argument


# Porter stems of all punctuation-free tokens, flattened into a single list
print([porter.stem(token) for seq in no_punc_tok for token in seq])

print('\n')

# Lemmas of the same tokens. The tokens are passed with their original casing;
# in the recorded output capitalised tokens such as 'Roses' come back unchanged.
lemmatized_tokens = [lm.lemmatize(token) for seq in no_punc_tok for token in seq]
print(lemmatized_tokens)


['they', 'are', 'not', 'long', 'the', 'day', 'of', 'wine', 'and', 'rose', 'out', 'of', 'a', 'misti', 'dream', 'our', 'path', 'emerg', 'for', 'a', 'while', 'then', 'close', 'within', 'a', 'dream']


['They', 'are', 'Not', 'LONG', 'the', 'day', 'of', 'Wine', 'And', 'Roses', 'Out', 'of', 'a', 'misty', 'DREAM', 'Our', 'Path', 'Emerges', 'for', 'a', 'While', 'then', 'close', 'Within', 'a', 'Dream']


## Lemmatization with POS Tagging

In [27]:
print(lm.lemmatize("booking", pos='v'))

print(lm.lemmatize("Books", pos="n"))

print(lm.lemmatize("books", pos="v"))

print(lm.lemmatize("happily", pos="n"))

print(lm.lemmatize("happily", pos="v"))

print(lm.lemmatize("loving", pos="v"))

print(lm.lemmatize("angrily", pos="v"))

print(lm.lemmatize("angrily", pos="n"))


book
Books
book
happily
happily
love
angrily
angrily


## Lemmatizer vs Stemmer

In [28]:
print(porter.stem("angrily"))
print(lm.lemmatize("angrily"))

print(porter.stem("happily"))
print(lm.lemmatize("happily"))

print(porter.stem("loving"))
print(lm.lemmatize("loving", pos='v')) # default pos is 'NOUN'


angrili
angrily
happili
happily
love
love


In [29]:

stopword = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

text = "And then, in Dreaming, the Clouds Methought, would Open, and Show Riches Ready to Drop upon me; that, When I Waked, I Cried to Dream Again."

word_tokens = word_tokenize(text)

lemmatized_word = [lemmatizer.lemmatize(word) for word in word_tokens]

print (lemmatized_word)


say = "The striped bats are hanging on their feet for best"

# Tokenize: Split the sentence into words
wordlist = word_tokenize(say)

print("tokens are: ", wordlist)
#Lemmatize

lemma = [lemmatizer.lemmatize(wd) for wd in wordlist]

print("lemma: ", lemma)


['And', 'then', ',', 'in', 'Dreaming', ',', 'the', 'Clouds', 'Methought', ',', 'would', 'Open', ',', 'and', 'Show', 'Riches', 'Ready', 'to', 'Drop', 'upon', 'me', ';', 'that', ',', 'When', 'I', 'Waked', ',', 'I', 'Cried', 'to', 'Dream', 'Again', '.']
tokens are:  ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
lemma:  ['The', 'striped', 'bat', 'are', 'hanging', 'on', 'their', 'foot', 'for', 'best']


In [30]:
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", 'v'))



cat
cactus
goose
rock
python
good
best
run
run


### Regex Stemmer

In [31]:
# RegexpStemmer strips whatever the regular expression matches. The pattern 'ing'
# is not anchored, so the match is removed wherever it occurs in the word: here it
# is taken out of the middle of 'dancingqueen', giving 'dancqueen'.
# NOTE(review): to strip only a trailing suffix the pattern presumably needs an end anchor ('ing$') -- confirm.
regex_stem = RegexpStemmer('ing')

regex_stem.stem('dancingqueen')

'dancqueen'