# 1. Import NLTK library

In [1]:
import nltk
from nltk.stem  import  PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

## Make sure that all NLTK packages are downloaded

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# 2. Choose a text to play with

In [4]:
# Reading from a file    
with open('steve-jobs-2005.txt','r') as f:
    text_full = f.read()
    
print(text_full)

I am honored to be with you today at your commencement from one of the finest universities in the world. I never graduated from college. Truth be told, this is the closest I’ve ever gotten to a college graduation. Today I want to tell you three stories from my life. That’s it. No big deal. Just three stories.

The first story is about connecting the dots.

I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit. So why did I drop out?

It started before I was born. My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption. She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the last minute that they really wanted a girl. So my parents, who were on a waiting list, got a call in the middle of the night asking: “

# 3. Tokenization
Tokenization is about creating tokens. Tokens are small portions of text. They can be words, more complex phrases, or even the whole sentences.

## 3.1 Tokenizing sentences

In [5]:
text_splitted_dot = text_full.split('.')

for index in range(len(text_splitted_dot[:15])):
    print(str(index+1)+'.',text_splitted_dot[index])

1. I am honored to be with you today at your commencement from one of the finest universities in the world
2.  I never graduated from college
3.  Truth be told, this is the closest I’ve ever gotten to a college graduation
4.  Today I want to tell you three stories from my life
5.  That’s it
6.  No big deal
7.  Just three stories
8. 

The first story is about connecting the dots
9. 

I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit
10.  So why did I drop out?

It started before I was born
11.  My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption
12.  She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife
13.  Except that when I popped out they decided at the last minute that they really wanted a girl
14.  So my parents, who were on a waiting list, 

In [6]:
sentences = sent_tokenize(text_full)
for index in range(len(sentences[:45])):
    print(str(index+1)+'.',sentences[index])

1. I am honored to be with you today at your commencement from one of the finest universities in the world.
2. I never graduated from college.
3. Truth be told, this is the closest I’ve ever gotten to a college graduation.
4. Today I want to tell you three stories from my life.
5. That’s it.
6. No big deal.
7. Just three stories.
8. The first story is about connecting the dots.
9. I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit.
10. So why did I drop out?
11. It started before I was born.
12. My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption.
13. She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife.
14. Except that when I popped out they decided at the last minute that they really wanted a girl.
15. So my parents, who were on a waiting list,

In [10]:
text = ' '.join(sentences[8:12])

## 3.2 Tokenizing words

In [11]:
text_splitted_space = text.split(' ')

for index in range(20):
    print(str(index+1)+'.',text_splitted_space[index])

1. I
2. dropped
3. out
4. of
5. Reed
6. College
7. after
8. the
9. first
10. 6
11. months,
12. but
13. then
14. stayed
15. around
16. as
17. a
18. drop-in
19. for
20. another


In [12]:
words_tokenized = word_tokenize(text)
    
names = ['text_splitted_space'+' | ', 'words_tokenized']
formatted_text = '{:>16}' * (len(names))
print('\n', formatted_text.format(*names),'\n', '='*38)

for index in range(40):
    output = [text_splitted_space[index], words_tokenized[index]]
    print(formatted_text.format(*output))
    


 text_splitted_space |  words_tokenized 
               I               I
         dropped         dropped
             out             out
              of              of
            Reed            Reed
         College         College
           after           after
             the             the
           first           first
               6               6
         months,          months
             but               ,
            then             but
          stayed            then
          around          stayed
              as          around
               a              as
         drop-in               a
             for         drop-in
         another             for
              18         another
          months              18
              or          months
              so              or
          before              so
               I          before
          really               I
           quit.          really
              So            quit
 

In [13]:
word_punkt_tokenizer = WordPunctTokenizer()

words_punkt_tokenized = word_punkt_tokenizer.tokenize(text)

names = ['text_splitted_space'+' | ', 'words_tokenized'+' | ', 'words_punkt_tokenized']
formatted_text = '{:>18}' * (len(names))
print('\n', formatted_text.format(*names),'\n', '='*61)

for index in range(21):
    output = [text_splitted_space[index], words_tokenized[index],words_punkt_tokenized[index]]
    print(formatted_text.format(*output))


 text_splitted_space | words_tokenized | words_punkt_tokenized 
                 I                 I                 I
           dropped           dropped           dropped
               out               out               out
                of                of                of
              Reed              Reed              Reed
           College           College           College
             after             after             after
               the               the               the
             first             first             first
                 6                 6                 6
           months,            months            months
               but                 ,                 ,
              then               but               but
            stayed              then              then
            around            stayed            stayed
                as            around            around
                 a                as                as


## 3.3 Text chunker

In [14]:
# Split the input text into chunks, where each chunk contains N words
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []

    current_chunk = []
    count = 0
    for word in input_words:
        current_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(current_chunk))
            count, current_chunk = 0, []

    output.append(' '.join(current_chunk))

    return output

# Define the number of words in each chunk
chunk_size = 100

chunks = chunker(text_full, chunk_size)
print('\nNumber of text chunks =', len(chunks), '\n')
for i, chunk in enumerate(chunks):
    print('Chunk', i+1, '==>', chunk,'\n') 


Number of text chunks = 23 

Chunk 1 ==> I am honored to be with you today at your commencement from one of the finest universities in the world. I never graduated from college. Truth be told, this is the closest I’ve ever gotten to a college graduation. Today I want to tell you three stories from my life. That’s it. No big deal. Just three stories.

The first story is about connecting the dots.

I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit. So why did I drop out?

It started before 

Chunk 2 ==> I was born. My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption. She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the last minute that they really wanted a girl. So my parents, who were on a waiti

# 4. Stemming

In [15]:
porter_stemmer = PorterStemmer()
words_stemmed = [porter_stemmer.stem(word) for word in words_tokenized]

stemmed_sentences =[]
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [porter_stemmer.stem(word) for word in words]
    stemmed_sentences.append(words)

for index in range(len(stemmed_sentences)):
    print(str(index+1)+'.',stemmed_sentences[index])
    

1. ['I', 'am', 'honor', 'to', 'be', 'with', 'you', 'today', 'at', 'your', 'commenc', 'from', 'one', 'of', 'the', 'finest', 'univers', 'in', 'the', 'world', '.']
2. ['I', 'never', 'graduat', 'from', 'colleg', '.']
3. ['truth', 'be', 'told', ',', 'thi', 'is', 'the', 'closest', 'I', '’', 've', 'ever', 'gotten', 'to', 'a', 'colleg', 'graduat', '.']
4. ['today', 'I', 'want', 'to', 'tell', 'you', 'three', 'stori', 'from', 'my', 'life', '.']
5. ['that', '’', 's', 'it', '.']
6. ['No', 'big', 'deal', '.']
7. ['just', 'three', 'stori', '.']
8. ['the', 'first', 'stori', 'is', 'about', 'connect', 'the', 'dot', '.']
9. ['I', 'drop', 'out', 'of', 'reed', 'colleg', 'after', 'the', 'first', '6', 'month', ',', 'but', 'then', 'stay', 'around', 'as', 'a', 'drop-in', 'for', 'anoth', '18', 'month', 'or', 'so', 'befor', 'I', 'realli', 'quit', '.']
10. ['So', 'whi', 'did', 'I', 'drop', 'out', '?']
11. ['It', 'start', 'befor', 'I', 'wa', 'born', '.']
12. ['My', 'biolog', 'mother', 'wa', 'a', 'young', ',', 'un

In [16]:
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(names) + 1)
print('\n', formatted_text.format('INPUT WORD', *names),'\n', '='*70)

# Stem each word and display the output
for word in words_tokenized[:29]:
    output = [word, porter_stemmer.stem(word), 
            lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))


       INPUT WORD          PORTER       LANCASTER        SNOWBALL 
               I               I               i               i
         dropped            drop            drop            drop
             out             out             out             out
              of              of              of              of
            Reed            reed             ree            reed
         College          colleg          colleg          colleg
           after           after             aft           after
             the             the             the             the
           first           first           first           first
               6               6               6               6
          months           month           month           month
               ,               ,               ,               ,
             but             but             but             but
            then            then            then            then
          stayed      

# 5. Lemmatization

In [17]:
lemmatizer = WordNetLemmatizer()

words_lemmatized = [lemmatizer.lemmatize(word) for word in words_tokenized]

lemmatized_sentences =[]
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_sentences.append(words)
    
for index in range(len(lemmatized_sentences)):
    print(str(index+1)+'.',lemmatized_sentences[index]) 

1. ['I', 'am', 'honored', 'to', 'be', 'with', 'you', 'today', 'at', 'your', 'commencement', 'from', 'one', 'of', 'the', 'finest', 'university', 'in', 'the', 'world', '.']
2. ['I', 'never', 'graduated', 'from', 'college', '.']
3. ['Truth', 'be', 'told', ',', 'this', 'is', 'the', 'closest', 'I', '’', 've', 'ever', 'gotten', 'to', 'a', 'college', 'graduation', '.']
4. ['Today', 'I', 'want', 'to', 'tell', 'you', 'three', 'story', 'from', 'my', 'life', '.']
5. ['That', '’', 's', 'it', '.']
6. ['No', 'big', 'deal', '.']
7. ['Just', 'three', 'story', '.']
8. ['The', 'first', 'story', 'is', 'about', 'connecting', 'the', 'dot', '.']
9. ['I', 'dropped', 'out', 'of', 'Reed', 'College', 'after', 'the', 'first', '6', 'month', ',', 'but', 'then', 'stayed', 'around', 'a', 'a', 'drop-in', 'for', 'another', '18', 'month', 'or', 'so', 'before', 'I', 'really', 'quit', '.']
10. ['So', 'why', 'did', 'I', 'drop', 'out', '?']
11. ['It', 'started', 'before', 'I', 'wa', 'born', '.']
12. ['My', 'biological', 'm

In [18]:
names = ['snowball stemmer', 'lemmatizer noun', 'lemmatizer verb']
formatted_text = '{:>20}' * (len(names) + 1)
print('\n', formatted_text.format('INPUT WORD', *names),'\n', '='*88)

for word in words_tokenized:
    output = [word, 
              snowball.stem(word), 
              lemmatizer.lemmatize(word), 
              lemmatizer.lemmatize(word, pos='v')]
    print(formatted_text.format(*output))


           INPUT WORD    snowball stemmer     lemmatizer noun     lemmatizer verb 
                   I                   i                   I                   I
             dropped                drop             dropped                drop
                 out                 out                 out                 out
                  of                  of                  of                  of
                Reed                reed                Reed                Reed
             College              colleg             College             College
               after               after               after               after
                 the                 the                 the                 the
               first               first               first               first
                   6                   6                   6                   6
              months               month               month              months
                   ,     