In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

nltk.download()
# Separating by Sentence with Parragraph identifier.
# Tokenizing: word tokenizers... Sentence tokenizers.
# Lexicon and corpora.
# corpora - body of text. ex: medical journals, presidential speeches, 
# English language.
# Lexicon - words and their means.

# investor-speak ... regular english-speak

# investor speak 'bull' = 'someone who is positive about the market'.
# english-speak 'bull' = scary animal you don't want running at you

example_text = "Hello Mr. Smith, how are you doing today? The weather is great and python is awesome. The sky is pinkish-blue. You should not eat cardboard"

#print("Tokenize by sentence:")
#print(sent_tokenize(example_text))
#print("Tokenize by word: ")
#print(word_tokenize(example_text))

for i in word_tokenize(example_text):
    print(i)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
and
python
is
awesome
.
The
sky
is
pinkish-blue
.
You
should
not
eat
cardboard


In [9]:
# Tutorial 2. Stop Words. 
# Words that make you leave a text analysis. 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sentence = "This is an example showing off stop word filtration."
stop_words = set(stopwords.words("english")) #Considerar que stopwords trabaja con diferentes lenguajes. 
words = word_tokenize(example_sentence)
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)

# One Liner.
filtered_sentence_OL = [w for w in words if not w in stop_words]
print(filtered_sentence_OL)

['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']
['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']


In [10]:
# T3.Stemming
# Takes the root stem of the word.
# You can have different words with the same roots that mean the same.
# Useful to economize space. 

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

#for w in example_words:
    #print(ps.stem(w))
    
new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))

python
python
python
python
pythonli
It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


In [14]:
# Part 4. Part of Speech tagging.
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
process_content()

Z'), ('been', 'VBN'), ('falling', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('dozen', 'NN'), ('years', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('row', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('These', 'DT'), ('gains', 'NNS'), ('are', 'VBP'), ('evidence', 'NN'), ('of', 'IN'), ('a', 'DT'), ('quiet', 'JJ'), ('transformation', 'NN'), ('--', ':'), ('a', 'DT'), ('revolution', 'NN'), ('of', 'IN'), ('conscience', 'NN'), (',', ','), ('in', 'IN'), ('which', 'WDT'), ('a', 'DT'), ('rising', 'VBG'), ('generation', 'NN'), ('is', 'VBZ'), ('finding', 'VBG'), ('that', 'IN'), ('a', 'DT'), ('life', 'NN'), ('of', 'IN'), ('personal', 'JJ'), ('responsibility', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('life', 'NN'), ('of', 'IN'), ('fulfillment', 'NN'), ('.', '.')]
[('Government', 'NNP'), ('has', 'VBZ'), ('played', 'VBN'), ('a', 'DT'), ('role', 'NN'), ('.', '.')]
[('Wise', 'NNP'), ('policies', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('welfare', 'NN'), ('reform', 'NN'), ('and', 

In [12]:
# Part 5. Chunking
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            print(chunked)

            
    except Exception as e:
        print(str(e))
        
process_content()


  of/IN
  producing/VBG
  ethanol/NN
  ,/,
  not/RB
  just/RB
  from/IN
  corn/NN
  ,/,
  but/CC
  from/IN
  wood/NN
  chips/NNS
  and/CC
  stalks/NNS
  ,/,
  or/CC
  switch/VB
  grass/NN
  ./.)
(S
  Our/PRP$
  goal/NN
  is/VBZ
  to/TO
  make/VB
  this/DT
  new/JJ
  kind/NN
  of/IN
  ethanol/JJ
  practical/JJ
  and/CC
  competitive/JJ
  within/IN
  six/CD
  years/NNS
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  Breakthroughs/NNS
  on/IN
  this/DT
  and/CC
  other/JJ
  new/JJ
  technologies/NNS
  will/MD
  help/VB
  us/PRP
  reach/VB
  another/DT
  great/JJ
  goal/NN
  :/:
  to/TO
  replace/VB
  more/JJR
  than/IN
  75/CD
  percent/NN
  of/IN
  our/PRP$
  oil/NN
  imports/NNS
  from/IN
  the/DT
  (Chunk Middle/NNP East/NNP)
  by/IN
  2025/CD
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  By/IN
  applying/VBG
  the/DT
  talent/NN
  and/CC
  technology/NN
  of/IN
  (Chunk America/NNP)
  ,/,
  this/DT
  country/NN
  can/MD
  dramatically/RB
  improve/VB
  our/PRP$
  environment/NN
 

In [13]:
# Part 6. Chinking
#Creates tuples with the word and the parts of speech. 
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #Unsupervised learning tokenizer.

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try: 
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

            
    except Exception as e:
        print(str(e))
        
process_content()

KeyboardInterrupt: 