In [1]:
"""Nature Language Pre-process"""
"""
    - Tokenization
    - Part of Speech tagging (PoS tagging)
    - Lemmatization
    - Stop Word Filtering

    Tokenization: is taking a text or set of text and splitting it into pieces, called tokens (words, punctuation marks,
        numbers, etc). Tokenization is governed by the type of the natural language. In English language, words are
        split whenever there is white space, punctuation marks are treated as separate tokens.
    Part of Speech tagging (PoS tagging): is a process of marking the words in a text as corresponding to a particular
        part of speech, based on its definition, as well as its context. In other words, PoS tagging is assigning parts
        of speech (such as noun, verb, adjective, etc) to each token
    Lemmatization: is "the problem of transforming a word form into its canonical form, or lemma". In fact the same word
        can have multiple different ‘lemma’s. So, based on the context it is used, the PoS tagging for the word, in that
        specific context, should be identified to extract the appropriate lemma.
    Stop Word Filtering: Stop words are frequently used common words. English has a lot of stop words like "and", "the",
        "this", and "a".
"""

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import state_union


In [2]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

print(train_text)
print(train_text)

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) 

Two weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. 

Tonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.) 

Our generati

In [7]:
"""Tokenization"""
sentences = sent_tokenize(sample_text)
words_list = []
for sentence in sentences:
    words_list.append(word_tokenize(sentence))

print(sentences)
print(words_list)

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006.', "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.", 'We have gathered under this Capitol dome in moments of national mourning and national achievement.'

In [9]:
"""Part of Speech tagging (PoS tagging)"""
words_tagged_list = []
for words in words_list:
    tagged = pos_tag(words)
    words_tagged_list.append(tagged)


In [20]:
"""Lemmatization"""
lemmatizer = WordNetLemmatizer()

words_lemmatized_list = []
for words_tagged in words_tagged_list:
    words_lemmatized = []
    for word_tagged in words_tagged:
        word_lemmatized = lemmatizer.lemmatize(word_tagged[0])
        words_lemmatized.append(word_lemmatized)
    words_lemmatized_list.append(words_lemmatized)

In [21]:
"""Stop Word Filtering"""

stop_words = set(stopwords.words('english'))
words_filtered_list = []
for words_lemmatized in words_lemmatized_list:
    words_filtered = []
    for word in words_lemmatized:
        if word not in stop_words:
            words_filtered.append(word)
    words_filtered_list.append(words_filtered)

print(words_filtered_list)


[['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'January', '31', ',', '2006', 'THE', 'PRESIDENT', ':', 'Thank', '.'], ['Mr.', 'Speaker', ',', 'Vice', 'President', 'Cheney', ',', 'member', 'Congress', ',', 'member', 'Supreme', 'Court', 'diplomatic', 'corp', ',', 'distinguished', 'guest', ',', 'fellow', 'citizen', ':', 'Today', 'nation', 'lost', 'beloved', ',', 'graceful', ',', 'courageous', 'woman', 'called', 'America', 'founding', 'ideal', 'carried', 'noble', 'dream', '.'], ['Tonight', 'comforted', 'hope', 'glad', 'reunion', 'husband', 'wa', 'taken', 'long', 'ago', ',', 'grateful', 'good', 'life', 'Coretta', 'Scott', 'King', '.'], ['(', 'Applause', '.', ')'], ['President', 'George', 'W.', 'Bush', 'reacts', 'applause', 'State', 'Union', 'Address', 'Capitol', ',', 'Tuesday', ',', 'Jan.', '31', ',', '2006', '.'], ['White', 'House', 'photo', 'Eric', 'DraperEvery', 'time', 'I', "'