In [2]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a

In [3]:
# sample text containing accented characters; adjacent string literals are
# concatenated by the parser, so no newline ends up inside the value
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [4]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [5]:
# lowercase everything, so that the later [^a-z0-9\s] filter does not strip capital letters
original = original.lower()
original

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [6]:
# remove accented characters and non-ASCII characters:
# NFKD splits an accented letter into its base letter plus combining marks;
# encoding to ASCII with errors='ignore' then drops the marks and any other non-ASCII character
original = unicodedata.normalize('NFKD', original).encode('ascii', 'ignore').decode('utf-8')
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [7]:
# remove special characters: everything except lowercase letters, digits and whitespace
# (apostrophes are removed too, which is why "erdos's" becomes "erdoss" in the output)
original = re.sub(r'[^a-z0-9\s]', '', original)
original

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

In [8]:
# tokenize
# NOTE: the name `tokenize` is bound to a ToktokTokenizer instance here; a function
# with the same name is defined further down in this notebook and rebinds it
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x7fef5cbc9d00>

In [9]:
# split the cleaned text into tokens; from here on `original` is a list of strings, not a str
original = tokenize.tokenize(original)
original

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdoss',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 'o',
 'o',
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [12]:
# stemming
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [13]:
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [14]:
ps.stem('house'), ps.stem('housing')

('hous', 'hous')

In [16]:
# stem every token, then show the result as one space-separated string
stems = list(map(ps.stem, original))
' '.join(stems)

'paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdoss name contain the hungarian letter o o with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess'

In [23]:
# NOTE(review): 'all' fetches every NLTK data package. The cells in this notebook appear to
# need only the stopwords corpus and WordNet -- consider downloading just those. TODO confirm.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    |

[nltk_data]    |   Unzipping corpora/nps_chat.zip.
[nltk_data]    | Downloading package omw to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    | Downloading package omw-1.4 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping corpora/opinion_lexicon.zip.
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    | Downloading package paradigms to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping corpora/paradigms.zip.
[nltk_data]    | Downloading package pe08 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping corpora/pe08.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Download

[nltk_data]    | Downloading package wordnet_ic to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping corpora/wordnet_ic.zip.
[nltk_data]    | Downloading package words to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping corpora/words.zip.
[nltk_data]    | Downloading package ycoe to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Unzipping corpora/ycoe.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection all


True

In [24]:
# lemmatize
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [25]:
# WordNetLemmatizer.lemmatize() defaults to pos='n' (noun), so the verb forms
# 'calling' and 'called' come back unchanged in the output
wnl.lemmatize('calling'), wnl.lemmatize('calls'), wnl.lemmatize('called'), wnl.lemmatize('call')

('calling', 'call', 'called', 'call')

In [26]:
wnl.lemmatize('house'), wnl.lemmatize('housing')

('house', 'housing')

In [27]:
wnl.lemmatize('mouse'), wnl.lemmatize('mice')

('mouse', 'mouse')

In [28]:
stopwords_english = stopwords.words('english')
stopwords_english[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [31]:
# add 'o' as an extra stopword; the guard keeps this cell idempotent, so running it
# more than once does not append duplicate entries and inflate the list length
if 'o' not in stopwords_english:
    stopwords_english.append('o')
len(stopwords_english)

182

In [34]:
# keep only the tokens that are not stopwords, then display them as a single string
original_with_stopwords_removed = [token for token in original if token not in stopwords_english]
' '.join(original_with_stopwords_removed)

'paul erdos george polya influential hungarian mathematicians contributed lot field erdoss name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity'

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:  
  
Lowercase everything  
Normalize unicode characters  
Remove anything that is not a letter, number, whitespace, or a single quote.  

In [38]:
def basic_clean(string):
    '''
    Apply basic text cleaning to a string.

    Steps, in order:
      1. lowercase everything
      2. NFKD-normalize the unicode and drop whatever cannot be encoded as ASCII
      3. remove every character that is not a lowercase letter, a digit,
         whitespace or a single quote

    Takes a string and returns the cleaned string.
    '''
    string = string.lower()
    # decompose accented letters, then discard the combining marks / non-ASCII leftovers
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
    # the single quote is part of the allowed set so possessives and contractions survive
    string = re.sub(r"[^a-z0-9'\s]", '', string)

    return string

In [40]:
string = 'The cat aNd the dog wEnt to the $store and got some MILK!'
basic_clean(string)

'the cat and the dog went to the store and got some milk'

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [41]:
def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    Takes a string and returns whatever the tokenizer's tokenize()
    method produces for it (a list of token strings).
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string)

In [42]:
string = 'the dog went over to the cat and said hi'
tokenize(string)

['the', 'dog', 'went', 'over', 'to', 'the', 'cat', 'and', 'said', 'hi']

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [49]:
def stem(text1, text2, text3):
    '''
    Stem three individual words with the Porter stemmer.

    Takes three strings and returns a 3-tuple holding the stem of each
    argument, in the same order.

    NOTE: a single-argument stem(string) is defined further down in this
    notebook and replaces this definition once that cell runs.
    '''
    ps = nltk.porter.PorterStemmer()
    # return the stems themselves, not the untouched inputs
    return ps.stem(text1), ps.stem(text2), ps.stem(text3)

In [59]:
def stem(string):
    '''
    Stem every word of a string.

    Takes a string, applies the Porter stemmer to each
    whitespace-separated word and returns the stems joined
    back together with single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    # str.split() with no argument breaks on any run of whitespace
    return ' '.join(stemmer.stem(token) for token in string.split())

In [61]:
# NOTE: stem() splits on whitespace only, so punctuation stays attached to the tokens
# ('baller,' and 'balling,'), which come back unchanged in the output
stem('baller, balling, balls')

'baller, balling, ball'

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [70]:
def lemmatize(string):
    '''
    Lemmatize every word of a string.

    Takes a string, runs each whitespace-separated word through
    NLTK's WordNetLemmatizer and returns the lemmas joined back
    together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # one lemmatize() call per token produced by str.split()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [72]:
# NOTE: lemmatize() splits on whitespace only, so the commas stay attached to
# 'shawn,' and 'shawning,', which come back unchanged in the output
lemmatize('shawn, shawning, shawns')

'shawn, shawning, shawn'

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.  
  
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [55]:
def remove_stopwords(text):
    '''
    Drop English stopwords from some text.

    Takes either a string (split into words on whitespace) or an
    iterable of word strings, and returns a list of the words that
    are not in NLTK's English stopword list.

    NOTE: a remove_stopwords(string, extra_words, exclude_words) is defined
    further down in this notebook and replaces this definition once that
    cell runs.
    '''
    # a set makes each membership test constant-time
    stopwords_english = set(stopwords.words('english'))
    # iterating a bare string yields single characters, so split it into words first
    words = text.split() if isinstance(text, str) else text
    new_text = [word for word in words if word not in stopwords_english]

    return new_text

In [69]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    Strip stopwords out of a string.

    Starts from NLTK's English stopword list, takes out anything named
    in exclude_words, adds anything named in extra_words, and returns
    the input string with every whitespace-separated word found in the
    resulting set removed. Both optional parameters default to an
    empty list.
    '''
    # working stopword set: (english - exclude_words) | extra_words
    active_stopwords = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)
    # keep the words that are not stopwords, in their original order
    kept = [token for token in string.split() if token not in active_stopwords]
    return ' '.join(kept)

Use the data from your acquire module to produce a dataframe of the news articles. Name the dataframe news_df.

In [67]:
# build the news-article dataframe through the acquire module (imported above as `a`)
news_df = a.get_news_articles()
news_df

Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [None]:
# build the blog-post dataframe through the acquire module (imported above as `a`)
codeup_df = a.get_blog_articles()
codeup_df

For each dataframe, produce the following columns:  
  
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


Ask yourself:  
  
- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?