In [4]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

import unicodedata
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import acquire

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [5]:
def basic_clean(article):
    """
    Lowercases, normalizes, and removes special characters from the article.
    """
    # lowercasing
    article = article.lower()
    # normalize by removing non-ascii characters
    # encode turns characters into ascii characters
    # decode turns ascii characters back into a string
    article = unicodedata.normalize('NFKD', article).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # remove speial characters
    article = re.sub(r'[^a-z09\s]', '', article)
    
    return article

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [14]:
def tokenize(article):
    """
    Tokenizes a cleaned article (or a string).
    """
    # create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # use the tokenizer
    article = tokenizer.tokenize(article, return_str=True)
    
    return article

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(article):
    """
    Stem all words in an article (or a string).
    """
    # create the stemmer
    ps = nltk.porter.PorterStemmer()
    # use the stemmer, list comprehension uses the stemmer word-by-word
    stems = [ps.stem(word) for word in article.split()]
    # rejoin the stemmed words as an article
    article_stemmed = ''.join(stems)
    
    return article_stemmed

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [19]:
def lemmatize(article):
    """
    Lemmatize all words in an article (or a string).
    """
    # the the most current lemma list
    nltk.download('wordnet')
    # create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    # use the lemmatizer, list comprehension uses the lemmatizer word-by-word
    lemmas = [wnl.lemmatize(word) for word in article.split()]
    # rejoin the lemmatized words as a article
    article_lemmatized = ''.join(lemmas)
    
    return article_lemmatized

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords. <br> <br>
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [9]:
def remove_stopwords(article):
    """
    Removes stopwords from an article (or a string).
    """
    # get the default list of stopwords
    stopword_list = stopwords.words('english')
    # split the article to prepare for removal of stopwords
    words = article.split()
    # remove stopwords
    filtered_stopwords = [word for word in words if word not in stopword_list]
    # rejoin the words into article
    article_without_stopwords = ''.join(filtered_stopwords)
    
    return article_without_stopwords

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [10]:
news_df = acquire.get_news()
news_df.head()

Unnamed: 0,headline,publish_time,category,content
0,Facebook changes its company name to 'Meta',12:20 am,Business,Facebook on Thursday announced it's changing t...
1,'Man who takes 6 months parental leave is a lo...,04:53 pm,Business,Several Twitter users criticised US-based Pala...
2,"Delhi HC notice to RBI, SBI over banning UPI p...",06:24 pm,Business,The Delhi High Court on Thursday issued notice...
3,Who are the top 10 new entrants on Hurun India...,04:46 pm,Business,Ace investor Rakesh Jhunjhunwala is the top ne...
4,Indian market has 3 key beauties: Paytm CEO ah...,05:49 pm,Business,"Vijay Shekhar Sharma, the CEO of Paytm that pl..."


7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [11]:
codeup_df = acquire.get_blogs()
codeup_df.head()

Unnamed: 0,title,date,category,content
0,Codeup’s Data Science Career Accelerator is Here!,"Sep 30, 2018",Data Science,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,"Oct 31, 2018",Data Science,By Dimitri Antoniou and Maggie Giust Data Scie...
2,Data Science VS Data Analytics: What’s The Dif...,"Oct 17, 2018",Data Science,"By Dimitri Antoniou A week ago, Codeup launche..."
3,10 Tips to Crush It at the SA Tech Job Fair,"Aug 14, 2018",Tips for Prospective Students,The third bi-annual San Antonio Tech Job Fair ...
4,Competitor Bootcamps Are Closing. Is the Model...,"Aug 14, 2018",Codeup News,"In recent news, DevBootcamp and The Iron Yar..."


8. For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [20]:
news_df['cleaned'] = news_df['content'].apply(basic_clean).apply(tokenize)
news_df['stemmed'] = news_df['cleaned'].apply(stem)
news_df['lemmatized'] = news_df['cleaned'].apply(lemmatize)
news_df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Unnamed: 0,headline,publish_time,category,content,cleaned,stemmed,lemmatized
0,Facebook changes its company name to 'Meta',12:20 am,Business,Facebook on Thursday announced it's changing t...,facebook on thursday announced its changing th...,facebookonthursdayannouncitchangthecompaniname...,facebookonthursdayannounceditchangingthecompan...
1,'Man who takes 6 months parental leave is a lo...,04:53 pm,Business,Several Twitter users criticised US-based Pala...,several twitter users criticised usbased palan...,severtwitterusercriticisusbaspalantirtechnolog...,severaltwitterusercriticisedusbasedpalantirtec...
2,"Delhi HC notice to RBI, SBI over banning UPI p...",06:24 pm,Business,The Delhi High Court on Thursday issued notice...,the delhi high court on thursday issued notice...,thedelhihighcourtonthursdayissunotictorbisbinp...,thedelhihighcourtonthursdayissuednoticetorbisb...
3,Who are the top 10 new entrants on Hurun India...,04:46 pm,Business,Ace investor Rakesh Jhunjhunwala is the top ne...,ace investor rakesh jhunjhunwala is the top ne...,aceinvestorrakeshjhunjhunwalaisthetopnewentran...,aceinvestorrakeshjhunjhunwalaisthetopnewentran...
4,Indian market has 3 key beauties: Paytm CEO ah...,05:49 pm,Business,"Vijay Shekhar Sharma, the CEO of Paytm that pl...",vijay shekhar sharma the ceo of paytm that pla...,vijayshekharsharmatheceoofpaytmthatplantorais0...,vijayshekharsharmatheceoofpaytmthatplantoraise...


In [22]:
codeup_df['cleaned'] = codeup_df['content'].apply(basic_clean).apply(tokenize)
codeup_df['stemmed'] = codeup_df['cleaned'].apply(stem)
codeup_df['lemmatized'] = codeup_df['cleaned'].apply(lemmatize)
codeup_df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ianjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,title,date,category,content,cleaned,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,"Sep 30, 2018",Data Science,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,therumoraretruethetimehaarrivcodeuphaofficiope...,therumoraretruethetimehaarrivedcodeuphaofficia...
1,Data Science Myths,"Oct 31, 2018",Data Science,By Dimitri Antoniou and Maggie Giust Data Scie...,by dimitri antoniou and maggie giust data scie...,bydimitriantoniandmaggigiustdatasciencbigdatam...,bydimitriantoniouandmaggiegiustdatasciencebigd...
2,Data Science VS Data Analytics: What’s The Dif...,"Oct 17, 2018",Data Science,"By Dimitri Antoniou A week ago, Codeup launche...",by dimitri antoniou a week ago codeup launched...,bydimitriantoniaweekagocodeuplaunchourimmersda...,bydimitriantoniouaweekagocodeuplaunchedourimme...
3,10 Tips to Crush It at the SA Tech Job Fair,"Aug 14, 2018",Tips for Prospective Students,The third bi-annual San Antonio Tech Job Fair ...,the third biannual san antonio tech job fair i...,thethirdbiannualsanantoniotechjobfairisjustaro...,thethirdbiannualsanantoniotechjobfairisjustaro...
4,Competitor Bootcamps Are Closing. Is the Model...,"Aug 14, 2018",Codeup News,"In recent news, DevBootcamp and The Iron Yar...",in recent news devbootcamp and the iron yard a...,inrecentnewsdevbootcampandtheironyardannouncth...,inrecentnewsdevbootcampandtheironyardannounced...


9. Ask yourself:<br>

If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? <br>
We can afford to use lemmatization instead of stemming; because the corpus size is small and therefore the computational cost is also small. <br> <br>
If your corpus is 25MB, would you prefer to use stemmed or lemmatized text? <br>
I might use stemming because the larger corpus size incurs a larger computational cost. <br> <br>
If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? <br>
I would definitely use stemmed text in order to reduce the computational cost.