In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
urls = acquire.get_all_urls()

blogs = acquire.get_blogs(urls=urls)

In [3]:
blogs.head()

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof..."
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...


In [4]:
news = acquire.get_news_articles()

In [5]:
news.head()

Unnamed: 0,topic,title,author,content
0,business,US firm buys Serum Institute parent's Czech un...,Krishna Veera Vanamali,US biotech firm Novavax has announced it's buy...
1,business,Google in talks to buy 5% stake in Vodafone Id...,Krishna Veera Vanamali,Google is exploring an investment in Vodafone ...
2,business,Microsoft in talks to buy 2.5% stake in Jio fo...,Anushka Dixit,Microsoft is in talks with Mukesh Ambani-led R...
3,business,GE to sell its 129-year-old lightbulb business...,Anushka Dixit,General Electric has announced that it is goin...
4,business,Kent's Atta maker ad says 'maid's hands may be...,Pragya Swastik,Kent RO Systems withdrew an advertisement for ...


# Exercises
The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [6]:
def basic_clean(string):
    # converting string to lowercase
    string = string.lower()
    
    # encoding string to ASCII, to convert special characters
    # Decode from ASCII to UTF-8 so we have normal python string
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    # Remove any special characters and replace with an empty string
    string = re.sub(r"[^a-z0-9'\s]", '', string)

    return string

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [7]:
def tokenize(string):
    # initializing tokenizer
    tokenizer = nltk.tokenize.Toktoktokenizer()
    
    # using the tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string

3.Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [8]:
def stem(string):
    # Initializing Porter Stemmer
    ps = nltk.porter.PorterStemmer()
    
    # creating our list of stems
    stems = [ps.stem(word) for word in string.split()]
    
    # Unpacking our list
    stem_string = ' '.join(stems)

    return stem_string

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(string):
    # Initializing our Word Net Lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    # applying lemmatization to our string
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    string_lemmatized = ' '.join(lemmas)
    
    return string_lemmatized

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [10]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    # setting up stop words list from nltk
    stopword_list = stopwords.words('english')
    
    # Adding/Filtering extra words to stopword list
    words = string.split()
    stopword_list = set(stopword_list) - set(exclude_words)
    stopword_list = stopword_list.union(set(extra_words))
    
    filtered_words = [w for w in words if w not in stopword_list]

    string_without_stopwords = ' '.join(filtered_words)

    return string_without_stopwords

6. Define a function named prep_article that takes in the dictionary representing an article and returns a dictionary that looks like this:


> `{
    'title': 'the original title'.
    'original': original,
    'stemmed': article_stemmed,
    'lemmatized': article_lemmatized,
    'clean': article_without_stopwords
}`


In [11]:
blogs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 0 to 98
Data columns (total 2 columns):
title      99 non-null object
content    99 non-null object
dtypes: object(2)
memory usage: 2.3+ KB


In [12]:
def prep_article(df):
    df['title'] = df.title
    df['original'] = df.content
    df['stemmed'] = df.content.apply(basic_clean).apply(stem)
    df['lemmatized'] = df.content.apply(basic_clean).apply(lemmatize)
    df['clean'] = df.content.apply(basic_clean).apply(remove_stopwords)
    df.drop(columns=['content'], inplace=True)
    return df

Note that if the orignal dictionary has a title property, it should remain unchanged (same goes for the category property).

In [13]:
df = prep_article(blogs)

In [14]:
df.head()

Unnamed: 0,title,original,stemmed,lemmatized,clean
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...,are you a veteran or activeduti militari membe...,are you a veteran or activeduty military membe...,veteran activeduty military member considering...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...,program is hard whether your just begin to lea...,programming is hard whether youre just beginni...,programming hard whether youre beginning learn...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof...",in our blog the best path to a career in softw...,in our blog the best path to a career in softw...,blog best path career software development loo...
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...,as a career acceler with a tuition refund guar...,a a career accelerator with a tuition refund g...,career accelerator tuition refund guarantee al...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...,commun across texa have now live in a remot en...,community across texas have now lived in a rem...,communities across texas lived remote environm...


7. Define a function named prepare_article_data that takes in the list of articles dictionaries, applies the prep_article function to each one, and returns the transformed data.