# Prepare Exercises Workbook

In [1]:
import pandas as pd

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import warnings
warnings.filter="ignore"

import acquire

from bs4 import BeautifulSoup

## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 'https://codeup.com/data-science-myths/', 
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/', 'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']
acquire.get_blog_articles(urls)



  soup = bs4.BeautifulSoup(html)


Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [50]:
blog = acquire.get_codeup_blog('https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/')
blog



  soup = bs4.BeautifulSoup(html)


{'title': 'Competitor Bootcamps Are Closing. Is the Model in Danger?',
 'content': 'Competitor Bootcamps Are Closing. Is the Model in Danger?\n\xa0\n\nIs the programming bootcamp model in danger?\nIn recent news, DevBootcamp and The Iron Yard announced that they are closing their doors. This is big news. DevBootcamp was the first programming bootcamp model and The Iron Yard is a national player with 15 campuses across the U.S. In both cases, the companies cited an unsustainable business model. Does that mean the boot-camp model is dead?\n\ntl;dr “Nope!”\nBootcamps exist because traditional education models have failed to provide students job-ready skills for the 21st century. Students demand better employment options from their education. Employers demand skilled and job ready candidates. Big Education’s failure to meet those needs through traditional methods created the fertile ground for the new business model of the programming bootcamp.\nEducation giant Kaplan and Apollo Education 

In [51]:
def basic_clean(text):
    '''
    This function takes in a string of text and cleans it for NLP by:
    - converting all chracters to lowercase
    - normalizing unicode characters
    - removing any characters that are not letters, numbers, single quote, or space
    
    It returns a cleaned text string.
    '''
    
    #lowercase all characters
    text = text.lower()
    
    #normalize unicode characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    #remove any characters that are not a letter, number, or single quote
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    text = re.sub(r"\n", '', text)
    
    return text

In [52]:
blog = basic_clean(blog['content'])
blog

'competitor bootcamps are closing is the model in danger is the programming bootcamp model in dangerin recent news devbootcamp and the iron yard announced that they are closing their doors this is big news devbootcamp was the first programming bootcamp model and the iron yard is a national player with 15 campuses across the us in both cases the companies cited an unsustainable business model does that mean the bootcamp model is deadtldr nopebootcamps exist because traditional education models have failed to provide students jobready skills for the 21st century students demand better employment options from their education employers demand skilled and job ready candidates big educations failure to meet those needs through traditional methods created the fertile ground for the new business model of the programming bootcampeducation giant kaplan and apollo education group owner of university of phoenix bought their way into this new educational model when they purchased the iron yard and 

---

## 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [53]:
def tokenize(text):
    '''
    This function takes in a single arguement, a string 
    and prepares it for NLP by tokenizing the words.
    
    It returns a string. 
    '''
    
    #Create the tokenizer object
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    #Use the tokenizer
    text = tokenizer.tokenize(text, return_str = True)
    
    return text

In [54]:
blog = tokenize(blog)

In [55]:
blog

'competitor bootcamps are closing is the model in danger is the programming bootcamp model in dangerin recent news devbootcamp and the iron yard announced that they are closing their doors this is big news devbootcamp was the first programming bootcamp model and the iron yard is a national player with 15 campuses across the us in both cases the companies cited an unsustainable business model does that mean the bootcamp model is deadtldr nopebootcamps exist because traditional education models have failed to provide students jobready skills for the 21st century students demand better employment options from their education employers demand skilled and job ready candidates big educations failure to meet those needs through traditional methods created the fertile ground for the new business model of the programming bootcampeducation giant kaplan and apollo education group owner of university of phoenix bought their way into this new educational model when they purchased the iron yard and 

---

## 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [56]:
def stem(text):
    '''
    This function takes in a string as an arguement
    and stems the words for NLP.
    It returns a single string of the stemmed words. 
    '''
    #create the porter stemmer
    ps = nltk.porter.PorterStemmer()
    
    #Apply the stemmer to each word in string
    stems = [ps.stem(word) for word in text.split()]
    
    #Join the stemmed list of words back into a string
    text_stemmed = ' '.join(stems)
    
    return text_stemmed

In [57]:
text_stemmed = stem(blog)
text_stemmed

'competitor bootcamp are close is the model in danger is the program bootcamp model in dangerin recent news devbootcamp and the iron yard announc that they are close their door thi is big news devbootcamp wa the first program bootcamp model and the iron yard is a nation player with 15 campus across the us in both case the compani cite an unsustain busi model doe that mean the bootcamp model is deadtldr nopebootcamp exist becaus tradit educ model have fail to provid student jobreadi skill for the 21st centuri student demand better employ option from their educ employ demand skill and job readi candid big educ failur to meet those need through tradit method creat the fertil ground for the new busi model of the program bootcampeduc giant kaplan and apollo educ group owner of univers of phoenix bought their way into thi new educ model when they purchas the iron yard and devbootcamp they purchas their competit with the intent to scale up the model unfortun big educ is too habitu to come up 

---

## 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [58]:
def lemmatize(text):
    '''
    This function takes in a string of text as
    an arguement and lemmatizes the words for NLP.
    It returns a single single string of the lemmatized words.
    '''
    #create the word nest list
    wnl = nltk.stem.WordNetLemmatizer()
    
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    
    text_lemmatized = ' '.join(lemmas)
    
    return text_lemmatized

In [59]:
text_lemmatized = lemmatize(blog)
text_lemmatized

'competitor bootcamps are closing is the model in danger is the programming bootcamp model in dangerin recent news devbootcamp and the iron yard announced that they are closing their door this is big news devbootcamp wa the first programming bootcamp model and the iron yard is a national player with 15 campus across the u in both case the company cited an unsustainable business model doe that mean the bootcamp model is deadtldr nopebootcamps exist because traditional education model have failed to provide student jobready skill for the 21st century student demand better employment option from their education employer demand skilled and job ready candidate big education failure to meet those need through traditional method created the fertile ground for the new business model of the programming bootcampeducation giant kaplan and apollo education group owner of university of phoenix bought their way into this new educational model when they purchased the iron yard and devbootcamp they pu

---

## 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [60]:
def remove_stopwords(text, extra_words=[], exclude_words=[]):
    '''
    This function takes in three arguements:
    1. A string
    2. extra_words=[] that should also be removed in addition to the std. stopwords.
    3. exclude_words=[] to signify std. stopwords that should not be removed.
    
    It returns a string with stopwords removed.
    '''
    stopword_list = stopwords.words('english')
    
    if len(extra_words) > 0:
        stopword_list.append(extra_words)
    else:
        stopword_list = stopword_list
        
    if len(exclude_words) > 0:
        stopword_list.remove(exclude_words)
    
    words = text.split()
    
    filtered_words = [word for word in words if word not in stopword_list]
    
    text_without_stopwords = ' '.join(filtered_words)
    
    return text_without_stopwords

In [61]:
text_without_stopwords = remove_stopwords(text_lemmatized, extra_words=['model', 'iron'])
text_without_stopwords

'competitor bootcamps closing model danger programming bootcamp model dangerin recent news devbootcamp iron yard announced closing door big news devbootcamp wa first programming bootcamp model iron yard national player 15 campus across u case company cited unsustainable business model doe mean bootcamp model deadtldr nopebootcamps exist traditional education model failed provide student jobready skill 21st century student demand better employment option education employer demand skilled job ready candidate big education failure meet need traditional method created fertile ground new business model programming bootcampeducation giant kaplan apollo education group owner university phoenix bought way new educational model purchased iron yard devbootcamp purchased competition intent scale model unfortunately big education habituated coming short student bought upstart challenged tried making change run bootcamps big education way sadly theyve closed door realized scaling education challeng

---

## 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [22]:
categories = ["business", "sports", "technology", "entertainment", "science", "world"]
news_df = acquire.get_all_news_articles(categories)



  soup = BeautifulSoup(response.text)


In [23]:
news_df

Unnamed: 0,title,content,category
0,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
1,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
4,Japan's Nomura to donate ₹15 crore for COVID-1...,Japanese bank Nomura will donate nearly ₹15 cr...,business
...,...,...,...
142,"US, Japan agree to unite in response to China:...",The United States and Japan have agreed to ste...,world
143,Myanmar indicts Japanese journalist on 'fake n...,Japanese journalist Yuki Kitazumi detained by ...,world
144,Egypt buys 30 more Rafale jets from France in ...,Egypt will buy 30 more Rafale fighter jets fro...,world
145,Chinese Navy to help Indonesia salvage sunken ...,Chinese Navy ships have arrived in Indonesia t...,world


---

## 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [24]:
codeup_df = acquire.get_blog_articles(urls)
codeup_df



  soup = bs4.BeautifulSoup(html)


Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


---

## For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [42]:
#clean to hold the normalized and tokenized original with the stopwords removed.
codeup_df['clean'] = codeup_df['content'].apply(lambda x: remove_stopwords(tokenize(basic_clean(x))))

In [43]:
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,rumors true time arrived codeup officially ope...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,dimitri antoniou maggie giust data science big...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",dimitri antoniou week ago codeup launched imme...,by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair third biannual san antonio te...,sa tech job fair the third biannual san antoni...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps closing model danger prog...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...


In [37]:
#stemmed to hold the stemmed version of the cleaned data.
codeup_df['stemmed'] = codeup_df['clean'].apply(lambda x: stem(x))
codeup_df

Unnamed: 0,title,content,clean,stemmed
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,the rumor are true the time ha arriv codeup ha...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...,by dimitri antoni and maggi giust data scienc ...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...,by dimitri antoni a week ago codeup launch our...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...,competitor bootcamp are close is the model in ...


In [38]:
#lemmatized to hold the lemmatized version of the cleaned data.
codeup_df['lemmatized'] = codeup_df['clean'].apply(lambda x: lemmatize(x))
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,the rumors are true the time has arrived codeu...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,by dimitri antoniou and maggie giust\ndata sci...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",by dimitri antoniou\na week ago codeup launche...,by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair\nthe third biannual san anton...,sa tech job fair the third biannual san antoni...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps are closing is the model ...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...


In [44]:
#original to hold the original article/post content
codeup_df.rename(columns={"content": "original"}, inplace=True)
codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,rumors true time arrived codeup officially ope...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,dimitri antoniou maggie giust data science big...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",dimitri antoniou week ago codeup launched imme...,by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair third biannual san antonio te...,sa tech job fair the third biannual san antoni...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps closing model danger prog...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...


---

In [46]:
#Rename the content column to original
news_df.rename(columns={"content": "original"}, inplace=True)
news_df

Unnamed: 0,title,original,category
0,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
1,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
4,Japan's Nomura to donate ₹15 crore for COVID-1...,Japanese bank Nomura will donate nearly ₹15 cr...,business
...,...,...,...
142,"US, Japan agree to unite in response to China:...",The United States and Japan have agreed to ste...,world
143,Myanmar indicts Japanese journalist on 'fake n...,Japanese journalist Yuki Kitazumi detained by ...,world
144,Egypt buys 30 more Rafale jets from France in ...,Egypt will buy 30 more Rafale fighter jets fro...,world
145,Chinese Navy to help Indonesia salvage sunken ...,Chinese Navy ships have arrived in Indonesia t...,world


In [47]:
#clean to hold the normalized and tokenized original with the stopwords removed.
news_df['clean'] = news_df['original'].apply(lambda x: remove_stopwords(tokenize(basic_clean(x))))

#stemmed to hold the stemmed version of the cleaned data.
news_df['stemmed'] = news_df['clean'].apply(lambda x: stem(x))

#lemmatized to hold the lemmatized version of the cleaned data.
news_df['lemmatized'] = news_df['clean'].apply(lambda x: lemmatize(x))

news_df

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business,indian commercial pilots association icpa tues...,indian commerci pilot associ icpa tuesday said...,indian commercial pilot association icpa tuesd...
1,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business,speaking india ' second covid19 wave former rb...,speak india ' second covid19 wave former rbi g...,speaking india ' second covid19 wave former rb...
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business,south koreas richest woman hong rahee added an...,south korea richest woman hong rahe ad anoth 7...,south korea richest woman hong rahee added ano...
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business,pandora world ' biggest jeweller said ' stop u...,pandora world ' biggest jewel said ' stop use ...,pandora world ' biggest jeweller said ' stop u...
4,Japan's Nomura to donate ₹15 crore for COVID-1...,Japanese bank Nomura will donate nearly ₹15 cr...,business,japanese bank nomura donate nearly 15 crore su...,japanes bank nomura donat nearli 15 crore supp...,japanese bank nomura donate nearly 15 crore su...
...,...,...,...,...,...,...
142,"US, Japan agree to unite in response to China:...",The United States and Japan have agreed to ste...,world,united states japan agreed step cooperation de...,unit state japan agre step cooper deal china j...,united state japan agreed step cooperation dea...
143,Myanmar indicts Japanese journalist on 'fake n...,Japanese journalist Yuki Kitazumi detained by ...,world,japanese journalist yuki kitazumi detained sec...,japanes journalist yuki kitazumi detain secur ...,japanese journalist yuki kitazumi detained sec...
144,Egypt buys 30 more Rafale jets from France in ...,Egypt will buy 30 more Rafale fighter jets fro...,world,egypt buy 30 rafale fighter jets france 4 bill...,egypt buy 30 rafal fighter jet franc 4 billion...,egypt buy 30 rafale fighter jet france 4 billi...
145,Chinese Navy to help Indonesia salvage sunken ...,Chinese Navy ships have arrived in Indonesia t...,world,chinese navy ships arrived indonesia help salv...,chines navi ship arriv indonesia help salvag i...,chinese navy ship arrived indonesia help salva...
