In [41]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

from time import strftime

import acquire

# Exercise 1

Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:

* Lowercase everything

* Normalize unicode characters

* Replace anything that is not a letter, number, whitespace or a single quote.

In [42]:
def basic_clean(str):
    '''
    This function takes in a string and converts all characters to lowercase, normalizes them, removes special
    characters, and returns a string
    '''
    # convert all to lowercase
    str = str.lower()
    # normalize
    str = unicodedata.normalize('NFKD', str)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    # remove anything that is not a letter, number, single quote, or white space
    str = re.sub(r"[^a-z0-9'\s]", '', str)
    
    return str

# Exercise 2

Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.

In [43]:
def tokenize(str):
    '''
    This function takes in a string and uses a tokenizer to break words, punctuation, etc. into discrete units
    '''
    # create tokenize item
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # apply tokenizer to str and assign result to variable 'str'
    str = tokenizer.tokenize(str, return_str=True)
    
    return str

# Exercise 3

Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.

In [44]:
def stem(str):
    '''
    This function takes in a string and uses a stemmer object to attempt to convert words to their base form
    '''
    # create the stemmer object
    ps = nltk.porter.PorterStemmer()
    # split string and loop through each word to apply stemmer
    stems = [ps.stem(word) for word in str.split()]
    # join the split and stemmed words back together on a space
    str = ' '.join(stems)
    
    return str

# Exercise 4

Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [45]:
def lemmatize(str):
    '''
    This function takes in a string and uses a lemmatizer object to attempt to convert words to their root word
    '''
    # create the lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()
    # split string and loop through each word to apply lemmatizer
    lemmas = [wnl.lemmatize(word) for word in str.split()]
    # join the split and lemmatized words back together on a space
    str = ' '.join(lemmas)
    
    return str

# Exercise 5

Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.

In [46]:
def remove_stopwords(str, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string and removes words that have little or no significance
    '''
    # pull in english stopwords and assign to a variable
    stopword_list = stopwords.words('english')
    # extend stopword_list if additional words are specified
    if extra_words:
        stopword_list = stopword_list.extend(extra_words)
        # remove words if exclusion words are specified
    if exclude_words:
        stopword_list = [stopword_list.remove(word) for word in exclude_words]
    # split string to create a list of words
    words = str.split()
    # filter out words that are not in stopword list
    not_stopwords = [w for w in words if w not in stopword_list]
    
    str = ' '.join(not_stopwords)
    
    return str

# Exercise 6

Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [47]:
# use function to pull in dataframe
news_df = acquire.read_news_articles(refresh = False)
news_df

Unnamed: 0,title,content,category
0,RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,business
1,Boost to EVs a big step: Windmill Capital,"Increased use of EVs in public transport, spec...",business
2,Facebook parent Meta's $230-billion wipeout bi...,Facebook's parent Meta's shares plunged 27% an...,business
3,"Tesla co-worker used N-word, threw a hot tool ...",A former Tesla worker has filed a lawsuit agai...,business
4,Mark Zuckerberg loses $31 bn in one of the big...,Meta CEO Mark Zuckerberg's wealth dropped by $...,business
...,...,...,...
95,Greatest blessing: Riteish wishes Genelia on 1...,Actor Riteish Deshmukh on Thursday marked his ...,entertainment
96,I don't care about all that: Jim Sarbh on soci...,"Actor Jim Sarbh, on being asked about craving ...",entertainment
97,"'Bunty Aur Babli 2' numbers disappointing, but...","Actress Sharvari Wagh, who made her debut in '...",entertainment
98,Court directs Honey Singh to give voice sample...,Rapper Yo Yo Honey Singh has been directed by ...,entertainment


# Exercise 7

Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [48]:
# use function to pull in dataframe
codeup_df = acquire.read_blog_articles(refresh = False)
codeup_df

Unnamed: 0,title,link,content
0,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...
1,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...
2,Is Codeup the Best Bootcamp in San Antonio…or ...,https://codeup.com/codeup-news/is-codeup-the-b...,Looking for the best data science bootcamp in ...
3,Codeup Launches First Podcast: Hire Tech,https://codeup.com/codeup-news/codeup-launches...,Any podcast enthusiasts out there? We are plea...
4,Codeup Start Dates for March 2022,https://codeup.com/codeup-news/codeup-start-da...,As we approach the end of January we wanted to...
5,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...
6,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...
7,Codeup Dallas Open House,https://codeup.com/dallas-newsletter/codeup-da...,Come join us for the re-opening of our Dallas ...
8,Codeup’s Placement Team Continues Setting Records,https://codeup.com/codeup-news/codeups-placeme...,Our Placement Team is simply defined as a grou...
9,"IT Certifications 101: Why They Matter, and Wh...",https://codeup.com/it-training/it-certificatio...,"AWS, Google, Azure, Red Hat, CompTIA…these are..."


# Exercise 8

For each dataframe, produce the following columns:

* `title` to hold the title

* `original` to hold the original article/post content

* `clean` to hold the normalized and tokenized original with the stopwords removed

* `stemmed` to hold the stemmed version of the cleaned data

* `lemmatized` to hold the lemmatized version of the cleaned data


### For `news_df`:

In [49]:
# rename content column to original
news_df = news_df.rename(columns={'content':'original'})

In [50]:
# create a clean column with functions applied to original to clean and tokenize content
news_df['clean'] = news_df.original.apply(lambda x:basic_clean(x)).apply(lambda x:tokenize(x))

In [51]:
# create stemmed and lemmatized columns with functions applied to original to further clean
news_df = news_df.assign(stemmed = news_df.clean.apply(stem), lemmatized = news_df.clean.apply(lemmatize))
news_df

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,business,rbi has cancelled licence of maharashtrabased ...,rbi ha cancel licenc of maharashtrabas indepen...,rbi ha cancelled licence of maharashtrabased i...
1,Boost to EVs a big step: Windmill Capital,"Increased use of EVs in public transport, spec...",business,increased use of evs in public transport speci...,increas use of ev in public transport special ...,increased use of ev in public transport specia...
2,Facebook parent Meta's $230-billion wipeout bi...,Facebook's parent Meta's shares plunged 27% an...,business,facebook ' s parent meta ' s shares plunged 27...,facebook ' s parent meta ' s share plung 27 an...,facebook ' s parent meta ' s share plunged 27 ...
3,"Tesla co-worker used N-word, threw a hot tool ...",A former Tesla worker has filed a lawsuit agai...,business,a former tesla worker has filed a lawsuit agai...,a former tesla worker ha file a lawsuit agains...,a former tesla worker ha filed a lawsuit again...
4,Mark Zuckerberg loses $31 bn in one of the big...,Meta CEO Mark Zuckerberg's wealth dropped by $...,business,meta ceo mark zuckerberg ' s wealth dropped by...,meta ceo mark zuckerberg ' s wealth drop by 31...,meta ceo mark zuckerberg ' s wealth dropped by...
...,...,...,...,...,...,...
95,Greatest blessing: Riteish wishes Genelia on 1...,Actor Riteish Deshmukh on Thursday marked his ...,entertainment,actor riteish deshmukh on thursday marked his ...,actor riteish deshmukh on thursday mark hi 10t...,actor riteish deshmukh on thursday marked his ...
96,I don't care about all that: Jim Sarbh on soci...,"Actor Jim Sarbh, on being asked about craving ...",entertainment,actor jim sarbh on being asked about craving p...,actor jim sarbh on be ask about crave peopl ' ...,actor jim sarbh on being asked about craving p...
97,"'Bunty Aur Babli 2' numbers disappointing, but...","Actress Sharvari Wagh, who made her debut in '...",entertainment,actress sharvari wagh who made her debut in ' ...,actress sharvari wagh who made her debut in ' ...,actress sharvari wagh who made her debut in ' ...
98,Court directs Honey Singh to give voice sample...,Rapper Yo Yo Honey Singh has been directed by ...,entertainment,rapper yo yo honey singh has been directed by ...,rapper yo yo honey singh ha been direct by nag...,rapper yo yo honey singh ha been directed by n...


### For `codeup_df`:

In [52]:
# rename content column to original
codeup_df = codeup_df.rename(columns={'content':'original'})
codeup_df

Unnamed: 0,title,link,original
0,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...
1,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...
2,Is Codeup the Best Bootcamp in San Antonio…or ...,https://codeup.com/codeup-news/is-codeup-the-b...,Looking for the best data science bootcamp in ...
3,Codeup Launches First Podcast: Hire Tech,https://codeup.com/codeup-news/codeup-launches...,Any podcast enthusiasts out there? We are plea...
4,Codeup Start Dates for March 2022,https://codeup.com/codeup-news/codeup-start-da...,As we approach the end of January we wanted to...
5,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...
6,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...
7,Codeup Dallas Open House,https://codeup.com/dallas-newsletter/codeup-da...,Come join us for the re-opening of our Dallas ...
8,Codeup’s Placement Team Continues Setting Records,https://codeup.com/codeup-news/codeups-placeme...,Our Placement Team is simply defined as a grou...
9,"IT Certifications 101: Why They Matter, and Wh...",https://codeup.com/it-training/it-certificatio...,"AWS, Google, Azure, Red Hat, CompTIA…these are..."


In [54]:
# fill one null value with the reason for it's lack of content
codeup_df.original.fillna('video', inplace=True)
codeup_df

Unnamed: 0,title,link,original
0,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...
1,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...
2,Is Codeup the Best Bootcamp in San Antonio…or ...,https://codeup.com/codeup-news/is-codeup-the-b...,Looking for the best data science bootcamp in ...
3,Codeup Launches First Podcast: Hire Tech,https://codeup.com/codeup-news/codeup-launches...,Any podcast enthusiasts out there? We are plea...
4,Codeup Start Dates for March 2022,https://codeup.com/codeup-news/codeup-start-da...,As we approach the end of January we wanted to...
5,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...
6,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...
7,Codeup Dallas Open House,https://codeup.com/dallas-newsletter/codeup-da...,Come join us for the re-opening of our Dallas ...
8,Codeup’s Placement Team Continues Setting Records,https://codeup.com/codeup-news/codeups-placeme...,Our Placement Team is simply defined as a grou...
9,"IT Certifications 101: Why They Matter, and Wh...",https://codeup.com/it-training/it-certificatio...,"AWS, Google, Azure, Red Hat, CompTIA…these are..."


In [55]:
# create a clean column with functions applied to original to clean and tokenize content
codeup_df['clean'] = codeup_df.original.apply(lambda x:basic_clean(x)).apply(lambda x:tokenize(x))

In [56]:
# create stemmed and lemmatized columns with functions applied to original to further clean
codeup_df = codeup_df.assign(stemmed = codeup_df.clean.apply(stem), lemmatized = codeup_df.clean.apply(lemmatize))
codeup_df

Unnamed: 0,title,link,original,clean,stemmed,lemmatized
0,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...,we are so happy to announce that vet tec benef...,we are so happi to announc that vet tec benefi...,we are so happy to announce that vet tec benef...
1,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...,we are happy to announce that our dallas campu...,we are happi to announc that our dalla campu r...,we are happy to announce that our dallas campu...
2,Is Codeup the Best Bootcamp in San Antonio…or ...,https://codeup.com/codeup-news/is-codeup-the-b...,Looking for the best data science bootcamp in ...,looking for the best data science bootcamp in ...,look for the best data scienc bootcamp in the ...,looking for the best data science bootcamp in ...
3,Codeup Launches First Podcast: Hire Tech,https://codeup.com/codeup-news/codeup-launches...,Any podcast enthusiasts out there? We are plea...,any podcast enthusiasts out there we are pleas...,ani podcast enthusiast out there we are pleas ...,any podcast enthusiast out there we are please...
4,Codeup Start Dates for March 2022,https://codeup.com/codeup-news/codeup-start-da...,As we approach the end of January we wanted to...,as we approach the end of january we wanted to...,as we approach the end of januari we want to l...,a we approach the end of january we wanted to ...
5,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...,we are so happy to announce that vet tec benef...,we are so happi to announc that vet tec benefi...,we are so happy to announce that vet tec benef...
6,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...,we are happy to announce that our dallas campu...,we are happi to announc that our dalla campu r...,we are happy to announce that our dallas campu...
7,Codeup Dallas Open House,https://codeup.com/dallas-newsletter/codeup-da...,Come join us for the re-opening of our Dallas ...,come join us for the reopening of our dallas c...,come join us for the reopen of our dalla campu...,come join u for the reopening of our dallas ca...
8,Codeup’s Placement Team Continues Setting Records,https://codeup.com/codeup-news/codeups-placeme...,Our Placement Team is simply defined as a grou...,our placement team is simply defined as a grou...,our placement team is simpli defin as a group ...,our placement team is simply defined a a group...
9,"IT Certifications 101: Why They Matter, and Wh...",https://codeup.com/it-training/it-certificatio...,"AWS, Google, Azure, Red Hat, CompTIA…these are...",aws google azure red hat comptiathese are big ...,aw googl azur red hat comptiathes are big name...,aws google azure red hat comptiathese are big ...
