In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as aq

## Acquire Data

In [2]:
blogs = aq.get_blog_articles()
blogs.head()

Using cached .json file


Unnamed: 0,title,date,category,content
0,Learn to Code: Python Workshop on 4/23,2022-03-31,Events,"According to LinkedIn, the “#1 Most Promising ..."
1,Coming Soon: Cloud Administration,2022-03-17,Codeup News,We’re launching a new program out of San Anton...
2,5 Books Every Woman In Tech Should Read,2022-03-08,Featured,On this International Women’s Day 2022 we want...
3,Codeup Start Dates for March 2022,2022-01-26,Codeup News,As we approach the end of January we wanted to...
4,VET TEC Funding Now Available For Dallas Veterans,2022-01-07,Codeup News,We are so happy to announce that VET TEC benef...


In [3]:
articles = aq.get_news_articles()
articles.head()

Using cached .json file


Unnamed: 0,category,title,author,date,content
0,business,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,2022-05-09,The Indian rupee fell to an all-time low of 77...
1,business,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,2022-05-09,"Bitcoin fell on Monday to as low as $33,266 in..."
2,business,Rupee closes at all-time low of 77.50 against ...,Pragya Swastik,2022-05-09,The Indian rupee weakened further on Monday to...
3,business,Made best possible decision: IndiGo on barring...,Pragya Swastik,2022-05-09,IndiGo's CEO Ronojoy Dutta said the airline ma...
4,business,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,2022-05-09,"LIC's IPO, India's biggest IPO which opened on..."


## Exercises

**1) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:**

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [4]:
def basic_clean(string):
    """
    Apply basic normalization to a piece of text and return the result.

    The text is lowercased, reduced to plain ASCII (accents are stripped and
    characters with no ASCII form are dropped), and then filtered so that only
    lowercase letters, digits, whitespace and single quotes remain.
    """
    lowered = string.lower()

    # NFKD splits accented characters into a base letter plus combining marks;
    # the ASCII encode/decode round trip then discards the marks along with
    # any other character that has no ASCII representation.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('UTF-8')

    # Drop every character outside the allowed set: a-z, 0-9, whitespace, '
    return re.sub(r"[^a-z0-9\s']", '', ascii_text)

In [5]:
# Extract a string from one of my dataframes for testing

string = articles.content[0]
string

'The Indian rupee fell to an all-time low of 77.42 against the US dollar on Monday, Reuters reported. Asian markets were lower on Monday as US stock futures fell on fears of more policy tightening from the Federal Reserve and strict lockdown in Shanghai impacting global growth, according to Reuters.'

In [6]:
# Test my function 
cleaned = basic_clean(string)
cleaned

'the indian rupee fell to an alltime low of 7742 against the us dollar on monday reuters reported asian markets were lower on monday as us stock futures fell on fears of more policy tightening from the federal reserve and strict lockdown in shanghai impacting global growth according to reuters'

**2) Define a function named tokenize. It should take in a string and tokenize all the words in the string.**

In [7]:
def tokenize(string):
    """
    Tokenize text with NLTK's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the tokenized text comes
    back as a single string rather than as a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str=True)

In [8]:
# Test function 
tokenized = tokenize(cleaned)
tokenized

'the indian rupee fell to an alltime low of 7742 against the us dollar on monday reuters reported asian markets were lower on monday as us stock futures fell on fears of more policy tightening from the federal reserve and strict lockdown in shanghai impacting global growth according to reuters'

**3) Define a function named stem. It should accept some text and return the text after applying stemming to all the words.**

In [9]:
def stem(string):
    """
    Return the text with every word reduced to its Porter stem.

    The input is split on whitespace, each word is passed through NLTK's
    PorterStemmer, and the stems are joined back together with single spaces.

    Parameters:
        string: the text to stem.

    Returns:
        A string of the stemmed words separated by single spaces.
    """
    # Reach the stemmer through the documented nltk.stem package rather than
    # the top-level nltk.porter alias, which is not a documented entry point.
    stemmer = nltk.stem.PorterStemmer()

    return ' '.join(stemmer.stem(word) for word in string.split())

In [10]:
# Test function using the tokenized version of my string
stemmed = stem(tokenized)
stemmed

'the indian rupe fell to an alltim low of 7742 against the us dollar on monday reuter report asian market were lower on monday as us stock futur fell on fear of more polici tighten from the feder reserv and strict lockdown in shanghai impact global growth accord to reuter'

**4) Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.**

In [11]:
# Download the WordNet data used by the lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/krivera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def lemmatize(string):
    """
    Return the text with each word replaced by its WordNet lemma.

    The input is split on whitespace, every word is run through NLTK's
    WordNetLemmatizer, and the results are joined with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [13]:
# Test on my tokenized string
lemmatized = lemmatize(tokenized)
lemmatized

'the indian rupee fell to an alltime low of 7742 against the u dollar on monday reuters reported asian market were lower on monday a u stock future fell on fear of more policy tightening from the federal reserve and strict lockdown in shanghai impacting global growth according to reuters'

**5) Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.**

*This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.*

In [14]:
# Download the NLTK stopwords list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krivera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Return the text with stop words filtered out.

    The stop word set starts from NLTK's standard English list, is extended
    with extra_words, and then has exclude_words taken out of it. The text is
    split on whitespace and every word found in the resulting set is dropped.

    Parameters:
        string: the text to filter.
        extra_words: optional iterable of additional words to remove.
        exclude_words: optional iterable of words that must NOT be removed.
            Exclusion wins over extra_words and the NLTK list, and words that
            are not stop words to begin with are silently ignored.

    Returns:
        A string of the remaining words separated by single spaces.
    """
    # A set gives constant-time membership checks and makes add/remove safe
    # for duplicates and for words that were never in the list.
    stop_words = set(stopwords.words('english'))

    if extra_words is not None:
        stop_words.update(extra_words)

    # difference_update does not raise for words missing from the set,
    # unlike list.remove, which raised ValueError.
    if exclude_words is not None:
        stop_words.difference_update(exclude_words)

    return ' '.join(word for word in string.split() if word not in stop_words)

In [16]:
# Test function on my tokenized string
stopwords_removed = remove_stopwords(tokenized)
stopwords_removed

'indian rupee fell alltime low 7742 us dollar monday reuters reported asian markets lower monday us stock futures fell fears policy tightening federal reserve strict lockdown shanghai impacting global growth according reuters'

In [17]:
# Test using extra_words and exclude_words options
extra_words = ['rupee', 'strict']
exclude_words = ['the']

filtered = remove_stopwords(tokenized, extra_words, exclude_words)
filtered

'the indian fell alltime low 7742 the us dollar monday reuters reported asian markets lower monday us stock futures fell fears policy tightening the federal reserve lockdown shanghai impacting global growth according reuters'

**6) Use the data from your acquire module to produce a dataframe of the news articles. Name the dataframe news_df.**

In [18]:
news_df = aq.get_news_articles()
news_df

Using cached .json file


Unnamed: 0,category,title,author,date,content
0,business,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,2022-05-09,The Indian rupee fell to an all-time low of 77...
1,business,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,2022-05-09,"Bitcoin fell on Monday to as low as $33,266 in..."
2,business,Rupee closes at all-time low of 77.50 against ...,Pragya Swastik,2022-05-09,The Indian rupee weakened further on Monday to...
3,business,Made best possible decision: IndiGo on barring...,Pragya Swastik,2022-05-09,IndiGo's CEO Ronojoy Dutta said the airline ma...
4,business,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,2022-05-09,"LIC's IPO, India's biggest IPO which opened on..."
...,...,...,...,...,...
94,entertainment,"Nimrat Kaur shares pic of tattoo, says got it ...",Kriti Kambiri,2022-05-09,Actress Nimrat Kaur shared a series of picture...
95,entertainment,"U2's Bono, The Edge perform at bomb shelter in...",Kriti Kambiri,2022-05-09,Irish band U2's members Bono and The Edge perf...
96,entertainment,Malavika Mohanan to star opposite Prabhas in t...,Udit Gupta,2022-05-09,Malavika Mohanan has reportedly been roped in ...
97,entertainment,Took me a while to break it: Isha Koppikar on ...,Mahima Kharbanda,2022-05-09,"Isha Koppikar, who has been a part of several ..."


**7) Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.**

In [19]:
codeup_df = aq.get_blog_articles()
codeup_df

Using cached .json file


Unnamed: 0,title,date,category,content
0,Learn to Code: Python Workshop on 4/23,2022-03-31,Events,"According to LinkedIn, the “#1 Most Promising ..."
1,Coming Soon: Cloud Administration,2022-03-17,Codeup News,We’re launching a new program out of San Anton...
2,5 Books Every Woman In Tech Should Read,2022-03-08,Featured,On this International Women’s Day 2022 we want...
3,Codeup Start Dates for March 2022,2022-01-26,Codeup News,As we approach the end of January we wanted to...
4,VET TEC Funding Now Available For Dallas Veterans,2022-01-07,Codeup News,We are so happy to announce that VET TEC benef...
5,Dallas Campus Re-opens With New Grant Partner,2021-12-30,Codeup News,We are happy to announce that our Dallas campu...
6,Codeup’s Placement Team Continues Setting Records,2021-11-19,Codeup News,Our Placement Team is simply defined as a grou...
7,"IT Certifications 101: Why They Matter, and Wh...",2021-11-18,IT Training,"AWS, Google, Azure, Red Hat, CompTIA…these are..."
8,A rise in cyber attacks means opportunities fo...,2021-11-17,Cybersecurity,"In the last few months, the US has experienced..."
9,Use your GI Bill® benefits to Land a Job in Tech,2021-11-04,Codeup News,"As the end of military service gets closer, ma..."


**8) For each dataframe, produce the following columns:**
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [20]:
# Rename the content column to 'original' in both dataframes
news_df = news_df.rename(columns={'content': 'original'})
codeup_df = codeup_df.rename(columns={'content': 'original'})

In [21]:
## Build the derived text columns for news_df

# clean: normalize, tokenize, then drop stopwords from the original text
news_df['clean'] = (
    news_df['original']
    .apply(basic_clean)
    .apply(tokenize)
    .apply(remove_stopwords)
)

# stemmed and lemmatized are both derived from the clean column
news_df['stemmed'] = news_df['clean'].apply(stem)
news_df['lemmatized'] = news_df['clean'].apply(lemmatize)

news_df

Unnamed: 0,category,title,author,date,original,clean,stemmed,lemmatized
0,business,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,2022-05-09,The Indian rupee fell to an all-time low of 77...,indian rupee fell alltime low 7742 us dollar m...,indian rupe fell alltim low 7742 us dollar mon...,indian rupee fell alltime low 7742 u dollar mo...
1,business,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,2022-05-09,"Bitcoin fell on Monday to as low as $33,266 in...",bitcoin fell monday low 33266 morning trade ne...,bitcoin fell monday low 33266 morn trade near ...,bitcoin fell monday low 33266 morning trade ne...
2,business,Rupee closes at all-time low of 77.50 against ...,Pragya Swastik,2022-05-09,The Indian rupee weakened further on Monday to...,indian rupee weakened monday close new alltime...,indian rupe weaken monday close new alltim low...,indian rupee weakened monday close new alltime...
3,business,Made best possible decision: IndiGo on barring...,Pragya Swastik,2022-05-09,IndiGo's CEO Ronojoy Dutta said the airline ma...,indigo ' ceo ronojoy dutta said airline made b...,indigo ' ceo ronojoy dutta said airlin made be...,indigo ' ceo ronojoy dutta said airline made b...
4,business,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,2022-05-09,"LIC's IPO, India's biggest IPO which opened on...",lic ' ipo india ' biggest ipo opened may 4 clo...,lic ' ipo india ' biggest ipo open may 4 close...,lic ' ipo india ' biggest ipo opened may 4 clo...
...,...,...,...,...,...,...,...,...
94,entertainment,"Nimrat Kaur shares pic of tattoo, says got it ...",Kriti Kambiri,2022-05-09,Actress Nimrat Kaur shared a series of picture...,actress nimrat kaur shared series pictures ins...,actress nimrat kaur share seri pictur instagra...,actress nimrat kaur shared series picture inst...
95,entertainment,"U2's Bono, The Edge perform at bomb shelter in...",Kriti Kambiri,2022-05-09,Irish band U2's members Bono and The Edge perf...,irish band u2 ' members bono edge performed bo...,irish band u2 ' member bono edg perform bomb s...,irish band u2 ' member bono edge performed bom...
96,entertainment,Malavika Mohanan to star opposite Prabhas in t...,Udit Gupta,2022-05-09,Malavika Mohanan has reportedly been roped in ...,malavika mohanan reportedly roped play female ...,malavika mohanan reportedli rope play femal le...,malavika mohanan reportedly roped play female ...
97,entertainment,Took me a while to break it: Isha Koppikar on ...,Mahima Kharbanda,2022-05-09,"Isha Koppikar, who has been a part of several ...",isha koppikar part several movies spoke typeca...,isha koppikar part sever movi spoke typecast s...,isha koppikar part several movie spoke typecas...


In [22]:
## Build the same derived text columns for codeup_df

# clean: normalize, tokenize, then drop stopwords from the original text
codeup_df['clean'] = (
    codeup_df['original']
    .apply(basic_clean)
    .apply(tokenize)
    .apply(remove_stopwords)
)

# stemmed and lemmatized are both derived from the clean column
codeup_df['stemmed'] = codeup_df['clean'].apply(stem)
codeup_df['lemmatized'] = codeup_df['clean'].apply(lemmatize)

codeup_df

Unnamed: 0,title,date,category,original,clean,stemmed,lemmatized
0,Learn to Code: Python Workshop on 4/23,2022-03-31,Events,"According to LinkedIn, the “#1 Most Promising ...",according linkedin 1 promising job data scienc...,accord linkedin 1 promis job data scienc codeu...,according linkedin 1 promising job data scienc...
1,Coming Soon: Cloud Administration,2022-03-17,Codeup News,We’re launching a new program out of San Anton...,launching new program san antonio acquisition ...,launch new program san antonio acquisit racksp...,launching new program san antonio acquisition ...
2,5 Books Every Woman In Tech Should Read,2022-03-08,Featured,On this International Women’s Day 2022 we want...,international womens day 2022 wanted tell stor...,intern women day 2022 want tell stori women te...,international woman day 2022 wanted tell story...
3,Codeup Start Dates for March 2022,2022-01-26,Codeup News,As we approach the end of January we wanted to...,approach end january wanted look forward next ...,approach end januari want look forward next st...,approach end january wanted look forward next ...
4,VET TEC Funding Now Available For Dallas Veterans,2022-01-07,Codeup News,We are so happy to announce that VET TEC benef...,happy announce vet tec benefits available used...,happi announc vet tec benefit avail use campu ...,happy announce vet tec benefit available used ...
5,Dallas Campus Re-opens With New Grant Partner,2021-12-30,Codeup News,We are happy to announce that our Dallas campu...,happy announce dallas campus reopened better y...,happi announc dalla campu reopen better yet ne...,happy announce dallas campus reopened better y...
6,Codeup’s Placement Team Continues Setting Records,2021-11-19,Codeup News,Our Placement Team is simply defined as a grou...,placement team simply defined group manages re...,placement team simpli defin group manag relati...,placement team simply defined group manages re...
7,"IT Certifications 101: Why They Matter, and Wh...",2021-11-18,IT Training,"AWS, Google, Azure, Red Hat, CompTIA…these are...",aws google azure red hat comptiathese big name...,aw googl azur red hat comptiathes big name pro...,aws google azure red hat comptiathese big name...
8,A rise in cyber attacks means opportunities fo...,2021-11-17,Cybersecurity,"In the last few months, the US has experienced...",last months us experienced dozens major cybera...,last month us experienc dozen major cyberattac...,last month u experienced dozen major cyberatta...
9,Use your GI Bill® benefits to Land a Job in Tech,2021-11-04,Codeup News,"As the end of military service gets closer, ma...",end military service gets closer many transiti...,end militari servic get closer mani transit se...,end military service get closer many transitio...


**9) Ask yourself:**
- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?


- 493KB is pretty small, so the extra time lemmatizing takes is negligible and I would prefer to use lemmatized text.
- 25MB is larger but still not too large, so I'd still prefer to use lemmatized text.
- 200TB is a very large corpus, and in this case I'd prefer to use stemmed text to save on computational resources and cost.

## Test out my prepare.py

In [23]:
articles.head()

Unnamed: 0,category,title,author,date,content
0,business,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,2022-05-09,The Indian rupee fell to an all-time low of 77...
1,business,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,2022-05-09,"Bitcoin fell on Monday to as low as $33,266 in..."
2,business,Rupee closes at all-time low of 77.50 against ...,Pragya Swastik,2022-05-09,The Indian rupee weakened further on Monday to...
3,business,Made best possible decision: IndiGo on barring...,Pragya Swastik,2022-05-09,IndiGo's CEO Ronojoy Dutta said the airline ma...
4,business,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,2022-05-09,"LIC's IPO, India's biggest IPO which opened on..."


In [25]:
# NOTE(review): notebook imports conventionally live in the first import cell —
# consider moving this one there.
import prepare as prep

# Test out the prepare function on a df: run prepare.py's prep_article_data on
# the news articles, passing 'content' as the second argument (presumably the
# name of the text column to process — verify against prepare.py).
prep.prep_article_data(articles, 'content')

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Rupee hits all-time low of 77.42 against US do...,The Indian rupee fell to an all-time low of 77...,indian rupee fell alltime low 7742 us dollar m...,indian rupe fell alltim low 7742 us dollar mon...,indian rupee fell alltime low 7742 u dollar mo...
1,Bitcoin falls to the lowest level since Januar...,"Bitcoin fell on Monday to as low as $33,266 in...",bitcoin fell monday low 33266 morning trade ne...,bitcoin fell monday low 33266 morn trade near ...,bitcoin fell monday low 33266 morning trade ne...
2,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,indian rupee weakened monday close new alltime...,indian rupe weaken monday close new alltim low...,indian rupee weakened monday close new alltime...
3,Made best possible decision: IndiGo on barring...,IndiGo's CEO Ronojoy Dutta said the airline ma...,indigo ' ceo ronojoy dutta said airline made b...,indigo ' ceo ronojoy dutta said airlin made be...,indigo ' ceo ronojoy dutta said airline made b...
4,India's biggest IPO of LIC subscribed nearly 3...,"LIC's IPO, India's biggest IPO which opened on...",lic ' ipo india ' biggest ipo opened may 4 clo...,lic ' ipo india ' biggest ipo open may 4 close...,lic ' ipo india ' biggest ipo opened may 4 clo...
...,...,...,...,...,...
94,"Nimrat Kaur shares pic of tattoo, says got it ...",Actress Nimrat Kaur shared a series of picture...,actress nimrat kaur shared series pictures ins...,actress nimrat kaur share seri pictur instagra...,actress nimrat kaur shared series picture inst...
95,"U2's Bono, The Edge perform at bomb shelter in...",Irish band U2's members Bono and The Edge perf...,irish band u2 ' members bono edge performed bo...,irish band u2 ' member bono edg perform bomb s...,irish band u2 ' member bono edge performed bom...
96,Malavika Mohanan to star opposite Prabhas in t...,Malavika Mohanan has reportedly been roped in ...,malavika mohanan reportedly roped play female ...,malavika mohanan reportedli rope play femal le...,malavika mohanan reportedly roped play female ...
97,Took me a while to break it: Isha Koppikar on ...,"Isha Koppikar, who has been a part of several ...",isha koppikar part several movies spoke typeca...,isha koppikar part sever movi spoke typecast s...,isha koppikar part several movie spoke typecas...
