In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
import acquire as a
from time import strftime

import warnings
warnings.filterwarnings('ignore')

In [2]:
original = a.get_codeup_articles()
print(original)

                                                title     published  \
0              Learn to Code: Python Workshop on 4/23  Mar 31, 2022   
1                   Coming Soon: Cloud Administration  Mar 17, 2022   
2             5 Books Every Woman In Tech Should Read   Mar 8, 2022   
3                   Codeup Start Dates for March 2022  Jan 26, 2022   
4   VET TEC Funding Now Available For Dallas Veterans   Jan 7, 2022   
5       Dallas Campus Re-opens With New Grant Partner  Dec 30, 2021   
6   Codeup’s Placement Team Continues Setting Records  Nov 19, 2021   
7   IT Certifications 101: Why They Matter, and Wh...  Nov 18, 2021   
8   A rise in cyber attacks means opportunities fo...  Nov 17, 2021   
9    Use your GI Bill® benefits to Land a Job in Tech   Nov 4, 2021   
10  Which program is right for me: Cyber Security ...  Oct 28, 2021   
11               What the Heck is System Engineering?  Oct 21, 2021   
12     From Speech Pathology to Business Intelligence  Oct 18, 2021   
13    

#### 1.)Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
def basic_clean(string):
    '''
    Function takes in a string and returns the same string normalized
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .mdecode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string
    

#### 2.) Define a function named tokenize. 
It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    
    return string


#### 3.) Define a function named stem. 
It should accept some text and return the text after applying stemming to all the words.

In [5]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string


#### 4.) Define a function named lemmatize. 
It should accept some text and return the text after applying lemmatization to each word.

In [6]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

#### 5.) Define a function named remove_stopwords. 
It should accept some text and return the text after removing all the stopwords. This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [7]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

#### 6.) Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [8]:
news_df = a.get_inshorts_articles()
news_df.head()

Getting articles for business
Getting articles for sports
Getting articles for entertainment
Getting articles for technology


Unnamed: 0,category,title,content,author,published
0,business,India's retail inflation surges to 7.79% in Ap...,India's retail inflation surged to 7.79% in Ap...,Pragya Swastik,2022-05-12T12:41:14.000Z
1,business,"Rupee hits new all-time low, slips to 77.55 ag...",The Indian rupee has touched a fresh all-time ...,Apaar Sharma,2022-05-12T05:58:52.000Z
2,business,List of 10 highest-paid sportspersons released...,Argentina and PSG forward Lionel Messi was the...,Anmol Sharma,2022-05-12T05:22:54.000Z
3,business,Saudi Aramco dethrones Apple as world's most v...,World's biggest crude exporter Saudi Aramco ha...,Anmol Sharma,2022-05-12T05:40:33.000Z
4,business,Over $200 billion wiped off cryptocurrency mar...,More than $200 billion of wealth was wiped out...,Pragya Swastik,2022-05-12T12:19:46.000Z


#### 7.) Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [9]:
codeup_df = a.get_codeup_articles()
codeup_df.head()

Unnamed: 0,title,published,content
0,Learn to Code: Python Workshop on 4/23,"Mar 31, 2022","According to LinkedIn, the “#1 Most Promising ..."
1,Coming Soon: Cloud Administration,"Mar 17, 2022",We’re launching a new program out of San Anton...
2,5 Books Every Woman In Tech Should Read,"Mar 8, 2022",On this International Women’s Day 2022 we want...
3,Codeup Start Dates for March 2022,"Jan 26, 2022",As we approach the end of January we wanted to...
4,VET TEC Funding Now Available For Dallas Veterans,"Jan 7, 2022",We are so happy to announce that VET TEC benef...


#### 8.) For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [10]:
news_df.rename(columns={'content': 'original'}, inplace=True)
codeup_df.rename(columns={'content': 'original'}, inplace=True)

In [11]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

In [12]:
ex_df = basic_clean(news_df.title[0])
ex_df

'indias retail inflation surges to 779 in april highest in 8 years'

In [13]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

In [14]:
tok_df = tokenize(ex_df)
tok_df

'indias retail inflation surges to 779 in april highest in 8 years'

In [15]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

In [16]:
stem_df = stem(tok_df)
stem_df

'india retail inflat surg to 779 in april highest in 8 year'

In [17]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [18]:
lem_df = lemmatize(stem_df)
lem_df

'india retail inflat surg to 779 in april highest in 8 year'

In [19]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [20]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

news_df prepped...

In [21]:

prep_article_data(news_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,India's retail inflation surges to 7.79% in Ap...,India's retail inflation surged to 7.79% in Ap...,indias retail inflation surged 779 april 2022 ...,india retail inflat surg 779 april 2022 highes...,india retail inflation surged 779 april 2022 h...
1,"Rupee hits new all-time low, slips to 77.55 ag...",The Indian rupee has touched a fresh all-time ...,indian rupee touched fresh alltime low 7755 us...,indian rupe touch fresh alltim low 7755 us dol...,indian rupee touched fresh alltime low 7755 u ...
2,List of 10 highest-paid sportspersons released...,Argentina and PSG forward Lionel Messi was the...,argentina psg forward lionel messi highestpaid...,argentina psg forward lionel messi wa highestp...,argentina psg forward lionel messi wa highestp...
3,Saudi Aramco dethrones Apple as world's most v...,World's biggest crude exporter Saudi Aramco ha...,worlds biggest crude exporter saudi aramco ove...,world biggest crude export saudi aramco overta...,world biggest crude exporter saudi aramco over...
4,Over $200 billion wiped off cryptocurrency mar...,More than $200 billion of wealth was wiped out...,200 billion wealth wiped cryptocurrency market...,200 billion wealth wa wipe cryptocurr market 2...,200 billion wealth wa wiped cryptocurrency mar...


code_up_df prepped...

In [22]:
prep_article_data(codeup_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",according linkedin 1 promising job data scienc...,accord linkedin 1 promis job data scienc codeu...,according linkedin 1 promising job data scienc...
1,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...,launching new program san antonio acquisition ...,launch new program san antonio acquisit racksp...,launching new program san antonio acquisition ...
2,5 Books Every Woman In Tech Should Read,On this International Women’s Day 2022 we want...,international womens day 2022 wanted tell stor...,thi intern women day 2022 want tell stori wome...,international woman day 2022 wanted tell story...
3,Codeup Start Dates for March 2022,As we approach the end of January we wanted to...,approach end january wanted look forward next ...,approach end januari want look forward next st...,approach end january wanted look forward next ...
4,VET TEC Funding Now Available For Dallas Veterans,We are so happy to announce that VET TEC benef...,happy announce vet tec benefits available used...,happi announc vet tec benefit avail use campu ...,happy announce vet tec benefit available used ...


#### Ask yourself:

If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? 
##### Lemmatizing

If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
##### Lemmatizing or Stemming

If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? 
##### Stemming
