# Prepare Exercises

__1) Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:__

* Lowercase everything
* Normalize unicode characters
* Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
import numpy as np
import pandas as pd
import acquire
import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [6]:
#Acquire data for testing
blog_articles = acquire.get_blog_articles()

news_articles = acquire.get_news_articles()

In [5]:
def basic_clean(string):
    """
    This function will perform basic cleaning of a string. It will reduce all characters 
    to lower case, normalize unicode characters, and remove anything that is not a 
    letter, number, whitespace, or a single quote.
    """
    
    #Lower case everything
    string = string.lower()
    
    #Normalize unicode characters, 
    #encode into ascii byte strings and ignore unknown chars,
    #decode back into a UTF-8 string that we can work with
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('UTF-8')
    
    #Use regex to replace anything that is not a letter, number, whitespace, or a single quote
    string = re.sub(r"[^a-z0-9\s']", '', string)
    
    return string

In [8]:
original = news_articles.content[0]
original

'A WHO technical advisory group which met on Tuesday to consider Bharat Biotech\'s COVID-19 vaccine Covaxin for emergency use listing is likely to announce its decision soon. "If all is in place and all goes well and if the committee is satisfied, we would expect a recommendation within the next 24 hours or so," WHO spokesperson Margaret Harris told reporters. '

In [9]:
#For testing
cleaned = basic_clean(original)
cleaned

"a who technical advisory group which met on tuesday to consider bharat biotech's covid19 vaccine covaxin for emergency use listing is likely to announce its decision soon if all is in place and all goes well and if the committee is satisfied we would expect a recommendation within the next 24 hours or so who spokesperson margaret harris told reporters "

__2) Define a function named tokenize. It should take in a string and tokenize all the words in the string.__

In [10]:
def tokenize(string):
    """
    This function will tokenize all the words in the given string and return the 
    tokenized string.
    """
    
    #Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    #Use the tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

In [11]:
#For testing
#untokenized
cleaned

"a who technical advisory group which met on tuesday to consider bharat biotech's covid19 vaccine covaxin for emergency use listing is likely to announce its decision soon if all is in place and all goes well and if the committee is satisfied we would expect a recommendation within the next 24 hours or so who spokesperson margaret harris told reporters "

In [12]:
#Tokenized
tokenized = tokenize(cleaned)
tokenized

"a who technical advisory group which met on tuesday to consider bharat biotech ' s covid19 vaccine covaxin for emergency use listing is likely to announce its decision soon if all is in place and all goes well and if the committee is satisfied we would expect a recommendation within the next 24 hours or so who spokesperson margaret harris told reporters"

__3) Define a function named stem. It should accept some text and return the text after applying stemming to all the words.__

In [13]:
def stem(string):
    """
    This function will accept some text and return a stemmed version of the text.
    """
    
    #Create porter stemmer
    ps = nltk.porter.PorterStemmer()
    
    #Apply the stemmer to each word in the string to create a list of stemmed words
    stems = [ps.stem(word) for word in string.split()]
    
    #join our list of stemmed words into a string
    string_stemmed = ' '.join(stems)
    
    return string_stemmed

In [14]:
#For testing
#unstemmed
tokenized

"a who technical advisory group which met on tuesday to consider bharat biotech ' s covid19 vaccine covaxin for emergency use listing is likely to announce its decision soon if all is in place and all goes well and if the committee is satisfied we would expect a recommendation within the next 24 hours or so who spokesperson margaret harris told reporters"

In [15]:
#stemmed
stemmed = stem(tokenized)
stemmed

"a who technic advisori group which met on tuesday to consid bharat biotech ' s covid19 vaccin covaxin for emerg use list is like to announc it decis soon if all is in place and all goe well and if the committe is satisfi we would expect a recommend within the next 24 hour or so who spokesperson margaret harri told report"

__4) Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.__

In [16]:
# Need to download this the first time.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnathonsmith/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
def lemmatize(string):
    """
    This function accepts some text and returns the lemmatized version of the string.
    """
    
    #Create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    #Use the lemmatizer on each word in the string to create a list of lemmatized words
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    #Join the lemmatized words into one string
    string_lemmatized = ' '.join(lemmas)
    
    return string_lemmatized

In [18]:
#For testing
#Unlemmatized
tokenized

"a who technical advisory group which met on tuesday to consider bharat biotech ' s covid19 vaccine covaxin for emergency use listing is likely to announce its decision soon if all is in place and all goes well and if the committee is satisfied we would expect a recommendation within the next 24 hours or so who spokesperson margaret harris told reporters"

In [19]:
#Lemmatized
lemmatized = lemmatize(tokenized)
lemmatized

"a who technical advisory group which met on tuesday to consider bharat biotech ' s covid19 vaccine covaxin for emergency use listing is likely to announce it decision soon if all is in place and all go well and if the committee is satisfied we would expect a recommendation within the next 24 hour or so who spokesperson margaret harris told reporter"

__5) Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.__

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [23]:
#Download the stopword corpus
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnathonsmith/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [35]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    """
    This function will accept a string and return a version of the text without any stopwords.
    It will also allow the user to add extra words to remove or exclude words from the removal list.
    """
    #Get the standard english stop word list from nltk
    stop_words = stopwords.words('english')
    
    #Add the extra words to be removed to the stop word list
    stop_words.append(extra_words)
    
    #Remove the words to be excluded from the stop word list
    stop_words.remove(exclude_words)
    
    #Create a list of words to be checked by splitting the given string
    words = string.split()
    
    #Now filter out all of the stop words
    filtered_words = [word for word in words if word not in stop_words]
    
    #Join the list of filtered words into a string
    filtered_string = ' '.join(filtered_words)
    
    return filtered_string

In [36]:
#For testing
#String with stop words
lemmatized

"a who technical advisory group which met on tuesday to consider bharat biotech ' s covid19 vaccine covaxin for emergency use listing is likely to announce it decision soon if all is in place and all go well and if the committee is satisfied we would expect a recommendation within the next 24 hour or so who spokesperson margaret harris told reporter"

In [39]:
#Create a list of extra words and words to exclude
extra_words = ['group', 'met', 'tuesday']
exclude_words = ['a', 'which']

In [40]:
#String without stop words
filtered = remove_stopwords(lemmatized, extra_words, exclude_words)
filtered

ValueError: list.remove(x): x not in list