In [20]:
import unicodedata
import re
import json
import os
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

### Exercise 1
Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

Lowercase everything
Normalize unicode characters
Remove anything that is not a letter, number, whitespace, or a single quote (i.e. replace it with an empty string).

In [21]:
#Define a function to clean text data
def basic_clean(string):
    """
    Normalize raw text for the later tokenizing/stemming steps.

    Lowercases the text, folds unicode characters to ASCII (characters with
    no ASCII form are dropped), then deletes every character that is not a
    lowercase letter, a digit, whitespace, or a single quote.

    Returns the cleaned string.
    """
    # NFKD splits an accented letter into its base letter plus a combining
    # mark; the ASCII encode keeps the base letter and discards the mark.
    ascii_bytes = unicodedata.normalize('NFKD', string.lower()).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    return re.sub(r"[^a-z0-9\s']", '', ascii_text)
# Test basic_clean on a string mixing case, punctuation, and an accented character
messy_string = 'ThiS is% .thE mé_ssiest STRING o^f all timE!!!!!'

# Keep the result: the tokenize test further down takes clean_string as input
clean_string = basic_clean(messy_string)

# Bare expression so the notebook displays the cleaned string
clean_string

'this is the messiest string of all time'

### Exercise 2
Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [22]:
#Define a function to tokenize text
def tokenize(string):
    """
    Tokenize text with NLTK's ToktokTokenizer.

    return_str=True is passed, so the result is the tokenized text as one
    string rather than a list of tokens.
    """
    return ToktokTokenizer().tokenize(string, return_str=True)
# Test tokenize on the cleaned string produced by the basic_clean test
tokenized_string = tokenize(clean_string)

# Bare expression so the notebook displays the tokenized string
tokenized_string

'this is the messiest string of all time'

### Exercise 3
Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [23]:
#Define a function to stem text data using the PorterStemmer
def stem(string):
    """
    Stem every whitespace-separated word of the text with the PorterStemmer.

    Returns the stems joined back together with single spaces.
    """
    porter = nltk.porter.PorterStemmer()

    return ' '.join(porter.stem(token) for token in string.split())
# Test stem on the tokenized string
stemmed_string = stem(tokenized_string)

# Bare expression so the notebook displays the stemmed string
stemmed_string

'thi is the messiest string of all time'

### Exercise 4
Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [24]:
#Define a function to lemmatize text
def lemmatize(string):
    """
    Lemmatize every whitespace-separated word of the text with the
    WordNetLemmatizer.

    Returns the lemmas joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    return ' '.join(map(lemmatizer.lemmatize, string.split()))
# Test lemmatize on the tokenized string
lemma_string = lemmatize(tokenized_string)

# Bare expression so the notebook displays the lemmatized string
lemma_string

'this is the messiest string of all time'

### Exercise 5
Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [25]:
#Define a function to remove stopwords from a string of text
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from whitespace-separated text.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.
    extra_words : str or iterable of str, optional
        Additional words to treat as stopwords, i.e. words that are also
        removed from the text.
    exclude_words : str or iterable of str, optional
        Words to take out of the stopword list, i.e. words that are kept in
        the text. Words that are not in the stopword list are ignored.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    # A set gives constant-time membership checks in the filter below.
    stopword_set = set(stopwords.words('english'))

    # A bare string is treated as one word rather than as a sequence of characters.
    if isinstance(extra_words, str):
        extra_words = [extra_words]
    if isinstance(exclude_words, str):
        exclude_words = [exclude_words]

    if extra_words:
        stopword_set.update(extra_words)

    if exclude_words:
        # difference_update does not raise for words missing from the set,
        # unlike list.remove.
        stopword_set.difference_update(exclude_words)

    kept_words = [word for word in string.split() if word not in stopword_set]

    return ' '.join(kept_words)
# Test remove_stopwords on the lemmatized string, without extra_words or exclude_words
no_stop_string = remove_stopwords(lemma_string)

# Bare expression so the notebook displays the filtered string
no_stop_string

'messiest string time'

### Exercise 6
Use your data from the acquire exercises to produce a dataframe of the news articles. Name the dataframe news_df.

In [26]:
def get_news_articles():
    """
    Return the news articles, preferring the local JSON cache.

    When 'news_articles.json' exists in the working directory its parsed
    contents are returned. Otherwise each topic is scraped with
    scrape_one_page, the combined list is written to that file, and the
    list is returned.
    """
    cache_path = 'news_articles.json'

    if not os.path.exists(cache_path):
        # NOTE(review): scrape_one_page is not defined in the visible part of
        # this notebook -- confirm it is defined or imported before this runs.
        articles = [
            article
            for topic in ['business', 'sports', 'technology', 'entertainment']
            for article in scrape_one_page(topic)
        ]

        with open(cache_path, 'w') as cache_file:
            json.dump(articles, cache_file)

        return articles

    with open(cache_path) as cache_file:
        return json.load(cache_file)

In [27]:
# Run the acquire function to get the news articles. Is there an easier way to get them into a dataframe?
news_json = get_news_articles()

# NOTE(review): this displays the entire list; a slice such as news_json[:2] would keep the output short
news_json

[{'title': 'FBI notified Twitter of at least 1 Chinese agent working at the firm: Whistleblower',
  'content': "Former Twitter executive-turned-whistleblower Peiter Zatko has stated that Federal Bureau of Investigation (FBI) had notified Twitter of at least one Chinese agent working at the firm, Senator Chuck Grassley said. Zatko added that some Twitter employees were concerned that Chinese government would be able to collect data on some Twitter users. Zatko used to work as Twitter's head of security.",
  'category': 'business'},
 {'title': 'Vedanta, Foxconn to invest ₹1,54,000 cr in Guj for semiconductor plant',
  'content': "Vedanta and Taiwan's Foxconn have signed a Memorandum of Understanding (MoU) with the Gujarat government to invest ₹1,54,000 crore for setting up a display fabrication and semiconductor facility in the state. Gujarat Chief Minister Bhupendra Patel said that the facility will create one lakh job opportunities. Foxconn is acting as the technical partner, while Ved

In [28]:
# Yes! pandas can read the same 'news_articles.json' file that get_news_articles uses as its cache
news_df = pd.read_json('news_articles.json')

# Preview the first rows
news_df.head()

Unnamed: 0,title,content,category
0,FBI notified Twitter of at least 1 Chinese age...,Former Twitter executive-turned-whistleblower ...,business
1,"Vedanta, Foxconn to invest ₹1,54,000 cr in Guj...",Vedanta and Taiwan's Foxconn have signed a Mem...,business
2,US inflation eases to 8.3% in August; falls fo...,The US' consumer price index (CPI) rose 8.3% i...,business
3,"NYT offers lunch boxes as return to work perk,...",The New York Times offered its staff branded l...,business
4,US firm offers 10% hike to staff on notice per...,A marketing firm in the US offers its employee...,business


### Exercise 7
Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

### Exercise 8
For each dataframe, produce the following columns:

title: to hold the title
original: to hold the original article/post content
clean: to hold the normalized and tokenized original with the stopwords removed.
stemmed: to hold the stemmed version of the cleaned data.
lemmatized: to hold the lemmatized version of the cleaned data.