In [1]:
import unicodedata
import re
import json
import os

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
# nltk.download('all')
# nltk.download('stopwords')

import pandas as pd
import numpy as np

import wrangle as wr

Exercises
* The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

* Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
    * Lowercase everything
    * Normalize unicode characters
    * Replace anything that is not a letter, number, whitespace or a single quote.
    
* Define a function named tokenize. It should take in a string and tokenize all the words in the string.

* Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

* Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

* Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
    * This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

* Use your data from the acquire exercise to produce a dataframe of the news articles. Name the dataframe news_df.

* Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

* For each dataframe, produce the following columns:
    * title to hold the title
    * original to hold the original article/post content
    * clean to hold the normalized and tokenized original with the stopwords removed.
    * stemmed to hold the stemmed version of the cleaned data.
    * lemmatized to hold the lemmatized version of the cleaned data.

Ask yourself:
* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    * Lemmatized. The corpus is small enough that the slower, more accurate lemmatization costs almost nothing.
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    * Either works: lemmatize if the longer run time is acceptable, otherwise stem.
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    * Stemmed! Stemming is the cheaper operation, and I'm not made of money lol


In [70]:
# Fetch the Codeup blog posts through the project-local wrangle module.
# The output shown for this cell is a list of dicts with 'title', 'link',
# 'date_published' and 'content' keys.
blogs = wr.get_blog_articles()
blogs

[{'title': 'Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa',
  'link': 'https://codeup.com/featured/apida-heritage-month/',
  'date_published': 'May 24, 2023',
  'content': ' May is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.  In an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers. Arbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen. At Codeup we take our efforts at inclusivity very

In [72]:
# Turn the list of blog-post dicts into a DataFrame and preview the first row.
bdf = pd.DataFrame(blogs)
bdf.head(1)

Unnamed: 0,title,link,date_published,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,https://codeup.com/featured/apida-heritage-month/,"May 24, 2023",May is traditionally known as Asian American ...


In [161]:
# Fetch the news articles through the project-local wrangle module.
# The output shown for this cell is a dict whose 'title' key holds a list of
# headlines.
news = wr.get_news_articles()
news

{'title': ['Was never interested in being LoP, give me party post: Ajit Pawar',
  "2 transgenders killed with stones & knives in Hyderabad's Daibagh",
  " I declare less than actual area while declaring properties: K'taka Deputy CM",
  'Experience hassle-free home interiors: HomeLane',
  'Asit Modi would pinch my cheeks, say inappropriate things about my looks: Actress in FIR',
  'US overestimated value of weapons sent to Ukraine by $6.2 bn due to error: Pentagon',
  "Iceland Cricket troll England with 'Bazball 0-1 Basic Common Sense' tweet",
  "TIME releases list of the world's 100 most influential companies",
  "'Cis' or 'cisgender' are considered slurs on Twitter: Elon Musk",
  "Int'l Yoga day a call to stand up for humankind: NY Mayor at UN HQ",
  'Woman tortured to death by relatives in UP, loud music played to cover up screams',
  'Urge brave sisters to lead way to bring peace to Manipur: Sonia',
  'Yoga event led by PM Modi in US creates Guinness World Record',
  'Thought kids a

In [165]:
# Build the news DataFrame from the fetched articles and preview the first row.
news_df = pd.DataFrame(news)
news_df.head(1)

Unnamed: 0,title,content,category
0,"Was never interested in being LoP, give me par...",NCP leader Ajit Pawar has asked the party to r...,business


## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

* Lowercase everything
* Normalize unicode characters
* Replace anything that is not a letter, number, whitespace or a single quote.

In [73]:
#create lowercase function
def lower_everything(string):
    """Lowercase text held in either a plain ``str`` or a pandas Series.

    Parameters
    ----------
    string : str or pandas.Series
        A single string, or a Series of strings (for example one DataFrame
        column, which is what ``DataFrame.apply`` hands to this function).

    Returns
    -------
    str or pandas.Series
        The lowercased string, or the Series returned by ``.str.lower()``.
    """
    # A plain str has no `.str` accessor; lower it directly instead of
    # failing with AttributeError.
    if isinstance(string, str):
        return string.lower()
    return string.str.lower()

In [74]:
# DataFrame.apply hands each column to lower_everything as a Series, so every
# column of bdf is lowercased into a new frame.
bdf_lower = bdf.apply(lower_everything)
bdf_lower.head(1)

Unnamed: 0,title,link,date_published,content
0,spotlight on apida voices: celebrating heritag...,https://codeup.com/featured/apida-heritage-month/,"may 24, 2023",may is traditionally known as asian american ...


In [140]:
subject = "paul e≈√å∆ª\
rdos and george polys were 1293 influential hungarian mathematicians written as erdos or erdos \
either by mistake or out of typographical ººœ¡ necessity!"

In [141]:
def normalize_everything(string):
    """Return ``string`` reduced to plain ASCII.

    The text is decomposed with Unicode NFKD normalization, then every
    character that cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

In [142]:
normalize_everything(subject)

'paul eaardos and george polys were 1293 influential hungarian mathematicians written as erdos or erdos either by mistake or out of typographical oo necessity!'

In [143]:
#create removal of specials function
def specials_removed(string):
    """Delete every character that is not a lowercase ASCII letter, a digit,
    a single quote, or whitespace.

    Uppercase letters are outside the allowed set, so they are deleted too.
    """
    disallowed = re.compile(r'[^a-z0-9\'\s]')
    return disallowed.sub('', string)

In [144]:
specials_removed(subject)

'paul erdos and george polys were 1293 influential hungarian mathematicians written as erdos or erdos either by mistake or out of typographical  necessity'

In [169]:
def basic_clean(string):
    """Apply the basic cleaning steps to a single string.

    Steps, in order: lowercase the text; NFKD-normalize it and drop anything
    that is not ASCII; remove every character that is not a-z, 0-9, a single
    quote, or whitespace.
    """
    lowered = string.lower()
    ascii_text = (
        unicodedata.normalize('NFKD', lowered)
        .encode('ascii', 'ignore')
        .decode('utf-8')
    )
    return re.sub(r'[^a-z0-9\'\s]', '', ascii_text)

In [170]:
basic_clean(subject)

'paul eaardos and george polys were 1293 influential hungarian mathematicians written as erdos or erdos either by mistake or out of typographical oo necessity'

### Tokenize

In [173]:
def token_it_up(string):
    """Tokenize ``string`` with NLTK's ToktokTokenizer.

    ``return_str=True`` is passed, so the tokenizer's result comes back as a
    single string rather than a list.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str=True)

In [174]:
token_it_up(subject)

'paul e≈√å∆ªrdos and george polys were 1293 influential hungarian mathematicians written as erdos or erdos either by mistake or out of typographical ººœ ¡ necessity !'

### Stemmer

In [179]:
def stemmer(string):
    """Stem each whitespace-separated word of ``string`` with NLTK's
    PorterStemmer and return the stems joined by single spaces."""
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in string.split())

In [180]:
stemmer(subject)

'paul e≈√å∆ªrdo and georg poli were 1293 influenti hungarian mathematician written as erdo or erdo either by mistak or out of typograph ººœ¡ necessity!'

### Lemmatizer

In [181]:
def lemmad(string):
    """Lemmatize each whitespace-separated word of ``string`` with NLTK's
    WordNetLemmatizer and return the lemmas joined by single spaces."""
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [182]:
lemmad(subject)

'paul e≈√å∆ªrdos and george polys were 1293 influential hungarian mathematician written a erdos or erdos either by mistake or out of typographical ººœ¡ necessity!'

### Stopwords

In [183]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Remove English stopwords from a whitespace-delimited string.

    Parameters
    ----------
    string : str
        Text to filter. It is split on whitespace.
    extra_words : iterable of str, or str, optional
        Additional words to treat as stopwords. A bare string counts as one
        word.
    exclude_words : iterable of str, or str, optional
        Words to take out of the NLTK English stopword list so they are kept.
        A bare string counts as one word.

    Returns
    -------
    str
        The words that are not stopwords, joined by single spaces.

    Notes
    -----
    Exclusions are applied before additions, so a word present in both
    arguments ends up being removed. Matching is an exact string comparison.
    """
    def _as_word_set(words):
        # Normalize an optional argument: None -> empty set; a bare string is
        # one word (set('the') would otherwise yield {'t', 'h', 'e'}).
        if words is None:
            return set()
        if isinstance(words, str):
            return {words}
        return set(words)

    stopword_set = set(stopwords.words('english'))
    stopword_set -= _as_word_set(exclude_words)
    stopword_set |= _as_word_set(extra_words)

    kept = [word for word in string.split() if word not in stopword_set]
    return ' '.join(kept)

In [184]:
remove_stopwords(subject)

'paul e≈√å∆ªrdos george polys 1293 influential hungarian mathematicians written erdos erdos either mistake typographical ººœ¡ necessity!'

### Create DFs

* title to hold the title
* original to hold the original article/post content
* clean to hold the normalized and tokenized original with the stopwords removed.
* stemmed to hold the stemmed version of the cleaned data.
* lemmatized to hold the lemmatized version of the cleaned data.

In [160]:
# NOTE: plain assignment makes codeup_df a second name for the same DataFrame
# object as bdf; no copy is made here.
codeup_df = bdf
codeup_df

Unnamed: 0,title,link,date_published,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,https://codeup.com/featured/apida-heritage-month/,"May 24, 2023",May is traditionally known as Asian American ...
1,Women in tech: Panelist Spotlight – Magdalena ...,https://codeup.com/featured/women-in-tech-pane...,"Mar 28, 2023",Women in tech: Panelist Spotlight – Magdalena...
2,Women in tech: Panelist Spotlight – Rachel Rob...,https://codeup.com/featured/women-in-tech-rach...,"Mar 20, 2023",Women in tech: Panelist Spotlight – Rachel Ro...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/codeup-news/women-in-tech-p...,"Mar 13, 2023",Women in tech: Panelist Spotlight – Sarah Mel...
4,Women in Tech: Panelist Spotlight – Madeleine ...,https://codeup.com/events/women-in-tech-madele...,"Mar 6, 2023",Women in tech: Panelist Spotlight – Madeleine...
5,Black Excellence in Tech: Panelist Spotlight –...,https://codeup.com/codeup-news/panelist-spotli...,"Feb 16, 2023",Black excellence in tech: Panelist Spotlight ...


In [168]:
news_df.head(5)

Unnamed: 0,title,content,category
0,"Was never interested in being LoP, give me par...",NCP leader Ajit Pawar has asked the party to r...,business
1,2 transgenders killed with stones & knives in ...,Two transgenders aged between 25 and 30 died a...,business
2,I declare less than actual area while declari...,Karnataka Deputy CM DK Shivakumar while talkin...,business
3,Experience hassle-free home interiors: HomeLane,HomeLane invites you to discover the world of ...,business
4,"Asit Modi would pinch my cheeks, say inappropr...",The details of FIR filed against 'Taarak Mehta...,business


In [None]:
# Rename `content` to `original` (the column the cleaning steps read) and drop
# `category`. errors='ignore' keeps the cell re-runnable: once `category` is
# gone, a second drop would otherwise raise KeyError.
news_df = (
    news_df
    .rename(columns={'content': 'original'})
    .drop(columns='category', errors='ignore')
)

In [187]:
news_df.head(3)

Unnamed: 0,title,original
0,"Was never interested in being LoP, give me par...",NCP leader Ajit Pawar has asked the party to r...
1,2 transgenders killed with stones & knives in ...,Two transgenders aged between 25 and 30 died a...
2,I declare less than actual area while declari...,Karnataka Deputy CM DK Shivakumar while talkin...


In [190]:
# clean = basic_clean -> token_it_up -> remove_stopwords, applied per article.
news_df['clean'] = (
    news_df['original']
    .apply(basic_clean)
    .apply(token_it_up)
    .apply(remove_stopwords)
)

In [195]:
# Derive the stemmed and lemmatized variants from the cleaned text.
cleaned_text = news_df['clean']
news_df['stem'] = cleaned_text.apply(stemmer)
news_df['lemma'] = cleaned_text.apply(lemmad)

In [196]:
news_df.head(1)

Unnamed: 0,title,original,clean,stem,lemma
0,"Was never interested in being LoP, give me par...",NCP leader Ajit Pawar has asked the party to r...,ncp leader ajit pawar asked party relieve resp...,ncp leader ajit pawar ask parti reliev respons...,ncp leader ajit pawar asked party relieve resp...


In [197]:
def clean_df(df, exclude_words=None, extra_words=None):
    '''
    Add cleaned, stemmed, and lemmatized text columns to a dataframe.

    Parameters:
        df: DataFrame with an `original` column holding the raw text.
        exclude_words: optional words to remove from the stopword list
            (they are kept in the text).
        extra_words: optional additional words to treat as stopwords.

    Returns:
        The same df object, modified in place, with three added columns:
        `clean` (basic_clean -> token_it_up -> remove_stopwords),
        `stem` (stemmer applied to `clean`), and
        `lemma` (lemmad applied to `clean`).
    '''
    # None replaces the mutable [] defaults; normalize to lists here so
    # remove_stopwords always receives an iterable.
    exclude_words = [] if exclude_words is None else exclude_words
    extra_words = [] if extra_words is None else extra_words

    df['clean'] = (
        df.original
        .apply(basic_clean)
        .apply(token_it_up)
        # Forward the caller's stopword customizations; Series.apply passes
        # extra keyword arguments through to the function.
        .apply(remove_stopwords, extra_words=extra_words, exclude_words=exclude_words)
    )
    df['stem'] = df.clean.apply(stemmer)
    df['lemma'] = df.clean.apply(lemmad)

    return df

In [203]:
# Rename `content` to `original` and drop the columns the cleaning steps do not
# use. The labels are given as a list (not a set), and errors='ignore' keeps
# the cell re-runnable after the columns are already gone.
codeup_df = (
    codeup_df
    .rename(columns={'content': 'original'})
    .drop(columns=['link', 'date_published'], errors='ignore')
)

In [204]:
clean_df(codeup_df)

Unnamed: 0,title,original,clean,stem,lemma
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American ...,may traditionally known asian american pacific...,may tradit known asian american pacif island a...,may traditionally known asian american pacific...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena...,women tech panelist spotlight magdalena rahn c...,women tech panelist spotlight magdalena rahn c...,woman tech panelist spotlight magdalena rahn c...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Ro...,women tech panelist spotlight rachel robbinsma...,women tech panelist spotlight rachel robbinsma...,woman tech panelist spotlight rachel robbinsma...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mel...,women tech panelist spotlight sarah mellor cod...,women tech panelist spotlight sarah mellor cod...,woman tech panelist spotlight sarah mellor cod...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine...,women tech panelist spotlight madeleine capper...,women tech panelist spotlight madelein capper ...,woman tech panelist spotlight madeleine capper...
5,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight ...,black excellence tech panelist spotlight wilma...,black excel tech panelist spotlight wilmari de...,black excellence tech panelist spotlight wilma...


In [35]:
## PARKING LOT FOR CODE CREATED BUT NOT USED - WRONG RABBIT HOLE
# #lowercase
# articles = [{key: value.lower() for key, value in dictionary.items()} for dictionary in blogs]
# articles
# #normalize
# normalized_list = [{key: unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')\
#         .decode('utf-8') for key, value in dictionary.items()} for dictionary in articles]
# normalized_list
# #remove special characters
# specials_removed = [{key: re.sub(r'[^a-z0-9\'\s]', '', value) \
#                      for key, value in dictionary.items()} for dictionary in normalized_list]
# specials_removed
# #tokenize created
# tokenize = nltk.tokenize.ToktokTokenizer()
# tokenize
# #tokenize used
# tokenize_list = [{key: tokenize.tokenize(value, return_str=True) \
#                      for key, value in dictionary.items()} for dictionary in specials_removed]
# tokenize_list
# #lemmatize created
# wnl = nltk.stem.WordNetLemmatizer()
# #lemmatize used
# lemmas = [{key: [wnl.lemmatize(word) for word in value.split()] \
#                      for key, value in dictionary.items()} for dictionary in tokenize_list]
# lemmas
# #stopwords created
# sls = stopwords.words('english')
# sls
# #join dicts back together
# blogs_lemmad = [{key: ' '.join(value) for key, \
#   value in dictionary.items()} for dictionary in lemmas]
# blogs_lemmad
# #splitting to apply stopword removal
# words = [{key: [word for word in value.split()] for key, \
#   value in dictionary.items()} for dictionary in blogs_lemmad]
# words
# #filtering stopwords out
# filtered = [{key: [word for word in words if word not in sls] for key, \
#   value in dictionary.items()} for dictionary in words]
# filtered