# The end result of this exercise should be a file named prepare.py that defines the requested functions.

## In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re

def basic_clean(string_of_words):
    string_of_words=string_of_words.lower()
    string_of_words = unicodedata.normalize('NFKD',string_of_words)\
        .encode('ascii', 'ignore')\
            .decode('utf-8')
    string_of_words=re.sub(r'[^a-z0-9\'\s]','',string_of_words)
    return string_of_words            



In [2]:
test_phrase= 'This is my test run sentence to see IF my function works 10 out of 10 times '

test_phrase=basic_clean(test_phrase)

test_phrase

'this is my test run sentence to see if my function works 10 out of 10 times '

# 2 Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
import nltk

#create the tokenizer, hard to remember due to repetitiveness 
#tokenize=nltk.tokenize.ToktokTokenizer()

#use the tokenizer I just created
#any_string = tokenize.tokenize(any_string, return_str=True)

#now lets put these into a function 

def tokenize(any_string):
    tokenize=nltk.tokenize.ToktokTokenizer()
    any_string = tokenize.tokenize(any_string, return_str=True)
    return any_string


test_phrase=tokenize(test_phrase)
test_phrase

'this is my test run sentence to see if my function works 10 out of 10 times'

# 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [4]:
#create the stemmer
ps= nltk.porter.PorterStemmer()



def stem(any_string):
    ps= nltk.porter.PorterStemmer()
    stems= [ps.stem(word) for word in any_string.split()]
    string_stemmed= ' '.join(stems)
    return string_stemmed

stem(test_phrase)

'thi is my test run sentenc to see if my function work 10 out of 10 time'

# 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/joebennett/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/joebennett/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/joebennett/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/joebennett/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/joebennett/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    |

True

In [6]:
#create the lemmatizer
wnl = nltk.stem.WordNetLemmatizer()

#test lemmatizer
wnl.lemmatize('calling'), wnl.lemmatize('calls')

def lemmatize(any_string):
    lemmas = [wnl.lemmatize(word) for word in any_string.split()]   
    any_string_lemmatized= ' '.join(lemmas)
    return any_string_lemmatized

lemmatize(test_phrase)

'this is my test run sentence to see if my function work 10 out of 10 time'

# 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')

#save stopwords
stopwords_ls= stopwords.words('english')


len(stopwords_ls)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joebennett/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [8]:
def remove_stopwords(any_string,extra_words='',exclude_words=''):
    stopwords_ls= stopwords.words('english')
    if exclude_words != '':
        stopwords_ls.remove(exclude_words)
    if extra_words!='':
        stopwords_ls.append(extra_words)
    words=any_string.split()
    filtered_words=[word for word in words if word not in stopwords_ls]
    print(f'Removed {(len(words)-len(filtered_words))} words from string')
    string_no_stop_words= ' '.join(filtered_words)
    return string_no_stop_words

remove_stopwords(test_phrase)


Removed 8 words from string


'test run sentence see function works 10 10 times'

# 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [9]:
import os
from requests import get
from bs4 import BeautifulSoup as soupify
import acquire


In [10]:
news_df=pd.DataFrame(acquire.get_news_articles(acquire.url))



  soup = soupify(get(base_url).content)


<Response [200]>
cat articles length:  12
length of all_articles:  12




  cat_soup = soupify(get(cat_url).content)


<Response [200]>
cat articles length:  25
length of all_articles:  37
<Response [200]>
cat articles length:  25
length of all_articles:  62
<Response [200]>
cat articles length:  25
length of all_articles:  87
<Response [200]>
cat articles length:  25
length of all_articles:  112
<Response [200]>
cat articles length:  25
length of all_articles:  137
<Response [200]>
cat articles length:  25
length of all_articles:  162
<Response [200]>
cat articles length:  25
length of all_articles:  187
<Response [200]>
cat articles length:  24
length of all_articles:  211
<Response [200]>
cat articles length:  25
length of all_articles:  236
<Response [200]>
cat articles length:  25
length of all_articles:  261
<Response [200]>
cat articles length:  24
length of all_articles:  285


# 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.



In [13]:
news_df.head()

Unnamed: 0,title,category,body
0,"Afghanistan wins SAFF title, spoils India's ha...",india,Afghanistan won their maiden-SAFF Football Cha...
1,"Nigerian weightlifter in dope net, India may gain",india,India may move up after Nigerian weightlifter ...
2,India beat NZ 3-2 to enter CWG hockey finals,india,In the CWG men's hockey semi-final against New...
3,Infosys Gifts Sikka Shares Worth Rs 8.2cr,india,"In a regulatory filing to the BSE on Friday, I..."
4,Oldest woman in India passes away,india,"Kunjannam, a 112-yr-old woman from Parannur (K..."


In [15]:
#codeup_df=pd.DataFrame(acquire.get_blog_articles(acquire.articles))
pd.DataFrame([acquire.get_blog_articles(x) for x in acquire.articles])

Unnamed: 0,title,content
0,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
2,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
3,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
4,What is Cloud Computing and AWS?,With many companies switching to cloud service...


# 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [18]:
news_df = news_df.rename(columns={'body':'original'})
news_df.head()

Unnamed: 0,title,category,original
0,"Afghanistan wins SAFF title, spoils India's ha...",india,Afghanistan won their maiden-SAFF Football Cha...
1,"Nigerian weightlifter in dope net, India may gain",india,India may move up after Nigerian weightlifter ...
2,India beat NZ 3-2 to enter CWG hockey finals,india,In the CWG men's hockey semi-final against New...
3,Infosys Gifts Sikka Shares Worth Rs 8.2cr,india,"In a regulatory filing to the BSE on Friday, I..."
4,Oldest woman in India passes away,india,"Kunjannam, a 112-yr-old woman from Parannur (K..."


In [19]:
news_df = news_df[['title','original']]
news_df.head()


Unnamed: 0,title,original
0,"Afghanistan wins SAFF title, spoils India's ha...",Afghanistan won their maiden-SAFF Football Cha...
1,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...
2,India beat NZ 3-2 to enter CWG hockey finals,In the CWG men's hockey semi-final against New...
3,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I..."
4,Oldest woman in India passes away,"Kunjannam, a 112-yr-old woman from Parannur (K..."


In [20]:
news_df['clean'] = news_df.original.apply(basic_clean)
news_df.head()


Unnamed: 0,title,original,clean
0,"Afghanistan wins SAFF title, spoils India's ha...",Afghanistan won their maiden-SAFF Football Cha...,afghanistan won their maidensaff football cham...
1,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india may move up after nigerian weightlifter ...
2,India beat NZ 3-2 to enter CWG hockey finals,In the CWG men's hockey semi-final against New...,in the cwg men's hockey semifinal against new ...
3,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I...",in a regulatory filing to the bse on friday in...
4,Oldest woman in India passes away,"Kunjannam, a 112-yr-old woman from Parannur (K...",kunjannam a 112yrold woman from parannur keral...


In [21]:
news_df['stemmed'] = news_df.clean.apply(stem)
news_df['lemmatized'] = news_df.clean.apply(lemmatize)
news_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Afghanistan wins SAFF title, spoils India's ha...",Afghanistan won their maiden-SAFF Football Cha...,afghanistan won their maidensaff football cham...,afghanistan won their maidensaff footbal champ...,afghanistan won their maidensaff football cham...
1,"Nigerian weightlifter in dope net, India may gain",India may move up after Nigerian weightlifter ...,india may move up after nigerian weightlifter ...,india may move up after nigerian weightlift ch...,india may move up after nigerian weightlifter ...
2,India beat NZ 3-2 to enter CWG hockey finals,In the CWG men's hockey semi-final against New...,in the cwg men's hockey semifinal against new ...,in the cwg men' hockey semifin against new zea...,in the cwg men's hockey semifinal against new ...
3,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I...",in a regulatory filing to the bse on friday in...,in a regulatori file to the bse on friday info...,in a regulatory filing to the bse on friday in...
4,Oldest woman in India passes away,"Kunjannam, a 112-yr-old woman from Parannur (K...",kunjannam a 112yrold woman from parannur keral...,kunjannam a 112yrold woman from parannur keral...,kunjannam a 112yrold woman from parannur keral...
