In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a

In [2]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [4]:
# lowercase everything
original = original.lower()
original

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [5]:
# remove accented characters and non-ASCII characters
original = unicodedata.normalize('NFKD', original).encode('ascii', 'ignore').decode('utf-8')
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [6]:
# remove special characters
original = re.sub(r'[^a-z0-9\s]', '', original)
original

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

In [7]:
# tokenize
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x7f881e5d6550>

In [8]:
original = tokenize.tokenize(original)
original

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdoss',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 'o',
 'o',
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [9]:
# stemming
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [10]:
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [11]:
ps.stem('house'), ps.stem('housing')

('hous', 'hous')

In [12]:
stems = [ps.stem(word) for word in original]
' '.join(stems)

'paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdoss name contain the hungarian letter o o with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess'

In [13]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    |

[nltk_data]    |   Package mwa_ppdb is already up-to-date!
[nltk_data]    | Downloading package names to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Downloading package nombank.1.0 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package nombank.1.0 is already up-to-date!
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package nonbreaking_prefixes is already up-to-date!
[nltk_data]    | Downloading package nps_chat to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package nps_chat is already up-to-date!
[nltk_data]    | Downloading package omw to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package omw is already up-to-date!
[nltk_data]    | Downloading package omw-1.4 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package 

[nltk_data]    |   Package word2vec_sample is already up-to-date!
[nltk_data]    | Downloading package wordnet to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package wordnet is already up-to-date!
[nltk_data]    | Downloading package wordnet2021 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package wordnet2021 is already up-to-date!
[nltk_data]    | Downloading package wordnet2022 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package wordnet2022 is already up-to-date!
[nltk_data]    | Downloading package wordnet31 to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package wordnet31 is already up-to-date!
[nltk_data]    | Downloading package wordnet_ic to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   Package wordnet_ic is already up-to-date!
[nltk_data]    | Downloading package words to
[nltk_data]    |     /Users/jorgelopez/nltk_data...
[nltk_data]    |   P

True

In [14]:
# lemmatize
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [15]:
wnl.lemmatize('calling'), wnl.lemmatize('calls'), wnl.lemmatize('called'), wnl.lemmatize('call')

('calling', 'call', 'called', 'call')

In [16]:
wnl.lemmatize('house'), wnl.lemmatize('housing')

('house', 'housing')

In [17]:
wnl.lemmatize('mouse'), wnl.lemmatize('mice')

('mouse', 'mouse')

In [18]:
stopwords_english = stopwords.words('english')
stopwords_english[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [19]:
stopwords_english.append('o')
len(stopwords_english)

180

In [20]:
original_with_stopwords_removed = [word for word in original if word not in stopwords_english]
' '.join(original_with_stopwords_removed)

'paul erdos george polya influential hungarian mathematicians contributed lot field erdoss name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity'

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:  
  
Lowercase everything  
Normalize unicode characters  
Replace anything that is not a letter, number, whitespace or a single quote.  

In [41]:
def basic_clean(string):
    string = string.lower()
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    string = re.sub(r'[^a-z0-9\s]', '', string)
    
    return string

In [22]:
string = 'The cat aNd the dog wEnt to the $store and got some MILK!'
basic_clean(string)

'the cat and the dog went to the store and got some milk'

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [39]:
def tokenize(string):
    tokenize = nltk.tokenize.ToktokTokenizer()
    string = tokenize.tokenize(string, return_str = True)
    
    return string

In [40]:
string = 'the dog went over to the cat and said hi'
tokenize(string)

'the dog went over to the cat and said hi'

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [25]:
def stem(text1, text2, text3):
    ps = nltk.porter.PorterStemmer()
    ps.stem(text1), ps.stem(text2), ps.stem(text3)
    
    return text1, text2, text3

In [26]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # create our stemming object
    ps = nltk.porter.PorterStemmer()
    # use a list comprehension => stem each word for each word inside of the entire document,
    # split by the default, which are single spaces
    stems = [ps.stem(word) for word in string.split()]
    # glue it back together with spaces, as it was before
    string = ' '.join(stems)
    
    return string

In [27]:
stem('baller, balling, balls')

'baller, balling, ball'

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [28]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # create our lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()
    # use a list comprehension to lemmatize each word
    # string.split() => output a list of every token inside of the document
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # glue the lemmas back together by the strings we split on
    string = ' '.join(lemmas)
    #return the altered document
    return string

In [29]:
lemmatize('shawn, shawning, shawns')

'shawn, shawning, shawn'

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.  
  
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [30]:
def remove_stopwords(text):
    stopwords_english = stopwords.words('english')
    new_text = [word for word in text if word not in stopwords_english]
    
    return new_text

In [31]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # assign our stopwords from nltk into stopword_list
    stopword_list = stopwords.words('english')
    # utilizing set casting, i will remove any excluded stopwords
    stopword_list = set(stopword_list) - set(exclude_words)
    # add in any extra words to my stopwords set using a union
    stopword_list = stopword_list.union(set(extra_words))
    # split our document by spaces
    words = string.split()
    # every word in our document, as long as that word is not in our stopwords
    filtered_words = [word for word in words if word not in stopword_list]
    # glue it back together with spaces, as it was so it shall be
    string_without_stopwords = ' '.join(filtered_words)
    # return the document back
    return string_without_stopwords

Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [32]:
#Test my function on the business page
business_test = a.scrape_one_page('business')
business_test[0]

{'category': 'business',
 'title': 'Bill Gates meets Ratan Tata, N Chandrasekaran; pics surface',
 'content': 'Microsoft Co-founder Bill Gates met with Tata Sons Chairman Emeritus Ratan Tata and Tata Sons Chairman Natarajan Chandrasekaran. "Bill had an enriching discussion with Ratan Tata and N Chandrasekaran about their philanthropic initiatives," Gates Foundation India said in a tweet. "We look forward to strengthening our work together & partnering for health, diagnostics, and nutrition," it added.'}

In [33]:
#Test my function!
topics = ['business', 'sports', 'technology', 'entertainment']

news_df = a.get_news_articles(topics)
news_df[0]

{'category': 'business',
 'title': 'All Adani stocks end higher for the first time since Hindenburg report',
 'content': 'All 10 Adani Group stocks closed higher on Wednesday, the first such occurrence since the report by US short-seller Hindenburg Research in late January that accused the group of fraud. Adani Enterprises led the rally, advancing almost 15% as the group conducted investor meetings in Singapore and Hong Kong. Adani Transmission and Adani Power rose by the 5% daily limit.'}

In [34]:
news_df = pd.DataFrame(a.get_news_articles(topics))

In [35]:
news_df

Unnamed: 0,category,title,content
0,business,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...
1,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...
2,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...
3,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...
4,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...
...,...,...,...
95,entertainment,I relived my battle: Sagarika on Rani's 'Mrs C...,"Sagarika Chatterjee, whose life inspired Rani ..."
96,entertainment,"Diagnosed with Influenza B, staying away from ...",Actress Debina Bonnerjee took to Instagram Sto...
97,entertainment,"Broke my heart, we didn't do justice: Shahid o...",Actor Shahid Kapoor said the failure of his fi...
98,entertainment,It takes guts: Rajatava on Akshay accepting fa...,Bengali actor Rajatava Dutta praised Akshay Ku...


Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [36]:
codeup_df = pd.DataFrame(a.get_blog_articles('blog_posts.json'))

In [37]:
codeup_df

Unnamed: 0,title,link,date_published,content
0,Black Excellence in Tech: Panelist Spotlight –...,https://inshorts.com/en/read/business/,"Feb 16, 2023",\nBlack excellence in tech: Panelist Spotlight...
1,Black excellence in tech: Panelist Spotlight –...,https://inshorts.com/en/read/business/,"Feb 13, 2023",\nBlack excellence in tech: Panelist Spotlight...
2,Black excellence in tech: Panelist Spotlight –...,https://inshorts.com/en/read/business/,"Feb 10, 2023",\nBlack excellence in tech: Panelist Spotlight...
3,Black excellence in tech: Panelist Spotlight –...,https://inshorts.com/en/read/business/,"Feb 6, 2023",\nBlack excellence in tech: Panelist Spotlight...
4,Coding Bootcamp or Self-Learning? Which is Bes...,https://inshorts.com/en/read/business/,"Jan 20, 2023",\nIf you’re interested in embarking on a caree...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,https://inshorts.com/en/read/business/,"Jan 12, 2023",\nCodeup is pleased to announce we have been r...


For each dataframe, produce the following columns:  
  
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


In [43]:
news_df['content'].apply(basic_clean).apply(tokenize).apply(lemmatize).apply(remove_stopwords)

0     10 adani group stock closed higher wednesday f...
1     microsoft cofounder bill gate met tata son cha...
2     softbank sold share worth 954 crore logistics ...
3     hour central government raised price commercia...
4     indianamericans punit renjen rajesh subramania...
                            ...                        
95    sagarika chatterjee whose life inspired rani m...
96    actress debina bonnerjee took instagram story ...
97    actor shahid kapoor said failure film jersey b...
98    bengali actor rajatava dutta praised akshay ku...
99    speaking similarity found husband rj anmol met...
Name: content, Length: 100, dtype: object

In [45]:
################################ PREP ARTICLES ################################

#take dataframe, specify the column, extra and exclude words
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    #original text from content column
    df['original'] = df['content']
    
    #chain together clean, tokenize, remove stopwords
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    #chain clean, tokenize, stem, remove stopwords
    df['stemmed'] = df['clean'].apply(stem)
    
    #clean clean, tokenize, lemmatize, remove stopwords
    df['lemmatized'] = df['clean'].apply(lemmatize)
    
    return df[['title', 'original', 'clean', 'stemmed', 'lemmatized']]

In [46]:
#assign variable to our new prepped dataframe
prep_news = prep_article_data(news_df, 'content', extra_words =[], exclude_words=[])

#take a look
prep_news.head(5)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...,10 adani group stocks closed higher wednesday ...,10 adani group stock close higher wednesday fi...,10 adani group stock closed higher wednesday f...
1,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...,microsoft cofounder bill gates met tata sons c...,microsoft cofound bill gate met tata son chair...,microsoft cofounder bill gate met tata son cha...
2,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...,softbank sold shares worth 954 crore logistics...,softbank sold share worth 954 crore logist com...,softbank sold share worth 954 crore logistics ...
3,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,hours central government raised price commerci...,hour central govern rais price commerci lpg cy...,hour central government raised price commercia...
4,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,indianamericans punit renjen rajesh subramania...,indianamerican punit renjen rajesh subramaniam...,indianamericans punit renjen rajesh subramania...


In [47]:
#take a look at one article
prep_news.iloc[1]

title         Bill Gates meets Ratan Tata, N Chandrasekaran;...
original      Microsoft Co-founder Bill Gates met with Tata ...
clean         microsoft cofounder bill gates met tata sons c...
stemmed       microsoft cofound bill gate met tata son chair...
lemmatized    microsoft cofounder bill gate met tata son cha...
Name: 1, dtype: object

In [48]:
#assign variable to our new prepped dataframe
prep_codeup = prep_article_data(codeup_df, 'content', extra_words =[], exclude_words=[])

#take a look
prep_codeup.head(5)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Black Excellence in Tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,black excellence tech panelist spotlight wilma...,black excel tech panelist spotlight wilmari de...,black excellence tech panelist spotlight wilma...
1,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,black excellence tech panelist spotlight steph...,black excel tech panelist spotlight stephani j...,black excellence tech panelist spotlight steph...
2,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,black excellence tech panelist spotlight james...,black excel tech panelist spotlight jame coope...,black excellence tech panelist spotlight james...
3,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,black excellence tech panelist spotlight jeani...,black excel tech panelist spotlight jeanic fre...,black excellence tech panelist spotlight jeani...
4,Coding Bootcamp or Self-Learning? Which is Bes...,\nIf you’re interested in embarking on a caree...,youre interested embarking career tech likely ...,your interest embark career tech like taken lo...,youre interested embarking career tech likely ...


In [49]:
#take a look at one article
prep_codeup.iloc[4]

title         Coding Bootcamp or Self-Learning? Which is Bes...
original      \nIf you’re interested in embarking on a caree...
clean         youre interested embarking career tech likely ...
stemmed       your interest embark career tech like taken lo...
lemmatized    youre interested embarking career tech likely ...
Name: 4, dtype: object

Ask yourself:  
  
- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - lemmatize is slower, so smaller is ok to take longer.
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - stem because dataset is larger and stemming is faster
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    - stemming because it is faster 

In [44]:
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.8.2.2-cp39-cp39-macosx_10_9_x86_64.whl (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.2.2
