In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

import unicodedata

import re

from acquire import get_news_articles, get_blog_articles

In [16]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ihcrane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# WordNetLemmatizer (used by lemmatize()) reads the 'wordnet' corpus, so fetch
# it together with 'omw-1.4'; without it lemmatization raises LookupError on a
# machine that has never downloaded the corpus.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/ihcrane/nltk_data...


True

In [18]:
# Notebook-level copy of the English stop-word list. remove_stopwords() builds
# its own local list of the same name and does not read this variable.
stopwords_english = stopwords.words('english')

In [4]:
def clean_data(string):
    """Reduce raw text to lowercase ASCII letters, digits and whitespace.

    The text is lowercased, NFKD-normalised so accented letters decompose into
    a base letter plus combining marks, stripped of every non-ASCII code
    point, and finally stripped of any character outside ``a-z``, ``0-9`` and
    whitespace. Removed characters are deleted, not replaced, so
    "all-rounder" becomes "allrounder".

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        The cleaned text.
    """
    # encode(..., 'ignore') silently drops whatever has no ASCII equivalent.
    ascii_bytes = unicodedata.normalize('NFKD', string.lower()).encode('ascii', 'ignore')

    return re.sub(r'[^a-z0-9\s]', '', ascii_bytes.decode('utf-8'))

In [5]:
def tokenize(string):
    """Tokenize ``string`` with NLTK's ToktokTokenizer and return the tokens.

    A new tokenizer instance is created on every call.
    """
    toktok = nltk.tokenize.ToktokTokenizer()

    return toktok.tokenize(string)

In [6]:
def stem(tokens):
    """Stem every token with the Porter stemmer and join the results.

    Parameters
    ----------
    tokens : iterable of str
        Words to stem, e.g. the output of ``tokenize``.

    Returns
    -------
    str
        The stems, in input order, separated by single spaces. An empty
        iterable yields an empty string.
    """
    # nltk.stem is the documented home of PorterStemmer and is the same
    # namespace lemmatize() uses for WordNetLemmatizer.
    ps = nltk.stem.PorterStemmer()

    return ' '.join(ps.stem(word) for word in tokens)

In [24]:
def lemmatize(tokens):
    """Lemmatize every token with WordNet and join the results.

    No part-of-speech tag is passed to the lemmatizer, so NLTK's default
    applies to every word.

    Parameters
    ----------
    tokens : iterable of str
        Words to lemmatize, e.g. the output of ``tokenize``.

    Returns
    -------
    str
        The lemmas, in input order, separated by single spaces. An empty
        iterable yields an empty string.
    """
    wnl = nltk.stem.WordNetLemmatizer()

    return ' '.join(wnl.lemmatize(word) for word in tokens)

In [8]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Drop English stop words from a text or a token sequence.

    Parameters
    ----------
    string : str or iterable of str
        Either whitespace-delimited text or an iterable of tokens. A ``str``
        is split on whitespace first; iterating it directly would compare
        single characters against the stop-word list.
    extra_words : iterable of str, optional
        Additional words to treat as stop words.
    exclude_words : iterable of str, optional
        Words to keep even if they are stop words. Applied after
        ``extra_words``, so a word listed in both is kept.

    Returns
    -------
    str
        The remaining words, in input order, separated by single spaces.
    """
    words = string.split() if isinstance(string, str) else string

    # A set gives constant-time membership checks for each word.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    stopword_set.difference_update(exclude_words or ())

    return ' '.join(word for word in words if word not in stopword_set)

In [9]:
# Categories passed to the project's get_news_articles helper.
topics = ['sports', 'business', 'technology', 'entertainment']

# Wrap whatever the helper returns in a DataFrame and display it.
news_df = pd.DataFrame(get_news_articles(topics))
news_df

Unnamed: 0,category,title,content
0,sports,India's Ravichandran Ashwin becomes world numb...,Team India all-rounder Ravichandran Ashwin has...
1,sports,You are a harsh man: Hayden to Gavaskar over r...,Former Australia opener Matthew Hayden called ...
2,sports,I just didn't agree with a lot of his opinions...,"Ex-Australia batter Mark Waugh, who commentate..."
3,sports,Virat Kohli's reaction to Umesh Yadav equallin...,A video has gone viral on social media showing...
4,sports,Family of Kobe Bryant awarded around ₹240 cror...,Los Angeles County will pay Kobe Bryant's fami...
...,...,...,...
95,entertainment,"Diagnosed with Influenza B, staying away from ...",Actress Debina Bonnerjee took to Instagram Sto...
96,entertainment,Actors today focus on gym and social media mor...,Actor Govind Namdev has said that the current ...
97,entertainment,I relived my battle: Sagarika on Rani's 'Mrs C...,"Sagarika Chatterjee, whose life inspired Rani ..."
98,entertainment,Not solo anymore: Sidharth as paparazzi ask hi...,Sidharth Malhotra was recently asked to pose s...


In [11]:
# Blog landing page handed to the project's get_blog_articles helper.
url = 'https://codeup.com/blog/'

# Wrap whatever the helper returns in a DataFrame and display it.
codeup_df = pd.DataFrame(get_blog_articles(url))
codeup_df

Unnamed: 0,title,date_published,content
0,Black Excellence in Tech: Panelist Spotlight -...,"Feb 16, 2023",Codeup is hosting a Black Excellence in Tech ...
1,Black excellence in tech: Panelist Spotlight -...,"Feb 13, 2023",Codeup is hosting our second Black Excellence ...
2,Black excellence in tech: Panelist Spotlight -...,"Feb 10, 2023",Codeup is hosting our second Black Excellence ...
3,Black excellence in tech: Panelist Spotlight -...,"Feb 6, 2023",Codeup is hosting our second Black Excellence ...
4,Coding Bootcamp or Self-Learning? Which is Bes...,"Jan 20, 2023",If you’re interested in embarking on a career ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2...,"Jan 12, 2023",Codeup was highlighted for being an establishe...


In [25]:
# One entry per news article, collected in row order so the lists line up
# with news_df when they are attached as columns.
cleaned_content, stemmed, lemmatized = [], [], []

for content in news_df['content']:
    cleaned = clean_data(content)
    tokens = tokenize(cleaned)

    # The stemmed and lemmatized variants are both built from the same tokens.
    cleaned_content.append(cleaned)
    stemmed.append(stem(tokens))
    lemmatized.append(lemmatize(tokens))

In [26]:
# Attach the per-article results as new columns. This must run before the
# blog preprocessing cell, which rebinds the same three list names.
news_df['clean'] = cleaned_content
news_df['stemmed'] = stemmed
news_df['lemmatized'] = lemmatized

In [27]:
# Display the news frame with the clean / stemmed / lemmatized columns added.
news_df

Unnamed: 0,category,title,content,clean,stemmed,lemmatized
0,sports,India's Ravichandran Ashwin becomes world numb...,Team India all-rounder Ravichandran Ashwin has...,team india allrounder ravichandran ashwin has ...,team india allround ravichandran ashwin ha bec...,team india allrounder ravichandran ashwin ha b...
1,sports,You are a harsh man: Hayden to Gavaskar over r...,Former Australia opener Matthew Hayden called ...,former australia opener matthew hayden called ...,former australia open matthew hayden call suni...,former australia opener matthew hayden called ...
2,sports,I just didn't agree with a lot of his opinions...,"Ex-Australia batter Mark Waugh, who commentate...",exaustralia batter mark waugh who commentated ...,exaustralia batter mark waugh who comment in f...,exaustralia batter mark waugh who commentated ...
3,sports,Virat Kohli's reaction to Umesh Yadav equallin...,A video has gone viral on social media showing...,a video has gone viral on social media showing...,a video ha gone viral on social media show vir...,a video ha gone viral on social medium showing...
4,sports,Family of Kobe Bryant awarded around ₹240 cror...,Los Angeles County will pay Kobe Bryant's fami...,los angeles county will pay kobe bryants famil...,lo angel counti will pay kobe bryant famili ne...,los angeles county will pay kobe bryants famil...
...,...,...,...,...,...,...
95,entertainment,"Diagnosed with Influenza B, staying away from ...",Actress Debina Bonnerjee took to Instagram Sto...,actress debina bonnerjee took to instagram sto...,actress debina bonnerje took to instagram stor...,actress debina bonnerjee took to instagram sto...
96,entertainment,Actors today focus on gym and social media mor...,Actor Govind Namdev has said that the current ...,actor govind namdev has said that the current ...,actor govind namdev ha said that the current a...,actor govind namdev ha said that the current a...
97,entertainment,I relived my battle: Sagarika on Rani's 'Mrs C...,"Sagarika Chatterjee, whose life inspired Rani ...",sagarika chatterjee whose life inspired rani m...,sagarika chatterje whose life inspir rani muke...,sagarika chatterjee whose life inspired rani m...
98,entertainment,Not solo anymore: Sidharth as paparazzi ask hi...,Sidharth Malhotra was recently asked to pose s...,sidharth malhotra was recently asked to pose s...,sidharth malhotra wa recent ask to pose solo f...,sidharth malhotra wa recently asked to pose so...


In [28]:
# One entry per blog post, collected in row order so the lists line up with
# codeup_df. These names are reused from the news cell, so the three lists
# are rebound to fresh empty lists here.
cleaned_content, stemmed, lemmatized = [], [], []

for content in codeup_df['content']:
    cleaned = clean_data(content)
    tokens = tokenize(cleaned)

    # The stemmed and lemmatized variants are both built from the same tokens.
    cleaned_content.append(cleaned)
    stemmed.append(stem(tokens))
    lemmatized.append(lemmatize(tokens))

In [29]:
# Attach the per-post results as new columns. The lists must come from the
# blog preprocessing loop; the news loop binds the same names.
codeup_df['clean'] = cleaned_content
codeup_df['stemmed'] = stemmed
codeup_df['lemmatized'] = lemmatized

In [30]:
# Display the blog frame with the clean / stemmed / lemmatized columns added.
codeup_df

Unnamed: 0,title,date_published,content,clean,stemmed,lemmatized
0,Black Excellence in Tech: Panelist Spotlight -...,"Feb 16, 2023",Codeup is hosting a Black Excellence in Tech ...,codeup is hosting a black excellence in tech ...,codeup is host a black excel in tech panel in ...,codeup is hosting a black excellence in tech p...
1,Black excellence in tech: Panelist Spotlight -...,"Feb 13, 2023",Codeup is hosting our second Black Excellence ...,codeup is hosting our second black excellence ...,codeup is host our second black excel in tech ...,codeup is hosting our second black excellence ...
2,Black excellence in tech: Panelist Spotlight -...,"Feb 10, 2023",Codeup is hosting our second Black Excellence ...,codeup is hosting our second black excellence ...,codeup is host our second black excel in tech ...,codeup is hosting our second black excellence ...
3,Black excellence in tech: Panelist Spotlight -...,"Feb 6, 2023",Codeup is hosting our second Black Excellence ...,codeup is hosting our second black excellence ...,codeup is host our second black excel in tech ...,codeup is hosting our second black excellence ...
4,Coding Bootcamp or Self-Learning? Which is Bes...,"Jan 20, 2023",If you’re interested in embarking on a career ...,if youre interested in embarking on a career i...,if your interest in embark on a career in tech...,if youre interested in embarking on a career i...
5,Codeup Among Top 58 Best Coding Bootcamps of 2...,"Jan 12, 2023",Codeup was highlighted for being an establishe...,codeup was highlighted for being an establishe...,codeup wa highlight for be an establish school...,codeup wa highlighted for being an established...
