In [74]:
import unicodedata
import re
import json

import pandas as pd

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import sent_tokenize


### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    Lowercase everything
    Normalize unicode characters
    Remove anything that is not a letter, number, whitespace, or a single quote (i.e. replace it with an empty string).

In [42]:
# Sample sentence used to demonstrate each preparation step; it mixes accented
# letters, quotes and punctuation. Adjacent literals are concatenated into one string.
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [43]:
def basic_clean(string):
    """Lowercase, ASCII-fold and strip punctuation from ``string``.

    Steps, in order:
      1. lowercase the text;
      2. NFKD-normalize it and drop every character that cannot be encoded
         as ASCII (so accented letters keep only their base letter);
      3. delete every character that is not a lowercase letter, a digit,
         a single quote or whitespace.

    Returns the cleaned string.
    """
    lowered = string.lower()

    # Decompose accented characters, then discard the non-ASCII pieces.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')

    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [45]:
# Apply basic_clean to the sample text and display the result.
# Note: this rebinds `original`, so every later cell that reads `original`
# sees the cleaned text rather than the raw sentence.
original = basic_clean(original)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [39]:
def tokenize(string):
    """Tokenize ``string`` with NLTK's ToktokTokenizer.

    The tokenizer is asked for a string result (``return_str=True``), so the
    return value is a single string rather than a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str=True)

In [47]:
# Tokenize the cleaned text and display it.
# Note: `original` is rebound again; from here on it holds the tokenized string.
original = tokenize(original)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [40]:
def stem(string):
    """Return ``string`` with every whitespace-separated word Porter-stemmed.

    The text is split on whitespace, each word is passed through NLTK's
    PorterStemmer, and the stems are joined back together with single spaces.
    """
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in string.split())

In [49]:
# Stem the tokenized text and display it.
# Note: `original` now holds the stemmed text, which is what the later
# lemmatize and remove_stopwords cells receive.
original = stem(original)
original

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [56]:
# Download the NLTK 'wordnet' package for the lemmatization step that follows;
# NLTK reports "already up-to-date" when the package is present locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dulcechavez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [57]:
def lemmatize(string):
    """Return ``string`` with every whitespace-separated word lemmatized.

    Each word is passed to NLTK's WordNetLemmatizer with its default
    arguments, and the resulting lemmas are joined with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [58]:
# Lemmatize the text and display it.
# NOTE(review): at this point `original` is the already-stemmed text (it was
# rebound by the stem cell), so the lemmatizer is run on stems, not on the
# cleaned words. The recorded output reflects that (e.g. "as" became "a").
original = lemmatize(original)
original

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written a erdo or erdo either by mistak or out of typograph necess"

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
    This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [72]:
# Download the NLTK 'stopwords' package read by stopwords.words('english');
# NLTK reports "already up-to-date" when the package is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dulcechavez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
# English stopwords as a module-level list, kept for inspection.
# Note: remove_stopwords does not read this variable; it fetches its own list.
stopword_list = stopwords.words('english')

In [69]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """Return ``string`` with English stopwords removed.

    The text is split on whitespace, every token found in the stopword set is
    dropped, and the remaining tokens are joined with single spaces. Matching
    is exact, so the text is expected to be lowercased already.

    Parameters
    ----------
    string : str
        Text to filter.
    extra_words : iterable of str or str, optional
        Additional words to treat as stopwords. A plain string is split on
        whitespace into individual words.
    exclude_words : iterable of str or str, optional
        Words that must NOT be removed even though they are stopwords.
        Applied after ``extra_words``, so a word listed in both is kept.

    Returns
    -------
    str
        The filtered text.
    """
    def _as_word_list(words):
        # Accept None, a single string, or any iterable of strings.
        if words is None:
            return []
        if isinstance(words, str):
            return words.split()
        return list(words)

    # A set gives constant-time membership checks instead of scanning the
    # stopword list once per token.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(_as_word_list(extra_words))
    stopword_set.difference_update(_as_word_list(exclude_words))

    kept_words = [word for word in string.split() if word not in stopword_set]
    return ' '.join(kept_words)

In [70]:
# Drop English stopwords from the text and display what is left.
# Note: `original` is rebound once more and now holds the fully processed text.
original = remove_stopwords(original)
original

"paul erdo georg polya influenti hungarian mathematician contribut lot field erdo ' name contain hungarian letter ' ' ' ' doubl acut accent often incorrectli written erdo erdo either mistak typograph necess"

### Use your data from the acquire exercises to produce a dataframe of the news articles. Name the dataframe news_df.

In [76]:
import json

# Load the saved Inshorts articles from disk.
# The encoding is given explicitly because the file contains non-ASCII text
# (e.g. the rupee sign in the titles) and open() otherwise falls back to a
# platform-dependent default encoding.
with open('inshorts-2021-10-28.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [77]:
# Display the parsed JSON to inspect its structure.
# NOTE(review): this echoes the entire dict into the cell output; a small
# slice would be enough to see the layout.
data

{'title': {'0': "Facebook changes its company name to 'Meta'",
  '1': "'Man who takes 6 months parental leave is a loser,' says Palantir Co-founder",
  '2': 'Delhi HC notice to RBI, SBI over banning UPI payments in crypto exchanges',
  '3': 'Indian market has 3 key beauties: Paytm CEO ahead of ₹18,300-crore IPO',
  '4': 'Paytm will not force employees to come to office: CEO',
  '5': "Who are the top 10 new entrants on Hurun India's philanthropy list?",
  '6': 'Legacy companies eat Ola, Ather, Tork & SmartE (OATS) for breakfast: Rajiv Bajaj',
  '7': 'Shaktikanta Das reappointed as RBI Governor for three years',
  '8': 'Govt asks IRCTC to share 50% of service charge from e-ticketing',
  '9': 'Investors lose ₹4.82 lakh crore in a day as Sensex sees the worst fall in 6 months',
  '10': 'Sensex crashes 1,159 points to end below 60,000 in worst fall in 6 months',
  '11': 'Aditya Ghosh pens note to father upon completing Management Program from Harvard',
  '12': "Who are India's top philanthr

### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

### For each dataframe, produce the following columns:
    - title to hold the title
    - original to hold the original article/post content
    - clean to hold the normalized and tokenized original with the stopwords removed.
    - stemmed to hold the stemmed version of the cleaned data.
    - lemmatized to hold the lemmatized version of the cleaned data.

# Ask yourself:

    If your string is 493KB, would you prefer to use stemmed or lemmatized text?
    If your string is 25MB, would you prefer to use stemmed or lemmatized text?
    If your string is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?