<a href="https://colab.research.google.com/github/krystal826/Natural-Language-Processing/blob/main/Lab03_Task02_TextNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab03 Task 02 instructions: 

# 1. The Python code in this notebook explores text normalization: removing special characters, removing stopwords, stemming and lemmatization. 

# 2. Run all codes, see the output issued and understand how word normalization is done using Python programming.

# 3. If you get an error about an "undefined name", it means a particular package isn't downloaded yet. Please execute one of the following commands:

# nltk.download() - this may take quite a long time to execute, else
# nltk.download("package-name") - just download the particular package

In [None]:
import nltk
import re
import string
from pprint import pprint

In [None]:
# Sample sentences mixing contractions, a currency amount, punctuation and
# markup-like noise; the cells below normalize them step by step.
corpus = [
    "The brown fox wasn't that quick and he couldn't win the race",
    "Hey that's a great deal! I just bought a phone for $199",
    "@@You'll (learn) a **lot** in the book. Python is an amazing language !@@",
]

In [None]:
# Show the raw corpus before any normalization is applied.
print(corpus)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for $199", "@@You'll (learn) a **lot** in the book. Python is an amazing language !@@"]


# Case conversion

In [None]:
# Print the first sentence fully lower-cased, then fully upper-cased.
# str.lower()/str.upper() return new strings, so corpus[0] itself is not modified.
print(corpus[0].lower())
print(corpus[0].upper())

the brown fox wasn't that quick and he couldn't win the race
THE BROWN FOX WASN'T THAT QUICK AND HE COULDN'T WIN THE RACE


# Removing special characters before tokenization

In [None]:
def remove_characters_before_tokenization(sentence,
                                          keep_apostrophes=False):
    """Strip unwanted characters from a sentence before it is tokenized.

    Leading and trailing whitespace is trimmed first, then characters are
    deleted according to the selected mode.

    Args:
        sentence: The input string.
        keep_apostrophes: When True, only a fixed set of symbols is removed
            (``? | $ & * % @ ( ) ~``), so apostrophes and other punctuation
            survive. Note the ``|`` characters sit inside a character class,
            where they are literal, so pipes are removed as well. When False
            (the default), every character that is not an ASCII letter, a
            digit or a space is removed.

    Returns:
        The filtered string.
    """
    if keep_apostrophes:
        # Add other characters inside the brackets to remove them too.
        unwanted = r'[?|$|&|*|%|@|(|)|~]'
    else:
        # Anything outside ASCII alphanumerics and the space character.
        unwanted = r'[^a-zA-Z0-9 ]'
    return re.sub(unwanted, r'', sentence.strip())

In [None]:
# Strip the noisy symbols from every sentence but keep apostrophes, so the
# contractions are still intact for the expansion step further down.
cleaned_corpus = [
    remove_characters_before_tokenization(sentence, keep_apostrophes=True)
    for sentence in corpus
]

In [None]:
# Show the corpus after symbol removal (apostrophes were kept).
print(cleaned_corpus)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing language !"]


In [None]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The input string.
        remove_digits: When True, digits are removed as well, leaving only
            ASCII letters and whitespace.

    Returns:
        The filtered string. Whitespace is preserved as-is, so removing a
        trailing run of symbols can leave trailing spaces behind.
    """
    # The upper-case range must end at 'Z'. The previous 'A-z' range also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # special characters were silently kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# Demo: show a raw sentence, then the same sentence with both special
# characters and digits removed (remove_digits=True).
raw = "Well this was fun! What do you think? 123#@!"
print("\nRaw: ", raw)
print("Remove special chatacters: ", remove_special_characters(raw, remove_digits=True))


Raw:  Well this was fun! What do you think? 123#@!
Remove special chatacters:  Well this was fun What do you think 


# Removing accented characters

In [None]:
import unicodedata

In [None]:
def remove_accented_chars(text):
    """Replace accented characters with their unaccented ASCII base letters.

    The text is NFKD-normalized, which splits an accented character into its
    base character plus combining marks. Encoding to ASCII with errors ignored
    then drops the marks, along with any other character that has no ASCII
    form.

    Args:
        text: The input string.

    Returns:
        An ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
# Demo: print the accented sample, then the result of remove_accented_chars on it.
print("\nSómě Áccěntěd těxt")
print("After removing accěntěd těxt: ")
print(remove_accented_chars('Sómě Áccěntěd těxt'))


Sómě Áccěntěd těxt
After removing accěntěd těxt: 
Some Accented text


# Removing HTML tags

In [None]:
from bs4 import BeautifulSoup
def strip_html_tags(text):
    """Return the text content of an HTML string with the markup removed.

    The string is parsed with BeautifulSoup using the "html.parser" backend
    and the result of ``get_text()`` on the parsed document is returned.

    Args:
        text: A string that may contain HTML tags.

    Returns:
        The extracted text.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Demo: print an HTML snippet, then the text strip_html_tags extracts from it.
print("\n<html><h2>Some important text</h2></html>")
print("Remove html tags:")
print(strip_html_tags('<html><h2>Some important text</h2></html>'))


<html><h2>Some important text</h2></html>
Remove html tags:
Some important text


# Expanding Contractions

In [None]:
from contractions import CONTRACTION_MAP # make sure copy contraction.py in the same folder as this file
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in `text` and then delete any remaining apostrophes.

    Matching is case-insensitive. The first character of each matched
    contraction is kept, so the capitalization of the input is preserved
    (e.g. "Y'all" -> "You all" when the map holds "y'all": "you all").

    NOTE: a later cell in this notebook defines `expand_contractions` again
    (with a non-raising fallback); after "Run All" that later definition is
    the one in effect.

    Args:
        text: The input string.
        contraction_mapping: Dict mapping a contraction to its expansion.

    Returns:
        The text with contractions expanded and all apostrophes removed.

    Raises:
        TypeError: If `text` is not a string (raised by the regex functions).
    """
    # Try longer keys first: regex alternation takes the first alternative
    # that matches, so a short key such as "he'd" would otherwise win over a
    # longer key that starts with it. Empty keys are skipped because they
    # would match the empty string at every position.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    # Lower-cased view of the map for case-insensitive lookups. A plain
    # `.get(match.lower())` on the original map misses keys that themselves
    # contain upper-case letters and used to yield None (then a TypeError).
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if not expanded_contraction:
            expanded_contraction = lowered_mapping.get(match.lower())
        if expanded_contraction is None:
            # No usable expansion: leave the matched text untouched.
            return match
        # Keep the original first character to preserve capitalization.
        return match[0] + expanded_contraction[1:]

    expanded_text = text
    if ordered_keys:
        # re.escape makes every key match literally, whatever it contains.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE)
        expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [None]:
from contractions import CONTRACTION_MAP # make sure copy contraction.py in the same folder as this file
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in `text` and then delete any remaining apostrophes.

    Matching is case-insensitive. The first character of each matched
    contraction is kept, so the capitalization of the input is preserved
    (e.g. "Y'all" -> "You all" when the map holds "y'all": "you all").

    This is a best-effort variant: if the substitution raises a TypeError
    (for example because `text` is not a string), `text` is returned
    unchanged instead of propagating the error.

    Args:
        text: The input string.
        contraction_mapping: Dict mapping a contraction to its expansion.

    Returns:
        The text with contractions expanded and all apostrophes removed, or
        `text` itself when it could not be processed.
    """
    # Try longer keys first: regex alternation takes the first alternative
    # that matches, so a short key such as "he'd" would otherwise win over a
    # longer key that starts with it. Empty keys are skipped because they
    # would match the empty string at every position.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    # Lower-cased view of the map for case-insensitive lookups. A plain
    # `.get(match.lower())` on the original map misses keys that themselves
    # contain upper-case letters; the resulting error used to be swallowed
    # and the whole text was returned unexpanded.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}
    contractions_pattern = None
    if ordered_keys:
        # re.escape makes every key match literally, whatever it contains.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if not expanded_contraction:
            expanded_contraction = lowered_mapping.get(match.lower())
        if expanded_contraction is None:
            # No usable expansion: leave the matched text untouched.
            return match
        # Keep the original first character to preserve capitalization.
        return match[0] + expanded_contraction[1:]

    try:
        expanded_text = text
        if contractions_pattern is not None:
            expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
    except TypeError:
        # Deliberate best-effort fallback for input that cannot be processed.
        # Narrowed from a bare `except:` so unrelated failures (and
        # KeyboardInterrupt) are no longer hidden.
        return text
    return expanded_text

In [None]:
# Demo: print a sentence with three contractions, then its expanded form.
print("\nY'all can't expand contractions I'd think")
print("Expand contractions:")
print(expand_contractions("Y'all can't expand contractions I'd think"))


Y'all can't expand contractions I'd think
Expand contractions:
You all cannot expand contractions I would think


In [None]:
# Expand the contractions in the symbol-stripped corpus, then print the corpus
# before and after expansion for comparison.
cleaned_corpus2 = [expand_contractions(sentence) for sentence in cleaned_corpus]
print(cleaned_corpus)
print(cleaned_corpus2)

["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for 199", "You'll learn a lot in the book. Python is an amazing language !"]
['The brown fox was not that quick and he could not win the race', 'Hey that is a great deal! I just bought a phone for 199', 'You will learn a lot in the book. Python is an amazing language !']


# Removing Stopwords

In [None]:
import nltk
# Fetch the NLTK stop-word corpus (the recorded output shows it being unzipped under nltk_data).
nltk.download('stopwords')
from nltk.tokenize.toktok import ToktokTokenizer
# Module-level tokenizer and English stop-word list; remove_stopwords below reads both as globals.
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# Optional: uncomment the two lines below to stop treating the negations 'no' and 'not' as stop words.
#stopword_list.remove('no')
#stopword_list.remove('not')
def remove_stopwords(text, is_lower_case=False):
    """Remove stop words from `text` and return the remaining tokens joined by spaces.

    The text is split with the module-level `tokenizer`, each token is
    stripped of surrounding whitespace, and tokens found in the module-level
    `stopword_list` are dropped.

    Args:
        text: The input string.
        is_lower_case: When True, tokens are compared against the stop-word
            list exactly as they appear. When False (the default), each token
            is lower-cased for the comparison only; surviving tokens keep
            their original casing.

    Returns:
        The surviving tokens joined with a single space.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        # Lower-case only the comparison key, never the token that is kept.
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Drop stop words from the contraction-expanded corpus, then print the corpus
# before and after removal.
cleaned_corpus3 = [
    remove_stopwords(sentence, is_lower_case=False)
    for sentence in cleaned_corpus2
]
print(cleaned_corpus2)
print("\n")
print("Remove stopwords: \n", cleaned_corpus3)

['The brown fox was not that quick and he could not win the race', 'Hey that is a great deal! I just bought a phone for 199', 'You will learn a lot in the book. Python is an amazing language !']


Remove stopwords: 
 ['brown fox quick could win race', 'Hey great deal ! bought phone 199', 'learn lot book. Python amazing language !']


In [None]:
# Demo: print a sentence, then what remains after stop-word removal.
# In the recorded output the commas survive as separate tokens.
print("\nThe, and, if are stopwords, computer is not")
print("Remove stopwords:")
print(remove_stopwords("The, and, if are stopwords, computer is not"))


The, and, if are stopwords, computer is not
Remove stopwords:
, , stopwords , computer


In [None]:
from nltk.corpus import stopwords
# Bare last expression: the notebook displays the English stop words as a set.
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

# Word Stemmer: Porter Stemmer

In [None]:
# Stemming using Porter Stemmer
from nltk.stem import PorterStemmer

p_stemmer = PorterStemmer()

words = ['jumps', 'jumping', 'jumped', 'easily', 'fairly']

for word in words:
    print(f'{word} --> {p_stemmer.stem(word)}')  # original word next to its Porter stem

jumps --> jump
jumping --> jump
jumped --> jump
easily --> easili
fairly --> fairli


# Word Stemmer: Lancaster Stemmer
# Based on Porter Stemmer code above, complete the following code segment (add the respective code in line 9 and 10 below)

In [None]:
# Stemming using Lancaster Stemmer
from nltk.stem import LancasterStemmer

l_stemmer = LancasterStemmer()

words = ['jumps', 'jumping', 'jumped', 'easily', 'fairly']

# Refer to line 8 and 9 in Porter Stemmer, copy the code and adjust them to Lancaster Stemmer
for word in words:
    print(f'{word} --> {l_stemmer.stem(word)}')  # original word next to its Lancaster stem


jumps --> jump
jumping --> jump
jumped --> jump
easily --> easy
fairly --> fair


# Word Stemmer: Snowball Stemmer
# Based on Porter Stemmer code, complete the following code segment (add the respective code in line 10 and 11 below)

In [None]:
# Stemming using Snowball Stemmer
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

words = ['jumps', 'jumping', 'jumped', 'easily', 'fairly']

# Refer to line 8 and 9 in Porter Stemmer, copy the code and adjust them to Snowball Stemmer
for word in words:
    print(f'{word} --> {s_stemmer.stem(word)}')  # original word next to its Snowball stem


jumps --> jump
jumping --> jump
jumped --> jump
easily --> easili
fairly --> fair


# Word Stemmer: Regex Stemmer

In [None]:
# Stemming using Regex based Stemmer
from nltk.stem import RegexpStemmer

# The pattern lists the suffixes to strip: a trailing "ing", "s" or "ed".
re_stemmer = RegexpStemmer('ing$|s$|ed$', min=4)

words = ['jumps', 'jumping', 'jumped', 'easily', 'fairly']

for word in words:
    print(f'{word} --> {re_stemmer.stem(word)}')  # original word next to its regex-based stem

jumps --> jump
jumping --> jump
jumped --> jump
easily --> easily
fairly --> fairly


# Lemmatization Using NLTK tool

In [None]:
import nltk
# Resources needed by this cell: 'punkt' for nltk.word_tokenize and 'wordnet'
# for the lemmatizer (assumption based on the calls below; confirm against the
# installed NLTK version).
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
# lemmatize() is called without a part-of-speech argument. In the recorded
# output "studying" comes back unchanged, presumably because words are treated
# as nouns by default (see the WordNetLemmatizer documentation to confirm).
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


# Lemmatization Using Spacy

In [None]:
import spacy

def show_lemmas(text):
    """Print a fixed-width table of word, part of speech and lemma.

    Args:
        text: An iterable of token-like objects that expose `text`, `pos_`
            and `lemma_` attributes (the cell below passes the result of
            calling the spaCy pipeline on a sentence).

    Returns:
        None. A two-line header is printed first, then one row per token with
        the word padded to 12 columns and the POS tag padded to 6.
    """
    print("Word         POS     Lemma")
    print("--------------------------")
    for token in text:
        row = f'{token.text:12} {token.pos_:6} {token.lemma_}'
        print(row)
        
# Load the small English spaCy pipeline (assumes the 'en_core_web_sm' model is
# installed in this environment), run it on one sentence and print the table.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"He saw eighteen mice today!")
show_lemmas(doc)

Word         POS     Lemma
--------------------------
He           PRON   -PRON-
saw          VERB   see
eighteen     NUM    eighteen
mice         NOUN   mouse
today        NOUN   today
!            PUNCT  !
