# Case Conversion

In [None]:
text = 'The quick brown fox jumped over The Big Dog'
text

In [None]:
text.lower()

In [None]:
text.upper()

In [None]:
text.title()

# Tokenization

In [None]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

In [None]:
import nltk

nltk.sent_tokenize(sample_text)

In [None]:
print(nltk.word_tokenize(sample_text))

In [None]:
tt = nltk.toktok.ToktokTokenizer()
print(tt.tokenize(sample_text))

In [None]:
import spacy
# Load the English pipeline and run it over the sample text.
# NOTE(review): the 'en' shortcut name and the parse/tag/entity keywords look
# like legacy spaCy usage — confirm they are accepted by the installed spaCy
# version before relying on this cell.
nlp = spacy.load('en', parse=False, tag=False, entity=False)

text_spacy = nlp(sample_text)

In [None]:
[obj.text for obj in text_spacy.sents]

In [None]:
print([obj.text for obj in text_spacy])

# Removing HTML tags & noise

In [None]:
import requests

# Download the HTML page. The timeout keeps the cell from hanging forever if
# the server stalls, and raise_for_status() surfaces HTTP errors instead of
# silently slicing an error page below.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html',
                    timeout=30)
data.raise_for_status()
content = data.text
print(content[2745:3948])

In [None]:
# Read the page from a local file instead of the network.
# presumably "pg8001.html" is a saved copy of the page requested above — verify.
with open("pg8001.html", "r", encoding='utf-8') as f:
    content = f.read()
    
# Show the same slice of the raw HTML as the download cell prints.
print(content[2745:3948])

In [None]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """Return the text content of an HTML document with line breaks collapsed.

    Args:
        text: HTML markup as a string.

    Returns:
        The document text with ``iframe`` and ``script`` elements removed and
        every run of carriage-return / line-feed characters replaced by a
        single newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach embedded frames and scripts so their contents are not part of
    # the extracted text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Only CR and LF belong in the class: inside [...] a '|' is a literal
    # pipe, not alternation, so including it would turn pipes into newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

# Removing Accented Characters

In [None]:
import unicodedata

def remove_accented_chars(text):
    """Return ``text`` with accents stripped and non-ASCII characters dropped.

    The string is NFKD-normalized so accented letters split into a base
    letter plus combining marks; encoding to ASCII with errors ignored then
    discards the marks and any other non-ASCII code points.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
s = 'Sómě Áccěntěd těxt'
s

In [None]:
remove_accented_chars(s)

# Removing Special Characters, Numbers and Symbols

In [None]:
import re

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When truthy, digits are deleted as well.

    Returns:
        The filtered string; surrounding whitespace is left untouched.
    """
    if remove_digits:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.sub(unwanted, '', text)


In [None]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

### Your Turn: Try both combinations: first remove all special characters along with the digits, then remove the special characters but keep the digits

In [None]:
remove_special_characters(____________)

In [None]:
remove_special_characters(____________)

# Expanding Contractions

In [None]:
!pip install contractions

In [None]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

In [None]:
import contractions

list(contractions.contractions_dict.items())[:10]

In [None]:
contractions.fix(s)

In [None]:
from contractions_list import CONTRACTION_MAP
import re

list(CONTRACTION_MAP.items())[:10]

In [None]:
# Build one alternation over all contraction keys. Keys are regex-escaped so
# they match literally, and tried longest-first so a short key cannot shadow
# a longer key that starts with it.
contractions_pattern = re.compile(
    '({})'.format('|'.join(re.escape(key)
                           for key in sorted(CONTRACTION_MAP, key=len, reverse=True))),
    flags=re.IGNORECASE|re.DOTALL)
contractions_pattern

In [None]:
# Demonstration only: the callback prints each matched contraction, its
# lowercase form and the expansion looked up in CONTRACTION_MAP. It returns
# print()'s None rather than a replacement string.
contractions_pattern.sub(lambda r: print(r.group(0), '->', r.group(0).lower(), 
                                         '->', CONTRACTION_MAP.get(r.group(0).lower())), s)

In [None]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and remove any remaining apostrophes.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction to its expanded form.
            A match is looked up exactly first, then case-insensitively.

    Returns:
        ``text`` with every matched contraction replaced by its expansion
        (the first character of the original match is kept, so its casing
        is preserved) and all leftover ``'`` characters removed.
    """
    # Longest keys first so a short key cannot shadow a longer key that
    # starts with it; empty keys are skipped because they would match
    # everywhere.
    keys = sorted((key for key in contraction_mapping if key),
                  key=len, reverse=True)
    # Case-insensitive fallback table, so a mapping with mixed-case keys
    # still resolves matches found through re.IGNORECASE.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text unchanged.
            return match
        # Keep the original first character to preserve capitalization.
        return match[0] + expanded_contraction[1:]

    expanded_text = text
    if keys:
        # Keys are escaped so they are matched literally.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE)
        expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text.replace("'", "")

In [None]:
expand_contractions(s, contraction_mapping=CONTRACTION_MAP)

# Stemming

In [None]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

In [None]:
ps.stem('lying')

In [None]:
ps.stem('strange')

### Your Turn: Try the Lancaster stemmer on the same two words used previously ('lying' and 'strange')

In [None]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

In [None]:
_____________

In [None]:
____________

In [None]:
import nltk
ps = nltk.porter.PorterStemmer()
ls = nltk.stem.LancasterStemmer()

def simple_stemming(text, stemmer=ps):
    """Stem each whitespace-separated word of ``text``.

    Args:
        text: Input string; it is split on whitespace.
        stemmer: Object whose ``stem(word)`` method is applied to every word.

    Returns:
        The stemmed words joined by single spaces.
    """
    words = text.split()
    stemmed_words = (stemmer.stem(word) for word in words)
    return ' '.join(stemmed_words)

### Your Turn: Call the function defined above with the Lancaster stemmer and with the Porter stemmer separately

Do you notice any difference in the results?

In [None]:
s = "My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying"
s

In [None]:
simple_stemming(_________)

In [None]:
simple_stemming(_________)

# Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [None]:
help(wnl.lemmatize)

In [None]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

In [None]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

In [None]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

In [None]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))
print(wnl.lemmatize('fancier'))

In [None]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [None]:
tokens = nltk.word_tokenize(s)
print(tokens)

In [None]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

In [None]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

In [None]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in tagged_tokens)
lemmatized_text

In [None]:
from nltk.corpus import wordnet

wordnet.ADJ

In [None]:
def pos_tag_wordnet(tagged_tokens):
    """Convert ``(word, tag)`` pairs into ``(word, WordNet POS)`` pairs.

    The lowercased first character of each tag selects the WordNet constant
    ('j' adjective, 'v' verb, 'n' noun, 'r' adverb); any other tag falls
    back to ``wordnet.NOUN``.
    """
    first_letter_to_pos = {
        'j': wordnet.ADJ,
        'v': wordnet.VERB,
        'n': wordnet.NOUN,
        'r': wordnet.ADV,
    }
    converted = []
    for word, tag in tagged_tokens:
        initial = tag[0].lower()
        converted.append((word, first_letter_to_pos.get(initial, wordnet.NOUN)))
    return converted

In [None]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

In [None]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
lemmatized_text

### Your Turn: Define a function that puts all the above steps together and does the following

- Function name is __`wordnet_lemmatize_text(...)`__
- Input is a variable __`text`__ which should take in a document (bunch of words)
- Call the earlier defined functions and utilize them
- Return lemmatized text as the output (as a string)

In [None]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    """Return the lemmatized form of ``text`` as a string (exercise stub)."""
    # Exercise placeholder: replace the blank below with code that builds
    # `lemmatized_text` from `text`.
    ______
    return lemmatized_text

### Your Turn: Now call the function on the below sentence and test it

In [None]:
s

In [None]:
wordnet_lemmatize_text(s)

In [None]:
import spacy
# NOTE(review): the 'en' shortcut name and the parse/tag/entity keywords look
# like legacy spaCy usage — confirm they are accepted by the installed spaCy
# version before relying on this cell.
nlp = spacy.load('en', parse=False, tag=False, entity=False)

def spacy_lemmatize_text(text):
    """Lemmatize ``text`` with the module-level ``nlp`` pipeline.

    Each token is replaced by its ``lemma_``, except tokens whose lemma is
    the placeholder '-PRON-', which keep their original text. The results
    are joined with single spaces.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            pieces.append(token.lemma_)
        else:
            pieces.append(token.text)
    return ' '.join(pieces)

In [None]:
s

In [None]:
spacy_lemmatize_text(s)

# Stopword Removal

In [None]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Remove stopwords from ``text``.

    Args:
        text: Input string; it is tokenized with ``nltk.word_tokenize``.
        is_lower_case: When true, tokens are compared to the stopwords as-is
            (the caller asserts the text is already lowercase); otherwise
            each token is lowercased for the comparison only.
        stopwords: Iterable of stopwords. ``None`` loads NLTK's English list;
            an explicitly passed empty collection is honored and removes
            nothing.

    Returns:
        The remaining tokens, in their original casing, joined by single
        spaces.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # Build the set once so each membership test is O(1), not a list scan.
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        filtered_tokens = [token for token in tokens
                           if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens
                           if token.lower() not in stopword_set]

    return ' '.join(filtered_tokens)

In [None]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

In [None]:
s

In [None]:
remove_stopwords(s, is_lower_case=False)

### Your turn: Remove the words 'the' and 'brown' from the stop_words list and call the function with this new list

In [None]:
_______________
_______________

In [None]:
remove_stopwords(______________)