# Case Conversion

In [1]:
text = 'The quick brown fox jumped over The Big Dog'
text

'The quick brown fox jumped over The Big Dog'

In [2]:
text.lower()

'the quick brown fox jumped over the big dog'

In [3]:
text.upper()

'THE QUICK BROWN FOX JUMPED OVER THE BIG DOG'

In [4]:
text.title()

'The Quick Brown Fox Jumped Over The Big Dog'

# Tokenization

In [5]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

"US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

In [6]:
import nltk

nltk.sent_tokenize(sample_text)

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [7]:
print(nltk.word_tokenize(sample_text))

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'Summit", "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


In [8]:
tt = nltk.toktok.ToktokTokenizer()
print(tt.tokenize(sample_text))

['US', 'unveils', 'world', "'", 's', 'most', 'powerful', 'supercomputer', ',', 'beats', 'China.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'", 's', 'most', 'powerful', 'supercomputer', 'called', "'", 'Summit', "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'", 's', 'Sunway', 'TaihuLight.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


In [9]:
import spacy
nlp = spacy.load('en', parse=False, tag=False, entity=False)

text_spacy = nlp(sample_text)

In [10]:
[obj.text for obj in text_spacy.sents]

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [11]:
print([obj.text for obj in text_spacy])

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'", 'Summit', "'", ',', 'beating', 'the', 'previous', 'record', '-', 'holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


# Removing HTML tags & noise

In [12]:
import requests

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops us from slicing into an HTTP error page by mistake.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
content = data.text
print(content[2745:3948])

id00010">Language: English</p>

<p id="id00011" style="margin-top: 2em">*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***</p>

<p id="id00012" style="margin-top: 4em">This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.</p>

<h1 id="id00013" style="margin-top: 5em">Book 01        Genesis</h1>

<p id="id00014">01:001:001 In the beginning God created the heaven and the earth.</p>

<p id="id00015" style="margin-left: 0%; margin-right: 0%">01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.</p>

<p id="id00016">01:001:003 And God said, Let there be light: and there was light.</p>

<p id="id00017">01:001:004 And God saw the light, that it was good: and God divided the<br/>

           light from the darkness.<br/>
</p>

<p id="id00018">01:001:005 And God ca

In [13]:
with open("pg8001.html", "r", encoding='utf-8') as f:
    content = f.read()
    
print(content[2745:3948])


<p id="id00011" style="margin-top: 2em">*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***</p>

<p id="id00012" style="margin-top: 4em">This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.</p>

<h1 id="id00013" style="margin-top: 5em">Book 01        Genesis</h1>

<p id="id00014">01:001:001 In the beginning God created the heaven and the earth.</p>

<p id="id00015" style="margin-left: 0%; margin-right: 0%">01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.</p>

<p id="id00016">01:001:003 And God said, Let there be light: and there was light.</p>

<p id="id00017">01:001:004 And God saw the light, that it was good: and God divided the<br>

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;light from the darkness.<br>
</p>

<p id="id00018"

In [14]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """Return the text content of an HTML document with line breaks collapsed.

    Args:
        text: HTML markup as a string.

    Returns:
        The document text with ``<iframe>`` and ``<script>`` elements removed
        and every run of carriage returns / line feeds replaced by a single
        ``'\\n'``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Remove elements whose contents should not appear in the extracted text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Inside a character class '|' is a literal pipe, not alternation, so the
    # class lists only the two line-break characters; otherwise every '|' in
    # the document would be turned into a newline.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***
This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.
Book 01        Genesis
01:001:001 In the beginning God created the heaven and the earth.
01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.



# Removing Accented Characters

In [15]:
import unicodedata

def remove_accented_chars(text):
    """Return ``text`` with accents stripped and non-ASCII characters dropped.

    The string is decomposed with NFKD so that base letters and combining
    marks become separate code points; encoding to ASCII with ``'ignore'``
    then discards everything that is not plain ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [16]:
s = 'Sómě Áccěntěd těxt'
s

'Sómě Áccěntěd těxt'

In [17]:
remove_accented_chars(s)

'Some Accented text'

# Removing Special Characters, Numbers and Symbols

In [18]:
import re

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When true, digits are deleted as well.

    Returns:
        The cleaned string; whitespace is left untouched.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)


In [19]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

'Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂'

### Your Turn: Try both combinations: first remove all special characters including the digits, then remove the special characters but keep the digits

In [20]:
remove_special_characters(s, remove_digits=True)

'Well this was fun See you at  What do you think  '

In [21]:
remove_special_characters(s)

'Well this was fun See you at 730 What do you think 9318 '

# Expanding Contractions

In [22]:
!pip install contractions



thinc 6.10.2 requires pathlib<2.0.0,>=1.0.0, which is not installed.
spacy 2.0.11 requires pathlib, which is not installed.
smart-open 1.7.1 requires bz2file, which is not installed.
distributed 1.21.8 requires msgpack, which is not installed.
spacy 2.0.11 has requirement regex==2017.4.5, but you'll have regex 2017.11.9 which is incompatible.
skater 1.1.1b5 has requirement scikit-image==0.14, but you'll have scikit-image 0.13.1 which is incompatible.


In [23]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [24]:
import contractions

list(contractions.contractions_dict.items())[:10]

[("ain't", 'are not'),
 ("aren't", 'are not'),
 ("can't", 'cannot'),
 ("can't've", 'cannot have'),
 ("'cause", 'because'),
 ("could've", 'could have'),
 ("couldn't", 'could not'),
 ("couldn't've", 'could not have'),
 ("didn't", 'did not'),
 ("doesn't", 'does not')]

In [25]:
contractions.fix(s)

'you all cannot expand contractions I would think! You would not be able to. how did you do it?'

In [26]:
from contractions_list import CONTRACTION_MAP
import re

list(CONTRACTION_MAP.items())[:10]

[("ain't", 'is not'),
 ("aren't", 'are not'),
 ("can't", 'cannot'),
 ("can't've", 'cannot have'),
 ("'cause", 'because'),
 ("could've", 'could have'),
 ("couldn't", 'could not'),
 ("couldn't've", 'could not have'),
 ("didn't", 'did not'),
 ("doesn't", 'does not')]

In [27]:
contractions_pattern = re.compile('({})'.format('|'.join(CONTRACTION_MAP.keys())), 
                                  flags=re.IGNORECASE|re.DOTALL)
contractions_pattern

re.compile(r"(ain't|aren't|can't|can't've|'cause|could've|couldn't|couldn't've|didn't|doesn't|don't|hadn't|hadn't've|hasn't|haven't|he'd|he'd've|he'll|he'll've|he's|how'd|how'd'y|how'll|how's|I'd|I'd've|I'll|I'll've|I'm|I've|i'd|i'd've|i'll|i'll've|i'm|i've|isn't|it'd|it'd've|it'll|it'll've|it's|let's|ma'am|mayn't|might've|mightn't|mightn't've|must've|mustn't|mustn't've|needn't|needn't've|o'clock|oughtn't|oughtn't've|shan't|sha'n't|shan't've|she'd|she'd've|she'll|she'll've|she's|should've|shouldn't|shouldn't've|so've|so's|that'd|that'd've|that's|there'd|there'd've|there's|they'd|they'd've|they'll|they'll've|they're|they've|to've|wasn't|we'd|we'd've|we'll|we'll've|we're|we've|weren't|what'll|what'll've|what're|what's|what've|when's|when've|where'd|where's|where've|who'll|who'll've|who's|who've|why's|why've|will've|won't|won't've|would've|wouldn't|wouldn't've|y'all|y'all'd|y'all'd've|y'all're|y'all've|you'd|you'd've|you'll|you'll've|you're|you've)",
re.IGNORECASE|re.DOTALL|re.UNICODE)

In [28]:
contractions_pattern.sub(lambda r: print(r.group(0), '->', r.group(0).lower(), 
                                         '->', CONTRACTION_MAP.get(r.group(0).lower())), s)

Y'all -> y'all -> you all
can't -> can't -> cannot
I'd -> i'd -> i would
wouldn't -> wouldn't -> would not
How'd -> how'd -> how did


'  expand contractions  think! You  be able to.  you do it?'

In [29]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction (e.g. ``"can't"``) to
            its expansion (e.g. ``"cannot"``). Matching is case-insensitive.

    Returns:
        ``text`` with every known contraction expanded and all remaining
        apostrophes removed. If the contraction starts with an upper-case
        letter, the expansion is capitalised as well.
    """
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first or "can't've" would be consumed as just "can't".
    # Empty keys are skipped because an empty alternative matches everywhere.
    keys_longest_first = sorted((key for key in contraction_mapping if key),
                                key=len, reverse=True)

    if keys_longest_first:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys_longest_first)),
            flags=re.IGNORECASE)
        # Fallback for matches whose casing differs from every key as written.
        lowercase_mapping = {key.lower(): value
                             for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowercase_mapping.get(match.lower()))
            if not expanded_contraction:
                # Nothing usable to substitute; leave the text as it was.
                return match
            # Carry over only the *case* of the first character. Copying the
            # character itself corrupts expansions that start with a different
            # letter ("ain't" -> "as not", "'cause" -> "ecause").
            if match[0].isupper():
                expanded_contraction = (expanded_contraction[0].upper()
                                        + expanded_contraction[1:])
            return expanded_contraction

        text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (possessives, unknown contractions).
    expanded_text = text.replace("'", "")
    return expanded_text

In [30]:
expand_contractions(s, contraction_mapping=CONTRACTION_MAP)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

# Stemming

In [31]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [32]:
ps.stem('lying')

'lie'

In [33]:
ps.stem('strange')

'strang'

### Your Turn: Try the Lancaster stemmer on the same words used with the Porter stemmer above

In [34]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

('jump', 'jump', 'jump')

In [35]:
ls.stem('lying')

'lying'

In [36]:
ls.stem('strange')

'strange'

In [37]:
import nltk
ps = nltk.porter.PorterStemmer()
ls = nltk.stem.LancasterStemmer()

def simple_stemming(text, stemmer=ps):
    """Stem each whitespace-separated word of ``text`` and rejoin with single spaces.

    ``stemmer`` is any object exposing a ``stem(word)`` method.
    """
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

### Your Turn: Try calling the above defined function for both Lancaster and Porter stemmer separately

Do you notice any difference in the results?

In [38]:
s = "My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying"
s

'My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying'

In [39]:
simple_stemming(s, stemmer=ps)

'My system keep crash hi crash yesterday our crash daili and presum we are not lie'

In [40]:
simple_stemming(s, stemmer=ls)

'my system keep crash his crash yesterday our crash dai and presum we ar not lying'

# Lemmatization

In [1]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [42]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word, pos='n') method of nltk.stem.wordnet.WordNetLemmatizer instance



In [43]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

car
box


In [44]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

run
eat


In [45]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy


In [46]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))
print(wnl.lemmatize('fancier'))

ate
fancier
fancier


In [47]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [48]:
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [49]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

In [50]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


In [51]:
# Penn Treebank tags such as 'DT' are not WordNet POS values, so passing them
# straight to the lemmatizer fails. Catch and show the error so the notebook
# still runs from top to bottom.
try:
    lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in tagged_tokens)
except KeyError as err:
    print('KeyError:', err)
else:
    print(lemmatized_text)

KeyError: 'DT'

In [52]:
from nltk.corpus import wordnet

wordnet.ADJ

'a'

In [53]:
def pos_tag_wordnet(tagged_tokens):
    """Convert ``(word, tag)`` pairs into ``(word, WordNet POS)`` pairs.

    The lower-cased first letter of each tag selects the WordNet constant
    (j -> adjective, v -> verb, n -> noun, r -> adverb); any other tag falls
    back to noun.
    """
    initial_to_wordnet = {
        'j': wordnet.ADJ,
        'v': wordnet.VERB,
        'n': wordnet.NOUN,
        'r': wordnet.ADV,
    }
    converted_tokens = []
    for word, tag in tagged_tokens:
        wordnet_pos = initial_to_wordnet.get(tag[0].lower(), wordnet.NOUN)
        converted_tokens.append((word, wordnet_pos))
    return converted_tokens

In [54]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

[('The', 'n'), ('brown', 'a'), ('foxes', 'n'), ('are', 'v'), ('quick', 'a'), ('and', 'n'), ('they', 'n'), ('are', 'v'), ('jumping', 'v'), ('over', 'n'), ('the', 'n'), ('sleeping', 'v'), ('lazy', 'a'), ('dogs', 'n'), ('!', 'n')]


In [55]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
lemmatized_text

'The brown fox be quick and they be jump over the sleep lazy dog !'

### Your turn: Define a function such that you put all the above steps together so that it does the following

- Function name is __`wordnet_lemmatize_text(...)`__
- Input is a variable __`text`__ which should take in a document (bunch of words)
- Call the earlier defined functions and utilize them
- Return lemmatized text as the output (as a string)

In [56]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    """Tokenize, POS-tag and lemmatize ``text`` with WordNet.

    Returns the lemmas joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    # Map the tagger's tags onto WordNet POS values before lemmatizing.
    wordnet_tokens = pos_tag_wordnet(nltk.pos_tag(tokens))
    lemmas = [wnl.lemmatize(word, tag) for word, tag in wordnet_tokens]
    return ' '.join(lemmas)

### Your Turn: Now call the function on the below sentence and test it

In [57]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [58]:
wordnet_lemmatize_text(s)

'The brown fox be quick and they be jump over the sleep lazy dog !'

In [59]:
import spacy
nlp = spacy.load('en', parse=False, tag=False, entity=False)

def spacy_lemmatize_text(text):
    """Lemmatize ``text`` with the loaded spaCy pipeline ``nlp``.

    Returns the lemmas joined by single spaces. Tokens whose lemma is the
    placeholder ``'-PRON-'`` keep their original text.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            # Keep the surface form rather than emitting the placeholder.
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [60]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [61]:
spacy_lemmatize_text(s)

'the brown fox be quick and they be jump over the sleep lazy dog !'

# Stopword Removal

In [62]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Remove stop words from ``text``.

    Args:
        text: The string to filter.
        is_lower_case: Set to True when ``text`` is already lower-cased, so
            tokens are compared as they are. Otherwise each token is
            lower-cased for the comparison only; its original casing is kept
            in the output.
        stopwords: Iterable of stop words. ``None`` selects NLTK's English
            list; an explicitly empty collection means "remove nothing".

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Test for None specifically: a plain truthiness check would silently
    # replace an intentionally empty stop-word list with the default one.
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership checks for every token.
    stopword_set = set(stopwords)

    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(filtered_tokens)

In [63]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [64]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [65]:
remove_stopwords(s, is_lower_case=False)

'brown foxes quick jumping sleeping lazy dogs !'

### Your turn: Remove the word 'the' from the stop_words list, add the word 'brown' to it, and call the function with this new list

In [66]:
# Rebuild the list from a fresh copy of NLTK's English stop words instead of
# mutating it in place, so re-running this cell always gives the same result
# (a second in-place remove('the') would raise ValueError).
stop_words = [word for word in nltk.corpus.stopwords.words('english') if word != 'the']
stop_words.append('brown')

In [67]:
remove_stopwords(s, is_lower_case=False, stopwords=stop_words)

'The foxes quick jumping the sleeping lazy dogs !'