# Exploring Tokenization

@Acknowledgement: Aman Kedia

In [None]:
import nltk

In [None]:
# Sample words (plurals, past tenses, gerunds, derived forms) used to
# compare the stemmers below.
plurals = [
    'caresses', 'flies', 'dies', 'mules',
    'died', 'agreed', 'owned', 'humbled', 'sized',
    'meeting', 'stating', 'siezing',
    'itemization', 'traditional', 'reference', 'colonizer',
    'plotted', 'having', 'generously',
]

# Porter Stemmer

In [None]:
from nltk.stem.porter import PorterStemmer

# Stem every sample word with the Porter stemmer and print the stems
# on a single space-separated line.
stemmer = PorterStemmer()
singles = list(map(stemmer.stem, plurals))
print(' '.join(singles))

# Snowball Stemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Print the class-level `languages` attribute of SnowballStemmer
# (presumably the language names accepted by its constructor, such as
# the 'english' value used in the next cell).
print(SnowballStemmer.languages)

In [None]:
# Repeat the stemming of the sample words, this time with the English
# Snowball stemmer, and print the stems on one line.
stemmer2 = SnowballStemmer(language='english')
singles = [stemmer2.stem(word) for word in plurals]
print(' '.join(singles))

# Wordnet Lemmatizer

In [None]:
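# One-time setup: uncomment and run the next line if the 'wordnet' resource has not been downloaded yet.
#nltk.download('wordnet')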
from nltk.stem import WordNetLemmatizer 

In [None]:
# Lemmatize a whitespace-tokenized sentence without giving the
# lemmatizer any part-of-speech hint (lemmatize() is called with the
# token only).
lemmatizer = WordNetLemmatizer()

s = "We are putting in efforts to enhance our understanding of Lemmatization"
token_list = s.split()
print("The tokens are: ", token_list)

lemmatized_output = ' '.join(map(lemmatizer.lemmatize, token_list))
print("The lemmatized output is: ", lemmatized_output)

## POS Tagging

In [None]:
# One-time setup: uncomment and run the next line if the
# 'averaged_perceptron_tagger' resource has not been downloaded yet.
#nltk.download('averaged_perceptron_tagger')
# Tag the tokens of the sentence from the lemmatizer cell with their parts of speech.
pos_tags = nltk.pos_tag(token_list)
# Bare expression so the notebook displays the tagging result.
pos_tags

## POS Tag Mapping

In [None]:
from nltk.corpus import wordnet

# This helper follows a pattern that is widely used across the NLP community.

def get_part_of_speech_tags(token):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``token``.

    The token is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first character of the resulting tag selects the constant:
    ``J`` -> adjective, ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb.
    Any other first character falls back to ``wordnet.NOUN``.
    """
    # pos_tag() is given a one-element list, so only the first
    # (word, tag) pair is of interest.
    _, tag = nltk.pos_tag([token])[0]
    initial = tag[0].upper()

    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_pos_by_initial.get(initial, wordnet.NOUN)


## Wordnet Lemmatizer with POS Tag Information

In [None]:
# Lemmatize each token again, this time passing the WordNet POS
# constant derived for that token by get_part_of_speech_tags().
lemmatized_output_with_POS_information = [
    lemmatizer.lemmatize(word, get_part_of_speech_tags(word))
    for word in token_list
]
print(' '.join(lemmatized_output_with_POS_information))

## Lemmatization vs Stemming

In [None]:
# Stem the same sentence tokens with the English Snowball stemmer so
# the result can be compared with the lemmatized output above.
stemmer2 = SnowballStemmer(language='english')
stemmed_sentence = list(map(stemmer2.stem, token_list))
print(' '.join(stemmed_sentence))

# Stopwords

In [None]:
# One-time setup: uncomment and run the next line if the 'stopwords'
# resource has not been downloaded yet.
#nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stop words in a set (duplicates collapse, membership tests are cheap).
stop = set(stopwords.words('english'))
# Bare expression so the notebook displays the words as one
# comma-separated string; a set has no defined order, so the order may vary.
", ".join(stop)

In [None]:
# Question words that should survive stop-word removal.
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']

# Rebuild the stop-word set from scratch and take the wh-words out of it.
# difference_update() silently skips words that are not in the set,
# whereas remove() would raise KeyError for a missing word.
stop = set(stopwords.words('english'))
stop.difference_update(wh_words)

sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"

# Compare the lower-cased token against the stop words so that capitalised
# stop words (e.g. a sentence-initial "We") are filtered as well; the
# tokens that are kept retain their original casing.
sentence_after_stopword_removal = [
    token for token in sentence.split() if token.lower() not in stop
]
# Bare expression so the notebook displays the filtered sentence.
" ".join(sentence_after_stopword_removal)

# Case Folding

In [None]:
# Lower-case the whole sentence so that words differing only in
# capitalisation become identical strings.
s = "We are putting in efforts to enhance our understanding of Lemmatization".lower()
# Bare expression so the notebook displays the result.
s

# N-grams

In [None]:
from nltk.util import ngrams
# Build all bigrams (runs of two consecutive tokens) of the sentence
# and display each one as a space-joined string.
s = "Natural Language Processing is the way to go"
tokens = s.split()
bigrams = list(ngrams(tokens, 2))
[" ".join(pair) for pair in bigrams]

In [None]:
# Same as the bigram cell, but with runs of three consecutive tokens.
s = "Natural Language Processing is the way to go"
tokens = s.split()
trigrams = list(ngrams(tokens, 3))
[" ".join(triple) for triple in trigrams]

# Building a basic vocabulary

In [None]:
# A basic vocabulary: the distinct whitespace-separated tokens of the
# sentence, in sorted order.
s = "Natural Language Processing is the way to go"
tokens = set(s.split())
vocabulary = list(tokens)
vocabulary.sort()
# Bare expression so the notebook displays the vocabulary.
vocabulary

# Removing HTML Tags

In [None]:
from bs4 import BeautifulSoup

html = "<!DOCTYPE html><html><body><h1>My First Heading</h1><p>My first paragraph.</p></body></html>"

# Name the parser explicitly. Without the second argument BeautifulSoup
# picks whichever parser happens to be installed and emits a
# GuessedAtParserWarning, so results could differ between environments.
# "html.parser" ships with Python and needs no extra dependency.
soup = BeautifulSoup(html, "html.parser")

# get_text() returns the text content of the document with the markup removed.
text = soup.get_text()
print(text)