# Supplemental Data Cleaning: Using a Lemmatizer

### Test out WordNet lemmatizer (read more about WordNet [here](https://wordnet.princeton.edu/))

In [None]:
import nltk

# Build one stemmer and one lemmatizer so the cells below can compare their output
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

In [None]:
# List the attributes and methods exposed by the WordNet lemmatizer object
dir(wn)

In [None]:
# Porter stemmer output for two similar-looking words; compare with the next cell
for word in ('meanness', 'meaning'):
    print(ps.stem(word))

In [None]:
# Same two words through the WordNet lemmatizer (author's note: better results, longer runtime)
for word in ('meanness', 'meaning'):
    print(wn.lemmatize(word))
# Conclusion: leaving a word uncondensed is better than stemming it incorrectly

In [None]:
# Another comparison: singular and plural forms, stemmer first, then lemmatizer
for word in ('goose', 'geese'):
    print(ps.stem(word))
print()
for word in ('goose', 'geese'):
    print(wn.lemmatize(word))

### Read in raw text

In [None]:
import pandas as pd
import re
import string
# Show up to 100 characters per column when displaying DataFrames
pd.set_option('display.max_colwidth', 100)

# English stopword list from NLTK; clean_text() filters tokens against it
stopwords = nltk.corpus.stopwords.words('english')

# header=None: the column names are assigned manually below, so the file is
# assumed to have no header row (TODO confirm against the .tsv). Without it,
# read_csv would consume the first message as column names and that record
# would be silently lost when the columns are overwritten.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t', header=None)
data.columns = ['label', 'body_text']

data.head()

### Clean up text

In [None]:
def clean_text(text):
    """Strip punctuation from *text*, tokenize it, and drop stopwords.

    Args:
        text: Message string. Case is not changed here, so the caller must
            lowercase it first if tokens should match `stopwords` regardless
            of case.

    Returns:
        List of non-empty token strings that are not in the module-level
        `stopwords` collection.
    """
    # Remove every character listed in string.punctuation
    no_punct = "".join(char for char in text if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' when the text starts or ends with a separator;
    # those empty tokens carry no information, so they are skipped
    return [word for word in tokens if word and word not in stopwords]

# Lowercase each message before cleaning: clean_text() compares tokens against `stopwords` exactly
data['body_text_nostop'] = data['body_text'].apply(lambda message: clean_text(message.lower()))

data.head()

### Lemmatize text

In [None]:
def lemmatize(tokenized_text):
    """Return a list with each token of *tokenized_text* passed through `wn.lemmatize`.

    Args:
        tokenized_text: Iterable of token strings.

    Returns:
        List of lemmatized tokens, in the same order as the input.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas

# Lemmatize every stopword-free token list, one row at a time
data['body_text_lemmatized'] = data['body_text_nostop'].apply(lemmatize)

data.head()
# Note: the results are still poor for abbreviations and slang