In [None]:
# Toy corpus used throughout the notebook: three short sentences that include
# contractions, punctuation and markdown-style emphasis markers to clean up.
raw_docs = [
    "Here are some very simple basic sentences.",
    "They won't be very interesting, I'm afraid.",
    "The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.",
]

Type these installation commands into your terminal:

pip install nltk scikit-learn

python -m nltk.downloader all

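Here we're going to start using NLTK (the Natural Language Toolkit). NLTK is one of the most popular Python packages for NLP analysis and contains a wide variety of tools and datasets. Our first step is tokenization: splitting each raw document into a list of individual tokens (words and punctuation marks).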

In [None]:
from nltk.tokenize import word_tokenize

# Split every raw document into a list of tokens; the result is one token
# list per document, in the same order as raw_docs.
tokenized_docs = list(map(word_tokenize, raw_docs))
tokenized_docs

### Removing punctuation

Punctuation helps the tokenizer decide where words and sentences begin and end, but once the text has been tokenized there is usually no reason to keep it around. There are many ways to remove punctuation. Since we have already learned regular expressions (regex), how would we do it with them?

In [None]:
import re
import string
# Character class that matches any single character listed in
# string.punctuation (see the standard-library `string` module documentation).
# re.escape keeps characters such as ']' and '\' from breaking the class.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# Strip punctuation characters out of every token, then drop the tokens that
# end up empty (i.e. tokens that consisted of punctuation only).
tokenized_docs_no_punctuation = []
for doc_tokens in tokenized_docs:
    stripped_tokens = (regex.sub(u'', tok) for tok in doc_tokens)
    tokenized_docs_no_punctuation.append(
        [tok for tok in stripped_tokens if tok != u'']
    )

tokenized_docs_no_punctuation


### Cleaning text of stopwords

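Some very common words (for example "the", "is" and "of") carry little meaning on their own and usually just add noise to an analysis. These are called stopwords, and NLTK comes with lists of them for many languages.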

In [None]:
from nltk.corpus import stopwords

# Build the English stopword collection once, as a set. Calling
# stopwords.words('english') inside the inner loop would rebuild the list and
# scan it linearly for every single token.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords, one filtered list per document.
# NLTK's English stopword list is all lowercase, so each token is lowercased
# for the membership test only; otherwise capitalised stopwords such as "The"
# or "Here" would slip through. Surviving tokens keep their original casing.
tokenized_docs_no_stopwords = [
    [word for word in doc if word.lower() not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

tokenized_docs_no_stopwords



### Stemming and Lemmatizing

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Three interchangeable normalizers: two stemmers and one lemmatizer.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Reduce every remaining token to its Porter stem, one list per document.
# To compare approaches, swap porter.stem(word) for snowball.stem(word) or
# wordnet.lemmatize(word); note that lemmatize() can also take a part of
# speech as an argument.
preprocessed_docs = [
    [porter.stem(word) for word in doc]
    for doc in tokenized_docs_no_stopwords
]

preprocessed_docs