# Develop Deep Learning Models for Natural Language in Python

## 5 - How to Clean Text Manually and with NLTK

In [49]:
import string
import re

import nltk
# Fetch NLTK data so the tokenizers/corpora used in later cells are available locally.
# NOTE(review): 'all' pulls every NLTK package (see the long download log below).
# The cells visible in this notebook only call sent_tokenize / word_tokenize and the
# English stopwords corpus -- consider narrowing the download if nothing further down
# needs more; TODO confirm against the rest of the notebook.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

### 5.4 - Manual Tokenization

In [29]:
# 5.4.1 - Load Data
# Read the whole text into memory. The 'utf-8-sig' codec drops a leading
# byte-order mark if the file has one, so it never ends up glued to the first word.
filename = '/content/drive/MyDrive/NLP-In-Depth/Develop Deep Learning Models for Natural Language in Python/metamorphosis_clean.txt'

with open(filename, 'r', encoding='utf-8-sig') as file:
  text = file.read()

# 5.4.2 - Split by whitespace
# str.split() with no argument splits on any run of whitespace (spaces, tabs, newlines).
words = text.split()
print('Words:', words[:10])

# 5.4.4 - Remove punctuation
# Character class built from string.punctuation; re.escape keeps characters such as
# ']', '\\', '^' and '-' literal inside the class.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
stripped = [re_punc.sub('', word) for word in words]
print('Stripped:', stripped[:10])

# Filter non-printable characters
# Negated class: deletes every character that is NOT in string.printable.
# This step works on `stripped` (not the raw `words`) so the punctuation removal
# above carries through to the normalized tokens below.
re_print = re.compile('[^%s]' % re.escape(string.printable))
printables = [re_print.sub('', word) for word in stripped]
print('Printables:', printables[:10])

# 5.4.5 - Normalizing Case
# Lower-case every token so that e.g. 'One' and 'one' compare equal.
normed = [word.lower() for word in printables]
print('Normalized:', normed[:10])

Words: ['One', 'morning,', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams,', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin.', 'He', 'lay', 'on', 'his', 'armour-like', 'back,', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly,', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections.', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment.', 'His', 'many', 'legs,', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him,', 'waved', 'about', 'helplessly', 'as', 'he', 'looked.', '"What\'s', 'happened', 'to', 'me?"', 'he', 'thought.', 'It', "wasn't", 'a', 'dream.', 'His', 'room,', 'a', 'proper', 'human']
Stripped: ['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 

### 5.5 - Tokenization and Cleaning with NLTK

In [56]:
# 5.5.1 - Load Data
# Read the whole text into memory. The 'utf-8-sig' codec drops a leading
# byte-order mark if the file has one.
filename = '/content/drive/MyDrive/NLP-In-Depth/Develop Deep Learning Models for Natural Language in Python/metamorphosis_clean.txt'

with open(filename, 'r', encoding='utf-8-sig') as file:
  text = file.read()

# 5.5.2 - Split into sentences
sentences = nltk.sent_tokenize(text)
print('Sentences:', sentences[:10])

# 5.5.3 - Split into words
words = nltk.word_tokenize(text)
print('Words:', words[:10])

# 5.5.4 - Filter out punctuations
# str.isalpha() is True only when every character is alphabetic, so this drops
# pure-punctuation tokens as well as tokens containing digits, hyphens or apostrophes.
words = [word for word in words if word.isalpha()]
# Print a preview only -- dumping the whole token list floods the cell output.
print('Without punctuation:', words[:10])

# 5.5.5 - Filter out stopwords (and Pipeline)
# NLTK's English stopword list is all lower-case, so compare the lower-cased token;
# otherwise capitalised stopwords such as 'The', 'He' or 'His' slip through.
# A set gives constant-time membership tests; `stopwords` itself stays a list.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)
words = [word for word in words if word.lower() not in stopword_set]
print('No stopwords:', words[:10])

# 5.5.6 - Stem Words
# Reduce each remaining token to its Porter stem.
porter = nltk.stem.PorterStemmer()
stemmed = [porter.stem(word) for word in words]
print('Stemmed:', stemmed[:10])

Sentences: ['One morning, when Gregor Samsa woke from troubled dreams, he found\nhimself transformed in his bed into a horrible vermin.', 'He lay on\nhis armour-like back, and if he lifted his head a little he could\nsee his brown belly, slightly domed and divided by arches into stiff\nsections.', 'The bedding was hardly able to cover it and seemed ready\nto slide off any moment.', 'His many legs, pitifully thin compared\nwith the size of the rest of him, waved about helplessly as he\nlooked.', '"What\'s happened to me?"', 'he thought.', "It wasn't a dream.", 'His room,\na proper human room although a little too small, lay peacefully\nbetween its four familiar walls.', 'A collection of textile samples\nlay spread out on the table - Samsa was a travelling salesman - and\nabove it there hung a picture that he had recently cut out of an\nillustrated magazine and housed in a nice, gilded frame.', 'It showed\na lady fitted out with a fur hat and fur boa who sat upright,\nraising a heavy fur