# Text Cleaning

In [1]:
url = 'http://www.gutenberg.org/ebooks/1661.txt.utf-8'
file_name = 'sherlock.txt'

In [2]:
import urllib.request
# Download the file from `url` and save it locally under `file_name`:

with urllib.request.urlopen(url) as response:
    with open(file_name, 'wb') as out_file:
        data = response.read() # a `bytes` object
        out_file.write(data)

In [3]:
!ls {*.txt}

ls: cannot access '{*.txt}': No such file or directory


In [4]:
!head -2 sherlock.txt

﻿Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle



In [5]:
!sed -i 1,33d sherlock.txt

In [6]:
!head -5 sherlock.txt

THE ADVENTURES OF SHERLOCK HOLMES

by

SIR ARTHUR CONAN DOYLE


## Load Data

In [7]:
#let's the load data to RAM
text = open(file_name, 'r', encoding='utf-8').read()  # note that I add an encoding='utf-8' parameter to preserve information
print(text[:5])

THE A


In [8]:
print(f'The file is loaded as datatype: {type(text)} and has {len(text)} characters in it')

The file is loaded as datatype: <class 'str'> and has 581204 characters in it


### Exploring Loaded Data

In [9]:
# how many unique characters do we see? 
# For reference, ASCII has 127 characters in it - so we expect this to have at most 127 characters
unique_chars = list(set(text))
unique_chars.sort()
print(unique_chars)
print(f'There are {len(unique_chars)} unique characters, including both ASCII and Unicode character')

['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']
There are 85 unique characters, including both ASCII and Unicode character


## Tokenization 

### Split by Whitespace

In [10]:
words = text.split()
print(len(words))

107431


In [11]:
print(words[90:200])  #start with the first chapeter, ignoring the index for now

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler.', 'All', 'emotions,', 'and', 'that', 'one', 'particularly,', 'were', 'abhorrent', 'to', 'his', 'cold,', 'precise', 'but', 'admirably', 'balanced', 'mind.', 'He', 'was,', 'I', 'take', 'it,', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen,', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions,', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer.', 'They', 'were', 'admirable', 'things', 'for']


In [12]:
# Let's look at another example: 
'red-headed woman on the street'.split()

['red-headed', 'woman', 'on', 'the', 'street']

### Split by Word Extraction
**Introducing Regex**

In [13]:
import re
re.split('\W+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [14]:
words_alphanumeric = re.split('\W+', text)

In [15]:
len(words_alphanumeric), len(words)

(109111, 107431)

In [16]:
print(words_alphanumeric[90:200])

['BOHEMIA', 'I', 'To', 'Sherlock', 'Holmes', 'she', 'is', 'always', 'THE', 'woman', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', 'All', 'emotions', 'and', 'that', 'one', 'particularly', 'were', 'abhorrent', 'to', 'his', 'cold', 'precise', 'but', 'admirably', 'balanced', 'mind', 'He', 'was', 'I', 'take', 'it', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', 'They', 'were', 'admirable']


In [17]:
words_break = re.split('\W+', "Isn't he coming home for dinner with the red-headed girl?")
print(words_break)

['Isn', 't', 'he', 'coming', 'home', 'for', 'dinner', 'with', 'the', 'red', 'headed', 'girl', '']


### spaCy for Tokenization

In [18]:
%%time
import spacy
nlp = spacy.load('en')

CPU times: user 455 ms, sys: 108 ms, total: 563 ms
Wall time: 505 ms


In [19]:
doc = nlp(text)

In [20]:
print(list(doc)[150:200])

[whole, of, her, sex, ., It, was, not, that, he, felt, 
, any, emotion, akin, to, love, for, Irene, Adler, ., All, emotions, ,, and, that, 
, one, particularly, ,, were, abhorrent, to, his, cold, ,, precise, but, 
, admirably, balanced, mind, ., He, was, ,, I, take, it, ,]


Conveniently, spaCy tokenizes all *punctuations and words* and returned those as individual tokens as well. Let's try the example which we didn't like earlier:

In [21]:
words = nlp("Isn't he coming home for dinner with the red-headed girl?")
print([token for token in words])

[Is, n't, he, coming, home, for, dinner, with, the, red, -, headed, girl, ?]


In [22]:
sentences = list(doc.sents)
print(sentences[13:18])

[The Adventure of the Copper Beeches




, ADVENTURE I. A SCANDAL IN BOHEMIA

, I.

, To Sherlock Holmes she is always THE woman., I have seldom heard
]


#### STOP WORD REMOVAL & CASE CHANGE

spaCy has already marked each token as a stop word or not and stored it in `is_stop` attribute of each token. This makes it very handy for text cleaning. Let's take a quick look: 

In [23]:
sentence_example = "the AI/AGI uprising cannot happen without the progress of NLP"

In [24]:
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, False, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, False, False)]

In [25]:
for token in doc[:5]:
    print(token, token.is_stop, token.is_punct)

THE True False
ADVENTURES False False
OF True False
SHERLOCK False False
HOLMES False False


In [26]:
text_lower = text.lower()  # native python function
doc_lower = nlp(text_lower)

In [27]:
for token in doc_lower[:5]:
    print(token, token.is_stop)

the True
adventures False
of True
sherlock False
holmes False


In [28]:
from spacy.lang.en.stop_words import STOP_WORDS
f'spaCy has a dictionary of {len(list(STOP_WORDS))} stop words'

'spaCy has a dictionary of 312 stop words'

In [29]:
domain_stop_words = ["NLP", "Processing", "AGI"]
for word in domain_stop_words:
    STOP_WORDS.add(word)

In [30]:
[(token, token.is_stop, token.is_punct) for token in nlp(sentence_example)]

[(the, True, False),
 (AI, False, False),
 (/, False, True),
 (AGI, False, False),
 (uprising, False, False),
 (can, True, False),
 (not, True, False),
 (happen, False, False),
 (without, True, False),
 (the, True, False),
 (progress, False, False),
 (of, True, False),
 (NLP, False, False)]

In [31]:
[str(token) for token in nlp(sentence_example) if not token.is_stop and not token.is_punct]

['AI', 'AGI', 'uprising', 'happen', 'progress', 'NLP']

In [32]:
[str(token) for token in nlp(sentence_example) if not token.is_stop]

['AI', '/', 'AGI', 'uprising', 'happen', 'progress', 'NLP']

## Stemming and Lemmatization

### spaCy for Lemmatization
**spaCy only supports lemmatization** 

An underscore at end, such as `lemma_` tells spaCy we are looking for something which is human readable. spaCy stores the internal hash or identifier which spaCy stores in `token.lemma`. 

In [33]:
lemma_sentence_example = "Their Apples & Banana fruit salads are amazing. Would you like meeting me at the cafe?"
[(token, token.lemma_, token.lemma, token.pos_ ) for token in nlp(lemma_sentence_example)]

[(Their, '-PRON-', 561228191312463089, 'DET'),
 (Apples, 'Apples', 9297668116247400838, 'PROPN'),
 (&, '&', 15473034735919704609, 'CCONJ'),
 (Banana, 'Banana', 7617506991971869807, 'PROPN'),
 (fruit, 'fruit', 17674554054627885835, 'NOUN'),
 (salads, 'salad', 16382906660984395826, 'NOUN'),
 (are, 'be', 10382539506755952630, 'VERB'),
 (amazing, 'amazing', 12968186374132960503, 'ADJ'),
 (., '.', 12646065887601541794, 'PUNCT'),
 (Would, 'Would', 10299253490465169573, 'VERB'),
 (you, '-PRON-', 561228191312463089, 'PRON'),
 (like, 'like', 18194338103975822726, 'VERB'),
 (meeting, 'meet', 6880656908171229526, 'VERB'),
 (me, '-PRON-', 561228191312463089, 'PRON'),
 (at, 'at', 11667289587015813222, 'ADP'),
 (the, 'the', 7425985699627899538, 'DET'),
 (cafe, 'cafe', 10569699879655997926, 'NOUN'),
 (?, '?', 8205403955989537350, 'PUNCT')]