# Text Cleaning

1. Tokenization
2. Stop Word removal
3. Lemmatization

In [1]:
#Using the book The Adventures of Sherlock Holmes by Arthur Conan Doyle
#Fetched over HTTPS so the download cannot be tampered with in transit
url = 'https://www.gutenberg.org/ebooks/1661.txt.utf-8'
#Local path the book is saved to and later read from
file_name = 'sherlock.txt'

In [2]:
#Download the file and save it locally
#The download is skipped when a local copy already exists, so re-running the
#notebook neither hits the network again nor overwrites a locally edited file.
import os
import urllib.request

if not os.path.exists(file_name):
    #Read the whole response first: if the request fails part-way, no empty or
    #partial file is left behind to be mistaken for a finished download.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    with open(file_name, 'wb') as out_file:
        out_file.write(data)

Remove the header and footer information. We also don't need the first 33 lines.

In [11]:
#Load the data
#Read inside a context manager so the file handle is closed as soon as the
#text is in memory (a bare open(...).read() leaves the handle open).
with open(file_name, 'r', encoding='utf-8') as in_file:
    text = in_file.read()
#Preview the first 33 characters of the loaded text
print(text[:33])

THE ADVENTURES OF SHERLOCK HOLMES


In [12]:
#Report the total length of the loaded text in characters
print(f"Number of characters in the text: {len(text)}")

Number of characters in the text: 562330


# Explore Loaded data

In [13]:
#Collect the distinct characters of the text in sorted order
unique_char = sorted(set(text))
print(unique_char)
print(f"Number of unique characters: {len(unique_char)}")

['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'è', 'é']
Number of unique characters: 81


# Tokenization

In [14]:
#Split on whitespace (str.split() with no argument splits on any run of whitespace)
words = text.split()
print(f"Number of words: {len(words)}")

Number of words: 104493


In [15]:
#Example: whitespace splitting keeps hyphens and trailing punctuation attached to the word
'N-L-P is an interesting field of AI.'.split()

['N-L-P', 'is', 'an', 'interesting', 'field', 'of', 'AI.']

Points to be noted:
1. N-L-P might need to be split into separate tokens
2. AI. (Punctuation should be removed)

In [16]:
#Use Regex to split words
import re
#Raw string so that \W reaches the regex engine as written; in a normal string
#literal '\W' is an invalid escape sequence and triggers a warning on newer Pythons.
#Note: re.split can yield empty strings at the start/end when the text begins or
#ends with non-word characters.
words_new = re.split(r'\W+', text)
#Compare the token counts of the whitespace split and the regex split
print(len(words), len(words_new))

104493 106000


In [17]:
#Tokenization using spaCy
import spacy
#Load the English pipeline by its full package name. The 'en' shortcut link is
#no longer supported in spaCy v3 and fails to load there.
#NOTE(review): assumes 'en' pointed at the small English model - confirm, and install with
#  python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [18]:
#Tokenize a short example sentence with spaCy and show the resulting tokens
#NOTE: this rebinds `words`, which previously held the list from text.split()
words = nlp("N-L-P is an interest field of AI")
print(list(words))

[N, -, L, -, P, is, an, interest, field, of, AI]


In [19]:
#Run the spaCy pipeline over the full loaded text
doc = nlp(text)

In [20]:
#Sentence tokenization
#doc.sents is an iterable of sentence spans; materialise it so it can be counted
sentences = [sentence for sentence in doc.sents]
print(f"Number of sentences: {len(sentences)}")

Number of sentences: 6887


# Stop Word  Removal

In [22]:
#For the first five tokens, show the token followed by its stop-word and punctuation flags
for token in doc[:5]:
    flags = (token.is_stop, token.is_punct)
    print(token, *flags)

THE False False
ADVENTURES False False
OF False False
SHERLOCK False False
HOLMES False False


In [23]:
#Convert everything to lowercase and then check again
text_lower = text.lower()
#Re-run the spaCy pipeline on the lowercased copy of the text
doc_lower = nlp(text_lower)

In [24]:
#Same stop-word check as before, this time on the lowercased document
for token in doc_lower[:5]:
    stop_flag = token.is_stop
    print(token, stop_flag)

the True
adventures False
of True
sherlock False
holmes False


# Lemmatization

In [26]:
#Lemma details for the first five tokens of the lowercased document:
#(token, lemma as text, lemma as integer id, coarse part-of-speech tag)
[
    (tok, tok.lemma_, tok.lemma, tok.pos_)
    for tok in doc_lower[:5]
]

[(the, 'the', 7425985699627899538, 'DET'),
 (adventures, 'adventure', 96151693251643590, 'NOUN'),
 (of, 'of', 886050111519832510, 'ADP'),
 (sherlock, 'sherlock', 13444235815076800422, 'NOUN'),
 (holmes, 'holme', 7908237556874274451, 'NOUN')]

In [27]:
#Lemma details for the first five tokens of the original-case document:
#(token, lemma as text, lemma as integer id, coarse part-of-speech tag)
[
    (tok, tok.lemma_, tok.lemma, tok.pos_)
    for tok in doc[:5]
]

[(THE, 'the', 7425985699627899538, 'DET'),
 (ADVENTURES, 'adventures', 1710777569006724040, 'PROPN'),
 (OF, 'of', 886050111519832510, 'ADP'),
 (SHERLOCK, 'sherlock', 13444235815076800422, 'PROPN'),
 (HOLMES, 'holmes', 406440630240299238, 'PROPN')]