In [1]:
import string
import re
import nltk
import spacy

## Lowercasing

In [2]:
text_with_upper = "Hello, POLAND is a Very BeautiFul CountrY!"
text_with_upper.lower()

'hello, poland is a very beautiful country!'

## Punctuation Removal

### **Most of them use `string.punctuation`**



Based on the [StackOverflow question](https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string)

I ordered the solutions from the fastest to the slowest

In [3]:
puncts_text = "HI!!! I overuse ... !(@*punctuations +-*/ and*(& other signs __*(&!!"

### using `str.translate()` 
Probably the fastest way

In [4]:
puncts_text.translate(str.maketrans('', '', string.punctuation))

'HI I overuse  punctuations  and other signs '

### Using `re`

`re` stands for Regular Expressions

#### Method 1 with string.punctuation

In [5]:
regex = re.compile('[%s]' % re.escape(string.punctuation))
regex.sub('', puncts_text)

'HI I overuse  punctuations  and other signs '

#### Method 1 in a single line

In [6]:
re.sub(f'[{re.escape(string.punctuation)}]','', puncts_text)

'HI I overuse  punctuations  and other signs '

#### Method 2

Removes **not words** and **not spaces**

Note: `re` treats underscore as a word so its results are different

In [7]:
re.sub(r'[^\w\s]','', puncts_text)

'HI I overuse  punctuations  and other signs __'

### Excluding string.punctuation

In [8]:
# a slower solution
exclude = set(string.punctuation)
"".join(ch for ch in puncts_text if ch not in exclude)

'HI I overuse  punctuations  and other signs '

### Using `str.replace()`

In [9]:
clean_text = puncts_text
for c in string.punctuation:
    clean_text = clean_text.replace(c, "")
clean_text

'HI I overuse  punctuations  and other signs '

## Numbers Removal



In [10]:
text_numbers = '12abcd405'

### Using `str.translate` with `string.digits`

In [11]:
text_numbers.translate(str.maketrans('', '', string.digits))

'abcd'

### Using `re`

In [12]:
re.sub(r'\d+', '', text_numbers)

'abcd'

In [13]:
re.sub(r'[0-9]+', '', text_numbers)

'abcd'

### Using `join()` and NOT `isdigit()`

In [14]:
"".join(i for i in text_numbers if not i.isdigit())

'abcd'

### Using `join()` and `isalpha()`

This actually isn't the correct solution, because `isalpha()` is True only for letters.

In [15]:
"".join(i for i in text_numbers if i.isalpha())

'abcd'

## HTML Tags Removal

Solutions from [Stack Overflow](https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python)

In [16]:
html_text = """<tr class="color-5 negri a-bottom">
<td class="a-center" width="11%"><div style="min-width: 80px">3-Pointers</div></td>
<td><div class="left" style="min-width: 120px; max-width:175px; width: 57%">
<div class="left margen-l2">Player</div>
<div class="right"> Team</div>
</div>
</td>
<td><div style="min-width: 60px; ">Season</div></td>
<td><div class="">W/L Game</div>
</td>
</tr>"""


### Using `re`

 - begin with tag opening '<'
 - then not '<'
 - not '<' at least once

In [17]:
re.sub('<[^<]+?>', '', html_text)

'\n3-Pointers\n\nPlayer\n Team\n\n\nSeason\nW/L Game\n\n'

### Using `BeautifulSoup`

 - `get_text()` removes HTML tags
 - `strip = True` removes whitespaces and newlines

In [18]:
from bs4 import BeautifulSoup

In [19]:
soup = BeautifulSoup(html_text, 'html.parser')
soup.get_text(",", strip=True)

'3-Pointers,Player,Team,Season,W/L Game'

### Extreme example

In [20]:
html_comment = "<img<!-- --> src=x onerror=alert(1);//><!-- -->"

Both solutions fail

In [21]:
re.sub('<[^<]+?>', '', html_comment)

'<img src=x onerror=alert(1);//>'

In [22]:
soup = BeautifulSoup(html_comment, 'html.parser')
soup.get_text(",", strip=True)

'src=x onerror=alert(1);//>'

In [23]:
tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')
no_tags = tag_re.sub('', html_comment)
no_tags

' src=x onerror=alert(1);//>'

In [24]:
import html

html.escape(no_tags)

' src=x onerror=alert(1);//&gt;'

## URL Removal

From this [StackOverflow Question](https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/11332580)

In [25]:
text_url = """text1
text2
http://url.com/bla1/blah1/
text3
text4
http://url.com/bla2/blah2/
text5
text6
http://url.com/bla3/blah3/"""

In [26]:
re.sub(r'http\S+', '', text_url)

'text1\ntext2\n\ntext3\ntext4\n\ntext5\ntext6\n'

## Newlines, spaces and tabs removal

### Using `str.split()` and `join()`

In [27]:
my_str="I want to Remove all white \t\n\n\r spaces, new lines \n and tabs \t"
" ".join(my_str.split())

'I want to Remove all white spaces, new lines and tabs'

### Using `re`

 - **`\s` stands for whitespace character, equivalent to `[ \n\r\t\f]`**
 - **`\S` stands for not whitespace character, equivalent to `[^\s]`**

In [28]:
re.sub('\s+', ' ', my_str)

'I want to Remove all white spaces, new lines and tabs '

In [29]:
re.sub('[^\S]+', ' ', my_str)

'I want to Remove all white spaces, new lines and tabs '

In [30]:
re.sub('[\t\n\r\f ]+', ' ', my_str)

'I want to Remove all white spaces, new lines and tabs '

### Using `re.findall()`

Taken from [StackOverflow](https://stackoverflow.com/questions/4697882/how-can-i-find-all-matches-to-a-regular-expression-in-python)

NOTE: I believe this approach is slower than the one with `re.sub()`. 

In [31]:
match = re.findall('[\w]+ ', my_str)
"".join(match)

'I want to Remove all white new lines and tabs '

## Emojis Removal

Found [here](https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b#gistcomment-3315605)

In [32]:
text_emojis = u"Hi 🤔 How is your 🙈 and 😌. Have a nice weekend 💕👭👙\U0001F600\U0001F300"
text_emojis

'Hi 🤔 How is your 🙈 and 😌. Have a nice weekend 💕👭👙😀🌀'

In [33]:
emojis_pattern = re.compile(pattern="["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                    u"\U00002500-\U00002BEF"  # chinese char
                    u"\U00002702-\U000027B0"
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    u"\U0001f926-\U0001f937"
                    u"\U00010000-\U0010ffff"
                    u"\u2640-\u2642"
                    u"\u2600-\u2B55"
                    u"\u200d"
                    u"\u23cf"
                    u"\u23e9"
                    u"\u231a"
                    u"\ufe0f"  # dingbats
                    u"\u3030"
                "]+", flags = re.UNICODE)

emojis_pattern.sub(r'', text_emojis)

'Hi  How is your  and . Have a nice weekend '

## Replacing Accented Characters

In [34]:
import unidecode
text_accented = "Málaga, àéêöhello. Polish: ńŃćĆśŚęąóżŻźŹ letters. German üöäöß letters"

unidecode.unidecode(text_accented)

'Malaga, aeeohello. Polish: nNcCsSeaozZzZ letters. German uoaoss letters'

## Spelling corrections

Solutions from [StackOverflow](https://stackoverflow.com/questions/13928155/spell-checker-for-python)

### Using `spellchecker`

In [35]:
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    corrected_text = list()
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        next_word = word
        if word in misspelled_words:
            next_word = spell.correction(word)
        corrected_text.append(next_word)
    
    return " ".join(corrected_text)

text_misspelled = "I realli needt smoe corection. This sentnce has mispelled wirds"
correct_spelling(text_misspelled)

'I really need some corrections This sentence has misspelled words'

### Using `autocorrect`

In [36]:
from autocorrect import Speller

speller = Speller(lang='en')

print(speller(text_misspelled))

I really need some correction. This sentence has misspelled words


### Using `textblob`

Found [here](https://www.geeksforgeeks.org/python-textblob-correct-method/)

In [37]:
from textblob import TextBlob

TextBlob(text_misspelled).correct()

TextBlob("I really need some correction. His sentence has dispelled words")

## Reading a text file

### Reading line by line

In [38]:
with open('metamorphosis.txt', 'r') as f:
    # without newlines
    lines = [line.rstrip() for line in f]
    # read text line by line including newlines \n
    # text_lines = f.readlines()

N_LINES = 20
text_lines = " ".join([line for line in lines[:N_LINES]])
text_lines

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.  The bedding was hardly able to cover it and seemed ready to slide off any moment.  His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked.  "What\'s happened to me?" he thought.  It wasn\'t a dream.  His room, a proper human room although a little too small, lay peacefully between its four familiar walls.  A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.  It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower ar

### Reading the whole text

In [39]:
# read the whole file
with open('metamorphosis.txt', 'r') as f:
    text = f.read()

N_CHARS = 300
text_chars = text[:N_CHARS]
text_chars

'One morning, when Gregor Samsa woke from troubled dreams, he found\nhimself transformed in his bed into a horrible vermin.  He lay on\nhis armour-like back, and if he lifted his head a little he could\nsee his brown belly, slightly domed and divided by arches into stiff\nsections.  The bedding was hardl'

## Tokenize Words

### Using `re`

Return the list of words

In [65]:
re_tokens = re.findall('[\w]+', text_lines)
print(len(re_tokens))
print(re_tokens)

200
['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', 'What', 's', 'happened', 'to', 'me', 'he', 'thought', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'its', 'four', 'familiar', 'walls', 'A', 'collection', 'of', 'textile

### Using `nltk`

In [48]:
from nltk.tokenize import word_tokenize

nltk_tokens = word_tokenize(text_lines)
print(len(nltk_tokens), nltk_tokens)

224 ['One', 'morning', ',', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', ',', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', '.', 'He', 'lay', 'on', 'his', 'armour-like', 'back', ',', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', ',', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', '.', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', '.', 'His', 'many', 'legs', ',', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', ',', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', '.', '``', 'What', "'s", 'happened', 'to', 'me', '?', "''", 'he', 'thought', '.', 'It', 'was', "n't", 'a', 'dream', '.', 'His', 'room', ',', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', ',', 'lay', 'peacefu

### Using `spaCy`

In [63]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text_lines)
spacy_tokens = list([token for token in doc])
print(len(spacy_tokens))
print(spacy_tokens)

235
[One, morning, ,, when, Gregor, Samsa, woke, from, troubled, dreams, ,, he, found, himself, transformed, in, his, bed, into, a, horrible, vermin, .,  , He, lay, on, his, armour, -, like, back, ,, and, if, he, lifted, his, head, a, little, he, could, see, his, brown, belly, ,, slightly, domed, and, divided, by, arches, into, stiff, sections, .,  , The, bedding, was, hardly, able, to, cover, it, and, seemed, ready, to, slide, off, any, moment, .,  , His, many, legs, ,, pitifully, thin, compared, with, the, size, of, the, rest, of, him, ,, waved, about, helplessly, as, he, looked, .,  , ", What, 's, happened, to, me, ?, ", he, thought, .,  , It, was, n't, a, dream, .,  , His, room, ,, a, proper, human, room, although, a, little, too, small, ,, lay, peacefully, between, its, four, familiar, walls, .,  , A, collection, of, textile, samples, lay, spread, out, on, the, table, -, Samsa, was, a, travelling, salesman, -, and, above, it, there, hung, a, picture, that, he, had, recently, cut, 

### Using `gensim`

In [62]:
from gensim.utils import tokenize

gensim_tokens = list(tokenize(text_lines))
print(len(gensim_tokens))
print(gensim_tokens)

200
['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arches', 'into', 'stiff', 'sections', 'The', 'bedding', 'was', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'legs', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'as', 'he', 'looked', 'What', 's', 'happened', 'to', 'me', 'he', 'thought', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'its', 'four', 'familiar', 'walls', 'A', 'collection', 'of', 'textile

### Tokenization Comparision

 - it seems that `gensim` uses the same `re` function, that we showed above. Both returned only words
 - `nltk` and `spacy` return also punctuations
 - `spacy` treats a whitespace as a token if there is a double whitespace. In our text each sentence-ending dot is followed by double whitespace. We could clean this but at least we see that `spacy` behaves differently
 - **not or n't** contraction gives different results. `spacy` and `nltk` splits *wasn't* to *was* and *n't*, whereas `re` and `gensim` to *wasn* and *t*.
 - when there is a hyphen between words, we get 3 different results. Our example is armour-like. `nltk` returns **a single token**: *armour-like*, `re` and `gensim` return **two tokens**: *armour* and *like*. `spacy` returns **three tokens**: *armour*, *-*, and *like*
 - `nltk` converts quotation marks. Quote opening: **``**, quote closing **' '**

## Tokenize Sentences

### Using `re`

In [60]:
re_sentences = re.compile('[.?!]').split(text_lines)
len(re_sentences), re_sentences

(12,
 ['One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin',
  '  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections',
  '  The bedding was hardly able to cover it and seemed ready to slide off any moment',
  '  His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked',
  '  "What\'s happened to me',
  '" he thought',
  "  It wasn't a dream",
  '  His room, a proper human room although a little too small, lay peacefully between its four familiar walls',
  '  A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame',
  '  It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a he

### Using `nltk`

In [51]:
from nltk.tokenize import sent_tokenize

nltk_sentences = sent_tokenize(text_lines)
print(len(nltk_sentences), nltk_sentences)

11 ['One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.', 'He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.', 'The bedding was hardly able to cover it and seemed ready to slide off any moment.', 'His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked.', '"What\'s happened to me?"', 'he thought.', "It wasn't a dream.", 'His room, a proper human room although a little too small, lay peacefully between its four familiar walls.', 'A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.', 'It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the

### Using `spaCy`

In [52]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text_lines)
spacy_sentences = list([sent for sent in doc.sents])
len(spacy_sentences), spacy_sentences

(11,
 [One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.,
   He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.,
   The bedding was hardly able to cover it and seemed ready to slide off any moment.,
   His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked.,
   "What's happened to me?",
  he thought.,
   It wasn't a dream.,
   His room, a proper human room although a little too small, lay peacefully between its four familiar walls.,
   A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.,
   It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that co

## Stopwords Removal

We'll ignore punctuations. Tokenization step using `re` gives us exactly that.

### Using `nltk`

In [106]:
from nltk.corpus import stopwords

stop_words_nltk = stopwords.words('english')
# print(len(stop_words_nltk),stop_words_nltk)

filtered_nltk = [word for word in re_tokens if word.lower() not in stop_words_nltk]
print(len(filtered_nltk), filtered_nltk)

106 ['One', 'morning', 'Gregor', 'Samsa', 'woke', 'troubled', 'dreams', 'found', 'transformed', 'bed', 'horrible', 'vermin', 'lay', 'armour', 'like', 'back', 'lifted', 'head', 'little', 'could', 'see', 'brown', 'belly', 'slightly', 'domed', 'divided', 'arches', 'stiff', 'sections', 'bedding', 'hardly', 'able', 'cover', 'seemed', 'ready', 'slide', 'moment', 'many', 'legs', 'pitifully', 'thin', 'compared', 'size', 'rest', 'waved', 'helplessly', 'looked', 'happened', 'thought', 'dream', 'room', 'proper', 'human', 'room', 'although', 'little', 'small', 'lay', 'peacefully', 'four', 'familiar', 'walls', 'collection', 'textile', 'samples', 'lay', 'spread', 'table', 'Samsa', 'travelling', 'salesman', 'hung', 'picture', 'recently', 'cut', 'illustrated', 'magazine', 'housed', 'nice', 'gilded', 'frame', 'showed', 'lady', 'fitted', 'fur', 'hat', 'fur', 'boa', 'sat', 'upright', 'raising', 'heavy', 'fur', 'muff', 'covered', 'whole', 'lower', 'arm', 'towards', 'viewer', 'Gregor', 'turned', 'look', 'w

### Using `spaCy`

In [108]:
stop_words_spacy = nlp.Defaults.stop_words

filtered_spacy = [word for word in re_tokens if word.lower() not in stop_words_spacy]
print(len(filtered_spacy), filtered_spacy)

99 ['morning', 'Gregor', 'Samsa', 'woke', 'troubled', 'dreams', 'found', 'transformed', 'bed', 'horrible', 'vermin', 'lay', 'armour', 'like', 'lifted', 'head', 'little', 'brown', 'belly', 'slightly', 'domed', 'divided', 'arches', 'stiff', 'sections', 'bedding', 'hardly', 'able', 'cover', 'ready', 'slide', 'moment', 'legs', 'pitifully', 'thin', 'compared', 'size', 'rest', 'waved', 'helplessly', 'looked', 's', 'happened', 'thought', 'wasn', 't', 'dream', 'room', 'proper', 'human', 'room', 'little', 'small', 'lay', 'peacefully', 'familiar', 'walls', 'collection', 'textile', 'samples', 'lay', 'spread', 'table', 'Samsa', 'travelling', 'salesman', 'hung', 'picture', 'recently', 'cut', 'illustrated', 'magazine', 'housed', 'nice', 'gilded', 'frame', 'showed', 'lady', 'fitted', 'fur', 'hat', 'fur', 'boa', 'sat', 'upright', 'raising', 'heavy', 'fur', 'muff', 'covered', 'lower', 'arm', 'viewer', 'Gregor', 'turned', 'look', 'window', 'dull', 'weather']


### Using `gensim`

In [111]:
from gensim.parsing.preprocessing import STOPWORDS

stop_words_gensim = STOPWORDS

filtered_gensim = [word for word in re_tokens if word.lower() not in stop_words_gensim]
print(len(filtered_gensim), filtered_gensim)

97 ['morning', 'Gregor', 'Samsa', 'woke', 'troubled', 'dreams', 'transformed', 'bed', 'horrible', 'vermin', 'lay', 'armour', 'like', 'lifted', 'head', 'little', 'brown', 'belly', 'slightly', 'domed', 'divided', 'arches', 'stiff', 'sections', 'bedding', 'hardly', 'able', 'cover', 'ready', 'slide', 'moment', 'legs', 'pitifully', 'compared', 'size', 'rest', 'waved', 'helplessly', 'looked', 's', 'happened', 'thought', 'wasn', 't', 'dream', 'room', 'proper', 'human', 'room', 'little', 'small', 'lay', 'peacefully', 'familiar', 'walls', 'collection', 'textile', 'samples', 'lay', 'spread', 'table', 'Samsa', 'travelling', 'salesman', 'hung', 'picture', 'recently', 'cut', 'illustrated', 'magazine', 'housed', 'nice', 'gilded', 'frame', 'showed', 'lady', 'fitted', 'fur', 'hat', 'fur', 'boa', 'sat', 'upright', 'raising', 'heavy', 'fur', 'muff', 'covered', 'lower', 'arm', 'viewer', 'Gregor', 'turned', 'look', 'window', 'dull', 'weather']


Another method from gensim using the `remove_stopwords` function.

In [76]:
from gensim.parsing.preprocessing import remove_stopwords

filtered_sentence_gensim = remove_stopwords(text_lines)
print(len(filtered_sentence_gensim), filtered_sentence_gensim)

713 One morning, Gregor Samsa woke troubled dreams, transformed bed horrible vermin. He lay armour-like back, lifted head little brown belly, slightly domed divided arches stiff sections. The bedding hardly able cover ready slide moment. His legs, pitifully compared size rest him, waved helplessly looked. "What's happened me?" thought. It wasn't dream. His room, proper human room little small, lay peacefully familiar walls. A collection textile samples lay spread table - Samsa travelling salesman - hung picture recently cut illustrated magazine housed nice, gilded frame. It showed lady fitted fur hat fur boa sat upright, raising heavy fur muff covered lower arm viewer. Gregor turned look window dull weather.


### Comparing results

**`nltk` vs `spacy`**

In [112]:
list(set(filtered_nltk) ^ set(filtered_spacy))

['seemed',
 'One',
 'wasn',
 'towards',
 'see',
 's',
 'four',
 'back',
 'although',
 'many',
 'could',
 'whole',
 't']

**`nltk` vs `gensim`**

In [113]:
list(set(filtered_nltk) ^ set(filtered_gensim))

['seemed',
 'One',
 'found',
 'wasn',
 'thin',
 'towards',
 'see',
 's',
 'four',
 'back',
 'although',
 'many',
 'could',
 'whole',
 't']

**`gensim` vs `spacy`**

In [114]:
list(set(filtered_gensim) ^ set(filtered_spacy))

['found', 'thin']

### Comparing stopwords lists

Lists of words

In [90]:
# print("NLTK stopwords",stop_words_nltk)
# print("Spacy stopwords",stop_words_spacy)
# print("Gensim stopwords",stop_words_gensim)

Comparing length

In [80]:
print("NLTK stopwords len", len(stop_words_nltk))
print("Spacy stopwords len", len(stop_words_spacy))
print("Gensim stopwords len", len(stop_words_gensim))

NLTK stopwords len 179
Spacy stopwords len 326
Gensim stopwords len 337


## Lemmatization

For lemmatization we'll ignore punctuations. Tokenization step using `re` gives us exactly that.

### Using `nltk`

In [100]:
from nltk.stem import WordNetLemmatizer

nltk_lemmatizer = WordNetLemmatizer()
nltk_lemmas = [nltk_lemmatizer.lemmatize(w) for w in re_tokens]
print(nltk_lemmas)

['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dream', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arch', 'into', 'stiff', 'section', 'The', 'bedding', 'wa', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'leg', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'a', 'he', 'looked', 'What', 's', 'happened', 'to', 'me', 'he', 'thought', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'it', 'four', 'familiar', 'wall', 'A', 'collection', 'of', 'textile', 'sample', 

### Using `spaCy`

[spacy documentation](https://spacy.io/api/lemmatizer)

In [104]:
# spacy_text = " ".join([token.text for token in spacy_tokens])
text_from_re_tokens = " ".join([word for word in re_tokens])
# spacy_lemmas = [word.lemma_ for word in nlp(text_lines)]
spacy_lemmas = [word.lemma_ for word in nlp(text_from_re_tokens)]
print(len(spacy_lemmas), spacy_lemmas)

200 ['one', 'morning', 'when', 'Gregor', 'Samsa', 'wake', 'from', 'troubled', 'dream', 'he', 'find', 'himself', 'transform', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'he', 'lie', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lift', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'dome', 'and', 'divide', 'by', 'arch', 'into', 'stiff', 'section', 'the', 'bedding', 'be', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seem', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'his', 'many', 'leg', 'pitifully', 'thin', 'compare', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'he', 'wave', 'about', 'helplessly', 'as', 'he', 'look', 'what', 's', 'happen', 'to', 'I', 'he', 'think', 'it', 'wasn', 't', 'a', 'dream', 'his', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'its', 'four', 'familiar', 'wall', 'a', 'collection', 'of', 'textile', 'sample', 'lie', 'sprea

### Using `TextBlob`

In [122]:
from textblob import TextBlob, Word

# create a TextBlob for our sentence
sent_tb = TextBlob(text_from_re_tokens)

# lemmatize each word
blob_lemmas = [word.lemmatize() for word in sent_tb.words]
print(len(blob_lemmas), blob_lemmas)

200 ['One', 'morning', 'when', 'Gregor', 'Samsa', 'woke', 'from', 'troubled', 'dream', 'he', 'found', 'himself', 'transformed', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lifted', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divided', 'by', 'arch', 'into', 'stiff', 'section', 'The', 'bedding', 'wa', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seemed', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'leg', 'pitifully', 'thin', 'compared', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'waved', 'about', 'helplessly', 'a', 'he', 'looked', 'What', 's', 'happened', 'to', 'me', 'he', 'thought', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'it', 'four', 'familiar', 'wall', 'A', 'collection', 'of', 'textile', 'sampl

### Adding POS Tags to `nltk` and `TextBlob`

By default, `nltk` and `TextBlob` treat every word as a noun. This is why words like "woke", "found", or "transformed" don't change after the lemmatization step. We can provide more information by adding the corresponding Part of Speech for each token. 

In `nltk`:

 - we use `pos_tag()` to get tokens along with tags
 - we call the `lemmatize()` function with the second parameter, that is a tag

[Source](https://www.guru99.com/stemming-lemmatization-python-nltk.html)

In [120]:
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from collections import defaultdict

tag_map_nltk = defaultdict(lambda : wn.NOUN)
tag_map_nltk['J'] = wn.ADJ
tag_map_nltk['V'] = wn.VERB
tag_map_nltk['R'] = wn.ADV

nltk_lemmas2 = [nltk_lemmatizer.lemmatize(token, tag_map_nltk[tag[0]]) for token, tag in pos_tag(re_tokens)]
print(nltk_lemmas2)

['One', 'morning', 'when', 'Gregor', 'Samsa', 'wake', 'from', 'troubled', 'dream', 'he', 'find', 'himself', 'transform', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lift', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divide', 'by', 'arch', 'into', 'stiff', 'section', 'The', 'bedding', 'be', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seem', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'leg', 'pitifully', 'thin', 'compare', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'wave', 'about', 'helplessly', 'a', 'he', 'look', 'What', 's', 'happen', 'to', 'me', 'he', 'think', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'it', 'four', 'familiar', 'wall', 'A', 'collection', 'of', 'textile', 'sample', 'lay', 'spread',

In `TextBlob`:

 - we call `TextBlob(text).tags` to get tokens and tags
 - we call `word.lemmatize()` with the tag parameter

[Source](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#textbloblemmatizer)

In [121]:
tag_map_tb = {  "J": 'a', # adjectives
                "N": 'n', # nouns
                "V": 'v', # verbs
                "R": 'r'} # adverbs

words_and_tags = [(w, tag_map_tb.get(pos[0], 'n')) for w, pos in sent_tb.tags]
blob_lemmas2 = [word.lemmatize(tag) for word, tag in words_and_tags]
print(len(blob_lemmas2), blob_lemmas2)

200 ['One', 'morning', 'when', 'Gregor', 'Samsa', 'wake', 'from', 'troubled', 'dream', 'he', 'find', 'himself', 'transform', 'in', 'his', 'bed', 'into', 'a', 'horrible', 'vermin', 'He', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lift', 'his', 'head', 'a', 'little', 'he', 'could', 'see', 'his', 'brown', 'belly', 'slightly', 'domed', 'and', 'divide', 'by', 'arch', 'into', 'stiff', 'section', 'The', 'bedding', 'be', 'hardly', 'able', 'to', 'cover', 'it', 'and', 'seem', 'ready', 'to', 'slide', 'off', 'any', 'moment', 'His', 'many', 'leg', 'pitifully', 'thin', 'compare', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'wave', 'about', 'helplessly', 'a', 'he', 'look', 'What', 's', 'happen', 'to', 'me', 'he', 'think', 'It', 'wasn', 't', 'a', 'dream', 'His', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'little', 'too', 'small', 'lay', 'peacefully', 'between', 'it', 'four', 'familiar', 'wall', 'A', 'collection', 'of', 'textile', 'sample', 'lay', 'spre

### Comparing Results

In [123]:
list(set(nltk_lemmas) ^ set(blob_lemmas))

[]

In [126]:
list(set(nltk_lemmas2) ^ set(blob_lemmas2))

[]

`nltk` and `TextBlob` return identical results.

Let's see what's changed after applying POS tags

In [128]:
sorted(list(set(nltk_lemmas) ^ set(nltk_lemmas2)))

['be',
 'compare',
 'compared',
 'covered',
 'divide',
 'divided',
 'find',
 'fit',
 'fitted',
 'found',
 'gild',
 'gilded',
 'had',
 'hang',
 'happen',
 'happened',
 'have',
 'house',
 'housed',
 'hung',
 'lift',
 'lifted',
 'looked',
 'low',
 'lower',
 'raise',
 'raising',
 'sat',
 'seem',
 'seemed',
 'show',
 'showed',
 'sit',
 'think',
 'thought',
 'transform',
 'transformed',
 'turn',
 'turned',
 'wa',
 'wake',
 'wave',
 'waved',
 'woke']

`nltk` did a great job at turning past tense verbs to the present tense

## Stemming with `nltk`

### Using `PorterStemmer()`

In [132]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

ps_stemms = [ps.stem(w) for w in re_tokens]
print(ps_stemms)

['one', 'morn', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubl', 'dream', 'he', 'found', 'himself', 'transform', 'in', 'hi', 'bed', 'into', 'a', 'horribl', 'vermin', 'he', 'lay', 'on', 'hi', 'armour', 'like', 'back', 'and', 'if', 'he', 'lift', 'hi', 'head', 'a', 'littl', 'he', 'could', 'see', 'hi', 'brown', 'belli', 'slightli', 'dome', 'and', 'divid', 'by', 'arch', 'into', 'stiff', 'section', 'the', 'bed', 'wa', 'hardli', 'abl', 'to', 'cover', 'it', 'and', 'seem', 'readi', 'to', 'slide', 'off', 'ani', 'moment', 'hi', 'mani', 'leg', 'piti', 'thin', 'compar', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'wave', 'about', 'helplessli', 'as', 'he', 'look', 'what', 's', 'happen', 'to', 'me', 'he', 'thought', 'it', 'wasn', 't', 'a', 'dream', 'hi', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'littl', 'too', 'small', 'lay', 'peac', 'between', 'it', 'four', 'familiar', 'wall', 'a', 'collect', 'of', 'textil', 'sampl', 'lay', 'spread', 'out', 'on', 'the', 'tabl', 'sams

In [133]:
from nltk.stem import SnowballStemmer

sno = SnowballStemmer('english')

sno_stemms = [sno.stem(w) for w in re_tokens]
print(sno_stemms)

['one', 'morn', 'when', 'gregor', 'samsa', 'woke', 'from', 'troubl', 'dream', 'he', 'found', 'himself', 'transform', 'in', 'his', 'bed', 'into', 'a', 'horribl', 'vermin', 'he', 'lay', 'on', 'his', 'armour', 'like', 'back', 'and', 'if', 'he', 'lift', 'his', 'head', 'a', 'littl', 'he', 'could', 'see', 'his', 'brown', 'belli', 'slight', 'dome', 'and', 'divid', 'by', 'arch', 'into', 'stiff', 'section', 'the', 'bed', 'was', 'hard', 'abl', 'to', 'cover', 'it', 'and', 'seem', 'readi', 'to', 'slide', 'off', 'ani', 'moment', 'his', 'mani', 'leg', 'piti', 'thin', 'compar', 'with', 'the', 'size', 'of', 'the', 'rest', 'of', 'him', 'wave', 'about', 'helpless', 'as', 'he', 'look', 'what', 's', 'happen', 'to', 'me', 'he', 'thought', 'it', 'wasn', 't', 'a', 'dream', 'his', 'room', 'a', 'proper', 'human', 'room', 'although', 'a', 'littl', 'too', 'small', 'lay', 'peac', 'between', 'it', 'four', 'familiar', 'wall', 'a', 'collect', 'of', 'textil', 'sampl', 'lay', 'spread', 'out', 'on', 'the', 'tabl', 'sam

In [134]:
sorted(list(set(ps_stemms) ^ set(sno_stemms)))

['hard',
 'hardli',
 'helpless',
 'helplessli',
 'hi',
 'his',
 'slight',
 'slightli',
 'wa',
 'was']