<a href="https://colab.research.google.com/github/kinn-j/NLP/blob/main/NLP_StopWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stop Words
Stop words are words that do not contribute to the deeper meaning of a phrase. They are the most common words, such as "the", "a", and "is". For some applications, like document classification, it may make sense to remove stop words. NLTK provides a list of commonly agreed-upon stop words for a variety of languages, such as English.

In [1]:
# Perform standard imports:
import spacy
# Load the 'en_core_web_sm' pipeline; the resulting `nlp` object is what the
# later cells query for default stop words and per-word lexeme flags.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Download NLTK's stop-word corpus; it is read later through
# nltk.corpus.stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Display spaCy's default stop words (a set, so the printed order is arbitrary),
# then report how many there are.
default_stop_words = nlp.Defaults.stop_words
print(default_stop_words)
a = list(default_stop_words)
len(a)

{'twenty', 'someone', 'nine', 'about', 'nevertheless', '‘m', 'how', 'fifteen', 'whither', 'was', 'anywhere', 'thereby', 'moreover', 'full', 'their', 'herself', 'has', 'nor', 'there', 'least', 'via', 'afterwards', 'under', 'which', '’d', "'re", 'five', 'using', 'except', 'make', 'then', 'ours', 'still', 'none', 'due', 'either', 'through', 'keep', 'them', 'each', 'all', 'than', 'she', 'though', 'had', '‘ll', 'why', 'move', 'whence', 'whenever', 'down', 'hundred', 'hereupon', 'never', 'used', 'sometime', 'many', 'take', 'amongst', 'very', '’ll', 'toward', "'ll", 'doing', 'go', 'this', 'while', 'without', 'hereafter', 'until', 'wherever', 'alone', 'back', 'meanwhile', 'herein', 'been', 'fifty', 'are', 'can', 'some', 'first', 'somewhere', 'above', 'every', 'his', 'my', 'once', 'seem', 'however', 'for', 'himself', 'get', 'any', 'see', 'may', '’ve', 'upon', 'whereby', 'us', 'since', 'he', 'next', 'yours', 'because', 'part', 'neither', 'might', 'regarding', 'him', 'we', 'nothing', 'among', 'na

326

In [9]:
# Count the default stop words directly from the set (no list copy needed).
len(nlp.Defaults.stop_words)

326

## To see if a word is a stop word

In [10]:
# Look up 'myself' in the vocabulary and read the stop-word flag on its lexeme.
nlp.vocab['myself'].is_stop

True

In [11]:
# Same check for 'mystery', before it is added to the stop-word set.
nlp.vocab['mystery'].is_stop

False

In [12]:
# Add the word to the set of stop words. Use lowercase!
# set.add() is a no-op if the word is already present, so re-running is safe.
nlp.Defaults.stop_words.add('mystery')

In [13]:
# Set the stop_word tag on the lexeme
# (done in addition to updating nlp.Defaults.stop_words in the previous cell).
nlp.vocab['mystery'].is_stop = True

In [14]:
# Re-count the stop words after adding 'mystery'.
len(nlp.Defaults.stop_words)

327

In [15]:
# Read the lexeme flag for 'mystery' again, after it was set to True above.
nlp.vocab['mystery'].is_stop

True

## To remove a stop word
Alternatively, you may decide that `'beyond'` should not be considered a stop word.

In [16]:
# Remove the word from the set of stop words.
# discard() does nothing when the word is already absent, so this cell can be
# re-run in the same kernel; remove() would raise KeyError on the second run.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [17]:
# Re-count the stop words after removing 'beyond'.
len(nlp.Defaults.stop_words)

326

In [18]:
# Read the lexeme flag for 'beyond' after it was cleared above.
nlp.vocab['beyond'].is_stop

False

In [20]:
import string
import re
import nltk
# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK releases
# read it from the 'punkt' package, while NLTK 3.9 and later look for
# 'punkt_tab', so fetch both to keep the tokenizing cell working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# sample sentence that the following cells tokenize and clean
text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# split into words
# (punctuation comes back as its own token, e.g. the trailing '!')
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [22]:
# Lower-case every token so later comparisons against the (lowercase)
# stop-word list are case-insensitive.
tokens = list(map(str.lower, tokens))
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [23]:
# Build a regex character class matching any single character from
# string.punctuation; re.escape keeps characters such as ']' and '\' literal.
escaped_punct = re.escape(string.punctuation)
re_punc = re.compile('[%s]' % escaped_punct)
print(re_punc)

re.compile('[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]')


In [24]:
# Delete every punctuation character inside each token; a token made up only
# of punctuation becomes an empty string.
stripped = [
    re_punc.sub('', token)
    for token in tokens
]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [25]:
# Keep only purely alphabetic tokens; this also drops the empty strings
# left behind by punctuation stripping.
words = list(filter(str.isalpha, stripped))
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [26]:
# remove stop words, keeping only the non-stop words
# (the stop-word list is turned into a set for fast membership tests)
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [None]:
# Check the same words against spaCy's stop-word flag: 'dog' survived the NLTK filter above.
nlp.vocab['dog'].is_stop

False

In [27]:
# 'over' was removed by the NLTK filter above; read spaCy's flag for it too.
nlp.vocab['over'].is_stop

True