# Stopwords in Python

If you have ever wondered about the nature of the stopword lists available in Python, wonder no more. This notebook reveals the length and composition of the most common stopword lists, compares them with one another, and then offers some methods for creating custom stopword lists.

## `stop-words`

In [9]:
from stop_words import get_stop_words

# Fetch the 'en' stopword list from the `stop-words` package; it is reused
# by the comparison cells further down.
sw_stop = get_stop_words('en')
# Bare trailing expression: the notebook echoes the list length as output.
len(sw_stop)

174

In [32]:
# Dump every entry of the `stop-words` list for inspection.
print(sw_stop)

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 't

## NLTK

In [10]:
import nltk
from nltk.corpus import stopwords

# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download('stopwords')) — confirm.
nltk_stop = stopwords.words('english')
# Bare trailing expression: the notebook echoes the list length as output.
len(nltk_stop)

179

In [33]:
# Dump every entry of the NLTK list for inspection.
print(nltk_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## SciKit-Learn

In [11]:
# Import the constant from its public location. The
# `sklearn.feature_extraction.stop_words` module was deprecated in
# scikit-learn 0.22 and cannot be imported in later releases, whereas
# `sklearn.feature_extraction.text` exposes the same frozenset on both old
# and new versions.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Keep the `sklearn_stop` name: the comparison cells further down rely on it.
sklearn_stop = ENGLISH_STOP_WORDS
# Bare trailing expression: the notebook echoes the set size as output.
len(sklearn_stop)

318

In [34]:
# Dump the scikit-learn stopwords. The object is a frozenset (see the output
# below), so the entries are not printed in alphabetical order.
print(sklearn_stop)

frozenset({'none', 'least', 'me', 'twenty', 'should', 'sometime', 'as', 'seeming', 'herein', 'each', 'eight', 'after', 'him', 'only', 'ever', 'move', 'seem', 'throughout', 'below', 'both', 'hereby', 'please', 'somehow', 'latterly', 'many', 'sometimes', 'toward', 'two', 'whoever', 'more', 'thereupon', 'eg', 'can', 'less', 'elsewhere', 'a', 'seems', 'for', 'how', 'fill', 'much', 'his', 'few', 'nowhere', 'sixty', 'against', 'if', 'together', 'may', 'and', 'cant', 'though', 'on', 'across', 'forty', 'but', 'made', 'find', 'amoungst', 'herself', 'this', 'had', 'until', 'alone', 'is', 'our', 'often', 'same', 'in', 'first', 'former', 'namely', 'twelve', 'that', 'indeed', 'where', 'must', 'system', 'keep', 'front', 'my', 'to', 'which', 'next', 'couldnt', 'every', 'than', 'other', 'its', 'under', 'ltd', 'cry', 'what', 'will', 'done', 'never', 'why', 'himself', 'found', 'de', 'whither', 'through', 'whereafter', 'last', 'something', 'third', 'however', 'onto', 'either', 'are', 'see', 'anywhere', '

## Stopword List Comparisons

### `stop-words` vs NLTK

In [30]:
# Entries of the `stop-words` list that NLTK's list lacks, alphabetized.
print(sorted(set(sw_stop).difference(nltk_stop)))

["can't", 'cannot', 'could', "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", 'ought', "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", 'would']


In [29]:
# Entries of NLTK's list that the `stop-words` list lacks, alphabetized.
print(sorted(set(nltk_stop).difference(sw_stop)))

['ain', 'aren', 'can', 'couldn', 'd', 'didn', 'doesn', 'don', 'hadn', 'hasn', 'haven', 'isn', 'just', 'll', 'm', 'ma', 'mightn', "mightn't", 'mustn', 'needn', "needn't", 'now', 'o', 're', 's', 'shan', "should've", 'shouldn', 't', "that'll", 've', 'wasn', 'weren', 'will', 'won', 'wouldn', 'y']


### `stop-words` vs SciKit-Learn

In [28]:
# Entries of scikit-learn's set that the `stop-words` list lacks, alphabetized.
print(sorted(set(sklearn_stop).difference(sw_stop)))

['across', 'afterwards', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'amoungst', 'amount', 'another', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'around', 'back', 'became', 'become', 'becomes', 'becoming', 'beforehand', 'behind', 'beside', 'besides', 'beyond', 'bill', 'bottom', 'call', 'can', 'cant', 'co', 'con', 'couldnt', 'cry', 'de', 'describe', 'detail', 'done', 'due', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'former', 'formerly', 'forty', 'found', 'four', 'front', 'full', 'get', 'give', 'go', 'hasnt', 'hence', 'hereafter', 'hereby', 'herein', 'hereupon', 'however', 'hundred', 'ie', 'inc', 'indeed', 'interest', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'meanwhile', 'might', 'mill', 'mine', 'moreover', 'mostly',

In [27]:
# Entries of the `stop-words` list that scikit-learn's set lacks, alphabetized.
print(sorted(set(sw_stop).difference(sklearn_stop)))

["aren't", "can't", "couldn't", 'did', "didn't", 'does', "doesn't", 'doing', "don't", "hadn't", "hasn't", "haven't", 'having', "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "let's", "mustn't", 'ought', "shan't", "she'd", "she'll", "she's", "shouldn't", "that's", 'theirs', "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "we've", "weren't", "what's", "when's", "where's", "who's", "why's", "won't", "wouldn't", "you'd", "you'll", "you're", "you've"]


### NLTK vs SciKit-Learn

In [26]:
# Entries of NLTK's list that scikit-learn's set lacks, alphabetized.
print(sorted(set(nltk_stop).difference(sklearn_stop)))

['ain', 'aren', "aren't", 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'having', 'isn', "isn't", "it's", 'just', 'll', 'm', 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'o', 's', 'shan', "shan't", "she's", "should've", 'shouldn', "shouldn't", 't', "that'll", 'theirs', 've', 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'y', "you'd", "you'll", "you're", "you've"]


In [31]:
# Entries of scikit-learn's set that NLTK's list lacks, alphabetized.
print(sorted(set(sklearn_stop).difference(nltk_stop)))

['across', 'afterwards', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'amoungst', 'amount', 'another', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'around', 'back', 'became', 'become', 'becomes', 'becoming', 'beforehand', 'behind', 'beside', 'besides', 'beyond', 'bill', 'bottom', 'call', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'done', 'due', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'former', 'formerly', 'forty', 'found', 'four', 'front', 'full', 'get', 'give', 'go', 'hasnt', 'hence', 'hereafter', 'hereby', 'herein', 'hereupon', 'however', 'hundred', 'ie', 'inc', 'indeed', 'interest', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'meanwhile', 'might', 'mill', 'mine', 'moreover

## Roll Your Own

### TF-IDF

### word2vec

A better approach (particularly for NLP tasks) would be to use an unsupervised model like word2vec to generate a vector for each word. Then sort the vectors by magnitude and tag as stopwords those words whose vectors are low in magnitude and whose frequency counts are high.