In [1]:
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag

In [2]:
def preprocess(filename):
    f = open(filename,'r')
    text = f.read()
    text = text.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    
    words = word_tokenize(text_p)
    
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in filtered_words]
    
    pos = pos_tag(filtered_words)
    
    return words, filtered_words, stemmed, pos

In [3]:
words, filtered_words, stemmed, pos = preprocess('pride_and_prejudice.txt')

In [4]:
print('Words:', words)

Words: ['chapter', '1', 'it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife', 'however', 'little', 'known', 'the', 'feelings', 'or', 'views', 'of', 'such', 'a', 'man', 'may', 'be', 'on', 'his', 'first', 'entering', 'a', 'neighbourhood', 'this', 'truth', 'is', 'so', 'well', 'fixed', 'in', 'the', 'minds', 'of', 'the', 'surrounding', 'families', 'that', 'he', 'is', 'considered', 'the', 'rightful', 'property', 'of', 'some', 'one', 'or', 'other', 'of', 'their', 'daughters', '“', 'my', 'dear', 'mr', 'bennet', '”', 'said', 'his', 'lady', 'to', 'him', 'one', 'day', '“', 'have', 'you', 'heard', 'that', 'netherfield', 'park', 'is', 'let', 'at', 'last', '”', 'mr', 'bennet', 'replied', 'that', 'he', 'had', 'not', '“', 'but', 'it', 'is', '”', 'returned', 'she', '“', 'for', 'mrs', 'long', 'has', 'just', 'been', 'here', 'and', 'she', 'told', 'me', 'all', 'about', 'it', '”'

In [5]:
print('Filtered words:', filtered_words)

Filtered words: ['chapter', '1', 'truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife', 'however', 'little', 'known', 'feelings', 'views', 'man', 'may', 'first', 'entering', 'neighbourhood', 'truth', 'well', 'fixed', 'minds', 'surrounding', 'families', 'considered', 'rightful', 'property', 'one', 'daughters', '“', 'dear', 'mr', 'bennet', '”', 'said', 'lady', 'one', 'day', '“', 'heard', 'netherfield', 'park', 'let', 'last', '”', 'mr', 'bennet', 'replied', '“', '”', 'returned', '“', 'mrs', 'long', 'told', '”', 'mr', 'bennet', 'made', 'answer', '“', 'want', 'know', 'taken', '”', 'cried', 'wife', 'impatiently', '“', 'want', 'tell', 'objection', 'hearing', '”', 'invitation', 'enough', '“', 'dear', 'must', 'know', 'mrs', 'long', 'says', 'netherfield', 'taken', 'young', 'man', 'large', 'fortune', 'north', 'england', 'came', 'monday', 'chaise', 'four', 'see', 'place', 'much', 'delighted', 'agreed', 'mr', 'morris', 'immediately', 'take'

In [6]:
print('Stemmed words:', stemmed)

Stemmed words: ['chapter', '1', 'truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife', 'howev', 'littl', 'known', 'feel', 'view', 'man', 'may', 'first', 'enter', 'neighbourhood', 'truth', 'well', 'fix', 'mind', 'surround', 'famili', 'consid', 'right', 'properti', 'one', 'daughter', '“', 'dear', 'mr', 'bennet', '”', 'said', 'ladi', 'one', 'day', '“', 'heard', 'netherfield', 'park', 'let', 'last', '”', 'mr', 'bennet', 'repli', '“', '”', 'return', '“', 'mr', 'long', 'told', '”', 'mr', 'bennet', 'made', 'answer', '“', 'want', 'know', 'taken', '”', 'cri', 'wife', 'impati', '“', 'want', 'tell', 'object', 'hear', '”', 'invit', 'enough', '“', 'dear', 'must', 'know', 'mr', 'long', 'say', 'netherfield', 'taken', 'young', 'man', 'larg', 'fortun', 'north', 'england', 'came', 'monday', 'chais', 'four', 'see', 'place', 'much', 'delight', 'agre', 'mr', 'morri', 'immedi', 'take', 'possess', 'michaelma', 'servant', 'hous', 'end', 'next', 'week', '”', '“',

In [7]:
print('Part of Speech:', pos)

Part of Speech: [('chapter', 'NN'), ('1', 'CD'), ('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN'), ('however', 'RB'), ('little', 'JJ'), ('known', 'JJ'), ('feelings', 'NNS'), ('views', 'NNS'), ('man', 'NN'), ('may', 'MD'), ('first', 'VB'), ('entering', 'VBG'), ('neighbourhood', 'NN'), ('truth', 'NN'), ('well', 'RB'), ('fixed', 'VBN'), ('minds', 'NNS'), ('surrounding', 'VBG'), ('families', 'NNS'), ('considered', 'VBN'), ('rightful', 'JJ'), ('property', 'NN'), ('one', 'CD'), ('daughters', 'NNS'), ('“', 'VBP'), ('dear', 'JJ'), ('mr', 'JJ'), ('bennet', 'NN'), ('”', 'NN'), ('said', 'VBD'), ('lady', 'JJ'), ('one', 'CD'), ('day', 'NN'), ('“', 'NNP'), ('heard', 'VBD'), ('netherfield', 'DT'), ('park', 'NN'), ('let', 'VBD'), ('last', 'JJ'), ('”', 'JJ'), ('mr', 'FW'), ('bennet', 'NN'), ('replied', 'VBD'), ('“', 'NNP'), ('”', 'NNP'), ('returned', '