In [1]:
import nltk   #nltk is a Text Preprocessing Library. It also contains some sample examples like datasets.

In [2]:
from nltk.corpus import brown   #Brown corpus is a type of corpus.There are other corpuses as well.

In [3]:
brown.categories()    #this gives all the categories of words and sentences present in brown corpus

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
brown.words()   #gives words present in brown corpus

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [5]:
brown.sents()   #gives sentences present in brown corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [6]:
#Taking the sentences on emotion = "adventure" from the brown corpus
data = brown.sents(categories=["adventure"])

In [7]:
len(data)

4637

In [8]:
data   #This data is in form of words

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [9]:
" ".join(data[0])    #Since this data is in form of words, so join them to form sentences.

'Dan Morgan told himself he would forget Ann Turner .'

# Pipeline of NLP :
    
    1. Get the data
    2. Tokenization - Breaking the documents into sentences and sentences to words. During this stopwords are also removed.
    3. Stemming - Converting different forms of same word into single base word.
    4. Building of vocabulary - Vocabulary is a list of distinct words across the training data documents.
    5. Assign freq of each word as a number to that word - We will create a bag of words storing freq of each meaningful word corresponding to its index.


# Tokenization :
Breaking the documents into sentences and sentences to words. During this stopwords are also removed.

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize #sent_tokenize helps to converts documents to sentences and word_tokenize helps to convert sentences to words.

In [11]:
document = """A paragraph is a series of related sentences developing a central idea, called the topic. Try to think about paragraphs in terms of thematic unity: a paragraph is a sentence or a group of sentences that supports one central, unified idea. Paragraphs add one idea at a time to your broader argument.
"""

In [12]:
sentence = "Code is OP! abc@gmail.com"

In [13]:
sents = sent_tokenize(document)
print(sents)

['A paragraph is a series of related sentences developing a central idea, called the topic.', 'Try to think about paragraphs in terms of thematic unity: a paragraph is a sentence or a group of sentences that supports one central, unified idea.', 'Paragraphs add one idea at a time to your broader argument.']


In [14]:
#Note : Tokenize also breaks down special characters like @,#, etc..
words = word_tokenize(sentence)    
print(words)

['Code', 'is', 'OP', '!', 'abc', '@', 'gmail.com']


### Stop word Removal

In [15]:
from nltk.corpus import stopwords

In [16]:
sw = set(stopwords.words('english'))  #sw contains the list of stopwords in english language.

In [17]:
sw 

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [18]:
text = "I am very good in cricket".split(" ")  #contains the list of words in this sentence.

In [19]:
text

['I', 'am', 'very', 'good', 'in', 'cricket']

In [20]:
#We are creating a function to remove the stopwords from the above sentence.
def remove_stopwords(text, stopwords):
    useful = [w for w in text if w not in stopwords]   #contaijns words in text which are not in stopwords.
    return useful  

In [21]:
useful_words = remove_stopwords(text, sw)
useful_words   

['I', 'good', 'cricket']

# Tokenisation using regex (regular expression)

Using this we can consider the special characters as non breakable words.

In [23]:
from nltk.tokenize import RegexpTokenizer

In [29]:
tokenizer = RegexpTokenizer("[a-zA-Z@.]+")   #Inside the brackets it contains the regular expression

In [30]:
sent = "My email is abc@gmail.com. Please dont mail me."

In [31]:
useful = tokenizer.tokenize(sent)

In [32]:
print(useful)  #So it does not break on basis of @ as it considers it to be a word.

['My', 'email', 'is', 'abc@gmail.com.', 'Please', 'dont', 'mail', 'me.']


# Stemming

At the very basics of it, the major difference between the porter and lancaster stemming algorithms is that the lancaster stemmer is significantly more aggressive than the porter stemmer. The three major stemming algorithms in use today are Porter, Snowball(Porter2), and Lancaster (Paice-Husk), with the aggressiveness continuum basically following along those same lines. Porter is the least aggressive algorithm, with the specifics of each algorithm actually being fairly lengthy and technical. Here is a break down for you though:

Porter: Most commonly used stemmer without a doubt, also one of the most gentle stemmers. One of the few stemmers that actually has Java support which is a plus, though it is also the most computationally intensive of the algorithms(Granted not by a very significant margin). It is also the oldest stemming algorithm by a large margin.

Porter2: Nearly universally regarded as an improvement over porter, and for good reason. Porter himself in fact admits that it is better than his original algorithm. Slightly faster computation time than porter, with a fairly large community around it.

Lancaster: Very aggressive stemming algorithm, sometimes to a fault. With porter and snowball, the stemmed representations are usually fairly intuitive to a reader, not so with Lancaster, as many shorter words will become totally obfuscated. The fastest algorithm here, and will reduce your working set of words hugely, but if you want more distinction, not the tool you would want.

Honestly, I feel that Snowball is usually the way to go. There are certain circumstances in which Lancaster will hugely trim down your working set, which can be very useful, however the marginal speed increase over snowball in my opinion is not worth the lack of precision. Porter has the most implementations though and so is usually the default go-to algorithm, but if you can, use snowball.

In [33]:
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer    #these are the different types of stemmers.

In [34]:
ps = PorterStemmer()

In [35]:
ps.stem("running")

'run'

In [36]:
ps.stem("ran")

'ran'

In [66]:
#SnowballStemmer is multilingual ie it supports other languages too.
ls = LancasterStemmer()
ls.stem('ran')

'ran'

In [70]:
ss = SnowballStemmer('english')   #it needs language also
ss.stem('ran')

'ran'

# Building Vocabulary

In [48]:
#corpus is the collection of sentences and words.
corpus =  [
    "A paragraph is a series of related sentences developing a central idea, called the topic. Try to think about paragraphs in terms of thematic unity: a paragraph is a sentence or a group of sentences that supports one central, unified idea. ",
    'Life is what happens when you are busy making other plans.',
    'A friend is one that knows you as you are, understands where you have been, accepts what you have become, and still, gently allows you to grow.',
    'You only live once, but if you do it right, once is enough.'
]

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()   #CountVectorizer counts the frequency of all words.

In [50]:
vc = cv.fit_transform(corpus)    #vectorized_corpus..During training use fit_transform as it helps to create vocabulary in form of an array or vector.

In [51]:
vc.toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2,
        0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1, 2, 1,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 2, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 5],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]], dtype=int64)

In [52]:
print(vc)

  (0, 36)	2
  (0, 24)	2
  (0, 43)	1
  (0, 30)	3
  (0, 39)	1
  (0, 42)	2
  (0, 12)	1
  (0, 11)	2
  (0, 21)	2
  (0, 10)	1
  (0, 48)	1
  (0, 52)	1
  (0, 53)	1
  (0, 51)	1
  (0, 50)	1
  (0, 0)	1
  (0, 37)	1
  (0, 23)	1
  (0, 46)	1
  (0, 49)	1
  (0, 56)	1
  (0, 41)	1
  (0, 34)	1
  (0, 17)	1
  (0, 47)	1
  :	:
  (2, 15)	1
  (2, 26)	1
  (2, 5)	1
  (2, 54)	1
  (2, 59)	1
  (2, 20)	2
  (2, 7)	1
  (2, 1)	1
  (2, 6)	1
  (2, 3)	1
  (2, 44)	1
  (2, 16)	1
  (2, 2)	1
  (2, 18)	1
  (3, 24)	1
  (3, 60)	2
  (3, 33)	1
  (3, 28)	1
  (3, 31)	2
  (3, 9)	1
  (3, 22)	1
  (3, 13)	1
  (3, 25)	1
  (3, 40)	1
  (3, 14)	1


In [53]:
print(cv.vocabulary_)

{'paragraph': 36, 'is': 24, 'series': 43, 'of': 30, 'related': 39, 'sentences': 42, 'developing': 12, 'central': 11, 'idea': 21, 'called': 10, 'the': 48, 'topic': 52, 'try': 53, 'to': 51, 'think': 50, 'about': 0, 'paragraphs': 37, 'in': 23, 'terms': 46, 'thematic': 49, 'unity': 56, 'sentence': 41, 'or': 34, 'group': 17, 'that': 47, 'supports': 45, 'one': 32, 'unified': 55, 'life': 27, 'what': 57, 'happens': 19, 'when': 58, 'you': 60, 'are': 4, 'busy': 8, 'making': 29, 'other': 35, 'plans': 38, 'friend': 15, 'knows': 26, 'as': 5, 'understands': 54, 'where': 59, 'have': 20, 'been': 7, 'accepts': 1, 'become': 6, 'and': 3, 'still': 44, 'gently': 16, 'allows': 2, 'grow': 18, 'only': 33, 'live': 28, 'once': 31, 'but': 9, 'if': 22, 'do': 13, 'it': 25, 'right': 40, 'enough': 14}


In [54]:
print(len(cv.vocabulary_))    #_ means that it has been formed after running some operations like fit_transform function here.

61


# Reverse Mapping
To convert numbers into words.

In [55]:
vc = vc.toarray()

In [56]:
numbers = vc[2]  #2nd sentence
print(numbers)

[0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 2 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 5]


In [57]:
print(len(vc[2]))   #This has same length as it will give frequency of all words as in vocabulary.So all the examples will contain same no of features. 

61


In [58]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())   #lowering words
    
    #remove stopwords
    words = remove_stopwords(words, sw)
    return words

In [59]:
myTokenizer('This is a random text')  

['random', 'text']

In [60]:
cv = CountVectorizer(tokenizer = myTokenizer)  #we can pass tokenizer.By default it is none.

In [61]:
vc = cv.fit_transform(corpus).toarray()    #vc means vectorised corpus in form of array.

In [62]:
print(vc)  #this contains the frequency of words present at the indexes.

[[0 0 0 0 1 2 1 0 0 0 1 0 0 1 1 0 0 0 0 1 2 1 0 1 0 1 2 1 0 1 1 1 1 1 1 0
  1 1]
 [0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]
 [1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
  0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0]]


In [63]:
len(vc[0])   #Now the length will be reduced as we have removed the stopwords by passing the tokenizer.

38

In [64]:
cv.vocabulary_   #this contains the words along with their index where they are present in the dictionary.

{'paragraph': 20,
 'series': 27,
 'related': 23,
 'sentences': 26,
 'developing': 6,
 'central': 5,
 'idea': 13,
 'called': 4,
 'topic.': 33,
 'try': 34,
 'think': 32,
 'paragraphs': 21,
 'terms': 30,
 'thematic': 31,
 'unity': 37,
 'sentence': 25,
 'group': 10,
 'supports': 29,
 'one': 19,
 'unified': 36,
 'idea.': 14,
 'life': 16,
 'happens': 12,
 'busy': 3,
 'making': 18,
 'plans.': 22,
 'friend': 8,
 'knows': 15,
 'understands': 35,
 'accepts': 0,
 'become': 2,
 'still': 28,
 'gently': 9,
 'allows': 1,
 'grow.': 11,
 'live': 17,
 'right': 24,
 'enough.': 7}