# Basic NLP Pipeline
        -Data collection
        -Tokenization,Stopwords,Stemming
        -Building a common vocab
        -Vectorizing the documents
        -Performing the classification/clustering

# Data collection

In [1]:
from nltk.corpus import brown

In [2]:
categories = brown.categories()
print(categories)

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data = brown.sents(categories = 'adventure')[:100]
print(len(data))
print(data[0])

100
['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.']


# Tokenization

In [4]:
text = "It was a pleasant day, the weather was cool and there were light showers. I went to the market to buy 10 bananas, 20 mangoes and some vegetables."
print(text)

It was a pleasant day, the weather was cool and there were light showers. I went to the market to buy 10 bananas, 20 mangoes and some vegetables.


In [5]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [6]:
sents = sent_tokenize(text)
print(len(sents))
print(sents)

2
['It was a pleasant day, the weather was cool and there were light showers.', 'I went to the market to buy 10 bananas, 20 mangoes and some vegetables.']


In [7]:
words_ = word_tokenize(text.lower())
print(len(words_))
print(words_)

32
['it', 'was', 'a', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'i', 'went', 'to', 'the', 'market', 'to', 'buy', '10', 'bananas', ',', '20', 'mangoes', 'and', 'some', 'vegetables', '.']


# Tokenization using Regular expressions
    Problem with word tokenizer - can't handle complex tokenizations(.,/?!@#$..etc).So we use Regexp tokenizer class in NLTK

In [8]:
from nltk.tokenize import RegexpTokenizer

In [9]:
#extract only belonging to a-z A-Z,+ signigies one or more occurence of the preceding character
tokenizer = RegexpTokenizer("[a-zA-z]+")

In [10]:
words = tokenizer.tokenize(text.lower())
print(len(words))
print(words)

26
['it', 'was', 'a', 'pleasant', 'day', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', 'i', 'went', 'to', 'the', 'market', 'to', 'buy', 'bananas', 'mangoes', 'and', 'some', 'vegetables']


# Stopwords Removal

In [11]:
from nltk.corpus import stopwords

In [12]:
sw = set(stopwords.words('english'))
print(len(stopwords.words('english')))
print(stopwords.words('english')[:5])

179
['i', 'me', 'my', 'myself', 'we']


In [13]:
def remove_stopwords(words):
    return [w for w in words if w not in sw]


useful_words = remove_stopwords(words)
print("Total words = {}".format(len(words)))
print(words)
print("Useful Words = {}".format(len(useful_words)))
print(useful_words)

Total words = 26
['it', 'was', 'a', 'pleasant', 'day', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', 'i', 'went', 'to', 'the', 'market', 'to', 'buy', 'bananas', 'mangoes', 'and', 'some', 'vegetables']
Useful Words = 12
['pleasant', 'day', 'weather', 'cool', 'light', 'showers', 'went', 'market', 'buy', 'bananas', 'mangoes', 'vegetables']


# Stemming
    -process that transforms particular words(verbs,plurals etc.) into their radical form
    -preserves the symantics of the sentence without increasing the unique tokens
    -jumps,jumping,jumped,jump => jump

In [14]:
text1 = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall."
#tokenize
words1 = tokenizer.tokenize(text1)
print(len(words1))
print(words1)

21
['Foxes', 'love', 'to', 'make', 'jumps', 'The', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'high', 'wall']


In [15]:
#remove stopwords
useful_words1 = remove_stopwords(words1)
print(len(useful_words1))
print(useful_words)

15
['pleasant', 'day', 'weather', 'cool', 'light', 'showers', 'went', 'market', 'buy', 'bananas', 'mangoes', 'vegetables']


# Stemming
     1.Snowball stemmer(multilingual)
     2.Porter stemmer
     3.Lancaster stemmer

In [16]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer

In [17]:
ss = SnowballStemmer('english')
print(ss.stem('playing'))
ps = PorterStemmer()
print(ps.stem('jumping'))

play
jump


In [18]:
from nltk.stem.lancaster import LancasterStemmer

In [19]:
ls = LancasterStemmer()
ls.stem('teenager')

'teen'

# lemmatization
        -same as stemming

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
l = WordNetLemmatizer()
l.lemmatize('crying')

'cry'

In [22]:
#function to preprocess the data
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

tokenizer = RegexpTokenizer("[a-zA-Z]+")
sw = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def data_preprocess(text):
    words = tokenizer.tokenize(text)
    useful_words = [w for w in words if w not in sw]
    for i in range(len(useful_words)):
        useful_words[i] = stemmer.stem(useful_words[i])
    return useful_words
    

In [23]:
print(data_preprocess(text1))

['fox', 'love', 'make', 'jump', 'the', 'quick', 'brown', 'fox', 'seen', 'jump', 'love', 'dog', 'ft', 'high', 'wall']


# Building a common vocab and vectorizing documents(using bag of words model)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
corpus = [
            'Indian cricket team will wins worldcup, says capt. Virat kohli',
            'We will win next lok sabha elections , says confident Indian PM',
            'The nobel laurate won the hearts of the people',
            'The movie raazi is a interesting Indian spy thriller based upon a real story'
]

In [26]:
cv = CountVectorizer() #default tokenizer
vectorized_corpus = cv.fit_transform(corpus)

In [27]:
print(vectorized_corpus) #<4x36 sparse matrix of type '<class 'numpy.int64'>'

  (0, 6)	1
  (0, 3)	1
  (0, 24)	1
  (0, 30)	1
  (0, 32)	1
  (0, 34)	1
  (0, 21)	1
  (0, 1)	1
  (0, 28)	1
  (0, 9)	1
  (1, 6)	1
  (1, 30)	1
  (1, 21)	1
  (1, 29)	1
  (1, 31)	1
  (1, 13)	1
  (1, 11)	1
  (1, 20)	1
  (1, 4)	1
  (1, 2)	1
  (1, 17)	1
  (2, 25)	3
  (2, 14)	1
  (2, 10)	1
  (2, 33)	1
  (2, 5)	1
  (2, 15)	1
  (2, 16)	1
  (3, 6)	1
  (3, 25)	1
  (3, 12)	1
  (3, 18)	1
  (3, 8)	1
  (3, 7)	1
  (3, 22)	1
  (3, 26)	1
  (3, 0)	1
  (3, 27)	1
  (3, 19)	1
  (3, 23)	1


In [28]:
vectorized_corpus = vectorized_corpus.toarray()
print(vectorized_corpus.shape)
vectorized_corpus

(4, 35)


array([[0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1],
       [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [29]:
print(len(cv.vocabulary_))
print(cv.vocabulary_)

35
{'indian': 6, 'cricket': 3, 'team': 24, 'will': 30, 'wins': 32, 'worldcup': 34, 'says': 21, 'capt': 1, 'virat': 28, 'kohli': 9, 'we': 29, 'win': 31, 'next': 13, 'lok': 11, 'sabha': 20, 'elections': 4, 'confident': 2, 'pm': 17, 'the': 25, 'nobel': 14, 'laurate': 10, 'won': 33, 'hearts': 5, 'of': 15, 'people': 16, 'movie': 12, 'raazi': 18, 'is': 8, 'interesting': 7, 'spy': 22, 'thriller': 26, 'based': 0, 'upon': 27, 'real': 19, 'story': 23}


In [30]:
cv.vocabulary_['elections']

4

In [31]:
print(cv.inverse_transform(vectorized_corpus))

[array(['capt', 'cricket', 'indian', 'kohli', 'says', 'team', 'virat',
       'will', 'wins', 'worldcup'], dtype='<U11'), array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
       'says', 'we', 'will', 'win'], dtype='<U11'), array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U11'), array(['based', 'indian', 'interesting', 'is', 'movie', 'raazi', 'real',
       'spy', 'story', 'the', 'thriller', 'upon'], dtype='<U11')]


In [32]:
import numpy as np
vector = np.ones((vectorized_corpus[0].shape[0],))
vector[2:5] = 0
cv.inverse_transform(vector)

[array(['based', 'capt', 'hearts', 'indian', 'interesting', 'is', 'kohli',
        'laurate', 'lok', 'movie', 'next', 'nobel', 'of', 'people', 'pm',
        'raazi', 'real', 'sabha', 'says', 'spy', 'story', 'team', 'the',
        'thriller', 'upon', 'virat', 'we', 'will', 'win', 'wins', 'won',
        'worldcup'], dtype='<U11')]

In [33]:
cv1 = CountVectorizer(tokenizer = data_preprocess)
vectorized_corpus1 = cv1.fit_transform(corpus).toarray()

In [34]:
print(vectorized_corpus1.shape)
vectorized_corpus1

(4, 28)


array([[0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 1],
       [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        0, 1, 1, 0, 0, 0]], dtype=int64)

In [35]:
print(len(cv1.vocabulary_))
print(cv1.vocabulary_)

28
{'indian': 6, 'cricket': 3, 'team': 22, 'win': 26, 'worldcup': 27, 'say': 19, 'capt': 1, 'virat': 25, 'koh': 8, 'next': 12, 'lok': 10, 'sabha': 18, 'elect': 4, 'confid': 2, 'pm': 15, 'nobel': 13, 'laurat': 9, 'heart': 5, 'peopl': 14, 'movi': 11, 'raazi': 16, 'interest': 7, 'spi': 20, 'thriller': 23, 'base': 0, 'upon': 24, 'real': 17, 'stori': 21}


# Bag of words:
    -unigram model
    -Bigram,Trigram model
    -ngram model

In [43]:
# Bigram model
cv2 = CountVectorizer(tokenizer = data_preprocess,ngram_range = (2,2))
cv2.fit_transform(corpus).toarray()

array([[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 0]], dtype=int64)

In [44]:
print(cv2.vocabulary_)

{'indian cricket': 6, 'cricket team': 3, 'team win': 21, 'win worldcup': 26, 'worldcup say': 27, 'say capt': 18, 'capt virat': 1, 'virat koh': 24, 'win next': 25, 'next lok': 13, 'lok sabha': 11, 'sabha elect': 17, 'elect say': 4, 'say confid': 19, 'confid indian': 2, 'indian pm': 7, 'nobel laurat': 14, 'laurat heart': 10, 'heart peopl': 5, 'movi raazi': 12, 'raazi interest': 15, 'interest indian': 9, 'indian spi': 8, 'spi thriller': 20, 'thriller base': 22, 'base upon': 0, 'upon real': 23, 'real stori': 16}


In [46]:
#Bigram,trigram
cv3 = CountVectorizer(tokenizer = data_preprocess,ngram_range = (2,3))
cv3.fit_transform(corpus).toarray()

array([[0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [47]:
print(len(cv3.vocabulary_))
print(cv3.vocabulary_)

52
{'indian cricket': 11, 'cricket team': 6, 'team win': 39, 'win worldcup': 48, 'worldcup say': 50, 'say capt': 33, 'capt virat': 2, 'virat koh': 45, 'indian cricket team': 12, 'cricket team win': 7, 'team win worldcup': 40, 'win worldcup say': 49, 'worldcup say capt': 51, 'say capt virat': 34, 'capt virat koh': 3, 'win next': 46, 'next lok': 24, 'lok sabha': 20, 'sabha elect': 31, 'elect say': 8, 'say confid': 35, 'confid indian': 4, 'indian pm': 13, 'win next lok': 47, 'next lok sabha': 25, 'lok sabha elect': 21, 'sabha elect say': 32, 'elect say confid': 9, 'say confid indian': 36, 'confid indian pm': 5, 'nobel laurat': 26, 'laurat heart': 18, 'heart peopl': 10, 'nobel laurat heart': 27, 'laurat heart peopl': 19, 'movi raazi': 22, 'raazi interest': 28, 'interest indian': 16, 'indian spi': 14, 'spi thriller': 37, 'thriller base': 41, 'base upon': 0, 'upon real': 43, 'real stori': 30, 'movi raazi interest': 23, 'raazi interest indian': 29, 'interest indian spi': 17, 'indian spi thril

# Tf-idf Normalisation
    - Avoids features that occur very often in documents because they contain less information.
    - information decreases as the number of occurences of a token increases across different type of documents.
    - so we associate a weight with each term by defining a term : term-document frequency.
    - term frequency - tf(t,d) => number of times token t occurs in document d.
    - inverse document frequency - idf(t,c) => log(1/(1+count(d,t)) where count(d,t) => number of documents of corpus c in
      which token t occurs.
    - greater the count(d,t) less will be token's weight in predicting the result.
    - weight of a token is given by (tf*idf)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = data_preprocess,ngram_range = (1,1))
tfidf_vector = tfidf_vectorizer.fit_transform(corpus).toarray()
tfidf_vector

array([[0.        , 0.36153669, 0.        , 0.36153669, 0.        ,
        0.        , 0.23076418, 0.        , 0.36153669, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28503968,
        0.        , 0.        , 0.36153669, 0.        , 0.        ,
        0.36153669, 0.28503968, 0.36153669],
       [0.        , 0.        , 0.36153669, 0.        , 0.36153669,
        0.        , 0.23076418, 0.        , 0.        , 0.        ,
        0.36153669, 0.        , 0.36153669, 0.        , 0.        ,
        0.36153669, 0.        , 0.        , 0.36153669, 0.28503968,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.28503968, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.        , 0.5       ,
        0.        , 0.        , 0.        , 0.5       , 0.5       ,
        0.        , 0.    

In [54]:
print(len(tfidf_vectorizer.vocabulary_))
print(tfidf_vectorizer.vocabulary_)

28
{'indian': 6, 'cricket': 3, 'team': 22, 'win': 26, 'worldcup': 27, 'say': 19, 'capt': 1, 'virat': 25, 'koh': 8, 'next': 12, 'lok': 10, 'sabha': 18, 'elect': 4, 'confid': 2, 'pm': 15, 'nobel': 13, 'laurat': 9, 'heart': 5, 'peopl': 14, 'movi': 11, 'raazi': 16, 'interest': 7, 'spi': 20, 'thriller': 23, 'base': 0, 'upon': 24, 'real': 17, 'stori': 21}
