## NLP Pipeline

1. Gather Data
2. Tokenise, remove stopwords, stemming
3. Vectorize text data
4. Build Vocabulary
5. Classification

In [1]:
pip install nltk

Collecting nltk
  Downloading nltk-3.6.3-py3-none-any.whl (1.5 MB)
Collecting regex
  Downloading regex-2021.8.28-cp39-cp39-win_amd64.whl (271 kB)
Collecting click
  Downloading click-8.0.1-py3-none-any.whl (97 kB)
Collecting tqdm
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.0.1 nltk-3.6.3 regex-2021.8.28 tqdm-4.62.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Short multi-sentence sample used below to demonstrate tokenization and stop-word removal.
example = 'It was a very pleasant day. The weather was cool and there were light showers. I went to the market to buy some fruits.'


In [3]:
# Show the raw sample text before any processing.
print(example)

It was a very pleasant day. The weather was cool and there were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import word_tokenize
import nltk
# Fetch the Punkt tokenizer data into the local nltk_data directory (one-time download).
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Harshvardhan
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# Split the sample text into word and punctuation tokens.
words=word_tokenize(example)

## Stop-words Removal

In [12]:
# Download NLTK's stop-word lists (read below through nltk.corpus.stopwords).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Harshvardhan
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
# Download the Brown corpus.
# NOTE(review): none of the visible cells read this corpus — confirm it is still needed.
nltk.download('brown')

[nltk_data] Downloading package brown to C:\Users\Harshvardhan
[nltk_data]     Agarwal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [14]:
from nltk.corpus import stopwords

In [18]:
# Number of distinct entries in NLTK's English stop-word list.
len(set(stopwords.words('english')))

179

In [19]:
# English stop-word list, shared by remove_stopwords() and data_cleaning() below.
sw=stopwords.words('english')

In [21]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: Iterable of word tokens (for example the output of
            ``word_tokenize``). Each token is lower-cased only for the
            comparison against the stop-word list.

    Returns:
        list: The retained tokens, in their original order and casing.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the stop-word list for every token.
    stop_words=set(sw)
    return [word for word in text if word.lower() not in stop_words]
            

In [22]:
# Filter the tokenized sample; punctuation tokens are not stop words, so they are kept.
relevant_words=remove_stopwords(words)
relevant_words

['pleasant',
 'day',
 '.',
 'weather',
 'cool',
 'light',
 'showers',
 '.',
 'went',
 'market',
 'buy',
 'fruits',
 '.']

## Stemming

In [26]:
# Sample text with inflected forms (Foxes, jumps, jumping, lovely) for the stemming demo.
sample= 'Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall'


In [24]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer instance shared by stemming() and data_cleaning() below.
ss=SnowballStemmer('english')

In [28]:
def stemming(text):
    """Tokenize ``text`` and reduce every token to its stem.

    Args:
        text (str): Raw text to process.

    Returns:
        list: One stem per token produced by ``word_tokenize``, in the
        original token order. No tokens are dropped.
    """
    return [ss.stem(token) for token in word_tokenize(text)]

In [29]:
# Stem every token of the sample text.
stemming(sample)

['fox',
 'love',
 'to',
 'make',
 'jump',
 '.',
 'the',
 'quick',
 'brown',
 'fox',
 'was',
 'seen',
 'jump',
 'over',
 'the',
 'love',
 'dog',
 'from',
 'a',
 '6ft',
 'feet',
 'high',
 'wall']

# Data Cleaning/Preprocessing

In [37]:
# Toy corpus of four short documents used for the cleaning and bag-of-words demos.
# The strings are data: the recorded outputs below depend on their exact spelling.
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [46]:
def data_cleaning(document):
    """Normalize a raw document into a space-separated string of stems.

    The text is lower-cased and tokenized. Stop words and single-character
    tokens (which removes lone punctuation marks) are discarded, and every
    remaining token is stemmed.

    Args:
        document (str): Raw text of one document.

    Returns:
        str: The surviving stems joined by single spaces.
    """
    # Set membership is O(1); scanning the stop-word list for every token is not.
    stop_words=set(sw)

    #tokenizing
    tokens=word_tokenize(document.lower())

    #stopwords removal and stemming
    stems=[ss.stem(token) for token in tokens if len(token)>1 and token not in stop_words]

    #convert back to sentence
    return " ".join(stems)
    

In [47]:
# Spot-check the cleaning function on the first document.
print(data_cleaning(corpus[0]))

indian cricket team win world cup say capt virat koh world cup held sri lanka


In [48]:
# Run every document of the corpus through data_cleaning(), preserving order.
cleaned_data=[data_cleaning(doc) for doc in corpus]
cleaned_data

['indian cricket team win world cup say capt virat koh world cup held sri lanka',
 'win next lok sabha elect say confid indian pm',
 'nobel laurat heart peopl',
 'movi raazi excit indian spi thriller base upon real stori']

# Building a Vocabulary and Vectorization of Documents

In [52]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
# Learn the vocabulary and build the dense document-term count matrix.
# NOTE(review): this fits on the raw `corpus`, not on `cleaned_data`, so stop words
# and unstemmed forms end up in the vocabulary — confirm that this is intended.
vectorized_corpus=cv.fit_transform(corpus).toarray()

In [54]:
# Count matrix: one row per document, one column per vocabulary term.
vectorized_corpus

array([[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [57]:
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; ordering the fitted vocabulary_ mapping
# (term -> column index) by index gives the same list on every version.
sorted(cv.vocabulary_, key=cv.vocabulary_.get)

['an',
 'at',
 'based',
 'be',
 'capt',
 'confident',
 'cricket',
 'cup',
 'elections',
 'exciting',
 'hearts',
 'held',
 'indian',
 'is',
 'kohli',
 'lanka',
 'laurate',
 'lok',
 'movie',
 'next',
 'nobel',
 'of',
 'people',
 'pm',
 'raazi',
 'real',
 'sabha',
 'says',
 'spy',
 'sri',
 'story',
 'team',
 'the',
 'thriller',
 'upon',
 'virat',
 'we',
 'will',
 'win',
 'wins',
 'won',
 'world']

In [58]:
import pandas as pd
# Label the count matrix with its vocabulary terms. Column names are derived from
# cv.vocabulary_ (term -> column index) because get_feature_names() was removed
# in scikit-learn 1.2; sorting by index reproduces the same column order.
pd.DataFrame(vectorized_corpus,columns=sorted(cv.vocabulary_, key=cv.vocabulary_.get))

Unnamed: 0,an,at,based,be,capt,confident,cricket,cup,elections,exciting,...,the,thriller,upon,virat,we,will,win,wins,won,world
0,0,1,0,1,1,0,1,2,0,0,...,0,0,0,1,0,2,0,1,0,2
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,1,0
3,1,0,1,0,0,0,0,0,0,1,...,1,1,1,0,0,0,0,0,0,0


In [60]:
test="based on the fixture it should be an exciting match"
# Encode an unseen sentence with the already-fitted vocabulary; words that are
# not in the vocabulary contribute nothing to the resulting count vector.
cv.transform([test]).toarray()

array([[1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)