# NLP using NLTK

##### NLP Pipeline
- Data collection
- Tokenization, Stopword Removal, Stemming
- Building a common vocab
- Vectorize documents
- Perform Classification/Clustering

### 1) Data Collection

In [6]:
import nltk
nltk.download('brown')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\jomin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jomin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jomin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import brown

In [8]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [9]:
data = brown.sents(categories='editorial')[:100]
print(type(data), len(data))
print(data)

<class 'nltk.collections.LazySubsequence'> 100
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


### 2.1) Tokenization

In [10]:
text = 'It was a very pleasant day, the weather was cool and there were showers. I went to the market to buy some fruits.'

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [12]:
sents = sent_tokenize(text)
sents

['It was a very pleasant day, the weather was cool and there were showers.',
 'I went to the market to buy some fruits.']

In [13]:
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'showers', '.']


### 2.2) Stopword Removal

In [14]:
from nltk.corpus import stopwords

In [15]:
sw = set(stopwords.words('english'))
print(sw)
print(len(sw))

{'themselves', 'don', "they're", "they've", 'aren', 'their', "we've", 'theirs', 'been', 'to', 'other', "wouldn't", "couldn't", 'be', 'had', 'now', 'mightn', 'shan', 'so', "he'll", 'll', 'are', 'whom', 'needn', 'up', 'didn', 'him', "hasn't", 'too', 'being', 'm', 'myself', 'his', 'an', 'weren', 'd', 'itself', "shan't", 'which', 'just', "you're", 'her', "they'd", 'do', 'into', 'some', 'have', 'hadn', 'yourselves', 'can', "needn't", 'under', 'during', "we'd", 'how', 'who', 'only', 'until', 'of', 'more', 'those', 'with', 'you', "it's", "hadn't", "it'll", 'its', "she's", 'were', 'further', 'herself', 're', 't', 'by', "they'll", 'did', 'should', 'most', 'same', 'then', 'hers', 'doesn', 'o', "should've", 'if', 'himself', 'isn', "doesn't", 'both', "shouldn't", 'about', "wasn't", "i'll", "isn't", "it'd", 'after', 'me', 'won', 'because', 'there', 'hasn', "that'll", 'and', 'each', 'it', 'off', 'this', "weren't", 'not', 'will', 'no', "don't", 'am', 'in', 'we', 'before', 'has', 'once', 'wasn', 'our'

#### Filter words from sentence

In [16]:
def filter_words(word_list):
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

In [17]:
useful_words = filter_words(word_list)
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'showers', '.']


In [18]:
from nltk.tokenize import RegexpTokenizer

In [19]:
tokenizer = RegexpTokenizer('[a-zA-Z0-9]+')

In [20]:
sent = 'send the 50 documents to abc, def, ghi.'
print(tokenizer.tokenize(sent))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


### 2.3) Stemming
- Process that transforms particular words into root words
- Jumping, jump, jumps, jumped => jump

In [21]:
text = 'The quick brown fox was seen jumping on the lazy dog from a high wall. Foxes love to make jumps.'

In [22]:
word_list = tokenizer.tokenize(text.lower())
word_list

['the',
 'quick',
 'brown',
 'fox',
 'was',
 'seen',
 'jumping',
 'on',
 'the',
 'lazy',
 'dog',
 'from',
 'a',
 'high',
 'wall',
 'foxes',
 'love',
 'to',
 'make',
 'jumps']

### Types of Stemmers
- Snowball Stemmer (Multilingual)
- Porter Stemmer (English only)
- Lancaster Stemmer (English only)

In [23]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [24]:
ps = PorterStemmer()

In [25]:
ps.stem('jumped')

'jump'

In [26]:
ps.stem('jumping')

'jump'

In [27]:
ps.stem('lovely')

'love'

In [28]:
ps.stem('awesome')

'awesom'

In [29]:
ls = LancasterStemmer()
ls.stem('awesome')

'awesom'

In [30]:
ls.stem('teenager')

'teen'

In [31]:
ps.stem('teenager')

'teenag'

In [32]:
ss = SnowballStemmer('english')
ss.stem('lovely')

'love'

In [33]:
ss = SnowballStemmer('french')
ss.stem('courais')

'cour'

#### Bag of Words

In [3]:
# corpus = collection of documents
corpus = [
    'Indian cricket team will win world cup, says Indian captain Virat Kohli. World Cup will be held in India during the next year',
    'We will win next Lok Sabha election, says Indian PM',
    'The noble Rabindranath Tagore won the hearts of the people',
    'The movie Raazi has an exciting trailer. It is based on a real story'
]

print(corpus)

['Indian cricket team will win world cup, says Indian captain Virat Kohli. World Cup will be held in India during the next year', 'We will win next Lok Sabha election, says Indian PM', 'The noble Rabindranath Tagore won the hearts of the people', 'The movie Raazi has an exciting trailer. It is based on a real story']


- Converting words into numerical features
- Building a common vocabulary and vectorize the documents

In [4]:
def myTokenizer(sent):
    words = tokenizer.tokenize(sent.lower())
    return filter_words(words)

In [34]:
myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'win',
 'world',
 'cup',
 'says',
 'indian',
 'captain',
 'virat',
 'kohli',
 'world',
 'cup',
 'held',
 'india',
 'next',
 'year']

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [38]:
vectorized_corpus = cv.fit_transform(corpus)



In [39]:
print(vectorized_corpus)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 34 stored elements and shape (4, 30)>
  Coords	Values
  (0, 9)	2
  (0, 2)	1
  (0, 24)	1
  (0, 27)	1
  (0, 28)	2
  (0, 3)	2
  (0, 21)	1
  (0, 1)	1
  (0, 26)	1
  (0, 10)	1
  (0, 7)	1
  (0, 8)	1
  (0, 13)	1
  (0, 29)	1
  (1, 9)	1
  (1, 27)	1
  (1, 21)	1
  (1, 13)	1
  (1, 11)	1
  (1, 20)	1
  (1, 4)	1
  (1, 16)	1
  (2, 14)	1
  (2, 18)	1
  (2, 23)	1
  (2, 6)	1
  (2, 15)	1
  (3, 12)	1
  (3, 17)	1
  (3, 5)	1
  (3, 25)	1
  (3, 0)	1
  (3, 19)	1
  (3, 22)	1


In [40]:
vc = vectorized_corpus.toarray()

In [42]:
print(vc[0])
print(cv.vocabulary_)

[0 1 1 2 0 0 0 1 1 2 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 2 1]
{'indian': 9, 'cricket': 2, 'team': 24, 'win': 27, 'world': 28, 'cup': 3, 'says': 21, 'captain': 1, 'virat': 26, 'kohli': 10, 'held': 7, 'india': 8, 'next': 13, 'year': 29, 'lok': 11, 'sabha': 20, 'election': 4, 'pm': 16, 'noble': 14, 'rabindranath': 18, 'tagore': 23, 'hearts': 6, 'people': 15, 'movie': 12, 'raazi': 17, 'exciting': 5, 'trailer': 25, 'based': 0, 'real': 19, 'story': 22}


In [44]:
cv.inverse_transform(vc[0].reshape(1,-1))

[array(['captain', 'cricket', 'cup', 'held', 'india', 'indian', 'kohli',
        'next', 'says', 'team', 'virat', 'win', 'world', 'year'],
       dtype='<U12')]

### TF IDF

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer, ngram_range=(1,2))

In [51]:
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus[0])
print(tfidf_vectorizer.vocabulary_)

[0.         0.         0.16227964 0.16227964 0.16227964 0.16227964
 0.32455927 0.16227964 0.16227964 0.         0.         0.
 0.         0.         0.         0.16227964 0.16227964 0.16227964
 0.16227964 0.25588626 0.16227964 0.16227964 0.         0.16227964
 0.16227964 0.         0.         0.         0.         0.12794313
 0.         0.16227964 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.12794313 0.12794313 0.         0.
 0.         0.16227964 0.16227964 0.         0.         0.16227964
 0.16227964 0.12794313 0.         0.16227964 0.32455927 0.32455927
 0.16227964]
{'indian': 19, 'cricket': 4, 'team': 49, 'win': 55, 'world': 58, 'cup': 6, 'says': 44, 'captain': 2, 'virat': 53, 'kohli': 23, 'held': 15, 'india': 17, 'next': 29, 'year': 60, 'indian cricket': 21, 'cricket team': 5, 'team win': 50, 'win world': 57, 'world cup': 59, 'cup says': 8, 'says indian': 45, 'indian captain': 20, 'captain virat': 3, 'virat