# NLP using NLTK

##### NLP Pipeline
- Data collection
- Tokenization, Stopword Removal, Stemming
- Building a common vocabulary
- Vectorize documents
- Perform Classification/Clustering

### 1) Data Collection

In [None]:
import nltk
# Download the NLTK data packages this notebook relies on.
nltk.download('brown')  # Brown corpus, read below via nltk.corpus.brown
nltk.download('punkt_tab')  # presumably the models behind sent_tokenize/word_tokenize -- confirm
nltk.download('stopwords')  # stop word lists, read below via nltk.corpus.stopwords

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\jomin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [1]:
from nltk.corpus import brown

In [5]:
# List the category labels available in the Brown corpus.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
# Take the first 100 sentences of the 'editorial' category. As the output
# shows, the slice is an nltk LazySubsequence (not a plain list) and each
# sentence is a list of word tokens.
data = brown.sents(categories='editorial')[:100]
print(type(data), len(data))
print(data)

<class 'nltk.collections.LazySubsequence'> 100
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


### 2.1) Tokenization

In [11]:
# Two-sentence sample used to demonstrate sentence and word tokenization.
text = 'It was a very pleasant day, the weather was cool and there were showers. I went to the market to buy some fruits.'

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [14]:
# Split the sample text into a list of sentence strings.
sents = sent_tokenize(text)
sents

['It was a very pleasant day, the weather was cool and there were showers.',
 'I went to the market to buy some fruits.']

In [16]:
# Lowercase the first sentence before tokenizing so the tokens can be compared
# against the lowercase stop word list used later. Punctuation marks come out
# as separate tokens.
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'showers', '.']


### 2.2) Stopword Removal

In [17]:
from nltk.corpus import stopwords

In [21]:
# Keep the English stop words in a set so membership tests in filter_words
# are cheap. A set has no defined order, so the printed order of the words
# can differ from one run to the next.
sw = set(stopwords.words('english'))
print(sw)
print(len(sw))

{'not', 'again', 'himself', "we're", 'few', 'didn', 'which', 'to', 'for', 'having', 'were', "weren't", 'been', "wouldn't", 'own', 'how', 'from', "they've", 'haven', 'shan', 'other', 'all', "isn't", 'between', "we've", 'wasn', 'needn', "they'd", 'should', 'such', "hadn't", 'what', 'out', 'has', 'some', 'you', 'them', 'after', 'below', "don't", 'mightn', 'this', 'yourself', "shan't", 'that', 'under', 'i', "you'll", 'any', "mustn't", 'off', 'she', "shouldn't", 'or', 'if', 'hadn', 'will', "we'd", 'isn', 'their', 'until', 'hers', "you'd", 'only', 'll', 'hasn', 'won', 'and', 'am', 'nor', 'when', 'with', "doesn't", "you're", 'of', "i'd", 'don', 'there', 'while', 'ours', 'against', 'about', 'before', 'just', 'have', 'o', 're', "that'll", 'wouldn', "she'll", "she'd", 'then', 'now', "you've", "haven't", 'itself', 'he', 'myself', 'once', 'ourselves', 'ain', "she's", 'it', "they're", 'was', 'whom', 'up', "i've", 'shouldn', 'where', 'an', 'couldn', 'd', 'can', 'aren', 'these', 'be', 'both', "won't"

#### Filter words from sentence

In [22]:
def filter_words(word_list, stop_words=None):
    """Return the tokens from ``word_list`` that are not stop words.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter. Matching is exact (case-sensitive), so tokens
        should be normalised the same way as the stop words; the cells in
        this notebook lowercase the text before tokenizing.
    stop_words : collection of str, optional
        Words to remove. When omitted, the notebook-level ``sw`` set is
        used, which keeps existing ``filter_words(word_list)`` calls working.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        # Fall back to the notebook-wide stop word set defined in an earlier cell.
        stop_words = sw
    return [w for w in word_list if w not in stop_words]

In [23]:
# Drop stop words from the tokenized first sentence. Punctuation tokens
# survive because they are not in the stop word set.
useful_words = filter_words(word_list)
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'showers', '.']


In [24]:
from nltk.tokenize import RegexpTokenizer

In [31]:
# Tokenizer whose tokens are runs of ASCII letters and digits, so punctuation
# and whitespace never appear as tokens.
tokenizer = RegexpTokenizer('[a-zA-Z0-9]+')

In [29]:
# Try the regex tokenizer on a sentence containing digits and punctuation.
sent = 'send the 50 documents to abc, def, ghi.'
print(tokenizer.tokenize(sent))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


### 2.3) Stemming
- Stemming reduces inflected or derived forms of a word to a common root (the stem)
- Jumping, jump, jumps, jumped => jump
- The stem need not be a dictionary word (e.g. 'awesome' => 'awesom')

In [30]:
# New sample for the stemming demo. This rebinds `text`, which held the
# tokenization sample earlier in the notebook.
text = 'The quick brown fox was seen jumping on the lazy dog from a high wall. Foxes love to make jumps.'

In [32]:
# Lowercase the sample and split it with the regex tokenizer. This rebinds
# `word_list`, which previously held the tokens of the first sample sentence.
word_list = tokenizer.tokenize(text.lower())
word_list

['the',
 'quick',
 'brown',
 'fox',
 'was',
 'seen',
 'jumping',
 'on',
 'the',
 'lazy',
 'dog',
 'from',
 'a',
 'high',
 'wall',
 'foxes',
 'love',
 'to',
 'make',
 'jumps']

### Types of Stemmers
- Snowball Stemmer (Multilingual)
- Porter Stemmer (English only)
- Lancaster Stemmer (English only)

In [33]:
# NOTE(review): PorterStemmer is imported from nltk.stem.snowball here; it is
# more commonly imported from nltk.stem.porter -- confirm this is intended.
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [34]:
# Porter stemmer instance reused by the examples that follow.
ps = PorterStemmer()

In [35]:
# Past-tense form: the '-ed' suffix is stripped.
ps.stem('jumped')

'jump'

In [36]:
# The '-ing' form reduces to the same stem as 'jumped'.
ps.stem('jumping')

'jump'

In [37]:
# A derivational suffix ('-ly') is reduced as well.
ps.stem('lovely')

'love'

In [38]:
# Stems are not always dictionary words: the trailing 'e' is dropped here.
ps.stem('awesome')

'awesom'

In [39]:
# Lancaster stemmer on the same word, for comparison with the Porter result.
ls = LancasterStemmer()
ls.stem('awesome')

'awesom'

In [40]:
# Lancaster stemmer on 'teenager'; compare with the Porter stemmer on the same word.
ls.stem('teenager')

'teen'

In [41]:
# Porter stemmer on 'teenager', for comparison with the Lancaster result.
ps.stem('teenager')

'teenag'

In [42]:
# Snowball stemmer configured for English.
ss = SnowballStemmer('english')
ss.stem('lovely')

'love'

In [43]:
# Same class with a different language argument; this rebinds `ss` from the
# English stemmer to a French one.
ss = SnowballStemmer('french')
ss.stem('courais')

'cour'