# Natural Language Processing basics using NLTK


## NLP Pipeline
- Data Collection 
- Tokenization
- Stop Word Removal
- Stemming
- Bag of Words
- TF-IDF

In [18]:
# Data Collection
from nltk.corpus import brown

In [6]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [11]:
# Keep only the first 100 sentences of the Brown corpus 'editorial' category.
data = brown.sents(categories='editorial')[:100]

In [14]:
# Show the first sentence: its token count followed by the token list.
print(len(data[0]),data[0])

5 ['Assembly', 'session', 'brought', 'much', 'good']


# Tokenization

In [32]:

from nltk.tokenize import word_tokenize,sent_tokenize

In [25]:
text = "It was a very pleasant day in Pune, the weather was cool and there were showers. We still had to attend lectures. PICT - Top College indeed! LOL."

In [26]:
sents = sent_tokenize(text)

In [24]:
sents

['It was a very pleasant day in Pune, the weather was cool and there were showers.',
 'We still had to attend lectures.',
 'PICT - Top College indeed!',
 'LOL.']

In [28]:
# Lowercase first so the tokens can be compared against the lowercase
# entries of the stop-word list used by filter_words further down.
word_list = word_tokenize(text.lower())

In [29]:
word_list

['it',
 'was',
 'a',
 'very',
 'pleasant',
 'day',
 'in',
 'pune',
 ',',
 'the',
 'weather',
 'was',
 'cool',
 'and',
 'there',
 'were',
 'showers',
 '.',
 'we',
 'still',
 'had',
 'to',
 'attend',
 'lectures',
 '.',
 'pict',
 '-',
 'top',
 'college',
 'indeed',
 '!',
 'lol',
 '.']

# Stop Word Removal

In [34]:

from nltk.corpus import stopwords

In [40]:
# Load NLTK's English stop-word list; filter_words reads this module-level name.
sw = stopwords.words('english')
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [41]:
def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stop words.

    Args:
        word_list: Iterable of string tokens to filter.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``sw`` list is used, so existing one-argument calls
            behave as before.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stop_words is None:
        # Resolved at call time so the function follows later changes to `sw`.
        stop_words = sw
    # A set gives constant-time membership tests; scanning the list for every
    # token would cost time proportional to the stop-word list per lookup.
    stop_set = set(stop_words)
    return [word for word in word_list if word not in stop_set]

In [46]:
# Drop stop words from the tokenized text. Punctuation tokens such as
# ',' and '.' are not stop words, so they remain in the result.
useful_words = filter_words(word_list)
print(useful_words)

['pleasant', 'day', 'pune', ',', 'weather', 'cool', 'showers', '.', 'still', 'attend', 'lectures', '.', 'pict', '-', 'top', 'college', 'indeed', '!', 'lol', '.']


In [44]:
from nltk.tokenize import RegexpTokenizer

In [58]:
# The character class [,.] matches a single comma or period, so this
# tokenizer returns only those punctuation marks and discards everything else.
tokenizer = RegexpTokenizer("[,.]")
print(text,'\n\n\n',tokenizer.tokenize(text))

It was a very pleasant day in Pune, the weather was cool and there were showers. We still had to attend lectures. PICT - Top College indeed! LOL. 


 [',', '.', '.', '.']


In [61]:
# [a-z0-9]+ matches runs of lowercase letters and digits, so tokenizing the
# lowercased text yields words only; punctuation and whitespace are dropped.
# NOTE(review): the recorded output shows the "quick brown fox" sentence, which
# is only assigned to `text` in a later cell, so this cell appears to have been
# executed out of order -- re-run the notebook top to bottom to confirm.
tokenizer = RegexpTokenizer("[a-z0-9]+")
print(text,'\n\n\n',tokenizer.tokenize(text.lower()))

The quick brown fox was seen jumpng over the lazy dog from high wall. Foxes love to make jumps 


 ['the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumpng', 'over', 'the', 'lazy', 'dog', 'from', 'high', 'wall', 'foxes', 'love', 'to', 'make', 'jumps']


# Stemming
Stemming reduces inflected forms of a word to a common stem (e.g. jumping, jumped, jumps → jump).

In [59]:
text = """The quick brown fox was seen jumpng over the lazy dog from high wall. Foxes love to make jumps"""

In [62]:
# Tokenize the lowercased sentence with the RegexpTokenizer defined earlier.
word_list = tokenizer.tokenize(text.lower())
print (word_list)

['the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumpng', 'over', 'the', 'lazy', 'dog', 'from', 'high', 'wall', 'foxes', 'love', 'to', 'make', 'jumps']


## Types of Stemmers
- Snowball (Multilingual)
- Porter
- Lancaster

In [63]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [64]:
ps = PorterStemmer()

In [65]:
ps.stem('jump')

'jump'

In [67]:
ps.stem('jumping')

'jump'

In [68]:
ps.stem('lovely')

'love'

In [74]:
ps.stem('awesome')

'awesom'

In [75]:
ls = LancasterStemmer()

In [76]:
ls.stem('awesome')

'awesom'

In [77]:
# Compare the two stemmers on the same word: Lancaster first, then Porter.
print(ls.stem('teenager'))
print(ps.stem('teenager'))

teen
teenag


In [85]:
ss = SnowballStemmer('english')

In [87]:
# NOTE(review): only the value of a cell's last expression is displayed, so the
# result of this first call is not shown in the recorded output.
ss.stem('lovely')
# Snowball is multilingual; build a French stemmer and stem a French word.
ssf = SnowballStemmer('french')
ssf.stem('courais')

'cour'