## Introduction to Natural Language Processing

In [6]:
import nltk

In [None]:
# nltk.download()

In [None]:
# Download the Brown corpus, which is imported from nltk.corpus in the next cell.
nltk.download('brown')

In [None]:
from nltk.corpus import brown

In [4]:
# Show the category names available in the Brown corpus, then how many there are.
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [37]:
# Load the sentences belonging to the 'adventure' category.
data=brown.sents(categories='adventure')

In [44]:
# Number of sentences in the 'adventure' category.
print(len(data))
# Each sentence is a sequence of tokens; join them with spaces to read it as text.
' '.join(data[1])
# print(type(data))

4637


'He was well rid of her .'

## Bag of Words Pipeline
- Get the Data/Corpus
- Tokenisation, Stopword Removal
- Stemming
- Building a Vocab
- Vectorization
- Classification

### Tokenisation and Stopword Removal

In [8]:
document= """It was very hot day. And I had so bored whole day. I have 
watch Breaking bad web-series."""


In [9]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [10]:
# Split the document into sentences, then show how many were found and the second one.
sents= sent_tokenize(document)
print(len(sents))
print(sents[1])

3
And I had so bored whole day.


In [11]:
# Naive word tokenisation: splitting on whitespace leaves punctuation attached
# to the neighbouring word (note 'day.' in the output).
sents[0].split()

['It', 'was', 'very', 'hot', 'day.']

### Stopwords

In [12]:
from nltk.corpus import stopwords

In [13]:
# Load NLTK's English stopword list into a set, used below for membership checks.
sw= set(stopwords.words('english'))

In [14]:
print(sw)

{'aren', 'there', 'mustn', 'on', 'both', 'be', 'further', 'not', 'an', 'll', 'in', "mightn't", 'these', "wasn't", 'which', 'and', 'as', 'to', "couldn't", 'shan', "needn't", 'i', 'yourselves', "you'll", 'its', "shouldn't", 'because', 'after', 'we', 'all', 'ain', 's', 'being', 'him', 'more', "she's", 'm', 'how', "hasn't", 'most', 'by', 'doing', 'during', 'against', 'them', 'too', 'each', 'should', 'your', 'were', 'again', 'that', 'about', 'me', 'then', 'isn', 'any', 'at', "aren't", 'is', "isn't", 'they', 'themselves', 'the', 'do', 're', 'this', 'hasn', 'only', 'through', 'nor', "mustn't", "wouldn't", "shan't", "won't", 'own', "you've", 'whom', 'where', 'now', 'above', 'wasn', 'until', 'itself', 'once', 'mightn', 'wouldn', 'hadn', 'had', 'over', 'ours', 'here', 'will', 'who', "you'd", 'but', 'himself', 'needn', 'my', 'those', 'so', 'y', 'couldn', 'was', 'under', 'of', "it's", 'out', 'few', 'you', "you're", 'no', 'when', 'shouldn', 'into', 'their', 'down', 'd', "weren't", 'it', 'up', 'does

In [15]:
def remove_stopwords(text,stopwords):
    """Return the whitespace-separated tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : str
        Raw text. It is split on whitespace only, so punctuation stays
        attached to the neighbouring word (e.g. ``'hey,'``).
    stopwords : collection of str
        Words to drop; anything supporting the ``in`` operator works
        (a ``set`` is the usual choice).

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing.
    """
    # A token is dropped if it matches a stopword exactly OR after lowercasing.
    # Stopword lists are normally all lowercase, so without the second check a
    # sentence-initial "It" or "The" would slip through the filter.
    useful_words= [
        w for w in text.split()
        if w not in stopwords and w.lower() not in stopwords
    ]
    return useful_words

In [16]:
# Try the helper on a short sentence. Because the text is split on whitespace
# only, tokens such as 'hey,' and 'bro!' keep their punctuation.
text= "hey, how's it going, don't to be mean bro!"
uw= remove_stopwords(text,sw)
uw

['hey,', "how's", 'going,', 'mean', 'bro!']

### Tokenization using Regular Expression

In [17]:
sentence= "2,3 days were not so wiered?"

In [18]:
from nltk.tokenize import RegexpTokenizer

In [19]:
# The pattern matches runs of ASCII letters only, so digits and punctuation
# never appear in the resulting tokens.
tokenizer= RegexpTokenizer('[a-zA-Z]+') # letters only: numbers and punctuation are dropped
useful_text= tokenizer.tokenize(sentence)

In [20]:
useful_text

['days', 'were', 'not', 'so', 'wiered']

### Stemming 
- jumps, jumping, jumped, jump -> jump

In [21]:
text= """Foxes love to make jumps. The quick brwon fox was seen jumping
over the lovely dog form a 6ft high wall"""

In [22]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
# Snowball Stemmer, Porter Stemmer, Lancaster Stemmer

In [23]:
ps= PorterStemmer()

In [25]:
ps.stem('jumps')

'jump'

In [26]:
ps.stem('lovely')

'love'

In [27]:
# Snowball Stemmer
ss= SnowballStemmer('english')

In [28]:
ss.stem('loving')

'love'

In [29]:
from nltk.stem import WordNetLemmatizer

# Lemmatization looks words up in WordNet rather than stripping suffixes.
wn= WordNetLemmatizer()
# NOTE(review): no part of speech is passed here and the recorded output is
# 'jumping', unchanged — presumably the word is being treated as a noun.
# Try wn.lemmatize('jumping', pos='v') to confirm it reduces to 'jump'.
wn.lemmatize('jumping')

'jumping'

### Building a Vocab & Vectorization

In [45]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
cv= CountVectorizer()

In [63]:
vectorized_corpus= cv.fit_transform(corpus)

In [54]:
# Build the dense array from the fitted vectorizer instead of from
# `vectorized_corpus` itself, so this cell can be re-run safely: calling
# .toarray() on a variable that is already a NumPy array raises AttributeError.
vectorized_corpus=cv.transform(corpus).toarray()

In [56]:
vectorized_corpus

array([[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [58]:
cv.vocabulary_

{'indian': 12,
 'cricket': 6,
 'team': 31,
 'will': 37,
 'wins': 39,
 'world': 41,
 'cup': 7,
 'says': 27,
 'capt': 4,
 'virat': 35,
 'kohli': 14,
 'be': 3,
 'held': 11,
 'at': 1,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'win': 38,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 8,
 'confident': 5,
 'pm': 23,
 'the': 32,
 'nobel': 20,
 'laurate': 16,
 'won': 40,
 'hearts': 10,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 9,
 'spy': 28,
 'thriller': 33,
 'based': 2,
 'upon': 34,
 'real': 25,
 'story': 30}

In [64]:
# Reverse Mapping!
# Slice with 1:2 (not index 1) so the selected row stays 2-D, shape
# (1, n_features). inverse_transform expects 2-D input, and this form works
# whether vectorized_corpus is still the sparse matrix or already a dense array.
numbers= vectorized_corpus[1:2]
numbers

<1x42 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [66]:
cv.inverse_transform(numbers)

[array(['indian', 'will', 'says', 'we', 'win', 'next', 'lok', 'sabha',
        'elections', 'confident', 'pm'], dtype='<U9')]

### More ways to Create Features
- Unigram - every word as a feature
- Bigrams
- Trigrams
- n-grams
- TF-IDF Normalisation

In [90]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [99]:
# ngram_range=(2,2): use only bigrams (pairs of adjacent words) as features.
cv=CountVectorizer(ngram_range=(2,2))

In [100]:
docs= [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1, 1]])

In [101]:
 cv.vocabulary_

{'this is': 7,
 'is good': 3,
 'good movie': 2,
 'movie but': 5,
 'but actor': 1,
 'actor is': 0,
 'is not': 4,
 'not present': 6}

### Tf-idf Normalisation
- Avoid features that occur very often, because they carry less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another quantity, the inverse document frequency (IDF), which associates a weight with every term

In [74]:
sent_1= "this is good movie"
sent_2= "this was good movie"
sent_3= "this is not good movie"

corpus= [sent_1,sent_2,sent_3]

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
tfidf= TfidfVectorizer()

In [4]:
# Fit the tf-idf vectorizer on `corpus` and convert the result to a dense array.
# NOTE(review): the NameError recorded below presumably comes from running this
# cell in a fresh kernel before the cell defining `corpus`; confirm with
# Restart & Run All.
vc=tfidf.fit_transform(corpus).toarray()

NameError: name 'corpus' is not defined

In [79]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [1]:
tfidf.vocabulary_

NameError: name 'tfidf' is not defined