# Basic NLP Tutorial

In [1]:
import nltk

In [2]:
# nltk.download()  # uncomment to launch the interactive NLTK downloader and fetch all data packages

In [3]:
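# nltk.download('punkt')  # uncomment to fetch only the Punkt sentence-tokenizer data; the stopwords and WordNet corpora used below can be fetched the same way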

In [4]:
para = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."""
para

"Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

### Tokenizing Text

In [5]:
# Split the paragraph into a list of sentences.
# As the output shows, the abbreviation "Mr." does not end the first sentence.
sentences = nltk.sent_tokenize(para)
sentences

['Hello Mr. Smith, how are you doing today?',
 'The weather is great, and city is awesome.',
 'The sky is pinkish-blue.',
 "You shouldn't eat cardboard."]

In [6]:
# Split the whole paragraph into word-level tokens and preview the first three.
words = nltk.word_tokenize(para)
words[:3]

['Hello', 'Mr.', 'Smith']

#### Stemming

In [7]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [8]:
# Stem every non-stopword token of each sentence with the Porter stemmer.
stemer = PorterStemmer()

# Build the stopword set once; it was previously rebuilt for every single token.
# NOTE: the membership test is case-sensitive, so capitalised stopwords such as
# "The" and "You" are not filtered out (they appear, stemmed, in the output).
english_stopwords = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
    words = [
        stemer.stem(word)
        for word in nltk.word_tokenize(sentence)
        if word not in english_stopwords
    ]
    # Overwrites the raw sentence in place; `sentences` must be re-tokenized
    # from `para` before it is used as raw text again.
    sentences[i] = ' '.join(words)
sentences

['hello mr. smith , today ?',
 'the weather great , citi awesom .',
 'the sky pinkish-blu .',
 "you n't eat cardboard ."]

#### Lemmatizing

In [9]:
# Re-tokenize the paragraph: the stemming cell overwrote `sentences` in place,
# so the lemmatizer needs a fresh copy of the raw sentences.
sentences = nltk.sent_tokenize(para)
sentences

['Hello Mr. Smith, how are you doing today?',
 'The weather is great, and city is awesome.',
 'The sky is pinkish-blue.',
 "You shouldn't eat cardboard."]

In [10]:
from nltk.stem import WordNetLemmatizer

In [11]:
# Lemmatize every non-stopword token of each sentence with WordNet.
# `lemmatizer` is reused by the corpus-cleaning cells further down.
lemmatizer = WordNetLemmatizer()

# Build the stopword set once; it was previously rebuilt for every single token.
# NOTE: the membership test is case-sensitive, so capitalised stopwords such as
# "The" and "You" are not filtered out (they appear in the output).
english_stopwords = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
    words = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(sentence)
        if word not in english_stopwords
    ]
    # Overwrites the raw sentence in place; `sentences` must be re-tokenized
    # from `para` before it is used as raw text again.
    sentences[i] = ' '.join(words)
sentences

['Hello Mr. Smith , today ?',
 'The weather great , city awesome .',
 'The sky pinkish-blue .',
 "You n't eat cardboard ."]

#### Bag of Words

In [12]:
import re

In [14]:
# Start the cleaning step from the raw sentences again
# (the lemmatizing cell overwrote `sentences` in place).
sentences = nltk.sent_tokenize(para)

In [15]:
# Build the cleaned corpus: one lower-cased, stopword-free, lemmatized string
# per sentence. Relies on `lemmatizer` from the Lemmatizing section.

# Build the stopword set once; it was previously rebuilt for every single word.
english_stopwords = set(stopwords.words('english'))

corpus = []  # cleaned sentences are collected here
for sentence in sentences:
    review = re.sub('[^a-zA-Z]', ' ', sentence)  # replace every non-letter character with a space
    review = review.lower().split()              # lower-casing first lets the stopword test match
    review = [lemmatizer.lemmatize(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [16]:
#  creating bag of words - document matrix
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Bag-of-words vectorizer with default settings; it is fitted on `corpus` in the next cell.
cv = CountVectorizer()

In [19]:
# Fit the vectorizer on the cleaned corpus and count each term per sentence.
# The result is converted with .toarray() so the whole document-term matrix
# can be displayed: one row per sentence, one column per vocabulary term.
bow_matrix = cv.fit_transform(corpus)
x = bow_matrix.toarray()
x

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

### TF-IDF : Term Frequency - Inverse Document Frequency

Disadvantages of bag of words: it ignores the order of the words, it does not capture the meaning of the words or the similarity between them, and it does not account for how common a word is across the sentences — every word is weighted equally, however informative it is. TF-IDF addresses the last point.

#### Formula:

$\mathrm{tf}(t, s) = \dfrac{\text{number of times word } t \text{ occurs in sentence } s}{\text{total number of words in sentence } s}$

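$\mathrm{idf}(t) = \log\left(\dfrac{\text{total number of sentences}}{\text{number of sentences containing word } t}\right)$

Note: scikit-learn's `TfidfVectorizer`, used below, applies a smoothed variant of this idf and L2-normalizes each row by default, so its values differ from this textbook formula.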

In [21]:
# Start from the raw sentences again before rebuilding the cleaned corpus for TF-IDF.
sentences = nltk.sent_tokenize(para)

In [23]:
# Rebuild the cleaned corpus for TF-IDF (same cleaning as in the Bag of Words
# section): one lower-cased, stopword-free, lemmatized string per sentence.
# Relies on `lemmatizer` from the Lemmatizing section.

# Build the stopword set once; it was previously rebuilt for every single word.
english_stopwords = set(stopwords.words('english'))

corpus = []  # cleaned sentences are collected here
for sentence in sentences:
    review = re.sub('[^a-zA-Z]', ' ', sentence)  # replace every non-letter character with a space
    review = review.lower().split()              # lower-casing first lets the stopword test match
    review = [lemmatizer.lemmatize(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)
corpus

['hello mr smith today',
 'weather great city awesome',
 'sky pinkish blue',
 'eat cardboard']

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# NOTE: `cv` is rebound here from the CountVectorizer used earlier to a TfidfVectorizer.
cv = TfidfVectorizer()

# Fit on the cleaned corpus, then convert with .toarray() so the weights can be
# displayed: one row per sentence, one column per vocabulary term.
tfidf_matrix = cv.fit_transform(corpus)
X = tfidf_matrix.toarray()
X

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.5       , 0.        , 0.        ,
        0.5       , 0.5       , 0.        ],
       [0.5       , 0.        , 0.        , 0.5       , 0.        ,
        0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.5       ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57735027, 0.57735027,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.70710678, 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])