# Natural Language Processing 

In [1]:
import numpy as np
import nltk

In [2]:
# Toy three-sentence input shared by the tokenization and cleaning cells below.
paragraph="He is a good boy. She is a good girl. boy & girl are good"

In [3]:
# Split the paragraph into sentences; the later cleaning cells iterate over this list.
# NOTE(review): sent_tokenize presumably needs the NLTK 'punkt' tokenizer data;
# no nltk.download(...) call is visible in this notebook — confirm for a fresh environment.
sentences = nltk.sent_tokenize(paragraph)
# Bare name as the last expression so the notebook displays the resulting list.
sentences

['He is a good boy.', 'She is a good girl.', 'boy & girl are good']

In [4]:
# Word-level tokens for the whole paragraph; as the output shows, punctuation
# such as '.' and '&' comes back as separate tokens.
words =nltk.word_tokenize(paragraph)
words

['He',
 'is',
 'a',
 'good',
 'boy',
 '.',
 'She',
 'is',
 'a',
 'good',
 'girl',
 '.',
 'boy',
 '&',
 'girl',
 'are',
 'good']

In [5]:
import re

# First cleaning pass: swap every character that is not an ASCII letter for a
# space, sentence by sentence. Case and whitespace are left untouched, so
# trailing and repeated spaces remain in the printed result.
corpus = [re.sub('[^a-zA-Z]', " ", sentence) for sentence in sentences]

print(corpus)

['He is a good boy ', 'She is a good girl ', 'boy   girl are good']


In [6]:
from nltk.corpus import stopwords
#stopwords.words('english')

## Remove Stopwords

In [8]:
# Clean each sentence and drop English stopwords.
# Relies on `re`, `stopwords` and `sentences` from earlier cells.

# Build the stopword set once. The original rebuilt it from
# stopwords.words('english') for every single word, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Keep ASCII letters only, lowercase, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', " ", sentence).lower().split()
    # Filter out stopwords and rebuild the sentence with single spaces.
    corpus.append(" ".join(token for token in tokens if token not in english_stopwords))

print(corpus)

['good boy', 'good girl', 'boy girl good']


# Stemming

In [10]:
from nltk.stem import PorterStemmer
# Stemmer instance; a later cell reuses `ps` when it builds the stemmed corpus.
ps = PorterStemmer()
# Demo: the returned stem ('histori' in the output) is not itself a dictionary word.
ps.stem("history")

'histori'

# Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer
# NOTE(review): the lemmatizer presumably needs the NLTK 'wordnet' data to be
# downloaded; no nltk.download(...) call is visible in this notebook — confirm.
wnl = WordNetLemmatizer()
# Demo: for the same input the stemmer turned into 'histori', the lemmatizer
# returns the word unchanged (see output).
wnl.lemmatize("history")

'history'

In [14]:
# Rebuild the corpus with stopwords removed and every remaining word stemmed.
# Relies on `re`, `stopwords`, `sentences` and the PorterStemmer `ps` from earlier cells.
# NOTE(review): this cell sits under the "Lemmatization" heading but applies the
# Porter stemmer (`ps.stem`), not the lemmatizer — confirm which one is intended.

# Build the stopword set once. The original rebuilt it from
# stopwords.words('english') for every single word, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Keep ASCII letters only, lowercase, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', " ", sentence).lower().split()
    # The stopword check runs on the raw token, before it is stemmed.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(" ".join(stems))

# Vectorization (Bag of Words)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Fit on the cleaned corpus and densify the result for display. The output is a
# 3 x 3 count matrix: one row per sentence, one column per vocabulary term
# (the counts are consistent with the column order boy, girl, good).
# NOTE(review): `blow` looks like a typo for `bow` (bag of words); left as-is
# because the name is visible to every other cell.
blow = cv.fit_transform(corpus).toarray()
blow

array([[1, 0, 1],
       [0, 1, 1],
       [1, 1, 1]], dtype=int64)

# TF-IDF Vectorizer

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `tf` is also the conventional alias for TensorFlow; consider a
# more specific name if this notebook grows.
tf = TfidfVectorizer()
# Same corpus as the bag-of-words cell, weighted by TF-IDF. In the output the
# third column, the term present in all three sentences, has the lowest
# non-zero weight in every row.
tfidf = tf.fit_transform(corpus).toarray()
tfidf

array([[0.78980693, 0.        , 0.61335554],
       [0.        , 0.78980693, 0.61335554],
       [0.61980538, 0.61980538, 0.48133417]])

In [27]:
# Part-of-speech tagging demo on a fresh sentence.
text  = "I love natural language processing"
# Note: this overwrites the `words` list produced by the earlier tokenization cell.
words = nltk.word_tokenize(text)
# Displays a list of (token, tag) pairs; the cells below look up the tags.
# NOTE(review): pos_tag presumably needs the NLTK perceptron tagger data to be
# downloaded — no nltk.download(...) call is visible in this notebook.
nltk.pos_tag(words)

[('I', 'PRP'),
 ('love', 'VBP'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN')]

In [28]:
# Print the tagset description and examples for PRP, the tag given to 'I' above.
nltk.help.upenn_tagset("PRP")

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [29]:
# Print the tagset description and examples for VBP, the tag given to 'love' above.
nltk.help.upenn_tagset("VBP")

VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
    appear tend stray glisten obtain comprise detest tease attract
    emphasize mold postpone sever return wag ...


In [30]:
# Print the tagset description and examples for JJ, the tag given to 'natural' above.
nltk.help.upenn_tagset("JJ")

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [31]:
# Print the tagset description and examples for NNP (proper noun, singular).
# No token in the tagged example above carries this tag; presumably shown for
# contrast with the common-noun tag NN.
nltk.help.upenn_tagset("NNP")

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [32]:
# Print the tagset description and examples for NN, the tag given to 'language'
# and 'processing' above.
nltk.help.upenn_tagset("NN")

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
