# NLP

In [15]:
import nltk

In [16]:
# nltk.download()  # uncomment and run once to fetch the NLTK data packages this notebook relies on

In [17]:
# Sample text used by the tokenization and cleaning cells: three short
# sentences, the last one containing a non-letter symbol ("&").
paragraph = "He is a good boy. She is a good girl. boy & girl are good."
# Bare last expression so the notebook echoes the string.
paragraph

'He is a good boy. She is a good girl. boy & girl are good.'

# 2 Tokenization
## Sentence Tokenization.

In [18]:
# Split the paragraph into a list of sentence strings.
# NOTE(review): presumably needs NLTK tokenizer data to be installed
# (see the commented-out nltk.download() cell) — confirm on a fresh kernel.
sentences = nltk.sent_tokenize(paragraph)
sentences

['He is a good boy.', 'She is a good girl.', 'boy & girl are good.']

## Word tokenization

In [19]:
# Split the whole paragraph into tokens; as the output below shows,
# punctuation such as '.' and '&' comes back as separate tokens.
words = nltk.word_tokenize(paragraph)
words

['He',
 'is',
 'a',
 'good',
 'boy',
 '.',
 'She',
 'is',
 'a',
 'good',
 'girl',
 '.',
 'boy',
 '&',
 'girl',
 'are',
 'good',
 '.']

# 3 Text Cleaning 
## 3.a Remove Punctuation

In [20]:
import re

# Anything that is not an ASCII letter (punctuation, digits, symbols)
# is blanked out with a single space, one sentence at a time.
non_letters = re.compile("[^a-zA-Z]")

print(sentences)

corpus = [non_letters.sub(" ", sentence) for sentence in sentences]

print(corpus)

['He is a good boy.', 'She is a good girl.', 'boy & girl are good.']
['He is a good boy ', 'She is a good girl ', 'boy   girl are good ']


## 3.b Stopwords in English

In [21]:
from nltk.corpus import stopwords
# Display the list of English stop words provided by NLTK
# (all entries are lower-case, as the output below shows).
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
import re
from nltk.corpus import stopwords

# Build the stop-word lookup once, as a set. Previously
# stopwords.words("english") was evaluated again for every word and the
# resulting list was scanned linearly on each membership test.
english_stopwords = set(stopwords.words("english"))

corpus = []

for sentence in sentences:
    # Non-letters (punctuation, digits, symbols) become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)
    # Lower-case before splitting so tokens match the lower-case stop-word list.
    tokens = letters_only.lower().split()
    # List comprehension: keep only the tokens that are not stop words.
    kept = [word for word in tokens if word not in english_stopwords]
    corpus.append(" ".join(kept))

print(corpus)

['good boy', 'good girl', 'boy girl good']


## 3.c Stemming and Lemmatization

In [23]:
# Stemming: reduce a word to its stem with the Porter stemmer.
# The stem is not necessarily a dictionary word (see the 'histor' output below).
from nltk.stem import PorterStemmer
ps = PorterStemmer()
ps.stem("historically")

'histor'

In [24]:
# Lemmatization: map a word to its WordNet lemma.
# No part-of-speech argument is passed here, so the lemmatizer's default is used;
# the output below shows "historically" coming back unchanged.
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
wnl.lemmatize("historically")

'historically'

## Putting everything together for text cleaning (remove punctuation + remove stop words + stemming / lemmatization)

In [25]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer #Stemming
ps = PorterStemmer()

# Build the stop-word lookup once, as a set. Previously
# stopwords.words("english") was evaluated again for every word and the
# resulting list was scanned linearly on each membership test.
english_stopwords = set(stopwords.words("english"))

corpus = []

for sentence in sentences:
    # Non-letters (punctuation, digits, symbols) become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)
    # Lower-case before splitting so tokens match the lower-case stop-word list.
    tokens = letters_only.lower().split()
    # Filter on the original token first, then stem only the survivors.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(" ".join(stems))

print(corpus)

['good boy', 'good girl', 'boy girl good']


# 4 Vectorization
Count vectorizer (bag of words): counts how many times each word occurs in a given sentence.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary of the cleaned `corpus` and count each term per document.
# .toarray() converts the result to the dense array displayed below
# (one row per document in `corpus`).
cv = CountVectorizer()
bow = cv.fit_transform(corpus).toarray()
bow

array([[1, 0, 1],
       [0, 1, 1],
       [1, 1, 1]])

In [27]:
# Vocabulary learned by the fitted CountVectorizer; use it to label the columns of `bow`.
cv.get_feature_names_out()

array(['boy', 'girl', 'good'], dtype=object)

## TF-IDF Vectorizer
- Term frequency (tf) = number of times term t occurs in a document.
- Inverse document frequency (idf) measures how common or rare a term is across the entire corpus of documents. Note that it is computed over the whole corpus, so a term's idf is the same for every document. If a word is common and appears in many documents, its (normalized) idf approaches 0; if it is rare, its idf approaches 1.

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF weights on the cleaned `corpus` and convert the result to a dense array for display.
# NOTE(review): the name `tf` would clash with the common `import tensorflow as tf`
# alias if TensorFlow were ever imported in this notebook.
tf = TfidfVectorizer()
tfidf = tf.fit_transform(corpus).toarray()
tfidf

array([[0.78980693, 0.        , 0.61335554],
       [0.        , 0.78980693, 0.61335554],
       [0.61980538, 0.61980538, 0.48133417]])

# Part-of-speech tagging (POS tagging)

In [32]:
# A fresh example sentence for the POS-tagging demo.
text = "I love natural language processing"

# NOTE: this rebinds `words`, replacing the token list built from `paragraph` earlier.
words = nltk.word_tokenize(text)
words

['I', 'love', 'natural', 'language', 'processing']

In [33]:
# Tag each token with its part of speech; the result is a list of (token, tag) pairs.
nltk.pos_tag(words)

[('I', 'PRP'),
 ('love', 'VBP'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN')]

In [35]:
# Look up the full description of a tag; here "JJ", the tag assigned to 'natural' above.
nltk.help.upenn_tagset("JJ")

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
