Tokenization

In [1]:
from nltk.tokenize import word_tokenize

In [13]:
text = "vijaya stack over flow stack over flow text vectorization scikit its processing"

In [14]:
words = word_tokenize(text)

print(words) # printing all the words in the text

['vijaya', 'stack', 'over', 'flow', 'stack', 'over', 'flow', 'text', 'vectorization', 'scikit', 'its', 'processing']


Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemma_words = [lemmatizer.lemmatize(word) for word in words]

print(lemma_words)

['vijaya', 'stack', 'over', 'flow', 'stack', 'over', 'flow', 'text', 'vectorization', 'scikit', 'it', 'processing']


Stemming

In [16]:
# Porter Stemmer

from nltk.stem import PorterStemmer

port_stemmer = PorterStemmer()

port_words = [port_stemmer.stem(word) for word in words]

print(port_words)

['vijaya', 'stack', 'over', 'flow', 'stack', 'over', 'flow', 'text', 'vector', 'scikit', 'it', 'process']


In [17]:
#Snowball Stemmer

from nltk.stem import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')

snow_words = [snow_stemmer.stem(word) for word in words]

print(snow_words)



['vijaya', 'stack', 'over', 'flow', 'stack', 'over', 'flow', 'text', 'vector', 'scikit', 'it', 'process']


Part-of-Speech (POS)

In [33]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vijaya/sw_install/anaconda3/envs/nlp/nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [18]:
from nltk import pos_tag

p_tags = pos_tag(words)

print(p_tags)

[('vijaya', 'NN'), ('stack', 'NN'), ('over', 'IN'), ('flow', 'JJ'), ('stack', 'NN'), ('over', 'IN'), ('flow', 'JJ'), ('text', 'JJ'), ('vectorization', 'NN'), ('scikit', 'VBD'), ('its', 'PRP$'), ('processing', 'NN')]


Named Entity Recognition

In [38]:
import nltk
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/vijaya/sw_install/anaconda3/envs/nlp/nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [10]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/vijaya/sw_install/anaconda3/envs/nlp/nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping corpora/words.zip.


True

In [19]:
import nltk
from nltk import ne_chunk
from nltk.tokenize import word_tokenize
text = "Barack Obama was born in Hawaii."
tokens = word_tokenize(text)
tagged_tokens = nltk.pos_tag(tokens)
ner_tree = ne_chunk(tagged_tokens)
print(ner_tree)
# Output: (S (PERSON Barack))

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.)


In [20]:
from nltk import ne_chunk

ner_tree = ne_chunk(p_tags)
print(ner_tree)

(S
  vijaya/NN
  stack/NN
  over/IN
  flow/JJ
  stack/NN
  over/IN
  flow/JJ
  text/JJ
  vectorization/NN
  scikit/VBD
  its/PRP$
  processing/NN)


Sentiment Analysis

In [22]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/vijaya/sw_install/anaconda3/envs/nlp/nltk_data.
[nltk_data]     ..


True

In [23]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
#text = "I love this product! It's amazing."
sia = SentimentIntensityAnalyzer()
score = sia.polarity_scores(text)
print(score)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


Text Classification

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

data = {'text': ['This is a positive text.', 'This is a negative text.'], 'label': ['positive', 'negative']}
df = pd.DataFrame(data)

bow_vectorizer = CountVectorizer()
X = bow_vectorizer.fit_transform(df['text'])

clf = MultinomialNB()
clf.fit(X , df['label'])

# Test the model
text = "This is a neutral text."
X_test = bow_vectorizer.transform([text])

pred = clf.predict(X_test)
print(pred)

['negative']


Language Translation

In [11]:
from googletrans import Translator

trans = Translator()
dd = trans.translate('vijaya' , src='en' , dest='fr')
print(dd.text)

AttributeError: 'NoneType' object has no attribute 'group'

Ref:  https://ankushmulkar.medium.com/every-beginner-nlp-engineer-must-know-these-techniques-678605dc6026