Tokenization and Text Cleaning


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# `word_tokenize` needs the Punkt sentence-tokenizer data. This notebook relies on
# the newer "*_eng" / "*_tab" NLTK resources further down, and NLTK releases that
# use those names look Punkt up as 'punkt_tab'; older releases look for 'punkt'.
# Fetch both so the tokenization cell works on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Sample sentence used by the tokenization and cleaning cells.
text = "NLP is amazing! Let's explore its wonders."

# Tokenize the sentence with NLTK's `word_tokenize`.
token = word_tokenize(text)

# Lower-case every token; the stop-word cell reads this list under the name `word`.
word = list(map(str.lower, token))
print(word)


Removing Stop Words


In [None]:
from nltk.corpus import stopwords

In [None]:
import nltk

# The stop-word lists live in a separate NLTK corpus; without this download
# `stopwords.words(...)` raises LookupError on an environment that lacks it.
nltk.download('stopwords')

# Build a set so the membership test in the filter below is a hash lookup.
stop_words = set(stopwords.words('english'))

# `word` is the lower-cased token list produced by the tokenization cell.
filtered_words = [tok for tok in word if tok not in stop_words]
print(filtered_words)

Removing Punctuation

In [None]:
import string
# Display the characters that `string.punctuation` contains (shown as the cell's output).
string.punctuation

In [None]:
# Drop every token that consists solely of punctuation characters.
# `string.punctuation` is a single string, so `tok not in string.punctuation`
# would be a substring test: it keeps multi-character punctuation tokens such
# as "..." or "''" because they are not contiguous substrings of it. Checking
# each character against a set removes those tokens as well.
punctuation_chars = set(string.punctuation)
punctuation = [
    tok for tok in filtered_words
    if not all(ch in punctuation_chars for ch in tok)
]
print(punctuation)

Stemming & Lemmatizing

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the WordNet corpus before the lemmatizer is used.
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Run both normalisers over the cleaned token list from the punctuation cell.
stemmed_words = list(map(stemmer.stem, punctuation))
lemmatized_words = list(map(lemmatizer.lemmatize, punctuation))

# One line each: cleaned tokens, their stems, their lemmas.
print(punctuation, stemmed_words, lemmatized_words, sep="\n")

Part-of-Speech Tagging

In [None]:
from nltk import pos_tag
# Download the 'averaged_perceptron_tagger_eng' resource before the tagging cell runs.
# `nltk` itself is imported in an earlier cell of this notebook.
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Tag the cleaned tokens with their parts of speech; the NER cell reads `post_tag`.
# NOTE(review): `punctuation` holds lower-cased tokens with stop words removed, so
# the tagger sees neither original casing nor full sentence context. Confirm that
# this is intended.
post_tag = pos_tag(punctuation)
print(post_tag)

Named Entity Recognition (NER)

In [None]:
from nltk import ne_chunk
import nltk

# Chunker model data: the plain name and the '_tab' variant are both requested so
# the cell works whichever one the installed NLTK release looks up.
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')

# The chunker's feature extraction also reads the `words` corpus; without it
# `ne_chunk` fails with LookupError on a fresh environment.
nltk.download('words')

In [None]:
# Run the named-entity chunker over the POS-tagged tokens produced by the tagging cell.
ner_tags = ne_chunk(post_tag)
print(ner_tags)

Expanding Contractions

In [None]:
!pip install contractions

In [None]:
import contractions
# Run the sample sentence through `contractions.fix` and print the result.
# Note: this rebinds the notebook-level name `text`.
text = contractions.fix("I can't go because it's late.")
print(text)

Handling Emojis

In [None]:
!pip install emoji

In [None]:
import emoji
# Pass the sample string through `emoji.demojize` and print the converted text.
# Note: this rebinds the notebook-level name `text`.
text = emoji.demojize("I love NLP ❤️")
print(text)

Spell Correction

In [None]:
from textblob import TextBlob

# Sample sentence containing two misspelled words.
text = "NLP is amazng and beautifl."

# Wrap the sentence in a TextBlob, ask it for a corrected copy, and keep the
# result as a plain string.
blob = TextBlob(text)
corrected = str(blob.correct())
print(corrected)

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()

# Misspelled sample words to correct one by one.
words = ["amazng", "beutifl", "intellignt", "pythn"]

# `correction` can return None when it finds no candidate for a word; fall back
# to the original spelling so the result stays a list of strings.
corrected = [spell.correction(w) or w for w in words]
print(corrected)

Removing Rare Words

In [None]:
from collections import Counter

# Toy token list: only "nlp" appears more than once.
words = ["nlp", "nlp", "deep", "learning", "data", "science", "rareword"]

# Occurrence count for every distinct word.
counts = Counter(words)

# Keep, in their original order and with repeats, the words seen more than once.
filtered = list(filter(lambda w: counts[w] > 1, words))
print(filtered)

Bag of Words


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Three-sentence toy corpus used by the vectorizer cell that follows.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

In [None]:
 vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
print("Feature Names:", vectorizer.get_feature_names_out())
print("Document-Term Matrix:\n", X.toarray())

Bag of Words with N-Grams


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Same three-sentence toy corpus as the plain bag-of-words example.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# ngram_range=(1, 3): features are word sequences of length one to three.
vectorizer = CountVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(documents)

feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)

dense_counts = X.toarray()
print("Document-Term Matrix:\n", dense_counts)

TF-IDF


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [None]:
# Three-sentence toy corpus to be weighted with TF-IDF.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Fit a TfidfVectorizer with default settings and transform the corpus in one step.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

tfidf_terms = tfidf_vectorizer.get_feature_names_out()
print("Feature Names:", tfidf_terms)

tfidf_dense = X_tfidf.toarray()
print("TF-IDF Matrix:\n", tfidf_dense)

Word Embeddings: Word2Vec, GloVe, FastText

In [None]:
!pip install --upgrade gensim
!pip install scipy==1.10.1

In [None]:
from gensim.models import Word2Vec
import nltk

# Pre-tokenized toy corpus: one list of lower-case tokens per sentence.
sentences = [
    ["this", "is", "the", "first", "document"],
    ["this", "document", "is", "the", "second", "document"],
    ["and", "this", "is", "the", "third", "one"],
    ["is", "this", "the", "first", "document"]
]

# Skip-gram model (sg=1) with 50-dimensional vectors.
# The seed is spelled out (1 is gensim's default) and training uses a single
# worker thread: gensim documents that multi-threaded training can vary between
# runs because of thread scheduling, and four threads gain nothing on four sentences.
model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    workers=1,
    sg=1,
    seed=1,
)

print("Vector for 'document':\n", model.wv['document'])

print("\nMost similar to 'document':")
print(model.wv.most_similar('document'))


In [None]:
from gensim.models import FastText
from nltk.tokenize import word_tokenize

# Raw sentences; note this rebinds the notebook-level name `sentences`.
sentences = ["FastText embeddings handle subword information.",
             "It is effective for various languages."]

# Lower-case each sentence, then tokenize it with NLTK.
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# The seed is spelled out (1 is gensim's default) and training uses a single
# worker thread: gensim documents that multi-threaded training can vary between
# runs because of thread scheduling, and extra threads gain nothing on two sentences.
model = FastText(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# Look up the learned vector for one token through the model's keyed vectors.
word_embeddings = model.wv
print(word_embeddings['subword'])

In [None]:
import gensim.downloader as api

# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader API.
glove_vectors = api.load("glove-wiki-gigaword-100")

# Show the vector stored for "computer", then the entries most similar to it.
computer_vector = glove_vectors["computer"]
print(computer_vector)

computer_neighbours = glove_vectors.most_similar("computer")
print(computer_neighbours)
