In [None]:
# Import the Natural Language Toolkit library
import nltk
# Import PorterStemmer for reducing words to their root/stem form
from nltk.stem import PorterStemmer
# Import stopwords collection (common words like 'the', 'a', 'an' that are often filtered out)
from nltk.corpus import stopwords
# Import tokenization functions to split text into sentences and words
from nltk.tokenize import sent_tokenize, word_tokenize
# Download the NLTK data packages this notebook relies on:
#   'punkt'     - tokenizer data for the sentence/word tokenizers imported above
#   'stopwords' - the word lists read later via stopwords.words('english')
nltk.download('punkt')
nltk.download('stopwords')

In [2]:
# Sample text for the whole notebook: the later cells split it into sentences,
# clean it, and vectorize it. Because the triple-quoted literal starts and ends
# on its own line, the string begins and ends with a newline character.
paragraph = '''
I am the Ex Co-founder and Chief AI Engineer of iNeuron and my experience is pioneering in
machine learning, deep learning, and computer vision, Generative AI, an educator, and a mentor,
with over 15 years' experience in the industry. These are my Udemy Courses where I explain
various topics on machine learning, deep learning, and AI with many real-world problem
scenarios. I have delivered over 30+ tech talks on data science, machine learning, and AI at
various meet-ups, technical institutions, and community-arranged forums. My main aim is to
make everyone familiar with ML and AI.
'''

In [3]:
# Echo the raw string; the repr makes the embedded '\n' line breaks visible
paragraph


"\nI am the Ex Co-founder and Chief AI Engineer of iNeuron and my experience is pioneering in\nmachine learning, deep learning, and computer vision, Generative AI, an educator, and a mentor,\nwith over 15 years' experience in the industry. These are my Udemy Courses where I explain\nvarious topics on machine learning, deep learning, and AI with many real-world problem\nscenarios. I have delivered over 30+ tech talks on data science, machine learning, and AI at\nvarious meet-ups, technical institutions, and community-arranged forums. My main aim is to\nmake everyone familiar with ML and AI.\n"

In [4]:
# Download the 'punkt_tab' tokenizer resource before the tokenization cells run.
# NOTE(review): presumably required by the sentence tokenizer in newer NLTK
# releases in addition to 'punkt' - confirm against the installed version.
# The call returns True when the package is available (see cell output).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Split the paragraph into a list of sentence strings with NLTK's sentence tokenizer.
# Line breaks that fall inside a sentence are kept as '\n' in the resulting strings;
# they are only turned into spaces by the later cleaning step.
sentences=nltk.sent_tokenize(paragraph)

In [6]:
# Print the tokenized sentences as a single Python list
print(sentences)

["\nI am the Ex Co-founder and Chief AI Engineer of iNeuron and my experience is pioneering in\nmachine learning, deep learning, and computer vision, Generative AI, an educator, and a mentor,\nwith over 15 years' experience in the industry.", 'These are my Udemy Courses where I explain\nvarious topics on machine learning, deep learning, and AI with many real-world problem\nscenarios.', 'I have delivered over 30+ tech talks on data science, machine learning, and AI at\nvarious meet-ups, technical institutions, and community-arranged forums.', 'My main aim is to\nmake everyone familiar with ML and AI.']


In [7]:
# Create one Porter stemmer instance; the stemming cells below reuse it
stemmer=PorterStemmer()

In [8]:
# Quick sanity check of the stemmer on a single word ('going' -> 'go')
stemmer.stem('going')

'go'

In [9]:
# Set up WordNet-based lemmatization.
# nltk is already imported in the first cell; the re-import below is redundant but harmless.
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data that the lemmatizer looks words up in
nltk.download('wordnet')

# Create one lemmatizer instance; the lemmatization cells below reuse it
lemmatizer = WordNetLemmatizer()

# Lemmatize 'history' (map it to its base/dictionary form).
# No POS tag is passed, so the default (noun) is used and 'history' comes back unchanged.
lemmatizer.lemmatize('history')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'history'

In [10]:
import re

In [11]:
# Number of sentences the tokenizer produced; the corpus built below has one entry per sentence
len(sentences)

4

In [12]:
# Build the cleaned corpus, one string per sentence:
#   1. every character that is not an ASCII letter (digits, punctuation,
#      newlines) is replaced by a single space, so runs of spaces can remain
#   2. the result is lowercased
corpus = [
    re.sub('[^a-zA-Z]', ' ', raw_sentence).lower()
    for raw_sentence in sentences
]

In [13]:
# Inspect the cleaned sentences; the gaps of several spaces are where digits/punctuation used to be
corpus

[' i am the ex co founder and chief ai engineer of ineuron and my experience is pioneering in machine learning  deep learning  and computer vision  generative ai  an educator  and a mentor  with over    years  experience in the industry ',
 'these are my udemy courses where i explain various topics on machine learning  deep learning  and ai with many real world problem scenarios ',
 'i have delivered over     tech talks on data science  machine learning  and ai at various meet ups  technical institutions  and community arranged forums ',
 'my main aim is to make everyone familiar with ml and ai ']

In [14]:
# Inspect NLTK's English stopword list (lowercase entries, including contractions such as "aren't")
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [15]:
# Stemming demo: print the Porter stem of every non-stopword token in the corpus.

# Make sure the stopword corpus is available locally before reading it
import nltk
nltk.download('stopwords')  # Download stopwords corpus for filtering common words

# Build the stopword set once. Constructing it inside the inner loop re-reads
# the stopword list and rebuilds the set for every single word.
english_stopwords = set(stopwords.words('english'))

for document in corpus:  # Each entry is one cleaned, lowercased sentence
    for word in nltk.word_tokenize(document):  # Split the sentence into word tokens
        if word not in english_stopwords:  # Skip common English stopwords
            # Stems are truncated forms and need not be real words (e.g. 'machin', 'variou')
            print(stemmer.stem(word))

ex
co
founder
chief
ai
engin
ineuron
experi
pioneer
machin
learn
deep
learn
comput
vision
gener
ai
educ
mentor
year
experi
industri
udemi
cours
explain
variou
topic
machin
learn
deep
learn
ai
mani
real
world
problem
scenario
deliv
tech
talk
data
scienc
machin
learn
ai
variou
meet
up
technic
institut
commun
arrang
forum
main
aim
make
everyon
familiar
ml
ai


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
## lemmatization
# Print the WordNet lemma of every non-stopword token in the corpus.

# Build the stopword set once. Constructing it inside the inner loop re-reads
# the stopword list and rebuilds the set for every single word.
english_stopwords = set(stopwords.words('english'))

for document in corpus:  # Each entry is one cleaned, lowercased sentence
    for word in nltk.word_tokenize(document):  # Split the sentence into word tokens
        if word not in english_stopwords:  # Skip common English stopwords
            # Map the word to its dictionary form. No POS tag is passed, so words
            # are treated with the default POS: plurals are reduced ('courses' ->
            # 'course') while verb forms such as 'delivered' stay unchanged.
            print(lemmatizer.lemmatize(word))

ex
co
founder
chief
ai
engineer
ineuron
experience
pioneering
machine
learning
deep
learning
computer
vision
generative
ai
educator
mentor
year
experience
industry
udemy
course
explain
various
topic
machine
learning
deep
learning
ai
many
real
world
problem
scenario
delivered
tech
talk
data
science
machine
learning
ai
various
meet
ups
technical
institution
community
arranged
forum
main
aim
make
everyone
familiar
ml
ai


In [17]:
## Apply stopword removal and lemmatization
# Rebuilds `corpus` from the raw sentences: one cleaned, stopword-free,
# lemmatized string per sentence.
import re

# Build the stopword set once. The comprehension below tests membership for
# every word, so rebuilding the set inside it repeats the same work per word.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so re-running the cell does not accumulate entries
corpus = []

for sentence in sentences:
    # Replace every non-letter (digits, punctuation, newlines) with a space,
    # then lowercase so tokens match the lowercase stopword list
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence).lower()
    # Split on whitespace, drop stopwords, and lemmatize what remains
    # (default POS, so verb forms such as 'delivered' are left as-is)
    kept_lemmas = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in english_stopwords
    ]
    # Store the sentence as a single space-separated string
    corpus.append(' '.join(kept_lemmas))

In [18]:
corpus

['ex co founder chief ai engineer ineuron experience pioneering machine learning deep learning computer vision generative ai educator mentor year experience industry',
 'udemy course explain various topic machine learning deep learning ai many real world problem scenario',
 'delivered tech talk data science machine learning ai various meet ups technical institution community arranged forum',
 'main aim make everyone familiar ml ai']

In [19]:
# Import CountVectorizer from scikit-learn's text feature extraction module
# CountVectorizer converts a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Create a CountVectorizer configured with binary=True.
# This yields a binary bag-of-words representation: a feature is 1 if the word
# appears in the document and 0 otherwise, regardless of how often it occurs.
cv=CountVectorizer(binary=True)

In [21]:
# Fit the vectorizer on the corpus (learning its vocabulary) and transform the
# corpus into a document-term matrix in one step.
# Because cv was created with binary=True, each entry is a 0/1 presence flag
# rather than a word frequency.
x = cv.fit_transform(corpus)

In [22]:
# Inspect the vocabulary learned during fitting: a dict mapping each unique
# term in the corpus to its column index in the feature matrix.
# The dict is shown in first-seen order, while the indices follow the
# alphabetical order of the terms ('ai' -> 0, 'aim' -> 1, 'arranged' -> 2, ...).
cv.vocabulary_

{'ex': 14,
 'co': 4,
 'founder': 19,
 'chief': 3,
 'ai': 0,
 'engineer': 12,
 'ineuron': 22,
 'experience': 15,
 'pioneering': 32,
 'machine': 25,
 'learning': 24,
 'deep': 9,
 'computer': 6,
 'vision': 44,
 'generative': 20,
 'educator': 11,
 'mentor': 30,
 'year': 46,
 'industry': 21,
 'udemy': 41,
 'course': 7,
 'explain': 16,
 'various': 43,
 'topic': 40,
 'many': 28,
 'real': 34,
 'world': 45,
 'problem': 33,
 'scenario': 35,
 'delivered': 10,
 'tech': 38,
 'talk': 37,
 'data': 8,
 'science': 36,
 'meet': 29,
 'ups': 42,
 'technical': 39,
 'institution': 23,
 'community': 5,
 'arranged': 2,
 'forum': 18,
 'main': 26,
 'aim': 1,
 'make': 27,
 'everyone': 13,
 'familiar': 17,
 'ml': 31}

In [23]:
# First processed sentence, shown for comparison with its feature vector in the next cell
corpus[0]

'ex co founder chief ai engineer ineuron experience pioneering machine learning deep learning computer vision generative ai educator mentor year experience industry'

In [24]:
# Convert row 0 of the sparse document-term matrix to a dense numpy array.
# Each position corresponds to a column index from cv.vocabulary_. Values stay
# 0/1 even for words that occur twice in corpus[0] (e.g. 'ai', 'learning'),
# because the vectorizer was created with binary=True.
x[0].toarray()

array([[1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
        1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1]], dtype=int64)