In [None]:
# =======================
# Phase 1: Text Preprocessing
# =======================

# Input text
text = "I am feeling very very happy today!!!..."
print("Original text:", text)

# Step 1: normalise case so differently-cased spellings compare equal
text = text.lower()
print("Lowercase:", text)

# Step 2: delete every character that is neither a word character nor whitespace
import re
text = re.sub(r'[^\w\s]', '', text)
print("Without punctuation:", text)

# Step 3: whitespace tokenization
tokens = text.split()
print("Tokens:", tokens)

# Step 4: keep only the tokens that are not in the hand-written stopword list
stopwords = ["i", "am", "is", "the", "a", "an", "very"]
filtered_tokens = [token for token in tokens if token not in stopwords]

print("After removing stopwords:", filtered_tokens)

# Step 5: Stemming (simple rule-based)
def stem(word):
    """Return ``word`` without a trailing "ing" when a plausible stem remains.

    The suffix is only removed if the remaining stem contains a vowel (the
    condition Porter's algorithm uses for "-ing"), so "feeling" becomes
    "feel" while short words such as "sing" or "king" are left intact
    instead of being reduced to a single consonant.

    Args:
        word: Token to stem; the suffix match is case-sensitive.

    Returns:
        The stemmed token, or ``word`` unchanged when it does not end in
        "ing" or when stripping the suffix would leave a stem without a vowel.
    """
    suffix = "ing"
    if not word.endswith(suffix):
        return word

    root = word[: -len(suffix)]
    lowered = root.lower()
    # A "y" counts as a vowel only after another letter (e.g. "fly" in "flying").
    has_vowel = any(char in "aeiou" for char in lowered) or "y" in lowered[1:]
    return root if has_vowel else word

# Run the rule-based stemmer over every remaining token, keeping their order
stemmed_words = list(map(stem, filtered_tokens))

print("Stemmed words:", stemmed_words)


# =======================
# Phase 2: Bag of Words (BoW)
# =======================

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: one short document per list entry
sentences = [
    "i am good",
    "i am boy",
    "happy today",
    "i am sad",
]

# Learn the vocabulary and build the document-term count matrix in one pass.
# The default tokenizer drops single-character tokens, which is why "i"
# is missing from the printed vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)
# Rows are sentences, columns follow the vocabulary order printed above
print("BoW Matrix:", X.toarray(), sep="\n")


# =======================
# Bag of Words – Example 2
# =======================

# Second corpus: a word repeated inside one sentence ("python" in the first
# one) is counted once per occurrence in that sentence's row.
sentences = [
    "i am learning python and python is good",
    "python is a great programming language to learn",
    "i also love ml with python",
]

# A fresh vectorizer so the vocabulary is learned from this corpus only
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)
# Rows are sentences, columns follow the vocabulary order printed above
print("BoW Matrix:", X.toarray(), sep="\n")


# =======================
# Phase 3: TF-IDF
# =======================

# TF-IDF down-weights words that occur in most documents (e.g. "python"), so rarer, more distinctive words score relatively higher

from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus for the TF-IDF demo; "python" occurs in every sentence
sentences = [
    "i am learning python and python is good",
    "python is a great programming language and python is fast",
    "i also love ml with python",
]

# Learn the vocabulary and compute the TF-IDF weights in one pass
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(sentences)

tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print("Vocabulary:", tfidf_vocabulary)
# Rows are sentences, columns follow the vocabulary order printed above
print("TF-IDF Matrix:", X.toarray(), sep="\n")


Original text: I am feeling very very happy today!!!...
Lowercase: i am feeling very very happy today!!!...
Without punctuation: i am feeling very very happy today
Tokens: ['i', 'am', 'feeling', 'very', 'very', 'happy', 'today']
After removing stopwords: ['feeling', 'happy', 'today']
Stemmed words: ['feel', 'happy', 'today']
Vocabulary: ['am' 'boy' 'good' 'happy' 'sad' 'today']
BoW Matrix:
[[1 0 1 0 0 0]
 [1 1 0 0 0 0]
 [0 0 0 1 0 1]
 [1 0 0 0 1 0]]
Vocabulary: ['also' 'am' 'and' 'good' 'great' 'is' 'language' 'learn' 'learning'
 'love' 'ml' 'programming' 'python' 'to' 'with']
BoW Matrix:
[[0 1 1 1 0 1 0 0 1 0 0 0 2 0 0]
 [0 0 0 0 1 1 1 1 0 0 0 1 1 1 0]
 [1 0 0 0 0 0 0 0 0 1 1 0 1 0 1]]
Vocabulary: ['also' 'am' 'and' 'fast' 'good' 'great' 'is' 'language' 'learning' 'love'
 'ml' 'programming' 'python' 'with']
TF-IDF Matrix:
[[0.         0.42439575 0.32276391 0.         0.42439575 0.
  0.32276391 0.         0.42439575 0.         0.         0.
  0.50130994 0.        ]
 [0.         0.     

In [None]:
import nltk
# Four major uses of NLTK in this notebook:
#   a. Tokenization
#   b. Stopword removal
#   c. Stemming
#   d. Lemmatization

# Download the NLTK data packages used by the cells below, in this order
NLTK_RESOURCES = ("punkt_tab", "punkt", "stopwords", "wordnet")
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]
download_results[-1]  # cell output: return value of the last download

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# for tokenization
from nltk.tokenize import word_tokenize
# for removing stopwords
from nltk.corpus import stopwords
# for stemming and lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Sample sentence for the NLTK pipeline; the trailing "&>..." gives the
# punctuation-removal step something to strip.
text = "I am learning Python programming and it is very helpful &>..."
print("Original text:",text)

Original text: I am learning Python programming and it is very helpful &>...


In [None]:
# Step-1: Lowercase
# Rebinds `text` to its lowercased form so later comparisons (e.g. against
# the lowercase stopword list) are case-insensitive.
text = text.lower()
print("After Lowercase:",text)

After Lowercase: i am learning python programming and it is very helpful &>...


In [None]:
# Step-2: Tokenization
# word_tokenize also emits punctuation as tokens of its own; for this text
# that is '&', '>' and the three dots as a single '...' token.
tokens=word_tokenize(text)
print("Tokens : ",tokens)

Tokens :  ['i', 'am', 'learning', 'python', 'programming', 'and', 'it', 'is', 'very', 'helpful', '&', '>', '...']


In [None]:
# Step-3: Remove punctuation
# `punc` is one string containing the ASCII punctuation characters
# (not a list of tokens) — printed below for reference.
import string
punc =string.punctuation
print(punc)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
# Drop tokens that consist entirely of punctuation characters.
# `punc` is a string, so a plain `word not in punc` is a substring test: it
# lets multi-character tokens such as "..." slip through. Checking each
# character of the token individually removes them as well.
punctuation_filter = [
    word for word in tokens
    if not all(char in punc for char in word)
]
print("Removed Punctuations:",punctuation_filter)

Removed Punctuations: ['i', 'am', 'learning', 'python', 'programming', 'and', 'it', 'is', 'very', 'helpful', '...']


In [None]:
# Step-4: Remove stopwords
# First inspect NLTK's English stopword list (all entries are lowercase,
# which is why the text was lowercased in Step-1).
print(stopwords.words("english"))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Keep only the tokens that are not English stopwords, preserving their order
eng_stopwords = stopwords.words("english")
filtered_tokens = []
for token in punctuation_filter:
    if token in eng_stopwords:
        continue
    filtered_tokens.append(token)
print("Removed stopwords:",filtered_tokens)

Removed stopwords: ['learning', 'python', 'programming', 'helpful', '...']


In [None]:
# Step-5: Stemming & Lemmatization
# NOTE(review): this rebinds the name `stem`, which the first cell defined as
# a rule-based stemming function, to a PorterStemmer instance.
stem = PorterStemmer()
# Stemming can return a non-word stem: "fly" comes back as "fli".
stem.stem("fly")

'fli'

In [None]:
# Lemmatizing with the verb tag ("v") maps inflected and irregular verb forms
# back to their base form.
wnet = WordNetLemmatizer()
for verb_form in ("playing", "flying", "went", "bought"):
    print(wnet.lemmatize(verb_form, "v"))

play
fly
go
buy


In [None]:
# Lemmatize every remaining token as a verb ("v"), keeping the token order
lemmatized_words = [wnet.lemmatize(token, "v") for token in filtered_tokens]
print("Lemmatization:",lemmatized_words)

Lemmatization: ['learn', 'python', 'program', 'helpful', '...']


In [None]:
# Re-assemble the lemmatized tokens into a single space-separated string
cleaned_text = " ".join(lemmatized_words)
print("Cleaned Text:",cleaned_text)

Cleaned Text: learn python program helpful ...


In [None]:
# Same four toy sentences as the first Bag-of-Words example
sentences = [
    "i am good",
    "i am boy",
    "happy today",
    "i am sad"
]

def preprocess_text(sentences):
  """Flatten ``sentences`` into one list of lemmatized, non-stopword tokens.

  Each sentence is split on whitespace; the tokens "i" and "am" are dropped
  and every other token is lemmatized with WordNetLemmatizer's default
  settings. Tokens from all sentences end up in a single flat list, in
  reading order.

  Args:
    sentences: Iterable of sentence strings.

  Returns:
    A flat list of lemmatized tokens.
  """
  stop_words = {"i", "am"}
  lemmatizer = WordNetLemmatizer()

  return [
    lemmatizer.lemmatize(token)
    for sentence in sentences
    for token in sentence.split()
    if token not in stop_words
  ]

# Note: despite its name, `cleaned_text` is a flat list of tokens here,
# not a string (see the printed output).
cleaned_text = preprocess_text(sentences)
print(cleaned_text)

['good', 'boy', 'happy', 'today', 'sad']


In [None]:
# First import text processing libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Second import sklearn dependencies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Labelled examples for the intent classifier as (utterance, intent) pairs.
# Four intents: greet (4 examples), weather (3), open_app (3), exit (2).
training_data = [
    ("hello", "greet"),
    ("hi there", "greet"),
    ("good morning", "greet"),
    ("hey","greet"),
    ("what's the weather today","weather"),
    ("tell me weather","weather"),
    ("is it raining","weather"),
    ("open","open_app"),
    ("open google","open_app"),
    ("open youtube","open_app"),
    ("bye","exit"),
    ("goodbye","exit")
]

In [None]:
# Split the (utterance, intent) pairs into two parallel lists.
# Comprehensions keep their loop variables local, so the notebook-level
# `text` from the earlier cells is not overwritten as a side effect
# (a `for text, intent in ...` loop would leave `text` bound to the last
# utterance).
sentences = [utterance for utterance, _ in training_data]
labels = [intent for _, intent in training_data]

In [None]:
import string
punc = string.punctuation

def preprocess_text(sentences):
    """Clean each sentence and return the cleaned sentences as strings.

    Every sentence is lowercased, tokenized with ``word_tokenize``, stripped
    of punctuation-only tokens and English stopwords, lemmatized as verbs
    and re-joined with single spaces.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A list with one cleaned string per input sentence, in input order.
        A sentence whose tokens are all removed yields an empty string.
    """
    # Loop-invariant resources are built once instead of once per sentence;
    # sets make the per-token membership tests cheap.
    punctuation_chars = set(string.punctuation)
    eng_stopwords = set(stopwords.words("english"))
    wnet = WordNetLemmatizer()

    cleaned_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        kept_tokens = [
            word for word in tokens
            # Character-wise check: testing the whole token against the
            # punctuation *string* is a substring test and would keep
            # multi-character tokens such as "...".
            if not all(char in punctuation_chars for char in word)
            and word not in eng_stopwords
        ]
        cleaned_sentences.append(
            " ".join(wnet.lemmatize(word, "v") for word in kept_tokens)
        )

    return cleaned_sentences

In [None]:
# Clean the training utterances with the same function that is later applied
# to the user's input, so both go through identical preprocessing.
clean_data = preprocess_text(sentences)

In [None]:
# Fit TF-IDF on the cleaned training utterances; this fitted vectorizer is
# reused (transform only) on user input at prediction time.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(clean_data)

In [None]:
# Train the intent classifier on the TF-IDF features, using the
# LogisticRegression defaults.
logistic = LogisticRegression()
logistic.fit(X, labels)

In [None]:
user_input = "hola"
processed = preprocess_text([user_input])
user_vector = vectorizer.transform(processed)

# An input that shares no word with the training vocabulary (such as "hola")
# is transformed into an all-zero TF-IDF vector. The classifier would then
# answer without any evidence from the input, so report the intent as
# unknown instead of presenting that guess as a real prediction.
if user_vector.nnz == 0:
    prediction = ["unknown"]
else:
    prediction = logistic.predict(user_vector)
print(prediction)

['greet']
