In [1]:
# Importing necessary libraries from NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree



In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Ensure all necessary NLTK packages are downloaded
#nltk.download('punkt') # Tokenization
nltk.download('stopwords') # remoinf stopwords
nltk.download('wordnet') # for lemmatization and semantic analysis
nltk.download('averaged_perceptron_tagger') # for POS tagging
nltk.download('maxent_ne_chunker') #for NER
nltk.download('words') # for recognizing words in NER
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers

True

In [4]:
# Sample input text
input_text = "Chatbots can be built using Dialogflow,RASA,LLM,SLM."

In [5]:
# 1. Normalization: Converting text to lowercase
normalized_text = input_text.lower()
print("\n1. Normalized Text:")
print(normalized_text)


1. Normalized Text:
chatbots can be built using dialogflow,rasa,llm,slm.


In [6]:
# 2. Tokenization: Splitting the text into words
tokens = word_tokenize(normalized_text)
print("\n2. Tokens:")
print(tokens)


2. Tokens:
['chatbots', 'can', 'be', 'built', 'using', 'dialogflow', ',', 'rasa', ',', 'llm', ',', 'slm', '.']


In [7]:
# 3. Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]
print("\n3. Tokens After Stop Words Removal:")
print(filtered_tokens)



3. Tokens After Stop Words Removal:
['chatbots', 'built', 'using', 'dialogflow', ',', 'rasa', ',', 'llm', ',', 'slm', '.']


In [None]:
# 4. Spell Check (Optional)
# Here we assume the input text is correct; otherwise, spell check libraries like `pyspellchecker` can be used.

In [8]:
# 5. Stemming / Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print("\n5. Lemmatized Tokens:")
print(lemmatized_tokens)



5. Lemmatized Tokens:
['chatbots', 'built', 'using', 'dialogflow', ',', 'rasa', ',', 'llm', ',', 'slm', '.']


In [9]:
# 6. Conversational Context (Optional in this simple example)
# Context can be managed with external frameworks or memory stores.


In [10]:
# 7. Named Entity Recognition (NER)
pos_tags = pos_tag(lemmatized_tokens)  # POS tagging
print("Lemmatized Tokens:", lemmatized_tokens)
print("POS Tags:", pos_tags)
ner_chunks = ne_chunk(pos_tags)  # Named Entity Recognition
print("\nNER Chunks:")
print(ner_chunks)
print("\n7. Named Entities:")
for chunk in ner_chunks:
    if isinstance(chunk, Tree):
        print(f"{' '.join(c[0] for c in chunk)} ({chunk.label()})")


Lemmatized Tokens: ['chatbots', 'built', 'using', 'dialogflow', ',', 'rasa', ',', 'llm', ',', 'slm', '.']
POS Tags: [('chatbots', 'NNS'), ('built', 'VBN'), ('using', 'VBG'), ('dialogflow', 'NN'), (',', ','), ('rasa', 'NN'), (',', ','), ('llm', 'NN'), (',', ','), ('slm', 'NN'), ('.', '.')]

NER Chunks:
(S
  chatbots/NNS
  built/VBN
  using/VBG
  dialogflow/NN
  ,/,
  rasa/NN
  ,/,
  llm/NN
  ,/,
  slm/NN
  ./.)

7. Named Entities:


In [11]:
# Example Output Frequency Distribution (optional visualization)
fdist = FreqDist(lemmatized_tokens)
print("\nWord Frequency Distribution:")
print(fdist.most_common(5))


Word Frequency Distribution:
[(',', 3), ('chatbots', 1), ('built', 1), ('using', 1), ('dialogflow', 1)]


NER using SPACY

In [12]:
import spacy

# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

# Input text
input_text = "Skyscanner's chatbot helps users find and book flights, hotels, and car rentals by providing personalized travel recommendations and real-time pricing.."

# Process the text
doc = nlp(input_text)

# Extract Named Entities
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")



Named Entities:
Skyscanner (ORG)
