In [3]:
# =================================================================
# MASTER SCRIPT: THE END-TO-END NLP JOURNEY
# =================================================================

# --- 1. SETUP & LIBRARIES ---
# We install sentence-transformers for the 'Semantic Brain' in Module 3
!pip install -U sentence-transformers -q

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

# Download the 'Rulebooks' (dictionaries) for language processing
nltk.download('punkt')      # For splitting sentences into words
nltk.download('stopwords')  # For identifying filler words (is, the, etc.)
nltk.download('wordnet')    # For finding the dictionary roots of words
nltk.download('omw-1.4')    # Supporting data for lemmatization
nltk.download('punkt_tab')  # Required for word tokenization based on the error

print("üöÄ SETUP COMPLETE: All libraries and dictionaries loaded.\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


üöÄ SETUP COMPLETE: All libraries and dictionaries loaded.



In [4]:
# =================================================================
# MODULE 1: THE CLEANING PIPELINE (Foundation)
# =================================================================
print("üõ†Ô∏è --- MODULE 1: CLEANING & PREPROCESSING ---")

# Our raw input: messy, mixed case, symbols, and filler words
raw_input = "The system is SEARCHING for 'AI Conferences' üìÖ in NYC... and it's EXCITING! üöÄ"
print(f"1. RAW TEXT: {raw_input}")

# Step 1: Lowercasing & Tokenization
# Converts to lowercase and splits into a list of words
tokens = word_tokenize(raw_input.lower())
print(f"2. TOKENIZED (Split & Lowercase): {tokens}")

# Step 2: Noise Removal (Stopwords & Symbols)
# Removes emojis, punctuation, and common filler words like 'is' or 'the'
stop_words = set(stopwords.words('english'))
clean_tokens = [w for w in tokens if w.isalnum() and w not in stop_words]
print(f"3. CLEANED (No Noise/Filler): {clean_tokens}")

# Step 3: Lemmatization
# Reduces words to their dictionary root (e.g., 'searching' -> 'search')
lemmatizer = WordNetLemmatizer()
processed_tokens = [lemmatizer.lemmatize(w) for w in clean_tokens]
clean_sentence = " ".join(processed_tokens)
print(f"4. LEMMATIZED (Root Words): {processed_tokens}")
print(f"üí° CONCEPT: We turned 'messy speech' into 'clean keywords' for the AI.\n")

üõ†Ô∏è --- MODULE 1: CLEANING & PREPROCESSING ---
1. RAW TEXT: The system is SEARCHING for 'AI Conferences' üìÖ in NYC... and it's EXCITING! üöÄ
2. TOKENIZED (Split & Lowercase): ['the', 'system', 'is', 'searching', 'for', "'ai", 'conferences', "'", 'üìÖ', 'in', 'nyc', '...', 'and', 'it', "'s", 'exciting', '!', 'üöÄ']
3. CLEANED (No Noise/Filler): ['system', 'searching', 'conferences', 'nyc', 'exciting']
4. LEMMATIZED (Root Words): ['system', 'searching', 'conference', 'nyc', 'exciting']
üí° CONCEPT: We turned 'messy speech' into 'clean keywords' for the AI.



In [5]:
# =================================================================
# MODULE 2: VECTORIZATION (The Mathematical Bridge)
# =================================================================
print("üìä --- MODULE 2: VECTORIZATION (TF-IDF) ---")

# Let's compare our cleaned sentence to other known sentences
knowledge_base = [
    clean_sentence,
    "students learning artificial intelligence technology",
    "machine learning models need data training"
]

# Create the TF-IDF Vectorizer
# This calculates word importance (Rare words = High score)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(knowledge_base)

# Convert to a Table for visualization
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=vectorizer.get_feature_names_out(),
                        index=['Our Sentence', 'AI Lesson', 'ML Training'])

print("5. TF-IDF MATRIX (The 'Math Map' of our words):")
print(df_tfidf.round(2))
print("üí° CONCEPT: Every word is now a number. Machines can now 'calculate' language.\n")

üìä --- MODULE 2: VECTORIZATION (TF-IDF) ---
5. TF-IDF MATRIX (The 'Math Map' of our words):
              artificial  conference  data  exciting  intelligence  learning  \
Our Sentence        0.00        0.45  0.00      0.45          0.00      0.00   
AI Lesson           0.47        0.00  0.00      0.00          0.47      0.36   
ML Training         0.00        0.00  0.42      0.00          0.00      0.32   

              machine  models  need   nyc  searching  students  system  \
Our Sentence     0.00    0.00  0.00  0.45       0.45      0.00    0.45   
AI Lesson        0.00    0.00  0.00  0.00       0.00      0.47    0.00   
ML Training      0.42    0.42  0.42  0.00       0.00      0.00    0.00   

              technology  training  
Our Sentence        0.00      0.00  
AI Lesson           0.47      0.00  
ML Training         0.00      0.42  
üí° CONCEPT: Every word is now a number. Machines can now 'calculate' language.



In [7]:
# =================================================================
# MODULE 3: SEMANTIC UNDERSTANDING & AGENCY (The Brain)
# =================================================================
print("üß† --- MODULE 3: SEMANTIC SEARCH & AGENTIC ACTION ---")

# Load a Pre-trained Transformer Model (A 'Brain' that understands meaning)
# This model represents every sentence as a 384-dimensional 'Meaning Vector'
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define the Agent's potential Actions (Tools)
agent_actions = [
    "Action: Book Travel (flight, NYC, tickets, conference)",
    "Action: Data Science (models, training, cleanup)",
    "Action: Technical Support (error, password, system)"
]

# Generate 'Embeddings' (Deep-meaning math) for the tools and our query
tool_embeddings = model.encode(agent_actions, convert_to_tensor=True)
query_embedding = model.encode(clean_sentence, convert_to_tensor=True)

# SEMANTIC SEARCH: Match the query to the best tool based on CONCEPT
# Note: It matches 'NYC' and 'Search' to 'Book Travel' even without identical words
search_results = util.semantic_search(query_embedding, tool_embeddings, top_k=1)[0]
best_match = agent_actions[search_results[0]['corpus_id']]
confidence = search_results[0]['score']

print(f"6. USER INTENT DETECTED: {clean_sentence}")
print(f"7. BEST MATCHING ACTION: {best_match}")
print(f"8. CONFIDENCE SCORE: {confidence:.4f}")

# AGENTIC LOGIC: Decision Gate
# If the AI is confident, it acts. If not, it asks for help.
THRESHOLD = 0.45
if confidence > THRESHOLD:
    print(f"üöÄ AGENT DECISION: Match Confirmed. Executing {best_match.split(':')[1]}...")
else:
    print("‚ö†Ô∏è AGENT DECISION: Confidence too low. Asking user for clarification.")

print("\n" + "="*60 + "\n‚úÖ NLP MASTERCLASS COMPLETE!")

üß† --- MODULE 3: SEMANTIC SEARCH & AGENTIC ACTION ---
6. USER INTENT DETECTED: system searching conference nyc exciting
7. BEST MATCHING ACTION: Action: Book Travel (flight, NYC, tickets, conference)
8. CONFIDENCE SCORE: 0.4623
üöÄ AGENT DECISION: Match Confirmed. Executing  Book Travel (flight, NYC, tickets, conference)...

‚úÖ NLP MASTERCLASS COMPLETE!
