In [None]:
# ============================================================
# Environment Setup
# ============================================================

import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Make the directory two levels above the working directory importable so
# project-local packages can be resolved from this notebook.
project_root = os.path.abspath('../..')
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import re

# NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    accuracy_score, f1_score
)

# SageMaker imports
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.sklearn import SKLearnModel

# Configuration
# Resolve the IAM role, SageMaker session, S3 bucket and region. The shared
# project helper is tried first; an ImportError raised anywhere on that path
# switches to the SageMaker SDK defaults instead.
try:
    from utils.sagemaker_config import get_sagemaker_config
    config = get_sagemaker_config(s3_prefix='lab3-text-classification')
    role, session, bucket, region = (
        config[key] for key in ('role', 'session', 'bucket', 'region')
    )
except ImportError:
    print("Using fallback configuration")
    role = get_execution_role()
    session = sagemaker.Session()
    bucket, region = session.default_bucket(), session.boto_region_name

print("Configuration complete.")
print(f"Region: {region}")
print(f"S3 Bucket: {bucket}")

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## ⚠️ Endpoint Cleanup (Run if getting scaler.pkl error)

If you get the `scaler.pkl not found` error, run this cell to delete the old endpoints before redeploying.

In [None]:
# ============================================================
# Cleanup Old Endpoints (if needed)
# ============================================================

import boto3

print("🔍 Checking for old text-classification endpoints...")
print("=" * 60)

sm_client = boto3.client('sagemaker')

try:
    # list_endpoints is paginated (10 results per page by default); walk every
    # page so endpoints beyond the first page are not silently left running.
    paginator = sm_client.get_paginator('list_endpoints')
    endpoints = [
        ep
        for page in paginator.paginate(
            NameContains='text-classification',
            StatusEquals='InService'
        )
        for ep in page['Endpoints']
    ]

    if not endpoints:
        print("✅ No old endpoints found. You can proceed with deployment.")
    else:
        print(f"\n⚠️  Found {len(endpoints)} existing endpoint(s):\n")
        for ep in endpoints:
            print(f"  • {ep['EndpointName']} (Created: {ep['CreationTime']})")

        print("\n🗑️  Deleting old endpoints to avoid scaler.pkl errors...")
        failed = []
        for ep in endpoints:
            endpoint_name = ep['EndpointName']
            # Handle each deletion on its own so that one failure does not
            # leave the remaining endpoints running.
            try:
                sm_client.delete_endpoint(EndpointName=endpoint_name)
                print(f"✅ Deleted: {endpoint_name}")
            except Exception as e:
                failed.append(endpoint_name)
                print(f"⚠️  {endpoint_name}: {e}")

        if failed:
            print(f"\n⚠️  {len(failed)} endpoint(s) could not be deleted; remove them manually before redeploying.")
        else:
            print("\n✅ Cleanup complete! You can now deploy a new endpoint.")

except Exception as e:
    # Best effort: a listing failure (e.g. missing permissions) must not block
    # the rest of the notebook.
    print(f"Note: {e}")
    print("Proceeding with notebook...")

---

## Section 1: Data Generation and Text Preprocessing

We'll generate synthetic customer support tickets across different categories.

### Categories

| Category | Description | Example Issues |
|----------|-------------|----------------|
| Technical | Software bugs, errors | "App crashes when uploading", "Login error" |
| Billing | Payment, invoices | "Incorrect charge", "Refund request" |
| Account | Profile, settings | "Can't reset password", "Update email" |
| Product | Features, usage | "How to use feature X", "Product info" |


In [None]:
# ============================================================
# Generate Synthetic Support Ticket Dataset
# ============================================================

def generate_support_tickets(n_samples=2000, random_state=42):
    """
    Build a synthetic, labelled customer-support ticket dataset.

    Each of the four categories (Technical, Billing, Account, Product)
    receives ``n_samples // 4`` tickets. A ticket is a randomly chosen
    category template filled with randomly chosen filler phrases, then
    randomly re-cased and optionally decorated with an "URGENT: " prefix
    and/or a "!!!" suffix.

    Args:
        n_samples: Requested total number of tickets; the result holds
            ``4 * (n_samples // 4)`` rows.
        random_state: Seed given to ``np.random.seed`` (this reseeds NumPy's
            global generator) and to the final DataFrame shuffle.

    Returns:
        pandas.DataFrame with columns ``ticket_id``, ``text``, ``category``
        and ``priority``, shuffled and re-indexed from 0.
    """
    np.random.seed(random_state)

    # Sentence templates, keyed by the category label they produce
    templates = {
        'Technical': [
            "The app crashes when I try to {}",
            "Getting error message: {}",
            "Unable to {} in the system",
            "Bug report: {} not working",
            "Technical issue with {}",
            "Software freezes during {}",
            "Connection timeout when {}",
            "API error: {}",
            "Database sync failed for {}",
            "Performance issues with {}"
        ],
        'Billing': [
            "I was charged {} incorrectly",
            "Need refund for {}",
            "My invoice shows {} but should be {}",
            "Payment failed for {}",
            "Billing question about {}",
            "Double charged for {}",
            "Need receipt for {}",
            "Subscription cancelled but still charged {}",
            "Price changed without notice for {}",
            "Discount code {} not working"
        ],
        'Account': [
            "Cannot reset my password for {}",
            "Need to update my {} information",
            "Account locked, unable to {}",
            "Email verification not working for {}",
            "Profile {} needs to be changed",
            "Security question issue with {}",
            "Two-factor authentication problem for {}",
            "Want to delete my account {}",
            "Merge accounts for {}",
            "Access denied to {}"
        ],
        'Product': [
            "How do I use {} feature?",
            "Need information about {}",
            "Does the product support {}?",
            "Feature request: {}",
            "Documentation for {}",
            "Tutorial needed for {}",
            "Comparison between {} and {}",
            "Is {} available in my plan?",
            "How to configure {}",
            "Best practices for {}"
        ]
    }

    # Phrases substituted into the template placeholders
    fillers = {
        'actions': ['upload', 'download', 'export', 'import', 'share', 'sync', 'save', 'delete'],
        'features': ['dashboard', 'reports', 'analytics', 'integrations', 'API', 'mobile app', 'notifications'],
        'billing': ['subscription', 'upgrade', 'invoice', 'payment', 'trial period', 'annual plan'],
        'account': ['email', 'username', 'profile', 'settings', 'preferences', 'security'],
        'products': ['premium features', 'data export', 'team collaboration', 'storage limit', 'API access']
    }

    # Pool that fills the first placeholder, per category
    primary_pool = {
        'Technical': fillers['actions'] + fillers['features'],
        'Billing': fillers['billing'],
        'Account': fillers['account'],
        'Product': fillers['products'],
    }

    per_category = n_samples // 4
    rows = []  # (text, category) pairs in generation order

    for label, label_templates in templates.items():
        for _ in range(per_category):
            pattern = np.random.choice(label_templates)
            first = np.random.choice(primary_pool[label])

            # Two-slot templates take their second phrase from the product pool
            if pattern.count('{}') == 2:
                second = np.random.choice(fillers['products'])
                message = pattern.format(first, second)
            else:
                message = pattern.format(first)

            # Case variation: the lower-case draw only happens when the
            # upper-case draw did not fire.
            if np.random.random() < 0.3:
                message = message.upper()
            else:
                if np.random.random() < 0.3:
                    message = message.lower()

            # Occasional urgency prefix
            if np.random.random() < 0.2:
                message = "URGENT: " + message

            # Occasional trailing exclamation marks as noise
            if np.random.random() < 0.1:
                message = message + "!!!"

            rows.append((message, label))

    # Priorities are drawn once for the whole dataset, after all ticket text
    priorities = np.random.choice(['Low', 'Medium', 'High'], len(rows), p=[0.5, 0.3, 0.2])

    frame = pd.DataFrame({
        'ticket_id': [f'TKT_{i:06d}' for i in range(len(rows))],
        'text': [message for message, _ in rows],
        'category': [label for _, label in rows],
        'priority': priorities
    })

    # Shuffle the rows and renumber the index
    return frame.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build the 2,000-ticket dataset used by the rest of the notebook
print("Generating support ticket dataset...")
tickets_df = generate_support_tickets(n_samples=2000, random_state=42)

print(f"\nDataset shape: {tickets_df.shape}")

# Rows per category
category_counts = tickets_df['category'].value_counts()
print(f"\nCategory distribution:")
print(category_counts)

# The trailing expression is rendered by the notebook as the cell output
print(f"\nSample tickets:")
tickets_df.head(10)

In [None]:
# ============================================================
# Text Preprocessing
# ============================================================

def preprocess_text(text):
    """
    Normalise a raw ticket string before vectorisation.

    The text is lower-cased, every "urgent:" marker (with the whitespace that
    follows it) is removed, runs of "!" or "?" are collapsed to a single
    character, runs of whitespace are collapsed to one space, and leading and
    trailing whitespace is trimmed.

    Args:
        text: Raw ticket text (a ``str``).

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r'urgent:\s*', ''),  # drop urgency markers
        (r'!+', '!'),         # collapse repeated exclamation marks
        (r'\?+', '?'),        # collapse repeated question marks
        (r'\s+', ' '),        # collapse whitespace runs
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Store the cleaned text next to the raw text
tickets_df['text_clean'] = tickets_df['text'].map(preprocess_text)

print("Text Preprocessing Complete")
print("\nOriginal vs Cleaned Examples:")
# Show the first three rows before and after cleaning
for idx in (0, 1, 2):
    print(f"\nOriginal: {tickets_df.iloc[idx]['text']}")
    print(f"Cleaned:  {tickets_df.iloc[idx]['text_clean']}")

---

## Section 2: Exploratory Data Analysis

Understanding text characteristics and patterns.


In [None]:
# ============================================================
# Text Analysis
# ============================================================

# Character and word counts of the cleaned text
clean_text = tickets_df['text_clean']
tickets_df['text_length'] = clean_text.str.len()
tickets_df['word_count'] = clean_text.str.split().str.len()

print("Text Statistics by Category:")
print("="*60)
length_stats = tickets_df.groupby('category')[['text_length', 'word_count']].agg(['mean', 'median', 'std'])
print(length_stats)

# 2x2 grid: category counts, two box plots, priority breakdown
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left: number of tickets per category
ax = axes[0, 0]
category_counts = tickets_df['category'].value_counts()
category_counts.plot(kind='bar', ax=ax, color='skyblue')
ax.set_title('Ticket Distribution by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

# Top-right and bottom-left: per-category box plots of the two length measures
box_plots = (
    (axes[0, 1], 'text_length', 'Text Length Distribution by Category', 'Characters'),
    (axes[1, 0], 'word_count', 'Word Count Distribution by Category', 'Words'),
)
for ax, column, title, y_label in box_plots:
    tickets_df.boxplot(column=column, by='category', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Category')
    ax.set_ylabel(y_label)

# Bottom-right: stacked priority counts per category
ax = axes[1, 1]
priority_table = pd.crosstab(tickets_df['category'], tickets_df['priority'])
priority_table.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Priority by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.legend(title='Priority')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

---

## Section 3: Feature Extraction with TF-IDF

TF-IDF (Term Frequency-Inverse Document Frequency) is cost-effective and performs well for text classification.

### TF-IDF Advantages

- **Fast**: No GPU required
- **Interpretable**: See which words matter
- **Efficient**: Low memory footprint
- **Effective**: Good baseline performance


In [None]:
# ============================================================
# TF-IDF Vectorization
# ============================================================

# Inputs: cleaned text; targets: category names
X_text = tickets_df['text_clean']
y = tickets_df['category']

# Encode category names as integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Label mapping:")
for idx, label in enumerate(label_encoder.classes_):
    print(f"  {label}: {idx}")

# Stratified 80/20 split so each category keeps its share in both parts
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X_text, y_encoded, **split_options)

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# TF-IDF settings; the vocabulary is capped to keep the model lightweight
tfidf_options = {
    'max_features': 1000,       # Limit vocabulary size
    'ngram_range': (1, 2),      # Unigrams and bigrams
    'min_df': 2,                # Minimum document frequency
    'max_df': 0.8,              # Maximum document frequency
    'strip_accents': 'unicode',
    'lowercase': True,
    'stop_words': 'english',
}
tfidf = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary on the training split only, then reuse it for the test split
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"\nTF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")
print(f"Matrix density: {X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1]):.4f}")

# Rank terms by their mean TF-IDF weight within each category
print("\nTop 10 features per category:")
print("="*60)

feature_names = tfidf.get_feature_names_out()
for idx, category in enumerate(label_encoder.classes_):
    # Training rows that belong to this category
    category_mask = y_train == idx
    category_rows = X_train_tfidf[category_mask]
    category_tfidf = np.asarray(category_rows.mean(axis=0)).ravel()

    # Ten largest means, highest first
    top_indices = category_tfidf.argsort()[-10:][::-1]
    top_features = feature_names[top_indices]

    print(f"\n{category}:")
    print(f"  {', '.join(top_features)}")

---

## Section 4: Model Training and Comparison

Training multiple lightweight models for text classification.


In [None]:
# ============================================================
# Train Multiple Models
# ============================================================

# Candidate classifiers, keyed by display name
logistic_regression = LogisticRegression(
    random_state=42,
    max_iter=500,
    class_weight='balanced',
    solver='lbfgs'
)
models = {
    'Naive Bayes': MultinomialNB(alpha=0.1),
    'Logistic Regression': logistic_regression
}

# Per-model fitted estimator, test-set outputs and metrics
results = {}

print("Training models...\n")

for name, model in models.items():
    print(f"Training {name}...")

    # Fit on the training matrix, then score the held-out split
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    y_pred_proba = model.predict_proba(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    results[name] = {
        'model': model,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'accuracy': accuracy,
        'f1_macro': macro_f1
    }

    print(f"  Accuracy: {results[name]['accuracy']:.4f}")
    print(f"  F1 Score (macro): {results[name]['f1_macro']:.4f}\n")

# Highest test accuracy wins; on a tie the model listed first is kept
best_model_name = max(results, key=lambda candidate: results[candidate]['accuracy'])
best_model = results[best_model_name]['model']

print(f"Best model: {best_model_name}")
print(f"Accuracy: {results[best_model_name]['accuracy']:.4f}")

In [None]:
# ============================================================
# Detailed Evaluation
# ============================================================

# Test-set predictions of the selected model
y_pred_best = results[best_model_name]['predictions']
class_names = label_encoder.classes_

print(f"Detailed Evaluation: {best_model_name}")
print("="*60)

# Precision / recall / F1 per category
print("\nClassification Report:")
report = classification_report(y_test, y_pred_best, target_names=class_names)
print(report)

# Confusion matrix: rows are true categories, columns are predictions
cm = confusion_matrix(y_test, y_pred_best)

plt.figure(figsize=(10, 8))
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'Blues',
    'xticklabels': class_names,
    'yticklabels': class_names,
}
sns.heatmap(cm, **heatmap_options)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Category')
plt.xlabel('Predicted Category')
plt.show()

# Accuracy restricted to the test rows whose true label is each category
print("\nPer-Category Accuracy:")
for idx, category in enumerate(class_names):
    mask = y_test == idx
    true_subset, predicted_subset = y_test[mask], y_pred_best[mask]
    category_accuracy = accuracy_score(true_subset, predicted_subset)
    print(f"  {category}: {category_accuracy:.4f}")

---

## Section 5: Model Interpretation

Understanding which words drive predictions.


In [None]:
# ============================================================
# Feature Importance Analysis
# ============================================================

# Coefficients are only inspected when the selected model is the logistic regression
if best_model_name == 'Logistic Regression':
    feature_names = tfidf.get_feature_names_out()

    print("Most Important Words per Category:")
    print("="*60)

    for idx, category in enumerate(label_encoder.classes_):
        # One coefficient row per category
        coef = best_model.coef_[idx]

        # Ten largest coefficients, highest first
        top_positive_idx = coef.argsort()[-10:][::-1]

        print(f"\n{category} - Top indicators:")
        for word, score in zip(feature_names[top_positive_idx], coef[top_positive_idx]):
            print(f"  {word}: {score:.4f}")

# Run a few hand-written tickets through the full local pipeline
print("\n\nSample Predictions:")
print("="*60)

test_samples = [
    "app crashes when uploading files",
    "need refund for incorrect charge",
    "cannot reset my password",
    "how to use the new analytics feature"
]

for sample in test_samples:
    # Same cleaning and vectorizer as used for training
    sample_tfidf = tfidf.transform([preprocess_text(sample)])

    # Predicted class id, its probability vector and its category name
    pred_idx = best_model.predict(sample_tfidf)[0]
    pred_proba = best_model.predict_proba(sample_tfidf)[0]
    pred_category = label_encoder.classes_[pred_idx]

    print(f"\nText: '{sample}'")
    print(f"Predicted: {pred_category} (confidence: {pred_proba[pred_idx]:.2%})")
    print(f"All probabilities: {dict(zip(label_encoder.classes_, pred_proba))}")

---

## Section 6: Model Deployment

Package and deploy the text classification model to SageMaker.


In [None]:
# ============================================================
# Package Model for Deployment
# ============================================================

# Package the selected model for hosting: write the fitted artifacts, an
# inference script and a requirements file into a fresh local directory,
# archive that directory as model.tar.gz and upload the archive to S3.
import joblib
import tarfile
import shutil

# Create model directory
# Start from an empty directory so files from a previous run are not archived
model_dir = 'text_classification_model'
if os.path.exists(model_dir):
    shutil.rmtree(model_dir)
os.makedirs(model_dir)

# Save model artifacts
# The classifier, the fitted TF-IDF vectorizer and the label encoder are the
# three files that model_fn in the inference script below loads back.
joblib.dump(best_model, os.path.join(model_dir, 'model.pkl'))
joblib.dump(tfidf, os.path.join(model_dir, 'tfidf.pkl'))
joblib.dump(label_encoder, os.path.join(model_dir, 'label_encoder.pkl'))

print("Model artifacts saved")

# Create inference script
# The string below is written verbatim to inference.py. It defines:
#   model_fn   - loads the three pickled artifacts into a dict
#   input_fn   - turns a JSON dict / list (or a raw body) into a list of texts
#   predict_fn - cleans, vectorizes and classifies each text
#   output_fn  - serializes the predictions as JSON, rejecting other accept types
# Its preprocess_text repeats the same substitutions as the notebook's
# preprocess_text. This is a normal (non-raw) string, so the doubled
# backslashes in the regex patterns are written to the file as single ones.
inference_code = """
import joblib
import os
import json
import re

def model_fn(model_dir):
    model = joblib.load(os.path.join(model_dir, 'model.pkl'))
    tfidf = joblib.load(os.path.join(model_dir, 'tfidf.pkl'))
    label_encoder = joblib.load(os.path.join(model_dir, 'label_encoder.pkl'))
    return {'model': model, 'tfidf': tfidf, 'label_encoder': label_encoder}

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'urgent:\\s*', '', text)
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\\?+', '?', text)
    text = re.sub(r'\\s+', ' ', text)
    return text.strip()

def input_fn(request_body, request_content_type):
    if request_content_type == 'application/json':
        data = json.loads(request_body)
        if isinstance(data, dict):
            return [data.get('text', '')]
        elif isinstance(data, list):
            return [item.get('text', '') if isinstance(item, dict) else str(item) for item in data]
        else:
            return [str(data)]
    else:
        return [request_body]

def predict_fn(input_data, model_dict):
    model = model_dict['model']
    tfidf = model_dict['tfidf']
    label_encoder = model_dict['label_encoder']
    
    # Preprocess
    texts_clean = [preprocess_text(text) for text in input_data]
    
    # Vectorize
    X_tfidf = tfidf.transform(texts_clean)
    
    # Predict
    predictions = model.predict(X_tfidf)
    probabilities = model.predict_proba(X_tfidf)
    
    # Format results
    results = []
    for pred_idx, proba in zip(predictions, probabilities):
        category = label_encoder.classes_[pred_idx]
        confidence = float(proba[pred_idx])
        all_probs = {label: float(prob) for label, prob in zip(label_encoder.classes_, proba)}
        
        results.append({
            'category': category,
            'confidence': confidence,
            'all_probabilities': all_probs
        })
    
    return results

def output_fn(prediction, accept):
    if accept == 'application/json':
        return json.dumps(prediction), accept
    raise ValueError(f'Unsupported accept type: {accept}')
"""

with open(os.path.join(model_dir, 'inference.py'), 'w') as f:
    f.write(inference_code)

print("Inference script created")

# Create requirements
# NOTE(review): these pins are fixed strings and are not derived from the
# packages installed in this notebook - confirm they match the versions that
# pickled the artifacts and the hosting container's framework version.
requirements = """scikit-learn==1.3.0
joblib==1.3.2
numpy==1.24.3
"""

with open(os.path.join(model_dir, 'requirements.txt'), 'w') as f:
    f.write(requirements)

print("Requirements file created")

# Create tar.gz
# arcname='.' stores the directory contents at the root of the archive
# rather than under a 'text_classification_model/' prefix.
model_archive = 'model.tar.gz'
with tarfile.open(model_archive, 'w:gz') as tar:
    tar.add(model_dir, arcname='.')

print(f"Model archive created: {model_archive}")

# Upload to S3
# The key embeds a timestamp, so each run uploads to its own prefix
model_s3_key = f'lab3-text-classification/models/{datetime.now().strftime("%Y%m%d-%H%M%S")}/model.tar.gz'
s3_client = boto3.client('s3')
s3_client.upload_file(model_archive, bucket, model_s3_key)

model_s3_uri = f's3://{bucket}/{model_s3_key}'
print(f"\nModel uploaded to: {model_s3_uri}")

In [None]:
# ============================================================
# Deploy to SageMaker Endpoint
# ============================================================

from sagemaker.sklearn import SKLearnModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# One timestamp shared by the model and the endpoint, so the two resources
# created by this run carry the same, easily matched name.
deployment_name = f'text-classification-{datetime.now().strftime("%Y%m%d-%H%M%S")}'

# Create model
# inference.py only exists inside model_dir (the packaging cell writes it
# there), so the entry point must be given as a path into that directory.
sklearn_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    entry_point=os.path.join(model_dir, 'inference.py'),
    framework_version='1.2-1',
    py_version='py3',
    name=deployment_name
)

print("Deploying model to endpoint...")
print("This will take 5-10 minutes...")

# Deploy
# The inference script only accepts and returns 'application/json', while the
# SKLearn predictor defaults to NumPy (de)serialization; request JSON
# explicitly so predictor.predict() can send dicts/lists and receive parsed
# JSON back.
predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name=deployment_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\nEndpoint deployed: {predictor.endpoint_name}")

In [None]:
# ============================================================
# Test Endpoint
# ============================================================

# One hand-written ticket per expected category, in the JSON shape the
# endpoint's input handler reads ({"text": ...})
sample_texts = (
    "The application crashes every time I try to upload a document",
    "I was charged twice for my subscription this month",
    "How do I reset my password? The email link expired",
    "What features are included in the premium plan?",
)
test_tickets = [{"text": sample_text} for sample_text in sample_texts]

print("Testing endpoint with sample tickets:")
print("="*60)

# Single-ticket requests: the endpoint replies with a one-element list
for ticket in test_tickets:
    response = predictor.predict(ticket)

    print(f"\nTicket: '{ticket['text']}'")
    print(f"Category: {response[0]['category']}")
    print(f"Confidence: {response[0]['confidence']:.2%}")
    print(f"All probabilities: {response[0]['all_probabilities']}")

# Batch request: all tickets in one call, results returned in the same order
print("\n\nBatch prediction test:")
print("="*60)

batch_response = predictor.predict(test_tickets)

for i, ticket in enumerate(test_tickets):
    if i >= len(batch_response):
        break
    result = batch_response[i]
    print(f"\n{i+1}. {result['category']} ({result['confidence']:.2%}) - '{ticket['text'][:50]}...'")

---

## Section 7: SageMaker Clarify - Explainability


In [None]:
# ============================================================
# SageMaker Clarify - Model Explainability
# ============================================================

from sagemaker import clarify

print("SageMaker Clarify - Model Explainability")
print("=" * 60)
print("\nClarify helps explain model predictions and detect bias\n")

# Manual Feature Importance Analysis (cost-effective alternative)
print("Feature Importance Analysis:")
print("-" * 60)

# Bind the names the explainability cells use. `vectorizer` is the fitted
# TF-IDF vectorizer. `model` must expose per-class coefficients: use the
# selected model when it has `coef_`, otherwise fall back to the trained
# logistic regression (Naive Bayes has no `coef_` in recent scikit-learn).
vectorizer = tfidf
model = (
    best_model
    if hasattr(best_model, 'coef_')
    else results['Logistic Regression']['model']
)

# Model coefficients: one row per category, one column per TF-IDF feature
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_

# For each category
categories = label_encoder.classes_
for idx, category in enumerate(categories):
    print(f"\n{category.upper()} Category:")
    print("-" * 40)

    # Ten largest positive weights, highest first
    top_indices = np.argsort(coefficients[idx])[-10:][::-1]
    print(f"Top positive indicators:")
    for i, feat_idx in enumerate(top_indices, 1):
        print(f"  {i}. '{feature_names[feat_idx]}' (weight: {coefficients[idx][feat_idx]:.3f})")

# Plot the weights for each category (2x2 grid, one panel per category)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for idx, category in enumerate(categories):
    # Top 15 features by absolute weight
    top_features_idx = np.argsort(np.abs(coefficients[idx]))[-15:]
    top_features = [feature_names[i] for i in top_features_idx]
    top_weights = coefficients[idx][top_features_idx]

    # Green bars push towards the category, red bars push away from it
    colors = ['green' if w > 0 else 'red' for w in top_weights]
    axes[idx].barh(range(len(top_features)), top_weights, color=colors, alpha=0.7)
    axes[idx].set_yticks(range(len(top_features)))
    axes[idx].set_yticklabels(top_features)
    axes[idx].set_xlabel('Weight')
    axes[idx].set_title(f'{category} - Feature Importance')
    axes[idx].axvline(x=0, color='black', linestyle='--', linewidth=0.5)
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('clarify_feature_importance.png', dpi=100, bbox_inches='tight')
print(f"\n📊 Feature importance visualization saved")
plt.show()

In [None]:
# ============================================================
# Explain Individual Predictions
# ============================================================

print("\n\nExplaining Individual Predictions:")
print("=" * 60)

# Everything this cell needs is resolved here so it does not depend on names
# left behind by other cells. The explanation requires per-class coefficients:
# use the selected model when it has `coef_`, otherwise the trained logistic
# regression.
explain_model = (
    best_model
    if hasattr(best_model, 'coef_')
    else results['Logistic Regression']['model']
)
feature_names = tfidf.get_feature_names_out()
coefficients = explain_model.coef_

# One example ticket per category
example_tickets = {
    'Technical': "The software freezes when I try to export data",
    'Billing': "I was charged for a service I cancelled last month",
    'Account': "I forgot my username and need to recover my account",
    'Product': "What's the difference between the basic and pro plans?"
}

for category, ticket_text in example_tickets.items():
    print(f"\n{category} Example:")
    print(f"Ticket: '{ticket_text}'")
    print("-" * 60)

    # Clean the ticket exactly as the training data was cleaned, then vectorize
    ticket_vector = tfidf.transform([preprocess_text(ticket_text)])

    # Predicted class id and class probabilities
    prediction = explain_model.predict(ticket_vector)[0]
    probabilities = explain_model.predict_proba(ticket_vector)[0]

    # Terms present in the ticket, ranked by the size of their coefficient
    # for the predicted class
    feature_indices = ticket_vector.nonzero()[1]
    ticket_features = [(feature_names[i], ticket_vector[0, i], coefficients[prediction][i])
                       for i in feature_indices]
    ticket_features.sort(key=lambda x: abs(x[2]), reverse=True)

    print(f"Predicted: {label_encoder.classes_[prediction]}")
    print(f"Confidence: {max(probabilities):.2%}")
    print(f"\nTop influential words:")
    # The loop variable must not be called `tfidf`: that would overwrite the
    # fitted vectorizer with a float.
    for word, tfidf_weight, coef in ticket_features[:5]:
        impact = tfidf_weight * coef
        print(f"  '{word}': TF-IDF={tfidf_weight:.3f}, Coef={coef:.3f}, Impact={impact:.3f}")

---

### SageMaker Clarify Benefits

**Model Explainability:**
- Feature importance scores (which words matter most)
- SHAP values for individual predictions
- Global and local explanations

**Bias Detection:**
- Pre-training bias (data imbalance)
- Post-training bias (model fairness)
- Fairness metrics across groups

**Use Cases:**
- Compliance (GDPR, regulatory requirements)
- Model debugging and improvement
- Building trust with stakeholders
- Identifying data quality issues

**Production Integration:**
```python
# In production, use Clarify for:
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large'
)

# Run explainability analysis
clarify_processor.run_explainability(
    data_config=data_config,
    model_config=model_config,
    explainability_config=shap_config
)
```

**Key Metrics:**
- Feature importance rankings
- SHAP values (contribution of each feature)
- Bias metrics (DI, DPL, KL divergence)
- Fairness metrics


In [None]:
# ============================================================
# Cleanup All Resources
# ============================================================

print("🧹 Cleaning up Lab 3 resources...")
print("=" * 60)

# Delete ALL text-classification endpoints
sm_client = boto3.client('sagemaker')

try:
    # list_endpoints is paginated (10 results per page by default); walk every
    # page so no endpoint is left running - and billing - after cleanup.
    paginator = sm_client.get_paginator('list_endpoints')
    endpoints = [
        ep
        for page in paginator.paginate(NameContains='text-classification')
        for ep in page['Endpoints']
    ]

    if endpoints:
        print(f"\n🗑️  Deleting {len(endpoints)} endpoint(s)...")
        for ep in endpoints:
            endpoint_name = ep['EndpointName']
            # Each deletion is handled on its own so one failure does not
            # stop the remaining endpoints from being removed.
            try:
                sm_client.delete_endpoint(EndpointName=endpoint_name)
                print(f"  ✅ Deleted: {endpoint_name}")
            except Exception as e:
                print(f"  ⚠️  {endpoint_name}: {e}")
    else:
        print("  No endpoints to delete")
except Exception as e:
    print(f"  Error listing endpoints: {e}")

# Cleanup local files
# The globals() checks let this cell run even when the packaging cell (which
# defines model_dir / model_archive) was never executed in this session.
print("\n📁 Cleaning up local files...")
try:
    if 'model_dir' in globals() and os.path.exists(model_dir):
        shutil.rmtree(model_dir)
        print("  ✅ Deleted model directory")
    if 'model_archive' in globals() and os.path.exists(model_archive):
        os.remove(model_archive)
        print("  ✅ Deleted model archive")
except Exception as e:
    print(f"  ⚠️  {e}")

print("\n✅ Lab 3 cleanup complete!")
print("💡 Note: S3 data and models are kept for future use")

---

## Summary

In this lab, you:

1. Generated and preprocessed synthetic support ticket data
2. Extracted features using TF-IDF vectorization (cost-effective)
3. Trained and compared lightweight classification models
4. Interpreted model predictions with feature importance
5. Deployed the model to a SageMaker endpoint

### Key Takeaways

- **TF-IDF**: Cost-effective alternative to transformers for many tasks
- **Text Preprocessing**: Critical for model performance
- **Model Selection**: Logistic Regression and Naive Bayes are fast and effective
- **Interpretability**: TF-IDF features are easy to understand
- **SageMaker**: Simplifies deployment and scaling

### Cost Optimization

| Component | Choice | Savings |
|-----------|--------|---------|
| Model | TF-IDF + Logistic Regression | No GPU needed |
| Instance | ml.m5.large | 90% cheaper than GPU |
| Training | < 1 minute | Minimal compute cost |
| Features | Limited to 1000 | Smaller model size |

### Next Steps

- Lab 4: Sentiment Analysis
- Try different n-gram ranges
- Experiment with different vectorization parameters
- Add custom preprocessing rules

---

**Remember to delete your endpoint to avoid charges!**
