In [None]:
# ============================================================
# Environment Setup
# ============================================================

import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = os.path.abspath('../..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, f1_score
)

# SageMaker imports
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.sklearn import SKLearnModel
from sagemaker.feature_store.feature_group import FeatureGroup

# Configuration: prefer the project's shared helper; fall back to plain
# SageMaker SDK defaults when the helper module cannot be imported.
try:
    from utils.sagemaker_config import get_sagemaker_config
    config = get_sagemaker_config(s3_prefix='lab2-churn')
    role, session, bucket, region = (
        config[key] for key in ('role', 'session', 'bucket', 'region')
    )
except ImportError:
    print("Using fallback configuration")
    role = get_execution_role()
    session = sagemaker.Session()
    bucket, region = session.default_bucket(), session.boto_region_name

# One call, three lines of output (same text as three separate prints)
print(
    "Configuration complete.",
    f"Region: {region}",
    f"S3 Bucket: {bucket}",
    sep="\n",
)

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

---

## Section 1: Data Generation and Exploration

We'll generate synthetic customer data representing a telecommunications company with realistic churn patterns.

### Feature Categories

| Category | Features | Description |
|----------|----------|-------------|
| Demographics | Age, Gender, Tenure | Customer profile |
| Services | Internet, Phone, Streaming | Service usage |
| Contract | Type, Payment Method | Contract details |
| Usage | Minutes, Data GB, Support Calls | Usage patterns |
| Billing | Monthly Charges, Total Charges | Financial metrics |


In [None]:
# ============================================================
# Generate Synthetic Churn Dataset
# ============================================================

def generate_churn_data(n_customers=5000, random_state=42):
    """
    Generate synthetic customer churn dataset

    Draws demographic, service, contract, usage and billing attributes for
    ``n_customers`` synthetic telecom customers, builds a per-customer churn
    probability from a fixed set of risk and protective factors, and samples
    a binary ``churn`` label from that probability.

    All randomness comes from a private ``np.random.RandomState(random_state)``
    so calling this function neither reseeds nor advances NumPy's global
    random generator. For a given seed the draws are the same as those
    obtained by seeding the global legacy generator with that seed and
    sampling in the same order.

    Args:
        n_customers: Number of customers (rows) to generate.
        random_state: Seed passed to ``np.random.RandomState``.

    Returns:
        pandas.DataFrame with one row per customer and the columns
        ``customer_id``, ``age``, ``gender``, ``tenure_months``,
        ``has_internet``, ``has_phone_service``, ``has_streaming``,
        ``has_tech_support``, ``contract_type``, ``payment_method``,
        ``monthly_minutes``, ``monthly_data_gb``, ``support_calls``,
        ``monthly_charges``, ``total_charges`` and ``churn`` (0/1).
    """
    # Private generator: keeps the function reproducible without touching
    # global NumPy random state. The order of draws below is significant.
    rng = np.random.RandomState(random_state)

    # Demographics
    age = rng.normal(45, 15, n_customers).clip(18, 80).astype(int)
    gender = rng.choice(['Male', 'Female'], n_customers)
    tenure = rng.exponential(24, n_customers).clip(0, 72).astype(int)

    # Services (streaming and tech support are only possible with internet)
    has_internet = rng.choice([0, 1], n_customers, p=[0.2, 0.8])
    has_phone = rng.choice([0, 1], n_customers, p=[0.1, 0.9])
    has_streaming = has_internet * rng.choice([0, 1], n_customers, p=[0.6, 0.4])
    has_tech_support = has_internet * rng.choice([0, 1], n_customers, p=[0.7, 0.3])

    # Contract
    contract_type = rng.choice(['Month-to-month', 'One year', 'Two year'],
                               n_customers, p=[0.5, 0.3, 0.2])
    payment_method = rng.choice(['Electronic', 'Mailed check', 'Bank transfer', 'Credit card'],
                                n_customers, p=[0.4, 0.2, 0.2, 0.2])

    # Usage patterns (zeroed out when the matching service is absent)
    monthly_minutes = rng.gamma(3, 200, n_customers) * has_phone
    monthly_data_gb = rng.gamma(2, 15, n_customers) * has_internet
    support_calls = rng.poisson(1.5, n_customers).clip(0, 10)

    # Billing
    base_charge = 30 + (has_internet * 30) + (has_phone * 20) + (has_streaming * 15)
    monthly_charges = base_charge + rng.normal(0, 5, n_customers)
    total_charges = monthly_charges * tenure + rng.normal(0, 100, n_customers)
    total_charges = total_charges.clip(0)  # noise must not produce negative totals

    # Create churn probability based on risk factors
    churn_prob = np.full(n_customers, 0.1)  # Base probability

    # Risk factors increase churn
    churn_prob += (tenure < 6) * 0.3  # New customers
    churn_prob += (contract_type == 'Month-to-month') * 0.25  # No commitment
    churn_prob += (support_calls > 3) * 0.2  # High support needs
    churn_prob += (monthly_charges > 80) * 0.15  # High bills
    churn_prob += (has_tech_support == 0) * 0.1  # No tech support

    # Protective factors decrease churn
    churn_prob -= (tenure > 24) * 0.2  # Loyal customers
    churn_prob -= (contract_type == 'Two year') * 0.25  # Long commitment
    churn_prob -= (has_streaming == 1) * 0.1  # More engaged

    churn_prob = churn_prob.clip(0, 0.9)
    churn = (rng.random_sample(n_customers) < churn_prob).astype(int)

    # Create DataFrame
    df = pd.DataFrame({
        'customer_id': [f'CUST_{i:06d}' for i in range(n_customers)],
        'age': age,
        'gender': gender,
        'tenure_months': tenure,
        'has_internet': has_internet,
        'has_phone_service': has_phone,
        'has_streaming': has_streaming,
        'has_tech_support': has_tech_support,
        'contract_type': contract_type,
        'payment_method': payment_method,
        'monthly_minutes': monthly_minutes.round(0),
        'monthly_data_gb': monthly_data_gb.round(1),
        'support_calls': support_calls,
        'monthly_charges': monthly_charges.round(2),
        'total_charges': total_charges.round(2),
        'churn': churn
    })

    return df

# Build the working dataset used by every later cell
print("Generating customer churn dataset...")
churn_df = generate_churn_data(n_customers=5000, random_state=42)

# Quick sanity summary: size and overall churn rate
print(
    f"\nDataset shape: {churn_df.shape}",
    f"Churn rate: {churn_df['churn'].mean()*100:.1f}%",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell -> rendered as a rich table by the notebook
churn_df.head()

In [None]:
# ============================================================
# Exploratory Data Analysis
# ============================================================

print("Dataset Overview:")
print("="*60)
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
churn_df.info()

print("\n\nNumerical Features Statistics:")
print("="*60)
print(churn_df.describe())

print("\n\nCategorical Features Distribution:")
print("="*60)
categorical_cols = ['gender', 'contract_type', 'payment_method']
for col in categorical_cols:
    # Per-category row counts for each categorical column
    print(f"\n{col}:")
    print(churn_df[col].value_counts())

print("\n\nChurn Distribution:")
print("="*60)
print(churn_df['churn'].value_counts())
print(f"Churn rate: {churn_df['churn'].mean()*100:.2f}%")

In [None]:
# ============================================================
# Visualizations
# ============================================================

fig, axes = plt.subplots(2, 3, figsize=(15, 10))


def _overlay_churn_hist(ax, column, bins):
    """Overlay histograms of `column` for retained vs. churned customers on `ax`."""
    for churn_flag, legend_label in ((0, 'Not Churned'), (1, 'Churned')):
        subset = churn_df.loc[churn_df['churn'] == churn_flag, column]
        subset.hist(ax=ax, alpha=0.6, label=legend_label, bins=bins)


# 1. Churn by tenure (mean churn per tenure month, line plot)
ax = axes[0, 0]
tenure_churn_rate = churn_df.groupby('tenure_months')['churn'].mean()
tenure_churn_rate.plot(ax=ax)
ax.set(title='Churn Rate by Tenure', xlabel='Tenure (months)', ylabel='Churn Rate')
ax.grid(True, alpha=0.3)

# 2. Churn by contract type
ax = axes[0, 1]
churn_by_contract = churn_df.groupby('contract_type')['churn'].mean()
churn_by_contract.plot(kind='bar', ax=ax, color='coral')
ax.set(title='Churn Rate by Contract Type', ylabel='Churn Rate')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

# 3. Monthly charges distribution
ax = axes[0, 2]
_overlay_churn_hist(ax, 'monthly_charges', bins=30)
ax.set(title='Monthly Charges Distribution', xlabel='Monthly Charges ($)')
ax.legend()

# 4. Support calls vs churn
ax = axes[1, 0]
churn_by_support = churn_df.groupby('support_calls')['churn'].mean()
churn_by_support.plot(kind='bar', ax=ax, color='skyblue')
ax.set(
    title='Churn Rate by Support Calls',
    xlabel='Number of Support Calls',
    ylabel='Churn Rate',
)

# 5. Age distribution
ax = axes[1, 1]
_overlay_churn_hist(ax, 'age', bins=20)
ax.set(title='Age Distribution', xlabel='Age')
ax.legend()

# 6. Correlation heatmap over the main numeric columns and the target
ax = axes[1, 2]
numeric_cols = ['age', 'tenure_months', 'monthly_charges', 'total_charges', 'support_calls', 'churn']
corr_matrix = churn_df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax, center=0)
ax.set_title('Feature Correlations')

plt.tight_layout()
plt.show()

print("\nKey Observations:")
for observation in (
    "- New customers (< 6 months) have higher churn",
    "- Month-to-month contracts have highest churn rate",
    "- High support calls correlate with churn",
    "- Monthly charges show moderate correlation with churn",
):
    print(observation)

---

## Section 2: Feature Engineering

Creating derived features to improve model performance.

### Feature Engineering Strategies

1. **Interaction Features**: Combine related features
2. **Binning**: Group continuous variables
3. **Encoding**: Convert categorical to numerical
4. **Scaling**: Normalize numerical features


In [None]:
# ============================================================
# Feature Engineering
# ============================================================

def engineer_features(df):
    """Return a copy of `df` extended with derived churn-prediction features.

    The input frame is left untouched. Twelve columns are appended, in this
    order: avg_monthly_charge, charge_per_service, service_count,
    minutes_per_month, data_gb_per_month, is_new_customer,
    high_support_calls, high_charges, has_long_contract, electronic_payment,
    age_group, tenure_group.
    """
    # Shared intermediates; "+ 1" keeps the ratios finite for zero tenure
    # and for customers with no services.
    months_plus_one = df['tenure_months'] + 1
    n_services = (
        df['has_internet'] + df['has_phone_service']
        + df['has_streaming'] + df['has_tech_support']
    )
    charges = df['monthly_charges']

    derived = {
        # 1. Customer value metrics
        'avg_monthly_charge': df['total_charges'] / months_plus_one,
        'charge_per_service': charges / (n_services + 1),
        # 2. Engagement metrics
        'service_count': n_services,
        'minutes_per_month': df['monthly_minutes'] / months_plus_one,
        'data_gb_per_month': df['monthly_data_gb'] / months_plus_one,
        # 3. Risk indicators (high_charges is relative to this frame's median)
        'is_new_customer': (df['tenure_months'] < 6).astype(int),
        'high_support_calls': (df['support_calls'] > 3).astype(int),
        'high_charges': (charges > charges.median()).astype(int),
        # 4. Contract stability
        'has_long_contract': df['contract_type'].isin(['One year', 'Two year']).astype(int),
        'electronic_payment': (df['payment_method'] == 'Electronic').astype(int),
        # 5. Age groups (right-inclusive bins)
        'age_group': pd.cut(
            df['age'],
            bins=[0, 30, 45, 60, 100],
            labels=['Young', 'Middle', 'Senior', 'Elderly'],
        ),
        # 6. Tenure groups (lower edge -1 so that tenure 0 falls in 'New')
        'tenure_group': pd.cut(
            df['tenure_months'],
            bins=[-1, 6, 24, 48, 100],
            labels=['New', 'Recent', 'Established', 'Loyal'],
        ),
    }

    # assign() works on a copy and appends the columns in dict order
    return df.assign(**derived)

# Derive the engineered frame, then report which columns were added
churn_fe = engineer_features(churn_df)
new_features = [col for col in churn_fe.columns if col not in churn_df.columns]

print("New Features Created:")
for feat in new_features:
    print(f"  - {feat}")

print(f"\nTotal features: {churn_fe.shape[1]}")
# Last expression of the cell -> rendered as a rich table by the notebook
churn_fe.head()

In [None]:
# ============================================================
# Data Preparation for Modeling
# ============================================================

# Target vector and feature frame (identifier and label removed)
y = churn_fe['churn']
X = churn_fe.drop(columns=['customer_id', 'churn'])

# Encode categorical variables
categorical_features = X.select_dtypes(include=['object', 'category']).columns
print(f"Encoding {len(categorical_features)} categorical features:")
for feat in categorical_features:
    print(f"  - {feat}")

# One-hot encoding; the first level of each feature is dropped
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

n_features = X_encoded.shape[1]
print(f"\nFeature matrix shape after encoding: {X_encoded.shape}")
print(f"Number of features: {n_features}")

# Train-test split (stratified on the target so both parts keep the churn rate)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train churn rate: {y_train.mean()*100:.1f}%")
print(f"Test churn rate: {y_test.mean()*100:.1f}%")

# Scale features: statistics are learned on the training split only and
# then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures scaled using StandardScaler")

---

## Section 3: Model Training and Comparison

We'll train multiple models and compare their performance.


In [None]:
# ============================================================
# Train Multiple Models with Class Weights
# ============================================================

# Class imbalance is handled through class_weight='balanced' on each estimator,
# so the training set is used as-is (no oversampling, no extra rows).

print("Class distribution:")
print(f"  Class 0 (No churn): {(y_train == 0).sum()} ({(y_train == 0).mean()*100:.1f}%)")
print(f"  Class 1 (Churn): {(y_train == 1).sum()} ({(y_train == 1).mean()*100:.1f}%)\n")

models = {
    'Logistic Regression': LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=50,
        max_depth=10,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42,
    ),
}

results = {}

print("Training models...\n")

for name, model in models.items():
    print(f"Training {name}...")

    # Fit on the scaled training split
    model.fit(X_train_scaled, y_train)

    # Hard labels and positive-class probabilities on the held-out split
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    # Keep the fitted model together with its outputs and scores
    results[name] = {
        'model': model,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'roc_auc': roc_auc,
        'f1_score': f1,
    }

    print(f"  ROC-AUC: {roc_auc:.4f}")
    print(f"  F1 Score: {f1:.4f}\n")

# Select best model (by ROC-AUC)
best_model_name, best_result = max(results.items(), key=lambda item: item[1]['roc_auc'])
best_model = best_result['model']

print(f"Best model: {best_model_name}")
print(f"ROC-AUC: {best_result['roc_auc']:.4f}")

In [None]:
# ============================================================
# Detailed Evaluation of Best Model
# ============================================================

# Outputs of the selected model on the held-out test split
y_pred_best = results[best_model_name]['predictions']
y_pred_proba_best = results[best_model_name]['probabilities']

print(f"Detailed Evaluation: {best_model_name}")
print("="*60)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best, 
                          target_names=['No Churn', 'Churn']))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix:")
print(cm)

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['No Churn', 'Churn'],
            yticklabels=['No Churn', 'Churn'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'{best_model_name} (AUC = {results[best_model_name]["roc_auc"]:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Feature importance (only for models exposing feature_importances_, i.e. tree-based)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X_encoded.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))
    
    # Plot on an explicitly created axes. DataFrame.plot() opens its own figure
    # when no `ax` is given, which would leave a separately created figure
    # empty and ignore the requested size.
    fig, ax = plt.subplots(figsize=(10, 6))
    feature_importance.head(15).plot(x='feature', y='importance', kind='barh', ax=ax)
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importance')
    ax.invert_yaxis()  # most important feature at the top
    fig.tight_layout()
    plt.show()

In [None]:
# ============================================================
# Business Metrics Analysis
# ============================================================

# Translate test-set predictions into a simple retention-campaign P&L
print("Business Impact Analysis")
print("="*60)

# Assumptions
avg_customer_value = 1000  # Average customer lifetime value
retention_cost = 100  # Cost to retain one customer
retention_success_rate = 0.3  # 30% of interventions successful

# Boolean masks over the test set
is_actual_churner = (y_test == 1)
is_flagged = (y_pred_best == 1)

actual_churners = is_actual_churner.sum()
predicted_churners = is_flagged.sum()

# True positives (correctly identified churners)
true_positives = (is_actual_churner & is_flagged).sum()

# Every flagged customer costs an intervention; only true churners can be saved
retention_costs = predicted_churners * retention_cost
potential_saves = true_positives * retention_success_rate
saved_revenue = potential_saves * avg_customer_value
net_benefit = saved_revenue - retention_costs

print(f"\nActual churners in test set: {actual_churners}")
print(f"Predicted churners: {predicted_churners}")
print(f"Correctly identified churners: {true_positives}")
print(f"\nExpected successful retentions: {potential_saves:.0f}")
print(f"Saved revenue: ${saved_revenue:,.0f}")
print(f"Retention costs: ${retention_costs:,.0f}")
print(f"Net benefit: ${net_benefit:,.0f}")

# Calculate ROI (defined as 0 when nothing was spent, to avoid dividing by zero)
if retention_costs > 0:
    roi = (net_benefit / retention_costs) * 100
else:
    roi = 0
print(f"\nReturn on Investment: {roi:.1f}%")

---

## Section 5: Model Deployment

Deploy the best model to SageMaker for production use.


In [None]:
# ============================================================
# Package Model for Deployment
# ============================================================

import joblib
import tarfile
import shutil

# Staging directory for the model package, with a code/ subdirectory
model_dir = "churn_model"
code_dir = os.path.join(model_dir, "code")
os.makedirs(code_dir, exist_ok=True)

# Model and preprocessing objects, keyed by the file name they are saved under
artifacts = {
    "model.pkl": best_model,
    "scaler.pkl": scaler,
    "feature_names.pkl": X_encoded.columns.tolist(),
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(model_dir, artifact_name))

print("Model artifacts saved:")
for artifact_name in artifacts:
    print(f"  - {artifact_name}")

# Create inference script
#
# `inference_code` holds the full source text of the serving script; it is
# runtime data (written to inference.py files later in this cell), so its
# content must not be edited casually. The script defines four handlers:
#   - model_fn:   loads model.pkl, scaler.pkl and feature_names.pkl from the
#                 model directory and returns them in a dict.
#   - input_fn:   accepts only "application/json"; the decoded JSON value is
#                 wrapped in a list to build a one-row DataFrame.
#   - predict_fn: adds any expected feature missing from the input as 0,
#                 reorders columns to the saved feature list, scales, and
#                 returns the label, the class-1 probability and a
#                 retention flag (probability > 0.5).
#   - output_fn:  serializes the result as JSON; any other accept type raises.
inference_code = '''
import os
import json
import joblib
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load model and preprocessing objects"""
    model = joblib.load(os.path.join(model_dir, "model.pkl"))
    scaler = joblib.load(os.path.join(model_dir, "scaler.pkl"))
    feature_names = joblib.load(os.path.join(model_dir, "feature_names.pkl"))
    
    return {
        "model": model,
        "scaler": scaler,
        "feature_names": feature_names
    }

def input_fn(request_body, content_type):
    """Parse input data"""
    if content_type == "application/json":
        data = json.loads(request_body)
        return pd.DataFrame([data])
    raise ValueError(f"Unsupported content type: {content_type}")

def predict_fn(input_data, model_dict):
    """Make prediction"""
    model = model_dict["model"]
    scaler = model_dict["scaler"]
    feature_names = model_dict["feature_names"]
    
    # Ensure all expected features are present
    for feat in feature_names:
        if feat not in input_data.columns:
            input_data[feat] = 0
    
    # Select and order features
    input_data = input_data[feature_names]
    
    # Scale
    input_scaled = scaler.transform(input_data)
    
    # Predict
    prediction = model.predict(input_scaled)[0]
    probability = model.predict_proba(input_scaled)[0]
    
    return {
        "churn_prediction": int(prediction),
        "churn_probability": float(probability[1]),
        "retention_recommended": bool(probability[1] > 0.5)
    }

def output_fn(prediction, accept_type):
    """Format output"""
    if accept_type == "application/json":
        return json.dumps(prediction), accept_type
    raise ValueError(f"Unsupported accept type: {accept_type}")
'''

# Serving script goes into the package's code/ directory
inference_script_path = os.path.join(code_dir, "inference.py")
with open(inference_script_path, "w") as script_file:
    script_file.write(inference_code)

# Create requirements (pinned versions, one per line)
requirements_path = os.path.join(code_dir, "requirements.txt")
with open(requirements_path, "w") as requirements_file:
    requirements_file.write("scikit-learn==1.3.0\nnumpy==1.24.3\npandas==2.0.3\njoblib==1.3.1\n")

print("\nInference code created")

# Also create inference.py at root level for SageMaker deployment
with open("inference.py", "w") as script_file:
    script_file.write(inference_code)

# Create tar.gz with the staging directory's contents at the archive root
tar_path = "churn_model.tar.gz"
with tarfile.open(tar_path, "w:gz") as archive:
    archive.add(model_dir, arcname=".")

print(f"\nModel package created: {tar_path}")

# Upload to S3
model_s3_key = "lab2-churn-model/model.tar.gz"
s3_client = boto3.client('s3')
s3_client.upload_file(tar_path, bucket, model_s3_key)
model_s3_uri = f"s3://{bucket}/{model_s3_key}"

print(f"Model uploaded to: {model_s3_uri}")

# Cleanup local staging files (the root-level inference.py is kept for deployment)
shutil.rmtree(model_dir)
os.remove(tar_path)

In [None]:
# ============================================================
# Deploy to SageMaker Endpoint
# ============================================================

from sagemaker.sklearn import SKLearnModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import time

# Create model: packaged artifacts from S3 plus the local inference.py handler
sklearn_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    entry_point="inference.py",
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

# Deploy under a timestamped name so repeated runs do not collide
endpoint_name = f"churn-prediction-{int(time.time())}"

print(f"Deploying model to endpoint: {endpoint_name}")
print("This will take 4-6 minutes...")

# The inference script's input_fn/output_fn accept only "application/json".
# The predictor returned for scikit-learn models uses NumPy (de)serialization
# unless told otherwise, so JSON (de)serializers are set explicitly here;
# without them, predictor.predict(<dict>) is rejected by the handler.
predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)

print(f"\nEndpoint deployed successfully!")
print(f"Endpoint name: {endpoint_name}")

In [None]:
# ============================================================
# Test the Deployed Endpoint
# ============================================================

# Create test customer
# Keys use the post-encoding feature names (e.g. 'gender_Male').
# NOTE: this payload lists only a subset of the model's features; the
# inference script's predict_fn fills every feature missing from the
# request with 0 before scaling, so the score below is illustrative only.
test_customer = {
    'age': 45,
    'gender_Male': 1,
    'tenure_months': 3,
    'has_internet': 1,
    'has_phone_service': 1,
    'has_streaming': 0,
    'has_tech_support': 0,
    'contract_type_One year': 0,
    'contract_type_Two year': 0,
    'monthly_charges': 85.0,
    'support_calls': 4
}

print("Testing endpoint with sample customer:")
print(json.dumps(test_customer, indent=2))

# Invoke endpoint
result = predictor.predict(test_customer)

print(f"\nPrediction result:")
print(json.dumps(result, indent=2))

# Status markers below were mis-encoded in the original; restored to the intended symbols
if result['retention_recommended']:
    print("\n⚠️ HIGH CHURN RISK - Retention action recommended")
else:
    print("\n✓ LOW CHURN RISK - Customer likely to stay")

In [None]:
# ============================================================
# Cleanup
# ============================================================

print("Cleaning up endpoint...")

# Best-effort teardown: a failure is reported but does not stop the notebook
try:
    predictor.delete_endpoint()
    print(f"Endpoint deleted: {endpoint_name}")
except Exception as cleanup_error:
    print(f"Error deleting endpoint: {cleanup_error}")

print("\nLab 2 complete!")

---

## Summary and Key Learnings

### What You Accomplished

1. **Data Analysis**:
   - Generated realistic churn dataset
   - Performed EDA to understand churn drivers
   - Identified key risk factors

2. **Feature Engineering**:
   - Created derived features (value metrics, engagement)
   - Encoded categorical variables
   - Scaled numerical features

3. **Handled Class Imbalance**:
   - Used `class_weight='balanced'` instead of oversampling (e.g. SMOTE)
   - Up-weighted the minority (churn) class without increasing dataset size
   - Maintained test set distribution

4. **Model Development**:
   - Trained multiple classification models
   - Compared performance metrics
   - Selected best model based on ROC-AUC

5. **SageMaker Feature Store**:
   - Created Feature Group for customer features
   - Configured online and offline stores
   - Centralized feature management for reusability

6. **SageMaker Training Jobs**:
   - Launched managed training on ml.m5.xlarge
   - Tracked experiments with SageMaker Experiments
   - Automated hyperparameter and metric logging

7. **Model Registry**:
   - Registered model with version control
   - Configured approval workflow
   - Enabled deployment tracking

8. **Business Impact**:
   - Calculated expected revenue savings
   - Estimated ROI of retention program
   - Provided actionable recommendations

### Key Takeaways

**Churn Prediction Insights:**
- New customers (< 6 months) have highest churn risk
- Month-to-month contracts correlate with churn
- High support calls indicate dissatisfaction
- Service engagement reduces churn

**Technical Best Practices:**
- Always handle class imbalance in churn data
- Feature engineering significantly improves performance
- Use ROC-AUC for imbalanced classification
- Feature Store enables feature reusability across teams

**SageMaker MLOps:**
- Training Jobs provide managed, scalable training
- Experiments automatically track runs and metrics
- Model Registry enables governance and approvals
- Feature Store centralizes feature definitions

**Business Value:**
- Early churn prediction enables proactive retention
- Targeting high-risk customers optimizes costs
- 30% retention success can generate significant ROI
- Model provides actionable customer insights

### Next Steps

**Lab 3**: Text Classification with NLP techniques


---

## 🏪 SageMaker Feature Store: Centralized Feature Management

Feature Store permet de :
- **Stocker** les features de manière centralisée
- **Servir** des features pour l'entraînement (batch) et l'inférence (temps réel)
- **Versionner** automatiquement les features
- **Partager** les features entre équipes

### Pourquoi Feature Store pour le Churn ?
- Réutiliser les features client calculées
- Assurer la cohérence entre entraînement et production
- Accès temps réel aux dernières données client

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)
import time

# Create Feature Group for customer features (timestamped name -> unique per run)
feature_group_name = f"customer-churn-features-{int(time.time())}"

# Prepare data with required columns: the record identifier ('customer_id')
# already exists in churn_df, so only the event time has to be added.
feature_data = churn_df.copy()
# A String-typed event time must be ISO-8601 in UTC (yyyy-MM-dd'T'HH:mm:ssZ);
# a naive local timestamp with microseconds does not match that format.
feature_data['event_time'] = pd.Timestamp.now(tz='UTC').strftime('%Y-%m-%dT%H:%M:%SZ')

# Define feature definitions
feature_definitions = [
    FeatureDefinition(feature_name='customer_id', feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name='event_time', feature_type=FeatureTypeEnum.STRING),
]

# Add all other columns as features: int64/float64 columns are declared
# FRACTIONAL, everything else STRING.
for col in feature_data.columns:
    if col not in ['customer_id', 'event_time']:
        if feature_data[col].dtype in ['int64', 'float64']:
            feature_definitions.append(
                FeatureDefinition(feature_name=col, feature_type=FeatureTypeEnum.FRACTIONAL)
            )
        else:
            feature_definitions.append(
                FeatureDefinition(feature_name=col, feature_type=FeatureTypeEnum.STRING)
            )

print(f"📊 Defined {len(feature_definitions)} features")

# Build the FeatureGroup handle from the feature definitions
# (this only constructs the client-side object)
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=session,
    feature_definitions=feature_definitions
)

print(f"✅ Creating Feature Group: {feature_group_name}")
print(f"💡 Feature Store will be created in S3 and available for online/offline access")

In [None]:
# Create Feature Group (Note: This may take a few minutes)
# The step is best-effort: problems are reported rather than raised. Polling is
# bounded, and success is only announced when the final status is "Created".
try:
    feature_group.create(
        s3_uri=f"s3://{bucket}/feature-store/customer-churn",
        record_identifier_name='customer_id',
        event_time_feature_name='event_time',
        role_arn=role,
        enable_online_store=True  # For real-time inference
    )
    
    print(f"⏳ Waiting for Feature Group to be created...")
    max_wait_seconds = 600  # upper bound on polling so the cell cannot hang forever
    wait_deadline = time.time() + max_wait_seconds
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating" and time.time() < wait_deadline:
        time.sleep(5)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
    
    if status == "Created":
        print(f"✅ Feature Group created: {feature_group_name}")
        print(f"   Status: {status}")
        print(f"   Online Store: Enabled (for real-time predictions)")
        print(f"   Offline Store: s3://{bucket}/feature-store/customer-churn")
    elif status == "Creating":
        # Polling window elapsed while creation was still in progress
        print(f"⚠️  Feature Group still creating after {max_wait_seconds}s: {feature_group_name}")
        print(f"💡 You can check status in SageMaker Console → Feature Store")
    else:
        # Any other terminal status means creation did not succeed
        print(f"⚠️  Feature Group was not created: {feature_group_name}")
        print(f"   Status: {status}")
        failure_reason = description.get("FailureReason")
        if failure_reason:
            print(f"   Failure reason: {failure_reason}")
    
except Exception as e:
    print(f"⚠️  Note: Feature Group creation can take time. Error: {str(e)}")
    print(f"💡 You can check status in SageMaker Console → Feature Store")

---

## 🧹 Cleanup Resources

Nettoyez les ressources AWS créées dans ce lab.