# Azure ML Project Notebook

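This notebook walks through an end-to-end data science workflow on a synthetic dataset: data preparation, exploratory analysis, and training and comparing scikit-learn classifiers. It ends with a commented template for logging and registering the best model in Azure ML.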

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress all warnings for the rest of the session so they do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Capture the start time once, then emit the three-line startup banner in one call.
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
startup_banner = (
    "🚀 Azure ML Project Started!",
    f"📅 Started at: {started_at}",
    "📊 Libraries loaded successfully!",
)
print(*startup_banner, sep="\n")

## 1. Data Preparation

In [None]:
# Build a reproducible synthetic dataset for the Azure ML demonstration
np.random.seed(42)

n_samples = 1000

# The columns are drawn in this exact order: each call consumes the seeded
# global NumPy random stream, so reordering them would change the values.
data = dict(
    age=np.random.randint(18, 80, n_samples),
    income=np.random.normal(50000, 20000, n_samples),
    experience=np.random.randint(0, 40, n_samples),
    education_score=np.random.normal(75, 15, n_samples),
    satisfaction=np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
)

# Clamp the two normally distributed columns to their plausible ranges
# (income to 20,000-150,000; education score to 0-100).
df = pd.DataFrame(data).assign(
    income=lambda frame: frame['income'].clip(lower=20000, upper=150000),
    education_score=lambda frame: frame['education_score'].clip(lower=0, upper=100),
)

print("📊 Dataset Created:")
print(f"   Shape: {df.shape}")
print(f"   Columns: {df.columns.tolist()}")
print("\n🔍 First 5 rows:")
display(df.head())

## 2. Exploratory Data Analysis

In [None]:
# Summary statistics for every column, followed by the class balance of the target
print("📈 Dataset Statistics:")
print("=" * 50)
display(df.describe())

print("\n🎯 Target Variable Distribution:")
satisfaction_counts = df['satisfaction'].value_counts()
n_rows = len(df)
# Report the positive class first, then the negative class, as count and share of rows.
for class_label, class_name in ((1, "Satisfied"), (0, "Not Satisfied")):
    class_count = satisfaction_counts[class_label]
    print(f"   {class_name} ({class_label}): {class_count} ({class_count / n_rows * 100:.1f}%)")

In [None]:
# Create comprehensive visualizations on a 2x3 grid.
# Each panel is drawn on its own explicit Axes instead of relying on pyplot's
# implicit "current axes", so panels cannot bleed into each other if this cell
# is edited or reordered.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_age, ax_income, ax_scatter), (ax_edu, ax_sat, ax_corr) = axes

# Age distribution
ax_age.hist(df['age'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax_age.set_title('Age Distribution')
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Frequency')

# Income distribution
ax_income.hist(df['income'], bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
ax_income.set_title('Income Distribution')
ax_income.set_xlabel('Income ($)')
ax_income.set_ylabel('Frequency')

# Experience vs Income, coloured by the satisfaction label
scatter = ax_scatter.scatter(df['experience'], df['income'], c=df['satisfaction'],
                             cmap='viridis', alpha=0.6)
ax_scatter.set_title('Experience vs Income')
ax_scatter.set_xlabel('Years of Experience')
ax_scatter.set_ylabel('Income ($)')
fig.colorbar(scatter, ax=ax_scatter, label='Satisfaction')

# Education score distribution
ax_edu.hist(df['education_score'], bins=20, alpha=0.7, color='orange', edgecolor='black')
ax_edu.set_title('Education Score Distribution')
ax_edu.set_xlabel('Education Score')
ax_edu.set_ylabel('Frequency')

# Satisfaction by age groups
age_groups = pd.cut(df['age'], bins=[0, 30, 45, 60, 100], labels=['18-30', '31-45', '46-60', '60+'])
# `age_groups` is categorical; pin observed=False so every labelled bucket stays
# on the axis regardless of the pandas version's default for `observed`.
satisfaction_by_age = df.groupby(age_groups, observed=False)['satisfaction'].mean()
satisfaction_by_age.plot(kind='bar', color='coral', ax=ax_sat)
ax_sat.set_title('Satisfaction Rate by Age Group')
ax_sat.set_xlabel('Age Group')
ax_sat.set_ylabel('Satisfaction Rate')
plt.setp(ax_sat.get_xticklabels(), rotation=45)

# Correlation heatmap over all columns (features and target)
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', ax=ax_corr)
ax_corr.set_title('Feature Correlation Matrix')

fig.tight_layout()
plt.show()

print("📊 Visualizations completed!")

## 3. Machine Learning Pipeline

In [None]:
# Machine Learning Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Separate the predictor columns from the binary target
feature_columns = ['age', 'income', 'experience', 'education_score']
X = df[feature_columns]
y = df['satisfaction']

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

n_train, n_features = X_train.shape
print("🔄 Data Split:")
print(f"   Training set: {n_train} samples")
print(f"   Test set: {X_test.shape[0]} samples")
print(f"   Features: {n_features}")

# Fit the scaler on the training rows only, then apply the same transform to
# both splits (the scaled arrays are used by logistic regression later on).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Feature scaling completed!")

In [None]:
# Train multiple models and collect hold-out and cross-validated accuracy for each.
# Local import: only this cell needs it, for the leak-free cross-validation below.
from sklearn.pipeline import make_pipeline

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# results[name] -> {'model', 'accuracy', 'cv_mean', 'cv_std', 'predictions'}
results = {}

print("🤖 Training Models:")
print("=" * 50)

for name, model in models.items():
    print(f"\n🔄 Training {name}...")

    # Use scaled data for logistic regression, original for tree-based models
    if name == 'Logistic Regression':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        # X_train_scaled was standardised with statistics from the *whole*
        # training set, so cross-validating on it would let every validation
        # fold influence its own preprocessing. Cross-validate a
        # scaler+model pipeline on the raw features instead, so the scaler is
        # re-fitted inside each training fold. cross_val_score works on a
        # clone of the estimator, so the fitted `model` stored below is unaffected.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cv_estimator = model

    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'predictions': y_pred
    }

    print(f"   ✅ Test Accuracy: {accuracy:.3f}")
    print(f"   📊 CV Score: {cv_mean:.3f} (±{cv_std:.3f})")

print("\n🏆 Model Training Completed!")

In [None]:
# Model comparison and detailed results
print("📊 Model Comparison:")
print("=" * 60)

# One row per trained model, built directly from the `results` dictionary.
comparison_df = pd.DataFrame(
    [
        {
            'Model': name,
            'Test Accuracy': metrics['accuracy'],
            'CV Mean': metrics['cv_mean'],
            'CV Std': metrics['cv_std'],
        }
        for name, metrics in results.items()
    ],
    columns=['Model', 'Test Accuracy', 'CV Mean', 'CV Std'],
)

# Rank by cross-validated accuracy rather than test accuracy: choosing the
# winner on the held-out test set and then reporting that same test score
# would make the reported number optimistically biased. A stable sort keeps
# tied models in their original training order.
comparison_df = comparison_df.sort_values('CV Mean', ascending=False, kind='stable')
display(comparison_df)

# Best model = top row of the CV ranking
best_model_name = comparison_df['Model'].iloc[0]
best_model = results[best_model_name]['model']
best_predictions = results[best_model_name]['predictions']

print(f"\n🥇 Best Model: {best_model_name}")
# Test accuracy of the CV-selected model (the test set played no part in the selection).
print(f"   Accuracy: {results[best_model_name]['accuracy']:.3f}")

# Detailed classification report for best model
print(f"\n📋 Detailed Classification Report ({best_model_name}):")
print(classification_report(y_test, best_predictions))

## 4. Feature Importance Analysis

In [None]:
# Explain the best model: feature importances for the tree-based models,
# coefficients otherwise (the remaining candidate is logistic regression).
def _show_feature_ranking(table, value_col, heading, rule_width, title, xlabel,
                          palette, zero_line=False):
    """Print a heading, display ``table`` and plot ``value_col`` per feature as bars.

    ``zero_line`` adds a dashed vertical reference line at x=0.
    """
    print(heading)
    print("=" * rule_width)
    display(table)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=table, x=value_col, y='feature', palette=palette)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Features')
    if zero_line:
        plt.axvline(x=0, color='black', linestyle='--', alpha=0.5)
    plt.show()


if best_model_name in ('Random Forest', 'Gradient Boosting'):
    # Largest importance first
    feature_importance = pd.DataFrame(
        {'feature': X.columns, 'importance': best_model.feature_importances_}
    ).sort_values('importance', ascending=False)

    _show_feature_ranking(
        feature_importance,
        value_col='importance',
        heading="🎯 Feature Importance Analysis:",
        rule_width=40,
        title=f'Feature Importance - {best_model_name}',
        xlabel='Importance Score',
        palette='viridis',
    )
else:
    # Order by absolute magnitude so strong negative coefficients rank high too
    coefficients = pd.DataFrame(
        {'feature': X.columns, 'coefficient': best_model.coef_[0]}
    ).sort_values('coefficient', key=abs, ascending=False)

    _show_feature_ranking(
        coefficients,
        value_col='coefficient',
        heading="🎯 Model Coefficients (Logistic Regression):",
        rule_width=45,
        title='Model Coefficients - Logistic Regression',
        xlabel='Coefficient Value',
        palette='coolwarm',
        zero_line=True,
    )

## 5. Azure ML Integration Template

In [None]:
# Azure ML Integration Template
# Nothing in this cell talks to Azure: it only prints a commented code template
# and a placeholder configuration for the reader to copy and fill in.
print("☁️ Azure ML Integration Template")
print("=" * 40)

# The template logs the accuracy via results[best_model_name]['accuracy'], which
# this notebook defines; the undefined name it used before would raise NameError
# as soon as the code was uncommented.
azure_ml_template = """
# Azure ML SDK Integration Example
# Uncomment and configure the following code to use with Azure ML

# from azureml.core import Workspace, Experiment, Environment
# from azureml.core.model import Model
# from azureml.core.run import Run
# import joblib

# # Connect to Azure ML Workspace
# ws = Workspace.from_config()  # or Workspace.get(name='your-workspace',
#                               #                   subscription_id='your-sub-id',
#                               #                   resource_group='your-rg')

# # Create or get experiment
# experiment = Experiment(workspace=ws, name='satisfaction-prediction')

# # Start a run
# run = experiment.start_logging()

# # Log metrics
# run.log('accuracy', results[best_model_name]['accuracy'])
# run.log('model_type', best_model_name)

# # Save and register model
# model_filename = 'satisfaction_model.pkl'
# joblib.dump(best_model, model_filename)

# # Upload model file
# run.upload_file(name=model_filename, path_or_stream=model_filename)

# # Register model
# model = run.register_model(model_name='satisfaction-predictor',
#                           model_path=model_filename,
#                           description='Employee satisfaction prediction model')

# # Complete the run
# run.complete()

# print(f"Model registered: {model.name} version {model.version}")
"""

print(azure_ml_template)

# Configuration template: placeholder values only, no real identifiers or secrets
config_template = {
    "subscription_id": "your-azure-subscription-id",
    "resource_group": "your-resource-group-name",
    "workspace_name": "your-azureml-workspace-name",
    "experiment_name": "satisfaction-prediction",
    "model_name": "satisfaction-predictor"
}

print("\n⚙️ Configuration Template:")
for key, value in config_template.items():
    print(f"   {key}: {value}")

print("\n✅ Azure ML template ready for customization!")

## 6. Summary and Next Steps

In [None]:
# Project summary
# `df` also holds the 'satisfaction' target column, so count only the other
# columns as features; df.shape[1] would over-report the feature count by one.
n_feature_columns = len(df.columns.difference(['satisfaction']))

print("🎉 PROJECT SUMMARY")
print("=" * 50)
print(f"📊 Dataset: {df.shape[0]} samples, {n_feature_columns} features")
print(f"🤖 Models Trained: {len(models)}")
print(f"🏆 Best Model: {best_model_name}")
print(f"🎯 Best Accuracy: {results[best_model_name]['accuracy']:.3f}")
print(f"⏰ Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\n🚀 NEXT STEPS:")
print("   1. Configure Azure ML workspace credentials")
print("   2. Uncomment and run Azure ML integration code")
print("   3. Deploy model to Azure ML endpoint")
print("   4. Set up monitoring and retraining pipeline")
print("   5. Create web service for predictions")

print("\n✨ Happy Machine Learning with Azure ML! ✨")