In [1]:
# Cell [1]: Check Environment
import os
print(f"Current directory: {os.getcwd()}")
print(f"Files in current directory: {os.listdir('.')}")
print(f"Data folder exists: {os.path.exists('data')}")

Current directory: C:\Users\Jahnavi Gajula\Hubspot_Assesment\ml-framework-package\notebooks
Files in current directory: ['.ipynb_checkpoints', 'artifacts', 'demo.ipynb', 'demo_updated.ipynb']
Data folder exists: False


In [2]:
# Cell [2]: Setup Path
import os
import sys

# If you're in notebooks/, change directory to parent
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
    print(f"‚úì Changed to: {os.getcwd()}")

# Add src to path
sys.path.insert(0, os.path.join(os.getcwd(), 'src'))
print(f"‚úì Added to path: {os.path.join(os.getcwd(), 'src')}")
print(f"‚úì Data folder exists: {os.path.exists('data')}")

‚úì Changed to: C:\Users\Jahnavi Gajula\Hubspot_Assesment\ml-framework-package
‚úì Added to path: C:\Users\Jahnavi Gajula\Hubspot_Assesment\ml-framework-package\src
‚úì Data folder exists: True


In [8]:
# Cell [3]: Import and Load Config
from ml_framework.training import Trainer
from ml_framework.utils import load_config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For API testing
import requests
import json
from datetime import datetime

# Load configuration
config = load_config('configs/config.yaml')

print("Configuration loaded successfully!")
print("="*60)
print(f"Experiment: {config.experiment.name}")
print(f"Model: {config.model.type}")
print(f"MLflow URI: {config.experiment.mlflow_tracking_uri}")
print(f"Test size: {config.data.test_size}")
print(f"Random seed: {config.reproducibility.seed}")

Configuration loaded successfully!
Experiment: customer_conversion_baseline
Model: logistic_regression
MLflow URI: ./mlruns
Test size: 0.2
Random seed: 42


In [9]:
# Cell [4]: Initialize and Train (WITH MLFLOW!)
print("Starting training with MLflow tracking...")
print("="*60)

# Initialize trainer
trainer = Trainer(config)

# Run training (this will log to MLflow automatically)
results = trainer.train()

print("\n" + "="*60)
print("‚úÖ TRAINING COMPLETE!")
print("="*60)
print(f"\nMetrics:")
for metric, value in results['metrics'].items():
    if value is not None:
        print(f"  {metric}: {value:.4f}")

print(f"\nArtifacts saved to: {results['artifact_dir']}")
print("\nüí° Check console output above for 'MLflow Run ID'")

Starting training with MLflow tracking...
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Experiment 'customer_conversion_baseline' initialized
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Set random seed: 42
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - MLflow tracking URI: ./mlruns
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - MLflow experiment: customer_conversion_baseline
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Artifact directory: artifacts\customer_conversion_baseline_20251109_152156
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Trainer initialized for experiment: customer_conversion_baseline
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Starting experiment: customer_conversion_baseline
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Loading data from configured paths...
2025-11-09


  ‚ö†Ô∏è  Found 4 customers (2.0%) with MRR ‚â§ 0 (pipeline will clean these)

  ‚ö†Ô∏è  Soft uniqueness check failed for key ('id',): 3 duplicates found (acceptable but flagged)

‚ö†Ô∏è  DUPLICATES FOUND: noncustomers
Found 6 duplicate rows for key=['id']
Unique IDs affected: 3
Strategy 'most_complete' will be applied.


Sample duplicate groups:
      id  _duplicate_group
447  278                 0
446  278                 0
923  279                 1
922  279                 1
443  280                 2
444  280                 2

üóëÔ∏è  Removed 3 duplicate rows

‚ö†Ô∏è  Found 4 customers with MRR <= 0 (2.00%)
Business rule: Customers (is_customer = 1) must have MRR > 0

Sample invalid rows:
     id     MRR
2   118  -61.15
5   141 -403.20
6   197 -260.00
19    3 -555.00

üóëÔ∏è  Removed 4 invalid customers


2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Loaded data: 5196 samples, 36 features
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Parameter - n_samples: 5196
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Parameter - n_features: 36
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Parameter - n_customers: 196
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Parameter - n_noncustomers: 5000
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Splitting data...
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Train: 4156 samples, Test: 1040 samples
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Parameter - train_size: 4156
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Parameter - test_size: 1040
2025-11-09 15:21:56 - experiment.customer_conversion_baseline - INFO - Preprocessing data 



2025-11-09 15:22:04 - experiment.customer_conversion_baseline - INFO - MLflow Run ID: f707c95e271444bd9d266b2ec9068775
2025-11-09 15:22:04 - experiment.customer_conversion_baseline - INFO - Experiment complete!

‚úÖ TRAINING COMPLETE!

Metrics:
  accuracy: 0.9442
  precision: 0.3827
  recall: 0.7949
  f1: 0.5167
  roc_auc: 0.9510
  average_precision: 0.6449

Artifacts saved to: artifacts\customer_conversion_baseline_20251109_152156

üí° Check console output above for 'MLflow Run ID'


Registered model 'customer_conversion_baseline_model' already exists. Creating a new version of this model...
Created version '4' of model 'customer_conversion_baseline_model'.


In [5]:
# Cell [5]: View MLflow Runs
import mlflow
import pandas as pd

# Set tracking URI
mlflow.set_tracking_uri(config.experiment.mlflow_tracking_uri)

# Search for runs
runs = mlflow.search_runs(experiment_names=[config.experiment.name])

print(f"üìä Found {len(runs)} runs in experiment '{config.experiment.name}'")
print("="*60)

if len(runs) > 0:
    latest = runs.iloc[0]
    print(f"\nüèÉ Latest Run:")
    print(f"  Run ID: {latest['run_id']}")
    print(f"  Start Time: {latest['start_time']}")
    print(f"  Status: {latest['status']}")
    
    print(f"\nüìà Metrics:")
    for col in runs.columns:
        if col.startswith('metrics.'):
            metric_name = col.replace('metrics.', '')
            if pd.notna(latest[col]):
                print(f"  {metric_name}: {latest[col]:.4f}")
    
    print(f"\n‚öôÔ∏è  Parameters:")
    for col in runs.columns:
        if col.startswith('params.'):
            param_name = col.replace('params.', '')
            if pd.notna(latest[col]):
                print(f"  {param_name}: {latest[col]}")
    
    # Display runs dataframe
    print(f"\nüìã All Runs:")
    display(runs[['run_id', 'start_time', 'metrics.accuracy', 'metrics.roc_auc', 'params.model_type']].head())
else:
    print("‚ùå No runs found! Train a model first (run Cell [4])")

üìä Found 3 runs in experiment 'customer_conversion_baseline'

üèÉ Latest Run:
  Run ID: 5968fa55a74c4d6e98a36bc295eb9a1c
  Start Time: 2025-11-09 21:20:04.002000+00:00
  Status: FINISHED

üìà Metrics:
  roc_auc: 0.9510
  accuracy: 0.9442
  precision: 0.3827
  recall: 0.7949
  f1: 0.5167

‚öôÔ∏è  Parameters:
  lookback_days: 30
  random_state: 42
  model_type: logistic_regression
  test_size: 0.2

üìã All Runs:


Unnamed: 0,run_id,start_time,metrics.accuracy,metrics.roc_auc,params.model_type
0,5968fa55a74c4d6e98a36bc295eb9a1c,2025-11-09 21:20:04.002000+00:00,0.944231,0.950998,logistic_regression
1,325667bbe0654d828428ac5082be7f1c,2025-11-09 18:35:20.136000+00:00,0.944231,0.950998,logistic_regression
2,3ad6390915214234a4f87c3cc42568a9,2025-11-09 17:49:41.960000+00:00,0.944231,0.950998,logistic_regression


In [6]:
# Cell [6]: Load Model from MLflow
if len(runs) > 0:
    latest_run_id = runs.iloc[0]['run_id']
    
    print(f"Loading model from MLflow...")
    print(f"Run ID: {latest_run_id}")
    
    # Load model
    model_uri = f"runs:/{latest_run_id}/model"
    loaded_model = mlflow.sklearn.load_model(model_uri)
    
    print(f"‚úÖ Model loaded successfully!")
    print(f"Model type: {type(loaded_model).__name__}")
    
    # Get feature names from artifacts
    import joblib
    from pathlib import Path
    
    artifact_dir = Path(results['artifact_dir'])
    feature_engineer_path = artifact_dir / 'models' / 'feature_engineer.joblib'
    
    if feature_engineer_path.exists():
        feature_engineer = joblib.load(feature_engineer_path)
        print(f"\nüìä Model expects {len(feature_engineer.feature_names)} features")
else:
    print("‚ùå No runs to load model from")

Loading model from MLflow...
Run ID: 5968fa55a74c4d6e98a36bc295eb9a1c
‚úÖ Model loaded successfully!
Model type: LogisticRegression

üìä Model expects 84 features


In [10]:
# ============================================================
# Cell [8]: Test FastAPI - Health Check
# ============================================================
print("üè• Testing FastAPI Health Check")
print("=" * 60)

API_BASE_URL = "http://localhost:8000"

try:
    # Test root endpoint
    response = requests.get(f"{API_BASE_URL}/", timeout=5)
    if response.status_code == 200:
        print("‚úÖ API is running!")
        print(json.dumps(response.json(), indent=2))
    
    # Test health endpoint
    print("\nüìä Health Check:")
    response = requests.get(f"{API_BASE_URL}/health", timeout=5)
    health_data = response.json()
    
    print(f"  Status: {health_data['status']}")
    print(f"  Model Loaded: {health_data['model_loaded']}")
    print(f"  Model Version: {health_data['model_version']}")
    
    if health_data['status'] == 'healthy':
        print("\n‚úÖ API is healthy and ready for predictions!")
    else:
        print("\n‚ö†Ô∏è  API is running but model not loaded")
        
except requests.exceptions.ConnectionError:
    print("‚ùå Cannot connect to API!")
    print("\nüí° To start the API, run in a new terminal:")
    print("   python run_api.py")
    print("\n   Then rerun this cell.")
except Exception as e:
    print(f"‚ùå Error: {e}")

print("=" * 60)

üè• Testing FastAPI Health Check
‚úÖ API is running!
{
  "status": "healthy",
  "service": "HubSpot Customer Conversion API",
  "version": "1.0.0"
}

üìä Health Check:
‚ùå Error: Expecting value: line 1 column 1 (char 0)


_IncompleteInputError: incomplete input (334759546.py, line 35)

In [None]:
# Cell [7]: Compare Multiple Runs (Run this after training several times)
import matplotlib.pyplot as plt

runs = mlflow.search_runs(experiment_names=[config.experiment.name])

if len(runs) > 1:
    print(f"Comparing {len(runs)} runs...")
    
    # Sort by time
    runs_sorted = runs.sort_values('start_time')
    
    # Plot
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Accuracy
    axes[0].plot(range(len(runs_sorted)), runs_sorted['metrics.accuracy'], 
                 marker='o', linewidth=2, markersize=8)
    axes[0].set_xlabel('Run Number', fontsize=12)
    axes[0].set_ylabel('Accuracy', fontsize=12)
    axes[0].set_title('Accuracy Across Runs', fontsize=14, fontweight='bold')
    axes[0].grid(True, alpha=0.3)
    
    # ROC AUC
    axes[1].plot(range(len(runs_sorted)), runs_sorted['metrics.roc_auc'], 
                 marker='s', color='orange', linewidth=2, markersize=8)
    axes[1].set_xlabel('Run Number', fontsize=12)
    axes[1].set_ylabel('ROC AUC', fontsize=12)
    axes[1].set_title('ROC AUC Across Runs', fontsize=14, fontweight='bold')
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Find best run
    print(f"\nüèÜ Best Run by ROC AUC:")
    best_idx = runs_sorted['metrics.roc_auc'].idxmax()
    best_run = runs_sorted.loc[best_idx]
    print(f"  Run ID: {best_run['run_id']}")
    print(f"  ROC AUC: {best_run['metrics.roc_auc']:.4f}")
    print(f"  Accuracy: {best_run['metrics.accuracy']:.4f}")
    print(f"  Precision: {best_run['metrics.precision']:.4f}")
    print(f"  Recall: {best_run['metrics.recall']:.4f}")
    
elif len(runs) == 1:
    print(f"Only 1 run found. Train more models to compare!")
    print(f"üí° Tip: Change hyperparameters in config.yaml and run Cell [4] again")
else:
    print("No runs found!")

In [None]:
# Cell [8]: View MLflow Artifacts
if len(runs) > 0:
    client = mlflow.tracking.MlflowClient()
    latest_run_id = runs.iloc[0]['run_id']
    
    print(f"üìÅ Artifacts for Run: {latest_run_id}")
    print("="*60)
    
    artifacts = client.list_artifacts(latest_run_id)
    
    for artifact in artifacts:
        print(f"üì¶ {artifact.path}")
        if artifact.is_dir:
            # List files in subdirectory
            sub_artifacts = client.list_artifacts(latest_run_id, artifact.path)
            for sub in sub_artifacts:
                print(f"   ‚îî‚îÄ {sub.path}")
else:
    print("No runs found!")

In [None]:
# Cell [9]: Instructions for MLflow UI
print("üöÄ TO VIEW MLFLOW UI:")
print("="*60)
print("\n1. Open a NEW terminal/command prompt")
print("2. Navigate to project directory:")
print(f"   cd {os.getcwd()}")
print("\n3. Run:")
print("   mlflow ui")
print("\n4. Open browser and go to:")
print("   http://localhost:5000")
print("\n" + "="*60)
print("\n‚ú® In the UI you'll see:")
print("  ‚úÖ All experiments and runs")
print("  ‚úÖ Metrics visualizations")
print("  ‚úÖ Parameter comparisons")
print("  ‚úÖ Model registry")
print("  ‚úÖ Plots and artifacts")
print("\nüí° Keep training running while viewing the UI!")

In [None]:
import mlflow
mlflow.set_tracking_uri("./mlruns")
runs = mlflow.search_runs(experiment_names=["customer_conversion_baseline"])
print(f"Runs found: {len(runs)}")