# Data Science Workflow Template

**Purpose**: Template notebook for data scientists working with the Risk Platform

**Use Cases**:
- Model development and backtesting
- Advanced analytics and research
- Custom risk metric development

**Author**: Risk Platform Team  
**Created**: September 2025

## 🔧 Development Environment Setup

In [None]:
# Development imports - add your preferred libraries here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import requests
import json
from datetime import datetime, timedelta

# Risk platform specific imports
# Make the shared library directory importable from this notebook.
# NOTE(review): each re-run of this cell appends the same path again, so
# duplicates accumulate in sys.path.
import sys
sys.path.append('/home/jovyan/shared/libs')  # Risk platform libraries

# Configuration
# pandas display options: no cap on displayed columns, 4-digit precision.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
# Plot styling shared by every chart in this notebook.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# Matplotlib release - confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Status output so the user can confirm the cell ran and which versions are active.
print("✅ Development environment configured")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")

## 🔌 Risk Platform Integration

In [None]:
class RiskPlatformClient:
    """Best-effort HTTP client for the Risk Platform REST API.

    Every public method returns the decoded JSON body on an HTTP 200
    response. On any other status code, a transport error, or an
    undecodable body it prints a short warning and returns an empty
    container instead of raising, so notebook cells keep running.
    """

    def __init__(self, base_url="http://fastapi-service.default.svc.cluster.local", timeout=30):
        """Store connection settings.

        Args:
            base_url: Root URL of the API, without a trailing path.
            timeout: Seconds to wait for each request before giving up.
                Without a timeout a stalled service would block the
                notebook indefinitely.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }

    def _request(self, method, path, default, payload=None):
        """Send one request and return its JSON body, or `default` on failure."""
        url = f"{self.base_url}{path}"
        try:
            response = requests.request(
                method,
                url,
                headers=self.headers,
                json=payload,
                timeout=self.timeout,
            )
            if response.status_code != 200:
                print(f"⚠️ {method} {url} returned HTTP {response.status_code}")
                return default
            return response.json()
        except (requests.RequestException, ValueError, TypeError) as exc:
            # RequestException: connection/timeout/HTTP-layer problems.
            # ValueError: response body is not valid JSON.
            # TypeError: payload could not be serialised to JSON.
            print(f"⚠️ {method} {url} failed: {exc}")
            return default

    def get_portfolios(self):
        """Retrieve all portfolios; returns `[]` when the call fails."""
        return self._request("GET", "/api/v1/portfolios", [])

    def calculate_risk(self, portfolio_data):
        """Calculate risk metrics for a portfolio; returns `{}` when the call fails.

        Args:
            portfolio_data: JSON-serialisable payload sent as the request body.
        """
        return self._request("POST", "/api/v1/risk/calculate", {}, payload=portfolio_data)

    def get_market_data(self, symbols, start_date, end_date):
        """Retrieve market data; returns `{}` when the call fails.

        Args:
            symbols: Instruments to query, sent under the "symbols" key.
            start_date: Sent under the "start_date" key.
            end_date: Sent under the "end_date" key.
        """
        data = {
            "symbols": symbols,
            "start_date": start_date,
            "end_date": end_date
        }
        return self._request("POST", "/api/v1/market/data", {}, payload=data)

# Initialize client
# Uses the default in-cluster base URL; pass base_url=... to target another host.
risk_client = RiskPlatformClient()
print("✅ Risk Platform client initialized")

## 📊 Data Exploration Template

In [None]:
def explore_dataset(df, title="Dataset"):
    """Print a quick overview of a DataFrame and plot its correlations.

    Reports shape, memory usage, column info, the `describe()` summary and
    missing-value counts. When the frame has more than one numeric column
    it also draws a correlation heatmap.

    Args:
        df: The pandas DataFrame to inspect.
        title: Label used in the printed header and the plot title.

    Returns:
        None. All output is printed or displayed.
    """
    # `display` is provided by IPython/Jupyter; fall back to print so the
    # function also works when run as a plain script.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    print(f"📊 {title} Exploration")
    print("="*50)
    print(f"Shape: {df.shape}")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print("\nColumn Information:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    df.info()

    print("\nNumerical Summary:")
    if len(df.columns) == 0:
        # describe() raises on a frame without columns.
        print("No columns to summarize")
    else:
        display(df.describe())

    print("\nMissing Values:")
    missing = df.isnull().sum()
    if missing.sum() > 0:
        display(missing[missing > 0].sort_values(ascending=False))
    else:
        print("No missing values found")

    # Correlation matrix for numerical columns (needs at least two of them)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        plt.figure(figsize=(10, 8))
        correlation_matrix = df[numeric_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title(f'{title} - Correlation Matrix')
        plt.tight_layout()
        plt.show()

# Example usage - replace with your actual dataset.
# Uncomment to try explore_dataset() on synthetic data:
# sample_data = pd.DataFrame({
#     'returns': np.random.normal(0.001, 0.02, 1000),
#     'volume': np.random.lognormal(15, 1, 1000),
#     'volatility': np.random.gamma(2, 0.01, 1000)
# })
# explore_dataset(sample_data, "Sample Financial Data")

# Status output so the user can confirm the cell ran.
print("📊 Data exploration template ready")

## 🤖 Model Development Framework

In [None]:
class RiskModelFramework:
    """Small workflow helper for training and evaluating a regression model.

    Intended call order: `prepare_data` -> `train_model` -> `evaluate_model`
    -> `plot_results`. Each step stores its result on the instance so that
    later steps can be called without arguments. Calling a step before its
    prerequisite raises `RuntimeError` with an explanatory message.

    Attributes:
        name: Label used in printed messages and plot titles.
        model: The fitted estimator, or None before `train_model`.
        training_data: `(X_train, y_train)` tuple, or None before `prepare_data`.
        test_data: `(X_test, y_test)` tuple, or None before `prepare_data`.
        predictions: Output of the last `evaluate_model` call, or None.
        metrics: Dict with 'mse', 'rmse', 'r2' and 'mae' from the last evaluation.
    """

    def __init__(self, name="Risk Model"):
        self.name = name
        self.model = None
        self.training_data = None
        self.test_data = None
        self.predictions = None
        self.metrics = {}

    def prepare_data(self, data, target_col, test_size=0.2, random_state=42):
        """Split `data` into train/test sets and remember both splits.

        Args:
            data: DataFrame holding the features and the target column.
            target_col: Name of the column to predict; all other columns
                are used as features.
            test_size: Forwarded to `train_test_split`.
            random_state: Forwarded to `train_test_split` for reproducibility.

        Returns:
            Tuple `(X_train, X_test, y_train, y_test)`.
        """
        X = data.drop(columns=[target_col])
        y = data[target_col]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        self.training_data = (X_train, y_train)
        self.test_data = (X_test, y_test)

        print(f"✅ Data prepared: Training {X_train.shape}, Test {X_test.shape}")
        return X_train, X_test, y_train, y_test

    def train_model(self, model, X_train=None, y_train=None):
        """Fit `model` and store it on the instance.

        Args:
            model: Estimator exposing `fit(X, y)` and `predict(X)`.
            X_train: Training features. If either `X_train` or `y_train`
                is omitted, both are taken from the stored training split.
            y_train: Training targets.

        Returns:
            The fitted model.

        Raises:
            RuntimeError: If training data is needed but `prepare_data`
                has not been called.
        """
        if X_train is None or y_train is None:
            if self.training_data is None:
                raise RuntimeError(
                    "No training data available: call prepare_data() first "
                    "or pass X_train and y_train"
                )
            X_train, y_train = self.training_data

        self.model = model
        self.model.fit(X_train, y_train)

        print(f"✅ Model {self.name} trained successfully")
        return self.model

    def evaluate_model(self, X_test=None, y_test=None):
        """Predict on the test set and compute regression metrics.

        Args:
            X_test: Test features. If either `X_test` or `y_test` is
                omitted, both are taken from the stored test split.
            y_test: Test targets.

        Returns:
            Dict with keys 'mse', 'rmse', 'r2' and 'mae'.

        Raises:
            RuntimeError: If no model has been trained, or test data is
                needed but `prepare_data` has not been called.
        """
        if self.model is None:
            raise RuntimeError("No model has been trained: call train_model() first")
        if X_test is None or y_test is None:
            if self.test_data is None:
                raise RuntimeError(
                    "No test data available: call prepare_data() first "
                    "or pass X_test and y_test"
                )
            X_test, y_test = self.test_data

        self.predictions = self.model.predict(X_test)

        # Calculate metrics (MSE once; RMSE is derived from it)
        mse = mean_squared_error(y_test, self.predictions)
        self.metrics = {
            'mse': mse,
            'rmse': np.sqrt(mse),
            'r2': r2_score(y_test, self.predictions),
            'mae': np.mean(np.abs(y_test - self.predictions))
        }

        print(f"📊 {self.name} Performance Metrics:")
        for metric, value in self.metrics.items():
            print(f"   {metric.upper()}: {value:.4f}")

        return self.metrics

    def plot_results(self, y_test=None):
        """Plot actual-vs-predicted and residual scatter charts.

        Args:
            y_test: Actual target values matching the stored predictions.
                Defaults to the targets of the stored test split.

        Raises:
            RuntimeError: If `evaluate_model` has not produced predictions,
                or `y_test` is omitted and no test split is stored.
        """
        if self.predictions is None:
            raise RuntimeError("No predictions available: call evaluate_model() first")
        if y_test is None:
            if self.test_data is None:
                raise RuntimeError(
                    "No test data available: call prepare_data() first or pass y_test"
                )
            _, y_test = self.test_data

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Actual vs Predicted, with the y = x reference line for a perfect fit
        ax1.scatter(y_test, self.predictions, alpha=0.6)
        ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        ax1.set_xlabel('Actual Values')
        ax1.set_ylabel('Predicted Values')
        ax1.set_title(f'{self.name} - Actual vs Predicted')
        ax1.grid(True, alpha=0.3)

        # Residuals against predictions, with a zero reference line
        residuals = y_test - self.predictions
        ax2.scatter(self.predictions, residuals, alpha=0.6)
        ax2.axhline(y=0, color='r', linestyle='--')
        ax2.set_xlabel('Predicted Values')
        ax2.set_ylabel('Residuals')
        ax2.set_title(f'{self.name} - Residuals Plot')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        print(f"📈 {self.name} results plotted")

# Status output so the user can confirm the cell ran, plus a usage hint.
print("🤖 Risk model framework ready")
print("📝 Usage: framework = RiskModelFramework('My Risk Model')")

## 📈 Example: Volatility Prediction Model

In [None]:
# Create sample dataset for volatility prediction.
# The seed makes every random draw below reproducible; the draws happen in
# the order written, so reordering these statements changes the data.
np.random.seed(42)
n_samples = 1000

# Generate synthetic features
sample_data = pd.DataFrame({
    'previous_volatility': np.random.gamma(2, 0.01, n_samples),
    'volume': np.random.lognormal(15, 1, n_samples),
    'price_change': np.random.normal(0, 0.02, n_samples),
    'market_cap': np.random.lognormal(20, 2, n_samples),
    'sector_beta': np.random.normal(1, 0.3, n_samples)
})

# Create target variable (future volatility) as a weighted sum of three of
# the features plus Gaussian noise; 'volume' and 'market_cap' do not enter
# the target.
sample_data['future_volatility'] = (
    0.7 * sample_data['previous_volatility'] + 
    0.2 * np.abs(sample_data['price_change']) + 
    0.1 * sample_data['sector_beta'] * 0.01 +
    np.random.normal(0, 0.005, n_samples)
)

print("📊 Sample dataset created for volatility prediction")
print(f"Dataset shape: {sample_data.shape}")
display(sample_data.head())

# Initialize and use the framework
vol_model = RiskModelFramework("Volatility Prediction Model")

# Prepare data (80/20 train/test split)
X_train, X_test, y_train, y_test = vol_model.prepare_data(
    sample_data, 'future_volatility', test_size=0.2
)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
vol_model.train_model(rf_model, X_train, y_train)

# Evaluate model; the metrics are also stored on vol_model.metrics
metrics = vol_model.evaluate_model(X_test, y_test)

# Plot results
vol_model.plot_results(y_test)

# Feature importance, sorted from most to least important
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` - confirm against the installed version.
sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
plt.title('Feature Importance - Volatility Prediction Model')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

print("\n🎯 Feature Importance:")
display(feature_importance)

## 💾 Save & Export Models

In [None]:
# Model persistence utilities
import pickle
import joblib
from pathlib import Path

def save_model(model, name, save_dir="/home/jovyan/shared/models"):
    """Persist a model with joblib and write a JSON metadata file next to it.

    Args:
        model: The object to serialise with `joblib.dump`.
        name: Base name for the files: `<name>_model.joblib` and
            `<name>_metadata.json`.
        save_dir: Target directory. It is created, including any missing
            parent directories, if it does not exist yet.

    Returns:
        Tuple `(model_file, metadata_file)` of `Path` objects.
    """
    save_path = Path(save_dir)
    # parents=True so a fresh environment without the parent directories
    # does not fail with FileNotFoundError.
    save_path.mkdir(parents=True, exist_ok=True)

    # Save model
    model_file = save_path / f"{name}_model.joblib"
    joblib.dump(model, model_file)

    # Save metadata
    metadata = {
        'name': name,
        'created_date': datetime.now().isoformat(),
        'model_type': type(model).__name__,
        'file_path': str(model_file)
    }

    metadata_file = save_path / f"{name}_metadata.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"✅ Model saved: {model_file}")
    print(f"📋 Metadata saved: {metadata_file}")
    return model_file, metadata_file

def load_model(name, save_dir="/home/jovyan/shared/models"):
    """Load a model saved as `<name>_model.joblib` and print its metadata.

    Args:
        name: Base name of the model files.
        save_dir: Directory that holds the model and metadata files.

    Returns:
        The loaded model, or None when the model file does not exist.
    """
    save_path = Path(save_dir)
    model_file = save_path / f"{name}_model.joblib"
    metadata_file = save_path / f"{name}_metadata.json"

    if not model_file.exists():
        print(f"❌ Model not found: {name}")
        return None

    # SECURITY: joblib files are pickle-based and can execute arbitrary code
    # when loaded. Only load files from a trusted location.
    model = joblib.load(model_file)

    if not metadata_file.exists():
        print(f"✅ Model loaded: {name} (no metadata)")
        return model

    # Metadata is informational only, so an unreadable or malformed file
    # must not prevent returning the model that was already loaded.
    try:
        with open(metadata_file, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
    except (OSError, ValueError) as exc:
        print(f"✅ Model loaded: {name} (metadata unreadable: {exc})")
        return model
    if not isinstance(metadata, dict):
        print(f"✅ Model loaded: {name} (metadata unreadable: unexpected format)")
        return model

    print(f"✅ Model loaded: {name}")
    print(f"📅 Created: {metadata.get('created_date', 'Unknown')}")
    print(f"🔧 Type: {metadata.get('model_type', 'Unknown')}")
    return model

# Example: Save the volatility model (uncomment to write it to the shared
# models directory):
# save_model(vol_model.model, "volatility_prediction_v1")

# Status output so the user can confirm the cell ran, plus usage hints.
print("💾 Model persistence utilities ready")
print("📝 Usage: save_model(your_model, 'model_name')")
print("📝 Usage: loaded_model = load_model('model_name')")

## 📋 Experiment Tracking Template

In [None]:
# Experiment tracking utilities
# In-memory list of experiment records for this session. Re-running this
# cell resets it to empty.
experiment_log = []

def log_experiment(name, model_type, metrics, parameters=None, notes=""):
    """Append one experiment record to `experiment_log` and return it.

    Args:
        name: Human-readable experiment name.
        model_type: Label for the kind of model used.
        metrics: Mapping of metric name to value; None is stored as `{}`.
        parameters: Optional mapping of hyperparameters; None is stored as `{}`.
        notes: Free-text remarks.

    Returns:
        The record dict that was appended, with keys 'timestamp', 'name',
        'model_type', 'metrics', 'parameters' and 'notes'.
    """
    experiment = {
        'timestamp': datetime.now().isoformat(),
        'name': name,
        'model_type': model_type,
        # Shallow copies: a logged record must not change when the caller
        # later mutates the dicts it passed in.
        'metrics': dict(metrics or {}),
        'parameters': dict(parameters or {}),
        'notes': notes
    }

    experiment_log.append(experiment)
    print(f"✅ Experiment logged: {name}")
    return experiment

def show_experiments():
    """Display all logged experiments as one table.

    Each record's 'metrics' dict is flattened into its own columns and placed
    beside the timestamp, name, model type and notes columns.

    Returns:
        The combined DataFrame, or None when nothing has been logged yet.
    """
    if experiment_log:
        records = pd.DataFrame(experiment_log)
        summary_columns = ['timestamp', 'name', 'model_type', 'notes']

        # One column per metric key
        flattened_metrics = pd.json_normalize(records['metrics'])
        result_df = pd.concat([records[summary_columns], flattened_metrics], axis=1)

        print("📊 Experiment Results:")
        display(result_df)
        return result_df

    print("No experiments logged yet")
    return None

# Example: Log the volatility model experiment.
# Requires the volatility example cell to have run first, because it reads
# vol_model.metrics.
log_experiment(
    name="Volatility Prediction v1",
    model_type="RandomForestRegressor",
    metrics=vol_model.metrics,
    parameters={'n_estimators': 100, 'random_state': 42},
    notes="Initial baseline model with synthetic data"
)

# Show all experiments
show_experiments()

# Status output so the user can confirm the cell ran, plus a usage hint.
print("\n📋 Experiment tracking ready")
print("📝 Usage: log_experiment('name', 'model_type', metrics_dict, params_dict, 'notes')")

## 🎯 Next Steps & Development Guidelines

### 📚 Development Best Practices
1. **Version Control**: Save notebook versions for important experiments
2. **Documentation**: Add clear markdown explanations for complex analyses
3. **Reproducibility**: Set random seeds and document dependencies
4. **Testing**: Validate models on out-of-sample data

### 🔄 Integration with Risk Platform
- **API Integration**: Use RiskPlatformClient for data access
- **Model Deployment**: Export models with `save_model()` to the shared models directory (`/home/jovyan/shared/models`) for production use
- **Monitoring**: Track model performance over time

### 📊 Available Data Sources
- **Risk API**: Portfolio and risk calculation data
- **Market Data API**: Historical price and volume data
- **Shared Storage**: `/home/jovyan/shared/data/`
- **External APIs**: Connect to Bloomberg, Refinitiv, etc.

### 🎯 Model Development Workflow
1. **Data Exploration**: Use `explore_dataset()` function
2. **Feature Engineering**: Create relevant risk factors
3. **Model Training**: Use `RiskModelFramework` class
4. **Validation**: Backtest and stress test models
5. **Deployment**: Save with `save_model()` to the shared models directory (`/home/jovyan/shared/models`)
6. **Monitoring**: Track performance in production

### 📞 Support & Resources
- **Documentation**: `/docs/` folder in repository
- **Examples**: `/notebooks/examples/` for reference notebooks
- **Team Support**: Contact data science team for assistance