In [1]:
# Import required libraries
import sys
from pathlib import Path

import logging
logging.basicConfig(level=logging.DEBUG)

# Import required libraries
import sys
from pathlib import Path

# Add project root to path
project_root = Path('C:/RnD/MedHack/team54/data-science-project')
sys.path.append(str(project_root))
# Add src directory to path
sys.path.append(str(project_root / 'src'))

from src.data.preprocessing import *
from src.models.model import ModelTrainer
from src.utils.helpers import plot_data_distribution, calculate_metrics, log_message
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

DEBUG:matplotlib:matplotlib data path: c:\RnD\MedHack\team54\data-science-project\.venv\lib\site-packages\matplotlib\mpl-data
DEBUG:matplotlib:CONFIGDIR=C:\Users\humza\.matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is win32
DEBUG:matplotlib:CACHEDIR=C:\Users\humza\.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from C:\Users\humza\.matplotlib\fontlist-v390.json


# Model Training and Evaluation Notebook

This notebook demonstrates:
1. Loading and preprocessing data
2. Model initialization and training
3. Performance evaluation
4. Hyperparameter tuning

In [2]:
# Load data
data_dict = load_data()

logging.debug(f"Data loaded successfully. Keys: {list(data_dict.keys())}")

# Display available datasets
print("Available datasets:")
for name in data_dict.keys():
    print(f"- {name}")

Successfully loaded: allergies.csv
Successfully loaded: conditions.csv
Successfully loaded: devices.csv
Successfully loaded: encounters.csv
Successfully loaded: imaging.csv
Successfully loaded: immunizations.csv
Successfully loaded: medications.csv
Successfully loaded: observations.csv
Successfully loaded: patients.csv


DEBUG:root:Data loaded successfully. Keys: ['allergies', 'conditions', 'devices', 'encounters', 'imaging', 'immunizations', 'medications', 'observations', 'patients', 'procedures']


Successfully loaded: procedures.csv
Available datasets:
- allergies
- conditions
- devices
- encounters
- imaging
- immunizations
- medications
- observations
- patients
- procedures


## Model Setup and Training
Choose a dataset and target column for training

In [None]:
# Reload modules to get latest changes
import importlib
import src.data.preprocessing
import src.models.model
import src.utils.helpers

importlib.reload(src.data.preprocessing)
importlib.reload(src.models.model)
importlib.reload(src.utils.helpers)

# Import required classes and functions
from src.data.preprocessing import load_data
from src.models.model import ModelTrainer
from src.utils.helpers import plot_data_distribution, calculate_metrics
from sklearn.ensemble import RandomForestClassifier

# Load and verify data
data_dict = load_data()
print("Available datasets:", list(data_dict.keys()))


# Initialize model with preprocessing
model = RandomForestClassifier(
    n_estimators=100,  # Reduce number of trees for initial testing
    max_depth=10,      # Limit tree depth
    n_jobs=-1         # Use all CPU cores
)
trainer = ModelTrainer(model)

try:
    # Sample data for development (remove for production)
    sample_size = 10000  # Adjust based on your memory constraints
    data_dict['conditions'] = data_dict['conditions'].sample(n=min(sample_size, len(data_dict['conditions'])), random_state=42)
    
    # Check data size and optimize memory
    trainer.inspect_data_size(data_dict['conditions'])  # Check size first
    data_dict['conditions'] = trainer.optimize_dtypes(data_dict['conditions'])  # Optimize memory

    # Prepare data with automatic handling of categorical features
    trainer.prepare_data(data_dict, dataset_name='conditions', target_column='DESCRIPTION')
    
    # Train model
    trainer.train()
    
    # Evaluate model
    score = trainer.evaluate()
    print(f"\nModel Score: {score:.4f}")
    
    # Optional: Hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, None],
        'min_samples_split': [2, 5]
    }
    trainer.tune_hyperparameters(param_grid)
    
    # Save the best model
    model_path = 'models/random_forest.joblib'
    trainer.save_model(model_path)
    print(f"\nModel saved to: {model_path}")
    
except Exception as e:
    print(f"Error during model training: {str(e)}")

Successfully loaded: allergies.csv
Successfully loaded: conditions.csv
Successfully loaded: devices.csv
Successfully loaded: encounters.csv
Successfully loaded: imaging.csv
Successfully loaded: immunizations.csv
Successfully loaded: medications.csv
Successfully loaded: observations.csv
Successfully loaded: patients.csv
Successfully loaded: procedures.csv
Available datasets: ['allergies', 'conditions', 'devices', 'encounters', 'imaging', 'immunizations', 'medications', 'observations', 'patients', 'procedures']
Error during model training: 'ModelTrainer' object has no attribute 'inspect_data_size'


In [4]:
from sklearn.exceptions import NotFittedError
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Model evaluation and visualization
try:
    # First check if data is prepared
    if not hasattr(trainer, 'X_test') or trainer.X_test is None:
        raise ValueError("Data not prepared. Call prepare_data() first")
        
    # Then check if model exists and is fitted
    if not hasattr(trainer, 'model') or trainer.model is None:
        raise ValueError("Model not initialized")
    
    try:
        trainer.model.predict(trainer.X_test[:1])
    except (NotFittedError, AttributeError):
        raise NotFittedError("Model must be trained before evaluation. Call train() first")
    
    # Get predictions and metrics
    y_pred = trainer.model.predict(trainer.X_test)
    metrics = calculate_metrics(trainer.y_test, y_pred)
    
    # Display metrics with formatting
    print("\n" + "="*50)
    print(f"Model Performance Metrics - {trainer.dataset_name}")
    print("="*50)
    for metric, value in metrics.items():
        print(f"{metric.title():>15}: {value:.4f}")
        trainer.logger.info(f"{metric}: {value:.4f}")
    
    # Visualize feature importance
    if hasattr(trainer.model, 'feature_importances_'):
        # Create feature importance DataFrame
        feature_importance = pd.DataFrame({
            'feature': trainer.X_train.columns,
            'importance': trainer.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        # Plot settings
        plt.style.use('seaborn')
        fig, ax = plt.subplots(figsize=(12, 8))
        
        # Plot top N features
        top_n = min(20, len(feature_importance))
        sns.barplot(
            data=feature_importance.head(top_n),
            x='importance',
            y='feature',
            palette='viridis',
            ax=ax
        )
        
        # Customize plot
        ax.set_title(f'Top {top_n} Feature Importance\nModel: {type(trainer.model).__name__}',
                    pad=20, fontsize=12)
        ax.set_xlabel('Importance Score', fontsize=10)
        ax.set_ylabel('Features', fontsize=10)
        
        # Add value labels
        for i, v in enumerate(feature_importance.head(top_n)['importance']):
            ax.text(v, i, f'{v:.3f}', va='center', fontsize=8)
        
        plt.tight_layout()
        plt.show()
        
        # Log top features
        trainer.logger.info("\nTop 5 important features:")
        for idx, row in feature_importance.head().iterrows():
            trainer.logger.info(f"{row['feature']}: {row['importance']:.4f}")
        
        # Save feature importance plot
        try:
            plot_path = Path('plots')
            plot_path.mkdir(exist_ok=True)
            fig.savefig(plot_path / f'feature_importance_{trainer.dataset_name}.png',
                       bbox_inches='tight', dpi=300)
            trainer.logger.info(f"Feature importance plot saved to: {plot_path}")
        except Exception as e:
            trainer.logger.warning(f"Could not save plot: {str(e)}")
    
    else:
        trainer.logger.warning(
            f"Model {type(trainer.model).__name__} doesn't support feature importance visualization"
        )

except NotFittedError as e:
    trainer.logger.error(f"Model not fitted: {str(e)}")
    raise
except Exception as e:
    trainer.logger.error(f"Error in model evaluation: {str(e)}")
    raise

2025-02-17 17:38:30,531 - src.models.model - ERROR - Error in model evaluation: Data not prepared. Call prepare_data() first
ERROR:src.models.model:Error in model evaluation: Data not prepared. Call prepare_data() first


ValueError: Data not prepared. Call prepare_data() first

In [None]:
# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

trainer.tune_hyperparameters(param_grid, trainer.X_train, trainer.y_train)

# Save the best model
trainer.save_model('best_model.joblib')