In [1]:
import os

In [2]:
%pwd

'd:\\Production\\projects\\brain_tumor_classification\\notebook'

In [3]:
# Move from notebook/ up to the project root so the later `src.*` imports
# resolve. Guarded on the presence of `src/` so that re-running this cell
# (or running it when already at the root) does not climb another level.
if not os.path.isdir("src"):
    os.chdir("../")

In [67]:
%pwd

'd:\\Production\\projects\\brain_tumor_classification'

In [None]:
# import mlflow

# DAGsHub credentials
# The access token is a secret and must not live in the notebook source:
# reuse it if it is already exported in the environment, otherwise prompt
# for it (getpass does not echo the value or store it in the cell output).
from getpass import getpass

os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/hafizshakeel/brain_tumor_classification.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "hafizshakeel"  # settings -> profile -> username
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("DagsHub access token: ")  # settings -> tokens

# https://dagshub.com/user/settings/tokens

In [None]:
# # for gitbash
# export MLFLOW_TRACKING_URI=https://dagshub.com/hafizshakeel/brain_tumor_classification.mlflow
# export MLFLOW_TRACKING_USERNAME=hafizshakeel 
# export MLFLOW_TRACKING_PASSWORD==**********


### Entity


In [70]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of paths and hyperparameters for the training stage.

    Built by ``ConfigurationManager.get_training_config`` and consumed by
    ``TrainingPipeline``.
    """

    # Stage output directory; "final_model.pth" is written here when MLflow logging is on.
    root_dir: Path
    # File the best-validation-accuracy model is saved to with torch.save.
    trained_model_path: Path
    # Root folder handed to torchvision's ImageFolder (one sub-folder per class).
    train_data_dir: Path
    # Fraction of the dataset held out for validation: val_size = int(len(dataset) * val_split).
    val_split: float
    # Number of epochs; the loop runs epoch = 1 .. epochs inclusive.
    epochs: int
    # Batch size used for both the training and validation DataLoader.
    batch_size: int
    # Learning rate passed to the optimizer.
    learning_rate: float
    # Weight decay passed to the optimizer.
    weight_decay: float
    # Optimizer name, matched case-insensitively: "adamw" or "sgd"; anything else raises ValueError.
    optimizer: str
    # LR scheduler name: "step" (StepLR) or "cosine" (CosineAnnealingLR); any other value disables scheduling.
    scheduler: str
    # StepLR step_size; only read when scheduler == "step".
    step_size: int
    # StepLR gamma; only read when scheduler == "step".
    gamma: float
    

    

### Configuration


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision import datasets, transforms
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import mlflow
import os

from src.brain_tumor_classification import logger
from src.brain_tumor_classification.constants import *
from src.brain_tumor_classification.utils.common import read_yaml, create_directories


In [72]:
class ConfigurationManager:
    """Reads the project's config and params YAML files and exposes stage configs.

    The objects returned by ``read_yaml`` are used with attribute access here
    (presumably a dict-like box type from the project's utils -- confirm there).
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the hyperparameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        """Assemble the ``TrainingConfig`` for the training stage.

        Creates the stage's root directory as a side effect, wraps the three
        path entries of the ``training`` config section in ``Path`` and copies
        the hyperparameters from the ``training`` params section unchanged.

        Returns:
            TrainingConfig: the populated, frozen configuration object.
        """
        stage_cfg, hyperparams = self.config.training, self.params.training

        create_directories([stage_cfg.root_dir])

        path_fields = ("root_dir", "trained_model_path", "train_data_dir")
        hyperparam_fields = (
            "val_split",
            "epochs",
            "batch_size",
            "learning_rate",
            "weight_decay",
            "optimizer",
            "scheduler",
            "step_size",
            "gamma",
        )

        # Paths come from the config file, everything else from the params file.
        fields = {name: Path(getattr(stage_cfg, name)) for name in path_fields}
        fields.update((name, getattr(hyperparams, name)) for name in hyperparam_fields)
        return TrainingConfig(**fields)


    def get_mlflow_config(self):
        """Return the ``mlflow`` section of the main config as loaded."""
        mlflow_section = self.config.mlflow
        return mlflow_section

### Training

In [None]:
class TrainingPipeline:
    """Trains and validates an image classifier on an ImageFolder dataset.

    The dataset under ``config.train_data_dir`` is split randomly into a
    training and a validation subset. Per-epoch metrics, the best checkpoint
    and the final model are optionally logged to MLflow, controlled by
    ``mlflow_config.log_with_mlflow``.
    """

    def __init__(self, config: TrainingConfig, model: torch.nn.Module, device, mlflow_config):
        """Build transforms, datasets, loaders, loss, optimizer and scheduler.

        Args:
            config: Paths and hyperparameters for training.
            model: Model to train; the caller is expected to have moved it to
                ``device`` already (this class only moves the batches).
            device: Device the input batches are moved to.
            mlflow_config: Object exposing ``log_with_mlflow``,
                ``mlflow_tracking_uri`` and ``mlflow_experiment``.

        Raises:
            ValueError: If ``config.optimizer`` is neither "adamw" nor "sgd"
                (case-insensitive), or if ``config.val_split`` leaves the
                training or the validation subset empty.
        """
        self.config = config
        self.model = model
        self.device = device

        self.mlflow_config = mlflow_config

        # Setup MLflow
        if self.mlflow_config.log_with_mlflow:
            mlflow.set_tracking_uri(self.mlflow_config.mlflow_tracking_uri)
            mlflow.set_experiment(self.mlflow_config.mlflow_experiment)

        # Transforms
        self.train_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])
        self.val_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])

        # Dataset + Split
        # random_split returns Subsets that share ONE underlying dataset, so
        # assigning `val_subset.dataset.transform` would also replace the
        # training transform and silently disable augmentation. Use two
        # ImageFolder views of the same directory (same sample order) and
        # apply the validation indices to the view with the eval transform.
        train_view = datasets.ImageFolder(self.config.train_data_dir, transform=self.train_transform)
        val_view = datasets.ImageFolder(self.config.train_data_dir, transform=self.val_transform)
        val_size = int(len(train_view) * self.config.val_split)
        train_size = len(train_view) - val_size
        if train_size <= 0 or val_size <= 0:
            # An empty subset would otherwise surface later as a
            # ZeroDivisionError when the epoch averages are computed.
            raise ValueError(
                f"val_split={self.config.val_split} gives an empty split "
                f"(train={train_size}, val={val_size}) for {len(train_view)} images"
            )
        self.train_dataset, val_indices_subset = random_split(train_view, [train_size, val_size])
        self.val_dataset = torch.utils.data.Subset(val_view, val_indices_subset.indices)

        # Dataloaders
        self.train_loader = DataLoader(self.train_dataset, batch_size=config.batch_size, shuffle=True)
        self.val_loader = DataLoader(self.val_dataset, batch_size=config.batch_size, shuffle=False)

        # Loss, optimizer, scheduler
        self.criterion = nn.CrossEntropyLoss()
        if config.optimizer.lower() == "adamw":
            self.optimizer = optim.AdamW(self.model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
        elif config.optimizer.lower() == "sgd":
            self.optimizer = optim.SGD(self.model.parameters(), lr=config.learning_rate, momentum=0.9, weight_decay=config.weight_decay)
        else:
            raise ValueError(f"Unknown optimizer: {config.optimizer}")

        # Any scheduler name other than "step" / "cosine" means "no scheduler".
        if config.scheduler == "step":
            self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=config.step_size, gamma=config.gamma)
        elif config.scheduler == "cosine":
            self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=config.epochs)
        else:
            self.scheduler = None

        # Save class names (for reports)
        self.class_names = train_view.classes

    def train(self, epoch):
        """Run one training epoch over ``self.train_loader``.

        Args:
            epoch: Epoch number, used as the MLflow step and in the log line.

        Returns:
            tuple: ``(epoch_loss, epoch_acc)`` -- the sample-weighted mean loss
            and the accuracy in percent.
        """
        self.model.train()
        running_loss, correct, total = 0.0, 0, 0

        for images, labels in self.train_loader:
            images, labels = images.to(self.device), labels.to(self.device)
            self.optimizer.zero_grad()

            outputs = self.model(images)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            # The criterion returns a batch mean; weight it by the batch size
            # so the epoch average stays correct for a smaller last batch.
            running_loss += loss.item() * images.size(0)
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        # The scheduler is stepped once per epoch, after all optimizer steps.
        if self.scheduler:
            self.scheduler.step()

        epoch_loss = running_loss / total
        epoch_acc = 100. * correct / total

        # Log to MLflow
        if self.mlflow_config.log_with_mlflow:
            mlflow.log_metric("train_loss", epoch_loss, step=epoch)
            mlflow.log_metric("train_accuracy", epoch_acc, step=epoch)

        logger.info(f"Epoch [{epoch}] Train Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.2f}%")
        return epoch_loss, epoch_acc

    def validate(self, epoch):
        """Evaluate the model on ``self.val_loader`` without gradient tracking.

        Args:
            epoch: Epoch number, used as the MLflow step and in the log lines.

        Returns:
            tuple: ``(epoch_loss, epoch_acc, report)`` where ``epoch_acc`` is
            in percent and ``report`` is the dict produced by
            ``classification_report(..., output_dict=True)``.
        """
        self.model.eval()
        running_loss, correct, total = 0.0, 0, 0

        all_labels = []
        all_preds = []

        with torch.no_grad():
            for images, labels in self.val_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                outputs = self.model(images)
                loss = self.criterion(outputs, labels)

                running_loss += loss.item() * images.size(0)
                _, predicted = outputs.max(1)
                correct += predicted.eq(labels).sum().item()
                total += labels.size(0)

                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(predicted.cpu().numpy())

        epoch_loss = running_loss / total
        epoch_acc = 100. * correct / total

        # Log to MLflow
        if self.mlflow_config.log_with_mlflow:
            mlflow.log_metric("val_loss", epoch_loss, step=epoch)
            mlflow.log_metric("val_accuracy", epoch_acc, step=epoch)

        logger.info(f"Epoch [{epoch}] Val Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.2f}%")

        # Metrics (per class)
        report = classification_report(all_labels, all_preds, target_names=self.class_names, digits=4, output_dict=True, zero_division=0)
        logger.info(f"\nClassification Report (Epoch {epoch}):\n{report}")

        # Log per-class metrics to MLflow
        if self.mlflow_config.log_with_mlflow:
            for class_name in self.class_names:
                if class_name in report:
                    mlflow.log_metric(f"precision_{class_name}", report[class_name]['precision'], step=epoch)
                    mlflow.log_metric(f"recall_{class_name}", report[class_name]['recall'], step=epoch)
                    mlflow.log_metric(f"f1_{class_name}", report[class_name]['f1-score'], step=epoch)

        return epoch_loss, epoch_acc, report


    def run(self):
        """Run the full train/validate loop and keep the best checkpoint.

        After every epoch the model is saved to ``config.trained_model_path``
        if its validation accuracy beats the best seen so far. With MLflow
        enabled, the parameters, the best checkpoints and a final model
        (``<root_dir>/final_model.pth``) are logged, and the run is always
        closed, including on failure.

        Raises:
            Exception: Any error raised during training is logged and re-raised.
        """
        logger.info("Starting Training...")
        best_acc = 0.0
        # Stays None when config.epochs == 0, so the final metric is skipped
        # instead of hitting an undefined name.
        val_acc = None

        try:
            # Started inside the try block so that a failure while logging
            # params still reaches the `finally` that closes the run.
            if self.mlflow_config.log_with_mlflow:
                mlflow.start_run()
                # Log parameters
                mlflow.log_params({
                    "epochs": self.config.epochs,
                    "batch_size": self.config.batch_size,
                    "learning_rate": self.config.learning_rate,
                    "weight_decay": self.config.weight_decay,
                    "optimizer": self.config.optimizer,
                    "scheduler": self.config.scheduler,
                    "val_split": self.config.val_split,
                    "model_name": "swin_tiny_patch4_window7_224"
                })

            for epoch in range(1, self.config.epochs + 1):
                self.train(epoch)
                _, val_acc, _ = self.validate(epoch)

                if val_acc > best_acc:
                    best_acc = val_acc
                    torch.save(self.model, self.config.trained_model_path)
                    logger.info(f"Best model saved at {self.config.trained_model_path} with Val Acc: {best_acc:.2f}%")

                    # Log best model to MLflow as an artifact (not using mlflow.pytorch.log_model)
                    if self.mlflow_config.log_with_mlflow:
                        mlflow.log_artifact(self.config.trained_model_path, artifact_path="models")
                        mlflow.log_metric("best_val_accuracy", best_acc, step=epoch)

            logger.info("Training Completed.")

            # Log the final model as an artifact
            if self.mlflow_config.log_with_mlflow:
                final_model_path = os.path.join(self.config.root_dir, "final_model.pth")
                torch.save(self.model, final_model_path)
                mlflow.log_artifact(final_model_path, artifact_path="models")
                if val_acc is not None:
                    mlflow.log_metric("final_val_accuracy", val_acc)

        except Exception as e:
            logger.error(f"Error during training: {e}")
            # Re-raise the exception to ensure it's not silently caught
            raise
        finally:
            if self.mlflow_config.log_with_mlflow:
                mlflow.end_run()

In [None]:
# Driver cell: build the configs, load the prepared base model and train it.
try:
    config_manager = ConfigurationManager()
    training_config = config_manager.get_training_config()
    mlflow_config = config_manager.get_mlflow_config()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    updated_model_path = config_manager.config["prepare_base_model"]["updated_model_path"]
    # NOTE(security): weights_only=False unpickles a full Python object and can
    # execute arbitrary code, so only load checkpoints produced by this
    # project. It appears to be needed because whole model objects (not just
    # state_dicts) are saved -- confirm against the prepare_base_model stage.
    model = torch.load(updated_model_path, map_location=device, weights_only=False)
    model = model.to(device)

    trainer = TrainingPipeline(config=training_config, model=model, device=device, mlflow_config=mlflow_config)
    trainer.run()

except Exception as e:
    logger.exception(f"Training failed: {e}")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise