In [1]:
import mlflow
import mlflow.sklearn  # or mlflow.<your_ml_framework>


In [2]:
mlflow.set_experiment("my_experiment")


<Experiment: artifact_location='file:///rsrch5/home/plm/yshokrollahi/project4/mlruns/203727228454802580', creation_time=1721059740401, experiment_id='203727228454802580', last_update_time=1721059740401, lifecycle_stage='active', name='my_experiment', tags={}>

In [13]:
import mlflow
import os
import sys
import torch
from ultralytics import YOLO
import re

def sanitize_metric_name(name):
    return re.sub(r'[^a-zA-Z0-9_\-. /]', '_', name)

def save_yolo_model(model, path):
    model.save(path)

mlflow.set_experiment("YOLOv8_Training")

with mlflow.start_run():
    # Log parameters
    mlflow.log_param("model", "yolov8x.pt")
    mlflow.log_param("data", "/rsrch5/home/plm/yshokrollahi/project4/RawDataset/dataset/config.yaml")
    mlflow.log_param("epochs", 250)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("image_size", 640)
    mlflow.log_param("data_version", "1.0")  # Adjust as needed
    mlflow.log_param("code_version", "1.0")  # Adjust as needed

    # Log system info
    mlflow.log_param("python_version", sys.version)
    mlflow.log_param("torch_version", torch.__version__)
    if torch.cuda.is_available():
        mlflow.log_param("cuda_version", torch.version.cuda)
        mlflow.log_param("gpu_name", torch.cuda.get_device_name(0))

    # Set up and train YOLOv8 model
    model = YOLO('yolov8x.pt')
    results = model.train(
        data='/rsrch5/home/plm/yshokrollahi/project4/RawDataset/dataset/config.yaml', 
        epochs=250, 
        batch=32, 
        imgsz=640
    )

    # Log metrics
    metrics = results.results_dict
    for key, value in metrics.items():
        try:
            sanitized_key = sanitize_metric_name(key)
            mlflow.log_metric(sanitized_key, value)
        except Exception as e:
            print(f"Error logging metric {key}: {e}")

    # Save and log the model
    model_path = os.path.join(results.save_dir, 'weights', 'best.pt')
    save_yolo_model(model, model_path)
    mlflow.log_artifact(model_path, "model")

    # Log the training configuration file
    mlflow.log_artifact('/rsrch5/home/plm/yshokrollahi/project4/RawDataset/dataset-mod/config.yaml')

    # Optionally, log other relevant artifacts
    # Log the labels plot
    labels_plot_path = os.path.join(results.save_dir, 'labels.jpg')
    if os.path.exists(labels_plot_path):
        mlflow.log_artifact(labels_plot_path, "plots")

    # Log the results plot
    results_plot_path = os.path.join(results.save_dir, 'results.png')
    if os.path.exists(results_plot_path):
        mlflow.log_artifact(results_plot_path, "plots")

    # Log the confusion matrix
    confusion_matrix_path = os.path.join(results.save_dir, 'confusion_matrix.png')
    if os.path.exists(confusion_matrix_path):
        mlflow.log_artifact(confusion_matrix_path, "plots")

print("Training and logging completed successfully!")

New https://pypi.org/project/ultralytics/8.2.57 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.54 🚀 Python-3.10.14 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 4090, 24209MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=/rsrch5/home/plm/yshokrollahi/project4/RawDataset/dataset/config.yaml, epochs=1, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train25, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=F

[34m[1mtrain: [0mScanning /rsrch5/home/plm/yshokrollahi/project4/RawDataset/dataset/train/labels.cache... 1316 images, 0 backg[0m




[34m[1mval: [0mScanning /rsrch5/home/plm/yshokrollahi/project4/RawDataset/dataset/valid/labels.cache... 331 images, 0 backgrou[0m


Plotting labels to runs/detect/train25/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001429, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)


2024/07/15 11:48:07 INFO mlflow.tracking.fluent: Experiment with name '/Shared/YOLOv8' does not exist. Creating a new experiment.
2024/07/15 11:48:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2024/07/15 11:48:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.


[34m[1mMLflow: [0mlogging run_id(e5cde304887143a08c88dab45886f9e9) to runs/mlflow
[34m[1mMLflow: [0mview at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri runs/mlflow'
[34m[1mMLflow: [0mdisable with 'yolo settings mlflow=False'
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mruns/detect/train25[0m
Starting training for 1 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/1      4.38G      2.925      4.211      2.006         67        640: 100%|██████████| 329/329 [00:13<00:00
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 42/42 [00


                   all        331       3666    0.00314     0.0881    0.00192   0.000549

1 epochs completed in 0.017 hours.
Optimizer stripped from runs/detect/train25/weights/last.pt, 6.2MB
Optimizer stripped from runs/detect/train25/weights/best.pt, 6.2MB

Validating runs/detect/train25/weights/best.pt...
Ultralytics YOLOv8.2.54 🚀 Python-3.10.14 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 4090, 24209MiB)
Model summary (fused): 168 layers, 3006233 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 42/42 [00


                   all        331       3666    0.00308     0.0871    0.00189    0.00054
                  cd45        259       1640    0.00454      0.161    0.00298   0.000854
                 panCK         72       1121    0.00223      0.033    0.00126   0.000369
                Others        112        905    0.00248     0.0674    0.00143   0.000398
Speed: 0.1ms preprocess, 1.1ms inference, 0.0ms loss, 1.3ms postprocess per image
Results saved to [1mruns/detect/train25[0m
[34m[1mMLflow: [0mresults logged to runs/mlflow
[34m[1mMLflow: [0mdisable with 'yolo settings mlflow=False'
Training and logging completed successfully!
