# download_dataset.py

In [None]:
import os
from pathlib import Path
import zipfile
import shutil

import subprocess

# Define paths
project_dir = Path('D:/Projects/LungCancerDetection')
raw_data_dir = project_dir / 'data' / 'raw'

# Create directories
raw_data_dir.mkdir(parents=True, exist_ok=True)

# Download dataset
# The command is passed as an argument list (no shell), so the target path
# needs no manual quoting, and the exit status is reported instead of being
# dropped.  A failed download is only a warning here: an archive left by an
# earlier run can still be unpacked, and the zip check below stops the script
# when no archive is available at all.
dataset_slug = 'andrewmvd/lung-and-colon-cancer-histopathological-images'
kaggle_exe = shutil.which('kaggle')
if kaggle_exe is None:
    print("Warning: 'kaggle' command not found on PATH; skipping download.")
else:
    download = subprocess.run(
        [kaggle_exe, 'datasets', 'download', '-d', dataset_slug, '-p', str(raw_data_dir)],
        check=False,
    )
    if download.returncode != 0:
        print(f"Warning: kaggle download exited with status {download.returncode}.")

# Unzip dataset
zip_path = raw_data_dir / 'lung-and-colon-cancer-histopathological-images.zip'
extract_path = raw_data_dir / 'lung_colon_image_set'
if not zip_path.exists():
    print(f"Error: {zip_path} not found.")
    # SystemExit instead of exit(): exit() is injected by the `site` module
    # and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(raw_data_dir)

# Fix nesting if present: hoist the contents of
# lung_colon_image_set/lung_colon_image_set one level up, then drop the
# emptied inner folder.
nested_dir = extract_path / 'lung_colon_image_set'
if nested_dir.exists():
    for item in nested_dir.iterdir():
        shutil.move(str(item), str(extract_path))
    shutil.rmtree(nested_dir)
print("Dataset unzipped successfully!")

# Verify structure
if extract_path.exists():
    print("Directory contents:", os.listdir(extract_path))
else:
    print(f"Error: {extract_path} not found.")

# check_dataset.py

In [None]:
from pathlib import Path
import os

# Path to unzipped dataset
dataset_dir = Path('D:/Projects/LungCancerDetection/data/raw/lung_colon_image_set')

# List contents recursively
def list_dir(path, level=0):
    """Print the directory tree rooted at ``path``.

    Directories are printed with a trailing ``/``; every nesting level adds
    two spaces of indentation.

    Args:
        path: ``pathlib.Path`` of the directory to print.
        level: Indentation depth used for ``path`` itself.
    """
    indent = '  ' * level
    print(f'{indent}{path.name}/')

    child_level = level + 1
    for entry in path.iterdir():
        if not entry.is_dir():
            # Plain file: print it one level deeper than its parent.
            print('  ' * child_level + entry.name)
            continue
        list_dir(entry, child_level)

# Print the full tree of the unzipped dataset folder to check its layout.
list_dir(dataset_dir)

# prepare_data.py

In [None]:
import os
import shutil
from pathlib import Path
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

# Paths
# Root of the extracted dataset; process_images() looks for the
# 'lung_image_sets' and 'colon_image_sets' folders directly beneath it.
raw_data_dir = Path('D:/Projects/LungCancerDetection/data/raw/lung_colon_image_set')
# Destination root: images/<split>, labels/<split> and lc25000.yaml are written here.
output_dir = Path('D:/Projects/LungCancerDetection/data')
# Side length in pixels of the square every image is resized to.
img_size = 640

# Class mapping: cancer (0), non-cancer (1)
# Keys are the class folder names; the ids must stay in the same order as the
# `names` list written by create_yaml().
class_map = {
    'lung_aca': 0,
    'lung_scc': 0,
    'colon_aca': 0,
    'lung_n': 1,
    'colon_n': 1
}

def create_dirs():
    """Create ``images/<split>`` and ``labels/<split>`` under ``output_dir``.

    Covers the ``train``, ``val`` and ``test`` splits; folders that already
    exist are left untouched.
    """
    targets = [
        output_dir / kind / split
        for split in ('train', 'val', 'test')
        for kind in ('images', 'labels')
    ]
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)

def _collect_labeled_images():
    """Return parallel lists ``(image_paths, class_ids)`` found under ``raw_data_dir``.

    Only files ending in ``.jpeg`` inside the known class folders are
    collected; missing group or class folders are reported and skipped.
    """
    # Define subfolder groups
    subfolder_groups = {
        'lung_image_sets': ['lung_aca', 'lung_scc', 'lung_n'],
        'colon_image_sets': ['colon_aca', 'colon_n']
    }

    images = []
    labels = []
    for group_name, class_names in subfolder_groups.items():
        group_dir = raw_data_dir / group_name
        if not group_dir.exists():
            print(f"Warning: {group_dir} does not exist. Skipping.")
            continue
        for class_name in class_names:
            class_dir = group_dir / class_name
            if not class_dir.exists():
                print(f"Warning: {class_dir} does not exist. Skipping.")
                continue
            # os.listdir() order is arbitrary; sorting makes the seeded split
            # in process_images() reproducible across machines and runs.
            for img_name in sorted(os.listdir(class_dir)):
                if img_name.endswith('.jpeg'):
                    images.append(str(class_dir / img_name))
                    labels.append(class_map[class_name])
    return images, labels


def _export_split(split, img_list, label_list):
    """Resize the images of one split and write them with their label files."""
    image_dir = output_dir / 'images' / split
    label_dir = output_dir / 'labels' / split
    for img_path, label in zip(img_list, label_list):
        # Read and resize image
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Failed to read {img_path}")
            continue
        img = cv2.resize(img, (img_size, img_size))
        img_name = Path(img_path).name
        target = image_dir / img_name
        # cv2.imwrite reports failure through its return value; skip the
        # label too so no annotation is left without its image.
        if not cv2.imwrite(str(target), img):
            print(f"Warning: Failed to write {target}")
            continue

        # Create annotation: full-image bounding box
        # (class id, x-center, y-center, width, height, all normalized).
        with open(label_dir / f'{Path(img_name).stem}.txt', 'w') as f:
            f.write(f'{label} 0.5 0.5 1.0 1.0\n')


def process_images():
    """Split the raw images 70/15/15 and export them with label files.

    Images are collected from ``raw_data_dir``, split with stratification on
    the binary class id (``random_state=42``), resized to
    ``img_size`` x ``img_size`` and written to ``output_dir/images/<split>``.
    Each exported image gets a one-line label file in
    ``output_dir/labels/<split>`` holding a box that covers the whole image.

    Raises:
        SystemExit: with status 1 when ``raw_data_dir`` is missing or no
            ``.jpeg`` image is found.
    """
    # Check if base directory exists
    if not raw_data_dir.exists():
        print(f"Error: {raw_data_dir} does not exist.")
        raise SystemExit(1)

    images, labels = _collect_labeled_images()

    # Check if images were found
    if not images:
        print("Error: No valid images found in subfolders. Check dataset structure.")
        raise SystemExit(1)

    # Split dataset: 70% train, then the remaining 30% halved into val/test.
    # NOTE(review): if the source images include augmented copies of the same
    # originals, a per-file random split can leak near-duplicates between
    # splits - confirm against the dataset description.
    train_imgs, temp_imgs, train_labels, temp_labels = train_test_split(
        images, labels, test_size=0.3, stratify=labels, random_state=42
    )
    val_imgs, test_imgs, val_labels, test_labels = train_test_split(
        temp_imgs, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
    )

    # Process each split
    for split, img_list, label_list in [
        ('train', train_imgs, train_labels),
        ('val', val_imgs, val_labels),
        ('test', test_imgs, test_labels)
    ]:
        _export_split(split, img_list, label_list)

def create_yaml():
    """Write the dataset description ``lc25000.yaml`` into ``output_dir``.

    The file lists the train/val/test image folders, the class count and the
    class names (index 0 = 'cancer', index 1 = 'non-cancer').
    """
    # as_posix() keeps forward slashes on Windows, so the written paths are
    # not a mix of '\' (from str(Path)) and the literal '/images/...' suffix.
    root = output_dir.as_posix()
    yaml_content = f"""
train: {root}/images/train
val: {root}/images/val
test: {root}/images/test
nc: 2
names: ['cancer', 'non-cancer']
"""
    with open(output_dir / 'lc25000.yaml', 'w') as f:
        f.write(yaml_content)

if __name__ == '__main__':
    # Build the folder layout, export images and labels, then write the YAML.
    for step in (create_dirs, process_images, create_yaml):
        step()
    print("Dataset prepared successfully!")

    # Report how many files ended up in each image split folder.
    for title, split in (('Train', 'train'), ('Val', 'val'), ('Test', 'test')):
        split_count = len(os.listdir(output_dir / 'images' / split))
        print(f"{title} images: {split_count}")

# train.py

In [1]:
from ultralytics import YOLO

def train_model():
    """Train the pretrained ``yolov8s.pt`` checkpoint on the ``lc25000.yaml`` dataset.

    Run artifacts are saved under ``runs/detect/yolov8s_lc25000`` in the
    project folder.
    """
    train_settings = dict(
        data='D:/Projects/LungCancerDetection/data/lc25000.yaml',
        epochs=50,
        imgsz=640,
        batch=8,  # Reduced for 4GB GPU
        device=0,  # Use GPU
        patience=10,  # Early stopping
        project='D:/Projects/LungCancerDetection/runs/detect',  # Explicit save path
        name='yolov8s_lc25000',
        exist_ok=True,  # Overwrite if exists
        cache=True,  # Cache dataset for faster loading
    )

    # Load model, then train with the settings above
    detector = YOLO('yolov8s.pt')
    detector.train(**train_settings)

    print("Training completed!")


if __name__ == '__main__':
    train_model()

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:08<00:00, 2.74MB/s]


Ultralytics 8.3.147  Python-3.10.17 torch-2.7.0+cu128 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=True, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=D:/Projects/LungCancerDetection/data/lc25000.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8s.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolov8s_lc25000, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, pa

100%|██████████| 5.35M/5.35M [00:01<00:00, 3.75MB/s]


[34m[1mAMP: [0mchecks passed 
[34m[1mtrain: [0mFast image access  (ping: 0.30.2 ms, read: 188.848.3 MB/s, size: 108.5 KB)


[34m[1mtrain: [0mScanning D:\Projects\LungCancerDetection\data\labels\train.cache... 17500 images, 0 backgrounds, 0 corrupt: 100%|██████████| 17500/17500 [00:00<?, ?it/s]


[34m[1mval: [0mFast image access  (ping: 0.30.1 ms, read: 142.432.6 MB/s, size: 111.5 KB)


[34m[1mval: [0mScanning D:\Projects\LungCancerDetection\data\labels\val.cache... 3750 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3750/3750 [00:00<?, ?it/s]






Plotting labels to D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mD:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000[0m
Starting training for 50 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/50      1.84G     0.1712     0.5502     0.9602         12        640: 100%|██████████| 2188/2188 [07:25<00:00,  4.91it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:42<00:00,  5.54it/s]


                   all       3750       3750      0.942      0.951      0.994      0.994

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/50      1.86G     0.1393     0.2659     0.9158         14        640: 100%|██████████| 2188/2188 [07:22<00:00,  4.94it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:42<00:00,  5.55it/s]

                   all       3750       3750      0.987      0.993      0.995      0.968






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/50      1.93G     0.1642     0.2884     0.9198         13        640: 100%|██████████| 2188/2188 [07:13<00:00,  5.04it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:42<00:00,  5.55it/s]

                   all       3750       3750      0.952      0.974      0.978      0.906






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/50       1.9G      0.155     0.2833     0.9185         14        640: 100%|██████████| 2188/2188 [07:19<00:00,  4.97it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:51<00:00,  4.53it/s]

                   all       3750       3750      0.998      0.997      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/50      1.83G     0.1242     0.2367     0.9047         11        640: 100%|██████████| 2188/2188 [07:42<00:00,  4.73it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:47<00:00,  4.91it/s]

                   all       3750       3750      0.935      0.991      0.994      0.994






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/50      1.83G      0.108     0.2114     0.9017         14        640: 100%|██████████| 2188/2188 [07:40<00:00,  4.75it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:46<00:00,  5.03it/s]

                   all       3750       3750      0.995      0.998      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/50      1.85G    0.09736     0.1968     0.8985         15        640: 100%|██████████| 2188/2188 [07:42<00:00,  4.73it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:49<00:00,  4.74it/s]

                   all       3750       3750      0.999      0.999      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/50      1.83G    0.08878     0.1816     0.8956         15        640: 100%|██████████| 2188/2188 [09:53<00:00,  3.68it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [01:32<00:00,  2.54it/s]

                   all       3750       3750      0.999      0.999      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/50      1.85G    0.08123     0.1723      0.895         10        640: 100%|██████████| 2188/2188 [14:00<00:00,  2.60it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:45<00:00,  5.17it/s]

                   all       3750       3750      0.999      0.999      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/50       1.9G    0.07706     0.1632     0.8943         16        640: 100%|██████████| 2188/2188 [08:06<00:00,  4.50it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:55<00:00,  4.26it/s]

                   all       3750       3750      0.999      0.999      0.995      0.961






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/50      1.85G    0.07263     0.1581     0.8919         13        640: 100%|██████████| 2188/2188 [08:12<00:00,  4.44it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:49<00:00,  4.73it/s]

                   all       3750       3750      0.993      0.999      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/50       1.9G    0.06947     0.1535     0.8911         14        640: 100%|██████████| 2188/2188 [07:19<00:00,  4.98it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:42<00:00,  5.58it/s]

                   all       3750       3750      0.999          1      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/50      1.85G     0.0671     0.1496     0.8925         16        640: 100%|██████████| 2188/2188 [08:08<00:00,  4.48it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:46<00:00,  5.02it/s]

                   all       3750       3750          1          1      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/50       1.9G     0.0639     0.1453      0.892         12        640: 100%|██████████| 2188/2188 [07:53<00:00,  4.62it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:44<00:00,  5.29it/s]

                   all       3750       3750      0.999          1      0.995      0.991






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/50      1.84G    0.06123     0.1388     0.8894         11        640: 100%|██████████| 2188/2188 [07:24<00:00,  4.92it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:41<00:00,  5.66it/s]

                   all       3750       3750          1          1      0.995      0.995






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      16/50       1.9G    0.05846     0.1327     0.8915         13        640: 100%|██████████| 2188/2188 [07:06<00:00,  5.13it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:41<00:00,  5.67it/s]

                   all       3750       3750          1          1      0.995      0.987






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      17/50      1.93G    0.05586     0.1273     0.8898          8        640: 100%|██████████| 2188/2188 [07:06<00:00,  5.13it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:47<00:00,  4.91it/s]

                   all       3750       3750          1          1      0.995      0.995
[34m[1mEarlyStopping: [0mTraining stopped early as no improvement observed in last 10 epochs. Best results observed at epoch 7, best model saved as best.pt.
To update EarlyStopping(patience=10) pass a new patience value, i.e. `patience=300` or use `patience=0` to disable EarlyStopping.






17 epochs completed in 2.528 hours.
Optimizer stripped from D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000\weights\last.pt, 22.5MB
Optimizer stripped from D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000\weights\best.pt, 22.5MB

Validating D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000\weights\best.pt...
Ultralytics 8.3.147  Python-3.10.17 torch-2.7.0+cu128 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
Model summary (fused): 72 layers, 11,126,358 parameters, 0 gradients, 28.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 235/235 [00:45<00:00,  5.12it/s]


                   all       3750       3750          1          1      0.995      0.995
                cancer       2250       2250          1          1      0.995      0.995
            non-cancer       1500       1500      0.999          1      0.995      0.995
Speed: 0.3ms preprocess, 6.3ms inference, 0.0ms loss, 1.6ms postprocess per image
Results saved to [1mD:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000[0m
Training completed!


# Evaluate.py

In [2]:
import os
import numpy as np
from ultralytics import YOLO
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Load trained model
model = YOLO('D:/Projects/LungCancerDetection/runs/detect/yolov8s_lc25000/weights/best.pt')

# Test set paths
test_img_dir = 'D:/Projects/LungCancerDetection/data/images/test'
test_label_dir = 'D:/Projects/LungCancerDetection/data/labels/test'

# Class ids used in the label files: 0 = cancer, 1 = non-cancer
# (same order as the heatmap tick labels below).
CANCER, NON_CANCER = 0, 1

# Initialize lists
y_true = []
y_pred = []

# Predict on test images
for img_name in os.listdir(test_img_dir):
    if not img_name.endswith('.jpeg'):
        continue
    img_path = os.path.join(test_img_dir, img_name)
    results = model.predict(img_path, conf=0.5)

    # Extract class prediction from the first returned box.
    # NOTE(review): presumably the first box is the highest-confidence one - confirm.
    # Default to cancer if no detection.
    pred_class = int(results[0].boxes.cls[0]) if results[0].boxes else CANCER
    y_pred.append(pred_class)

    # Read ground truth: the class id is the first field of the label line.
    label_path = os.path.join(test_label_dir, img_name.replace('.jpeg', '.txt'))
    with open(label_path, 'r') as f:
        true_class = int(f.read().strip().split()[0])
    y_true.append(true_class)

# Without any sample the metrics below are undefined; stop with a clear message.
if not y_true:
    print(f"Error: no .jpeg images found in {test_img_dir}.")
    raise SystemExit(1)

# Compute metrics
# average='binary' scores a single positive class and defaults to label 1
# (non-cancer); pos_label makes the scores describe cancer detection.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary', pos_label=CANCER)
recall = recall_score(y_true, y_pred, average='binary', pos_label=CANCER)
f1 = f1_score(y_true, y_pred, average='binary', pos_label=CANCER)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Confusion matrix
# labels= pins the matrix to 2x2 in tick-label order even when one class is
# absent from both y_true and y_pred.
cm = confusion_matrix(y_true, y_pred, labels=[CANCER, NON_CANCER])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Cancer', 'Non-Cancer'], yticklabels=['Cancer', 'Non-Cancer'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('D:/Projects/LungCancerDetection/confusion_matrix.png')
plt.show()


image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca10.jpeg: 640x640 1 cancer, 137.0ms
Speed: 8.5ms preprocess, 137.0ms inference, 5.8ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca100.jpeg: 640x640 1 cancer, 12.3ms
Speed: 3.2ms preprocess, 12.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1008.jpeg: 640x640 1 cancer, 12.2ms
Speed: 2.5ms preprocess, 12.2ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca101.jpeg: 640x640 1 cancer, 12.4ms
Speed: 2.2ms preprocess, 12.4ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1015.jpeg: 640x640 1 cancer, 12.3ms
Speed: 2.4ms preprocess, 12.3ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

i

<Figure size 800x600 with 2 Axes>

# visualize_misclassified.py

In [3]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
import matplotlib.pyplot as plt
from pathlib import Path

# Paths
model_path = 'D:/Projects/LungCancerDetection/runs/detect/yolov8s_lc25000/weights/best.pt'
test_img_dir = 'D:/Projects/LungCancerDetection/data/images/test'
test_label_dir = 'D:/Projects/LungCancerDetection/data/labels/test'

# Load model
model = YOLO(model_path)

# Class names, indexed by class id
class_names = ['Cancer', 'Non-Cancer']


def _summary(entry):
    """Return the 'True: ..., Pred: ...' text for one misclassified entry."""
    return f"True: {class_names[entry['true_class']]}, Pred: {class_names[entry['pred_class']]}"


# Collect misclassified images
misclassified = []

jpeg_names = (name for name in os.listdir(test_img_dir) if name.endswith('.jpeg'))
for img_name in jpeg_names:
    img_path = os.path.join(test_img_dir, img_name)
    detections = model.predict(img_path, conf=0.5)[0].boxes

    # Predicted class: first box, or class 0 when nothing was detected
    if detections:
        pred_class = int(detections.cls[0])
    else:
        pred_class = 0

    # Ground truth: first field of the matching label file
    label_path = os.path.join(test_label_dir, img_name.replace('.jpeg', '.txt'))
    with open(label_path, 'r') as f:
        first_field = f.read().strip().split()[0]
    true_class = int(first_field)

    if pred_class == true_class:
        continue
    misclassified.append(dict(img_path=img_path, true_class=true_class, pred_class=pred_class))

if misclassified:
    # Visualize misclassified images
    print(f"Found {len(misclassified)} misclassified images.")
    for item in misclassified[:5]:  # Show up to 5
        # OpenCV loads BGR; matplotlib expects RGB
        rgb = cv2.cvtColor(cv2.imread(item['img_path']), cv2.COLOR_BGR2RGB)
        plt.figure(figsize=(6, 6))
        plt.imshow(rgb)
        plt.title(_summary(item))
        plt.axis('off')
        plt.show()

    # Save misclassified paths to a file
    with open('D:/Projects/LungCancerDetection/misclassified.txt', 'w') as f:
        f.writelines(f"{item['img_path']}, {_summary(item)}\n" for item in misclassified)
else:
    print("No misclassified images found!")


image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca10.jpeg: 640x640 1 cancer, 12.2ms
Speed: 2.2ms preprocess, 12.2ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca100.jpeg: 640x640 1 cancer, 12.1ms
Speed: 2.0ms preprocess, 12.1ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1008.jpeg: 640x640 1 cancer, 12.1ms
Speed: 1.8ms preprocess, 12.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca101.jpeg: 640x640 1 cancer, 12.1ms
Speed: 2.4ms preprocess, 12.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1015.jpeg: 640x640 1 cancer, 12.2ms
Speed: 1.7ms preprocess, 12.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

ima

# Rerun constrained to 2 epochs

# train.py

In [1]:
from ultralytics import YOLO

def train_model():
    """Train the pretrained ``yolov8s.pt`` checkpoint for 2 epochs on ``lc25000.yaml``.

    Run artifacts are saved under ``runs/detect/yolov8s_lc25000_2epochs`` in
    the project folder.
    """
    train_settings = dict(
        data='D:/Projects/LungCancerDetection/data/lc25000.yaml',
        epochs=2,  # 2 epochs
        imgsz=640,
        batch=4,  # Reduced for memory
        device=0,
        patience=0,
        project='D:/Projects/LungCancerDetection/runs/detect',
        name='yolov8s_lc25000_2epochs',
        exist_ok=True,
        cache=False,  # Disable RAM caching
    )

    # Load model, then train with the settings above
    detector = YOLO('yolov8s.pt')
    detector.train(**train_settings)

    print("Training completed!")


if __name__ == '__main__':
    train_model()

Ultralytics 8.3.147  Python-3.10.17 torch-2.7.0+cu128 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=D:/Projects/LungCancerDetection/data/lc25000.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=2, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8s.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolov8s_lc25000_2epochs, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=

[34m[1mtrain: [0mScanning D:\Projects\LungCancerDetection\data\labels\train.cache... 17500 images, 0 backgrounds, 0 corrupt: 100%|██████████| 17500/17500 [00:00<?, ?it/s]


[34m[1mval: [0mFast image access  (ping: 0.50.1 ms, read: 8.02.6 MB/s, size: 112.0 KB)


[34m[1mval: [0mScanning D:\Projects\LungCancerDetection\data\labels\val.cache... 3750 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3750/3750 [00:00<?, ?it/s]


Plotting labels to D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000_2epochs\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001667, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mD:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000_2epochs[0m
Starting training for 2 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/2      1.07G     0.1495     0.4735     0.9323         13        640: 100%|██████████| 4375/4375 [09:56<00:00,  7.33it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 469/469 [00:54<00:00,  8.63it/s]

                   all       3750       3750      0.964      0.983      0.985      0.959






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/2      1.31G     0.0763     0.2332     0.8991         15        640: 100%|██████████| 4375/4375 [09:20<00:00,  7.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 469/469 [00:50<00:00,  9.28it/s]

                   all       3750       3750      0.937      0.983      0.959      0.959






2 epochs completed in 0.351 hours.
Optimizer stripped from D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000_2epochs\weights\last.pt, 22.5MB
Optimizer stripped from D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000_2epochs\weights\best.pt, 22.5MB

Validating D:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000_2epochs\weights\best.pt...
Ultralytics 8.3.147  Python-3.10.17 torch-2.7.0+cu128 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)
Model summary (fused): 72 layers, 11,126,358 parameters, 0 gradients, 28.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 469/469 [00:43<00:00, 10.71it/s]


                   all       3750       3750      0.964      0.983      0.985      0.959
                cancer       2250       2250      0.999       0.98      0.995      0.991
            non-cancer       1500       1500      0.929      0.986      0.974      0.928
Speed: 0.3ms preprocess, 6.5ms inference, 0.0ms loss, 1.5ms postprocess per image
Results saved to [1mD:\Projects\LungCancerDetection\runs\detect\yolov8s_lc25000_2epochs[0m
Training completed!


# Evaluate.py

In [2]:
import os
import numpy as np
from ultralytics import YOLO
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Load trained model
model = YOLO('D:/Projects/LungCancerDetection/runs/detect/yolov8s_lc25000_2epochs/weights/last.pt')

# Test set paths
test_img_dir = 'D:/Projects/LungCancerDetection/data/images/test'
test_label_dir = 'D:/Projects/LungCancerDetection/data/labels/test'

# Class ids used in the label files: 0 = cancer, 1 = non-cancer
# (same order as the heatmap tick labels below).
CANCER, NON_CANCER = 0, 1

# Initialize lists
y_true = []
y_pred = []

# Predict on test images
for img_name in os.listdir(test_img_dir):
    if not img_name.endswith('.jpeg'):
        continue
    img_path = os.path.join(test_img_dir, img_name)
    results = model.predict(img_path, conf=0.5)

    # Extract class prediction from the first returned box.
    # NOTE(review): presumably the first box is the highest-confidence one - confirm.
    # Default to cancer if no detection.
    pred_class = int(results[0].boxes.cls[0]) if results[0].boxes else CANCER
    y_pred.append(pred_class)

    # Read ground truth: the class id is the first field of the label line.
    label_path = os.path.join(test_label_dir, img_name.replace('.jpeg', '.txt'))
    with open(label_path, 'r') as f:
        true_class = int(f.read().strip().split()[0])
    y_true.append(true_class)

# Without any sample the metrics below are undefined; stop with a clear message.
if not y_true:
    print(f"Error: no .jpeg images found in {test_img_dir}.")
    raise SystemExit(1)

# Compute metrics
# average='binary' scores a single positive class and defaults to label 1
# (non-cancer); pos_label makes the scores describe cancer detection.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary', pos_label=CANCER)
recall = recall_score(y_true, y_pred, average='binary', pos_label=CANCER)
f1 = f1_score(y_true, y_pred, average='binary', pos_label=CANCER)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Confusion matrix
# labels= pins the matrix to 2x2 in tick-label order even when one class is
# absent from both y_true and y_pred.
cm = confusion_matrix(y_true, y_pred, labels=[CANCER, NON_CANCER])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Cancer', 'Non-Cancer'], yticklabels=['Cancer', 'Non-Cancer'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (2 Epochs)')
plt.savefig('D:/Projects/LungCancerDetection/confusion_matrix_2epochs.png')
plt.show()


image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca10.jpeg: 640x640 1 cancer, 42.8ms
Speed: 6.2ms preprocess, 42.8ms inference, 4.5ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca100.jpeg: 640x640 1 cancer, 12.3ms
Speed: 2.5ms preprocess, 12.3ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1008.jpeg: 640x640 1 cancer, 12.3ms
Speed: 2.0ms preprocess, 12.3ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca101.jpeg: 640x640 1 cancer, 12.5ms
Speed: 2.8ms preprocess, 12.5ms inference, 2.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1015.jpeg: 640x640 1 cancer, 12.4ms
Speed: 3.5ms preprocess, 12.4ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

ima

<Figure size 800x600 with 2 Axes>

# visualize_misclassified.py

In [3]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
import matplotlib.pyplot as plt
from pathlib import Path

# Runs the 2-epoch YOLO model over every .jpeg in the test set, compares the
# predicted class with the first class id in the matching label file, shows up
# to five misclassified images, and writes all misclassifications to a report.

# Paths
model_path = 'D:/Projects/LungCancerDetection/runs/detect/yolov8s_lc25000_2epochs/weights/last.pt'
test_img_dir = 'D:/Projects/LungCancerDetection/data/images/test'
test_label_dir = 'D:/Projects/LungCancerDetection/data/labels/test'

# Load model
model = YOLO(model_path)

# Class names, indexed by class id
class_names = ['Cancer', 'Non-Cancer']

# Collect misclassified images
misclassified = []

# sorted(): os.listdir() order is arbitrary, so sort to make the displayed
# samples and the saved report reproducible between runs.
for img_name in sorted(os.listdir(test_img_dir)):
    if not img_name.endswith('.jpeg'):
        continue
    img_path = os.path.join(test_img_dir, img_name)

    # Swap only the extension; str.replace would rewrite every '.jpeg'
    # occurrence inside the file name.
    stem = os.path.splitext(img_name)[0]
    label_path = os.path.join(test_label_dir, stem + '.txt')

    # Ground truth: the first whitespace-separated token of the label file.
    # Read it before running inference so unlabeled images cost nothing.
    try:
        with open(label_path, 'r') as f:
            label_tokens = f.read().split()
    except FileNotFoundError:
        print(f"Warning: label file not found for {img_name}, skipping.")
        continue
    if not label_tokens:
        print(f"Warning: label file is empty for {img_name}, skipping.")
        continue
    true_class = int(label_tokens[0])

    results = model.predict(img_path, conf=0.5)

    # Predicted class: class of the first returned box; an image with no
    # boxes above the confidence threshold falls back to class 0.
    boxes = results[0].boxes
    pred_class = int(boxes.cls[0]) if boxes else 0

    if pred_class != true_class:
        misclassified.append({
            'img_path': img_path,
            'true_class': true_class,
            'pred_class': pred_class
        })

# Visualize misclassified images
if not misclassified:
    print("No misclassified images found!")
else:
    print(f"Found {len(misclassified)} misclassified images.")
    for item in misclassified[:5]:
        img = cv2.imread(item['img_path'])
        # cv2.imread returns None instead of raising when it cannot decode.
        if img is None:
            print(f"Warning: could not read {item['img_path']}, skipping.")
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.figure(figsize=(6, 6))
        plt.imshow(img)
        plt.title(f"True: {class_names[item['true_class']]}, Pred: {class_names[item['pred_class']]}")
        plt.axis('off')
        plt.show()

# Save misclassified paths
if misclassified:
    with open('D:/Projects/LungCancerDetection/misclassified_2epochs.txt', 'w') as f:
        for item in misclassified:
            f.write(f"{item['img_path']}, True: {class_names[item['true_class']]}, Pred: {class_names[item['pred_class']]}\n")


image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca10.jpeg: 640x640 1 cancer, 107.4ms
Speed: 5.8ms preprocess, 107.4ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca100.jpeg: 640x640 1 cancer, 12.4ms
Speed: 2.8ms preprocess, 12.4ms inference, 2.8ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1008.jpeg: 640x640 1 cancer, 12.3ms
Speed: 2.4ms preprocess, 12.3ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca101.jpeg: 640x640 1 cancer, 12.4ms
Speed: 2.6ms preprocess, 12.4ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 D:\Projects\LungCancerDetection\data\images\test\colonca1015.jpeg: 640x640 1 cancer, 12.4ms
Speed: 2.5ms preprocess, 12.4ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)

i

<Figure size 600x600 with 1 Axes>

<Figure size 600x600 with 1 Axes>

<Figure size 600x600 with 1 Axes>

<Figure size 600x600 with 1 Axes>

<Figure size 600x600 with 1 Axes>

# predict.py

In [None]:
import os
from ultralytics import YOLO
from pathlib import Path
import cv2
import json

# Runs the trained YOLO model on every image in input_dir, writes a copy of
# each image annotated with the predicted label into output_dir, and saves all
# predictions to predictions.json.

# Configuration
model_path = 'D:/Projects/LungCancerDetection/runs/detect/yolov8s_lc25000_2epochs/weights/last.pt'  # 2-epoch model; for 17-epoch, use 'D:/Projects/LungCancerDetection/backup_17epochs/runs/detect/yolov8s_lc25000/weights/best.pt'
input_dir = 'D:/Projects/LungCancerDetection/test_images'  # Directory with new images
output_dir = 'D:/Projects/LungCancerDetection/predictions'
conf_threshold = 0.5
class_names = ['Cancer', 'Non-Cancer']
image_exts = ('.jpg', '.jpeg', '.png')

# Create output directory
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Load model
model = YOLO(model_path)

# Predict on new images
results_list = []
# sorted(): os.listdir() order is arbitrary; sort for a reproducible JSON.
for img_name in sorted(os.listdir(input_dir)):
    # Compare case-insensitively so files such as 'scan.JPG' are not skipped.
    if not img_name.lower().endswith(image_exts):
        continue
    img_path = os.path.join(input_dir, img_name)

    # cv2.imread returns None (it does not raise) for unreadable files; check
    # up front so one bad file does not abort the whole batch at putText.
    img = cv2.imread(img_path)
    if img is None:
        print(f"Warning: could not read {img_path}, skipping.")
        continue

    results = model.predict(img_path, conf=conf_threshold)

    # Extract prediction from the first returned box.
    boxes = results[0].boxes
    if boxes:
        pred_class = int(boxes.cls[0])
        conf = float(boxes.conf[0])
    else:
        # NOTE(review): no box above the threshold is reported as class 0
        # ('Cancer') with confidence 0.0 — confirm this fallback is intended.
        pred_class = 0
        conf = 0.0
    pred_label = class_names[pred_class]

    # Save result
    results_list.append({
        'image': img_name,
        'prediction': pred_label,
        'confidence': conf
    })

    # Save image with label
    cv2.putText(img, f'{pred_label} ({conf:.2f})', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    cv2.imwrite(os.path.join(output_dir, img_name), img)

# Save predictions to JSON
with open(os.path.join(output_dir, 'predictions.json'), 'w') as f:
    json.dump(results_list, f, indent=4)

print(f"Predictions saved to {output_dir}")

# To use the 17-epoch model for prediction instead

In [None]:
# Rebind model_path to the best.pt weights stored under backup_17epochs.
# This assignment alone does not reload anything: YOLO(model_path) and the
# prediction code must be executed again afterwards for it to take effect.
model_path = 'D:/Projects/LungCancerDetection/backup_17epochs/runs/detect/yolov8s_lc25000/weights/best.pt'