In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset folders referenced below (/content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): `!cd` runs in a temporary subshell, so this line does not change
# the notebook's working directory. The rest of the notebook uses absolute Drive
# paths, and relative outputs such as "cancer_model" end up under /content (the
# last cell moves /content/cancer_model to Drive). Use `%cd` only if relative
# paths are really meant to resolve inside Drive.
!cd /content/drive/MyDrive

In [None]:
!pip install bitsandbytes
!pip install accelerate
!pip install transformers
!pip install datasets
!pip install evaluate

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.2.post2
Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.1 huggingface-hub-0.19.4
Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import cv2

# Define the source and destination directories
SRC_DIR = "/content/drive/MyDrive/Data"
DEST_DIR = "/content/drive/MyDrive/Data2"

# List of categories/classes (sub-folder names inside every split folder)
CATS = ["adenocarcinoma", "large.cell.carcinoma", "normal", "squamous.cell.carcinoma"]

# Split folders expected under SRC_DIR and mirrored under DEST_DIR
SPLITS = ["train", "test", "valid"]

# Threshold applied after histogram equalization: with THRESH_TOZERO, pixels
# above THRESHOLD keep their value and all other pixels are set to 0.
THRESHOLD = 128
MAX_VALUE = 255

# Source files that could not be read or written; reported at the end instead
# of aborting the whole run on the first bad file.
skipped = []

# Loop through each split (train, test, valid) and each class/category
for subdir in SPLITS:
    for category in CATS:
        path = os.path.join(SRC_DIR, subdir, category)

        # makedirs creates DEST_DIR and the split folder as needed, so no
        # separate os.mkdir(DEST_DIR) is required.
        cat_dir = os.path.join(DEST_DIR, subdir, category)
        os.makedirs(cat_dir, exist_ok=True)

        for image in os.listdir(path):
            curr = os.path.join(path, image)

            # cv2.imread signals failure by returning None (e.g. for a
            # non-image file or a directory) rather than raising.
            img = cv2.imread(curr, cv2.IMREAD_GRAYSCALE)
            if img is None:
                skipped.append(curr)
                continue

            equalizedImage = cv2.equalizeHist(img)
            _, segmentedImage = cv2.threshold(
                equalizedImage, THRESHOLD, MAX_VALUE, cv2.THRESH_TOZERO
            )

            # Create the destination path based on the category and subdirectory
            imgDest = os.path.join(cat_dir, image)
            if not cv2.imwrite(imgDest, segmentedImage):
                skipped.append(curr)

if skipped:
    print("Warning:", len(skipped), "file(s) could not be processed:")
    for skipped_path in skipped:
        print("  ", skipped_path)

print("Processed data directory created successfully at", DEST_DIR)


Processed data directory created successfully at /content/drive/MyDrive/Data2


In [None]:
from transformers import (
    ViTImageProcessor,
    ViTForImageClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
)
from torchvision import transforms
from datasets import load_dataset
import numpy as np
import evaluate
import torch

# Root folder of the image dataset passed to get_dataset(); this is the same
# path as the DEST_DIR written by the preprocessing cell.
PATH_TO_DATASET = "/content/drive/MyDrive/Data2"
# Checkpoint identifier handed to ViTForImageClassification.from_pretrained()
# via setup_model().
MODEL_PATH = "google/vit-base-patch16-224"


def get_dataset(folder_path, model_path="google/vit-base-patch16-224"):
    """
    Loads dataset of CT scan images and converts every image to ViT pixel values.

    LABELS:
    0-> "adeno carcinoma" (cancer type A)
    1-> "large cell carcinoma" (cancer type B)
    2-> "normal" (no cancer)
    3-> "squamous cell carcinoma" (cancer type C)

    NOTE(review): the label ids come from the "imagefolder" loader, not from
    the mapping below; the mapping presumably mirrors the alphabetical order
    of the class folder names -- confirm against the dataset's label feature.

    Args:
        folder_path: Root directory handed to the "imagefolder" dataset loader.
        model_path: Checkpoint whose image-processor configuration is used for
            preprocessing. Defaults to the checkpoint the notebook fine-tunes.

    Returns:
        Tuple ``(dataset, id2label, label2id, image_processor)`` where
        ``dataset`` has a "pixel_values" column in place of "image".
    """
    _label2id = {
        "adeno carcinoma": 0,
        "large cell carcinoma": 1,
        "normal": 2,
        "squamous cell carcinoma": 3,
    }
    # Inverse mapping, built from _label2id so the two can never drift apart.
    _id2label = {index: name for name, index in _label2id.items()}

    dataset = load_dataset("imagefolder", data_dir=folder_path)

    image_processor = ViTImageProcessor.from_pretrained(model_path)

    def preprocess(samples):
        """Turn a batch of PIL images into processor pixel values."""
        # Always hand the processor three-channel images. The previous
        # implementation tried the raw images first and fell back to this
        # conversion inside a bare `except:`, which also hid unrelated errors
        # (including KeyboardInterrupt).
        rgb_images = [img.convert("RGB") for img in samples["image"]]
        samples["pixel_values"] = image_processor.preprocess(
            images=rgb_images, return_tensors="pt"
        ).pixel_values
        # Drop the raw images so only tensor-like columns remain.
        del samples["image"]
        return samples

    dataset = dataset.map(preprocess, batched=True, batch_size=3)
    print("Dataset loaded successfully.")
    return dataset, _id2label, _label2id, image_processor


def setup_model(base_model_path, _id2label, _label2id, _labels):
    """
    Instantiate a ViT image classifier from a pretrained checkpoint.

    Args:
        base_model_path: Checkpoint identifier or path passed to
            ``from_pretrained``.
        _id2label: Mapping from class id to class name stored in the config.
        _label2id: Mapping from class name to class id stored in the config.
        _labels: Collection of labels; only its length is used, to set the
            number of output classes.

    Returns:
        The loaded ``ViTForImageClassification`` instance.
    """
    # ignore_mismatched_sizes lets the checkpoint load even though its
    # classifier head has a different number of outputs than len(_labels).
    head_options = {
        "num_labels": len(_labels),
        "id2label": _id2label,
        "label2id": _label2id,
        "ignore_mismatched_sizes": True,
    }
    classifier = ViTForImageClassification.from_pretrained(
        base_model_path, **head_options
    )

    print("Model loaded successfully.")
    return classifier


def compute_metrics(eval_pred):
    """
    Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy so that no metric module has to
    be (re)loaded on every evaluation call.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            one row of class scores per sample and ``labels`` the true class
            ids.

    Returns:
        ``{"accuracy": float}`` -- the fraction of samples whose highest
        scoring class equals the label.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    labels = np.asarray(labels)
    if predictions.shape != labels.shape:
        # Fail loudly instead of letting NumPy broadcasting produce a
        # meaningless score.
        raise ValueError(
            f"Mismatch between predictions {predictions.shape} and labels {labels.shape}"
        )
    return {"accuracy": float(np.mean(predictions == labels))}


def finetune_model(model, dataset, image_processor):
    """
    Fine-tune the classification head of ``model`` and save the result.

    All parameters of ``model.base_model`` are frozen, so only the parameters
    outside the base model are updated. Training runs on ``dataset["train"]``
    and is evaluated after every epoch on ``dataset["test"]``; the checkpoint
    with the lowest evaluation loss is restored at the end of training.

    NOTE(review): model selection uses the "test" split, while the caller
    reports final accuracy on the "validation" split -- confirm this naming
    is intentional.

    Args:
        model: Classifier to fine-tune; modified in place.
        dataset: Mapping with at least "train" and "test" splits containing
            "pixel_values" and "label" columns.
        image_processor: Processor passed to the Trainer and saved together
            with the model.

    Side effects:
        Writes checkpoints to "cancer_trainer" and the final model plus the
        image-processor configuration to "cancer_model".
    """
    data_collator = DefaultDataCollator()

    # Freeze the pretrained backbone; only the remaining parameters train.
    for param in model.base_model.parameters():
        param.requires_grad = False

    training_args = TrainingArguments(
        output_dir="cancer_trainer",
        # Keep "pixel_values"/"label" even if the Trainer cannot match them
        # against the model's forward signature.
        remove_unused_columns=False,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        learning_rate=1e-3,
        optim="adamw_torch",
        save_strategy="epoch",
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=16,
        num_train_epochs=6,
        warmup_ratio=0.0,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        weight_decay=0.00,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=image_processor,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    model.save_pretrained("cancer_model")
    # Save the preprocessing configuration next to the weights so the
    # "cancer_model" directory can be reloaded for inference on its own.
    image_processor.save_pretrained("cancer_model")


def validate_model(model, test_dataset):
    """
    Run the model on every sample and print per-class and overall accuracy.

    The model is moved to the CPU and put in eval mode, then each sample is
    classified individually. Every misclassification is printed as it occurs,
    followed by a summary per class and an overall summary.

    Args:
        model: Classifier exposing ``config.id2label`` and returning an object
            with a ``logits`` attribute when called on a batch.
        test_dataset: Iterable of samples with "pixel_values" (one image) and
            an integer "label".

    Returns:
        None. Results are printed.
    """

    def _accuracy(right, wrong):
        """Return right / (right + wrong), or a placeholder when there are no samples."""
        total = right + wrong
        return right / total if total else "n/a (no samples)"

    model.to("cpu")
    model.eval()

    id2label = model.config.id2label
    # One counter per class known to the model instead of a hard-coded four.
    num_classes = len(id2label)
    correct = [0] * num_classes
    incorrect = [0] * num_classes

    for sample in test_dataset:
        inputs = torch.as_tensor(sample["pixel_values"], dtype=torch.float32)
        # Add the leading batch dimension the model expects.
        inputs = inputs[None, :, :, :]
        with torch.no_grad():
            logits = model(inputs).logits
        predicted_label = logits.argmax(-1).item()
        expected_label = sample["label"]
        if predicted_label == expected_label:
            correct[expected_label] += 1
        else:
            incorrect[expected_label] += 1
            print(
                "incorrect prediction... predicted:",
                id2label[predicted_label],
                "expected:",
                id2label[expected_label],
            )

    for i in range(num_classes):
        print("\n", id2label[i])
        print("right:", correct[i], " wrong:", incorrect[i])
        # A class without samples previously raised ZeroDivisionError here.
        print("accuracy:", _accuracy(correct[i], incorrect[i]))
    print("\noverall")
    print("right:", sum(correct), "wrong:", sum(incorrect))
    print("accuracy:", _accuracy(sum(correct), sum(incorrect)))


if __name__ == "__main__":
    # Load the image folders and convert every image to processor pixel values.
    dataset, id2label, label2id, image_processor = get_dataset(PATH_TO_DATASET)
    # setup_model only uses the number of labels (to size the classifier head).
    labels = label2id.keys()
    model = setup_model(MODEL_PATH, id2label, label2id, labels)
    # Trains on dataset["train"], evaluates each epoch on dataset["test"], and
    # writes the fine-tuned model to "cancer_model".
    finetune_model(model, dataset, image_processor)
    # Final per-class and overall accuracy report on the "validation" split.
    validate_model(model, dataset["validation"])


Resolving data files:   0%|          | 0/613 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/315 [00:00<?, ?it/s]

Dataset loaded successfully.


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully.


Epoch,Training Loss,Validation Loss,Accuracy
0,1.2606,1.17968,0.44127
2,0.819,1.104697,0.447619
4,0.7133,1.104962,0.466667
5,0.6735,1.108856,0.466667


incorrect prediction... predicted: large cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: large cell carcinoma expected: adeno carcinoma
incorrect prediction... predicted: adeno carcinoma expected: large cell carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: large cell carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: large cell carcinoma
incorrect prediction... predicted: squamous cell carcinoma expected: large cell carcinoma
incorrect prediction... predicted: squamous cell carcinoma 

In [None]:
# Move the saved "cancer_model" directory from the Colab working directory
# into the mounted Drive folder.
!mv /content/cancer_model /content/drive/MyDrive