In [1]:
import torch
from transformers import ViTFeatureExtractor, ViTForImageClassification, TrainingArguments, Trainer, ViTConfig
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset
from PIL import Image
import os
import numpy as np
import torchvision.transforms as transforms
# class VisionDataset(Dataset):
#     def __init__(self, filepaths, labels, feature_extractor):
#         self.filepaths = filepaths
#         self.labels = labels
#         self.feature_extractor = feature_extractor

#     def __len__(self):
#         return len(self.filepaths)

#     def __getitem__(self, idx):
#         image = Image.open(self.filepaths[idx])
#         image = np.array(image)
#         encoding = self.feature_extractor(images=image, return_tensors='pt')
#         encoding['pixel_values'] = encoding['pixel_values'].squeeze() # remove batch dimension
#         label = self.labels[idx]
#         return {'pixel_values': encoding['pixel_values'], 'label': torch.tensor(label)}

class VisionDataset(Dataset):
    def __init__(self, filepaths, labels, feature_extractor):
        self.filepaths = filepaths
        self.labels = labels
        self.feature_extractor = feature_extractor
        self.transform = transforms.Compose([
            # other transforms...
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        image = Image.open(self.filepaths[idx])
        image = image * 0.5 + 0.5  # unnormalize
        image = Image.fromarray((image * 255).astype(np.uint8))  # convert to PIL image
        image = self.transform(image)
        if len(image.shape) == 2:
            image = image.unsqueeze(0)
        encoding = self.feature_extractor(images=image, return_tensors='pt')
        encoding['pixel_values'] = encoding['pixel_values'].squeeze() # remove batch dimension
        label = self.labels[idx]
        return {'pixel_values': encoding['pixel_values'], 'label': torch.tensor(label)}

# load pretrained vision transformer model and feature extractor
config = ViTConfig.from_pretrained('google/vit-base-patch16-224')
config.num_labels = 2
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', config=config,ignore_mismatched_sizes=True)
# Classifier's weight and bias sizes should now match with the updated config.
print(model.classifier.weight.size())
print(model.classifier.bias.size())
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')


  from .autonotebook import tqdm as notebook_tqdm
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([2, 768])
torch.Size([2])




In [2]:
# prepare data
dir_path = '/home/kevinluo/Benign_Malignant_dataclassifier/'
subfolders = ['train', 'valid']
labels_dict = {'benign': 0, 'malignant': 1}
all_files = []
all_labels = []

for subfolder in subfolders:
    for label_folder, label_id in labels_dict.items():
        folder_path = os.path.join(dir_path, subfolder, label_folder)
        files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
        labels = [label_id]*len(files)
        all_files.extend(files)
        all_labels.extend(labels)

# split data into 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = {}

for fold, (train_ids, val_ids) in enumerate(skf.split(all_files, all_labels)):
    # prepare train and val datasets
    train_dataset = VisionDataset([all_files[i] for i in train_ids], [all_labels[i] for i in train_ids], feature_extractor)
    val_dataset = VisionDataset([all_files[i] for i in val_ids], [all_labels[i] for i in val_ids], feature_extractor)
    
    # set up training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy='steps',
        save_strategy='steps'
    )
    
    # initialize the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    
    # train and validate
    trainer.train()
    eval_result = trainer.evaluate()
    
    # save model
    trainer.save_model(f'./results_fold_{fold}/model')
    
    # store results
    results[fold] = eval_result

# print results
for fold, result in results.items():
    print(f"Fold: {fold}, Eval Result: {result}")

# test data
test_files = []
test_labels = []
for label_folder, label_id in labels_dict.items():
    folder_path = os.path.join(dir_path, 'test', label_folder)
    files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
    labels = [label_id]*len(files)
    test_files.extend(files)
    test_labels.extend(labels)

# test dataset
test_dataset = VisionDataset(test_files, test_labels, feature_extractor)

# load the best model and make prediction
best_model_index = np.argmin([result['eval_loss'] for result in results.values()])
model = ViTForImageClassification.from_pretrained(f'./results_fold_{best_model_index}/model')
trainer = Trainer(model=model, args=training_args)
predictions = trainer.predict(test_dataset=test_dataset)

# print prediction result
print(predictions)




TypeError: unsupported operand type(s) for *: 'JpegImageFile' and 'float'

In [10]:
import os
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from transformers import ViTFeatureExtractor, ViTForImageClassification, TrainingArguments, Trainer
import mlflow
# 資料夾路徑
directories = [
    '/home/kevinluo/Benign_Malignant_dataclassifier/train/benign',
    '/home/kevinluo/Benign_Malignant_dataclassifier/train/malignant',
    '/home/kevinluo/Benign_Malignant_dataclassifier/valid/benign',
    '/home/kevinluo/Benign_Malignant_dataclassifier/valid/malignant',
]

# 整理資料與標籤
images = []
labels = []
for i, directory in enumerate(directories):
    for filename in os.listdir(directory):
        if filename.endswith('.jpg'):
            images.append(os.path.join(directory, filename))
            labels.append(i // 2)  # 為 benign 資料夾給 0，malignant 資料夾給 1

# feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

class MammographyDataset(Dataset):
    def __init__(self, image_paths, labels, transforms=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert('RGB')
        if self.transforms:
            image = self.transforms(image)
        label = self.labels[idx]
        return image, label

# 訓練函數
def train_and_validate(train_images, train_labels, valid_images, valid_labels):
    # 建立 dataloader
    train_dataset = MammographyDataset(train_images, train_labels, transforms=transforms.ToTensor())
    valid_dataset = MammographyDataset(valid_images, valid_labels, transforms=transforms.ToTensor())
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

    # 建立模型
    model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=2, ignore_mismatched_sizes=True)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # 設定訓練參數
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # 建立 trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    # 訓練模型
    trainer.train()

    # 驗證模型
    metrics = trainer.evaluate()

    return metrics

# 進行 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
results = []
for fold, (train_index, valid_index) in enumerate(kf.split(images, labels)):
    train_images, train_labels = [images[i] for i in train_index], [labels[i] for i in train_index]
    valid_images, valid_labels = [images[i] for i in valid_index], [labels[i] for i in valid_index]
    
    # Create a new experiment for each fold
    mlflow.create_experiment(name=f"Fold_{fold}")
    
    # Then, start a new run in the newly created experiment
    with mlflow.start_run(experiment_id=f"Fold_{fold}"):
        result = train_and_validate(train_images, train_labels, valid_images, valid_labels, output_dir=f'./results_fold_{fold}')
        results.append(result)

# 輸出結果
for i, result in enumerate(results):
    print(f'Fold {i+1}:')
    print(f'\tLoss: {result["eval_loss"]}')
    print(f'\tAccuracy: {result["eval_accuracy"]}')




MlflowException: Experiment 'Fold_0' already exists.

In [13]:
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import StratifiedKFold
from transformers import ViTForImageClassification, AdamW
from tqdm import tqdm

# 資料夾路徑
directories = [
    '/home/kevinluo/Benign_Malignant_dataclassifier/train/benign',
    '/home/kevinluo/Benign_Malignant_dataclassifier/train/malignant',
    '/home/kevinluo/Benign_Malignant_dataclassifier/valid/benign',
    '/home/kevinluo/Benign_Malignant_dataclassifier/valid/malignant',
]

# 整理資料與標籤
images = []
labels = []
for i, directory in enumerate(directories):
    for filename in os.listdir(directory):
        if filename.endswith('.jpg'):
            images.append(os.path.join(directory, filename))
            labels.append(i // 2)  # 為 benign 資料夾給 0，malignant 資料夾給 1
# feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

# 定義我們的 dataset
class CustomImageDataset(Dataset):
    def __init__(self, img_paths, labels, transform=None):
        self.img_paths = img_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        image = Image.open(img_path).convert('RGB')
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)
        return image, label

# 定義我們的 train function
def train(model, device, train_loader, criterion, optimizer):
    model.train()
    train_loss = 0
    correct = 0

    for batch_idx, (data, target) in enumerate(tqdm(train_loader, desc='Training')):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.logits, target)
        train_loss += loss.item()
        pred = output.logits.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        loss.backward()
        optimizer.step()

    return train_loss / len(train_loader), correct / len(train_loader.dataset)

# 定義我們的 validation function
def validate(model, device, valid_loader, criterion):
    model.eval()
    valid_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(valid_loader, desc='Validating'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            valid_loss += criterion(output.logits, target).item()
            pred = output.logits.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    return valid_loss / len(valid_loader), correct / len(valid_loader.dataset)

# 定義我們的 test function
def test(model, device, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(test_loader, desc='Testing'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.logits.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    return correct / len(test_loader.dataset)

# 定義一個 function 來執行我們的 10-fold validation 訓練和驗證
def train_and_validate(train_images, train_labels, valid_images, valid_labels):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
    model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=2)
    model = model.to(device)

    train_dataset = CustomImageDataset(train_images, train_labels, transform=transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()]))
    valid_dataset = CustomImageDataset(valid_images, valid_labels, transform=transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()]))
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
    
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    for epoch in range(10):  # 這裡我們只訓練 10 個 epoch
        train_loss, train_acc = train(model, device, train_loader, criterion, optimizer)
        valid_loss, valid_acc = validate(model, device, valid_loader, criterion)
        print(f'Epoch: {epoch}, Train Loss: {train_loss}, Train Acc: {train_acc}, Valid Loss: {valid_loss}, Valid Acc: {valid_acc}')

    torch.save(model.state_dict(), f'./vit_model_fold.pth')

# 先將 train 和 valid 的路徑及 label 合併
train_valid_images = train_images + valid_images
train_valid_labels = train_labels + valid_labels

# 進行 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
results = []
for fold, (train_index, valid_index) in enumerate(kf.split(train_valid_images, train_valid_labels)):
    train_images, train_labels = [train_valid_images[i] for i in train_index], [train_valid_labels[i] for i in train_index]
    valid_images, valid_labels = [train_valid_images[i] for i in valid_index], [train_valid_labels[i] for i in valid_index]
    train_and_validate(train_images, train_labels, valid_images, valid_labels)
    
# 最後使用全部的 train 和 valid 數據來對不起，我不小心送出了不完整的回答，以下是完整的程式碼：


# 最後使用全部的 train 和 valid 數據來訓練模型，並在 test 數據上進行測試
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=2)
model = model.to(device)

train_valid_dataset = CustomImageDataset(train_valid_images, train_valid_labels, transform=transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()]))
test_dataset = CustomImageDataset(test_images, test_labels, transform=transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()]))

train_valid_loader = DataLoader(train_valid_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(10):  # 這裡我們只訓練 10 個 epoch
    train_loss, train_acc = train(model, device, train_valid_loader, criterion, optimizer)
    print(f'Epoch: {epoch}, Train Loss: {train_loss}, Train Acc: {train_acc}')

test_acc = test(model, device, test_loader)
print(f'Test Acc: {test_acc}')




RuntimeError: Error(s) in loading state_dict for ViTForImageClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([1000, 768]) from checkpoint, the shape in current model is torch.Size([2, 768]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([1000]) from checkpoint, the shape in current model is torch.Size([2]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn
import torch.nn.functional as F

class ViTForImageClassification(nn.Module):
    def __init__(self, num_labels=3):
        super(ViTForImageClassification, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels):
        outputs = self.vit(pixel_values=pixel_values)
        output = self.dropout(outputs.last_hidden_state[:,0])
        logits = self.classifier(output)

        loss = None
        if labels is not None:
          loss_fct = nn.CrossEntropyLoss()
          loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if loss is not None:
          return logits, loss.item()
        else:
          return logits, None

In [None]:
from transformers import ViTFeatureExtractor
import torch.nn as nn
import torch
# Define Model
#model = ViTForImageClassification(len(train_ds.classes))
# Feature Extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
# Adam Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Cross Entropy Loss
loss_func = nn.CrossEntropyLoss()
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
  model.cuda()