In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from PIL import Image
# from torchinfo import summary
import torch
import os
import warnings
warnings.filterwarnings("ignore")
from typing import Tuple

from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

from datasets import load_dataset
import pandas as pd

torch.cuda.empty_cache()


In [2]:
print('Number CUDA Devices:', torch.cuda.device_count())
print ('Current cuda device: ', torch.cuda.current_device(), ' **May not correspond to nvidia-smi ID above, check visibility parameter')

Number CUDA Devices: 1
Current cuda device:  0  **May not correspond to nvidia-smi ID above, check visibility parameter


In [None]:
# model_name_or_path = 'google/vit-base-patch16-224-in21k'
# model_name_or_path = 'google/vit-large-patch16-384'
model_name_or_path = 'google/vit-large-patch32-384'

### Load dataset (3 options)

In [3]:
# First from folder with train and test !!! Another preprocessor is not needed
from transformers import ViTImageProcessor

# dataset_folder_name = '../mnt/local/data/kalexu97/large_dataset'
# dataset_folder_name = '../mnt/local/data/kalexu97/short_dataset'
dataset_folder_name = '../mnt/local/data/kalexu97/large_balanced_dataset'
dataset = load_dataset("imagefolder", data_dir=dataset_folder_name)
train_dataset, test_dataset = dataset['train'], dataset['test']

processor = ViTImageProcessor.from_pretrained(model_name_or_path)

def transform(example_batch):
    inputs = processor([x.convert("RGB") for x in example_batch['image']], return_tensors='pt')

    inputs['label'] = example_batch['label']
    return inputs

prepared_ds_train = train_dataset.with_transform(transform)
prepared_ds_test = test_dataset.with_transform(transform)

prepared_ds_train = prepared_ds_train.shuffle(seed=42)
prepared_ds_test = prepared_ds_test.shuffle(seed=42)


Resolving data files:   0%|          | 0/3000 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/540 [00:00<?, ?it/s]

In [5]:
# Second option: also from folder with train and test
# !!! it is used for own preprocessor (the same for the 3-rd option)

dataset_folder_name = '../mnt/local/data/kalexu97/large_dataset'

def load_dataset_path2images(dataset_folder_name):
    train_test_folders = os.listdir(dataset_folder_name)
    datasets = {}
    for trts_split in train_test_folders:
        class_folders = os.listdir(dataset_folder_name+'/'+trts_split)
        labels = []
        pathes = []
        for class_folder in class_folders:
            image_names = os.listdir(dataset_folder_name+'/'+trts_split+'/'+class_folder)
            image_names = [dataset_folder_name+'/'+trts_split+'/'+class_folder+'/'+x for x in image_names]
            class_labels = [int(class_folder)]*len(image_names)
            labels.extend(class_labels)
            pathes.extend(image_names)
        local_dataset = {'image_path': pathes, 'label':labels}
        datasets[trts_split] = pd.DataFrame.from_dict(local_dataset)

    return datasets

dataset = load_dataset_path2images(dataset_folder_name)

train_dataset = dataset['train']
test_dataset = dataset['test']

In [3]:
# Third option: from csv table with names.
# !!! Need to server to save memory with large dataset
from sklearn.model_selection import train_test_split

labelsTable = pd.read_csv('../mnt/local/data/kalexu97/trainLabels.csv')
root_dir = '../mnt/local/data/kalexu97/train'
labelsTable['image_path'] = labelsTable['image'].apply(lambda x: os.path.join(root_dir, x+'.jpeg'))
labelsTable['label'] = labelsTable['level']

labelsTable = labelsTable.drop(columns=['image', 'level'], axis=1)

train_dataset, test_dataset = train_test_split(labelsTable, test_size=0.2)

In [4]:
# Resampling 

##############################################################################################

def resample(_dataset, ratio = 2): # n_items_in_all_resampled_class = ratio * n_in_most_minority_class
    min_size = _dataset['label'].value_counts().min()
    lst = []
    
    for class_index, group in _dataset.groupby('label'):
        if class_index == 0:
            lst.append(group.sample(min_size*ratio, replace=False)) # undersampling
        else:
            if len(group) > min_size*ratio:
                lst.append(group.sample(min_size*ratio, replace=False)) #undersampling
            else:
                lst.append(group) # add all minority class
                lst.append(group.sample(min_size*ratio-len(group), replace=True)) # oversampling
        
    _dataset = pd.concat(lst)

    for class_index, group in _dataset.groupby('label'):
        print(f'{class_index}: length: {len(group)}')

    return _dataset

##############################################################################################
train_dataset = resample(train_dataset, ratio = 4)
# test_dataset = resample(test_dataset, ratio = 1) # it is not good idea to rasmple test dataset! it confused metrics


0: length: 2236
1: length: 2236
2: length: 2236
3: length: 2236
4: length: 2236


### Preprocessing (!!! only for 2 and 3 options)

In [5]:
# mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
from datasets import Dataset
from transformers import ViTImageProcessor
from transformers import AutoImageProcessor


image_processor = AutoImageProcessor.from_pretrained(model_name_or_path)

normalize = T.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)

print(size)

_transforms = T.Compose([
    T.CenterCrop(1880),
    T.Resize(size, interpolation=T.InterpolationMode.BICUBIC),
    # T.RandomResizedCrop(size),
    T.ToTensor(),
    normalize])

_transforms_train = T.Compose([
    T.CenterCrop(1880),
    T.RandomHorizontalFlip(p = 0.5),
    T.RandomVerticalFlip(p = 0.5),
    # T.RandomRotation(degrees=(-90, 90)),
    T.Resize(size, interpolation=T.InterpolationMode.BICUBIC),
    # T.RandomResizedCrop(size),
    T.ToTensor(),
    normalize])


def load_image(path_image, label, mode):
    # load image
    image = Image.open(path_image)

    if mode == 'train' and label != 0:
        # image = transform_basic(image)
        # image = transform_plus_rand(image)
        return _transforms_train(image.convert("RGB"))
    else:
        # image = transform_basic(image)
        return _transforms(image.convert("RGB"))

        
def func_transform(examples):

    examples["pixel_values"] = [load_image(path, lb, 'train') 
                                for path, lb in zip(examples['image_path'], examples['label'])]
    del examples["image_path"]
    return examples

def func_transform_test(examples):

    examples["pixel_values"] = [load_image(path, lb, 'test')
                                for path, lb in zip(examples['image_path'], examples['label'])]
    del examples["image_path"]
    return examples
    

train_ds = Dataset.from_pandas(train_dataset, preserve_index=False)
test_ds = Dataset.from_pandas(test_dataset, preserve_index=False)

prepared_ds_train = train_ds.with_transform(func_transform)
prepared_ds_test = test_ds.with_transform(func_transform_test)

prepared_ds_train = prepared_ds_train.shuffle(seed=42)
prepared_ds_test = prepared_ds_test.shuffle(seed=42)


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

(384, 384)


In [6]:
print(len(prepared_ds_train))
print(len(prepared_ds_test))

11180
7026


In [7]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['label'] for x in batch])
    }

In [9]:
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.metrics import f1_score #, kappa
# from sklearn import metrics

import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions_proba, labels = eval_pred

    predictions = np.argmax(predictions_proba, axis=1)
    result_accuracy = accuracy.compute(predictions=predictions, references=labels)

    result = {
             'accuracy': np.mean([result_accuracy['accuracy']]),
             'kappa': np.mean([cohen_kappa_score(labels, predictions, weights = "quadratic")]),
             # 'quadratic_kappa': np.mean([kappa(labels, predictions, weights = "quadratic")]),
             'f1': np.mean([f1_score(labels, predictions, average='weighted')]),
             # 'roc_auc': np.mean([roc_auc_score(labels, predictions_proba, multi_class='ovr')])
             }

    cm = confusion_matrix(labels, predictions)
    print(cm)

    return result


In [11]:
from transformers import ViTForImageClassification

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    ignore_mismatched_sizes=True,
    num_labels=5
)

pytorch_model.bin:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch32-384 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([5, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./vit-base",
    
    # report_to="wandb",  # enable logging to W&B
    # run_name="vit384(32)_t_1",  # name of the W&B run (optional)
    logging_steps=20,  # how often to log to W&B
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=2,
    fp16=True,
    save_steps=20,
    eval_steps=20,
    # label_smoothing_factor = 0.96,
    learning_rate=1e-5,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="kappa", 
    greater_is_better = True
)

In [None]:
# it is the same as validation dataset (need to save time during training)

sample_ids = np.random.choice(len(prepared_ds_test), size=200, replace=False)
sample_ids_train = np.random.choice(len(prepared_ds_train), size=len(prepared_ds_train), replace=False)
val_ds = prepared_ds_test.select(sample_ids)
train_ds_shiffled = prepared_ds_train.select(sample_ids_train)

In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds_train,
    # eval_dataset=prepared_ds_test,
    eval_dataset=val_ds,
    tokenizer=image_processor,
)

In [14]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

[34m[1mwandb[0m: Currently logged in as: [33mkalexu97[0m ([33malexu97skoltech[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Kappa,F1
20,1.7008,1.536411,0.18,0.136699,0.200266
40,1.485,1.372026,0.56,0.339791,0.59591
60,1.4841,1.089692,0.72,0.453225,0.672762
80,1.2457,1.333434,0.27,0.413201,0.343473
100,1.331,0.881428,0.74,0.528797,0.723683
120,1.2404,0.921559,0.72,0.504229,0.707163
140,1.2058,1.106381,0.57,0.433281,0.624035
160,1.0881,0.888589,0.76,0.614164,0.731086
180,1.1941,1.097134,0.475,0.555744,0.566367
200,1.1021,0.838995,0.71,0.60374,0.720904


[2 4 1 1 1 4 1 4 2 1 4 1 2 1 1 1 1 1 1 1 0 1 1 2 1 2 1 1 1 1 2 4 1 1 1 1 2
 1 1 1 2 1 3 0 2 2 2 1 1 1 0 0 2 1 1 2 2 2 0 2 1 1 0 2 0 1 2 2 1 2 4 1 4 1
 1 1 1 1 0 2 1 1 2 1 2 1 1 0 1 2 3 1 2 2 3 1 4 0 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 1 1 1 1 2 3 4 1 1 1 4 1 0 1 1 1 2 1 1 0 0 0 1 1 1 1 0 1 0 0 1 3 1 1
 2 1 2 0 1 2 2 1 1 1 1 1 4 0 1 1 1 1 1 2 1 1 2 1 2 1 3 1 4 1 1 1 0 1 3 4 1
 1 3 1 1 0 1 1 1 2 2 4 1 2 1 1]
[0 0 1 0 0 4 0 1 0 0 0 0 2 0 0 0 0 0 1 3 0 0 0 0 0 2 0 0 2 0 0 4 2 0 0 0 2
 0 0 0 0 0 2 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3 2 1
 0 1 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 3 0 0 0 0 0 0 2 2 2 3 0 3 0
 0 0 0 0 0 2 0 2 1 0 0 0 3 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 1
 0 0 0 0 0 0 0 2 0 0 2 0 0 4 0 0 0 2 0 2 2 0 0 2 0 0 0 0 4 0 0 0 0 0 2 3 0
 0 0 0 2 2 0 0 0 0 0 1 0 2 0 2]
[0 4 0 0 0 4 4 4 0 0 4 0 4 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 4 0 0 0 0 4
 0 0 0 0 1 1 0 4 0 4 1 4 0 0 0 0 1 1 0 4 0 0 4 1 4 0 0 0 1 2 0 0 0 4 4 4 4
 0 1 0 0 0 1 0 0 0 4 4 1 0 0 0 0 0 0

In [15]:
metrics = trainer.evaluate(prepared_ds_test)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

[1 4 0 ... 0 2 0]
[0 0 0 ... 1 2 0]
***** eval metrics *****
  epoch                   =        2.0
  eval_accuracy           =     0.6926
  eval_f1                 =     0.7147
  eval_kappa              =     0.6355
  eval_loss               =     0.8205
  eval_runtime            = 0:12:53.89
  eval_samples_per_second =      9.079
  eval_steps_per_second   =      1.136
