In [3]:

from PIL import Image
from torchinfo import summary
import torch
import os
import warnings
warnings.filterwarnings("ignore")
from typing import Tuple

from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

from datasets import load_dataset


In [None]:

# pip install datasets


In [None]:
# !pip install torchvision
# !pip install torchinfo
# !pip install -q git+https://github.com/huggingface/transformers.git


In [4]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
dataset_folder_name = 'drive/MyDrive/short_dataset'
dataset = load_dataset("imagefolder", data_dir=dataset_folder_name)


Resolving data files:   0%|          | 0/500 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/250 [00:00<?, ?it/s]

In [6]:
dataset


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 500
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 250
    })
})

In [36]:
from transformers import AutoImageProcessor, Swinv2Model

CHECKPOINT = 'microsoft/swinv2-tiny-patch4-window8-256'

processor = AutoImageProcessor.from_pretrained(CHECKPOINT)





In [37]:
def transform(example_batch):
    inputs = processor([x for x in example_batch['image']], return_tensors='pt')

    inputs['label'] = example_batch['label']
    return inputs

In [38]:
prepared_ds = dataset.with_transform(transform)


In [39]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['label'] for x in batch])
    }

In [40]:

# pip install evaluate

In [41]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    result_accuracy = accuracy.compute(predictions=predictions, references=labels)

    result = {
             'accuracy': np.mean([result_accuracy['accuracy']]),
             'kappa': np.mean([cohen_kappa_score(labels, predictions)]),
             'f1': np.mean([f1_score(labels, predictions, average='weighted')])
             }

    return result


In [42]:
from transformers import Swinv2ForImageClassification

labels = dataset['train'].features['label'].names

model = Swinv2ForImageClassification.from_pretrained(
    CHECKPOINT,
    num_labels=len(labels),
    ignore_mismatched_sizes=True,
)

model.to("cuda")



Some weights of Swinv2ForImageClassification were not initialized from the model checkpoint at microsoft/swinv2-tiny-patch4-window8-256 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Swinv2ForImageClassification(
  (swinv2): Swinv2Model(
    (embeddings): Swinv2Embeddings(
      (patch_embeddings): Swinv2PatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Swinv2Encoder(
      (layers): ModuleList(
        (0): Swinv2Stage(
          (blocks): ModuleList(
            (0-1): 2 x Swinv2Layer(
              (attention): Swinv2Attention(
                (self): Swinv2SelfAttention(
                  (continuous_position_bias_mlp): Sequential(
                    (0): Linear(in_features=2, out_features=512, bias=True)
                    (1): ReLU(inplace=True)
                    (2): Linear(in_features=512, out_features=3, bias=False)
                  )
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96

In [None]:
# pip install accelerate -U


In [None]:
# pip install transformers -U

In [43]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./Swinv2",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=1,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    load_best_model_at_end=True,
)

In [44]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    tokenizer=processor,
)

In [45]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

Step,Training Loss,Validation Loss


***** train metrics *****
  epoch                    =        1.0
  total_flos               = 15151244GF
  train_loss               =     1.6178
  train_runtime            = 0:01:55.79
  train_samples_per_second =      4.318
  train_steps_per_second   =      0.276


In [20]:
metrics = trainer.evaluate(prepared_ds['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        1.0
  eval_accuracy           =       0.32
  eval_f1                 =     0.2555
  eval_kappa              =       0.15
  eval_loss               =     1.5599
  eval_runtime            = 0:02:42.49
  eval_samples_per_second =      1.538
  eval_steps_per_second   =      0.197
