In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_dir = "drive/MyDrive/CS231N/project/train/"

#Dependencies

In [None]:
!pip install transformers datasets evaluate accelerate pillow torchvision scikit-learn

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl (547 kB)
Collecting evaluate
  Using cached evaluate-0.4.2-py3-none-any.whl (84 kB)
Collecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl (134 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from t

#Load Dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files=f"{data_dir}train.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['image', 'category'],
    num_rows: 6252
})

In [None]:
dataset = dataset.train_test_split(test_size=0.3)

In [None]:
labels = [1,2,3,4,5]
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label-1] = str(i+1)
    id2label[str(i+1)] = label-1

#Load Swin Transformer

In [None]:
from transformers import AutoImageProcessor

checkpoint = "microsoft/swin-tiny-patch4-window7-224"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

#Baseline

In [None]:
import torch
from transformers import pipeline
from PIL import Image
pipe = pipeline("image-classification", model="microsoft/swin-tiny-patch4-window7-224", device="cuda" if torch.cuda.is_available() else "cpu")



config.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/113M [00:00<?, ?B/s]

In [None]:
lookup = {
    1: "cargo",
2: "military",
3: "carrier",
4: "cruise",
5: "tankers",
}

In [None]:
correct = 0
total = 0
for dat in dataset["test"]:
  total += 1
  label = dat['category']
  infer = pipe(Image.open(data_dir + '/images/' + dat['image']))
  for elem in infer:
    if lookup[label] in elem["label"].split(", "):
      correct += 1
      break

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
print("CORRECT: ", correct)
print("TOTAL: ", total)
print("Accuracy: ", correct/total)

CORRECT:  77
TOTAL:  1876
Accuracy:  0.041044776119402986


#Apply transform

In [None]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
     image_processor.size["shortest_edge"]
     if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
 )
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [None]:
from PIL import Image
def transforms(examples):
    examples["pixel_values"] = [_transforms(Image.open(f"{data_dir}/images/{img}").convert("RGB")) for img in examples["image"]]
    examples["label"] = [label-1 for label in examples["category"]]
    del examples["image"]
    del examples["category"]
    return examples

In [None]:
dataset = dataset.with_transform(transforms)

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

#Evaluation metrics

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [None]:
dataset["train"].features['category']

Value(dtype='int64', id=None)

#*Finetune*

In [None]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
# print(len(labels))
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes = True,
)

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
if torch.cuda.is_available():
  device = "cuda"
  print("CUDA is available!")
else:
  device = "cpu"
  print("CUDA is not available.")

CUDA is available!


In [None]:
model.to(device)

SwinForImageClassification(
  (swin): SwinModel(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0-1): 2 x SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_features=96, out_features=96, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): SwinSelfOutput(
  

##Training with LR=3e-3

In [None]:
import torch

training_args = TrainingArguments(
    output_dir="finetuned_model_lr3",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.5839,1.416841,0.374733
1,1.2389,0.83454,0.757463
2,0.6245,0.455776,0.831557
4,0.4112,0.337238,0.876866
5,0.3446,0.305423,0.884328
6,0.337,0.293382,0.886461
8,0.3166,0.286565,0.888593
9,0.3083,0.275047,0.903518


TrainOutput(global_step=80, training_loss=0.6166996091604233, metrics={'train_runtime': 1981.0468, 'train_samples_per_second': 22.089, 'train_steps_per_second': 0.04, 'total_flos': 9.949194975195464e+17, 'train_loss': 0.6166996091604233, 'epoch': 9.142857142857142})

##Training with LR=5e-5

In [None]:
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes = True,
)
model.to(device)

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SwinForImageClassification(
  (swin): SwinModel(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0-1): 2 x SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_features=96, out_features=96, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): SwinSelfOutput(
  

In [None]:
import torch

training_args = TrainingArguments(
    output_dir="finetuned_model_lr5",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.5996,1.40978,0.348614
1,1.2418,0.873786,0.731343
2,0.6361,0.462312,0.83209
4,0.3928,0.333858,0.870469
5,0.3419,0.302816,0.889126
6,0.332,0.280556,0.897122
8,0.2923,0.281994,0.891791
9,0.3065,0.276082,0.898721


TrainOutput(global_step=80, training_loss=0.6158025428652764, metrics={'train_runtime': 605.3993, 'train_samples_per_second': 72.283, 'train_steps_per_second': 0.132, 'total_flos': 9.949194975195464e+17, 'train_loss': 0.6158025428652764, 'epoch': 9.142857142857142})

#Training with LR = 8e-8

In [None]:
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes = True,
)
model.to(device)

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SwinForImageClassification(
  (swin): SwinModel(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0-1): 2 x SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_features=96, out_features=96, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )
                (output): SwinSelfOutput(
  

In [None]:
import torch

training_args = TrainingArguments(
    output_dir="finetuned_model_lr8",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=8e-8,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.6305,1.641171,0.184435
1,1.631,1.643208,0.189765
2,1.6355,1.63644,0.201493
4,1.6214,1.634316,0.196695


Epoch,Training Loss,Validation Loss,Accuracy
0,1.6305,1.641171,0.184435
1,1.631,1.643208,0.189765
2,1.6355,1.63644,0.201493
4,1.6238,1.637724,0.195096
5,1.6167,1.638475,0.191898
6,1.6187,1.631609,0.201493
8,1.6208,1.633693,0.19403
9,1.6218,1.632336,0.194563


TrainOutput(global_step=80, training_loss=1.6243785738945007, metrics={'train_runtime': 606.0789, 'train_samples_per_second': 72.202, 'train_steps_per_second': 0.132, 'total_flos': 9.949194975195464e+17, 'train_loss': 1.6243785738945007, 'epoch': 9.142857142857142})

In [2]:
!apt-get install texlive texlive-xetex texlive-latex-extra pandoc
!pip install pypandoc

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pandoc is already the newest version (2.9.2.1-3ubuntu2).
pandoc set to manually installed.
The following additional packages will be installed:
  dvisvgm fonts-droid-fallback fonts-lato fonts-lmodern fonts-noto-mono fonts-texgyre
  fonts-urw-base35 libapache-pom-java libcommons-logging-java libcommons-parent-java
  libfontbox-java libfontenc1 libgs9 libgs9-common libidn12 libijs-0.35 libjbig2dec0 libkpathsea6
  libpdfbox-java libptexenc1 libruby3.0 libsynctex2 libteckit0 libtexlua53 libtexluajit2 libwoff1
  libzzip-0-13 lmodern poppler-data preview-latex-style rake ruby ruby-net-telnet ruby-rubygems
  ruby-webrick ruby-xmlrpc ruby3.0 rubygems-integration t1utils teckit tex-common tex-gyre
  texlive-base texlive-binaries texlive-fonts-recommended texlive-latex-base
  texlive-latex-recommended texlive-pictures texlive-plain-generic tipa xfonts-encodings
  xfonts-utils
Suggested packages:
  fo

In [3]:
!jupyter nbconvert --to PDF "/content/drive/MyDrive/cs231n/project/CS231N_Swin_Final.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/cs231n/project/CS231N_Swin_Final.ipynb to PDF
[NbConvertApp] Writing 105446 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 84026 bytes to /content/drive/MyDrive/cs231n/project/CS231N_Swin_Final.pdf
