In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data_dir = "drive/MyDrive/CS231N/project/train/"

#Dependencies

In [None]:
!pip install transformers datasets evaluate accelerate pillow torchvision scikit-learn

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

%

#Load Dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files=f"{data_dir}train.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['image', 'category'],
    num_rows: 6252
})

In [None]:
dataset = dataset.train_test_split(test_size=0.3)

In [None]:
labels = [1,2,3,4,5]
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label-1] = str(i+1)
    id2label[str(i+1)] = label-1

#Load ViT

In [None]:
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

#Baseline

In [None]:
import torch
from transformers import pipeline
from PIL import Image
pipe = pipeline("image-classification", model="google/vit-base-patch16-224", device="cuda" if torch.cuda.is_available() else "cpu")



config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [None]:
lookup = {
    1: "cargo",
2: "military",
3: "carrier",
4: "cruise",
5: "tankers",
}

In [None]:
correct = 0
total = 0
for dat in dataset["test"]:
  total += 1
  label = dat['category']
  infer = pipe(Image.open(data_dir + '/images/' + dat['image']))
  for elem in infer:
    if lookup[label] in elem["label"].split(", "):
      correct += 1
      break

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
print("CORRECT: ", correct)
print("TOTAL: ", total)
print("Accuracy: ", correct/total)

CORRECT:  56
TOTAL:  1876
Accuracy:  0.029850746268656716


#Apply transform

In [None]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
     image_processor.size["shortest_edge"]
     if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
 )
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [None]:
from PIL import Image
def transforms(examples):
    examples["pixel_values"] = [_transforms(Image.open(f"{data_dir}/images/{img}").convert("RGB")) for img in examples["image"]]
    examples["label"] = [label-1 for label in examples["category"]]
    del examples["image"]
    del examples["category"]
    return examples

In [None]:
dataset = dataset.with_transform(transforms)

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

#Evaluation metrics

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [None]:
dataset["train"].features['category']

Value(dtype='int64', id=None)

#*Finetune*

In [None]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
# print(len(labels))
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
if torch.cuda.is_available():
  device = "cuda"
  print("CUDA is available!")
else:
  device = "cpu"
  print("CUDA is not available.")

CUDA is available!


In [None]:
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

##Training with LR=3e-3

In [None]:
import torch

training_args = TrainingArguments(
    output_dir="finetuned_model_lr3",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
0,1.6018,1.476286,0.50693
1,1.3315,1.147291,0.637527
2,1.0167,0.90986,0.845949
4,0.694,0.602406,0.918443
5,0.5729,0.539231,0.908849
6,0.5023,0.495292,0.914179
8,0.4503,0.456662,0.922175
9,0.4443,0.461536,0.91791


TrainOutput(global_step=80, training_loss=0.8140636295080185, metrics={'train_runtime': 4254.1768, 'train_samples_per_second': 10.286, 'train_steps_per_second': 0.019, 'total_flos': 3.101622786684076e+18, 'train_loss': 0.8140636295080185, 'epoch': 9.142857142857142})

##Training with LR=5e-5

In [None]:
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [None]:
import torch

training_args = TrainingArguments(
    output_dir="finetuned_model_lr5",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.5667,1.449045,0.586887
1,1.3054,1.138266,0.738273
2,0.993,0.883596,0.8742
4,0.6636,0.583665,0.91791
5,0.5507,0.527443,0.906716
6,0.4842,0.484983,0.909915
8,0.4327,0.445842,0.916311
9,0.4296,0.44679,0.925373


TrainOutput(global_step=80, training_loss=0.7910986602306366, metrics={'train_runtime': 899.562, 'train_samples_per_second': 48.646, 'train_steps_per_second': 0.089, 'total_flos': 3.101622786684076e+18, 'train_loss': 0.7910986602306366, 'epoch': 9.142857142857142})

#LR 8e-8

In [None]:
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

In [None]:
import torch

training_args = TrainingArguments(
    output_dir="finetuned_model_lr8",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=8e-8,
    per_device_train_batch_size=128,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,1.5771,1.581152,0.30597
1,1.5778,1.581806,0.299041
2,1.5784,1.581195,0.306503
4,1.5767,1.578113,0.318763
5,1.5758,1.579128,0.323028
6,1.5742,1.579234,0.326226
8,1.5755,1.580101,0.313966
9,1.5761,1.580654,0.313433


TrainOutput(global_step=80, training_loss=1.5770449936389923, metrics={'train_runtime': 900.5733, 'train_samples_per_second': 48.591, 'train_steps_per_second': 0.089, 'total_flos': 3.101622786684076e+18, 'train_loss': 1.5770449936389923, 'epoch': 9.142857142857142})

In [4]:
!apt-get install texlive texlive-xetex texlive-latex-extra pandoc
!pip install pypandoc

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pandoc is already the newest version (2.9.2.1-3ubuntu2).
pandoc set to manually installed.
The following additional packages will be installed:
  dvisvgm fonts-droid-fallback fonts-lato fonts-lmodern fonts-noto-mono fonts-texgyre
  fonts-urw-base35 libapache-pom-java libcommons-logging-java libcommons-parent-java
  libfontbox-java libfontenc1 libgs9 libgs9-common libidn12 libijs-0.35 libjbig2dec0 libkpathsea6
  libpdfbox-java libptexenc1 libruby3.0 libsynctex2 libteckit0 libtexlua53 libtexluajit2 libwoff1
  libzzip-0-13 lmodern poppler-data preview-latex-style rake ruby ruby-net-telnet ruby-rubygems
  ruby-webrick ruby-xmlrpc ruby3.0 rubygems-integration t1utils teckit tex-common tex-gyre
  texlive-base texlive-binaries texlive-fonts-recommended texlive-latex-base
  texlive-latex-recommended texlive-pictures texlive-plain-generic tipa xfonts-encodings
  xfonts-utils
Suggested packages:
  fo

In [16]:
import os
os.listdir('/content/drive/MyDrive/cs231n/project/')


['CS231N_ViT_Final.ipynb',
 'CS231N_Swin_Final.ipynb',
 'Copy_CS231N_ViT.ipynb',
 'Copy_CS231N_ViT.pdf',
 'Copy_CS231N_Swin.pdf',
 'Copy_CS231N_Swin.ipynb',
 'train']

In [17]:
!jupyter nbconvert --to PDF "/content/drive/MyDrive/cs231n/project/CS231N_ViT_Final.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/cs231n/project/CS231N_ViT_Final.ipynb to PDF
[NbConvertApp] Writing 92769 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 78051 bytes to /content/drive/MyDrive/cs231n/project/CS231N_ViT_Final.pdf
