In [1]:
import pandas as pd
import numpy as np
import torch
import os
from PIL import Image
from torchvision import models, transforms
from src import helpers

### In this notebook we will interactively create vector representations of the images in our dog breed dataset

First, let's get a pretrained model

In [2]:
# Initialise ResNet-50 from torchvision.models WITH pretrained weights.
# `get_model` forwards keyword arguments to the model builder, whose `weights`
# parameter defaults to None (random initialisation), so the default pretrained
# weights have to be requested explicitly.
# https://pytorch.org/vision/stable/models.html
model = models.get_model("resnet50", weights="DEFAULT")
# Switch to evaluation mode so layers such as BatchNorm use their stored running
# statistics; eval() returns the module, so the cell also displays the architecture.
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

Now we need to iterate through the dataset and create a vector representation of every image.

In [None]:
# Walk the dataset directory and collect one {"img_path", "vector"} record per file.
data_dir = "data"
dataset_dir = f"{data_dir}/dogs_dataset"
print("so it begins")
vectors = []
for subdir, _dirs, files in os.walk(dataset_dir):
    for file in files:
        print("it does something " + file)
        img_path = os.path.join(subdir, file)
        # The conversion is the step that can actually fail (e.g. a non-image or
        # corrupt file), so it must be inside the try block; otherwise the
        # "Skipping" branch is unreachable and one bad file aborts the whole run.
        # NOTE(review): helpers.image_to_vector lives in src.helpers and is not
        # visible here; presumably it loads the image and runs it through `model`.
        try:
            vector = helpers.image_to_vector(img_path, model)
        except Exception as e:
            print(f"Skipping {img_path}, {e}")
            continue
        vectors.append({"img_path": img_path, "vector": vector})

We stored the vectors, together with their corresponding image paths, in a list. Let's look at its shape and structure:

In [None]:
# Show the number of records and only the first few entries; echoing the whole
# list would dump every vector for every image into the cell output.
print(f"{len(vectors)} records")
vectors[:3]

As the last step, we want to save the list into a file (e.g. parquet)

In [None]:
# Persist the collected records: build a single DataFrame from the list of
# {"img_path", "vector"} dicts and write it to a Parquet file inside data_dir.
# NOTE(review): assumes every "vector" value is something the Parquet engine can
# serialise (e.g. a list or NumPy array) — confirm against helpers.image_to_vector.
vectors_path = f"{data_dir}/vectors.parquet"
df = pd.DataFrame.from_records(vectors)
df.to_parquet(vectors_path)

Check that `vectors.parquet` now exists in the `data/` folder.