In [1]:
import numpy as np
import geopy as gp
import pandas as pd
from geopy.geocoders import Nominatim
from pathlib import Path
from tqdm import tqdm
from google.colab import drive
import os
import shutil

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import drive
import os
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
DRIVE_FOLDER = '/content/drive/MyDrive/Colab/where-am-i/data'  # Folder where chunks are stored
EXTRACTION_FOLDER = '/content/train'  # Folder to extract images

os.makedirs(EXTRACTION_FOLDER, exist_ok=True)

# List chunks
chunks = [f for f in os.listdir(DRIVE_FOLDER) if f == 'train_15.zip']

# Extract all chunks
for chunk in chunks:
    chunk_path = os.path.join(DRIVE_FOLDER, chunk)
    print(f"Extracting {chunk_path}...")
    shutil.unpack_archive(chunk_path, EXTRACTION_FOLDER)

print("All images extracted successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracting /content/drive/MyDrive/Colab/where-am-i/data/train_15.zip...
All images extracted successfully!


In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab/where-am-i/train.csv')

In [4]:
INTERIM_DATA_DIR = '/content/'

ids = set(df_train.loc[:, 'id'].values.tolist())
dic_ids = []
for root, dirs, files in os.walk('/content/train'):
    for file in tqdm(files):
        id = int(file.split('.jpg')[0])
        if id in ids:
            dic_ids.append(id)
#df_train = pd.read_csv(INTERIM_DATA_DIR / 'train/train.csv')
trainset = df_train.set_index(keys='id').loc[dic_ids,].reset_index().iloc[:int(len(df_train) * 0.9)]
valset = df_train.set_index(keys='id').loc[dic_ids,].reset_index().iloc[int(len(df_train) * 0.9):]

100%|██████████| 32985/32985 [00:00<00:00, 353595.32it/s]


In [5]:
import os
import pandas as pd
from torchvision.io import decode_image, read_file
from torch.utils.data import Dataset
import torch
from pathlib import Path

class OSVImageDataset(Dataset):
    def __init__(self, annotations_df, img_dir, transform=None, target_transform=None):
        self.device = torch.device("cuda")
        self.img_labels = annotations_df
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        #todo: idx using image id?
        img_path = os.path.join(self.img_dir, str(self.img_labels.iloc[idx, 0]) + '.jpg')
        image = decode_image(img_path).float() / 255.0
        label = torch.tensor((self.img_labels.iloc[idx, 1], self.img_labels.iloc[idx, 2], self.img_labels.iloc[idx, 3]))
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        image = image.clamp(0, 1)
        return image, label

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as func
from torchvision import datasets, transforms
#from src.base.OSVImageDataset import OSVImageDataset
from torch.utils.data import DataLoader
from transformers import ViTImageProcessor
from torchvision.transforms import v2

BATCH_SIZE = 64
KERNEL_SIZE = 16 #16x16 patch
CHANNELS = 3 #rgb
RESIZE = 224
EMBED_DIM = CHANNELS * KERNEL_SIZE ** 2
NUM_PATCHES = ((RESIZE + 0 - KERNEL_SIZE)//KERNEL_SIZE + 1) ** 2
COARSE = int(df_train['coarse_i'].values.max()) + 1
MEDIUM = int(df_train['medium_i'].values.max()) + 1
FINE = int(df_train['fine_i'].values.max()) + 1
MODEL_NAME = 'google/vit-base-patch16-224-in21k'


#Using values the ViT was trained on
processor = ViTImageProcessor.from_pretrained(MODEL_NAME, do_rescale = False, return_tensors = 'pt')

image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]

normalize = v2.Normalize(mean=image_mean, std=image_std)

train_transform = v2.Compose([
      v2.Resize((processor.size["height"], processor.size["width"])),
      v2.RandomHorizontalFlip(0.4),
      v2.RandomVerticalFlip(0.1),
      v2.RandomApply(transforms=[v2.RandomRotation(degrees=(0, 90))], p=0.5),
      v2.RandomApply(transforms=[v2.ColorJitter(brightness=.3, hue=.1)], p=0.3),
      v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5, 9))], p=0.3),
      normalize
 ])

test_transform = v2.Compose([
    v2.Resize((processor.size["height"], processor.size["width"])),
    normalize
])

torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else"cpu")
print(f"Using device: {device}")

train_dataset = OSVImageDataset(annotations_df = trainset, img_dir=INTERIM_DATA_DIR + 'train', transform=train_transform)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = OSVImageDataset(annotations_df = valset, img_dir = INTERIM_DATA_DIR + 'train', transform=test_transform)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


Using device: cuda


In [10]:
from transformers import ViTImageProcessor, ViTModel
from PIL import Image

class GeoLocator(nn.Module):
    def __init__(self,):
        super(GeoLocator, self).__init__()

        self.backbone = ViTModel.from_pretrained(MODEL_NAME,)

        self.coarse_layer = nn.Linear(self.backbone.config.hidden_size, COARSE)
        self.medium_layer = nn.Linear(self.backbone.config.hidden_size, MEDIUM)
        self.fine_layer = nn.Linear(self.backbone.config.hidden_size, FINE)

    def forward(self, x):
        outputs = self.backbone(x).last_hidden_state
        outputs = outputs[:, 0, :] #for classification only need CLS token

        coarse_output = self.coarse_layer(outputs)
        medium_output = self.medium_layer(outputs)
        fine_output = self.fine_layer(outputs)

        return coarse_output, medium_output, fine_output

In [11]:
import gc
from torch.amp import autocast, GradScaler
import time

model = GeoLocator().to(device=device)
#freezing backbone
for param in model.backbone.parameters():
    param.requires_grad = False

#optimizer for custom layers only
optimizer = torch.optim.AdamW([
    {'params': model.coarse_layer.parameters()},
    {'params': model.medium_layer.parameters()},
    {'params': model.fine_layer.parameters()}
], lr = 0.0004)
#optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

scaler = GradScaler()
num_epochs = 10
temp = 0
start = time.perf_counter()
for epoch in tqdm(range(num_epochs)):
    model.train()
    for images, labels in train_dataloader:
        images = images.to(device=device)
        labels = labels.to(device=device)

        with autocast(device_type=device.__str__()):
            coarse_output, medium_output, fine_output = model(images)

            coarse_loss = criterion(coarse_output, labels[:, 0])
            medium_loss = criterion(medium_output, labels[:, 1])
            fine_loss = criterion(fine_output, labels[:, 2])

            loss = 0.6 * coarse_loss + 0.8 * medium_loss + fine_loss

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    if (epoch + 1) % 1 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

        #clearing memory so that my gpu doesn't die :)
        del images, labels, coarse_output, medium_output, fine_output
        gc.collect
        torch.cuda.empty_cache()
        temp += 1
        if temp == 1:
          break

end = time.perf_counter()
print(f'perf:{end - start}')

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.27 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.15 GiB is free. Process 2996 has 13.59 GiB memory in use. Of the allocated memory 13.25 GiB is allocated by PyTorch, and 217.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
#from src.config import MODELS_DIR

nn_path = "geonn_v0.pt"
torch.save(model.state_dict(), str(nn_path))

In [None]:
model.eval()
fine_true = np.array([])
fine_pred = np.empty(shape=(0, FINE))
coarse_true = np.array([])
coarse_pred = np.empty(shape=(0, COARSE))
with torch.no_grad():
    for images, labels in tqdm(val_dataloader):
        images = images.to(device=device)
        labels = labels.to(device=device)
        coarse_output, _, fine_output = model(images)

        coarse_true = np.concatenate((coarse_true, labels[:,0].cpu()), axis=0)
        coarse_pred = np.concatenate((coarse_pred, coarse_output.cpu()), axis=0)
        fine_true = np.concatenate((fine_true, labels[:, 2].cpu()), axis=0)
        fine_pred = np.concatenate((fine_pred, fine_output.cpu()), axis=0)

        del images, labels, coarse_output, fine_output
        gc.collect
        torch.cuda.empty_cache()

  0%|          | 0/39 [00:00<?, ?it/s]

100%|██████████| 39/39 [00:14<00:00,  2.75it/s]


In [None]:
from sklearn.metrics import top_k_accuracy_score

print(f'Top K Accuracy Fine: {top_k_accuracy_score(fine_true, fine_pred, k=5, labels=[i for i in range(FINE)]) * 100}')
print(f'Top K Accuracy Output: {top_k_accuracy_score(coarse_true, coarse_pred, k=5, labels=[i for i in range(COARSE)]) * 100}')

Top K Accuracy Fine: 0.0
Top K Accuracy Output: 7.377706495589415


In [None]:
(1.2 * 1e5 * 13) / (1.2 * 1e3)

1300.0

In [None]:
((500000/128) * 4) / 3600

4.340277777777778