In [9]:
import numpy as np
import geopy as gp
import pandas as pd
from geopy.geocoders import Nominatim
from pathlib import Path
from tqdm import tqdm
from src.config import PROCESSED_DATA_DIR, RAW_DATA_DIR, INTERIM_DATA_DIR

In [11]:
# One-off export, left disabled for reference:
# df_train.drop(columns=['coarse', 'medium', 'fine']).to_csv(INTERIM_DATA_DIR / 'train/train.csv', index=False)

# Read the training annotations table from the interim data directory.
train_annotations_path = INTERIM_DATA_DIR / 'pos_train.csv'
df_train = pd.read_csv(train_annotations_path)

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as func
from torchvision import datasets, transforms
from src.base.OSVImageDataset import OSVImageDataset
from torch.utils.data import DataLoader
from transformers import ViTImageProcessor
from torchvision.transforms import v2

# --- Hyperparameters and derived sizes ---------------------------------------
BATCH_SIZE = 64
KERNEL_SIZE = 16  # side length of one square patch (16x16)
CHANNELS = 3      # rgb
RESIZE = 224
EMBED_DIM = CHANNELS * KERNEL_SIZE * KERNEL_SIZE
# Number of KERNEL_SIZE-wide steps that fit along one side, squared.
NUM_PATCHES = ((RESIZE - KERNEL_SIZE) // KERNEL_SIZE + 1) ** 2

# Class count per label column: largest index present in df_train, plus one.
COARSE, MEDIUM, FINE = (
    int(df_train[column].values.max()) + 1
    for column in ('coarse_i', 'medium_i', 'fine_i')
)
MODEL_NAME = 'google/vit-base-patch16-224-in21k'

# --- Preprocessing ------------------------------------------------------------
# Using values the ViT was trained on
processor = ViTImageProcessor.from_pretrained(
    MODEL_NAME,
    do_rescale=False,
    return_tensors='pt',
)

image_mean = processor.image_mean
image_std = processor.image_std
size = processor.size["height"]
# (height, width) both pipelines resize to, taken from the processor config.
target_hw = (processor.size["height"], processor.size["width"])

normalize = v2.Normalize(mean=image_mean, std=image_std)

# Augmentations that were tried and are currently disabled for training:
#   v2.RandomHorizontalFlip(0.4)
#   v2.RandomVerticalFlip(0.1)
#   v2.RandomApply(transforms=[v2.RandomRotation(degrees=(0, 90))], p=0.5)
#   v2.RandomApply(transforms=[v2.ColorJitter(brightness=.3, hue=.1)], p=0.3)
#   v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5, 9))], p=0.3)
train_transform = v2.Compose([v2.Resize(target_hw), normalize])

test_transform = v2.Compose([v2.Resize(target_hw), normalize])

# --- Device -------------------------------------------------------------------
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- Data ---------------------------------------------------------------------
train_dataset = OSVImageDataset(
    annotations_df=df_train,
    img_dir=INTERIM_DATA_DIR / 'train',
    transform=train_transform,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)



Using device: cuda


In [7]:
from safetensors.torch import save_file
import gc
import time

# Dump collated (images, labels) batches from the training dataloader to disk,
# one .safetensors file per batch, named batch_<index>.safetensors.
tensor_dir = PROCESSED_DATA_DIR / 'tensors'
# Create the target folder up front so the first write cannot fail on a
# missing directory.
tensor_dir.mkdir(parents=True, exist_ok=True)

# How many batches to write before stopping. Kept at 1 (a single sample batch,
# as in the original experiment); set to None to write the whole dataloader.
MAX_BATCHES = 1

for batch_num, (images, labels) in enumerate(tqdm(train_dataloader)):
    save_file(
        tensors={'images': images, 'labels': labels},
        filename=tensor_dir / f'batch_{batch_num}.safetensors',
    )
    # Check the limit after saving so no extra batch is loaded just to be
    # thrown away.
    if MAX_BATCHES is not None and batch_num + 1 >= MAX_BATCHES:
        break


  0%|          | 0/18846 [00:00<?, ?it/s]


In [29]:
from safetensors import safe_open

# Time reading two saved batches back from disk and joining them along dim 0.
start = time.perf_counter()
images_buffer, labels_buffer = [], []
for i in range(2):
    # Tensors are materialised directly on the notebook's active device.
    with safe_open(f"test_{i}.safetensors", framework='pt', device=str(device)) as t:
        images, labels = t.get_tensor('images'), t.get_tensor('labels')
        images_buffer.append(images)
        labels_buffer.append(labels)

batch_imgs = torch.cat(images_buffer, dim=0)
batch_labels = torch.cat(labels_buffer, dim=0)
end = time.perf_counter()

# Report the combined shapes and the elapsed wall-clock time (load + concat).
print(batch_imgs.shape)
print(batch_labels.shape)
print(f'concat perf:{end - start}')
    

torch.Size([128, 3, 224, 224])
torch.Size([128, 3])
concat perf:0.07821529998909682


In [36]:
import os
import pandas as pd
from torchvision.io import decode_image, read_file
from safetensors import safe_open
from torch.utils.data import Dataset
import torch
from pathlib import Path

class OSVProcessedImages(Dataset):
    """Dataset whose items are whole pre-batched ``.safetensors`` files.

    Every ``.safetensors`` file in ``batch_dir`` is one item. Item ``idx`` is
    the ``(images, labels)`` pair read from the ``idx``-th file, with file
    names sorted lexicographically (so ``batch_10`` sorts before ``batch_2``).

    Args:
        batch_dir: Directory containing the saved batch files.
        device: Device the tensors are loaded onto (a string or a
            ``torch.device``). When ``None`` (default), CUDA is used if
            available, otherwise the CPU.
    """

    def __init__(self, batch_dir, device=None):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # Stored as a plain string, which is the form passed to safe_open.
        self.device = str(device)
        self.batch_dir = batch_dir
        # Only pick up saved batches; any other file in the directory would
        # otherwise become an item that fails to open.
        self.batch_files = sorted(
            name for name in os.listdir(batch_dir) if name.endswith('.safetensors')
        )

    def __len__(self):
        """Return the number of batch files found in ``batch_dir``."""
        return len(self.batch_files)

    def __getitem__(self, idx):
        """Load and return the ``(images, labels)`` tensors of batch file ``idx``."""
        batch_path = os.path.join(self.batch_dir, self.batch_files[idx])
        with safe_open(batch_path, framework='pt', device=self.device) as t:
            images = t.get_tensor('images')
            labels = t.get_tensor('labels')

        return images, labels

In [42]:
# Smoke-test the pre-batched dataset. Each dataset item is already a full saved
# batch, so batch_size=2 makes the DataLoader collate two of them along a new
# leading axis; the reshapes below fold that axis back into the batch axis.
batch_d = OSVProcessedImages(batch_dir=INTERIM_DATA_DIR / 'temp')
batch_loader = DataLoader(batch_d, batch_size=2, shuffle=False)
start = time.perf_counter()
for images, labels in batch_loader:
    print(images.is_contiguous())
    print(np.shape(images.reshape(-1, CHANNELS, RESIZE, RESIZE)))
    # Fold on the labels' own trailing width rather than CHANNELS: CHANNELS is
    # an image constant that only coincidentally equals the label width.
    print(np.shape(labels.reshape(-1, labels.shape[-1])))
end = time.perf_counter()
print(f'concat perf:{end - start}')


True
torch.Size([128, 3, 224, 224])
torch.Size([128, 3])
concat perf:0.0538624000037089


## Creating zip files for Drive

In [10]:
import os
import pandas as pd
import shutil
import math
from zipfile import ZipFile

# Define paths
IMAGE_FOLDER = INTERIM_DATA_DIR / 'train'  # Folder containing the images
CSV_FILE = INTERIM_DATA_DIR / 'train.csv'  # CSV file with the image IDs
OUTPUT_DIR = PROCESSED_DATA_DIR / 'train'  # Folder where the zip chunks will be saved
CHUNK_SIZE_GB = 4  # Target size of each chunk in GB
IMAGE_EXTENSION = '.jpg'  # Image file format

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Read image IDs from the CSV file
df = pd.read_csv(CSV_FILE)
image_ids = df['id'].astype(str).tolist()  # Ensure IDs are strings


def get_image_size(image_path):
    """Return the size of the file at ``image_path`` in GB (bytes / 1024**3).

    Raises:
        OSError: If the file cannot be stat-ed (e.g. it does not exist).
    """
    return os.path.getsize(image_path) / (1024 ** 3)


def _write_chunk(chunk_index, image_paths):
    """Write ``image_paths`` into ``OUTPUT_DIR/train_<chunk_index>.zip``.

    Files are added flat (basename only). ZipFile is used with its default
    ZIP_STORED method, so the archive bundles the files without recompressing.

    Returns:
        The path of the archive that was written.
    """
    chunk_name = os.path.join(OUTPUT_DIR, f"train_{chunk_index}.zip")
    with ZipFile(chunk_name, 'w') as zipf:
        for image in tqdm(image_paths):
            zipf.write(image, os.path.basename(image))  # Add image to the zip
    return chunk_name


# Greedily pack images, in CSV order, into chunks of at most CHUNK_SIZE_GB.
current_chunk = 1
current_chunk_size = 0
current_images = []

for img_id in tqdm(image_ids):
    img_path = os.path.join(IMAGE_FOLDER, f"{img_id}{IMAGE_EXTENSION}")
    # A single stat both checks for existence and yields the size.
    try:
        img_size = get_image_size(img_path)
    except OSError:
        print(f"Warning: {img_path} not found.")
        continue

    # Flush the current chunk when this image would push it over the limit.
    # The `current_images` guard avoids writing an empty archive when one
    # image alone is larger than CHUNK_SIZE_GB.
    if current_images and current_chunk_size + img_size > CHUNK_SIZE_GB:
        print(f"Compressing chunk {current_chunk} with {len(current_images)} images...")
        _write_chunk(current_chunk, current_images)
        # Reset for the next chunk
        current_chunk += 1
        current_chunk_size = 0
        current_images = []

    # Add image to the current chunk
    current_images.append(img_path)
    current_chunk_size += img_size

# Write whatever is left over as the last chunk
if current_images:
    print(f"Compressing final chunk {current_chunk} with {len(current_images)} images...")
    _write_chunk(current_chunk, current_images)

print("All chunks created successfully!")


  7%|▋         | 83742/1206098 [00:06<01:33, 11972.18it/s]

Compressing chunk 1 with 83866 images...


100%|██████████| 83866/83866 [06:38<00:00, 210.49it/s]t/s]
 14%|█▍        | 166408/1206098 [06:51<01:26, 11972.25it/s]

Compressing chunk 2 with 82844 images...


100%|██████████| 82844/82844 [07:35<00:00, 181.79it/s]it/s]
 21%|██        | 250786/1206098 [14:45<03:08, 5064.97it/s] 

Compressing chunk 3 with 84223 images...


100%|██████████| 84223/84223 [07:27<00:00, 188.04it/s]t/s]
 28%|██▊       | 335454/1206098 [22:35<03:32, 4094.31it/s] 

Compressing chunk 4 with 84596 images...


100%|██████████| 84596/84596 [07:43<00:00, 182.36it/s]t/s]
 35%|███▍      | 418702/1206098 [30:34<02:17, 5743.70it/s] 

Compressing chunk 5 with 83200 images...


100%|██████████| 83200/83200 [07:36<00:00, 182.42it/s]t/s]
 42%|████▏     | 502449/1206098 [38:19<01:09, 10097.00it/s]

Compressing chunk 6 with 83865 images...


100%|██████████| 83865/83865 [07:19<00:00, 190.88it/s]it/s]
 49%|████▊     | 585931/1206098 [45:50<01:19, 7826.97it/s] 

Compressing chunk 7 with 83922 images...


100%|██████████| 83922/83922 [07:49<00:00, 178.90it/s]t/s]
 56%|█████▌    | 669919/1206098 [53:54<01:17, 6903.53it/s] 

Compressing chunk 8 with 84027 images...


100%|██████████| 84027/84027 [07:42<00:00, 181.61it/s]t/s]
 62%|██████▏   | 753174/1206098 [1:01:50<01:12, 6290.37it/s] 

Compressing chunk 9 with 83117 images...


100%|██████████| 83117/83117 [07:30<00:00, 184.54it/s]7it/s]
 69%|██████▉   | 837104/1206098 [1:09:33<00:50, 7302.62it/s] 

Compressing chunk 10 with 83721 images...


100%|██████████| 83721/83721 [08:01<00:00, 173.72it/s]2it/s]
 76%|███████▋  | 921369/1206098 [1:17:47<00:38, 7318.06it/s] 

Compressing chunk 11 with 84320 images...


100%|██████████| 84320/84320 [07:38<00:00, 183.86it/s]6it/s]
 83%|████████▎ | 1004910/1206098 [1:25:37<00:26, 7673.70it/s]

Compressing chunk 12 with 83473 images...


100%|██████████| 83473/83473 [08:01<00:00, 173.51it/s]70it/s]
 90%|█████████ | 1088022/1206098 [1:33:52<00:20, 5704.58it/s] 

Compressing chunk 13 with 83362 images...


100%|██████████| 83362/83362 [07:52<00:00, 176.34it/s]58it/s]
 97%|█████████▋| 1172470/1206098 [1:42:03<00:06, 5068.76it/s]

Compressing chunk 14 with 84577 images...


100%|██████████| 84577/84577 [07:32<00:00, 186.76it/s]76it/s]
100%|██████████| 1206098/1206098 [1:49:41<00:00, 183.25it/s] 


Compressing final chunk 15 with 32985 images...
All chunks created successfully!


In [12]:
#from google.colab import drive
import os
import shutil

# Mount Google Drive
#drive.mount('/content/drive')

# Define paths
DRIVE_FOLDER = PROCESSED_DATA_DIR / 'train'  # Folder where chunks are stored
EXTRACTION_FOLDER = PROCESSED_DATA_DIR / 't'  # Folder to extract images

# Names of the archives to unpack. Only the last chunk is extracted here;
# set to None to unpack every .zip found in DRIVE_FOLDER.
CHUNKS_TO_EXTRACT = {'train_15.zip'}

os.makedirs(EXTRACTION_FOLDER, exist_ok=True)

# List the selected chunks (sorted so the extraction order is deterministic)
chunks = sorted(
    f for f in os.listdir(DRIVE_FOLDER)
    if f.endswith('.zip') and (CHUNKS_TO_EXTRACT is None or f in CHUNKS_TO_EXTRACT)
)

# Extract the selected chunks
for chunk in chunks:
    chunk_path = os.path.join(DRIVE_FOLDER, chunk)
    print(f"Extracting {chunk_path}...")
    shutil.unpack_archive(chunk_path, EXTRACTION_FOLDER)

# Do not report success when nothing matched the selection.
if chunks:
    print("All images extracted successfully!")
else:
    print(f"Warning: no matching chunks found in {DRIVE_FOLDER}.")


Extracting G:\Work\DS\where-am-i\data\processed\train\train_15.zip...
All images extracted successfully!
