In [None]:
! pip install dreamsim

In [None]:
! mkdir models/
! wget -O models/open_clip_vitb32_pretrain.pth.tar https://github.com/ssundaram21/dreamsim/releases/download/v0.1.0/open_clip_vitb32_pretrain.pth.tar

In [None]:
from dreamsim import dreamsim
from dreamsim import PerceptualModel

import os, sys
import torch
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd
import pickle
import zipfile
import math

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# dreamsim() hands back the model together with its image preprocessing callable.
model, preprocess = dreamsim(pretrained=True, device=device)

#### Data Extraction

In [None]:
# Folders holding the NOAA and GSV building images.
noaa_path, gsv_path = (
    './data/NOAA_FINAL_BUILDING',
    './data/GSV_FINAL_BUILDING',
)

In [None]:
# Directories scanned and cleaned by the data-extraction cells below.
noaa_data_dir, gsv_data_dir = (
    './data/NOAA_FINAL_BUILDING',
    './data/GSV_FINAL_BUILDING',
)

In [None]:
# Count all files in the extraction directories
def count_files(directory):
    """Return the number of files found anywhere under ``directory``.

    Sums the file list of every folder that ``os.walk`` visits. Reading
    ``files`` after an empty-bodied walk loop only reports the last folder
    visited, and leaves ``files`` undefined (or stale from an earlier loop)
    when the directory is missing. A missing directory counts as 0 here.
    """
    return sum(len(files) for _, _, files in os.walk(directory))

print(f'Total files: {count_files(noaa_data_dir)}') # 6555

print(f'Total files: {count_files(gsv_data_dir)}') # 81

In [None]:
valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')

# Delete every file under a directory tree that is not an image
def remove_non_image_files(directory):
    """Delete all files below ``directory`` whose name lacks an image suffix.

    A file is kept when its lower-cased name ends with one of
    ``valid_extensions``; anything else is removed from disk. The whole tree
    is walked, and a one-line summary with the number of deletions is printed.
    Nothing is returned.
    """
    deleted = 0
    for folder, _subdirs, names in os.walk(directory):
        # Case-insensitive suffix test, so 'photo.JPG' counts as an image.
        non_images = [name for name in names if not name.lower().endswith(valid_extensions)]
        for name in non_images:
            os.remove(os.path.join(folder, name))
            deleted += 1
    print(f'{deleted} non-image files removed from {directory}')

# Remove non-image files from both directories
remove_non_image_files(noaa_data_dir)
remove_non_image_files(gsv_data_dir)

# Count remaining files. The total is summed over every folder os.walk visits;
# reading `files` after an empty-bodied walk loop would only report the last
# folder visited (and would be undefined or stale if the directory is missing).
for image_dir in (noaa_data_dir, gsv_data_dir):
    remaining = sum(len(files) for _, _, files in os.walk(image_dir))
    print(f'Total image files in {image_dir}: {remaining}')

#### Image Retrieval

In [None]:
# Load models
# Every model is placed on `device` (chosen at the top of the notebook) rather
# than a hard-coded "cuda": get_embeddings moves images to `device`, so the
# models must live on the same device, and CPU-only machines have no "cuda".
dreamsim_model = model
dino_model = PerceptualModel(feat_type='cls', model_type='dino_vitb16', stride='16', baseline=True, device=device)
open_clip_model = PerceptualModel(feat_type='embedding', model_type='open_clip_vitb32', stride='32', baseline=True, device=device)

In [None]:
# Define datasets
noaa_dir, gsv_dir = './data/NOAA_FINAL_BUILDING/', './data/GSV_FINAL_BUILDING'

# These are variable *names*, not paths: the embedding loop looks each name up
# to find its directory and uses the part before "_" as the dataset label.
datasets = ["noaa_dir", "gsv_dir"]

# Define embedding function
def get_embeddings(model, model_name, data_dir, data_name):
    """Embed every image in ``data_dir`` with ``model`` and pickle the result.

    Relies on the notebook-level ``preprocess`` callable and ``device`` string.

    Args:
        model: Object exposing ``embed(tensor)``.
        model_name: Label used in the output file name.
        data_dir: Directory whose entries are opened as images.
        data_name: Dataset label used as the output sub-folder.

    Returns:
        dict mapping image file name to its embedding tensor, detached and
        moved to the CPU. The same dict is written to
        ``./embeddings/{data_name}/{model_name}_embeds.pkl``.

    Entries that fail to load or embed are reported with ``print`` and
    skipped, so one bad file does not abort the whole run.
    """
    embeddings_dict = {}  # Dictionary to store {image_name: embedding}

    # Sorted so the key order of the pickle is reproducible; os.listdir
    # returns entries in arbitrary order.
    for image_name in tqdm(sorted(os.listdir(data_dir))):
        try:
            img_path = os.path.join(data_dir, image_name)
            # The context manager closes the file handle; without it one
            # descriptor per image stays open until garbage collection.
            # Preprocessing happens inside the block, while the file is open.
            with Image.open(img_path) as pil_img:
                img = preprocess(pil_img).to(device)
            # Inference only: do not record an autograd graph per image.
            with torch.no_grad():
                embeddings_dict[image_name] = model.embed(img).detach().cpu()
        except Exception as e:
            print(f"Error processing image {image_name}: {e}")

    os.makedirs(f"./embeddings/{data_name}/", exist_ok=True)

    # Save embeddings dictionary as pickle
    with open(f"./embeddings/{data_name}/{model_name}_embeds.pkl", "wb") as f:
        pickle.dump(embeddings_dict, f)

    return embeddings_dict

# Embed images
# Explicit name -> directory table instead of globals().get(): a mistyped
# dataset name now raises KeyError immediately, whereas globals().get()
# returns None and the failure does not surface as a clear error.
dataset_dirs = {"noaa_dir": noaa_dir, "gsv_dir": gsv_dir}

# (label used in the output file name, model object)
embedding_models = (
    ("dreamsim", dreamsim_model),
    ("dino", dino_model),
    ("openclip", open_clip_model),
)

for dataset in datasets:
    print(f"Processing dataset: {dataset}")
    data_dir = dataset_dirs[dataset]
    data_name = dataset.split("_")[0]  # "noaa_dir" -> "noaa"

    # Call similarity models
    for model_name, embedding_model in embedding_models:
        get_embeddings(embedding_model, model_name, data_dir, data_name)

#### Load Embeddings

In [None]:
def _load_pickle(path):
    """Read and return the single pickled object stored at ``path``."""
    with open(path, "rb") as f:
        return pickle.load(f)

# NOAA Embeddings
noaa_dreamsim_emb = _load_pickle("./embeddings/noaa/dreamsim_embeds.pkl")
noaa_dino_emb = _load_pickle("./embeddings/noaa/dino_embeds.pkl")
noaa_openclip_emb = _load_pickle("./embeddings/noaa/openclip_embeds.pkl")

# GSV Embeddings
gsv_dreamsim_emb = _load_pickle("./embeddings/gsv/dreamsim_embeds.pkl")
gsv_dino_emb = _load_pickle("./embeddings/gsv/dino_embeds.pkl")
gsv_openclip_emb = _load_pickle("./embeddings/gsv/openclip_embeds.pkl")

In [None]:
# from google.colab import files

# # List of file paths to download
# pickle_files = [
#     "/content/noaa/embeddings/dreamsim_embeds.pkl",
#     "/content/noaa/embeddings/dino_embeds.pkl",
#     "/content/noaa/embeddings/openclip_embeds.pkl",
#     "/content/gsv/embeddings/dreamsim_embeds.pkl",
#     "/content/gsv/embeddings/dino_embeds.pkl",
#     "/content/gsv/embeddings/openclip_embeds.pkl"
# ]

# # Download each file
# for file_path in pickle_files:
#     files.download(file_path)

In [None]:
# from google.colab import files

# # Upload files
# uploaded = files.upload()

# # Save to desired directory
# for name in uploaded.keys():
#     with open(f"/content/embeddings/{name}", "wb") as f:
#         f.write(uploaded[name])

In [None]:
# Sanity check: show the first (file name, embedding shape) pair, if any.
first_entry = next(iter(noaa_dreamsim_emb.items()), None)
if first_entry is not None:
    img, emb = first_entry
    print(img, emb.shape)

In [None]:
# Sanity check: show the first (file name, embedding shape) pair, if any.
first_entry = next(iter(gsv_dreamsim_emb.items()), None)
if first_entry is not None:
    img, emb = first_entry
    print(img, emb.shape)

In [None]:
# Sanity check: show the first (file name, embedding shape) pair, if any.
first_entry = next(iter(gsv_openclip_emb.items()), None)
if first_entry is not None:
    img, emb = first_entry
    print(img, emb.shape)

In [None]:
# Number of entries in each image directory (NOAA first, then GSV).
for image_dir in (noaa_dir, gsv_dir):
    print(len(os.listdir(image_dir)))

In [None]:
# Define Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in kilometres on a sphere of radius 6371 km.
    """
    R = 6371  # Radius of Earth in kilometers
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Compute differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # `a` is mathematically within [0, 1], but rounding can push it a hair
    # past 1 for (near-)antipodal points, which would make sqrt(1 - a) raise
    # ValueError. Clamp before taking square roots.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c  # Distance in kilometers

    return distance

# Extract (latitude, longitude) from the start of an image file name
def parse_coordinates(image_name):
    """Return ``(lat, lon)`` as floats from a ``"<lat>_<lon>..."`` name.

    The name is split on underscores and the first two fields are converted
    with ``float``; any further fields are ignored. A name with fewer than
    two fields, or with a non-numeric field, raises ``ValueError``.
    """
    lat_text, lon_text, *_rest = image_name.split("_")
    return float(lat_text), float(lon_text)

# Find the most similar GSV images near each NOAA image (Haversine filter + similarity ranking)
def find_top_neighbors(noaa_embeddings, gsv_embeddings, metric="cosine",
                       max_distance_km=0.05, top_k=5):
    """Rank nearby GSV images by embedding similarity for every NOAA image.

    Args:
        noaa_embeddings: dict of image name -> embedding tensor (queries).
        gsv_embeddings: dict of image name -> embedding tensor (candidates).
            Names in both dicts must start with ``"<lat>_<lon>"`` so that
            ``parse_coordinates`` can read them.
        metric: Similarity measure; only ``"cosine"`` is supported.
        max_distance_km: Candidates farther than this Haversine distance are
            discarded. The default 0.05 km is the original 50 m radius.
        top_k: Number of best matches kept per query (default 5).

    Returns:
        dict keyed by ``(noaa_name, number_of_candidates_within_radius)``
        whose values are lists of ``(gsv_name, similarity)`` sorted by
        descending similarity, at most ``top_k`` long.

    Raises:
        ValueError: if ``metric`` is not supported.

    Note:
        ``F.cosine_similarity(..., dim=1).item()`` requires the result to be
        a single value, i.e. each embedding is 2-D with one row.
    """
    # Validate up front. Checking only inside the distance filter lets an
    # unsupported metric pass silently whenever no candidate is in range.
    if metric != "cosine":
        raise ValueError(f"Unsupported metric: {metric}")

    # GSV coordinates do not depend on the query image: parse them once
    # instead of once per (NOAA, GSV) pair.
    gsv_items = [
        (gsv_name, parse_coordinates(gsv_name), gsv_embed)
        for gsv_name, gsv_embed in gsv_embeddings.items()
    ]

    # Save top similar images
    results = {}

    # Iterate through NOAA images
    for noaa_name, noaa_embed in tqdm(noaa_embeddings.items(), desc="Processing NOAA images"):
        # Parse NOAA image coordinates
        noaa_lat, noaa_lon = parse_coordinates(noaa_name)

        # (gsv_name, similarity) for every GSV image within the radius
        filtered_candidates = []

        for gsv_name, (gsv_lat, gsv_lon), gsv_embed in gsv_items:
            # Skip anything outside the search radius before touching tensors
            if haversine(noaa_lat, noaa_lon, gsv_lat, gsv_lon) > max_distance_km:
                continue

            similarity = F.cosine_similarity(noaa_embed, gsv_embed, dim=1).item()
            filtered_candidates.append((gsv_name, similarity))

        # Sort candidates by similarity and keep the best top_k
        top_neighbors = sorted(filtered_candidates, key=lambda x: x[1], reverse=True)[:top_k]

        # Save results with key as (noaa_name, num_filtered)
        results[(noaa_name, len(filtered_candidates))] = top_neighbors

    return results

In [None]:
# DreamSim retrieval: NOAA queries against GSV candidates (default metric is cosine)
dreamsim_results = find_top_neighbors(noaa_dreamsim_emb, gsv_dreamsim_emb)

In [None]:
# DINO retrieval: NOAA queries against GSV candidates (default metric is cosine)
dino_results = find_top_neighbors(noaa_dino_emb, gsv_dino_emb)

In [None]:
# OpenCLIP retrieval: NOAA queries against GSV candidates (default metric is cosine)
openclip_results = find_top_neighbors(noaa_openclip_emb, gsv_openclip_emb)

In [None]:
# Save results: one pickle per model under ./results/
os.makedirs("./results/", exist_ok=True)

for out_path, model_results in (
    ("./results/dreamsim_results.pkl", dreamsim_results),
    ("./results/dino_results.pkl", dino_results),
    ("./results/openclip_results.pkl", openclip_results),
):
    with open(out_path, "wb") as f:
        pickle.dump(model_results, f)

print("Image retrieval with Haversine filtering completed!")

In [None]:
# pickle_files_results = [
#     "/content/dreamsim_results.pkl",
#     "/content/dino_results.pkl",
#     "/content/openclip_results.pkl"
# ]

# # Download each file
# for file_path in pickle_files_results:
#     files.download(file_path)

In [None]:
# Number of NOAA queries, then the full DreamSim result mapping.
print(len(dreamsim_results), dreamsim_results, sep="\n")

In [None]:
# Number of NOAA queries, then the full DINO result mapping.
print(len(dino_results), dino_results, sep="\n")

In [None]:
# Number of NOAA queries, then the full OpenCLIP result mapping.
print(len(openclip_results), openclip_results, sep="\n")

In [None]:
# Force a garbage-collection pass; gc.collect() returns the number of
# unreachable objects it found, which the notebook shows as the cell output.
import gc
gc.collect()