In [1]:
import numpy as np
import geopy as gp
import pandas as pd
from geopy.geocoders import Nominatim
from pathlib import Path
from tqdm import tqdm
from src.config import PROCESSED_DATA_DIR, RAW_DATA_DIR, INTERIM_DATA_DIR

[32m2024-12-21 09:36:31.820[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: G:\Work\DS\where-am-i[0m


In [2]:
annos = pd.read_parquet(RAW_DATA_DIR / 'train.parquet').reset_index()

In [None]:
annos

In [None]:
# initialize Nominatim API 
geolocator = Nominatim(user_agent="GetLoc")
geolocator.reverse((19.134120, -155.505545)).raw['address']

## Evaluation Metric - Geodesic Distance
 Specifically, for each query image, we compute the Geodesic
 distance between GPS coordinates predicted by our model and the respective ground truth. We
 calculate how many of them (in %) fall within the distance thresholds (1km, 25km, 200km, 750km,
 and 2500km) and report average performance of model over three runs.

## Criterion - Cosine Similarity

#### Precision levels
|precision  | (maximum X axis error, in km)    |
|--|--| 
1  | ± 2500
2  | ± 630
3   |± 78
4   |± 20
5   |± 2.4
6   |± 0.61
7   |± 0.076
8   |± 0.019
9   |± 0.0024
10  |± 0.00060
11  |± 0.000074

In [8]:
import pygeohash as pgh

# Example coordinates
latitude = 19.134120
longitude = -155.505545

# Encode coordinates into geohash
geohash = pgh.encode(latitude, longitude, precision=2)  # Precision determines geohash length
print(f"Geohash: {geohash}")

# Decode geohash back to coordinates
decoded_coords = pgh.decode(geohash)
print(f"Decoded Coordinates: {decoded_coords}")

from geopy import distance
from pygeohash.distances import geohash_approximate_distance

og = (latitude, longitude)
decoded = decoded_coords

print(distance.distance(og, decoded).km)
#print(geohash_approximate_distance('8e3wd2hw8', '8e3wh209f'))


Geohash: 8e
Decoded Coordinates: (20.0, -152.0)
380.110257016121


In [23]:
import pygeohash as pgh


def get_hashes(r: pd.Series):
    hash = pgh.encode(r['latitude'], r['longitude'], precision = 9)
    r['coarse'] = hash[:4]
    r['medium'] = hash[:6]
    r['fine'] = hash
    return r

hashed_annos = annos.apply(get_hashes, axis=1)

In [8]:
#hashed_annos.to_parquet(INTERIM_DATA_DIR / 'hashed_annos.parquet', index = False)
hashed_annos = pd.read_parquet(INTERIM_DATA_DIR / 'hashed_annos.parquet')

In [None]:
num_coarse = hashed_annos.loc[:, ['coarse']]['coarse'].unique()
num_medium = hashed_annos.loc[:, ['medium']]['medium'].unique()
num_fine = hashed_annos.loc[:, ['fine']]['fine'].unique()

In [61]:
l = [319277780027588, 472478470643498]
hashed_annos.set_index(keys='id').loc[l, [ 'coarse', 'medium', 'fine']].values

array([['8e3w', '8e3w7q', '8e3w7qcfmc'],
       ['8e3w', '8e3w5w', '8e3w5w9yqb']], dtype=object)

In [22]:
num_imgs = 0
for root, dirs, files in os.walk(INTERIM_DATA_DIR / 'train'):
    for file in files:
        num_imgs+=1

num_imgs

1206098

In [None]:
#df_pre_train = pd.read_csv(INTERIM_DATA_DIR / 'train/pre_train.csv').drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [17]:
import shutil
import os
import zipfile

df_pre_train = pd.read_csv(INTERIM_DATA_DIR / 'train/pre_train.csv')
ids = set(df_pre_train.loc[:, 'id'].values.tolist())
dic_ids = []
for root, dirs, files in os.walk(INTERIM_DATA_DIR / 'train'):
    for file in tqdm(files):
        id = int(file.split('.jpg')[0])
        if id not in ids:
            dic_ids.append(id)

100%|█████████▉| 1206098/1206100 [00:00<00:00, 1718026.12it/s]


ValueError: invalid literal for int() with base 10: 'pre_train.csv'

In [9]:
import shutil
import os
import zipfile

df_test = pd.read_csv(INTERIM_DATA_DIR / 'test.csv')
ids = set(df_test.loc[:, 'id'].values.tolist())
dic_ids = []
for root, dirs, files in os.walk(INTERIM_DATA_DIR / 'val'):
    for file in tqdm(files):
        id = int(file.split('.jpg')[0])
        if id in ids:
            dic_ids.append(id)
        else:
            os.remove(os.path.join(root, file))


df_val = df_test.set_index(keys='id').loc[dic_ids,].reset_index()
df_val.to_csv(INTERIM_DATA_DIR / 'val.csv')

100%|██████████| 6862/6862 [00:00<?, ?it/s]


In [None]:
for root, dirs, files in os.walk(INTERIM_DATA_DIR / 'val/04'):
    for file in tqdm(files):
        id = int(file.split('.jpg')[0])
        if id in ids:
            dic_ids.append(id)

In [19]:
df_pre_train = pd.concat([df_pre_train, hashed_annos.set_index(keys='id').loc[dic_ids, [ 'coarse', 'medium', 'fine']].reset_index()])
df_pre_train.to_csv(INTERIM_DATA_DIR / 'train/pre_train.csv', index=False)

In [20]:
df_train = df_pre_train.drop(columns=[col for col in df_pre_train.columns.to_list() if col not in ('id', 'coarse', 'medium', 'fine')])
df_train['coarse_i'], class_mapping = pd.factorize(df_train['coarse'])
df_train['medium_i'], class_mapping = pd.factorize(df_train['medium'])
df_train['fine_i'], class_mapping = pd.factorize(df_train['fine'])

In [21]:
df_train.drop(columns=['coarse', 'medium', 'fine']).to_csv(INTERIM_DATA_DIR / 'train/train.csv', index = False)
#df_train = pd.read_csv(INTERIM_DATA_DIR / 'train/train.csv')

In [18]:
df_pre_train = pd.read_csv(INTERIM_DATA_DIR / 'pre_train.csv')
len(df_pre_train['coarse'].str[:3].unique())

491

In [5]:
#Not using since pretrained ViT has inbuilt logic
class ViTPreProcessor(nn.Module):
    def __init__(self):
        super(ViTPreProcessor, self).__init__()

        self.unfold = nn.Unfold(kernel_size = (KERNEL_SIZE, KERNEL_SIZE), stride = KERNEL_SIZE) 
        self.patch_embed = nn.Linear(CHANNELS * KERNEL_SIZE ** 2, EMBED_DIM) 
        self.cls_token = nn.Parameter(torch.zeros(1, 1, EMBED_DIM)) # 1, 1, EMBED_DIM
        self.postional_embeds = nn.Parameter(torch.zeros(1, NUM_PATCHES + 1, EMBED_DIM))

    def forward(self, x):
        batch_size = x.size(0) #B

        patches = self.unfold(x) # B, C * KERNEL_SIZE^2, NUM_PATCHES per img 
        patches = patches.transpose(1, 2)
        patch_embeddings = self.patch_embed(patches) # B, NUM_PATCHES per img, EMBED_DIM
        cls_token = self.cls_token.expand(batch_size, -1, -1) # B, 1, EMBED_DIM
        cls_patches = torch.cat((cls_token, patch_embeddings), dim=1) # B, NUM_PATCHES per img + 1, EMBED_DIM
        
        return cls_patches + self.postional_embeds # B, NUM_PATCHES per img + 1, EMBED_DIM
