In [2]:
import inspect
from plonk import PlonkPipeline

# 1. Load (if not already loaded)
if 'pipeline' not in locals():
    pipeline = PlonkPipeline("nicolas-dufour/PLONK_OSV_5M")

# 2. Reveal the EXACT signature of the call function
print("--- FUNCTION SIGNATURE ---")
print(inspect.signature(pipeline.__call__))

# 3. Reveal the documentation (if the author wrote it)
print("\n--- DOCUMENTATION ---")
print(pipeline.__call__.__doc__)

# 4. Check the 'sampler' object 
# It might control the number of steps
if hasattr(pipeline, "sampler"):
    print("\n--- SAMPLER ATTRIBUTES ---")
    print(dir(pipeline.sampler))

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


--- FUNCTION SIGNATURE ---
(images, batch_size=None, x_N=None, num_steps=None, scheduler=None, cfg=0, generator=None)

--- DOCUMENTATION ---
Sample from the model given conditioning.

        Args:
            images: Conditioning input (image or list of images)
            batch_size: Number of samples to generate (inferred from cond if not provided)
            x_N: Initial noise tensor (generated if not provided)
            num_steps: Number of sampling steps (uses default if not provided)
            sampler: Custom sampler function (uses default if not provided)
            scheduler: Custom scheduler function (uses default if not provided)
            cfg: Classifier-free guidance scale (default 15)
            generator: Random number generator

        Returns:
            Sampled GPS coordinates after postprocessing
        

--- SAMPLER ATTRIBUTES ---
['__annotations__', '__builtins__', '__call__', '__class__', '__closure__', '__code__', '__defaults__', '__delattr__', '__dic

In [4]:
import os
import requests
from tqdm import tqdm

# Dossier o√π on va stocker le mod√®le
local_dir = "./my_local_clip"
os.makedirs(local_dir, exist_ok=True)

# Les fichiers n√©cessaires pour CLIP avec leurs tailles approximatives (pour validation)
files_to_download = {
    "config.json": 4_800,
    "vocab.json": 1_042_000,
    "merges.txt": 456_000,
    "special_tokens_map.json": 400,
    "tokenizer_config.json": 700,
    "preprocessor_config.json": 350,
    "model.safetensors": 1_711_000_000  # ~1.7 GB
}

base_url = "https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/"

print(f"üìÅ T√©l√©chargement dans le dossier : {local_dir}\n")

for filename, expected_size in files_to_download.items():
    url = base_url + filename
    dest_path = os.path.join(local_dir, filename)
    
    # V√©rifier si le fichier existe et a une taille raisonnable
    if os.path.exists(dest_path):
        actual_size = os.path.getsize(dest_path)
        # Tol√©rance de 10% sur la taille
        if actual_size > expected_size * 0.9:
            print(f"‚úÖ {filename} existe d√©j√† ({actual_size / (1024*1024):.1f} MB)")
            continue
        else:
            print(f"‚ö†Ô∏è  {filename} existe mais semble incomplet, re-t√©l√©chargement...")
    
    print(f"‚¨áÔ∏è  T√©l√©chargement de {filename}...")
    
    try:
        # T√©l√©chargement avec support de reprise
        headers = {}
        mode = 'wb'
        initial_pos = 0
        
        if os.path.exists(dest_path):
            initial_pos = os.path.getsize(dest_path)
            headers['Range'] = f'bytes={initial_pos}-'
            mode = 'ab'
            print(f"   üì• Reprise depuis {initial_pos / (1024*1024):.1f} MB")
        
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        
        if response.status_code in [200, 206]:  # 200=nouveau, 206=reprise
            total_size = int(response.headers.get('content-length', 0)) + initial_pos
            
            # Utiliser tqdm pour une meilleure barre de progression
            with open(dest_path, mode) as file:
                with tqdm(
                    total=total_size,
                    initial=initial_pos,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=f"   {filename}"
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1 MB chunks
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))
            
            print(f"‚úÖ {filename} termin√© ! ({os.path.getsize(dest_path) / (1024*1024):.1f} MB)\n")
        else:
            print(f"‚ùå Erreur HTTP {response.status_code} pour {filename}\n")
            
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Erreur r√©seau pour {filename}: {e}\n")
        print(f"   üí° Relancez le script pour reprendre le t√©l√©chargement.\n")
    except KeyboardInterrupt:
        print(f"\n‚è∏Ô∏è  T√©l√©chargement interrompu. Relancez le script pour continuer.\n")
        break

print("\nüéâ T√âL√âCHARGEMENT TERMIN√â !")
print(f"üìÇ Mod√®le disponible dans : {os.path.abspath(local_dir)}")

# V√©rification finale
print("\nüìä V√©rification des fichiers :")
for filename in files_to_download.keys():
    path = os.path.join(local_dir, filename)
    if os.path.exists(path):
        size = os.path.getsize(path) / (1024*1024)
        print(f"   ‚úì {filename}: {size:.1f} MB")
    else:
        print(f"   ‚úó {filename}: MANQUANT")

üìÅ T√©l√©chargement dans le dossier : ./my_local_clip

‚úÖ config.json existe d√©j√† (0.0 MB)
‚úÖ vocab.json existe d√©j√† (0.9 MB)
‚úÖ merges.txt existe d√©j√† (0.5 MB)
‚úÖ special_tokens_map.json existe d√©j√† (0.0 MB)
‚úÖ tokenizer_config.json existe d√©j√† (0.0 MB)
‚úÖ preprocessor_config.json existe d√©j√† (0.0 MB)
‚ö†Ô∏è  model.safetensors existe mais semble incomplet, re-t√©l√©chargement...
‚¨áÔ∏è  T√©l√©chargement de model.safetensors...
   üì• Reprise depuis 368.0 MB


   model.safetensors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.59G/1.59G [01:20<00:00, 16.4MB/s]

‚úÖ model.safetensors termin√© ! (1631.3 MB)


üéâ T√âL√âCHARGEMENT TERMIN√â !
üìÇ Mod√®le disponible dans : c:\Users\Imed\Desktop\VMI\Project\Dev\plonk\my_local_clip

üìä V√©rification des fichiers :
   ‚úì config.json: 0.0 MB
   ‚úì vocab.json: 0.9 MB
   ‚úì merges.txt: 0.5 MB
   ‚úì special_tokens_map.json: 0.0 MB
   ‚úì tokenizer_config.json: 0.0 MB
   ‚úì preprocessor_config.json: 0.0 MB
   ‚úì model.safetensors: 1631.3 MB





In [1]:
from plonk import PlonkPipeline
import torch

# Load pipeline
pipeline = PlonkPipeline("nicolas-dufour/PLONK_OSV_5M")

# Check available methods
print("Available methods:")
print([m for m in dir(pipeline) if not m.startswith('_')])

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Available methods:
['compute_likelihood', 'compute_likelihood_grid', 'compute_localizability', 'cond_preprocessing', 'device', 'input_dim', 'manifold', 'model', 'model_path', 'network', 'postprocessing', 'preconditioning', 'sampler', 'scheduler', 'to']


In [2]:
from PIL import Image
import numpy as np

# Load a test image (use any image you have)
# For now, let's just test with a dummy image
test_image = Image.new('RGB', (224, 224), color='blue')

# Test 1: Compute likelihood at a specific point (Paris)
paris_coords = np.array([[48.8566, 2.3522]])  # [lat, lon]

try:
    likelihood = pipeline.compute_likelihood(test_image, paris_coords)
    print(f"‚úÖ compute_likelihood works! Result: {likelihood}")
except Exception as e:
    print(f"‚ùå Error: {e}")

# Test 2: Compute likelihood grid
try:
    grid = pipeline.compute_likelihood_grid(test_image)
    print(f"‚úÖ compute_likelihood_grid works! Grid shape: {grid.shape}")
except Exception as e:
    print(f"‚ùå Error: {e}")

# Test 3: Compute localizability
try:
    localizability = pipeline.compute_localizability(test_image)
    print(f"‚úÖ compute_localizability works! Score: {localizability}")
except Exception as e:
    print(f"‚ùå Error: {e}")

Likelihood NFE: 1022
‚úÖ compute_likelihood works! Result: tensor([0.6887], device='cuda:0')
Computing likelihood over a 19x37 grid (703 points)...


Computing Likelihood Grid: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:59<00:00, 59.42s/it]

Likelihood NFE: 1334
‚ùå Error: 'tuple' object has no attribute 'shape'



  with torch.cuda.amp.autocast(dtype=dtype):


Likelihood NFE: 938
‚úÖ compute_localizability works! Score: 0.6688985824584961


In [3]:
import torch
import numpy as np
import pandas as pd
from plonk import PlonkPipeline
from PIL import Image
from tqdm import tqdm
import os

# Load PLONK pipeline
pipeline = PlonkPipeline("nicolas-dufour/PLONK_OSV_5M")
pipeline.network.to("cuda")

print("‚úÖ Pipeline loaded")

‚úÖ Pipeline loaded


In [7]:
from huggingface_hub import HfFileSystem

# Browse the dataset structure
fs = HfFileSystem()

print("OSV-5M dataset structure:")
files = fs.ls("datasets/osv5m/osv5m", detail=False)
for f in files[:20]:  # First 20 items
    print(f)

OSV-5M dataset structure:
datasets/osv5m/osv5m/images
datasets/osv5m/osv5m/.gitattributes
datasets/osv5m/osv5m/README.md
datasets/osv5m/osv5m/osv5m.py
datasets/osv5m/osv5m/test.csv
datasets/osv5m/osv5m/train.csv


In [5]:
from huggingface_hub import hf_hub_download

# Download the test CSV
csv_path = hf_hub_download(
    repo_id="osv5m/osv5m",
    repo_type="dataset",
    filename="test.csv"
)

print(f"‚úÖ Downloaded test.csv to: {csv_path}")

# Load and inspect it
import pandas as pd
df = pd.read_csv(csv_path)

print(f"\nüìä Test set size: {len(df)} images")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

‚úÖ Downloaded test.csv to: C:\Users\Imed\.cache\huggingface\hub\datasets--osv5m--osv5m\snapshots\cff33609b56b54d8743b7ee7a416eb8433e9a681\test.csv

üìä Test set size: 210122 images

Columns: ['id', 'latitude', 'longitude', 'thumb_original_url', 'country', 'sequence', 'captured_at', 'lon_bin', 'lat_bin', 'cell', 'land_cover', 'road_index', 'drive_side', 'climate', 'soil', 'dist_sea', 'region', 'sub-region', 'city', 'unique_city', 'unique_sub-region', 'unique_region', 'unique_country', 'quadtree_10_1000', 'creator_username', 'creator_id']

First few rows:
                 id   latitude  longitude  \
0   547473234108938 -16.336027  45.628280   
1   826109781317024  50.855687  56.147997   
2  1006398440000844  37.956651  14.954485   
3  2943891539215481  12.373333  -8.909906   
4   122945119799579   7.510295  99.061884   

                                  thumb_original_url country  \
0  https://scontent-cdg4-3.xx.fbcdn.net/m1/v/t6/A...      MG   
1  https://scontent-cdg4-1.xx.fbcdn.net

## Randooooooom sampling

In [6]:
import numpy as np
# Sample 5000 images randomly
np.random.seed(42)
df_sample = df.sample(n=50000, random_state=42).reset_index(drop=True)

print(f"‚úÖ Sampled {len(df_sample)} images")
print(f"\nCountry distribution (top 10):")
print(df_sample['country'].value_counts().head(10))

print(f"\nRegion distribution (top 10):")
print(df_sample['region'].value_counts().head(10))

# Check for missing labels
print(f"\nMissing labels:")
print(f"  Country: {df_sample['country'].isna().sum()}")
print(f"  Region: {df_sample['region'].isna().sum()}")
print(f"  City: {df_sample['city'].isna().sum()}")

‚úÖ Sampled 50000 images

Country distribution (top 10):
country
US    5596
RU    3498
AU    2572
BR    2417
CA    2274
IN    1560
MX    1138
CN    1087
AR    1032
KZ     922
Name: count, dtype: int64

Region distribution (top 10):
region
New South Wales      632
Queensland           604
Ontario              551
Texas                524
Western Australia    514
British Columbia     473
Montana              370
South Australia      348
Minas Gerais         332
Bahia                326
Name: count, dtype: int64

Missing labels:
  Country: 0
  Region: 434
  City: 3


# Download dataset

In [1]:
import requests
from tqdm import tqdm
import os

# Get the download URL for just the first zip
from huggingface_hub import hf_hub_url

url = hf_hub_url(
    repo_id="osv5m/osv5m",
    filename="images/test/00.zip",
    repo_type="dataset"
)

print(f"Downloading 00.zip (2.10 GB)...")
print(f"URL: {url}\n")

os.makedirs("datasets/osv5m/images/test", exist_ok=True)

# Download with visible progress bar
response = requests.get(url, stream=True, timeout=30)
total_size = int(response.headers.get('content-length', 0))

output_path = "datasets/osv5m/00.zip"

with open(output_path, 'wb') as file:
    with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
            pbar.update(len(chunk))

print(f"\n‚úÖ Downloaded to {output_path}")
print(f"Size: {os.path.getsize(output_path) / (1024**3):.2f} GB")

# Extract it
import zipfile
print("\nExtracting...")
with zipfile.ZipFile(output_path, 'r') as zip_ref:
    members = zip_ref.namelist()
    for member in tqdm(members, desc="Extracting"):
        zip_ref.extract(member, "datasets/osv5m/images/test")

print("‚úÖ Done! You have ~42,000 images now")

  from .autonotebook import tqdm as notebook_tqdm


Downloading 00.zip (2.10 GB)...
URL: https://huggingface.co/datasets/osv5m/osv5m/resolve/main/images/test/00.zip



Downloading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2.25G/2.25G [03:45<00:00, 9.99MB/s]



‚úÖ Downloaded to datasets/osv5m/00.zip
Size: 2.10 GB

Extracting...


Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50001/50001 [00:50<00:00, 997.41it/s] 

‚úÖ Done! You have ~42,000 images now





In [7]:
import pandas as pd
import os

# Paths
image_dir = r"C:\Users\Imed\Desktop\VMI\Project\Dev\plonk\datasets\osv5m\images\test\00"
csv_path = r"C:\Users\Imed\.cache\huggingface\hub\datasets--osv5m--osv5m\snapshots\cff33609b56b54d8743b7ee7a416eb8433e9a681\test.csv"

# Load CSV
df_full = pd.read_csv(csv_path)

# Get image IDs from files
image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]
downloaded_ids = set([f.replace('.jpg', '') for f in image_files])

print(f"üì∏ Downloaded images: {len(downloaded_ids):,}")
print(f"Example downloaded ID: {list(downloaded_ids)[0]}")
print(f"Type: {type(list(downloaded_ids)[0])}\n")

print(f"Example CSV ID: {df_full['id'].iloc[0]}")
print(f"Type: {type(df_full['id'].iloc[0])}\n")

# Convert both to strings for matching
df_full['id_str'] = df_full['id'].astype(str)
df_matched = df_full[df_full['id_str'].isin(downloaded_ids)].copy()

print(f"‚úÖ Matched: {len(df_matched):,} images")
print(f"\nCountry distribution (top 10):")
print(df_matched['country'].value_counts().head(10))

# Sample 50000
df_sample = df_matched.sample(n=min(50000, len(df_matched)), random_state=42).reset_index(drop=True)
print(f"\n‚úÖ Final dataset: {len(df_sample):,} images")
print(f"Countries: {df_sample['country'].nunique()}")

üì∏ Downloaded images: 50,000
Example downloaded ID: 167304391973915
Type: <class 'str'>

Example CSV ID: 547473234108938
Type: <class 'numpy.int64'>

‚úÖ Matched: 50,000 images

Country distribution (top 10):
country
US    5689
RU    3622
AU    2630
BR    2427
CA    2303
IN    1478
MX    1147
AR    1026
CN    1022
KZ     923
Name: count, dtype: int64

‚úÖ Final dataset: 50,000 images
Countries: 217


In [8]:
# 1. Check the matched dataset
print("Do we have df_sample saved?")
print(f"df_sample exists: {'df_sample' in locals()}")

if 'df_sample' in locals():
    print(f"\n‚úÖ We have {len(df_sample)} images with labels")
    print(f"Columns: {df_sample.columns.tolist()[:10]}")
    print(f"\nFirst row:")
    print(df_sample.iloc[0][['id', 'country', 'region', 'latitude', 'longitude']])
else:
    print("‚ùå Need to rerun matching")

# 2. Check what model we loaded
print("\n" + "="*60)
print("PLONK Model Check:")
print("="*60)

print(f"\nModel path: {pipeline.model_path if hasattr(pipeline, 'model_path') else 'Unknown'}")

# Check the image encoder
if hasattr(pipeline, 'cond_preprocessing'):
    print("‚úÖ Has cond_preprocessing (image encoder)")
    
# What's the actual backbone?
print("\nPipeline structure:")
for attr in ['network', 'preconditioning', 'image_encoder', 'backbone']:
    if hasattr(pipeline, attr):
        obj = getattr(pipeline, attr)
        print(f"  ‚úÖ {attr}: {type(obj)}")

Do we have df_sample saved?
df_sample exists: True

‚úÖ We have 50000 images with labels
Columns: ['id', 'latitude', 'longitude', 'thumb_original_url', 'country', 'sequence', 'captured_at', 'lon_bin', 'lat_bin', 'cell']

First row:
id           301474474887470
country                   RU
region                 Sakha
latitude           62.628466
longitude         135.889643
Name: 0, dtype: object

PLONK Model Check:

Model path: nicolas-dufour/PLONK_OSV_5M
‚úÖ Has cond_preprocessing (image encoder)

Pipeline structure:
  ‚úÖ network: <class 'plonk.models.pretrained_models.Plonk'>
  ‚úÖ preconditioning: <class 'plonk.models.preconditioning.DDPMPrecond'>


## Just some code to see the feature extractor inside the model

In [27]:
# Get the feature extractor
feature_extractor = pipeline.cond_preprocessing

print(f"Feature Extractor: {type(feature_extractor)}")
print(f"Type: {feature_extractor.__class__.__name__}\n")

# Check its attributes
print("Feature extractor attributes:")
for attr in dir(feature_extractor):
    if not attr.startswith('_'):
        print(f"  - {attr}")

# Try to get the actual model
if hasattr(feature_extractor, 'model'):
    print(f"\n‚úÖ Has model: {type(feature_extractor.model)}")
if hasattr(feature_extractor, 'backbone'):
    print(f"‚úÖ Has backbone: {type(feature_extractor.backbone)}")
if hasattr(feature_extractor, 'encoder'):
    print(f"‚úÖ Has encoder: {type(feature_extractor.encoder)}")

# Test extraction
print("\n" + "="*60)
print("Testing feature extraction:")
print("="*60)
# The feature extractor wants: {"img": [list of images]}

test_img = Image.open(os.path.join(image_dir, f"{df_sample.iloc[0]['id']}.jpg"))

with torch.no_grad():
    # Correct format: dict with 'img' key
    result = feature_extractor({"img": [test_img]})

print(f"Output type: {type(result)}")
if isinstance(result, dict):
    print(f"Keys: {result.keys()}")
    for key, value in result.items():
        if torch.is_tensor(value):
            print(f"  '{key}': shape {value.shape}, dtype {value.dtype}")
        else:
            print(f"  '{key}': {type(value)}")

Feature Extractor: <class 'plonk.pipe.StreetClipFeatureExtractor'>
Type: StreetClipFeatureExtractor

Feature extractor attributes:
  - device
  - emb_model
  - processor

Testing feature extraction:
Output type: <class 'dict'>
Keys: dict_keys(['img', 'emb'])
  'img': <class 'list'>
  'emb': shape torch.Size([1, 1024]), dtype torch.float32


# Extracting embeddings

In [28]:
import torch
from PIL import Image
from tqdm import tqdm
import numpy as np

print("Extracting StreetCLIP features from 50,000 images...")
print("Feature dimension: 1024")

features_list = []
countries_list = []
regions_list = []
indices_list = []

batch_size = 64  # Larger batch for faster extraction

for i in tqdm(range(0, len(df_sample), batch_size), desc="Extracting"):
    batch_df = df_sample.iloc[i:i+batch_size]
    
    # Load images for this batch
    images = []
    valid_rows = []
    
    for idx, row in batch_df.iterrows():
        img_path = os.path.join(image_dir, f"{row['id']}.jpg")
        try:
            img = Image.open(img_path).convert('RGB')
            images.append(img)
            valid_rows.append(row)
        except:
            continue
    
    if len(images) == 0:
        continue
    
    # Extract features
    with torch.no_grad():
        batch_dict = feature_extractor({"img": images})
        features = batch_dict['emb'].cpu().numpy()
    
    # Store features and labels
    for feat_idx, row in enumerate(valid_rows):
        features_list.append(features[feat_idx])
        countries_list.append(row['country'])
        regions_list.append(row['region'] if pd.notna(row['region']) else 'UNKNOWN')
        indices_list.append(row.name)

# Convert to numpy
X = np.array(features_list)
y_country = np.array(countries_list)
y_region = np.array(regions_list)

print(f"\n‚úÖ Extraction complete!")
print(f"Features: {X.shape}")
print(f"Countries: {len(np.unique(y_country))} unique")
print(f"Regions: {len(np.unique(y_region))} unique")

# Save for later use
np.save('streetclip_features.npy', X)
np.save('country_labels.npy', y_country)
np.save('region_labels.npy', y_region)
print(f"\nüíæ Saved features to disk")

Extracting StreetCLIP features from 5,000 images...
Feature dimension: 1024
Estimated time: ~10-15 minutes



Extracting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 782/782 [1:21:10<00:00,  6.23s/it]



‚úÖ Extraction complete!
Features: (50000, 1024)
Countries: 217 unique
Regions: 2050 unique

üíæ Saved features to disk


In [10]:
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print("="*60)
print("RELOADING FEATURES & RETRAINING LINEAR PROBE")
print("="*60)

# 1. Load saved features
print("\n1. Loading features from disk...")
X = np.load('streetclip_features.npy')
y_country = np.load('country_labels.npy')

print(f"‚úÖ Features loaded: {X.shape}")
print(f"‚úÖ Labels loaded: {y_country.shape}")
print(f"   Unique countries: {len(np.unique(y_country))}")

# 2. Filter countries with >=2 samples
print("\n2. Filtering countries...")
unique, counts = np.unique(y_country, return_counts=True)
valid_countries = unique[counts >= 2]

print(f"   Total countries: {len(unique)}")
print(f"   Countries with ‚â•2 samples: {len(valid_countries)}")

valid_mask = np.isin(y_country, valid_countries)
X_filtered = X[valid_mask]
y_filtered = y_country[valid_mask]

print(f"   Samples kept: {len(X_filtered)}/{len(X)} ({100*len(X_filtered)/len(X):.1f}%)")

# 3. Split train/test
print("\n3. Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
)

print(f"   Train: {len(X_train)} samples")
print(f"   Test: {len(X_test)} samples")

# 4. Train classifier
print("\n4. Training logistic regression...")
country_clf = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, verbose=1)
country_clf.fit(X_train, y_train)

# 5. Evaluate
print("\n5. Evaluating...")
y_pred = country_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Top-5 accuracy
probs = country_clf.predict_proba(X_test)
top5_preds = np.argsort(probs, axis=1)[:, -5:]
top5_acc = np.mean([y_test[i] in country_clf.classes_[top5_preds[i]] 
                     for i in range(len(y_test))])

print(f"\n{'='*60}")
print("RESULTS")
print(f"{'='*60}")
print(f"Top-1 Accuracy: {acc*100:.2f}%")
print(f"Top-5 Accuracy: {top5_acc*100:.2f}%")
print(f"Random Baseline: {100.0/len(valid_countries):.2f}%")
print(f"Improvement: {acc/(1.0/len(valid_countries)):.1f}x")

# 6. Save everything
print(f"\n{'='*60}")
print("SAVING TO DISK")
print(f"{'='*60}")

# Save the trained model
with open('country_classifier.pkl', 'wb') as f:
    pickle.dump(country_clf, f)
print("‚úÖ Saved: country_classifier.pkl")

# Save train/test splits
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)
np.save('y_pred.npy', y_pred)
print("‚úÖ Saved: train/test splits & predictions")

# Save results summary
results = {
    'accuracy': acc,
    'top5_accuracy': top5_acc,
    'n_countries': len(valid_countries),
    'n_train': len(X_train),
    'n_test': len(X_test),
}

np.save('results.npy', results)
print("‚úÖ Saved: results.npy")

print(f"\n{'='*60}")
print("‚úÖ ALL DONE! Everything saved to disk.")
print(f"{'='*60}")

RELOADING FEATURES & RETRAINING LINEAR PROBE

1. Loading features from disk...
‚úÖ Features loaded: (50000, 1024)
‚úÖ Labels loaded: (50000,)
   Unique countries: 217

2. Filtering countries...
   Total countries: 217
   Countries with ‚â•2 samples: 210
   Samples kept: 49993/50000 (100.0%)

3. Splitting train/test...
   Train: 39994 samples
   Test: 9999 samples

4. Training logistic regression...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.



5. Evaluating...

RESULTS
Top-1 Accuracy: 84.97%
Top-5 Accuracy: 97.26%
Random Baseline: 0.48%
Improvement: 178.4x

SAVING TO DISK
‚úÖ Saved: country_classifier.pkl
‚úÖ Saved: train/test splits & predictions
‚úÖ Saved: results.npy

‚úÖ ALL DONE! Everything saved to disk.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

print("="*60)
print("LINEAR PROBING EXPERIMENT")
print("="*60)

# Filter out countries with < 2 samples (needed for stratified split)
unique, counts = np.unique(y_country, return_counts=True)
valid_countries = unique[counts >= 2]

print(f"\nFiltering countries:")
print(f"  Total countries: {len(unique)}")
print(f"  Countries with ‚â•2 samples: {len(valid_countries)}")

# Keep only samples from valid countries
valid_mask = np.isin(y_country, valid_countries)
X_filtered = X[valid_mask]
y_filtered = y_country[valid_mask]

print(f"  Samples kept: {len(X_filtered)}/{len(X)} ({100*len(X_filtered)/len(X):.1f}%)")

# Split train/test (80/20) with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
)

print(f"\nDataset split:")
print(f"  Train: {len(X_train)} images")
print(f"  Test:  {len(X_test)} images")

# ============================================================
# COUNTRY CLASSIFICATION
# ============================================================
print("\n" + "="*60)
print("COUNTRY CLASSIFICATION")
print("="*60)

print(f"Training classifier for {len(valid_countries)} countries...")
country_clf = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, verbose=1)
country_clf.fit(X_train, y_train)

# Predict
y_pred = country_clf.predict(X_test)
country_acc = accuracy_score(y_test, y_pred)

# Top-5 accuracy
country_probs = country_clf.predict_proba(X_test)
top5_preds = np.argsort(country_probs, axis=1)[:, -5:]
top5_acc = np.mean([y_test[i] in country_clf.classes_[top5_preds[i]] 
                     for i in range(len(y_test))])

# Random baseline
random_baseline = 1.0 / len(valid_countries)

print(f"\n{'='*60}")
print("RESULTS")
print(f"{'='*60}")
print(f"\nCountry Classification ({len(valid_countries)} classes):")
print(f"  ‚úÖ Top-1 Accuracy: {country_acc*100:.2f}%")
print(f"  ‚úÖ Top-5 Accuracy: {top5_acc*100:.2f}%")
print(f"  üìä Random Baseline: {random_baseline*100:.2f}%")
print(f"  üöÄ Improvement: {country_acc/random_baseline:.1f}x better than random")

# Most confused countries
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred, labels=country_clf.classes_)
print(f"\nPer-country accuracy (top 10 by sample count):")
for country in valid_countries[:10]:
    if country in y_test:
        country_mask = y_test == country
        if country_mask.sum() > 0:
            country_acc_individual = (y_pred[country_mask] == country).mean()
            print(f"  {country}: {country_acc_individual*100:.1f}% ({country_mask.sum()} test samples)")

# Save results
results = {
    'country_accuracy': country_acc,
    'top5_accuracy': top5_acc,
    'n_countries': len(valid_countries),
    'n_train': len(X_train),
    'n_test': len(X_test),
    'random_baseline': random_baseline
}

np.save('probe_results.npy', results) # type: ignore
print(f"\nüíæ Results saved to probe_results.npy")

# Testing on random images from the net

In [41]:
from PIL import Image
import numpy as np

def test_local_image(image_path, country_name="Unknown"):
    """Test PLONK on a local image"""
    
    print(f"\n{'='*60}")
    print(f"Testing image: {image_path}")
    print(f"Expected country: {country_name}")
    print(f"{'='*60}")
    
    # Load image
    try:
        img = Image.open(image_path).convert('RGB')
        print(f"‚úÖ Image loaded: {img.size}")
    except Exception as e:
        print(f"‚ùå Failed to load image: {e}")
        return None
    
    # Extract features
    with torch.no_grad():
        batch_dict = feature_extractor({"img": [img]})
        features = batch_dict['emb'].cpu().numpy()
    
    # Predict
    prediction = country_clf.predict(features)[0]
    probabilities = country_clf.predict_proba(features)[0]
    
    # Top 5
    top5_indices = np.argsort(probabilities)[-5:][::-1]
    top5_countries = country_clf.classes_[top5_indices]
    top5_probs = probabilities[top5_indices]
    
    print(f"\nüéØ Top-1 Prediction: {prediction}")
    if prediction == country_name:
        print(f"‚úÖ CORRECT!")
    else:
        print(f"‚ùå Wrong (expected {country_name})")
    
    print(f"\nTop 5 predictions:")
    for i, (country, prob) in enumerate(zip(top5_countries, top5_probs)):
        marker = "‚úÖ" if country == country_name else "  "
        print(f"{marker} {i+1}. {country}: {prob*100:.2f}%")
    
    # Display
    img.show()
    return img

# Test your local image
img = test_local_image("C:\\Users\\Imed\\Desktop\\VMI\\Project\\Dev\\plonk\\datasets\\other\\20170411_manchester-salford-quays-bridge_16-9.avif", country_name="AU")


Testing image: C:\Users\Imed\Desktop\VMI\Project\Dev\plonk\datasets\other\20170411_manchester-salford-quays-bridge_16-9.avif
Expected country: AU
‚úÖ Image loaded: (1920, 1080)

üéØ Top-1 Prediction: CN
‚ùå Wrong (expected AU)

Top 5 predictions:
   1. CN: 94.61%
   2. RU: 1.83%
   3. EG: 1.29%
   4. GB: 1.28%
   5. NG: 0.41%


# City level probing

In [None]:
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print("="*60)
print("TRAINING CITY-LEVEL CLASSIFIER")
print("="*60)

# 1. Load features (already extracted!)
X = np.load('streetclip_features.npy')
y_country = np.load('country_labels.npy')

print(f"\n‚úÖ Loaded features: {X.shape}")

# 2. Get city labels from df_sample
print("\nüìç Extracting city labels...")

# Create city labels (need unique identifier: city + country)
city_labels = []
for idx in range(len(df_sample)):
    city = df_sample.iloc[idx]['city']
    country = df_sample.iloc[idx]['country']
    
    # Handle missing cities
    if pd.isna(city) or city == '':
        city_label = f"UNKNOWN_{country}"
    else:
        # Make unique: "Paris_FR", "Paris_US" are different
        city_label = f"{city}_{country}"
    
    city_labels.append(city_label)

y_city = np.array(city_labels)

print(f"Total cities: {len(np.unique(y_city))}")

# 3. Filter cities with >= 2 samples (needed for stratified split)
unique_cities, counts = np.unique(y_city, return_counts=True)
valid_cities = unique_cities[counts >= 2]

print(f"Cities with ‚â•2 samples: {len(valid_cities)}")

valid_mask = np.isin(y_city, valid_cities)
X_filtered = X[valid_mask]
y_filtered = y_city[valid_mask]

print(f"Samples kept: {len(X_filtered)}/{len(X)} ({100*len(X_filtered)/len(X):.1f}%)")

# 4. Split train/test
print("\nüîÄ Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
)

print(f"Train: {len(X_train)} samples")
print(f"Test: {len(X_test)} samples")

# 5. Train classifier (this should be fast - features already extracted!)
print(f"\nüöÄ Training logistic regression for {len(valid_cities)} cities...")

city_clf = LogisticRegression(
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
    verbose=1,
    C=1.0,
    multi_class='multinomial'
)

city_clf.fit(X_train, y_train)

# 6. Evaluate
print("\nüìä Evaluating...")
y_pred = city_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Top-5 accuracy
probs = city_clf.predict_proba(X_test)
top5_preds = np.argsort(probs, axis=1)[:, -5:]
top5_acc = np.mean([y_test[i] in city_clf.classes_[top5_preds[i]] 
                     for i in range(len(y_test))])

print(f"\n{'='*60}")
print("CITY CLASSIFICATION RESULTS")
print(f"{'='*60}")
print(f"Top-1 Accuracy: {acc*100:.2f}%")
print(f"Top-5 Accuracy: {top5_acc*100:.2f}%")
print(f"Random Baseline: {100.0/len(valid_cities):.3f}%")
print(f"Improvement: {acc/(1.0/len(valid_cities)):.0f}x")

# 7. Save everything
print(f"\nüíæ Saving...")
with open('city_classifier.pkl', 'wb') as f:
    pickle.dump(city_clf, f)

np.save('y_city_test.npy', y_test)
np.save('y_city_pred.npy', y_pred)

print("‚úÖ Saved: city_classifier.pkl")
print("‚úÖ Saved: city test labels & predictions")

print(f"\n{'='*60}")
print("‚úÖ CITY CLASSIFIER TRAINED!")
print(f"{'='*60}")

# Region linear probing 

In [None]:
# ============================================================
# REGION LINEAR PROBE
# ============================================================
print("="*60)
print("REGION CLASSIFICATION")
print("="*60)

# 1. Load features and region labels
X = np.load('streetclip_features.npy')
y_region = np.load('region_labels.npy')

print(f"‚úÖ Features loaded: {X.shape}")
print(f"‚úÖ Region labels loaded: {y_region.shape}")

# 2. Filter regions with ‚â•2 samples
unique_regions, counts = np.unique(y_region, return_counts=True)
valid_regions = unique_regions[counts >= 2]
valid_mask = np.isin(y_region, valid_regions)

print(f"   Total regions: {len(unique_regions)}")
print(f"   Regions with ‚â•2 samples: {len(valid_regions)}")
print(f"   Samples kept: {valid_mask.sum()}/{len(y_region)} ({valid_mask.sum()/len(y_region)*100:.1f}%)")

# 3. Filter features and labels
X_filtered = X[valid_mask]
y_filtered = y_region[valid_mask]

# 4. Train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_filtered
)

print(f"   Train: {len(X_train)} samples")
print(f"   Test: {len(X_test)} samples")

# 5. Train classifier
from sklearn.linear_model import LogisticRegression

print("4. Training logistic regression...")
clf_region = LogisticRegression(
    max_iter=1000, 
    n_jobs=-1,  # Use all cores
    random_state=42,
    verbose=1   # Show progress
)

clf_region.fit(X_train, y_train)

# 6. Evaluate
print("5. Evaluating...")

# Top-1 accuracy
y_pred = clf_region.predict(X_test)
top1_acc = np.mean(y_pred == y_test)

# Top-5 accuracy
y_proba = clf_region.predict_proba(X_test)
top5_pred = np.argsort(y_proba, axis=1)[:, -5:]
top5_acc = np.mean([y_test[i] in top5_pred[i] for i in range(len(y_test))])

print("="*60)
print("REGION RESULTS")
print("="*60)
print(f"Top-1 Accuracy: {top1_acc*100:.2f}%")
print(f"Top-5 Accuracy: {top5_acc*100:.2f}%")
print(f"Classes: {len(valid_regions)}")

# 7. Save
import pickle
with open('region_classifier.pkl', 'wb') as f:
    pickle.dump(clf_region, f)
    
np.save('X_train_region.npy', X_train)
np.save('X_test_region.npy', X_test)
np.save('y_train_region.npy', y_train)
np.save('y_test_region.npy', y_test)
np.save('y_pred_region.npy', y_pred)

print("‚úÖ Saved: region_classifier.pkl")
print("="*60)

# Evaluation

# our test set on plonk 

In [None]:
# ============================================================
# PLONK FULL MODEL COMPARISON (100 images)
# ============================================================
from plonk import PlonkPipeline
import torch
from PIL import Image
from tqdm import tqdm
import numpy as np

print("="*60)
print("TESTING PLONK FULL MODEL ON 100 IMAGES")
print("="*60)

# Load PLONK - CORRECT WAY
pipeline = PlonkPipeline("nicolas-dufour/PLONK_OSV_5M")

# Prepare centroids for reverse geocoding
country_centroids = df_sample.groupby('country')[['latitude', 'longitude']].mean()
region_centroids = df_sample.groupby('region')[['latitude', 'longitude']].mean()
city_centroids = df_sample.groupby('city')[['latitude', 'longitude']].mean()

def gps_to_label(lat, lon, centroids):
    """Find nearest centroid"""
    min_dist = float('inf')
    best_label = None
    for label in centroids.index:
        c_lat = centroids.loc[label, 'latitude']
        c_lon = centroids.loc[label, 'longitude']
        dist = ((lat - c_lat)**2 + (lon - c_lon)**2)**0.5
        if dist < min_dist:
            min_dist = dist
            best_label = label
    return best_label

# Get test indices (first 10000 sampled)
test_sample = test_idx[:10000]

plonk_country_correct = 0
plonk_region_correct = 0
plonk_city_correct = 0

image_dir = r"C:\Users\Imed\Desktop\VMI\Project\Dev\plonk\datasets\osv5m\images\test\00"

for i in tqdm(range(10000), desc="PLONK inference"):
    orig_idx = test_sample[i]
    
    # Get image
    img_id = df_sample.iloc[orig_idx]['id']
    img_path = os.path.join(image_dir, f"{img_id}.jpg")
    
    try:
        img = Image.open(img_path).convert('RGB')
        
        # PLONK prediction - returns GPS coordinates
        gps_coords = pipeline([img], batch_size=1)  # List of images
        pred_lat, pred_lon = float(gps_coords[0][0]), float(gps_coords[0][1])
        
        # True labels
        true_country = df_sample.iloc[orig_idx]['country']
        true_region = df_sample.iloc[orig_idx]['region']
        true_city = df_sample.iloc[orig_idx]['city']
        
        # Reverse geocode
        pred_country = gps_to_label(pred_lat, pred_lon, country_centroids)
        pred_region = gps_to_label(pred_lat, pred_lon, region_centroids)
        pred_city = gps_to_label(pred_lat, pred_lon, city_centroids)
        
        # Count correct
        if pred_country == true_country:
            plonk_country_correct += 1
        if pred_region == true_region:
            plonk_region_correct += 1
        if pred_city == true_city:
            plonk_city_correct += 1
            
    except Exception as e:
        print(f"Error on {img_id}: {e}")
        continue

print(f"\n{'='*60}")
print("PLONK RESULTS (10000 images)")
print(f"{'='*60}")
print(f"Country: {plonk_country_correct}%")
print(f"Region:  {plonk_region_correct}%")
print(f"City:    {plonk_city_correct}%")
print(f"{'='*60}")