## IMC24 Starter for Image Matching Challenge 2024 Hexathlon.

IMC24 Starter came into existence thanks to [imc-understanding-the-baseline][1]. The mean Average Accuracy (mAA) metric comes from [IMC2024-3D-metric-evaluation-example][2].<br>
To further modify the code for submission and scoring, the utility script can be accessed [here][3].

[1]: https://www.kaggle.com/code/asarvazyan/imc-understanding-the-baseline
[2]: https://www.kaggle.com/code/fabiobellavia/imc2024-3d-metric-evaluation-example
[3]: https://www.kaggle.com/code/nartaa/imc24

# SETUP

In [1]:
from imc24 import *

# SIMILAR PAIRS

In [2]:
def get_pairs(images_list,device=DEVICE):
    """Select the image index pairs that should be handed to the matcher.

    When ``EXHAUSTIVE`` is set, every unordered pair is returned. Otherwise
    each image is embedded with the model stored under
    ``/kaggle/input/dinov2/pytorch/base/1/`` and a pair is kept when the
    distance between the two embeddings is at most ``DISTANCES_THRESHOLD``.
    Images left without any partner fall back to their nearest neighbours,
    and finally every pair at distance ``TOLERANCE`` or more is discarded.

    Args:
        images_list: Sequence of image paths accepted by ``K.io.load_image``.
        device: Device on which the embedding model and the images are placed.

    Returns:
        List of ``(i, j)`` index tuples into ``images_list`` with ``i < j``,
        without duplicates, in row-major order.
    """
    if EXHAUSTIVE:
        return list(combinations(range(len(images_list)), 2))

    # Nothing to embed: torch.cat below would raise on an empty list.
    if len(images_list) == 0:
        return []

    processor = AutoImageProcessor.from_pretrained('/kaggle/input/dinov2/pytorch/base/1/')
    model = AutoModel.from_pretrained('/kaggle/input/dinov2/pytorch/base/1/').eval().to(device)

    embeddings = []
    for img_path in images_list:
        image = K.io.load_image(img_path, K.io.ImageLoadType.RGB32, device=device)[None, ...]
        with torch.inference_mode():
            inputs = processor(images=image, return_tensors="pt", do_rescale=False, do_resize=True,
                               do_center_crop=True, size=224).to(device)
            outputs = model(**inputs)
            # Max over dim 1 of the last hidden state, then normalise:
            # one descriptor row per image.
            embedding = F.normalize(outputs.last_hidden_state.max(dim=1)[0])
        embeddings.append(embedding)

    embeddings = torch.cat(embeddings, dim=0)
    distances = torch.cdist(embeddings, embeddings).cpu().numpy()

    is_pair = distances <= DISTANCES_THRESHOLD
    np.fill_diagonal(is_pair, False)

    # Images with no partner under the threshold get their nearest neighbours.
    # The image itself is excluded explicitly rather than by assuming it sorts
    # first, so duplicate images (distance 0) cannot produce a self-pair.
    # The neighbour count stays at MIN_PAIRS - 1, as in the original slice.
    n_fallback = max(MIN_PAIRS - 1, 0)
    for idx0 in np.where(is_pair.sum(axis=1) == 0)[0]:
        order = np.argsort(distances[idx0])
        nearest = order[order != idx0][:n_fallback]
        is_pair[idx0, nearest] = True

    # Hard cut-off applied after the fallback, so it can also remove forced pairs.
    is_pair[distances >= TOLERANCE] = False

    # (i, j) and (j, i) describe the same pair: merge both triangles and read
    # only the strict upper one, which yields unique pairs with i < j.
    symmetric = is_pair | is_pair.T
    rows, cols = np.where(np.triu(symmetric, k=1))
    return list(zip(rows.tolist(), cols.tolist()))

# KEYPOINTS EXTRACTOR AND MATCHER

In [3]:
def keypoints_matches(images_list,pairs):
    """Extract keypoints for every image and match the requested pairs.

    Three HDF5 files are written to the current working directory:
    ``keypoints.h5`` and ``descriptors.h5`` hold one dataset per image, keyed
    by the image file name; ``matches.h5`` holds one group per first image of
    a pair and, inside it, one dataset per second image containing the match
    indices. A match dataset is only written when the pair has at least
    ``MIN_MATCHES`` matches.

    Args:
        images_list: Sequence of path objects exposing ``.name``.
        pairs: Iterable of ``(i, j)`` index pairs into ``images_list``.
    """
    extractor = ALIKED(max_num_keypoints=MAX_NUM_KEYPOINTS,detection_threshold=DETECTION_THRESHOLD,resize=RESIZE_TO).eval().to(DEVICE)
    matcher = KF.LightGlueMatcher("aliked", {'width_confidence':-1, 'depth_confidence':-1, 'mp':'cuda' in str(DEVICE)}).eval().to(DEVICE)

    with h5py.File("keypoints.h5", mode="w") as f_kp, h5py.File("descriptors.h5", mode="w") as f_desc:
        for image_path in images_list:
            with torch.inference_mode():
                image = load_image(image_path).to(DEVICE)
                feats = extractor.extract(image)
                # Assumes a leading batch axis of size 1 (TODO confirm).
                # squeeze(0) drops only that axis, so an image with a single
                # keypoint keeps a 2-D array instead of collapsing to 1-D.
                f_kp[image_path.name] = feats["keypoints"].squeeze(0).cpu().numpy()
                f_desc[image_path.name] = feats["descriptors"].squeeze(0).detach().cpu().numpy()

    with h5py.File("keypoints.h5", mode="r") as f_kp, h5py.File("descriptors.h5", mode="r") as f_desc, \
         h5py.File("matches.h5", mode="w") as f_matches:
        for idx1, idx2 in pairs:
            key1, key2 = images_list[idx1].name, images_list[idx2].name
            kp1 = torch.from_numpy(f_kp[key1][...]).to(DEVICE)
            kp2 = torch.from_numpy(f_kp[key2][...]).to(DEVICE)
            desc1 = torch.from_numpy(f_desc[key1][...]).to(DEVICE)
            desc2 = torch.from_numpy(f_desc[key2][...]).to(DEVICE)
            with torch.inference_mode():
                _, idxs = matcher(desc1, desc2, KF.laf_from_center_scale_ori(kp1[None]), KF.laf_from_center_scale_ori(kp2[None]))

            n_matches = len(idxs)
            if n_matches == 0:
                # No group and no dataset for an empty result. This also stops
                # a group left over from a previous pair being reused here.
                continue
            group = f_matches.require_group(key1)
            if n_matches >= MIN_MATCHES:
                group.create_dataset(key2, data=idxs.detach().cpu().numpy())

# RANSAC AND SPARSE RECONSTRUCTION

In [4]:
def ransac_and_sparse_reconstruction(images_path):
    """Import the stored features into a COLMAP database and run the mapper.

    A fresh database named ``colmap_<timestamp>.db`` is created in the current
    directory and filled via ``add_keypoints`` / ``add_matches``, which are
    pointed at ``/kaggle/working/``. ``pycolmap.match_exhaustive`` is then run
    on the database (the RANSAC step, going by this notebook's section
    heading), followed by ``pycolmap.incremental_mapping``.

    Args:
        images_path: Image directory passed to ``add_keypoints`` and to the
            incremental mapper.

    Returns:
        The value returned by ``pycolmap.incremental_mapping``.
    """
    time_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    db_name = f'colmap_{time_str}.db'

    db = COLMAPDatabase.connect(db_name)
    try:
        db.create_tables()
        fname_to_id = add_keypoints(db, '/kaggle/working/', images_path, '', 'simple-pinhole', False)
        add_matches(db, '/kaggle/working/',fname_to_id)
        db.commit()
    finally:
        # Release the handle before pycolmap opens the same file, and do not
        # leak it if the import fails part-way.
        # NOTE(review): relies on COLMAPDatabase exposing close(), i.e. being
        # a sqlite3.Connection subclass; confirm against imc24.
        db.close()

    pycolmap.match_exhaustive(db_name, sift_options={'num_threads':1})
    mapper_options = pycolmap.IncrementalPipelineOptions(
        {'min_model_size':MIN_MODEL_SIZE, 'max_num_models':MAX_NUM_MODELS, 'num_threads':1}
    )
    maps = pycolmap.incremental_mapping(
        database_path=db_name,
        image_path=images_path,
        output_path='/kaggle/working/',
        options=mapper_options,
    )
    return maps

# HYPERPARAMETER TUNING

In [5]:
# SIMILAR PAIRS
# Trailing comments on each line appear to list previously tried values.
# When True, get_pairs returns every unordered image pair and skips the
# embedding-based shortlist entirely.
EXHAUSTIVE = True # True
# Upper bound of the nearest-neighbour slice used by get_pairs for images that
# have no partner under DISTANCES_THRESHOLD.
MIN_PAIRS = 15 # 50, 20, 15, 40, 100
# Embedding distance at or below which two images are paired.
DISTANCES_THRESHOLD = 0.2 # 0.3, 0.6, 0.2, 0.1
# Embedding distance at or above which a pair is always discarded.
TOLERANCE = 500 # 500, 1000

# KEYPOINTS EXTRACTOR AND MATCHER
# Passed to ALIKED as max_num_keypoints.
MAX_NUM_KEYPOINTS = 4096 # 4096
# Passed to ALIKED as resize.
RESIZE_TO = 1280 # 1024
# Passed to ALIKED as detection_threshold.
DETECTION_THRESHOLD = 0.005 # 0.005, 0.01
# Minimum number of matches a pair needs before it is written to matches.h5.
MIN_MATCHES = 100 # 100, 15

# RANSAC AND SPARSE RECONSTRUCTION
# Forwarded to pycolmap.IncrementalPipelineOptions as min_model_size.
MIN_MODEL_SIZE = 5 # 5, 3
# Forwarded to pycolmap.IncrementalPipelineOptions as max_num_models.
MAX_NUM_MODELS = 3 # 3, 2

# CROSS VALIDATION
# Maximum number of images sampled per (dataset, scene) group.
N_SAMPLES = 50 # 50

# Only referenced by the commented-out cells below (validation vs. submission).
SUBMISSION = True

## CROSS VALIDATION

In [6]:
def image_path(row):
    """Attach the relative image location to a label row.

    The path is ``train/<dataset>/images/<image_name>`` and is stored under the
    ``'image_path'`` key of ``row`` itself; the same object is returned so the
    function can be used with ``DataFrame.apply(..., axis=1)``.
    """
    segments = ('train', row['dataset'], 'images', row['image_name'])
    row['image_path'] = '/'.join(segments)
    return row

# Load the training labels, attach each image's relative path and drop rows
# that point at the same file.
train_df = pd.read_csv(f'{IMC_PATH}/train/train_labels.csv')
train_df = train_df.apply(image_path,axis=1).drop_duplicates(subset=['image_path'])

# Sample at most N_SAMPLES images per (dataset, scene); the fixed seed keeps
# the validation subset identical between runs.
G = train_df.groupby(['dataset','scene'])['image_path']
image_paths = []
for _, scene_paths in G:
    n = min(N_SAMPLES, len(scene_paths))
    # The loop no longer binds the name `image_path`, so the helper function
    # of that name is not overwritten at module level.
    image_paths.extend(scene_paths.sample(n,random_state=42))

# Ground truth restricted to the sampled images. The same rows, reduced to
# the columns written below, are saved as the input file for run().
gt_df = train_df[train_df.image_path.isin(image_paths)].reset_index(drop=True)
pred_df = gt_df[['image_path','dataset','scene','rotation_matrix','translation_vector']]
pred_df.to_csv('pred_df.csv',index=False)

# Run the pipeline, then score the submission.csv read back below against the ground truth.
run('pred_df.csv', get_pairs, keypoints_matches, ransac_and_sparse_reconstruction, submit=False)
pred_df = pd.read_csv('submission.csv')
mAA = round(score(gt_df, pred_df),4)
print('*** Total mean Average Accuracy ***')
print(f"mAA: {mAA}")


*** lizard ***


  M[:ndims, :ndims] *= math.sqrt(np.sum(v1) / np.sum(v0))
  M[:ndims, :ndims] *= math.sqrt(np.sum(v1) / np.sum(v0))



mAA: 0.6206

*** dioscuri ***

mAA: 0.1489

*** pond ***

mAA: 0.3369

*** transp_obj_glass_cylinder ***

mAA: 0.0202

*** multi-temporal-temple-baalshamin ***

mAA: 0.2376

*** church ***

mAA: 0.2234

*** transp_obj_glass_cup ***

mAA: 0.0152
*** Total mean Average Accuracy ***
mAA: 0.229


In [9]:
# # if not SUBMISSION:
#     def image_path(row):
#         row['image_path'] = 'train/' + row['dataset'] + '/images/' + row['image_name']
#         return row

#     train_df = pd.read_csv(f'{IMC_PATH}/train/train_labels.csv')
#     train_df = train_df.apply(image_path,axis=1).drop_duplicates(subset=['image_path'])
#     G = train_df.groupby(['dataset','scene'])['image_path']
#     image_paths = []
    
#     for g in G:
#         n = N_SAMPLES
#         n = n if n < len(g[1]) else len(g[1])
#         g = g[0],g[1].sample(n,random_state=42).reset_index(drop=True)
#         for image_path in g[1]:
#             image_paths.append(image_path)
        
#     gt_df = train_df[train_df.image_path.isin(image_paths)].reset_index(drop=True)
#     pred_df = gt_df[['image_path','dataset','scene','rotation_matrix','translation_vector']]
#     pred_df.to_csv('pred_df.csv',index=False)
#     run('pred_df.csv', get_pairs, keypoints_matches, ransac_and_sparse_reconstruction, submit=False)
#     pred_df = pd.read_csv('submission.csv')
#     mAA = round(score(gt_df, pred_df),4)
#     print('*** Total mean Average Accuracy ***')
#     print(f"mAA: {mAA}")

IndentationError: unexpected indent (2633846123.py, line 2)

# SUBMISSION

In [7]:
# if SUBMISSION:
#     data_path = IMC_PATH + "/sample_submission.csv"
#     run(data_path, get_pairs, keypoints_matches, ransac_and_sparse_reconstruction)