# We perform a simple analysis of the submissions, metadata provided, and generate a filtered list of valid submissions

## Read metadata and filter content

Load the metadata file and set the submission id as index.

In [1]:
import pandas as pd
from icdar24_maptext_analysis.paths import RELPATH_FILE_SUBMISSIONS_META
submissions_meta = pd.read_csv(RELPATH_FILE_SUBMISSIONS_META).set_index('ID')
submissions_meta

Unnamed: 0_level_0,Unnamed: 0,User,Date,Challenge,Task,Valid,Competition,Val. Set,Title,Filesize,Public,Eval.time,Results size
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
113398,93096,user_36250,2024-03-23 15:31:02,28,1,1,1,0,ds-lp,10318653,1,00:00:36,642.0
113413,93111,user_52424,2024-03-24 22:55:02,28,1,1,1,0,Test,278970683,1,00:00:44,321.0
113460,93158,user_36250,2024-03-26 06:24:55,28,4,1,1,0,DS-LP,14289896,1,00:26:18,467.0
113461,93159,user_36250,2024-03-26 06:26:21,28,3,1,1,0,DS-LP,14289896,1,00:25:14,459.0
113462,93160,user_36250,2024-03-26 06:27:32,28,2,1,1,0,DS-LP,14289896,1,00:03:56,792.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
114581,94279,user_51343,2024-05-06 04:57:11,28,2,1,1,0,MapText Detection-Linking Strong Pipeline,165918784,1,00:22:04,792.0
114582,94280,user_51343,2024-05-06 05:00:51,28,3,1,1,0,MapText Detection and Recognition Strong Pipeline,154636953,1,00:22:05,910.0
114583,94281,user_51343,2024-05-06 05:16:27,28,4,1,1,0,MapText Detection-Recognition-Linking Strong P...,165918784,1,00:22:04,939.0
114584,94282,user_51343,2024-05-06 05:16:27,28,4,1,1,0,MapText Detection-Recognition-Linking Strong P...,165918784,1,00:16:11,939.0


Based on a manual check, it looks like the following submissions can be deleted from our analysis as duplicates:

- Task 1: 114256, 114281, 114292
- Task 2: 114294, 114580
- Task 3: 114289, 114291, 114322
- Task 4: 114295, 114583, 114584

We believe all of them have Rumsey entries, but not all of them may have IGN entries.

In [2]:
duplicate_submission_ids_manual = [
    114256, 114281, 114292,  # task 1
    114294, 114580,  # task 2
    114289, 114291, 114322,  # task 3
    114295, 114583, 114584]  # task 4

Collect the set of submission ids for which evaluation is available.

In [3]:
from icdar24_maptext_analysis.loaders import VALID_SUBSETS, VALID_TASKS, list_results
available_results = []
for task_id in VALID_TASKS:
    for subset in VALID_SUBSETS:
        available_results.extend(list_results(task_id, subset))
available_results = set(int(x) for x in available_results)
len(available_results)

35

In [4]:
valid_submission_ids = available_results - set(duplicate_submission_ids_manual)
print(valid_submission_ids)
len(valid_submission_ids)


{114579, 114197, 114581, 114582, 114585, 113460, 113461, 113462, 113464, 113478, 113479, 113480, 113481, 114534, 114279, 114536, 114283, 114288, 114549, 114293, 114551, 114298, 114302, 114303}


24

In [5]:
submissions_meta_ignored = submissions_meta.loc[~submissions_meta.index.isin(valid_submission_ids)]
submissions_meta_valid = submissions_meta.loc[submissions_meta.index.isin(valid_submission_ids)]
print(f"Total submissions: {len(submissions_meta)}, valid: {len(valid_submission_ids)}, discarded: {len(submissions_meta_ignored)}")

Total submissions: 63, valid: 24, discarded: 39


### Valid submissions to be included in the report

In [6]:
submissions_meta_valid

Unnamed: 0_level_0,Unnamed: 0,User,Date,Challenge,Task,Valid,Competition,Val. Set,Title,Filesize,Public,Eval.time,Results size
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
113460,93158,user_36250,2024-03-26 06:24:55,28,4,1,1,0,DS-LP,14289896,1,00:26:18,467.0
113461,93159,user_36250,2024-03-26 06:26:21,28,3,1,1,0,DS-LP,14289896,1,00:25:14,459.0
113462,93160,user_36250,2024-03-26 06:27:32,28,2,1,1,0,DS-LP,14289896,1,00:03:56,792.0
113464,93162,user_36250,2024-03-26 06:33:14,28,1,1,1,0,DS-LP,14289896,1,00:05:55,792.0
113478,93176,user_9,2024-03-26 16:11:30,28,1,1,1,0,Baseline TESTR Checkpoint,34544956,1,00:22:08,792.0
113479,93177,user_9,2024-03-26 16:18:47,28,3,1,1,0,Baseline TESTR Checkpoint,34544956,1,00:22:09,917.0
113480,93178,user_9,2024-03-26 16:21:14,28,2,1,1,0,Baseline TESTR Checkpoint,34544956,1,00:29:47,796.0
113481,93179,user_9,2024-03-26 16:31:50,28,4,1,1,0,Baseline TESTR Checkpoint,34544956,1,00:22:48,944.0
114197,93895,user_51436,2024-04-27 16:10:36,28,1,1,1,0,ensem,96110879,1,00:22:08,791.0
114279,93977,user_43613,2024-04-29 16:14:16,28,1,1,1,0,MapTest,24947424,1,00:17:15,790.0


### Filtered submissions which do not have an evaluation

In [7]:
submissions_meta_ignored

Unnamed: 0_level_0,Unnamed: 0,User,Date,Challenge,Task,Valid,Competition,Val. Set,Title,Filesize,Public,Eval.time,Results size
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
113398,93096,user_36250,2024-03-23 15:31:02,28,1,1,1,0,ds-lp,10318653,1,00:00:36,642.0
113413,93111,user_52424,2024-03-24 22:55:02,28,1,1,1,0,Test,278970683,1,00:00:44,321.0
113463,93161,user_36250,2024-03-26 06:28:50,28,1,1,1,0,DS-LP,14289896,1,00:01:09,792.0
113751,93449,user_51723,2024-04-12 13:57:38,28,1,1,1,0,Test title,13838542,1,00:00:37,642.0
113761,93459,user_51436,2024-04-13 07:26:49,28,1,1,1,0,DiT,82748940,1,00:01:27,778.0
113777,93475,user_51436,2024-04-14 20:18:40,28,1,1,1,0,MViT,74655170,1,00:01:22,791.0
113786,93484,user_50898,2024-04-15 10:04:39,28,3,1,1,0,submit_test_v1,2,1,00:00:21,364.0
113826,93524,user_32125,2024-04-17 10:24:37,28,1,1,1,0,MapTest,1415605,1,00:00:16,395.0
113853,93551,user_50898,2024-04-18 10:37:15,28,3,1,1,0,submit_test_v1,207656779,1,00:01:26,454.0
113934,93632,user_43613,2024-04-20 06:33:23,28,1,1,1,0,MapTest,21974477,1,00:01:02,792.0


In [8]:
# count the number of submissions per user for each task
submissions_meta_valid.groupby(["User", "Task"]).size().unstack().fillna(0).astype(int)

Task,1,2,3,4
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
user_36250,1,1,1,1
user_43613,1,1,1,1
user_50898,1,0,1,0
user_51343,1,1,1,1
user_51436,3,0,0,0
user_53137,1,0,1,0
user_53700,1,0,0,0
user_9,1,1,1,1


In [9]:
submissions_meta_valid[submissions_meta_valid["User"] == "user_51436"]

Unnamed: 0_level_0,Unnamed: 0,User,Date,Challenge,Task,Valid,Competition,Val. Set,Title,Filesize,Public,Eval.time,Results size
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
114197,93895,user_51436,2024-04-27 16:10:36,28,1,1,1,0,ensem,96110879,1,00:22:08,791.0
114298,93996,user_51436,2024-04-29 17:25:26,28,1,1,1,0,dino_mvit,118762838,1,-01:36:52,791.0
114302,94000,user_51436,2024-04-29 17:44:41,28,1,1,1,0,dino_map,138257938,1,00:07:06,790.0


These multiple submissions from the same user indicate in their titles different approaches, so we keep them all.

## Create a clean list of valid submissions per task per subset

We found several duplicate and test submissions, and a participant apparently submitted the same file for two methods for task 1, IGN subset.

We produce here clean list we can refer to in later stages of the analysis.

In [10]:
from icdar24_maptext_analysis.loaders import list_submissions, VALID_TASKS, VALID_SUBSETS
from typing import List, Dict, Union
import pandas as pd

In [11]:
valid_submissions: list[dict[str,Union[int,str]]] = []  # each item has the form {"submission_id": int, "task": int, "subset": str}
dropped_count = 0
for task_id in VALID_TASKS:
    for subset in VALID_SUBSETS:
        for submission_id in list_submissions(task_id, subset):
            submission_id = int(submission_id)
            if submission_id not in valid_submission_ids or (task_id == 1 and subset == "ign" and submission_id in ("114298", )):  # extra filtering for an unexpected duplicate
                # print(f"Skipping submission {submission_id}")
                dropped_count += 1
                continue
            record = {"task": task_id, "subset": subset, "submission_id": submission_id}
            valid_submissions.append(record)
print(f"Dropped {dropped_count} submissions")
print(f"Kept {len(valid_submissions)} submissions")
valid_submissions = pd.DataFrame(valid_submissions)
valid_submissions.sample(5, random_state=0)

Dropped 14 submissions
Kept 44 submissions


Unnamed: 0,task,subset,submission_id
30,3,rumsey,114536
37,4,rumsey,113481
27,3,rumsey,113479
4,1,rumsey,114288
10,1,ign,113464


In [12]:
valid_submissions.to_csv("valid_submissions.csv", index=False)

In [13]:
# test re-loading
valid_submissions = pd.read_csv("valid_submissions.csv")
valid_submissions.sample(5, random_state=0)

Unnamed: 0,task,subset,submission_id
30,3,rumsey,114536
37,4,rumsey,113481
27,3,rumsey,113479
4,1,rumsey,114288
10,1,ign,113464


OK. Ready to continue.