In [209]:
import os
import sys
import json
import numpy as np
import pandas as pd

sys.path.append("../helpers/")
from datasets import *
from filtering import *

# Directory holding the input files read by this notebook (parquet tables, image metadata).
data_path = "data"
# Directory where the exported VIA/COCO annotation files are written and re-read.
datasets_path = "datasets"

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Datasets for table detection
In this notebook, NLL, NLL-revised, NLL-filtered, and NLL-tag are constructed.

In [56]:
# Read the two source tables from the data directory.
nll_parquet = os.path.join(data_path, "NLL_lite.parquet")
nll_tag_parquet = os.path.join(data_path, "NLL_tag_lite.parquet")

NLL = pd.read_parquet(nll_parquet)
NLL_tag = pd.read_parquet(nll_tag_parquet)

In [80]:
# Per-image metadata, read from the current working directory.
# NOTE(review): a later cell loads a file with the same name from `data_path`
# instead -- confirm which location is the intended one.
metadata_json = "NLL_metadata_images.json"
with open(metadata_json, "r") as f:
    NLL_metadata_images = json.load(f)

## Pre-processing

In [58]:
# Every item in NLL is labelled as a generic table ...
NLL['tag'] = 'table'

# ... except the ones that were manually tagged 'not a table' in NLL_tag,
# which are removed (matched on the index labels).
not_table_ids = NLL_tag.index[NLL_tag['tag'] == 'not a table']
is_not_table = NLL.index.isin(not_table_ids)
NLL = NLL.loc[~is_not_table]

print(NLL.shape)
NLL.head()

(58007, 17)


Unnamed: 0,tb_coords,pid,iiifURL,iiifFragmentURL,year,month,day,journal,edition,page,meta_issue_id,x,y,width,height,area,tag
luxzeit1858-1858-03-10-a-i0018,"{'height': [812], 'width': [1375], 'x': [153],...",luxzeit1858-1858-03-10-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1858,3,10,luxzeit1858,a,1,luxzeit1858-1858-03-10-a,153,1320,1375,812,1116500,table
avenirgdl-1871-07-25-a-i0033,"{'height': [546], 'width': [610], 'x': [1418],...",avenirgdl-1871-07-25-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1871,7,25,avenirgdl,a,4,avenirgdl-1871-07-25-a,1418,6225,610,546,333060,table
lunion-1867-01-10-a-i0017,"{'height': [224], 'width': [431], 'x': [2220],...",lunion-1867-01-10-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1867,1,10,lunion,a,1,lunion-1867-01-10-a,2220,1021,431,224,96544,table
luxwort-1867-06-07-a-i0004,"{'height': [196], 'width': [3334], 'x': [50], ...",luxwort-1867-06-07-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1867,6,7,luxwort,a,4,luxwort-1867-06-07-a,50,4922,3334,196,653464,table
lunion-1867-08-07-a-i0023,"{'height': [236], 'width': [428], 'x': [2179],...",lunion-1867-08-07-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1867,8,7,lunion,a,1,lunion-1867-08-07-a,2179,1014,428,236,101008,table


Since NLL-revised is the result of a manual annotation done with the VGG Image Annotator (VIA), the *starting* point is a .json file rather than a DataFrame.
The annotations were made on downscaled images, so we upscale them to match the original size of the images.

In [60]:
# Load the manually revised VIA annotations and the per-image metadata.
with open("via_annotations_NLL_revised.json", "r") as f:
    via_annotations_NLL_revised = json.load(f)

with open(os.path.join(data_path, "NLL_metadata_images.json"), "r") as f:
    NLL_metadata_images = json.load(f)

# Rescale every annotated region from the downscaled image to the original size.
# The loop mutates the freshly loaded annotations in place; because the files are
# re-read above, re-running the whole cell does not scale twice.
for k, v in via_annotations_NLL_revised.items():
    # Drop the first 9 characters of the stored filename
    # (presumably a fixed path prefix -- TODO confirm).
    v['filename'] = v['filename'][9:]

    regions = v['regions']
    if not regions:
        continue  # nothing to scale, so no metadata lookup is needed for this page

    # One factor per page, derived from the height ratio and applied to both axes
    # (assumes the downscaling preserved the aspect ratio -- TODO confirm).
    scale = NLL_metadata_images[k]['height']/NLL_metadata_images[k]['resized_height']

    for region in regions:
        c = region['shape_attributes']
        if c['name'] == 'rect':
            c['x'] = int(scale*c['x'])
            c['y'] = int(scale*c['y'])
            c['width'] = int(scale*c['width'])
            c['height'] = int(scale*c['height'])
        elif c['name'] == 'polygon':
            c['all_points_x'] = [int(scale*x) for x in c['all_points_x']]
            # Fix: the y coordinates must be derived from the original y points,
            # not from the (already rescaled) x points.
            c['all_points_y'] = [int(scale*y) for y in c['all_points_y']]

In [61]:
# Build NLL-filtered from NLL with the project helper `filter_problematic_clusters`
# (defined outside this notebook; its filtering criteria are not visible here).
NLL_filtered = filter_problematic_clusters(NLL)

# Show the resulting size and a preview of the first rows.
print(NLL_filtered.shape)
NLL_filtered.head()

(28094, 17)


Unnamed: 0,tb_coords,pid,iiifURL,iiifFragmentURL,year,month,day,journal,edition,page,meta_issue_id,x,y,width,height,area,tag
luxzeit1858-1858-03-10-a-i0018,"{'height': [812], 'width': [1375], 'x': [153],...",luxzeit1858-1858-03-10-a-p0001,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1858,3,10,luxzeit1858,a,1,luxzeit1858-1858-03-10-a,153,1320,1375,812,1116500,table
lunion-1867-07-11-a-i0025,"{'height': [1670], 'width': [779], 'x': [226],...",lunion-1867-07-11-a-p0002,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1867,7,11,lunion,a,2,lunion-1867-07-11-a,226,1718,779,1670,1300930,table
luxland-1986-05-09-a-i0096,"{'height': [51, 29, 703], 'width': [1653, 256,...",luxland-1986-05-09-a-p0009,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1986,5,9,luxland,a,9,luxland-1986-05-09-a,720,4251,1963,764,1499732,table
luxwort-1868-05-29-a-i0005,"{'height': [1071], 'width': [1634], 'x': [1766...",luxwort-1868-05-29-a-p0003,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1868,5,29,luxwort,a,3,luxwort-1868-05-29-a,1766,3376,1634,1071,1750014,table
luxzeit1858-1858-08-21-a-i0023,"{'height': [1289], 'width': [716], 'x': [1289]...",luxzeit1858-1858-08-21-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1858,8,21,luxzeit1858,a,4,luxzeit1858-1858-08-21-a,1289,818,716,1289,922924,table


In [62]:
# Keep only the items that are actual tables. The explicit copy makes NLL_tag an
# independent frame, so the label assignments below do not write into a
# filtered slice of the original (which triggers SettingWithCopyWarning).
NLL_tag = NLL_tag[NLL_tag['tag'] != 'not a table'].copy()

# Harmonise the manual labels:
# - 'currency rates' and 'stock' are merged into 'exchange'
# - 'sports results' is renamed to 'sport results'
# - the generic 'table' label becomes 'miscellaneous'
NLL_tag.loc[NLL_tag['tag'].isin(['currency rates', 'stock']), 'tag'] = 'exchange'
NLL_tag.loc[NLL_tag['tag'] == 'sports results', 'tag'] = 'sport results'
NLL_tag.loc[NLL_tag['tag'] == 'table', 'tag'] = 'miscellaneous'

print(NLL_tag.shape)
NLL_tag.head()

(3783, 19)


Unnamed: 0,tag,date_tag,auto_tag,tb_coords,pid,iiifURL,iiifFragmentURL,year,month,day,journal,edition,page,meta_issue_id,x,y,width,height,area
luxwort-1891-06-22-a-i0059,transport schedule,2021-11-04T18:49:59.000Z,False,"{'height': [70, 227, 987], 'width': [284, 543,...",luxwort-1891-06-22-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1891,6,22,luxwort,a,4,luxwort-1891-06-22-a,3109,4926,544,1298,706112
luxwort-1882-07-14-a-i0026,transport schedule,2021-11-04T18:06:27.000Z,False,"{'height': [1185], 'width': [1111], 'x': [20],...",luxwort-1882-07-14-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1882,7,14,luxwort,a,4,luxwort-1882-07-14-a,20,3132,1111,1185,1316535
luxwort-1929-05-29-a-i0038,exchange,2021-10-12T18:11:31.000Z,False,"{'height': [225], 'width': [1002], 'x': [1543]...",luxwort-1929-05-29-a-p0007,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1929,5,29,luxwort,a,7,luxwort-1929-05-29-a,1543,3532,1002,225,225450
luxwort-1941-05-29-a-i0063,exchange,2021-10-13T16:29:30.000Z,False,"{'height': [587], 'width': [837], 'x': [2759],...",luxwort-1941-05-29-a-p0004,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1941,5,29,luxwort,a,4,luxwort-1941-05-29-a,2759,2937,837,587,491319
luxwort-1895-06-12-a-i0005,exchange,2021-10-12T18:35:50.000Z,False,"{'height': [2988], 'width': [894], 'x': [3135]...",luxwort-1895-06-12-a-p0003,https://iiif.eluxemburgensia.lu/image/iiif/2/a...,[https://iiif.eluxemburgensia.lu/image/iiif/2/...,1895,6,12,luxwort,a,3,luxwort-1895-06-12-a,3135,2815,894,2988,2671272


## NLL: all tables

In [63]:
# Summarise NLL per tag (project helper; at this point the only tag is 'table').
print_tag_distributions(NLL)

table  58007 items (100.0%) on 28496 pages on 22734 issues


In [None]:
# Split NLL into train / validation / test with the project helper; the list holds
# the requested fractions in that order (0.7 / 0.2 / 0.1).
# NOTE(review): whether the helper is seeded is not visible here -- confirm reproducibility.
NLL_train, NLL_val, NLL_test = stratified_split_by_page_id(NLL, [0.7, 0.2, 0.1])

In [243]:
# Report the size of each NLL split (project helper).
print_splits(NLL_train, NLL_val, NLL_test)

Dataset: 58007 items on 28496 pages.
-- Train set: 40435 items on 19947 pages.
-- Validation set: 11706 items on 5700 pages.
-- Test set: 5866 items on 2849 pages.


### Export
#### VIA format

In [233]:
# Convert each NLL split to the VIA annotation format (train, val, test in that order).
(
    via_annotations_NLL_train,
    via_annotations_NLL_val,
    via_annotations_NLL_test,
) = map(convert_df_to_via, (NLL_train, NLL_val, NLL_test))

In [237]:
filename = "via_annotations_NLL"

# Write one JSON file per split into the datasets directory.
for suffix, annotations in (
    ("_train.json", via_annotations_NLL_train),
    ("_val.json", via_annotations_NLL_val),
    ("_test.json", via_annotations_NLL_test),
):
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [278]:
filename = "via_annotations_NLL"

# Re-load the exported VIA splits from the datasets directory.
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(train_json, "r") as f:
    via_annotations_NLL_train = json.load(f)

with open(val_json, "r") as f:
    via_annotations_NLL_val = json.load(f)

with open(test_json, "r") as f:
    via_annotations_NLL_test = json.load(f)

#### COCO format

In [280]:
# Convert each NLL split to the COCO format, using the shared image metadata.
(
    coco_annotations_NLL_train,
    coco_annotations_NLL_val,
    coco_annotations_NLL_test,
) = [
    convert_df_to_coco(split_df, NLL_metadata_images)
    for split_df in (NLL_train, NLL_val, NLL_test)
]

In [281]:
filename = "coco_annotations_NLL"

# Write one COCO JSON file per split into the datasets directory.
coco_by_suffix = {
    "_train.json": coco_annotations_NLL_train,
    "_val.json": coco_annotations_NLL_val,
    "_test.json": coco_annotations_NLL_test,
}
for suffix, annotations in coco_by_suffix.items():
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [85]:
filename = "coco_annotations_NLL"

# Re-load the exported COCO splits from the datasets directory.
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(train_json, "r") as f:
    coco_annotations_NLL_train = json.load(f)

with open(val_json, "r") as f:
    coco_annotations_NLL_val = json.load(f)

with open(test_json, "r") as f:
    coco_annotations_NLL_test = json.load(f)

## NLL-revised: revised ground truth

In [86]:
# `random` is not imported in the notebook's import cell, so import it here
# explicitly instead of relying on it being in scope by accident.
import random

# Fixed seed so the 60/20/20 page split can be regenerated exactly.
# Sampling was previously unseeded, so any seed is as valid as another.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# The candidates are sorted before sampling: the iteration order of a set of
# strings changes between interpreter runs, which would defeat the seed.
all_keys = set(via_annotations_NLL_revised.keys())
num_keys = len(all_keys)
train_keys = split_rng.sample(sorted(all_keys), int(num_keys*0.6))
all_keys = all_keys.difference(train_keys)
val_keys = split_rng.sample(sorted(all_keys), int(num_keys*0.2))
all_keys = all_keys.difference(val_keys)
test_keys = all_keys.copy()  # every page not drawn for train/val goes to test

# Restore the full key set; the summary cell below reports on it.
all_keys = set(via_annotations_NLL_revised.keys())

In [222]:
def get_num_tables(keys):
    """Return the total number of annotated regions over the given page keys.

    Looks each key up in the module-level `via_annotations_NLL_revised` dict and
    adds up the lengths of the pages' 'regions' lists.
    """
    total = 0
    for key in keys:
        total += len(via_annotations_NLL_revised[key]['regions'])
    return total

print(f"Dataset: {get_num_tables(all_keys)} items on {len(all_keys)} pages.")
print(f"-- Train set: {get_num_tables(train_keys)} items on {len(train_keys)} pages.")
print(f"-- Validation set: {get_num_tables(val_keys)} items on {len(val_keys)} pages.")
print(f"-- Test set: {get_num_tables(test_keys)} items on {len(test_keys)} pages.")

Dataset: 5445 items on 1500 pages.
-- Train set: 3412 items on 900 pages.
-- Validation set: 985 items on 300 pages.
-- Test set: 1048 items on 300 pages.


### Export
#### VIA format

In [88]:
# Build one VIA dict per split, keeping the page order of the full annotation dict.
via_annotations_NLL_revised_train = {
    page_id: via_annotations_NLL_revised[page_id]
    for page_id in via_annotations_NLL_revised
    if page_id in train_keys
}
via_annotations_NLL_revised_val = {
    page_id: via_annotations_NLL_revised[page_id]
    for page_id in via_annotations_NLL_revised
    if page_id in val_keys
}
via_annotations_NLL_revised_test = {
    page_id: via_annotations_NLL_revised[page_id]
    for page_id in via_annotations_NLL_revised
    if page_id in test_keys
}

In [148]:
filename = "via_annotations_NLL_revised"

# Write the full revised annotation set followed by one file per split.
for suffix, annotations in (
    (".json", via_annotations_NLL_revised),
    ("_train.json", via_annotations_NLL_revised_train),
    ("_val.json", via_annotations_NLL_revised_val),
    ("_test.json", via_annotations_NLL_revised_test),
):
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [275]:
filename = "via_annotations_NLL_revised"

# Re-load the full revised annotation set and its three splits from disk.
full_json = os.path.join(datasets_path, filename + ".json")
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(full_json, "r") as f:
    via_annotations_NLL_revised = json.load(f)

with open(train_json, "r") as f:
    via_annotations_NLL_revised_train = json.load(f)

with open(val_json, "r") as f:
    via_annotations_NLL_revised_val = json.load(f)

with open(test_json, "r") as f:
    via_annotations_NLL_revised_test = json.load(f)

#### COCO format

In [276]:
# Convert the full revised set and each of its splits from VIA to COCO.
(
    coco_annotations_NLL_revised,
    coco_annotations_NLL_revised_train,
    coco_annotations_NLL_revised_val,
    coco_annotations_NLL_revised_test,
) = [
    convert_via_to_coco(via_annotations, NLL_metadata_images)
    for via_annotations in (
        via_annotations_NLL_revised,
        via_annotations_NLL_revised_train,
        via_annotations_NLL_revised_val,
        via_annotations_NLL_revised_test,
    )
]

In [277]:
filename = "coco_annotations_NLL_revised"

# Write the full COCO annotation set followed by one file per split.
coco_by_suffix = {
    ".json": coco_annotations_NLL_revised,
    "_train.json": coco_annotations_NLL_revised_train,
    "_val.json": coco_annotations_NLL_revised_val,
    "_test.json": coco_annotations_NLL_revised_test,
}
for suffix, annotations in coco_by_suffix.items():
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [224]:
filename = "coco_annotations_NLL_revised"

# Re-load the full COCO annotation set and its three splits from disk.
full_json = os.path.join(datasets_path, filename + ".json")
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(full_json, "r") as f:
    coco_annotations_NLL_revised = json.load(f)

with open(train_json, "r") as f:
    coco_annotations_NLL_revised_train = json.load(f)

with open(val_json, "r") as f:
    coco_annotations_NLL_revised_val = json.load(f)

with open(test_json, "r") as f:
    coco_annotations_NLL_revised_test = json.load(f)

## NLL-filtered: inconsistencies filtered out

In [96]:
# Summarise NLL-filtered per tag (project helper).
print_tag_distributions(NLL_filtered)

table  28094 items (100.0%) on 16987 pages on 15968 issues


In [97]:
# Pages that have a revised ground truth are reserved for the test set.
revised_pids = set(via_annotations_NLL_revised.keys())
total_size = len(NLL_filtered.pid.unique())

in_revised = NLL_filtered['pid'].isin(revised_pids)
NLL_filtered_test = NLL_filtered[in_revised]
test_size = len(NLL_filtered_test.pid.unique())

# NOTE: NLL_filtered is overwritten with the remaining (non-revised) rows, so
# running this cell a second time starts from the already reduced frame.
NLL_filtered = NLL_filtered[~in_revised]
remaining_pages = len(NLL_filtered.pid.unique())

# Fractions of the remaining pages, chosen so that the overall page counts target
# 70% / 20% / 10% of the original total once the reserved test pages are added back.
test_split = int(0.1*total_size - test_size)/remaining_pages
train_split = int(0.7*total_size)/remaining_pages
val_split = int(0.2*total_size)/remaining_pages

In [None]:
# Split the remaining (non-revised) pages with the project helper, using the
# fractions computed in the previous cell; the helper's own report is disabled.
NLL_filtered_train, NLL_filtered_val, NLL_filtered_test_2 = stratified_split_by_page_id(
    NLL_filtered, 
    [train_split, val_split, test_split],
    print_splits=False)
# Final test set = pages reserved earlier + the test part of this split.
# NOTE: NLL_filtered_test is overwritten with the concatenation, so re-running this
# cell alone appends the second part again.
NLL_filtered_test = pd.concat([NLL_filtered_test, NLL_filtered_test_2])

In [210]:
# Report the size of each NLL-filtered split (project helper).
print_splits(NLL_filtered_train, NLL_filtered_val, NLL_filtered_test)

Dataset: 26628 items on 16087 pages.
-- Train set: 19641 items on 11890 pages.
-- Validation set: 5640 items on 3399 pages.
-- Test set: 1347 items on 798 pages.


### Export
#### VIA format

In [211]:
# Convert each NLL-filtered split to the VIA annotation format.
(
    via_annotations_NLL_filtered_train,
    via_annotations_NLL_filtered_val,
    via_annotations_NLL_filtered_test,
) = map(convert_df_to_via, (NLL_filtered_train, NLL_filtered_val, NLL_filtered_test))

In [213]:
filename = "via_annotations_NLL_filtered"

# Write one VIA JSON file per split into the datasets directory.
for suffix, annotations in (
    ("_train.json", via_annotations_NLL_filtered_train),
    ("_val.json", via_annotations_NLL_filtered_val),
    ("_test.json", via_annotations_NLL_filtered_test),
):
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [271]:
filename = "via_annotations_NLL_filtered"

# Re-load the exported VIA splits from the datasets directory.
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(train_json, "r") as f:
    via_annotations_NLL_filtered_train = json.load(f)

with open(val_json, "r") as f:
    via_annotations_NLL_filtered_val = json.load(f)

with open(test_json, "r") as f:
    via_annotations_NLL_filtered_test = json.load(f)

#### COCO format

In [273]:
# Convert each NLL-filtered split to the COCO format, using the shared image metadata.
(
    coco_annotations_NLL_filtered_train,
    coco_annotations_NLL_filtered_val,
    coco_annotations_NLL_filtered_test,
) = [
    convert_df_to_coco(split_df, NLL_metadata_images)
    for split_df in (NLL_filtered_train, NLL_filtered_val, NLL_filtered_test)
]

In [274]:
filename = "coco_annotations_NLL_filtered"

# Write one COCO JSON file per split into the datasets directory.
coco_by_suffix = {
    "_train.json": coco_annotations_NLL_filtered_train,
    "_val.json": coco_annotations_NLL_filtered_val,
    "_test.json": coco_annotations_NLL_filtered_test,
}
for suffix, annotations in coco_by_suffix.items():
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [218]:
filename = "coco_annotations_NLL_filtered"

# Re-load the exported COCO splits from the datasets directory.
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(train_json, "r") as f:
    coco_annotations_NLL_filtered_train = json.load(f)

with open(val_json, "r") as f:
    coco_annotations_NLL_filtered_val = json.load(f)

with open(test_json, "r") as f:
    coco_annotations_NLL_filtered_test = json.load(f)

## NLL-tag: manually classified tables

In [108]:
# Summarise NLL-tag per tag before the rare classes are merged (project helper).
print_tag_distributions(NLL_tag)

cinema                  4 items (0.1%) on    2 pages on    2 issues
election               49 items (1.3%) on   11 pages on   11 issues
exchange              444 items (11.7%) on  312 pages on  312 issues
food prices           177 items (4.7%) on  143 pages on  143 issues
lotto                   8 items (0.2%) on    8 pages on    8 issues
miscellaneous         365 items (9.6%) on  217 pages on  208 issues
radio                   3 items (0.1%) on    3 pages on    3 issues
sport results         403 items (10.7%) on   41 pages on   41 issues
transport schedule   2301 items (60.8%) on  408 pages on  407 issues
weather                29 items (0.8%) on   28 pages on   28 issues


In [109]:
# Fold the rarest classes into 'miscellaneous', then show the updated distribution.
rare_tags = {'cinema', 'election', 'lotto', 'radio'}
is_rare = NLL_tag['tag'].isin(rare_tags)
NLL_tag.loc[is_rare, 'tag'] = 'miscellaneous'

print_tag_distributions(NLL_tag)

exchange              444 items (11.7%) on  312 pages on  312 issues
food prices           177 items (4.7%) on  143 pages on  143 issues
miscellaneous         429 items (11.3%) on  237 pages on  228 issues
sport results         403 items (10.7%) on   41 pages on   41 issues
transport schedule   2301 items (60.8%) on  408 pages on  407 issues
weather                29 items (0.8%) on   28 pages on   28 issues


In [None]:
# Split NLL-tag into train / validation / test with the project helper; the list
# holds the requested fractions in that order (0.6 / 0.2 / 0.2).
# NOTE(review): whether the helper is seeded is not visible here -- confirm reproducibility.
NLL_tag_train, NLL_tag_val, NLL_tag_test = stratified_split_by_page_id(NLL_tag, [0.6, 0.2, 0.2])

In [219]:
# Report the size of each NLL-tag split (project helper).
print_splits(NLL_tag_train, NLL_tag_val, NLL_tag_test)

Dataset: 3783 items on 984 pages.
-- Train set: 2354 items on 605 pages.
-- Validation set: 549 items on 173 pages.
-- Test set: 880 items on 206 pages.


In [196]:
# Print the tag distribution of every split, separated by a blank line.
split_reports = (
    ("__________________Train set__________________", NLL_tag_train),
    ("_______________Validation set________________", NLL_tag_val),
    ("__________________Test set___________________", NLL_tag_test),
)
for report_idx, (banner, split_df) in enumerate(split_reports):
    if report_idx > 0:
        print()
    print(banner)
    print_tag_distributions(split_df)

__________________Train set__________________
exchange              265 items (11.3%) on  188 pages on  188 issues
food prices           108 items (4.6%) on   86 pages on   86 issues
miscellaneous         276 items (11.7%) on  145 pages on  142 issues
sport results         238 items (10.1%) on   25 pages on   25 issues
transport schedule   1452 items (61.7%) on  245 pages on  245 issues
weather                15 items (0.6%) on   14 pages on   14 issues

_______________Validation set________________
exchange               79 items (14.4%) on   55 pages on   55 issues
food prices            30 items (5.5%) on   28 pages on   28 issues
miscellaneous          82 items (14.9%) on   48 pages on   48 issues
sport results          37 items (6.7%) on    7 pages on    7 issues
transport schedule    318 items (57.9%) on   68 pages on   68 issues
weather                 3 items (0.5%) on    3 pages on    3 issues

__________________Test set___________________
exchange              100 items (11.4

### Export
#### VIA format

In [197]:
# Convert each NLL-tag split to the VIA annotation format.
(
    via_annotations_NLL_tag_train,
    via_annotations_NLL_tag_val,
    via_annotations_NLL_tag_test,
) = map(convert_df_to_via, (NLL_tag_train, NLL_tag_val, NLL_tag_test))

In [199]:
filename = "via_annotations_NLL_tag"

# Write one VIA JSON file per split into the datasets directory.
for suffix, annotations in (
    ("_train.json", via_annotations_NLL_tag_train),
    ("_val.json", via_annotations_NLL_tag_val),
    ("_test.json", via_annotations_NLL_tag_test),
):
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [266]:
filename = "via_annotations_NLL_tag"

# Re-load the exported VIA splits from the datasets directory.
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(train_json, "r") as f:
    via_annotations_NLL_tag_train = json.load(f)

with open(val_json, "r") as f:
    via_annotations_NLL_tag_val = json.load(f)

with open(test_json, "r") as f:
    via_annotations_NLL_tag_test = json.load(f)

#### COCO format

In [269]:
# Convert each NLL-tag split to the COCO format, using the shared image metadata.
(
    coco_annotations_NLL_tag_train,
    coco_annotations_NLL_tag_val,
    coco_annotations_NLL_tag_test,
) = [
    convert_df_to_coco(split_df, NLL_metadata_images)
    for split_df in (NLL_tag_train, NLL_tag_val, NLL_tag_test)
]

In [270]:
filename = "coco_annotations_NLL_tag"

# Write one COCO JSON file per split into the datasets directory.
coco_by_suffix = {
    "_train.json": coco_annotations_NLL_tag_train,
    "_val.json": coco_annotations_NLL_tag_val,
    "_test.json": coco_annotations_NLL_tag_test,
}
for suffix, annotations in coco_by_suffix.items():
    with open(os.path.join(datasets_path, filename + suffix), "w") as f:
        json.dump(annotations, f)

In [204]:
filename = "coco_annotations_NLL_tag"

# Re-load the exported COCO splits from the datasets directory.
train_json = os.path.join(datasets_path, filename + "_train.json")
val_json = os.path.join(datasets_path, filename + "_val.json")
test_json = os.path.join(datasets_path, filename + "_test.json")

with open(train_json, "r") as f:
    coco_annotations_NLL_tag_train = json.load(f)

with open(val_json, "r") as f:
    coco_annotations_NLL_tag_val = json.load(f)

with open(test_json, "r") as f:
    coco_annotations_NLL_tag_test = json.load(f)