In [8]:
import os
import json
import geojson
import pandas as pd
from tqdm import tqdm
from osgeo import gdal
from sklearn.model_selection import train_test_split

# custom functions
import sys
sys.path.append('../')
from utils.functions import grab_certain_file
from utils.create_jsons import geojson_to_json_pix_coords
# TODO delete functions.py in data_preperation folder. Use main utils instead.

'''
Read the NSO tiles and annotation geojsons, convert the lat/lon coordinates of each tile to pixel coordinates and save
the pixel coordinates into a .json file. If there is more than one .json file, they can be merged and saved as one via_regions.json.
Source: https://github.com/rl02898/detectron2-spacenet. JDP edits: some changes, plus saving the origins of each tile in the JSONs so the tiles can be stitched back together, and adding an empty JSON entry for tiles without annotations so they can be read in D2.
'''

In [9]:
### Path configuration (arguments shared by the cells below) ###

# Root folder of the NSO dataset; every other path is a sub-folder of it
nso_path = "../NSO"

# Output folders, one per dataset split
train_path, test_path, val_path = (
    os.path.join(nso_path, split_name) for split_name in ("train", "test", "val")
)

# Input folders: annotation geojsons and the small image tiles
geojson_path, small_tiles_path = (
    os.path.join(nso_path, folder_name) for folder_name in ("geojsons", "NSO_small_tiles")
)

In [3]:
# Split the tiles into train/val/test sets. The fixed seed keeps the splits
# reproducible for the next script.
RANDOM_SEED = 560

# Variant WITHOUT a test set (kept for reference):
#nso_images = grab_certain_file(".tif", small_tiles_path)
#train, val = train_test_split(nso_images, test_size=0.2, random_state=RANDOM_SEED)

# Variant WITH a test set: hold out 20 % of the tiles for testing, then take
# 25 % of the remaining 80 % (i.e. 20 % of all tiles) for validation.
nso_images = grab_certain_file(".tif", small_tiles_path)
train_val, test = train_test_split(nso_images, test_size=0.20, random_state=RANDOM_SEED)
train, val = train_test_split(train_val, test_size=0.25, random_state=RANDOM_SEED)

In [5]:
# Create the Detectron2 JSONs for each split, writing into that split's folder
for split_tiles, split_path in ((train, train_path), (test, test_path), (val, val_path)):
    geojson_to_json_pix_coords(split_tiles, small_tiles_path, geojson_path, split_path)

Creating JSONs for Detectron2 on ../NSO/train: 100%|██████████| 78324/78324 [53:46<00:00, 24.27it/s]  
Creating JSONs for Detectron2 on ../NSO/test: 100%|██████████| 26108/26108 [16:50<00:00, 25.84it/s]
Creating JSONs for Detectron2 on ../NSO/val: 100%|██████████| 26108/26108 [15:47<00:00, 27.56it/s]


In [8]:
# Merge the JSON file(s) of every split into one via_region_data file.
# Can be skipped when a split has only one .json file, but then that file has to be renamed instead.
for d in ["train", "test", "val"]:
    split_dir = os.path.join(nso_path, d)
    jsons = [os.path.join(split_dir, "nso_with_empty_annotations.json")]

    # Collect the entries of all input files in one dictionary (later files win on duplicate keys)
    result = {}
    for json_path in jsons:
        with open(json_path, "r") as f:
            result.update(json.load(f))

    via_region_p = os.path.join(split_dir, "via_region_data_with_empty_annotations.json")
    with open(via_region_p, "w") as out_file:
        json.dump(result, out_file)

    print(f"Done creating JSONs {d}")
    
    

Done creating JSONs train
Done creating JSONs test
Done creating JSONs val


In [10]:
import pandas as pd
import json

# Check that the regions were written correctly: reload the merged
# via_region_data file of every split into a DataFrame (one row per tile,
# because orient='index' turns every top-level JSON key into a row).
#
# The file name has to be the one written by the merge step
# ("via_region_data_with_empty_annotations.json"); the previous
# "..._with_no_annotations.json" name is not produced anywhere in this
# notebook, so a fresh run failed with FileNotFoundError.
VIA_REGION_FILE = "via_region_data_with_empty_annotations.json"

# Paths are spelled out (instead of using nso_path) so this cell can also be
# run on its own. The names train/val/test are deliberately NOT reused here:
# they hold the tile lists of the split and must not be replaced by strings.
pths = [f"../NSO/{split_name}/{VIA_REGION_FILE}" for split_name in ("train", "val", "test")]

dfs = [pd.read_json(json_path, orient='index') for json_path in pths]

# Same order as pths: train, val, test
train_df, val_df, test_df = dfs

In [22]:
# Count the images with and without regions and split train_df into two
# DataFrames: tiles without annotations and tiles with annotations.

# Rows collected per group while walking over train_df once
rows_empty_annot = []
rows_annot = []

for _, tile_row in train_df.iterrows():
    # An empty 'regions' entry is falsy -> the tile has no annotations
    target_rows = rows_annot if tile_row['regions'] else rows_empty_annot
    target_rows.append(tile_row)

# i = number of tiles without regions, e = number of tiles with regions
i = len(rows_empty_annot)
e = len(rows_annot)
print("no regions: {}, with regions: {}".format(i,e))

# Create new DataFrames from the collected rows
df_empty_annot = pd.DataFrame(rows_empty_annot, columns=train_df.columns)
df_annot = pd.DataFrame(rows_annot, columns=train_df.columns)
df_empty_annot.head(5)

no regions: 78028, with regions: 296


Unnamed: 0,file_ref,size,filename,base64_img_data,file_attributes,regions,origin_x,origin_y
66_20220903_111726_SV1-04_SV_RD_8bit_RGB_50cm_HoekVanHolland_18000_23000.png,,3107342,66_20220903_111726_SV1-04_SV_RD_8bit_RGB_50cm_...,,{},{},73790.0,442478.0
154_20220906_104537_SV1-03_SV_RD_8bit_RGB_50cm_Hardenberg_12000_26000.png,,3007320,154_20220906_104537_SV1-03_SV_RD_8bit_RGB_50cm...,,{},{},242684.0,518504.0
59_20221102_105437_SV2-01_SV_RD_8bit_RGB_50cm_Echteld_13000_15000.png,,3494018,59_20221102_105437_SV2-01_SV_RD_8bit_RGB_50cm_...,,{},{},160856.0,445873.0
79_20220922_105204_SV1-04_SV_RD_8bit_RGB_50cm_Coevorden_16000_7000.png,,3132536,79_20220922_105204_SV1-04_SV_RD_8bit_RGB_50cm_...,,{},{},246618.0,528318.0
52_20221004_110216_SV1-03_SV_RD_8bit_RGB_50cm_WijkBijDuurstede_13000_24000.png,,3199257,52_20221004_110216_SV1-03_SV_RD_8bit_RGB_50cm_...,,{},{},154632.0,444063.5


In [24]:
# Remove the next line if the notebook is run from the beginning (train_path is then already defined)
train_path = "../NSO/train"

# Build training subsets that keep ALL annotated tiles plus a share of the
# tiles without annotations, and save each subset as its own JSON file.
from sklearn.model_selection import train_test_split
import os

# Fractions of the empty-annotation tiles that are left OUT of a subset
test_sizes = [0.25, 0.5, 0.75]

# Maps the subset name (e.g. "train_75") to its DataFrame
train_dic = {}

for perc in test_sizes:
    # Name the subset after the percentage of empty tiles that is kept.
    # round() instead of int(): int() truncates, so float rounding error
    # (e.g. 0.57 * 100 == 56.99999999999999) could give a wrong name.
    train_name = f"train_{round((1 - perc) * 100)}"
    # Only the kept part of the empty tiles is used; the held-out part is discarded
    train_set, _ = train_test_split(df_empty_annot, test_size=perc, random_state=42)
    train_dic[train_name] = pd.concat([df_annot, train_set])
    # orient='index' keys every record by its DataFrame index label
    data = train_dic[train_name].to_json(orient='index')
    with open(os.path.join(train_path, f"via_region_data_{train_name}_empty_annotations.json"), "w") as outfile:
        # to_json() already returns a JSON string, so write it as-is;
        # json.dump would encode that string a second time
        outfile.write(data)

# Summary derived from train_dic (smallest subset first), so it stays valid
# when test_sizes is changed instead of failing on hard-coded keys
print(", ".join(
    f"{name}: {len(subset)}"
    for name, subset in sorted(train_dic.items(), key=lambda item: len(item[1]))
))

train_25: 19803, train_50 39310, train_75: 58817
