In [None]:
import os

import pandas as pd
import numpy as np

from shapely import wkt

### Parameters

In [None]:
# Target share of areas of interest (AOIs) routed to each split. The three
# values are asserted to sum to 1 in the verification cell below.
percent_in_train = 0.8

# percent_in_test is only used by that sum check: the split loop sends every
# AOI that falls past the train and valid thresholds to the test set.
percent_in_valid = 0.1
percent_in_test = 0.1

# Input CSV; later cells read its 'filename', 'geometry' and 'id' columns.
df_path = '../../datasets/space_net_7/SN7_buildings_train_csvs/csvs/sn7_train_ground_truth_pix.csv'

# Directory that receives all_df.csv, train.csv, valid.csv and test.csv.
output_path = '../../building_detection_dataset/dataset_1'

### Verifications

In [None]:
# Create the output directory up front; exist_ok=True makes re-runs a no-op
# when the directory is already there.
os.makedirs(output_path, exist_ok=True)

In [None]:
# The three shares must cover the whole dataset. Compare with a tolerance
# rather than ==: binary floating point makes exact equality reject valid
# configurations (e.g. 0.6 + 0.3 + 0.1 evaluates to 0.9999999999999999).
assert np.isclose(percent_in_train + percent_in_valid + percent_in_test, 1.0), \
    'percent_in_train + percent_in_valid + percent_in_test must sum to 1'

### Read df

In [None]:
# Load the CSV at df_path into a DataFrame and preview its first rows.
df = pd.read_csv(df_path)
df.head()

In [None]:
def _aoi_from_filename(filename):
    """Return the part of `filename` that follows the first '_mosaic_' marker."""
    return filename.split('_mosaic_')[1]


# Tag every row with its area of interest so rows can later be grouped by AOI.
df['AOI_name'] = df['filename'].map(_aoi_from_filename)
df.head()

In [None]:
def _polygon_area(geometry_wkt):
    """Parse a WKT string and return the area of the resulting geometry."""
    return wkt.loads(geometry_wkt).area


# One area value per row, computed from the row's WKT geometry.
df['area'] = df['geometry'].map(_polygon_area)

In [None]:
# Display the frame, now including the 'area' column.
df

In [None]:
# Drop small polygons: keep only rows whose area is strictly greater than 50.
is_large_enough = df['area'].gt(50)
df = df.loc[is_large_enough]

In [None]:
# Display the frame after the area filter.
df

### Split dataset

In [None]:
# Fixed seed and a local generator, so Restart & Run All always reproduces the
# same split and the saved CSVs are stable across runs.
split_seed = 42
rng = np.random.default_rng(split_seed)

train = []
valid = []
test = []

# Whole AOI groups, not individual rows, are assigned to a split, so all rows
# sharing an AOI_name land in the same subset.
# Each AOI is drawn independently, so the realised proportions only
# approximate the targets, and with few AOIs a split can end up empty (the
# later pd.concat on an empty list raises ValueError).
for _, group in df.groupby('AOI_name'):
    coin_toss = rng.uniform()

    if coin_toss <= percent_in_train:
        train.append(group)
    elif coin_toss <= (percent_in_train + percent_in_valid):
        valid.append(group)
    else:
        # Everything past the train + valid thresholds goes to test.
        test.append(group)

In [None]:
# Number of AOI groups assigned to train, valid and test, in that order.
len(train), len(valid), len(test)

In [None]:
# Stack the per-AOI groups of each split into one DataFrame per split.
train_df, valid_df, test_df = [
    pd.concat(aoi_groups) for aoi_groups in (train, valid, test)
]

### Save datasets

In [None]:
def _save_csv(frame, file_name):
    """Write `frame` as CSV to `file_name` inside `output_path`."""
    frame.to_csv(os.path.join(output_path, file_name))


# The filtered full table first, then one file per split.
_save_csv(df, 'all_df.csv')

_save_csv(train_df, 'train.csv')
_save_csv(valid_df, 'valid.csv')
_save_csv(test_df, 'test.csv')

### Train

In [None]:
# Summary of the training split: distinct AOIs, distinct image files,
# distinct (AOI_name, id) pairs and total row count.
n_aois = len(train_df.drop_duplicates(subset=['AOI_name']))
print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(n_aois))
n_images = len(train_df.drop_duplicates(subset=['filename']))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(n_images))

n_houses = len(train_df.drop_duplicates(subset=['AOI_name', 'id']))
print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(n_houses))
n_polygons = len(train_df)
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(n_polygons))

### Valid

In [None]:
# Summary of the validation split: distinct AOIs, distinct image files,
# distinct (AOI_name, id) pairs and total row count.
n_aois = len(valid_df.drop_duplicates(subset=['AOI_name']))
print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(n_aois))
n_images = len(valid_df.drop_duplicates(subset=['filename']))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(n_images))

n_houses = len(valid_df.drop_duplicates(subset=['AOI_name', 'id']))
print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(n_houses))
n_polygons = len(valid_df)
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(n_polygons))

### Test

In [None]:
# Summary of the test split: distinct AOIs, distinct image files,
# distinct (AOI_name, id) pairs and total row count.
n_aois = len(test_df.drop_duplicates(subset=['AOI_name']))
print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(n_aois))
n_images = len(test_df.drop_duplicates(subset=['filename']))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(n_images))

n_houses = len(test_df.drop_duplicates(subset=['AOI_name', 'id']))
print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(n_houses))
n_polygons = len(test_df)
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(n_polygons))