In [None]:
import os

import pandas as pd
import numpy as np

from shapely import wkt

from sklearn.model_selection import train_test_split

### Parameters

In [None]:
# Fractions of the data intended for each split; a later cell asserts they sum to 1.
percent_in_train = 0.8

percent_in_valid = 0.1
percent_in_test = 0.1

# Ground-truth CSV; later cells read its `filename`, `geometry` and `id` columns.
df_path = '../../datasets/space_net_7/SN7_buildings_train_csvs/csvs/sn7_train_ground_truth_pix.csv'

# Directory for the generated dataset files (created by the next code cell if missing).
output_path = '../../building_detection_dataset/dataset_2'

### Verifications

In [None]:
# Create the output directory (including missing parents); no error if it already exists.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Compare with a tolerance: the fractions are binary floats, so an exact `== 1`
# check can reject a valid split purely because of rounding in the sum.
split_total = percent_in_train + percent_in_valid + percent_in_test
assert np.isclose(split_total, 1.0), 'split fractions must sum to 1, got {}'.format(split_total)

### Read df

In [None]:
# Load the ground-truth table from `df_path` and preview the first rows.
df = pd.read_csv(df_path)
df.head()

In [None]:
# Area-of-interest name: the segment of the file name that follows '_mosaic_'.
df['AOI_name'] = [fname.split('_mosaic_')[1] for fname in df['filename']]
df.head()

In [None]:
# Number of distinct image files in the table (a missing name would count once).
df['filename'].nunique(dropna=False)

In [None]:
# Parse each WKT geometry string and store the resulting shape's area.
df['area'] = df['geometry'].map(lambda geom_wkt: wkt.loads(geom_wkt).area)

In [None]:
# Display the frame, now including the `area` column.
df

In [None]:
# Keep only polygons whose area is strictly greater than 50.
df = df.loc[df['area'].gt(50)]

In [None]:
# Display the frame after the area filter.
df

In [None]:
# NOTE(review): ratio of two hard-coded counts; presumably row counts copied from
# earlier cell outputs — TODO confirm what they are and compute them from the data.
195274/6664652

### Split dataset

In [None]:
# Assign each AOI, as a whole group, to either train+valid or test, so all rows
# of one area stay on the same side of the split.
# A seeded generator makes the assignment reproducible across kernel restarts
# (groupby iterates the AOIs in sorted key order, so draws line up run to run).
split_rng = np.random.default_rng(42)

train_valid = []
test = []
for _, group in df.groupby('AOI_name'):
    coin_toss = split_rng.uniform()

    # Each AOI is drawn independently, so the realised proportions only
    # approximate the requested fractions.
    if coin_toss <= (percent_in_train + percent_in_valid):
        train_valid.append(group)
    else:
        test.append(group)

# Fail here with a clear message rather than later inside pd.concat on an empty list.
assert train_valid and test, 'random AOI split left one side empty; change the seed or the split fractions'

In [None]:
# Number of AOI groups assigned to train+valid and to test.
len(train_valid), len(test)

In [None]:
# Stack the per-AOI groups of the train+valid side back into one frame.
train_valid_df = pd.concat(train_valid)

In [None]:
# One row per image file (first occurrence), keeping only its name and AOI.
files_stats_df = train_valid_df[['filename', 'AOI_name']].drop_duplicates('filename').copy()

In [None]:
# Per-image summary statistics of the polygon areas.
# np.percentile expects q on a 0-100 scale, so the former q=0.5 / 0.25 / 0.75
# returned the 0.5th / 0.25th / 0.75th percentiles rather than the median and
# quartiles. Grouping once also avoids re-filtering the whole frame per file.
area_by_file = train_valid_df.groupby('filename')['area']
file_names = files_stats_df['filename']

files_stats_df['area_median'] = file_names.map(area_by_file.median())
files_stats_df['area_min'] = file_names.map(area_by_file.min())
files_stats_df['area_max'] = file_names.map(area_by_file.max())
files_stats_df['area_25th_percentile'] = file_names.map(area_by_file.quantile(0.25))
files_stats_df['area_75th_percentile'] = file_names.map(area_by_file.quantile(0.75))

# Number of polygon rows per image.
files_stats_df['no_buildings'] = file_names.map(area_by_file.size())

In [None]:
# Display the per-image statistics table.
files_stats_df

In [None]:
# Hold out validation images from the train+valid pool.
# `percent_in_valid` is a share of the whole dataset, so rescale it to a share of
# the train+valid pool instead of hard-coding 0.1; a fixed random_state keeps the
# split reproducible.
# NOTE(review): the split is per image file, so one AOI can contribute files to
# both train and valid — confirm this is intended.
valid_share = percent_in_valid / (percent_in_train + percent_in_valid)
train_files_stats_df, valid_files_stats_df = train_test_split(
    files_stats_df, test_size=valid_share, random_state=42
)

In [None]:
# Distribution of per-image building counts among the training files.
train_files_stats_df['no_buildings'].describe()

In [None]:
# Distribution of per-image building counts among the validation files.
valid_files_stats_df['no_buildings'].describe()

### Further split

In [None]:
# Route each image's rows to train or valid according to the file-level split.
# Build the lookup once as a set: the previous code rebuilt a list and scanned it
# linearly for every group.
train_filenames = set(train_files_stats_df['filename'])

train = []
valid = []
for name, group in train_valid_df.groupby('filename'):
    if name in train_filenames:
        train.append(group)
    else:
        valid.append(group)

In [None]:
# Stack the collected groups of each split into a single frame per split.
train_df, valid_df, test_df = (pd.concat(parts) for parts in (train, valid, test))

### Save datasets

In [None]:
# Persisting is currently disabled. Uncomment to write the filtered frame and
# the three splits as CSV files under `output_path`.
# df.to_csv(os.path.join(output_path, 'all_df.csv'))

# train_df.to_csv(os.path.join(output_path, 'train.csv'))
# valid_df.to_csv(os.path.join(output_path, 'valid.csv'))
# test_df.to_csv(os.path.join(output_path, 'test.csv'))

### Load

In [None]:
# Optional (disabled): reload previously saved CSVs instead of recomputing them.
# NOTE(review): this is the only place `all_df` is assigned, so while this cell
# stays commented out, code reading `all_df` needs it defined some other way.
# all_df = pd.read_csv(os.path.join(output_path, 'all_df.csv'))

# train_df = pd.read_csv(os.path.join(output_path, 'train.csv'))
# valid_df = pd.read_csv(os.path.join(output_path, 'valid.csv'))
# test_df = pd.read_csv(os.path.join(output_path, 'test.csv'))

### All

In [None]:
# `all_df` is only assigned by the optional (commented-out) reload step, so on a
# fresh Restart & Run All it does not exist. Fall back to the in-memory filtered
# frame `df`, which is what the disabled save step writes to 'all_df.csv'.
try:
    all_df
except NameError:
    all_df = df

# Summary counts for the full (filtered) dataset.
print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(len(all_df.drop_duplicates(['AOI_name']))))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(len(all_df.drop_duplicates(['filename']))))

print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(len(all_df.drop_duplicates(['AOI_name', 'id']))))
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(len(all_df)))

### Train

In [None]:
# Summary counts for the training split.
train_aoi_count = train_df['AOI_name'].nunique(dropna=False)
train_image_count = train_df['filename'].nunique(dropna=False)
train_house_count = len(train_df[['AOI_name', 'id']].drop_duplicates())
train_polygon_count = len(train_df)

print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(train_aoi_count))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(train_image_count))

print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(train_house_count))
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(train_polygon_count))

### Valid

In [None]:
# Summary counts for the validation split.
valid_aoi_count = valid_df['AOI_name'].nunique(dropna=False)
valid_image_count = valid_df['filename'].nunique(dropna=False)
valid_house_count = len(valid_df[['AOI_name', 'id']].drop_duplicates())
valid_polygon_count = len(valid_df)

print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(valid_aoi_count))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(valid_image_count))

print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(valid_house_count))
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(valid_polygon_count))

### Test

In [None]:
# Summary counts for the test split.
test_aoi_count = test_df['AOI_name'].nunique(dropna=False)
test_image_count = test_df['filename'].nunique(dropna=False)
test_house_count = len(test_df[['AOI_name', 'id']].drop_duplicates())
test_polygon_count = len(test_df)

print('Total number of \033[1mAreas of interest\033[0m in the dataset \033[1m{}\033[0m'.format(test_aoi_count))
print('Total number of \033[1mImages\033[0m in the dataset \033[1m{}\033[0m'.format(test_image_count))

print('Total number of \033[1munique houses\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(test_house_count))
print('Total number of \033[1mpolygons\033[0m in the dataset \033[1m{:,.0f}\033[0m'.format(test_polygon_count))