# Data Exploration

This file contains an exploration of the geojson dataset that accompanies xView.  For more information see http://xviewdataset.org/.

In [1]:
# import statements
import geopandas as gpd

In [3]:
fname = "../data/xView_train.geojson"

df = gpd.read_file(fname)

In [4]:
type(df)

geopandas.geodataframe.GeoDataFrame

In [5]:
df.head

<bound method NDFrame.head of             bounds_imcoords        edited_by            cat_id  type_id  \
0       2712,1145,2746,1177  wwoscarbecerril  1040010028371A00       73   
1       2720,2233,2760,2288  wwoscarbecerril  1040010028371A00       73   
2       2687,1338,2740,1399  wwoscarbecerril  1040010028371A00       73   
3       2691,1201,2730,1268  wwoscarbecerril  1040010028371A00       73   
4         2671,838,2714,869  wwoscarbecerril  1040010028371A00       73   
5       2705,1181,2738,1206  wwoscarbecerril  1040010028371A00       73   
6       2709,1014,2747,1072  wwoscarbecerril  1040010028371A00       73   
7         2671,808,2733,866  wwoscarbecerril  1040010028371A00       73   
8         2700,353,2731,384  wwoscarbecerril  1040010028371A00       73   
9       2704,2123,2749,2151  wwoscarbecerril  1040010028371A00       73   
10        2683,256,2711,298  wwoscarbecerril  1040010028371A00       73   
11      2700,1841,2729,1885  wwoscarbecerril  1040010028371A00       7

In [6]:
list(df)

['bounds_imcoords',
 'edited_by',
 'cat_id',
 'type_id',
 'ingest_time',
 'index_right',
 'image_id',
 'point_geom',
 'feature_id',
 'grid_file',
 'geometry']

Next, we want to identify all instances of "small cars", which is what this project will be looking at.


In [13]:
df_cars = df.loc[df['type_id'] == 18]

In [20]:
# explore dataset to make sure it worked
df_cars.head
len(df_cars)
# there are 211664 instances of cars in this dataset

211664

In [18]:
# extract just the names of images that have cars in them
images = df_cars['image_id'].unique().tolist()

# JSON Conversion
With help from https://stackoverflow.com/questions/27189892/how-to-filter-json-array-in-python

In [65]:
import json

In [48]:
input_json = "../data/xView_train.geojson"

In [50]:
with open(input_json) as f:
    data = json.load(f)

In [51]:
# Understand structure of data - what's in 
data.keys()

dict_keys(['crs', 'type', 'features', 'name'])

In [54]:
data['name']

'U-LIMDIS_xView_Final'

In [55]:
# copy initial dictionary to preserve categories, structure
output_dict = data

In [56]:
data.keys()

dict_keys(['crs', 'type', 'features', 'name'])

In [57]:
# Transform json input to python objects
#input_dict = json.loads(input_json)

# Filter python objects with list comprehensions
output_dict['features'] = [x for x in data['features'] if x['properties']['type_id'] == 18]

In [59]:
# confirm that this has the same number of entries as we expect (211664)
len(output_dict['features'])

211664

In [60]:
# Transform python object back into json
output_json = json.dumps(output_dict)

In [61]:
# write output_json to file for future use
with open('output_json_test.geojson', 'w') as outfile:  
    json.dump(output_dict, outfile)

# JSON Confirmation

In [63]:
output_dict['features'][0]

{'geometry': {'coordinates': [[[-90.531649339747, 14.561428418217771],
    [-90.531649339747, 14.561448895202519],
    [-90.53160098188819, 14.561448895202519],
    [-90.53160098188819, 14.561428418217771],
    [-90.531649339747, 14.561428418217771]]],
  'type': 'Polygon'},
 'properties': {'bounds_imcoords': '2726,2512,2740,2518',
  'cat_id': '1040010028371A00',
  'edited_by': 'wwkarlanicole',
  'feature_id': 75689,
  'grid_file': 'Grid2.shp',
  'image_id': '2355.tif',
  'index_right': 2356,
  'ingest_time': '2017/07/06 15:40:23.038+00',
  'point_geom': '0101000020E610000050DC892506A256C07C8F3AE3741F2D40',
  'type_id': 18},
 'type': 'Feature'}

In [66]:
np.zeros((len(output_dict['features']),4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [72]:
# try to read it back in
fname = '../data/output_json_test.geojson'
with open(fname) as f:
    data = json.load(f)

coords = np.zeros((len(data['features']),4))

In [69]:
coords

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [70]:
chips = np.zeros((len(data['features'])),dtype="object")

# Testing Standalone Script


In [1]:
import json_annotation_utilities as extract

In [75]:
extract.parsetype("../data/xView_train.geojson","../data/justcars_test1.geojson")

In [2]:
extract.parsetype("../data/xView_train.geojson","../data/helipads.geojson",84)

# Prototyping Downsampling BBoxes

In [3]:
import json
import math

In [4]:
input_json="../data/xView_train.geojson"
output_file='output_json_test.geojson'
scale_factor=2

In [5]:
with open(input_json) as f:
    data = json.load(f)

In [8]:
data['features']

[{u'geometry': {u'coordinates': [[[-90.53169885094464, 14.56603647302396],
     [-90.53169885094464, 14.56614473506768],
     [-90.53158140073565, 14.56614473506768],
     [-90.53158140073565, 14.56603647302396],
     [-90.53169885094464, 14.56603647302396]]],
   u'type': u'Polygon'},
  u'properties': {u'bounds_imcoords': u'2712,1145,2746,1177',
   u'cat_id': u'1040010028371A00',
   u'edited_by': u'wwoscarbecerril',
   u'feature_id': 374410,
   u'grid_file': u'Grid2.shp',
   u'image_id': u'2355.tif',
   u'index_right': 2356,
   u'ingest_time': u'2017/07/24 12:49:09.118+00',
   u'point_geom': u'0101000020E6100000616E4E6406A256C03BE6ADA0D6212D40',
   u'type_id': 73},
  u'type': u'Feature'},
 {u'geometry': {u'coordinates': [[[-90.53167232380382, 14.562217332510999],
     [-90.53167232380382, 14.562407959236182],
     [-90.53153294103244, 14.562407959236182],
     [-90.53153294103244, 14.562217332510999],
     [-90.53167232380382, 14.562217332510999]]],
   u'type': u'Polygon'},
  u'propert

In [13]:
output_dict = dict(data)

In [15]:
def clean_types(input_dict):
    """
    Sets all type_ids to 1, for simplicity.  Do not use with multiple type IDs!

    Args:
        input_dict: dictionary of geojson content in xView schema

    Output:
        returns dictionary
    """

    for item in input_dict['features']:
        item['properties']['type_id'] = 1

    return input_dict

In [17]:
clean_types(output_dict)

{u'crs': {u'properties': {u'name': u'urn:ogc:def:crs:OGC:1.3:CRS84'},
  u'type': u'name'},
 u'features': [{u'geometry': {u'coordinates': [[[-90.53169885094464,
       14.56603647302396],
      [-90.53169885094464, 14.56614473506768],
      [-90.53158140073565, 14.56614473506768],
      [-90.53158140073565, 14.56603647302396],
      [-90.53169885094464, 14.56603647302396]]],
    u'type': u'Polygon'},
   u'properties': {u'bounds_imcoords': u'2712,1145,2746,1177',
    u'cat_id': u'1040010028371A00',
    u'edited_by': u'wwoscarbecerril',
    u'feature_id': 374410,
    u'grid_file': u'Grid2.shp',
    u'image_id': u'2355.tif',
    u'index_right': 2356,
    u'ingest_time': u'2017/07/24 12:49:09.118+00',
    u'point_geom': u'0101000020E6100000616E4E6406A256C03BE6ADA0D6212D40',
    u'type_id': 1},
   u'type': u'Feature'},
  {u'geometry': {u'coordinates': [[[-90.53167232380382, 14.562217332510999],
      [-90.53167232380382, 14.562407959236182],
      [-90.53153294103244, 14.562407959236182],
  

In [42]:
# note: these strings are formatted 'xmin,ymin,xmax,ymax'
coord_string = output_dict['features'][0]['properties']['bounds_imcoords']

In [43]:
# get cords as numbers
coords = [int(x) for x in coord_string.split(',')]

In [44]:
# get floor of xmin, ymin
math.floor(coords[0]/scale_factor)

339

In [45]:
# get ceiling of xmax, ymax
math.ceil(coords[3]/scale_factor)

148

In [33]:
out_coord = str(math.floor(coords[0]/scale_factor)) + "," + \
            str(math.floor(coords[1]/scale_factor)) + "," + \
            str(math.ceil(coords[2]/scale_factor)) + "," + \
            str(math.ceil(coords[3]/scale_factor))

In [34]:
out_coord

'1356,572,1373,589'

In [35]:
# error handling for if the box collapses?

In [57]:
for i in range(15):#range(len(output_dict['features'])):
    # get coordinate string
    coords = [int(x) for x in output_dict['features'][i]['properties']['bounds_imcoords'].split(',')]

    out_coords = str(math.floor(coords[0]/scale_factor)) + "," + \
            str(math.floor(coords[1]/scale_factor)) + "," + \
            str(math.ceil(coords[2]/scale_factor)) + "," + \
            str(math.ceil(coords[3]/scale_factor))

    output_dict['features'][i]['properties']['bounds_imcoords'] = out_coords
    
    print("\n original: " + data['features'][i]['properties']['bounds_imcoords'])
    print("downsized: " + output_dict['features'][i]['properties']['bounds_imcoords'])


 original: 1356,572,1373,589
downsized: 1356,572,1373,589

 original: 1360,1116,1380,1144
downsized: 1360,1116,1380,1144

 original: 1343,669,1370,700
downsized: 1343,669,1370,700

 original: 1345,600,1365,634
downsized: 1345,600,1365,634

 original: 1335,419,1357,435
downsized: 1335,419,1357,435

 original: 1352,590,1369,603
downsized: 1352,590,1369,603

 original: 1354,507,1374,536
downsized: 1354,507,1374,536

 original: 1335,404,1367,433
downsized: 1335,404,1367,433

 original: 1350,176,1366,192
downsized: 1350,176,1366,192

 original: 1352,1061,1375,1076
downsized: 1352,1061,1375,1076

 original: 1341,128,1356,149
downsized: 1341,128,1356,149

 original: 1350,920,1365,943
downsized: 1350,920,1365,943

 original: 1351,1003,1373,1020
downsized: 1351,1003,1373,1020

 original: 1339,726,1363,765
downsized: 1339,726,1363,765

 original: 1347,533,1367,564
downsized: 1347,533,1367,564


# Full Code Test

In [1]:
import json_annotation_utilities as jutil

In [2]:
# run downsample
jutil.downsample_bbox('../data/helipads.geojson','../data/helipads_half_t1.geojson',scale_factor=2)


100%|██████████| 125/125 [00:00<00:00, 75307.10it/s]


## Standalone Testing - Compare Dictionaries


In [1]:
import json

In [17]:
previous = "../data/helipads_half.geojson"
current = "../data/test_util_heli_half.geojson"
default = "../data/xView_train.geojson"

In [None]:
with open(previous) as f:
    previous_dict = json.load(f)
    
with open(current) as f:
    current_dict = json.load(f)
    
with open(default) as f:
    default_dict = json.load(f)

In [14]:
cmp(previous_dict, current_dict)

-1

In [15]:
cmp(default_dict, current_dict)

-1

In [16]:
cjc = '../data/test_util_justcars.geojson'
pjc = '../data/justcars.geojson'

## Class Mappings