In [1]:
import numpy as np
import os
from osgeo import ogr

In [2]:
def create_proj_file(shapefile, spatialRef):
    """Create a projection file corresponding to the given shapefile."""
    path = os.path.splitext(shapefile)[0] + ".prj"
    with open(path, 'w') as proj_file:
        proj_file.write(spatialRef.ExportToWkt())
    return path


def create_ds_from_source(path, source_layer, source_layer_def, spatialRef):
    """Copy the source vector schema."""
    if os.path.exists(path):
        driver.DeleteDataSource(path)

    ds = driver.CreateDataSource(path)
    layer = ds.CreateLayer(source_layer.GetName(),  
                           geom_type=source_layer.GetGeomType())

    # Add input Layer Fields to the output Layer
    for i in range(source_layer_def.GetFieldCount()):
        field_def = source_layer_def.GetFieldDefn(i)
        layer.CreateField(field_def)
    # Get the Layer's Feature Definition
    layer_def = layer.GetLayerDefn()
    
    create_proj_file(path, spatialRef)
    
    # Return the Dataset, the Layer and the Layer Definition
    return ds, layer, layer_def


def split_source(train_path, test_path, source_layer, source_layer_def, spatialRef):
    """
    Given a shapefile path, split the poligons in two different shapefiles: train and test.
    
    Currently: just send half and half.
    
    """
    train_ds, train_layer, train_layer_def = create_ds_from_source(
        train_path, source_layer, source_layer_def, spatialRef
    )
    test_ds, test_layer, test_layer_def = create_ds_from_source(
        test_path, source_layer, source_layer_def, spatialRef
    )
    
    vectors = [
        (train_layer, train_layer_def),
        (test_layer, test_layer_def)
    ]
    current = 0

    # Add features to the ouput Layer
    for feat in source_layer:
        layer, layer_def = vectors[current]
        # Create output Feature
        new_feature = ogr.Feature(layer_def)

        # Add field values from input Layer
        for i in range(layer_def.GetFieldCount()):
            fieldDefn = layer_def.GetFieldDefn(i)
            new_feature.SetField(
                layer_def.GetFieldDefn(i).GetNameRef(), 
                feat.GetField(i)
            )

        geom = feat.GetGeometryRef()
        new_feature.SetGeometry(geom.Clone())
        # Add new feature to output Layer
        layer.CreateFeature(new_feature)

        current = (current+1)%2

    # Close DataSources

    train_ds.Destroy()
    test_ds.Destroy()

In [3]:
def _split_source(train_path, test_path, source_layer, source_layer_def, spatialRef, train_percentage, test_percentage):
    """
    Given a shapefile path, split the poligons in two different shapefiles: train and test.
    
    Currently: just send half and half.
    
    """
    train_ds, train_layer, train_layer_def = create_ds_from_source(
        train_path, source_layer, source_layer_def, spatialRef
    )
    test_ds, test_layer, test_layer_def = create_ds_from_source(
        test_path, source_layer, source_layer_def, spatialRef
    )

    # Eacch one has an id
    feature_count = source_layer.GetFeatureCount()
    to_shuffle = np.arange(feature_count)
    np.random.shuffle(to_shuffle)
    
    # Get percentage for test and training
    for_test = int((feature_count * test_percentage) / 100)
    # To keep the same amount.
    for_training = feature_count - for_test
    
    test_ids = to_shuffle[:for_test]
    training_ids = to_shuffle[for_test:]
    
    # Just a test
    assert len(set(test_ids) & set(training_ids)) == 0
    
    # Add features to the ouput Layer
    
    for test_id in test_ids:
        new_feature = ogr.Feature(test_layer_def)
        feat = source_layer.GetFeature(test_id)
        # Add field values from input Layer
        for i in range(test_layer_def.GetFieldCount()):
            fieldDefn = test_layer_def.GetFieldDefn(i)
            new_feature.SetField(
                test_layer_def.GetFieldDefn(i).GetNameRef(), 
                feat.GetField(i)
            )
        
        geom = feat.GetGeometryRef()
        new_feature.SetGeometry(geom.Clone())
        # Add new feature to output Layer
        test_layer.CreateFeature(new_feature)
    
    
    for train_id in training_ids:
        new_feature = ogr.Feature(train_layer_def)
        feat = source_layer.GetFeature(train_id)
        # Add field values from input Layer
        for i in range(train_layer_def.GetFieldCount()):
            fieldDefn = train_layer_def.GetFieldDefn(i)
            new_feature.SetField(
                train_layer_def.GetFieldDefn(i).GetNameRef(), 
                feat.GetField(i)
            )
        
        geom = feat.GetGeometryRef()
        new_feature.SetGeometry(geom.Clone())
        # Add new feature to output Layer
        train_layer.CreateFeature(new_feature)

    train_ds.Destroy()
    test_ds.Destroy()
    return training_ids, test_ids

In [4]:
# Choose the files to split
vector_data_path = "real_data/split/"
files = [f for f in os.listdir(vector_data_path) 
         if f.endswith(".shp") and not f.startswith('ROI')]
shapefiles = [os.path.join(vector_data_path, f) for f in files if f.endswith('.shp')]

In [5]:
# Move new files into each folder
def move_splited_files():
    dates = ['150201', '150217', '150321']
    splited_files_test = 'real_data/split/test/'
    splited_files_train = 'real_data/split/train/'
    files_test_dest = 'real_data/%s/test'
    files_train_dest = 'real_data/%s/train'
    for date in dates:
        # Move test files
        print("moving files from %s to %s" % (splited_files_test, files_test_dest))
        for f in os.listdir(splited_files_test):
            fname = f.split(".")[0]
            if fname.endswith(date):
                os.rename(os.path.join(splited_files_test, f), os.path.join(files_test_dest % date, f))
        # Move train files
        print("moving files from %s to %s" % (splited_files_train, files_train_dest))
        for f in os.listdir(splited_files_train):
            fname = f.split(".")[0]
            if fname.endswith(date):
                os.rename(os.path.join(splited_files_train, f), os.path.join(files_train_dest % date, f))


In [7]:
driver = ogr.GetDriverByName('ESRI Shapefile')

train_percentage = 75
test_percentage = 25


for source_vector_path in shapefiles:
    source_vector_basename = os.path.basename(source_vector_path).split('.')[0]
    #data_date = source_vector_basename.split("_")[-1]
    
    source_ds = ogr.Open(source_vector_path)
    source_layer = source_ds.GetLayer()
    source_layer_def = source_layer.GetLayerDefn()

    spatialRef = source_layer.GetSpatialRef()
    spatialRef.MorphToESRI()

    train_data_dir = "real_data/split/train/"
    train_vector_basename = "%s" % source_vector_basename
    train_vector_path = os.path.join(train_data_dir, train_vector_basename + ".shp")

    test_data_dir = "real_data/split/test/"
    test_vector_basename = "%s" % source_vector_basename
    test_vector_path = os.path.join(test_data_dir, test_vector_basename + ".shp")
    
    train_ids, test_ids = _split_source(
        train_vector_path, test_vector_path, source_layer, source_layer_def, spatialRef,
        train_percentage, test_percentage
    )
    print("Created from %s\n\t%s\n\t%s" % (source_vector_path, train_vector_path, test_vector_path))
    
    source_ds.Destroy()
    print(len(train_ids))
    print(len(test_ids))
move_splited_files()

Created from real_data/split/RASTROJO_ROI_229_82_150201.shp
	real_data/split/train/RASTROJO_ROI_229_82_150201.shp
	real_data/split/test/RASTROJO_ROI_229_82_150201.shp
14
4
Created from real_data/split/ALFA_ROI_229_82_150321.shp
	real_data/split/train/ALFA_ROI_229_82_150321.shp
	real_data/split/test/ALFA_ROI_229_82_150321.shp
18
5
Created from real_data/split/MANI_ROI_229_82_150217.shp
	real_data/split/train/MANI_ROI_229_82_150217.shp
	real_data/split/test/MANI_ROI_229_82_150217.shp
17
5
Created from real_data/split/SJ_ROI_229_82_150217.shp
	real_data/split/train/SJ_ROI_229_82_150217.shp
	real_data/split/test/SJ_ROI_229_82_150217.shp
79
26
Created from real_data/split/SORGO_ROI_229_82_150321.shp
	real_data/split/train/SORGO_ROI_229_82_150321.shp
	real_data/split/test/SORGO_ROI_229_82_150321.shp
2
0
Created from real_data/split/RASTROJO_ROI_229_82_150321.shp
	real_data/split/train/RASTROJO_ROI_229_82_150321.shp
	real_data/split/test/RASTROJO_ROI_229_82_150321.shp
21
6
Created from real_d