<a href="https://colab.research.google.com/github/laruvinga/machinelearning/blob/master/Group_Slums_Data_into_Train_and_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import shutil
import os
import random
from google.colab import drive
from pathlib import Path
from queue import Queue
from threading import Thread
from time import time

In [2]:
# Mount Google Drive at /content/drive; the dataset archives are read from
# (and the final archive is written back to) a folder under "My Drive".
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Define dir paths
# Folder on the mounted Google Drive that holds the input archives.
dataset_dir = "/content/drive/My Drive/SatelliteML/Test_101"
# Local scratch directory the archives are unpacked into.
tmp_dir = "/tmp/"
# tmp_dir already ends with "/", so os.path.join is used to avoid building
# paths with a doubled separator such as "/tmp//32grid".
images_extract_dir = os.path.join(tmp_dir, "32grid")
shapes_extract_dir = os.path.join(tmp_dir, "boundaries")

In [0]:
# Unpack both dataset archives from Google Drive into the scratch directory
for archive_name in ("32grid.tar.bz2", "boundaries.tar.bz2"):
  shutil.unpack_archive(dataset_dir + "/" + archive_name, tmp_dir)

In [5]:
# Count the entries that ended up in each extraction directory
images_extract_count, shapes_extract_count = (
    len(os.listdir(extract_dir))
    for extract_dir in (images_extract_dir, shapes_extract_dir)
)

# Report both counts as a quick sanity check on the unpack step
print(images_extract_count, " image files extracted.")
print(shapes_extract_count, " shape files extracted.")

99750  image files extracted.
11  shape files extracted.


### GROUP EXTRACTED IMAGES INTO THE CORRECT DIRECTORIES

In [6]:
# Install geojson
!pip install geojson
# Install affine
!pip install Affine



In [0]:
# Import libs
from osgeo import gdal
from osgeo import osr
from osgeo import ogr
from shapely.geometry import shape
from shapely.geometry import Polygon
import geojson
from pathlib import Path
import shutil
import os
from affine import Affine

In [0]:
# Shapefile with the slum boundary polygons.
# os.path.join is used throughout so that a trailing "/" on tmp_dir does not
# produce doubled separators in the derived paths.
slums_shp = os.path.join(shapes_extract_dir, "slums.shp")

# Root of the final, sorted dataset and its validation / train splits.
ngozi_dir = os.path.join(tmp_dir, "byo_ngozi")
sorted_valid_dir = os.path.join(ngozi_dir, "valid")
sorted_train_dir = os.path.join(ngozi_dir, "train")
# Staging area that receives every labelled image before the split is drawn.
train_dir = os.path.join(tmp_dir, "train_tmp")

train_slums_dir = os.path.join(train_dir, "slum")
train_noslums_dir = os.path.join(train_dir, "noslum")

sorted_train_slums_dir = os.path.join(sorted_train_dir, "slum")
sorted_train_noslums_dir = os.path.join(sorted_train_dir, "noslum")
sorted_valid_slums_dir = os.path.join(sorted_valid_dir, "slum")
sorted_valid_noslums_dir = os.path.join(sorted_valid_dir, "noslum")

In [0]:
# (Re)create every working directory as an empty directory.
# The slum directories come first, then the no-slum ones.
working_dirs = [
    train_slums_dir,
    sorted_train_slums_dir,
    sorted_valid_slums_dir,
    train_noslums_dir,
    sorted_train_noslums_dir,
    sorted_valid_noslums_dir,
]

for working_dir in working_dirs:
  # On a fresh runtime the directory does not exist yet; calling rmtree
  # unconditionally would raise FileNotFoundError and abort the cell.
  if os.path.isdir(working_dir):
    shutil.rmtree(working_dir)
  os.makedirs(working_dir)

In [10]:
# Get slums extents shapefile and test features
driver = ogr.GetDriverByName('ESRI Shapefile')
dataSource = driver.Open(slums_shp, 0) # 0 means read-only. 1 means writeable.
# Open() may hand back None instead of raising when the file cannot be read
# (GDAL/OGR behaviour with exceptions disabled); fail with a clear message
# rather than an AttributeError on the next line.
if dataSource is None:
  raise RuntimeError("Could not open shapefile: " + slums_shp)
layer = dataSource.GetLayer()
featureCount = layer.GetFeatureCount()
print (featureCount)

93


In [0]:
# Calculate corner points of raster image using affine
def GetCornerPoints(filename):
  """Return the axis-aligned bounding box corners of a raster file.

  The raster is opened with GDAL, its geotransform is converted to an affine
  transform, and the four pixel-space corners are mapped through it. The
  result is the bounding box of those four mapped corners.

  Args:
    filename: path of the raster file to open.

  Returns:
    A list of four (x, y) tuples in the order
    (min x, min y), (max x, min y), (max x, max y), (min x, max y).
  """
  dataset = gdal.Open(filename)

  # Raster size in pixels
  width = dataset.RasterXSize
  height = dataset.RasterYSize

  # Affine transform built from the GDAL geotransform
  pixel_to_world = Affine.from_gdal(*dataset.GetGeoTransform())

  corners = [
      (pixel_to_world.c, pixel_to_world.f),  # upper left
      pixel_to_world * (0, height),          # lower left
      pixel_to_world * (width, height),      # lower right
      pixel_to_world * (width, 0),           # upper right
  ]
  xs, ys = zip(*corners)

  # Bounding box of the mapped corners
  min_x, max_x = min(xs), max(xs)
  min_y, max_y = min(ys), max(ys)
  return [(min_x, min_y), (max_x, min_y), (max_x, max_y), (min_x, max_y)]

In [0]:
# Check whether the polygon built from the given points intersects any of the given polygons
def PointsIntersectPolygons(polygons, points):
  """Tell whether the polygon described by `points` intersects any polygon.

  Args:
    polygons: iterable of geometries exposing an `intersects` method.
    points: sequence of coordinates used to build the query polygon.

  Returns:
    True as soon as one geometry in `polygons` intersects the query polygon,
    False when none does (including when `polygons` is empty).
  """
  # Build the query geometry once, then stop at the first hit
  footprint = Polygon(points)
  return any(candidate.intersects(footprint) for candidate in polygons)

In [0]:
# Get all polygons defining slum boundaries
def GetSlumPolygons():
  """Load every geometry of the slum boundaries shapefile.

  Reads the shapefile at the module-level path `slums_shp` with OGR, exports
  each feature to GeoJSON and converts its geometry with `shape`.

  Returns:
    A list with one geometry per feature that carries a geometry.

  Raises:
    RuntimeError: if the shapefile cannot be opened.
  """
  # Open shapefile using OGR
  driver = ogr.GetDriverByName('ESRI Shapefile')
  dataSource = driver.Open(slums_shp, 0) # 0 means read-only. 1 means writeable.
  if dataSource is None:
    # Open() may return None rather than raise when GDAL exceptions are off
    raise RuntimeError("Could not open shapefile: " + slums_shp)
  layer = dataSource.GetLayer()
  polygons = []

  # Iterate the layer itself instead of calling GetFeature(i) for
  # i in range(count): that does not assume feature ids run 0..count-1.
  for feature in layer:
    record = geojson.loads(feature.ExportToJson())
    geometry = record["geometry"]
    if geometry is None:
      # A feature without geometry cannot intersect anything; skip it
      continue
    polygons.append(shape(geometry))
  # Return polygons found
  return polygons

In [0]:
# Get all slum polygons
# Loaded once here; the labelling loop below passes this same list to
# PointsIntersectPolygons for every image.
polygons = GetSlumPolygons()

In [15]:
# Label every extracted .tif by testing its footprint against the slum
# polygons, then move it into the matching staging directory.
pathlist = Path(images_extract_dir).glob('**/*.tif')
slum = 0
noslum = 0

for tif_path in pathlist:
  image_path = str(tif_path)
  # An image counts as "slum" when its bounding box touches any slum polygon
  if PointsIntersectPolygons(polygons, GetCornerPoints(image_path)):
    slum += 1
    target_dir = train_slums_dir
  else:
    noslum += 1
    target_dir = train_noslums_dir
  # Keep the file name, only the parent directory changes
  shutil.move(image_path, target_dir + "/" + tif_path.name)

print ("Images labelled as slum: ", slum)
print ("Images labelled as other: ", noslum)

Images labelled as slum:  39
Images labelled as other:  99711


In [16]:
# Show the name of one randomly chosen no-slum image
noslum_filenames = os.listdir(train_noslums_dir)
print(random.choice(noslum_filenames))

32grid.84836.tif


In [0]:
def sort_images(src_slums_dir, src_noslums_dir, dest_slums_dir, dest_noslums_dir, count):
  """Move `count` random images of each class from the source to the destination.

  The same number of slum and no-slum images is moved, so the destination
  receives a balanced sample. Each source directory is listed once and the
  files are drawn without replacement.

  Args:
    src_slums_dir: directory to draw slum images from.
    src_noslums_dir: directory to draw no-slum images from.
    dest_slums_dir: directory receiving the chosen slum images.
    dest_noslums_dir: directory receiving the chosen no-slum images.
    count: number of images to move per class; values <= 0 move nothing.

  Raises:
    ValueError: if either source directory holds fewer than `count` files.
      The check happens before any file is moved.
  """
  if count <= 0:
    return

  # List each directory once; the no-slum directory can be very large and
  # re-listing it for every single pick is needlessly slow.
  slum_images = os.listdir(src_slums_dir)
  noslum_images = os.listdir(src_noslums_dir)

  if count > len(slum_images) or count > len(noslum_images):
    raise ValueError(
        "Cannot move %d images per class: only %d slum and %d no-slum images available"
        % (count, len(slum_images), len(noslum_images)))

  # Draw without replacement so no file is picked twice
  chosen_slums = random.sample(slum_images, count)
  chosen_noslums = random.sample(noslum_images, count)

  # Move image files, one of each class per step
  for slum_image, noslum_image in zip(chosen_slums, chosen_noslums):
    shutil.move(os.path.join(src_slums_dir, slum_image),
                os.path.join(dest_slums_dir, slum_image))
    shutil.move(os.path.join(src_noslums_dir, noslum_image),
                os.path.join(dest_noslums_dir, noslum_image))

In [0]:
# Split the staged images into validation and train sets.
# The validation size is fixed; every remaining slum image goes to train.
count_valid = 10
count_train = len(os.listdir(train_slums_dir)) - count_valid

# Validation is drawn first, then train, each with its own destination pair
for dest_slums_dir, dest_noslums_dir, split_count in (
    (sorted_valid_slums_dir, sorted_valid_noslums_dir, count_valid),
    (sorted_train_slums_dir, sorted_train_noslums_dir, count_train),
):
  sort_images(train_slums_dir, train_noslums_dir, dest_slums_dir, dest_noslums_dir, split_count)

In [19]:
# Report how many images ended up in each split / class directory
for split_dir, description in (
    (sorted_valid_slums_dir, " slum validation images"),
    (sorted_valid_noslums_dir, " no-slum validation images"),
    (sorted_train_slums_dir, " slum train images"),
    (sorted_train_noslums_dir, " no-slum train images"),
):
  print(len(os.listdir(split_dir)), description)

10  slum validation images
10  no-slum validation images
29  slum train images
29  no-slum train images


### CONVERT GEOTIFF IMAGES TO GEO-JPEG

In [0]:
def convertFile(tiffFileName, jpegFilename):
  """Convert one TIFF file to JPEG with GDAL, then delete the TIFF.

  Args:
    tiffFileName: path of the source TIFF file.
    jpegFilename: path of the JPEG file to write.

  Raises:
    RuntimeError: if gdal.Translate reports failure by returning None. The
      source TIFF is left in place in that case.
  """
  # Options
  options_list = [
      '-of JPEG'
  ]
  options_string = " ".join(options_list)
  # Translate geo file
  translated = gdal.Translate(jpegFilename, tiffFileName, options=options_string)
  if translated is None:
    # Never delete the source when the conversion did not produce an output
    raise RuntimeError("GDAL could not convert " + tiffFileName + " to " + jpegFilename)
  # Drop the dataset reference so GDAL can flush and close the output file
  translated = None
  # Remove converted tiff file
  os.remove(tiffFileName)

In [0]:
def runConvertFile(tiff_image_path):
  """Convert a TIFF image to a JPEG stored next to it.

  The JPEG path is the TIFF path with its extension replaced by ".jpg"; the
  conversion itself is delegated to convertFile.

  Args:
    tiff_image_path: path of the TIFF image to convert.
  """
  # Same directory and base name, ".jpg" extension
  jpg_image_path = os.path.splitext(tiff_image_path)[0] + ".jpg"
  # Convert file
  convertFile(tiff_image_path, jpg_image_path)

In [22]:
# Every directory of the sorted dataset whose images get converted
data_dirs = [
    sorted_valid_slums_dir,
    sorted_valid_noslums_dir,
    sorted_train_slums_dir,
    sorted_train_noslums_dir
]


# Convert each .tif found directly inside the data directories
for data_dir in data_dirs:
  tiff_names = [name for name in os.listdir(data_dir) if name.endswith('.tif')]
  for tiff_name in tiff_names:
    runConvertFile(data_dir + "/" + tiff_name)

print("GeoTIFF to GeoJPEG conversion complete.")

GeoTIFF to GeoJPEG conversion complete.


In [26]:
# Archive images dataset
# base_dir limits the archive to the dataset directory. With only root_dir set
# to tmp_dir, everything under tmp_dir would be packed: leftover unsorted
# images, the boundaries and the archive file being written itself.
shutil.make_archive(ngozi_dir, "bztar", root_dir=tmp_dir, base_dir=os.path.basename(ngozi_dir))

'/tmp/byo_ngozi.tar.bz2'

In [27]:
# Copy the finished archive back next to the input data on Google Drive
archive_src = ngozi_dir + ".tar.bz2"
archive_dest = dataset_dir + "/byo_ngozi.tar.bz2"
shutil.move(archive_src, archive_dest)

'/content/drive/My Drive/SatelliteML/Test_101/byo_ngozi.tar.bz2'