# Data Access & Exploration

## 0. Imports

In [2]:
import os.path
from os.path import isfile, join
import os
from os import listdir
from osgeo import gdal
import errno
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio.plot import show
from rasterio.windows import from_bounds #Window
from rasterio.enums import Resampling
from sys import platform

In [3]:
# for coloured print statements
class bcolors:
    """ANSI escape sequences for coloured / styled terminal output.

    Prefix a string with one of the codes and append ``ENDC`` to switch the
    styling off again, e.g. ``print(bcolors.WARNING + "text" + bcolors.ENDC)``.
    """

    # Reset: switches every active colour/style off again.
    ENDC = '\033[0m'

    # Text styles.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours, named after the kind of message they are used for.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

In [4]:
# Resolve the data directories for the machine the notebook is running on.
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them to a configurable base directory.
if platform.startswith("linux"):  # also covers the legacy "linux2" value
    path_satellite = "/home/jovyan/work/satellite_data/"
    path_labels = os.getcwd() + "/1_labeled_data/tif/"
    cutouts_path = os.getcwd() + "/2_cutouts/"
elif platform == "darwin":
    path_data = "/Users/leori/Desktop/BA/1_Data/"
    path_satellite = path_data + "2_satellite/"
    path_labels = path_data + "1_labeled_data/tif/"
    cutouts_path = None  # no cutout directory is configured on macOS
else:
    # Fail here with an explicit message: continuing would leave the path
    # variables undefined and surface later as an unrelated NameError.
    raise RuntimeError(
        "No data paths are configured for platform {!r}; "
        "add a branch defining path_satellite, path_labels and cutouts_path.".format(platform)
    )

## 1. Data Inspection

### 1.1 Functions

In [5]:
def inspect_tif_resolution(filepath):
    """Print the pixel resolution of the raster stored at ``filepath``.

    Args:
        filepath: Path of a raster file that ``rasterio.open`` can read.

    The two values are taken from the dataset's ``res`` attribute, which
    rasterio reports in the units of the raster's coordinate reference system.
    """
    # The context manager closes the dataset again; previously the handle was
    # left open for the rest of the kernel session.
    with rasterio.open(filepath) as img:
        xRes, yRes = img.res
    print("Resolution: {} x {}".format(xRes, yRes))

In [74]:
def bounds(filepath):
    """Print the bounding box of the raster stored at ``filepath``.

    Args:
        filepath: Path of a raster file that ``rasterio.open`` can read.

    Returns:
        None. The bounding box is only printed, so calling this as the last
        statement of a cell produces no additional cell output.
    """
    # The context manager closes the dataset again; the unused
    # left/bottom/right/top locals of the earlier version were dropped.
    with rasterio.open(filepath) as src_img:
        print(src_img.bounds)


In [6]:
def inspect_tif(dataset):
    """Print a summary of an opened GDAL raster dataset.

    Printed, in order: metadata, projection, raster size, origin and pixel
    size (from the geotransform), minimum/maximum of band 1, and sizes derived
    from the pixel size.

    Args:
        dataset: An opened GDAL dataset, e.g. the result of ``gdal.Open(path)``.
    """
    size_x_px = dataset.RasterXSize
    size_y_px = dataset.RasterYSize
    band_count = dataset.RasterCount

    print("Metadata:", dataset.GetMetadata())
    print("Projection: ", dataset.GetProjection())
    print("Size is {} x {} x {}".format(size_x_px, size_y_px, band_count))

    # Query the geotransform once and reuse it below.
    geotransform = dataset.GetGeoTransform()
    if geotransform:
        print("Origin = ({}, {})".format(geotransform[0], geotransform[3]))
        print("Pixel Size = ({}, {})".format(geotransform[1], geotransform[5]))

    # Raster Band
    band = dataset.GetRasterBand(1)
    band_min = band.GetMinimum()
    band_max = band.GetMaximum()
    # Only fall back to computing the statistics when a value is actually
    # missing; a plain truthiness test would also discard a valid 0.
    if band_min is None or band_max is None:
        band_min, band_max = band.ComputeRasterMinMax(True)
    # Format the values as they are; truncating to int first would hide
    # fractional minima/maxima behind the ".3f" format.
    print("Min={:.3f}, Max={:.3f}".format(band_min, band_max))

    pixel_size = (geotransform[1], geotransform[5])
    size_x = size_x_px * pixel_size[0]
    size_y = size_y_px * pixel_size[1]
    print("# Pixels in Image = ({}, {}) with a Raster Count of {}".format(size_x_px, size_y_px, band_count))
    # NOTE(review): the factors 100000 (-> cm) and 1000 (-> m) are only correct
    # if the geotransform is expressed in kilometres. For a geographic CRS in
    # degrees the printed sizes are not real-world lengths - TODO confirm.
    print("Size of one Pixel in cm = {}".format(tuple([100000*x for x in pixel_size])))
    print("Size Covered by Image / Image Size = ({}m, {}m)".format(round(1000*size_x, 2), abs(round(1000*size_y, 2))))

In [7]:
def isfile(filepath: str = None):
    """Ensure that ``filepath`` points to an existing regular file.

    Note: this definition shadows the ``isfile`` imported from ``os.path`` in
    the imports cell. Unlike that function it returns nothing and raises when
    the check fails.

    Args:
        filepath: Path to check.

    Raises:
        FileNotFoundError: If ``filepath`` is not an existing file. The error
            carries ``errno.ENOENT`` and the offending path.
    """
    if os.path.isfile(filepath):
        return
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filepath)

In [63]:
def preview_tif(tif_filepath: str = None, max_size_mb: float = 500):
    """Plot a raster with ``rasterio.plot.show`` unless the file is too large.

    Args:
        tif_filepath: Path of the raster file to preview.
        max_size_mb: Size limit in megabytes (1 MB = 1,000,000 bytes). Larger
            files are not opened; a message with the file size is printed
            instead. Defaults to 500.

    Raises:
        FileNotFoundError: Raised by the ``isfile`` helper when
            ``tif_filepath`` is not an existing file.
    """
    isfile(tif_filepath)
    filesize_in_mb = os.path.getsize(tif_filepath) / 1000000
    if filesize_in_mb > max_size_mb:
        print("File is too large: {}MB".format(int(filesize_in_mb)))
        return
    # The context manager closes the dataset once it has been plotted;
    # previously the handle stayed open.
    with rasterio.open(tif_filepath) as img:
        if img:
            show(img)
        else:
            # NOTE(review): rasterio.open is expected to raise rather than
            # return a falsy value, so this branch is probably unreachable.
            print("img unable to be loaded at this moment")

### 1.2 Data Inspection

**TODO – Data Preparation**
- Data cleaning (visualize the data first): remove redundant information and replace no-data values with a defined fill value; possibly also clean up some of the metadata?
- Custom transformers
- Feature scaling


⇒ Pipeline (cf. first example in `4_pipeline.ipynb`)

In [76]:
# Full paths of the label raster and the satellite raster inspected below.
# The directory variables come from the platform configuration cell and are
# expected to end with a path separator.
filepath_label = f"{path_labels}raster_exported.tif"
filepath_satellite = f"{path_satellite}2629BD_2018.tif"

In [77]:
# Print the GDAL summary and the rasterio bounding box for both rasters.
for heading, tif_path in (
    ("Satellite", filepath_satellite),
    ("\nLabel", filepath_label),
):
    print(heading)
    inspect_tif(gdal.Open(tif_path))
    bounds(tif_path)

Satellite
Size is 108224 x 107798 x 3
Origin = (29.746877767434313, -26.247113163017453)
Pixel Size = (2.3728143101288307e-06, -2.3728143101288515e-06)
Min=0.000, Max=255.000
# Pixels in Image = (108224, 107798) with a Raster Count of 3
Size of one Pixel in cm = (0.23728143101288307, -0.23728143101288515)
Size Covered by Image / Image Size = (256.8m, 255.78m)
BoundingBox(left=29.746877767434313, bottom=-26.502897800020722, right=30.003673223333696, top=-26.247113163017453)

Label
Size is 106283 x 106048 x 1
Origin = (29.748224553, -26.248683629)
Pixel Size = (2.3728094050789067e-06, -2.3728068516143877e-06)
Min=0.000, Max=0.000
# Pixels in Image = (106283, 106048) with a Raster Count of 1
Size of one Pixel in cm = (0.23728094050789067, -0.23728068516143877)
Size Covered by Image / Image Size = (252.19m, 251.63m)
BoundingBox(left=29.748224553, bottom=-26.50031505, right=30.000413855, top=-26.248683629)


In [78]:
# Try to preview both rasters; preview_tif prints a message instead of
# plotting when a file exceeds its size limit.
for tif_path in (filepath_label, filepath_satellite):
    preview_tif(tif_path)

File is too large: 45086MB
File is too large: 2062MB


In [79]:
# Corner coordinates of a small cutout region.
# NOTE(review): the values look like (left, top, right, bottom) in the rasters'
# coordinates - confirm the order expected by whatever consumes this tuple;
# it is not used in the cells shown here.
cutout_window = (
    29.748021463931796,
    -26.24839922837354,
    29.74923634485858,
    -26.249614109300325,
)

In [81]:
src_label = rasterio.open(filepath_label)
# Build the (bands, rows, cols) shape from the dataset metadata. It equals
# ``src_label.read().shape`` but avoids loading the whole raster into memory
# just to print its dimensions.
print("shape:", (src_label.count, src_label.height, src_label.width))
# src_label.colorinterp[3] # rgba

shape: (1, 106048, 106283)


In [47]:
# NOTE(review): ``filepath_label_sm`` is not defined in any cell visible in this
# part of the notebook - make sure it is assigned before this cell runs.
src_label_sm = rasterio.open(filepath_label_sm)
# Shape as (bands, rows, cols) from the dataset metadata; same value as
# ``src_label_sm.read().shape`` without reading the pixel data.
print("shape:", (src_label_sm.count, src_label_sm.height, src_label_sm.width))
# src_label_sm.colorinterp[3] # rgba

shape: (4, 1884, 3050)


In [48]:
# Check whether bands 2 and 3 of the small label raster hold identical values
# (the saved output shows that they do not).
band_two = src_label_sm.read(2)
band_three = src_label_sm.read(3)
comparison = band_two == band_three  # element-wise boolean array
equal_arrays = comparison.all()
print(equal_arrays)

False


In [56]:
# Value of band 2 at the last row and last column, i.e. the bottom-right pixel
# of a north-up raster. Negative indices replace the hard-coded 1884 x 3050
# image size.
# NOTE(review): rasterio band indices start at 1, so if the bands are ordered
# RGBA this is the green band, not the blue one - confirm.
src_label_sm.read(2)[-1, -1]

255

In [12]:
# TODO: move these imports to the imports cell at the top of the notebook.
from PIL import Image
import numpy as np

# Cross-check the channel count of the small label raster with Pillow.
# The context manager closes the file again; the array is created inside the
# block so the pixel data is loaded before the file is closed.
with Image.open(filepath_label_sm) as pil_image:
    # convert to numpy array
    img = np.array(pil_image)

# find number of channels
if img.ndim == 2:
    # a 2-D array has no channel axis
    channels = 1
    print("image has 1 channel")
else:
    # otherwise the last axis holds the channels
    channels = img.shape[-1]
    print("image has", channels, "channels")

image has 4 channels


### 1.3 Data Visualization

In [13]:
# Print the pixel resolution of the label raster.
# NOTE(review): the saved output of this cell (about 8.34e-06) differs from the
# pixel size printed for the same path in section 1.2 (about 2.37e-06); the
# outputs appear to stem from different versions of the file - re-run the
# notebook top to bottom to refresh them.
inspect_tif_resolution(filepath_label)

Resolution: 8.335734183909613e-06 x 8.335734183909613e-06


In [None]:
# Print the GDAL summary of the label raster.
# NOTE(review): the saved output of this cell (30254 x 30188 x 4) differs from
# the summary printed for the same path in section 1.2 (106283 x 106048 x 1);
# the output looks stale - re-run the notebook top to bottom to refresh it.
inspect_tif(gdal.Open(filepath_label))

Size is 30254 x 30188 x 4
Origin = (29.748224553, -26.248679767728067)
Pixel Size = (8.335734183909613e-06, -8.335734183909613e-06)
Min=0.000, Max=255.000
# Pixels in Image = (30254, 30188) with a Raster Count of 4
Size of one Pixel in cm = (0.8335734183909613, -0.8335734183909613)
Size Covered by Image / Image Size = (252.19m, 251.64m)
