# Data Preparation

## Imports

### Identify System

In [66]:
import platform

# Decide from the OS name whether this notebook runs on the local Mac
# ("Darwin") or on the Linux server; any other OS is rejected.
my_os = platform.system()
_SERVER_FLAG_BY_OS = {"Darwin": False, "Linux": True}
if my_os not in _SERVER_FLAG_BY_OS:
    raise Exception("You are neither on your Mac or on the Linux server. Something went wrong.")
working_on_server = _SERVER_FLAG_BY_OS[my_os]
print("working_on_server: ", working_on_server)

working_on_server:  True


In [37]:
import os.path

# Select the label directory and the satellite-data directory for the
# machine the notebook is running on.
if not working_on_server:
    path_labels = "/Users/leori/Desktop/BA/1_Data/1_labeled_data/"
    path_rdg = "/Users/leori/Downloads/RDG_storage/"
else:
    path_labels = os.getcwd() + "/1_labeled_data/"
    path_rdg = "/home/jovyan/work/satellite_data/"

label_tif = path_labels + "tif/144.tif"

# Sanity check: print whether the satellite raster and the label raster exist.
paths_to_check = (path_rdg + "2629BD_2018.tif", label_tif)
print(*(os.path.isfile(candidate) for candidate in paths_to_check))
# TODO: locally there are only jp2.tif files, whereas the server has only tif files?!

True True


Great Packages:
- Pandas: provides data structures and data analysis tools
- Numpy: a fundamental package for scientific computing with Python
- SciPy: (pronounced “Sigh Pie”) a Python-based ecosystem of open-source software for mathematics, science, and engineering
- RTree: a ctypes Python wrapper of libspatialindex that provides a number of advanced spatial indexing features
- GDAL: translator library for raster and vector geospatial data formats
- Fiona: Fiona reads and writes spatial data files
- Shapely: Geometric objects, predicates, and operations
- GeoPandas: extends the datatypes used by pandas to allow spatial operations on geometric types.
- PySAL: a library of spatial analysis functions written in Python intended to support the development of high-level applications.
- Matplotlib: Python 2D plotting library
- Missingno: Missing data visualization module for Python

### Import Image

#### Import Image with PIL

In [74]:
# Pillow refused to open the label image: its size (329910267 pixels) exceeds
# the limit of 178956970 pixels ("could be decompression bomb DOS attack").
# Raising the limit is described here:
# https://stackoverflow.com/questions/51152059/pillow-in-python-wont-let-me-open-image-exceeds-limit
import PIL
from PIL import Image

# `Image` is the same module object as `PIL.Image`, so this sets the same limit.
Image.MAX_IMAGE_PIXELS = 933120000
label = Image.open(label_tif)
# label.show() # takes 52 seconds

# Alternative loader:
# import matplotlib.pyplot as plt
# label = plt.imread(label_tif)

UnidentifiedImageError: cannot identify image file '/home/jovyan/work/saved_data/Semantic_Segmentation_of_Tree_Stock/1_labeled_data/tif/144.tif'

In [39]:
import os
from os import listdir
from os.path import isfile, join


def print_files(path=None):
    """Print a directory path followed by the names of the regular files in it.

    Only direct children of ``path`` are considered (no recursion), and
    entries that are not regular files (e.g. sub-directories) are skipped.

    Parameters
    ----------
    path : str, optional
        Directory to inspect. Defaults to the current working directory,
        resolved at the moment the function is called.

    Raises
    ------
    OSError
        If ``path`` cannot be listed (propagated from ``os.listdir``).
    """
    # Resolve the default at call time: a default of os.getcwd() in the
    # signature is evaluated once, when the cell is run, and would go stale
    # as soon as the working directory changes.
    if path is None:
        path = os.getcwd()
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
    print(path, onlyfiles)

In [40]:
# List the regular files located directly in the label directory (non-recursive).
print_files(path_labels)

/home/jovyan/work/saved_data/Semantic_Segmentation_of_Tree_Stock/1_labeled_data/ ['144.tif']


#### Import Image with GDAL

In [130]:
from osgeo import gdal
import numpy as np

# Open the satellite raster with GDAL.
raster_path = path_rdg + "2629BD_2018.tif"
dataset = gdal.Open(raster_path)
# Unless GDAL exceptions are enabled, gdal.Open reports failure by returning
# None; fail here with a clear message instead of an AttributeError on None
# in a later cell.
if dataset is None:
    raise RuntimeError("GDAL could not open raster: " + raster_path)
# Opening r"/home/jovyan/work/saved_data/Semantic_Segmentation_of_Tree_Stock/1_labeled_data/tif/144.tif"
# (the label file) instead results in an error -> probably wrong format?

### 0.1 Data Inspection

In [131]:
# General dataset information: metadata, Python type, projection, dimensions,
# driver and (when available) the geotransform.
print("Metadata:", dataset.GetMetadata())
print("Type Image: ", type(dataset))
print("Projection: ", dataset.GetProjection())

raster_dims = (dataset.RasterXSize, dataset.RasterYSize, dataset.RasterCount)
print("Size is {} x {} x {}".format(*raster_dims))

gdal_driver = dataset.GetDriver()
print("Driver: {}/{}".format(gdal_driver.ShortName, gdal_driver.LongName))

geotransform = dataset.GetGeoTransform()
if geotransform:
    # Entries 0 and 3 are reported as the origin, entries 1 and 5 as the pixel size.
    origin = (geotransform[0], geotransform[3])
    pixel_size = (geotransform[1], geotransform[5])
    print("Origin = ({}, {})".format(*origin))
    print("Pixel Size = ({}, {})".format(*pixel_size))

Metadata: {'ALL_COMMENTS': 'Created by OpenJPEG version 2.4.0', 'AREA_OR_POINT': 'Area', 'COLORSPACE': 'RGB', 'COMPRESSION_RATE_TARGET': '1', 'TIFFTAG_MAXSAMPLEVALUE': '0', 'TIFFTAG_MINSAMPLEVALUE': '0', 'TIFFTAG_RESOLUTIONUNIT': '1 (unitless)', 'TIFFTAG_XRESOLUTION': '0', 'TIFFTAG_YRESOLUTION': '0'}
Type Image:  <class 'osgeo.gdal.Dataset'>
Projection:  GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
Size is 108224 x 107798 x 3
Driver: GTiff/GeoTIFF
Origin = (29.746877767434313, -26.247113163017453)
Pixel Size = (2.3728143101288307e-06, -2.3728143101288515e-06)


In [132]:
# Raster Band: report data type, value range, overviews and colour table of band 1.
band = dataset.GetRasterBand(1)
print("Band Type={}".format(gdal.GetDataTypeName(band.DataType)))

# Named band_min / band_max so the built-in min() and max() are not shadowed
# for every cell that runs afterwards in this kernel.
band_min = band.GetMinimum()
band_max = band.GetMaximum()
# Compare against None explicitly: a plain truthiness test would treat a
# legitimate value of 0 as "missing" and trigger a needless recomputation.
if band_min is None or band_max is None:
    # True is passed as the approx_ok flag, as in the original call.
    (band_min, band_max) = band.ComputeRasterMinMax(True)
print("Min={:.3f}, Max={:.3f}".format(band_min, band_max))

overview_count = band.GetOverviewCount()
if overview_count > 0:
    print("Band has {} overviews".format(overview_count))

color_table = band.GetRasterColorTable()
if color_table:
    print("Band has a color table with {} entries".format(color_table.GetCount()))

Band Type=Byte
Min=0.000, Max=255.000
Band has 8 overviews


In [138]:
# Raster Data
# Disabled example: read the first row of the band (yoff=0, ysize=1) into a
# Float32 buffer. Switch the guard to True to run it.
if False:
    scanline = band.ReadRaster(
        xoff=0, yoff=0,
        xsize=band.XSize, ysize=1,
        buf_xsize=band.XSize, buf_ysize=1,
        buf_type=gdal.GDT_Float32,
    )
# scanline then contains xsize*4 bytes of raw binary floating point data.
# It can be converted to Python values using the struct module from the standard library:
#import struct
#tuple_of_floats = struct.unpack('f' * band.XSize, scanline)

In [122]:
# optional conversion to np array
# NOTE(review): this loads the whole band into memory; for the 108224 x 107798
# Byte raster reported by the inspection cells that is roughly 11.7 GB —
# confirm the machine can hold it before enabling.
#channel = np.array(dataset.GetRasterBand(1).ReadAsArray())
#print(channel.shape)

(107798, 108224)


In [123]:
# Read the raster band as separate variable
band = dataset.GetRasterBand(1)

# Check type of the variable 'band'.
# A bare expression in the middle of a cell is evaluated but never displayed,
# so print it explicitly.
print(type(band))

# Data type of the values (last expression, shown as the cell output)
gdal.GetDataTypeName(band.DataType)

'Byte'

In [104]:
# Echo the label raster path as the cell output.
label_tif

'/home/jovyan/work/saved_data/Semantic_Segmentation_of_Tree_Stock/1_labeled_data/tif/144.tif'