In [1]:
import rasterio
import geopandas
import pandas as pd
import os
import numpy as np

In [2]:
# Root folder that is listed tile by tile further down (one entry per tile, e.g. DEM.5).
# The SPRINGS_DATASET_DIR environment variable overrides the default so the notebook can
# run on another machine without editing this cell.
dataset_dir = os.environ.get(
    'SPRINGS_DATASET_DIR',
    '/media/pasok/2.0 TB Hard Disk/giorgos/dataset/Data',
)

In [3]:
def extract_parameters(points_df, raster_files, tile_dir, add_geometry = False):
  '''
  Samples band 1 of every raster file at the location of every point.

  Inputs: points_df (geopandas object): a dataframe that contains all the points; its
            'x', 'y' and 'Code' columns are read (ground coordinates and label)
          raster_files (list): a list with the names of the raster files to draw values from
          tile_dir (string): the path that contains the raster files
          add_geometry (bool): whether to keep the coordinates information and the label value
  Outputs: dataset (pandas dataframe): one row per point, in the row order of points_df,
             with one positional column per raster, followed by x, y and the label
             when add_geometry is True
           parameter_names (list): the names of the dataframe columns
  Raises: IndexError when a point falls outside the extent of one of the rasters
  '''
  raster_files = list(raster_files)

  # column names: file name up to the first dot, then the optional geometry/label columns
  parameter_names = [file.split('.')[0] for file in raster_files]
  if add_geometry:
    parameter_names += ['x', 'y', 'Category']

  # Read the points positionally: index labels are not guaranteed to be unique
  # (e.g. after explode(index_parts = False)), so label based lookups may return a
  # Series instead of a scalar.
  xs = points_df['x'].to_numpy()
  ys = points_df['y'].to_numpy()
  categories = points_df['Code'].to_numpy()
  n_points = len(points_df)

  samples = [] # one 1D array of point values per raster
  for file in raster_files:
    # the dataset is only queried while it is open; the context manager closes it
    with rasterio.open(os.path.join(tile_dir, file)) as raster:
      band = raster.read(1)
      # get pixel coordinates from ground coordinates, using this raster's georeferencing
      pixels = [raster.index(x, y) for x, y in zip(xs, ys)]

    pixels = np.asarray(pixels, dtype = np.intp).reshape(-1, 2)
    rows, cols = pixels[:, 0], pixels[:, 1]

    # numpy wraps negative indices around silently, which would return values from
    # the opposite edge of the raster; reject every out-of-extent point explicitly
    height, width = band.shape
    outside = (rows < 0) | (rows >= height) | (cols < 0) | (cols >= width)
    if outside.any():
      raise IndexError(
        str(int(outside.sum())) + ' point(s) fall outside the extent of raster ' + file
      )

    # keep only the sampled values so the full band can be released
    samples.append(band[rows, cols])

  if samples:
    # stacking promotes mixed raster dtypes to a common one; tolist() yields python scalars
    dataset_rows = np.stack(samples, axis = 1).tolist()
  else:
    dataset_rows = [[] for _ in range(n_points)]

  if add_geometry:
    for row, x, y, category in zip(dataset_rows, xs, ys, categories):
      row.extend((x, y, category))

  if n_points == 0:
    # keep the column count consistent with parameter_names even without points
    dataset = pd.DataFrame(columns = range(len(parameter_names)))
  else:
    dataset = pd.DataFrame(dataset_rows)

  return dataset, parameter_names

def extract_training_data(points_df, raster_files, tile_dir, chunk_size = 4):
  '''
  Extracts the raster values of every point, processing the raster files in small
  groups due to memory issues, and joins the partial results column-wise.

  Inputs: points_df (geopandas object): a dataframe that contains all the points
          raster_files (list): the names of the raster files to draw values from
          tile_dir (string): the path that contains the raster files
          chunk_size (int): how many raster files are handed to extract_parameters at once
  Outputs: dataset (pandas dataframe): one row per point with one named column per raster,
             followed by the 'x', 'y' and 'Category' columns
  Raises: ValueError when raster_files is empty or chunk_size is smaller than 1
  '''
  raster_files = list(raster_files)
  if not raster_files:
    raise ValueError('raster_files is empty: there are no rasters to extract values from')
  if chunk_size < 1:
    raise ValueError('chunk_size must be at least 1')

  # Only non-empty groups are built, so any number of raster files is supported.
  chunks = [raster_files[start:start + chunk_size]
            for start in range(0, len(raster_files), chunk_size)]

  part_words = ('one', 'two', 'three', 'four', 'five', 'six')
  last_chunk = len(chunks) - 1

  parts = []
  parameter_names = []
  for number, chunk in enumerate(chunks):
    label = part_words[number] if number < len(part_words) else str(number + 1)
    print('Part ' + label + '...')
    # coordinates and label are appended once, together with the last group of rasters
    part, names = extract_parameters(points_df, chunk, tile_dir,
                                     add_geometry = (number == last_chunk))
    parts.append(part)
    parameter_names += names

  dataset = pd.concat(parts, axis=1)
  dataset.columns = parameter_names
  return dataset

In [5]:
HOLDOUT_TILE = 'DEM.5' # put aside for evaluation

# Per-tile tables are collected in a list and concatenated once after the loop,
# instead of growing a dataframe with a concat per iteration.
tile_frames = []
for tile in os.listdir(dataset_dir):
    tile_dir = os.path.join(dataset_dir, tile)
    if not os.path.isdir(tile_dir):
        # dataset_dir also receives plain files (dataset_minus5.csv is written there),
        # which would otherwise break a re-run of this cell
        continue
    print('Extracting points for tile:', tile)
    tile_entries = os.listdir(tile_dir)
    raster_files = [raster for raster in tile_entries if raster.endswith('.tif')]
    vector_files = [vector for vector in tile_entries if (vector.startswith('springs') and vector.endswith('.shp'))]
    if not vector_files:
        raise FileNotFoundError('No springs*.shp file found in ' + tile_dir)
    vector_file = vector_files[0]

    #open shapefile that contains the points
    hydroPoints = geopandas.read_file(os.path.join(tile_dir, vector_file))
    hydroPoints = hydroPoints.explode(index_parts = False) #in case of multipoint geometry
    #exploded multipoints share the index label of their parent row; make labels unique again
    hydroPoints = hydroPoints.reset_index(drop = True)
    #add x and y values
    hydroPoints['x']  = hydroPoints.geometry.x
    hydroPoints['y']  = hydroPoints.geometry.y
    #extract information and save to csv
    train_data = extract_training_data(hydroPoints, raster_files, tile_dir)
    tile_id = tile.split('.')[1] if '.' in tile else tile
    train_data.to_csv(os.path.join(tile_dir, ('dataset.' + tile_id + '.csv') ))
    if tile == HOLDOUT_TILE:
        continue
    tile_frames.append(train_data)
    print()

dataset = pd.concat(tile_frames, ignore_index=True) if tile_frames else pd.DataFrame()


Extracting points for tile: DEM.6
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.8
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.10
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.2
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.7
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.5
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...
Extracting points for tile: DEM.0
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.4
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...

Extracting points for tile: DEM.11
Part one...
Part two...
Part three...
Part four...
Part five...
Part six...



In [6]:
# Persist the combined table of all tiles that were not held out for evaluation.
training_csv_path = os.path.join(dataset_dir, 'dataset_minus5.csv')
dataset.to_csv(training_csv_path)

In [7]:
# Quick visual check of the last five rows of the combined table.
dataset.tail(n=5)

Unnamed: 0,profile_curvature,flow_accumulation,sediment_transport_index,lineaments_distance,ruggedness_index,tan_curvature,slope_length,lineaments_distance_linear,stream_distance_linear,geology,...,DEM,vathos,mrvbf,slope,lineaments_density,wetness_index,plan_curvature,x,y,Category
3044,0.003353,1.0,0.105147,-999.0,0.054266,0.004635,0.0,7259.65332,87.475708,-999.0,...,276.711792,-999.0,4.697973,0.923638,-999.0,3.450434,0.146132,374559.9436,4280217.0,1
3045,-0.00167,22.0,0.358166,-999.0,0.054083,0.000193,42.769547,8880.72168,121.622368,-999.0,...,172.839798,-999.0,2.530862,1.802482,-999.0,6.549775,0.006138,385126.9465,4275652.0,1
3046,0.000344,4.0,0.323551,-999.0,0.089438,0.002189,6.828427,7625.323242,198.040405,-999.0,...,273.198456,-999.0,1.542629,3.084491,-999.0,4.397958,0.044535,379823.9466,4279232.0,1
3047,-0.001221,3.0,0.085934,-999.0,0.038395,-0.002612,2.828427,6871.333008,88.090858,-999.0,...,280.184357,-999.0,2.533747,1.533769,-999.0,5.042605,-0.134856,374405.9441,4280751.0,1
3048,-0.00579,3.0,0.425802,-999.0,0.112759,-0.004084,5.656854,6245.998535,304.53244,-999.0,...,308.94339,-999.0,1.524496,3.803226,-999.0,3.809519,-0.061571,374766.945,4281264.0,1
