# This notebook extracts data from an orthomosaic using a shape file in order to create a training and validation set for analysis.

In [2]:
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely.geometry import mapping
from rasterio.mask import mask
import rasterio

In [3]:
# Input locations. Point `rasterfile` at the orthomosaic to sample from and
# `shpfile` at the shapefile holding the labelled polygons.
rasterfile = './ortho.tif'
shpfile = './Masking_Data.shp'

In [8]:
#Polygon File
# Build one table with a row per extracted pixel: the class id of the polygon
# the pixel fell in, followed by that pixel's value in every raster band.
columns = ['Class','Blue','Green','Red','RedEdge','NIR','Thermal']
band_columns = columns[1:]

# Value marking "no data". Pixels outside the polygons are filled with it by
# mask(), and any pixel whose FIRST band equals it is discarded below.
# NOTE(review): a genuine pixel whose first band is exactly 0 is dropped too.
no_data = 0

df = gpd.read_file(shpfile)
classes = pd.unique(df['id'])

frames = []

# Open the raster once rather than re-opening it for every class.
with rasterio.open(rasterfile) as src:
    if src.count != len(band_columns):
        raise ValueError(
            'Raster has {} bands but {} band columns are defined: {}'.format(
                src.count, len(band_columns), band_columns))

    for i in classes:
        # Mask with every polygon carrying this id (previously only the first
        # polygon of each class was used and the rest were silently ignored).
        geoms = [mapping(g) for g in df.loc[df['id'] == i].geometry.values]

        # nodata is passed explicitly so the fill value always matches the
        # `no_data` filter, whatever nodata value the raster file declares.
        out_image, out_transform = mask(src, geoms, crop=True, nodata=no_data)

        # Boolean (rows, cols) map of the pixels to keep.
        valid = out_image[0] != no_data

        # out_image[:, valid] is (bands, n_pixels) in row-major pixel order;
        # transpose so each row is one pixel and each column one band.
        Y = out_image[:, valid].T.astype(float)

        temp = pd.DataFrame(Y, columns=band_columns)
        # Keep the class label in its own (integer) column instead of letting
        # it be upcast to float alongside the band values.
        temp.insert(0, 'Class', i)
        frames.append(temp)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end.
if frames:
    csv = pd.concat(frames, ignore_index=True)
else:
    csv = pd.DataFrame(columns=columns)

#Rescale to be between 0 and 1
# Each band is divided by its own maximum over all extracted pixels.
for x in band_columns:
    maximum = csv[x].max()
    # A band whose maximum is 0 is left untouched; dividing would turn the
    # whole column into NaN (0/0).
    if maximum != 0:
        csv[x] = csv[x] / maximum

In [9]:
# Draw a training set and a validation set with the same number of pixels per
# class. Both sets come from ONE random sample that is then cut in half, so no
# pixel row can end up in both sets (two independent samples could overlap).
samples_per_set = 100

train_parts = []
val_parts = []

for i in pd.unique(csv['Class']):

    c = csv.loc[csv['Class'] == i]

    # Shrink the per-set size when the class is too small to supply
    # 2 * samples_per_set distinct rows, instead of raising ValueError.
    n_each = min(samples_per_set, len(c) // 2)
    if n_each < samples_per_set:
        print('Class', i, 'has only', len(c), 'rows; using', n_each,
              'per set instead of', samples_per_set)

    # Sampling without replacement returns the rows in random order, so the
    # two halves are disjoint random subsets of the class.
    picked = c.sample(n=2 * n_each)
    train_parts.append(picked.iloc[:n_each])
    val_parts.append(picked.iloc[n_each:])

# DataFrame.append was removed in pandas 2.0; concatenate once at the end.
if train_parts:
    train = pd.concat(train_parts, ignore_index=True)
    val = pd.concat(val_parts, ignore_index=True)
else:
    train = pd.DataFrame(columns=columns)
    val = pd.DataFrame(columns=columns)
    

In [10]:
# Persist both sample sets as comma-separated UTF-8 files without the
# DataFrame index column.
csv_options = dict(index=False, sep=',', encoding='utf-8')
train.to_csv('./train.csv', **csv_options)
val.to_csv('./val.csv', **csv_options)

In [11]:
# Load train.csv from the current working directory into a DataFrame.
# NOTE(review): the variable is called `test` but the file read is train.csv
# (not val.csv) — presumably a read-back check of the written file; confirm.
test = pd.read_csv('train.csv')