<a href="https://colab.research.google.com/github/larissavaladao/py6s_harmonize_sample/blob/main/curuai_sample_py6s.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import packages

In [1]:
#import packages used
import ee
import pandas as pd
import geemap
import geopandas as gpd
import matplotlib.pyplot as plt
import json
import math
import geemap
import os

In [2]:
#authenticate and initialize google earth engine (also necessary for geemap)
ee.Authenticate()
ee.Initialize(project = 'ee-curuai')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Curuai dataset

In [83]:
#import the entire dataset and include ID column
dataset = pd.read_excel('/content/drive/MyDrive/CURUAI_PROCESS/Dataset_CFP.xlsx', sheet_name='data',na_values='NaN').loc[:383]
# dataset = pd.read_excel('Dataset_CFP.xlsx', sheet_name='data',na_values='NaN').loc[:383]
dataset['ID'] = range(len(dataset))

In [84]:
#copy the dataset and select only the variables of interest
dataset_att = dataset[['ID','DATE', 'DEPTH CLASS', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'SAMPLE SITE', 'WATER PERIOD', 'MISSION',
       'TOTAL DEPTH', 'SAMPLING DEPTH', 'TURBIDITY', 'CHLOROPHYLL',
       'SPM', 'TOC', 'POC', 'DOC', 'SILICA', 'P TOTAL',
       'P ORGANIC', 'N TOTAL','N TOTAL DISSOLVED',
       'CHLOROPHYLL A', 'CHLOROPHYLL B']].copy()
dataset_att.columns

Index(['ID', 'DATE', 'DEPTH CLASS', 'LATITUDE', 'LONGITUDE', 'LOCATION',
       'SAMPLE SITE', 'WATER PERIOD', 'MISSION', 'TOTAL DEPTH',
       'SAMPLING DEPTH', 'TURBIDITY', 'CHLOROPHYLL', 'SPM', 'TOC', 'POC',
       'DOC', 'SILICA', 'P TOTAL', 'P ORGANIC', 'N TOTAL', 'N TOTAL DISSOLVED',
       'CHLOROPHYLL A', 'CHLOROPHYLL B'],
      dtype='object')

In [85]:
dataset_att.columns = ['ID','DATE', 'DEPTH_CLASS', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'SAMPLE_SITE', 'WATER_PERIOD', 'MISSION',
       'TOTAL_DEPTH', 'SAMPLING_DEPTH', 'TURBIDITY', 'CHLOROPHYLL', 'SPM',
       'TOC', 'POC', 'DOC', 'SILICA', 'P_TOTAL', 'P_ORGANIC', 'N_TOTAL',
       'N_TOTAL_DISSOLVED', 'CHLOROPHYLL_A', 'CHLOROPHYLL_B']
dataset_att.columns

Index(['ID', 'DATE', 'DEPTH_CLASS', 'LATITUDE', 'LONGITUDE', 'LOCATION',
       'SAMPLE_SITE', 'WATER_PERIOD', 'MISSION', 'TOTAL_DEPTH',
       'SAMPLING_DEPTH', 'TURBIDITY', 'CHLOROPHYLL', 'SPM', 'TOC', 'POC',
       'DOC', 'SILICA', 'P_TOTAL', 'P_ORGANIC', 'N_TOTAL', 'N_TOTAL_DISSOLVED',
       'CHLOROPHYLL_A', 'CHLOROPHYLL_B'],
      dtype='object')

In [86]:
dataset_att.groupby(['MISSION'])['MISSION'].count()

Unnamed: 0_level_0,MISSION
MISSION,Unnamed: 1_level_1
I,71
II,73
III,74
IV,36
IX,24
V,26
VI,25
VII,28
VIII,27


In [87]:
#transform dataframe in a geodataframe (geometry column with point location)
gdf = gpd.GeoDataFrame(
    dataset_att, geometry=gpd.points_from_xy(dataset_att.LONGITUDE, dataset_att.LATITUDE),
    crs="EPSG:4326"
)
gdf.head()

Unnamed: 0,ID,DATE,DEPTH_CLASS,LATITUDE,LONGITUDE,LOCATION,SAMPLE_SITE,WATER_PERIOD,MISSION,TOTAL_DEPTH,...,POC,DOC,SILICA,P_TOTAL,P_ORGANIC,N_TOTAL,N_TOTAL_DISSOLVED,CHLOROPHYLL_A,CHLOROPHYLL_B,geometry
0,0,2013-03-08 00:00:00,1,-2.25127,-55.14622,LG,1.0,R,I,3.8,...,3.546,4.727,2.33,0.132571,"< 0,01",0.3106,0.2564,7.8996,"< 0,01",POINT (-55.14622 -2.25127)
1,1,2013-03-08 00:00:00,2,-2.25127,-55.14622,LG,1.0,R,I,3.8,...,,,,,,,,,,POINT (-55.14622 -2.25127)
2,2,2013-03-09 00:00:00,1,-2.28422,-55.22023,LG,2.0,R,I,2.3,...,"< 0,001",3.575,1.99,0.022095,0.012914,0.2254,0.1269,1.96906,"< 0,01",POINT (-55.22023 -2.28422)
3,3,2013-03-09 00:00:00,1,-2.19696,-55.29953,LG,3.0,R,I,4.63,...,3.771,2.854,2.43,0.066088,0.025859,0.3364,0.279,2.14404,1.94714,POINT (-55.29953 -2.19696)
4,4,2013-03-09 00:00:00,1,-2.221738,-55.270194,LG,4.0,R,I,5.44,...,6.223,1.017,2.45,0.113632,0.063775,0.376,0.04269,10.23478,8.20502,POINT (-55.27019 -2.22174)


In [88]:
#transform date and time columns in string - necessary to convert to JSON
gdf['DATE'] = gdf['DATE'].astype("str")

In [None]:
# gdf.to_file("/content/drive/MyDrive/CURUAI_PROCESS/points_curuai.shp")

In [89]:
##Convert geodataframe to json - necessary to be read in GEE
dataset_json = gdf.to_json()

In [90]:
#load and select the features of the json data
data_points = json.loads(dataset_json)
data_points = data_points['features']
# data_points

In [91]:
##transform json in in gee object = feature collection
roi_points = ee.FeatureCollection(data_points)
print(roi_points.size().getInfo())

384


In [92]:
#function to insert a property with date of the point - Gee date format
def insert_date (feat):
    return feat.set('system:time_start',ee.Date.parse('YYYY-MM-dd HH:mm:ss',feat.get('DATE')))

In [93]:
#apply date function - new system:time_start property
roi_points = roi_points.map(insert_date)
print(roi_points.first().getInfo())

{'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [-55.14622, -2.25127]}, 'id': '0', 'properties': {'CHLOROPHYLL': 0.8799999999999999, 'CHLOROPHYLL_A': 7.8995999999999995, 'CHLOROPHYLL_B': '< 0,01', 'DATE': '2013-03-08 00:00:00', 'DEPTH_CLASS': 1, 'DOC': 4.727, 'ID': 0, 'LATITUDE': -2.25127, 'LOCATION': 'LG', 'LONGITUDE': -55.14622, 'MISSION': 'I', 'N_TOTAL': 0.3106, 'N_TOTAL_DISSOLVED': 0.2564, 'POC': 3.5459999999999994, 'P_ORGANIC': '< 0,01', 'P_TOTAL': 0.132571, 'SAMPLE_SITE': 1, 'SAMPLING_DEPTH': 0.2, 'SILICA': 2.33, 'SPM': 44.800000000000004, 'TOC': 8.273, 'TOTAL_DEPTH': 3.8, 'TURBIDITY': 50.625, 'WATER_PERIOD': 'R', 'system:time_start': {'type': 'Date', 'value': 1362700800000}}}


In [94]:
#create a polygon around the floodpalin area - roi
roi_poly = roi_points.geometry().buffer(150).bounds()

# Import GEE images

In [95]:
advance = 16

In [96]:
#define initial and final date to filter the image collection based on the
#field points date
initial_date = ee.Date(roi_points.sort('system:time_start').first().get('system:time_start')).advance(-advance, 'day')
end_date = ee.Date(roi_points.sort('system:time_start',False).first().get('system:time_start')).advance(-advance, 'day')

print('Data inicial de coleta: ',initial_date.format().getInfo())
print('Data final de coleta: ',end_date.format().getInfo())

Data inicial de coleta:  2013-02-20T00:00:00
Data final de coleta:  2017-09-03T00:00:00


## Import image collections

###Landsat 7 - PY6S

In [97]:
#Landsat 7 collection 2 tier 1 TOA
#filter collection by region and date (2 month prior toi the field dates and one month past) based on field points
#mask clouds
landsat7 = ee.ImageCollection("projects/ee-curuai/assets/Py6S/LD7/ld7_py6s")\
            .filterDate(initial_date,end_date)\
            .select([ 'B1', 'B2', 'B3', 'B4', 'B5', 'B7'])
print(ee.Date(landsat7.first().get('system:time_start')).format().getInfo())
print(ee.Date(landsat7.sort('system:time_start',False).first().get('system:time_start')).format().getInfo())

2013-05-12T13:44:05
2017-08-27T13:50:49


In [98]:
print('collection size',landsat7.size().getInfo())
print('projection',landsat7.first().select('B4').projection().getInfo())
print('spatial resolution',landsat7.first().select('B4').projection().nominalScale().getInfo())
print('bands',landsat7.first().bandNames().getInfo())


collection size 197
projection {'type': 'Projection', 'crs': 'EPSG:32721', 'transform': [30, 0, 610350, 0, -30, 9780030]}
spatial resolution 30
bands ['B1', 'B2', 'B3', 'B4', 'B5', 'B7']


###Landsat 8 PY6S

In [99]:
#Landsat 8 collection 2 tier 1 TOA
#filter collection by region and date (2 month prior toi the field dates and one month past) based on field points
#mask clouds
landsat8 = (ee.ImageCollection("projects/ee-curuai/assets/Py6S/LD8/ld8_py6s")
            .filterDate(initial_date,end_date)
            .select(['B2', 'B3', 'B4', 'B5', 'B6', 'B7']))
print(ee.Date(landsat8.first().get('system:time_start')).format().getInfo())
print(ee.Date(landsat8.sort('system:time_start',False).first().get('system:time_start')).format().getInfo())

2013-05-20T13:50:14
2017-08-26T13:54:29


In [100]:
print('collection size',landsat8.size().getInfo())
print('projection',landsat8.first().select('B4').projection().getInfo())
print('spatial resolution',landsat8.first().select('B4').projection().nominalScale().getInfo())
print('bands',landsat8.first().bandNames().getInfo())

collection size 203
projection {'type': 'Projection', 'crs': 'EPSG:32721', 'transform': [30, 0, 610350, 0, -30, 9780030]}
spatial resolution 30
bands ['B2', 'B3', 'B4', 'B5', 'B6', 'B7']


###Sentinel 2 PY6S

In [101]:
#Sentinel 2 level 1C harmonized
sentinel2 = ee.ImageCollection("projects/ee-curuai/assets/Py6S/S2/S2_py6s")\
            .map(lambda img: img.set({
                'system:time_start':ee.Date(img.get('AC_date')),
                'CLOUD_COVER':img.get('CLOUDY_PIXEL_PERCENTAGE')}))\
            .filterDate(initial_date,end_date)\
            .select(['B2', 'B3', 'B4', 'B8', 'B11', 'B12'])
print(ee.Date(sentinel2.first().get('system:time_start')).format().getInfo())
print(ee.Date(sentinel2.sort('system:time_start',False).first().get('system:time_start')).format().getInfo())

2015-08-19T00:00:00
2017-08-23T00:00:00


In [102]:
print('collection size',sentinel2.size().getInfo())
print('projection',sentinel2.first().select('B4').projection().getInfo())
print('spatial resolution',sentinel2.first().select('B4').projection().nominalScale().getInfo())
print('bands',sentinel2.first().bandNames().getInfo())

collection size 144
projection {'type': 'Projection', 'crs': 'EPSG:32721', 'transform': [30, 0, 610350, 0, -30, 9780030]}
spatial resolution 30
bands ['B2', 'B3', 'B4', 'B8', 'B11', 'B12']


### Visualize

In [103]:
Map = geemap.Map(basemap='HYBRID')
Map.centerObject(roi_points,10)

Map.addLayer(landsat7.first(), {'bands':['B3','B2','B1'], min:0.02,max:0.05}, str(landsat7.first().get('LANDSAT_PRODUCT_ID').getInfo()))
Map.addLayer(landsat8.first(), {'bands':['B4','B3','B2'], min:0.02,max:0.05}, str(landsat8.first().get('LANDSAT_PRODUCT_ID').getInfo()))
Map.addLayer(sentinel2.first(), {'bands':['B4','B3','B2'], min:0.02,max:0.1}, str(sentinel2.first().get('PRODUCT_ID').getInfo()))

Map.addLayer(roi_points, {'color':'darkred'}, 'Data Points');
# Map.addLayer(roi_poly, {'color':'darkred'}, 'Data polygon');
Map

Map(center=[-2.1837863104242503, -55.48976569658883], controls=(WidgetControl(options=['position', 'transparen…

## Padronize band names

In [104]:
name_bands = ['blue','green','red','nir','swir1','swir2']

###Landsat 7

In [105]:
#renomear bandas

ld7 = landsat7.map(lambda img: img.rename(name_bands))
ld7.first()

### Landsat 8

In [106]:
ld8 = landsat8.map(lambda img: img.rename(name_bands))
ld8.first()

### Sentinel 2

In [107]:
s2 = sentinel2.map(lambda img: img.rename(name_bands))
s2.first()

# Sample data points pixel values

filter images that fall within a 16-day window period from each field point date

In [108]:
def imgs_points(collection):
    def wrap(feat):
        date_point = ee.Date(feat.get('system:time_start'))
        data1 = date_point.advance(-16,'day')\
        .format('yyyy-MM-dd')

        data2 = date_point.advance(16,'day')\
        .format('yyyy-MM-dd')

        filtro = collection.filterDate(data1,data2)\
        .filterBounds(feat.geometry())\
        .map(lambda img: img.set({
            'dif_date_point':ee.Date(img.get('system:time_start')).difference(date_point, 'day')})\
            .copyProperties(feat,['ID']))


        return ee.ImageCollection(filtro.limit(10))

    return wrap

obtain statistics for the same location of the field point - with a 3 pixel window - and filter out if more than 4 pixels are masked   

In [109]:
prj = ld8.first().projection()
prj

In [110]:
def sample_point(img):

  feat = roi_points.filter(ee.Filter.eq('ID',img.get('ID'))).first()
  geom = feat.geometry().buffer(45).bounds()

  mean = img.reduceRegion(geometry=geom,
                          scale=prj.nominalScale(),
                          crs=prj.crs(),
                          reducer=ee.Reducer.mean())
  median = img.reduceRegion(geometry=geom,
                            scale=prj.nominalScale(),
                            crs=prj.crs(),
                            reducer=ee.Reducer.median())
  minMax = img.reduceRegion(geometry=geom,
                            scale=prj.nominalScale(),
                            crs=prj.crs(),
                            reducer=ee.Reducer.minMax())
  count = img.reduceRegion(geometry=geom,
                            scale=prj.nominalScale(),
                            crs=prj.crs(),
                            reducer=ee.Reducer.count())
  return feat.set({
      "system_index": img.get('system:index'),
      'CLOUD_COVER':img.get('CLOUD_COVER'),
      'img_date':ee.Date(img.get('system:time_start')).format(),
      'dif_date_point': img.get('dif_date_point'),

      'blue_mean':mean.get('blue'),
      'green_mean':mean.get('green'),
      'red_mean':mean.get('red'),
      'nir_mean':mean.get('nir'),
      'swir1_mean':mean.get('swir1'),
      'swir2_mean':mean.get('swir2'),

      'blue_median':median.get('blue'),
      'green_meadin':median.get('green'),
      'red_median':median.get('red'),
      'nir_median':median.get('nir'),
      'swir1_median':median.get('swir1'),
      'swir2_median':median.get('swir2'),

      'blue_min':minMax.get('blue_min'),
      'green_min':minMax.get('green_min'),
      'red_min':minMax.get('red_min'),
      'nir_min':minMax.get('nir_min'),
      'swir1_min':minMax.get('swir1_min'),
      'swir2_min':minMax.get('swir2_min'),

      'blue_max':minMax.get('blue_max'),
      'green_max':minMax.get('green_max'),
      'red_max':minMax.get('red_max'),
      'nir_max':minMax.get('nir_max'),
      'swir1_max':minMax.get('swir1_max'),
      'swir2_max':minMax.get('swir2_max'),

      "count_pixel":count.get('red')
  })

filter_count = ee.Filter.gt('count_pixel',3)




### Landsat 7

In [111]:
img_pointsLD7 = ee.ImageCollection(roi_points.map(imgs_points(ld7)).flatten().toList(2000))

In [112]:
img_pointsLD7.size()


In [113]:
img_pointsLD7.aggregate_count_distinct('ID')

In [114]:
img_pointsLD7.limit(5)

In [115]:
reduced_LD7 = ee.FeatureCollection(img_pointsLD7.map(sample_point)).filter(filter_count)

In [116]:
reduced_LD7.limit(2)

In [117]:
reduced_LD7.size()

In [118]:
reduced_LD7.aggregate_count_distinct('ID')

## Landsat 8

In [119]:
img_pointsLD8 = ee.ImageCollection(roi_points.map(imgs_points(ld8)).flatten().toList(2000))

In [120]:
img_pointsLD8.size()


In [121]:
img_pointsLD8.aggregate_count_distinct('ID')

In [122]:
img_pointsLD8.limit(5)

In [123]:
reduced_LD8 = ee.FeatureCollection(img_pointsLD8.map(sample_point)).filter(filter_count)

In [124]:
reduced_LD8.size()


In [125]:
reduced_LD8.aggregate_count_distinct('ID')

In [126]:
reduced_LD8.limit(2)

##Sentinel 2

In [127]:
img_pointsS2 = ee.ImageCollection(roi_points.map(imgs_points(s2)).flatten().toList(2000))

In [128]:
img_pointsS2.size()

In [129]:
img_pointsS2.aggregate_count_distinct('ID')

In [130]:
reduced_S2 = ee.FeatureCollection(img_pointsS2.map(sample_point)).filter(filter_count)

In [131]:
reduced_S2.size()


In [132]:
reduced_S2.aggregate_count_distinct('ID')

In [133]:
reduced_S2.limit(2)

# Export data as CSV file

In [137]:
#Exportar tabelas
#LD7
geemap.ee_to_csv(reduced_LD7, '/content/drive/MyDrive/CURUAI_PROCESS/py6s_LD7_data.csv')

In [138]:
geemap.ee_to_csv(reduced_LD8, '/content/drive/MyDrive/CURUAI_PROCESS/py6s_LD8_data.csv')

In [139]:
geemap.ee_to_csv(reduced_S2, '/content/drive/MyDrive/CURUAI_PROCESS/py6s_S2_data.csv')