## Setup and Import Packages

In [7]:
#!pip install geopandas

In [8]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import geopandas
import h5py
from shapely.geometry import Point

# New module named tensorflow will be used.
# Tensorflow is a module for machine learning, neural network, ...
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# for model reporting
from sklearn.metrics import classification_report

In [9]:
#from google.colab import drive
#drive.mount('/content/drive/')

## Thin ice and open water height/elev correction

In [8]:
### calculate z-score
from scipy.stats import zscore

def set_elev_correction(window_data_slice, input_data):
    """Store per-class z-scores of ``h_cor_mean`` for one window in ``input_data``.

    The window rows are split by ``label`` (0 = thick ice, 1 = thin ice,
    2 = open water, as used throughout this notebook) and ``h_cor_mean`` is
    standardised with ``scipy.stats.zscore`` separately inside each class.
    The results are written to ``input_data['z_score']`` at the window's index
    labels. Rows whose label is not 0, 1 or 2 are left untouched.

    Parameters
    ----------
    window_data_slice : pandas.DataFrame
        Rows of the current window. Must contain ``label`` and ``h_cor_mean``
        and share its index labels with ``input_data``.
    input_data : pandas.DataFrame
        Frame that is updated in place; a ``z_score`` column is created when
        it does not exist yet.

    Returns
    -------
    None

    Notes
    -----
    A class with a single sample, or with constant heights, has zero standard
    deviation, so ``zscore`` yields NaN for it.
    """
    df_label = window_data_slice.label.max()
    print(df_label)

    # Create the output column up front so it exists even when the window
    # holds none of the three classes.
    if 'z_score' not in input_data.columns:
        input_data['z_score'] = float('nan')

    for class_label in (0, 1, 2):
        class_heights = window_data_slice.loc[
            window_data_slice['label'] == class_label, 'h_cor_mean'
        ]
        if class_heights.empty:
            # Nothing to standardise; also avoids zscore's empty-input warnings.
            continue
        # One vectorised write per class replaces the former
        # DataFrame.append (removed in pandas 2.0) plus row-by-row .loc loop.
        input_data.loc[class_heights.index, 'z_score'] = zscore(class_heights.to_numpy())
    return

def get_next_window_idx2(index, radius, input_data):
    """Process the window centred on row ``index`` and return the next start index.

    The window contains every row whose ``x_atc`` lies strictly between
    ``x_atc[index] - radius`` and ``x_atc[index] + radius``. Its z-scores are
    written into ``input_data`` by ``set_elev_correction``.

    Parameters
    ----------
    index : int
        Position (``iloc``) of the row the window is centred on.
    radius : float
        Half-width of the window, in the units of ``x_atc``.
    input_data : pandas.DataFrame
        Frame with ``x_atc``, ``h_cor_mean`` and ``label`` columns. The return
        value is an index *label* plus one, so the frame is assumed to carry a
        0..n-1 index (the driver cell calls ``reset_index(drop=True)``).

    Returns
    -------
    int
        Last index label inside the window plus one, or ``index + 1`` when the
        window is empty.
    """
    base_x = input_data.x_atc.iloc[index]
    start_window_dist_x = base_x - radius
    end_window_dist_x = base_x + radius
    print(base_x, start_window_dist_x, end_window_dist_x)

    in_window = (input_data['x_atc'] > start_window_dist_x) & (input_data['x_atc'] < end_window_dist_x)
    window_data_slice = input_data.loc[in_window, ['x_atc', 'h_cor_mean', 'label']]

    # The bounds are exclusive, so radius <= 0 or a NaN x_atc at the base row
    # selects nothing. Step to the next row instead of raising IndexError on
    # index[-1], so the calling while-loop always makes progress.
    if window_data_slice.empty:
        return index + 1

    set_elev_correction(window_data_slice, input_data)

    return window_data_slice.index[-1] + 1

def find_elev_correction(radius, input_data):
    """Walk ``input_data`` window by window, filling in z-scores.

    Starts at row 0 and repeatedly calls ``get_next_window_idx2``, which
    processes one window and reports where the next one starts, until the
    start index runs past the end of the frame. The running start index and
    the number of windows processed are printed after every step.

    Parameters
    ----------
    radius : float
        Window half-width passed through to ``get_next_window_idx2``.
    input_data : pandas.DataFrame
        Frame that is updated in place by the window helper.

    Returns
    -------
    None
    """
    print("data length ", len(input_data))
    next_start = 0
    windows_done = 0
    while True:
        if not next_start < len(input_data):
            return
        windows_done += 1
        next_start = get_next_window_idx2(next_start, radius, input_data)
        print(next_start, windows_done)


In [10]:
###### 10m 5km ########
# Driver cell: for every labelled 10 m track CSV, filter the rows, compute the
# per-window z-scores and count rows that ended up without a z-score.
# NOTE(review): the loop currently stops at the `break` right after the first
# file is loaded, so find_elev_correction and everything below it in the loop
# body never runs; the summary at the bottom therefore always reports 0.
import glob, os
import time

dir_path = "ATL03_labeled_data/ow_labeled_by_is2trackline_10m_corrected/labeled_track/"
# dest_path is only referenced by the commented-out to_csv call below.
dest_path = "ATL03_labeled_data/ow_labeled_10m_z_score_corrected/"
filelist = sorted(glob.glob(dir_path + "ATL03_" + "*_labeled_10m.csv"))
missing_filelist = []  # files that contain at least one NaN z_score
missing_data = 0       # total number of NaN z_score rows over all files
# Window half-width in x_atc units ("5km" per the cell header -- presumably metres).
radius = 5000

for file in filelist:
    print(file)
    start = time.time()
    labeled_data = pd.read_csv(file, index_col=0)   
    #labeled_data = labeled_data[labeled_data['sea_surf_min_elev'].notna()]
    # Keep rows with h_cor_mean <= 10 and a label, then renumber 0..n-1 because
    # the window helpers treat index labels as row positions.
    labeled_data = labeled_data[labeled_data.h_cor_mean <= 10]
    labeled_data = labeled_data[labeled_data['label'].notna()]
    labeled_data = labeled_data.reset_index(drop=True)
    
    print("find_elev_correction commented out for notebook storage issue")
    # Deliberate early exit: remove this `break` to actually run the correction.
    break
    find_elev_correction(radius, labeled_data)
    
    nan_count = labeled_data['z_score'].isna().sum()   
    if(nan_count > 0):
        #print("###############################################################################################",file)
        missing_filelist.append(file)
        missing_data += nan_count
    
    #ignore nan z_score data
    labeled_data = labeled_data[labeled_data['z_score'].notna()]
    
    # NOTE(review): the result is not written anywhere while this stays commented out.
    #labeled_data.to_csv(file.replace(dir_path, dest_path).replace(".csv", "_z_score.csv"))
    del labeled_data
    #break
    
    end = time.time() - start
    print("Time per file is ",end)
    print("------------ File done ------------")

print("------------ Finish ------------")    
print("filelists ",len(filelist))
print("Missing filelists ",len(missing_filelist))
print("Missing data ",missing_data)
   
#labeled_data

ATL03_labeled_data/ow_labeled_by_is2trackline_10m_corrected/labeled_track/ATL03_20191103184432_05780510_gt1r_labeled_10m.csv
find_elev_correction commented out for notebook storage issue
------------ Finish ------------
filelists  15
Missing filelists  0
Missing data  0


In [8]:
# missing_filelist

In [9]:
# labeled_data = pd.read_csv('ATL03_labeled_data/shifted_labeled_10m_z_score_corrected/ATL03_20191103184432_05780510_gt1r_labeled_10m_z_score.csv', index_col=0)
# nan_count = labeled_data['z_score'].isna().sum()   
# print(nan_count)

## Relative Sea Surface height using sliding window

In [3]:
import sys

def set_sea_surface_ow(window_data_slice, input_data=None):
    """Estimate the local sea surface for one window and store it per row.

    The class with the highest ``label`` in the window is used as the sea
    surface, provided that label is greater than 1 (2 = water, 1 = thin ice,
    0 = thick ice). Three estimates are written for every row of the window:

    * ``rel_sea_surf_min_elev`` - minimum ``h_cor_mean`` of the surface rows
    * ``rel_sea_surf_avg_elev`` - mean ``h_cor_mean`` of the surface rows
    * ``rel_sea_surf_min_dist`` - ``h_cor_mean`` of the surface row nearest in
      ``x_atc``, chosen among the 10 lowest surface rows (on a distance tie
      the row with the smaller ``x_atc`` wins)

    All three are NaN when the window holds no label above 1.

    Parameters
    ----------
    window_data_slice : pandas.DataFrame
        Rows of the current window with ``x_atc``, ``h_cor_mean`` and
        ``label``; its index labels must exist in the target frame.
    input_data : pandas.DataFrame, optional
        Frame to update in place. Defaults to the notebook-level
        ``data_copy``, which is what existing one-argument callers rely on.

    Returns
    -------
    None
    """
    target = data_copy if input_data is None else input_data

    #label max cz 2 = water, 1 = thin ice, 0 = thick ice
    df_label = window_data_slice.label.max()
    print(df_label)

    if window_data_slice.empty:
        # Nothing to write for an empty window.
        return

    if df_label > 1:
        surface_rows = window_data_slice[window_data_slice['label'].eq(df_label)]
        min_elev = surface_rows.h_cor_mean.min()
        print("Min elev = ", min_elev)
        avg_elev = surface_rows.h_cor_mean.mean()
        print("Avg elev = ", avg_elev)

        # The 10 lowest surface points, ordered along track.
        lowest10 = (
            surface_rows
            .sort_values(by='h_cor_mean', ignore_index=False)
            .iloc[:10]
            .sort_values(by='x_atc', ignore_index=False)
        )
        candidate_x = lowest10['x_atc'].to_numpy()
        candidate_h = lowest10['h_cor_mean'].to_numpy()
        window_x = window_data_slice['x_atc'].to_numpy()

        # (rows x <=10) distance table. argmin returns the first minimum, which
        # reproduces the former strict "<" scan over the x-sorted candidates.
        nearest = np.abs(window_x[:, None] - candidate_x[None, :]).argmin(axis=1)
        min_dist_elev = candidate_h[nearest]
    else:
        # No open water in this window (only thick and/or thin ice).
        min_elev = np.nan
        avg_elev = np.nan
        min_dist_elev = np.nan

    # One vectorised write per column instead of three .loc writes per row.
    window_rows = window_data_slice.index
    target.loc[window_rows, 'rel_sea_surf_min_elev'] = min_elev
    target.loc[window_rows, 'rel_sea_surf_avg_elev'] = avg_elev
    target.loc[window_rows, 'rel_sea_surf_min_dist'] = min_dist_elev

    return

def get_next_window_idx_ow(index, radius):
    """Process the window centred on row ``index`` of ``data_copy``.

    The window contains every row of the notebook-level ``data_copy`` whose
    ``x_atc`` lies strictly between ``x_atc[index] - radius`` and
    ``x_atc[index] + radius``. It is handed to ``set_sea_surface_ow``, which
    writes the sea-surface estimates back into ``data_copy``.

    Parameters
    ----------
    index : int
        Position (``iloc``) of the row the window is centred on.
    radius : float
        Half-width of the window, in the units of ``x_atc``.

    Returns
    -------
    int
        Last index label inside the window plus one, or ``index + 1`` when the
        window is empty. ``data_copy`` is assumed to carry a 0..n-1 index (the
        driver cells call ``reset_index(drop=True)``).
    """
    base_x = data_copy.x_atc.iloc[index]
    start_window_dist_x = base_x - radius
    end_window_dist_x = base_x + radius
    print(base_x, start_window_dist_x, end_window_dist_x)

    in_window = (data_copy['x_atc'] > start_window_dist_x) & (data_copy['x_atc'] < end_window_dist_x)
    window_data_slice = data_copy.loc[in_window, ['x_atc', 'h_cor_mean', 'label']]
    # NOTE(review): this prints one (truncated) frame per window, which is what
    # bloats the stored notebook output.
    print(window_data_slice)

    # The bounds are exclusive, so radius <= 0 or a NaN x_atc at the base row
    # selects nothing. Step to the next row instead of raising IndexError on
    # index[-1], so the calling while-loop always makes progress.
    if window_data_slice.empty:
        return index + 1

    set_sea_surface_ow(window_data_slice)

    return window_data_slice.index[-1] + 1

def find_sea_surface_ow(radius):
    """Walk the notebook-level ``data_copy`` window by window.

    Starting at row 0, ``get_next_window_idx_ow`` is called repeatedly; each
    call processes one window and returns the start of the next one. The walk
    ends once that start index is no longer inside ``data_copy``. The start
    index and the number of windows handled so far are printed after each step.

    Parameters
    ----------
    radius : float
        Window half-width passed through to ``get_next_window_idx_ow``.

    Returns
    -------
    None
    """
    print("data length ", len(data_copy))
    window_start, n_windows = 0, 0
    while True:
        if not window_start < len(data_copy):
            break
        n_windows += 1
        window_start = get_next_window_idx_ow(window_start, radius)
        print(window_start, n_windows)
    return



In [12]:
#labeled_data = pd.read_csv("ATL03_labeled_data/labeled_by_is2trackline/ATL03_20191123180255_08830510_T04CEV_gt2r_labeled.csv", index_col=0)
#data_copy = labeled_data.sort_values(by=['x_atc'])
#data_copy = data_copy.loc[:, ~data_copy.columns.str.contains('^Unnamed')]
##data['FID'] = range(1, len(data) + 1)
#data_copy = data_copy[data_copy.h_cor_mean <= 15]
#data_copy = data_copy[data_copy['label'].notna()]
#data_copy = data_copy.reset_index(drop=True)
#
##radius 5 km
#radius = 5000.0
#
#data_copy['sea_surf_min_elev'] = np.nan
#data_copy['sea_surf_avg_elev'] = np.nan
#data_copy['sea_surf_min_dist'] = np.nan
#print(radius)
#find_sea_surface(radius)
#
#nan_count = data_copy['sea_surf_min_elev'].isna().sum()
#print("min elev nan ",nan_count)
#nan_count = data_copy['sea_surf_avg_elev'].isna().sum()
#print("avg elev nan ",nan_count)
#nan_count = data_copy['sea_surf_min_dist'].isna().sum()
#print("min dist elev nan ",nan_count)
#
#print(data_copy['sea_surf_min_elev'].isnull().values.any())
#print(data_copy['sea_surf_avg_elev'].isnull().values.any())
#print(data_copy['sea_surf_min_dist'].isnull().values.any())
#
#### freeboard calculation
#data_copy['freeboard_min_elev'] = data_copy['h_cor_mean'] - data_copy['sea_surf_min_elev']
#data_copy['freeboard_avg_elev'] = data_copy['h_cor_mean'] - data_copy['sea_surf_avg_elev']
#data_copy['freeboard_min_dist_elev'] = data_copy['h_cor_mean'] - data_copy['sea_surf_min_dist']
#
#data_copy.to_csv("ATL03_labeled_data/labeled_by_is2trackline/ATL03_20191123180255_08830510_T04CEV_gt2r_labeled.csv".replace(".csv", "_5km.csv"))

## Relative height (freeboard) calculation for all files, saved to CSV

In [4]:
# Driver cell: for every "done" track CSV, estimate the local sea surface with
# a sliding window and store the height of each row relative to it.
import glob, os
import time

src_dir_path = "ATL03_labeled_data/cor_labeled_by_is2trackline_10m/done/"
dest_dir_path = "ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height/"
filelist = sorted(glob.glob(src_dir_path + "ATL03_" + "*_labeled_10m_done.csv"))

# DataFrame.to_csv does not create missing directories; make sure the output
# folder exists before the slow per-file processing starts.
os.makedirs(dest_dir_path, exist_ok=True)

#radius 5 km (identical for every file, so it is set once; presumably metres)
radius = 5000.0

begin = time.time()

for file in filelist:
    print(file)
    start = time.time()
    labeled_data = pd.read_csv(file, index_col=0)
    # data_copy is read as a global by the sea-surface functions. Sorting
    # along track and renumbering 0..n-1 matters because those functions treat
    # index labels as row positions.
    data_copy = labeled_data.sort_values(by=['x_atc'])
    data_copy = data_copy.loc[:, ~data_copy.columns.str.contains('^Unnamed')]
    #data['FID'] = range(1, len(data) + 1)
    data_copy = data_copy[data_copy.h_cor_mean <= 10]
    data_copy = data_copy[data_copy['label'].notna()]
    data_copy = data_copy.reset_index(drop=True)

    # Output columns start as NaN; windows without open water leave them NaN.
    data_copy['rel_sea_surf_min_elev'] = np.nan
    data_copy['rel_sea_surf_avg_elev'] = np.nan
    data_copy['rel_sea_surf_min_dist'] = np.nan
    print(radius)

    #break
    # The previous message claimed the call was commented out although it runs.
    print("running find_sea_surface_ow")
    find_sea_surface_ow(radius)

    nan_count = data_copy['rel_sea_surf_min_elev'].isna().sum()
    print("min elev nan ",nan_count)
    nan_count = data_copy['rel_sea_surf_avg_elev'].isna().sum()
    print("avg elev nan ",nan_count)
    nan_count = data_copy['rel_sea_surf_min_dist'].isna().sum()
    print("min dist elev nan ",nan_count)

    print(data_copy['rel_sea_surf_min_elev'].isnull().values.any())
    print(data_copy['rel_sea_surf_avg_elev'].isnull().values.any())
    print(data_copy['rel_sea_surf_min_dist'].isnull().values.any())

    ### freeboard calculation
    data_copy['rel_height_min_elev'] = data_copy['h_cor_mean'] - data_copy['rel_sea_surf_min_elev']
    data_copy['rel_height_avg_elev'] = data_copy['h_cor_mean'] - data_copy['rel_sea_surf_avg_elev']
    data_copy['rel_height_min_dist_elev'] = data_copy['h_cor_mean'] - data_copy['rel_sea_surf_min_dist']

    data_copy.to_csv(file.replace(src_dir_path, dest_dir_path).replace(".csv", "_rel_height.csv"))
    del labeled_data, data_copy
    #break
    end = time.time() - start
    print("Time per file is ",end)
    print("------------ File done ------------")

end = time.time() - begin
print("Total time needed ",end)
print("------------ Finish ------------")    


ATL03_labeled_data/cor_labeled_by_is2trackline_10m/done/ATL03_20191104195311_05940510_T02CNA_gt1r_labeled_10m_done.csv
5000.0
find_sea_surface commented out for notebook storage issue
data length  11439
28489900 28484900.0 28494900.0
         x_atc  h_cor_mean  label
0     28489900    0.381907      0
1     28489902    0.392659      0
2     28489904    0.351589      0
3     28489906    0.355799      0
4     28489908    0.325980      0
...        ...         ...    ...
2494  28494890    0.373782      0
2495  28494892    0.376476      0
2496  28494894    0.337074      0
2497  28494896    0.344850      0
2498  28494898    0.407554      0

[2499 rows x 3 columns]
0
2499 1
28494900 28489900.0 28499900.0
         x_atc  h_cor_mean  label
1     28489902    0.392659      0
2     28489904    0.351589      0
3     28489906    0.355799      0
4     28489908    0.325980      0
5     28489910    0.361949      0
...        ...         ...    ...
4878  28499890    0.275554      0
4879  28499892    0.3

## Interpolate relative sea surface

In [5]:
# Driver cell: fill the NaN sea-surface estimates (windows without open water)
# by linear interpolation and recompute the relative heights.
import glob, os
import time

src_dir_path = "ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height/"
dest_dir_path = "ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated/"
filelist = sorted(glob.glob(src_dir_path + "ATL03_" + "*_rel_height.csv"))

# DataFrame.to_csv does not create missing directories.
os.makedirs(dest_dir_path, exist_ok=True)

# (sea-surface column, derived relative-height column, log prefix)
surface_columns = [
    ('rel_sea_surf_min_elev', 'rel_height_min_elev', "min elev nan "),
    ('rel_sea_surf_avg_elev', 'rel_height_avg_elev', "avg elev nan "),
    ('rel_sea_surf_min_dist', 'rel_height_min_dist_elev', "min dist elev nan "),
]

begin = time.time()

for file in filelist:
    print(file)
    start = time.time()
    labeled_data = pd.read_csv(file, index_col=0)
    data_copy = labeled_data.sort_values(by=['x_atc'])
    data_copy = data_copy.loc[:, ~data_copy.columns.str.contains('^Unnamed')]
    #data['FID'] = range(1, len(data) + 1)
    data_copy = data_copy[data_copy.h_cor_mean <= 10]
    data_copy = data_copy[data_copy['label'].notna()]
    data_copy = data_copy.reset_index(drop=True)

    # Report how much is missing before interpolation.
    for surface_col, _, log_prefix in surface_columns:
        nan_count = data_copy[surface_col].isna().sum()
        print(log_prefix,nan_count)

    for surface_col, _, _ in surface_columns:
        print(data_copy[surface_col].isnull().values.any())

    #break
    #interpolation
    # Linear interpolation runs over row position (rows are sorted by x_atc);
    # limit_direction='both' also fills leading and trailing NaNs. A column
    # that is entirely NaN stays NaN.
    for surface_col, height_col, _ in surface_columns:
        data_copy[surface_col] = data_copy[surface_col].interpolate(method='linear', limit_direction='both', axis=0)

    ### rel_height calculation
    for surface_col, height_col, _ in surface_columns:
        data_copy[height_col] = data_copy['h_cor_mean'] - data_copy[surface_col]

    data_copy.to_csv(file.replace(src_dir_path, dest_dir_path).replace(".csv", "_interpolated.csv"))
    del labeled_data, data_copy
    #break
    end = time.time() - start
    print("Time per file is ",end)
    print("------------ File done ------------")

end = time.time() - begin
print("Total time needed ",end)
print("------------ Finish ------------")    




ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height/ATL03_20191104195311_05940510_T02CNA_gt1r_labeled_10m_done_rel_height.csv
min elev nan  1
avg elev nan  1
min dist elev nan  1
True
True
True
Time per file is  0.4714388847351074
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height/ATL03_20191104195311_05940510_T02CNA_gt2r_labeled_10m_done_rel_height.csv
min elev nan  698
avg elev nan  698
min dist elev nan  698
True
True
True
Time per file is  0.03331804275512695
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height/ATL03_20191104195311_05940510_T02CNB_gt1r_labeled_10m_done_rel_height.csv
min elev nan  23261
avg elev nan  23261
min dist elev nan  23261
True
True
True
Time per file is  1.874849557876587
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height/ATL03_20191104195311_05940510_T02CNB_gt2r_labeled_10m_done_rel_height

## Calculate h_diff = h_cor_mean - h_cor_med

In [6]:
# Driver cell: add h_diff (mean minus median corrected height) to every
# interpolated relative-height CSV.
import glob, os
import time

src_dir_path = "ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated/"
dest_dir_path = "ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated_h_diff/"
filelist = sorted(glob.glob(src_dir_path + "ATL03_" + "*_interpolated.csv"))

# DataFrame.to_csv does not create missing directories.
os.makedirs(dest_dir_path, exist_ok=True)

begin = time.time()

for file in filelist:
    print(file)
    start = time.time()
    labeled_data = pd.read_csv(file, index_col=0)
    # Same clean-up as in the earlier stages: sort along track, drop stray
    # "Unnamed" columns, keep rows with h_cor_mean <= 10 and a label.
    data_copy = labeled_data.sort_values(by=['x_atc'])
    data_copy = data_copy.loc[:, ~data_copy.columns.str.contains('^Unnamed')]
    #data['FID'] = range(1, len(data) + 1)
    data_copy = data_copy[data_copy.h_cor_mean <= 10]
    data_copy = data_copy[data_copy['label'].notna()]
    data_copy = data_copy.reset_index(drop=True)

    ### h_diff calculation
    data_copy['h_diff'] = data_copy['h_cor_mean'] - data_copy['h_cor_med']

    data_copy.to_csv(file.replace(src_dir_path, dest_dir_path).replace(".csv", "_h_diff.csv"))
    del labeled_data, data_copy
    #break
    end = time.time() - start
    print("Time per file is ",end)
    print("------------ File done ------------")

end = time.time() - begin
print("Total time needed ",end)
print("------------ Finish ------------")    




ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated/ATL03_20191104195311_05940510_T02CNA_gt1r_labeled_10m_done_rel_height_interpolated.csv
Time per file is  0.7246870994567871
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated/ATL03_20191104195311_05940510_T02CNA_gt2r_labeled_10m_done_rel_height_interpolated.csv
Time per file is  0.047849178314208984
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated/ATL03_20191104195311_05940510_T02CNB_gt1r_labeled_10m_done_rel_height_interpolated.csv
Time per file is  2.2047715187072754
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2trackline_10m/relative_height_interpolated/ATL03_20191104195311_05940510_T02CNB_gt2r_labeled_10m_done_rel_height_interpolated.csv
Time per file is  1.9402530193328857
------------ File done ------------
ATL03_labeled_data/cor_labeled_by_is2t