In [1]:
import pandas as pd
import math
import numpy as np
from timeit import default_timer as timer
import tensorflow as tf

print("tensorflow version:",tf.__version__)

tensorflow version: 2.3.0


In [2]:
# Fix the random seed so repeated runs give the same results. The same value
# seeds NumPy's global RNG and TensorFlow's global RNG, and is reused later as
# the `random_state` of the pandas 80/20 split in `infer_test_generate`.
seed=1920
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
# Station identifiers (hard-coded). A station's position in this list is the
# index used to locate its group of 10 columns in the merged station array and
# to decide which cross-validation fold it is held out in.
# NOTE(review): assumes the column groups in merged_station.csv follow this exact order — confirm.
stations = [70351, 70217, 65103, 66194, 68192, 75041, 66037, 63291,
 73138, 51049, 62100, 58198, 67113, 61078, 61363, 69148,
 61287, 51161, 74148, 58208, 66161, 62101, 47048, 65068,
 69139, 59007, 58214, 60141, 68262, 66137, 58012, 75019,
 56238, 63292, 49000, 67105, 63303, 58077, 68257, 66212,
 55202, 68242, 74258, 65111, 58212, 70330, 48245, 54038,
 72160, 72162, 72161, 50017, 60139, 61375, 68072, 68239,
 61425, 46012, 64017, 69128, 68228, 67108, 69137, 52088,
 61392, 67119, 55325, 61055, 50137, 69138, 61366, 65070,
 61260, 69147, 68241]

# Min-max normalisation bounds, laid out as
# [distance max, distance min, dem max, dem min, ndvi max, ndvi min].
# `fold_max_min_old` holds one such row per fold; it is only referenced by
# commented-out code in `infer_test_generate` and is otherwise unused.
fold_max_min_old = [[12.255430642780357, 0.08258964826174785, 1480.9, 0.0, 8.142000000000001, 0.0],
                [10.925501092856122, 0.07230995782047492, 1481.1000000000001, 0.0, 8.142000000000001, 0.0],
                [12.496770047496275, 0.07230995782047492, 1481.1000000000001, 0.0, 8.142000000000001, 0.0],
                [12.496770047496275, 0.07230995782047492, 1128.7, 0.0, 8.094999999999999, 0.0],
                [12.496770047496275, 0.07230995782047492, 1481.1000000000001, 0.0, 7.982, 0.0]]

# Single set of bounds (same layout as above) applied to every fold by
# `infer_test_generate`.
fold_max_min = [15.708357648080208, 0, 1489.6967605590821, 0, 11, 0]

# Number of consecutive entries of `stations` that form one fold's held-out
# block: fold k covers indices [k*15, (k+1)*15).
fold_station_len = 15

In [4]:
# Load the merged per-station table from CSV (path is relative to the notebook).
base_str = "../BOM/spatial_int/merged_station.csv"
base_df = pd.read_csv(base_str)
# NaN rows are NOT removed here (the two lines below are disabled); rows with
# NaN are dropped later, per station pair, inside `infer_test_generate`.
#base_df.dropna(inplace=True)
#base_df.reset_index(drop=True, inplace=True)
# Preview the first rows.
base_df.head()

Unnamed: 0,70351Lon,70351Lat,70351DEM,70351ndvi,70351temp,70351dew,70351RH,70351Nwind,70351Ewind,70351MinTemp,...,68241Lon,68241Lat,68241DEM,68241ndvi,68241temp,68241dew,68241RH,68241Nwind,68241Ewind,68241MinTemp
0,149.2004,-35.3088,577.1,3514.0,15.8,15.0,95.0,8.375461,-4.267511,15.5,...,150.79,-34.5638,8.0,4649.0,21.5,20.7,95.0,0.0,0.0,21.5
1,149.2004,-35.3088,577.1,3514.0,15.8,15.0,95.0,8.140639,-4.7,15.5,...,150.79,-34.5638,8.0,4649.0,21.5,20.7,95.0,0.0,0.0,21.5
2,149.2004,-35.3088,577.1,3514.0,15.7,14.9,95.0,7.700029,-5.391619,15.5,...,150.79,-34.5638,8.0,4649.0,21.6,20.8,95.0,0.0,0.0,21.5
3,149.2004,-35.3088,577.1,3514.0,15.6,14.8,95.0,5.906309,-4.782835,15.5,...,150.79,-34.5638,8.0,4649.0,21.6,20.8,95.0,0.0,0.0,21.5
4,149.2004,-35.3088,577.1,3514.0,15.5,14.5,94.0,6.148529,-4.467168,15.5,...,150.79,-34.5638,8.0,4649.0,21.6,20.8,95.0,0.0,0.0,21.5


In [5]:
# Convert the merged station table to a plain 2-D NumPy array so later code can
# slice station columns by position (station k is read from columns k*10 .. k*10+9).
# NOTE(review): that slicing assumes the frame holds exactly 10 columns per station
# with no extra leading column (e.g. a saved index) — confirm against the CSV.
base_ar = base_df.to_numpy()

In [6]:
# Generate Training data
def generate_train_more(base_ar, fold_index, id_index):
    """Build the pairwise sample array for one target station.

    Every station that is neither the target itself nor part of the fold's
    held-out block is paired with the target. For each such partner the
    function emits one 14-column block containing, in order:

    * columns 0-3 of the target station's column group,
    * columns 0-3 of the partner station's column group,
    * columns 4-8 of the target station's column group,
    * column 9 of the partner station's column group.

    Assumes each station occupies 10 consecutive columns of ``base_ar`` in the
    order of the global ``stations`` list (per the CSV header: Lon, Lat, DEM,
    ndvi, temp, dew, RH, Nwind, Ewind, MinTemp) — verify against the data.

    Parameters
    ----------
    base_ar : numpy.ndarray
        2-D array of all stations' columns side by side.
    fold_index : int
        Fold whose held-out block ``[fold_index * fold_station_len,
        (fold_index + 1) * fold_station_len)`` is excluded from pairing.
    id_index : int
        Position of the target station in ``stations``.

    Returns
    -------
    numpy.ndarray or float
        The per-partner blocks stacked row-wise, or ``nan`` when ``id_index``
        itself lies inside the held-out block.

    Raises
    ------
    ValueError
        If no partner station remains after the exclusions.
    """
    # Index range of the stations held out for this fold
    lower_bound = fold_index * fold_station_len
    upper_bound = (fold_index + 1) * fold_station_len

    # A held-out station is never used as a target.
    # np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
    if lower_bound <= id_index < upper_bound:
        return np.nan

    # The target station's slices do not depend on the partner: take them once
    target_start = id_index * 10
    target_static = base_ar[:, target_start:target_start + 4]
    target_weather = base_ar[:, target_start + 4:target_start + 9]

    station_train_list = []
    for i in range(len(stations)):
        # Skip held-out stations and the target itself
        if lower_bound <= i < upper_bound or i == id_index:
            continue
        partner_start = i * 10
        partner_static = base_ar[:, partner_start:partner_start + 4]
        partner_label = base_ar[:, partner_start + 9:partner_start + 10]
        station_train_list.append(
            np.concatenate((target_static, partner_static, target_weather, partner_label), axis=1)
        )

    if not station_train_list:
        raise ValueError(
            "no partner stations available for fold %s, station index %s"
            % (fold_index, id_index)
        )

    # Merge all partner blocks together row by row
    return np.concatenate(station_train_list, axis=0)

# Inference to generate test dataset for correlation
def infer_test_generate(train_ar,fold_index,id_index):
    """Run a station's saved model on its held-back samples and return
    normalised pairwise distances alongside the absolute prediction error.

    Steps:

    1. Wrap ``train_ar`` in a DataFrame, divide both NDVI columns by 1000 and
       drop rows containing NaN.
    2. Sample 80% of the rows (``random_state=seed``); the remaining 20% form
       the test set used here.
    3. Load the Keras model stored at
       ``../Models/Spatial/ex2/<fold_index>/<station id>_ann.h5`` and predict
       from the first 13 columns of the test set.
    4. Min-max normalise the geographic, DEM and NDVI differences between the
       two stations of each row with the global ``fold_max_min`` bounds
       (the same bounds are used for every fold).

    Parameters
    ----------
    train_ar : numpy.ndarray
        14-column array as produced by ``generate_train_more``.
    fold_index : int
        Fold number; selects the model sub-directory.
    id_index : int
        Position of the target station in ``stations``; selects the model file.

    Returns
    -------
    numpy.ndarray
        One row per test sample with columns
        ``[norm geo distance, norm DEM difference, norm NDVI difference,
        absolute error]``. The NDVI column is NaN wherever either station's
        scaled NDVI equals -3.
    """
    column_names = ['Lon','Lat','DEM','ndvi','bLon','bLat','bDEM','bndvi','temp','dew','RH','Nwind','Ewind','MinTemp']
    train_df = pd.DataFrame(train_ar, columns = column_names)

    train_df['ndvi'] = train_df['ndvi']/1000.0
    train_df['bndvi'] = train_df['bndvi']/1000.0

    # Remove rows with NaN data (new frame instead of in-place mutation)
    train_df = train_df.dropna().reset_index(drop=True)

    # Randomly allocate data to training (80%) and testing (20%) sets
    train_dataset = train_df.sample(frac=0.8,random_state=seed)

    # The test set is whatever was not sampled into the training set
    test_dataset = train_df.drop(train_dataset.index).reset_index(drop=True)

    # Load model
    model_path="../Models/Spatial/ex2/" + str(fold_index)+'/' + str(stations[id_index]) + '_ann.h5'
    current_model = tf.keras.models.load_model(model_path)
    test_arr = test_dataset.to_numpy()

    # Predict from the 13 feature columns; column 13 (MinTemp) is the label.
    # NOTE(review): the error computation below assumes predictions have shape
    # (n_samples, 1) so they align with the label column slice — confirm.
    predict_results = current_model.predict(test_arr[:, 0:13], batch_size=1024)

    # Bounds layout: [geo max, geo min, dem max, dem min, ndvi max, ndvi min]
    geo_max, geo_min, dem_max, dem_min, ndvi_max, ndvi_min = fold_max_min

    # Geo: Euclidean distance in (Lon, Lat) between the two stations
    geo_lon_diff_arr = test_arr[:,0:1] - test_arr[:,4:5]
    geo_lat_diff_arr = test_arr[:,1:2] - test_arr[:,5:6]
    geo_distance_arr = ((geo_lon_diff_arr * geo_lon_diff_arr) + (geo_lat_diff_arr * geo_lat_diff_arr)) ** (0.5)
    norm_dist_geo = (geo_distance_arr - geo_min)/(geo_max - geo_min)

    # DEM: absolute elevation difference
    norm_dist_dem = (np.abs(test_arr[:,2:3] - test_arr[:,6:7]) - dem_min)/(dem_max - dem_min)

    # NDVI: work on copies so the -3 marker can be masked without touching test_arr.
    # np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
    # NOTE(review): -3 is presumably a missing-value marker after the /1000 scaling — confirm.
    a_ndvi_arr = test_arr[:,3:4].copy()
    b_ndvi_arr = test_arr[:,7:8].copy()
    a_ndvi_arr[a_ndvi_arr==-3] = np.nan
    b_ndvi_arr[b_ndvi_arr==-3] = np.nan
    norm_dist_ndvi = (np.abs(a_ndvi_arr - b_ndvi_arr) - ndvi_min)/(ndvi_max - ndvi_min)

    # Error: absolute difference between prediction and label
    norm_dist_error = np.abs(predict_results-test_arr[:,13:14])

    # Store Normalized Distances and Error
    corr_test_data = np.concatenate((norm_dist_geo,norm_dist_dem,norm_dist_ndvi,norm_dist_error),axis = 1)
    return corr_test_data

In [7]:
class bcolors:
    """ANSI escape sequences for colouring and styling printed text.

    Concatenate one of these before a message and ``ENDC`` after it.
    """

    # Reset all attributes back to the default rendition
    ENDC = '\033[0m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Bright foreground colours, in ascending SGR code order (91-96)
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

# Per-fold Pearson correlation between each normalised distance measure
# (geo, DEM, NDVI) and the absolute prediction error.
fold_pearson = []
# Folds are consecutive blocks of `fold_station_len` stations (75 // 15 == 5 here)
n_folds = len(stations) // fold_station_len
for fold_index in range(n_folds):
    current_test_list = []
    # Index range of the stations held out for this fold
    lower_bound = fold_index * fold_station_len
    upper_bound = (fold_index+1) * fold_station_len
    for i in range(len(stations)):
        # Stations inside the held-out block are skipped. The check comes before
        # the progress message so that skipped stations are no longer reported
        # as being processed.
        if lower_bound <= i < upper_bound:
            continue
        print(bcolors.OKGREEN + "Start Process " + "fold " + str(fold_index) + " Station " + str(i) + bcolors.ENDC)
        train_ar = generate_train_more(base_ar,fold_index,i)
        test_ar = infer_test_generate(train_ar,fold_index,i)
        current_test_list.append(test_ar)
    # Columns: norm_geo, norm_dem, norm_ndvi, error
    fold_test_dataset = np.concatenate(current_test_list, axis=0)

    # Perform pearson correlation of each distance measure against the error
    error_col = fold_test_dataset[:,3]
    geo_corr = np.corrcoef(fold_test_dataset[:,0], error_col)
    dem_corr = np.corrcoef(fold_test_dataset[:,1], error_col)
    # The NDVI column can contain NaN; keep only rows where both NDVI distance
    # and error are present before correlating
    ndvi_temp_arr = fold_test_dataset[:,2:4]
    ndvi_temp_arr = ndvi_temp_arr[~np.isnan(ndvi_temp_arr).any(axis=1)]
    ndvi_corr = np.corrcoef(ndvi_temp_arr[:,0], ndvi_temp_arr[:,1])
    # corrcoef returns a 2x2 matrix; the off-diagonal entry is the coefficient
    current_corr_list = [geo_corr[0][1],dem_corr[0][1],ndvi_corr[0][1]]
    fold_pearson.append(current_corr_list)
    print(bcolors.FAIL + "Finished " + "fold " + str(fold_index) + bcolors.ENDC)

[92mStart Process fold 0 Station 0[0m
[92mStart Process fold 0 Station 1[0m
[92mStart Process fold 0 Station 2[0m
[92mStart Process fold 0 Station 3[0m
[92mStart Process fold 0 Station 4[0m
[92mStart Process fold 0 Station 5[0m
[92mStart Process fold 0 Station 6[0m
[92mStart Process fold 0 Station 7[0m
[92mStart Process fold 0 Station 8[0m
[92mStart Process fold 0 Station 9[0m
[92mStart Process fold 0 Station 10[0m
[92mStart Process fold 0 Station 11[0m
[92mStart Process fold 0 Station 12[0m
[92mStart Process fold 0 Station 13[0m
[92mStart Process fold 0 Station 14[0m
[92mStart Process fold 0 Station 15[0m
[92mStart Process fold 0 Station 16[0m
[92mStart Process fold 0 Station 17[0m
[92mStart Process fold 0 Station 18[0m
[92mStart Process fold 0 Station 19[0m
[92mStart Process fold 0 Station 20[0m
[92mStart Process fold 0 Station 21[0m
[92mStart Process fold 0 Station 22[0m
[92mStart Process fold 0 Station 23[0m
[92mStart Process fold 0 

[92mStart Process fold 2 Station 50[0m
[92mStart Process fold 2 Station 51[0m
[92mStart Process fold 2 Station 52[0m
[92mStart Process fold 2 Station 53[0m
[92mStart Process fold 2 Station 54[0m
[92mStart Process fold 2 Station 55[0m
[92mStart Process fold 2 Station 56[0m
[92mStart Process fold 2 Station 57[0m
[92mStart Process fold 2 Station 58[0m
[92mStart Process fold 2 Station 59[0m
[92mStart Process fold 2 Station 60[0m
[92mStart Process fold 2 Station 61[0m
[92mStart Process fold 2 Station 62[0m
[92mStart Process fold 2 Station 63[0m
[92mStart Process fold 2 Station 64[0m
[92mStart Process fold 2 Station 65[0m
[92mStart Process fold 2 Station 66[0m
[92mStart Process fold 2 Station 67[0m
[92mStart Process fold 2 Station 68[0m
[92mStart Process fold 2 Station 69[0m
[92mStart Process fold 2 Station 70[0m
[92mStart Process fold 2 Station 71[0m
[92mStart Process fold 2 Station 72[0m
[92mStart Process fold 2 Station 73[0m
[92mStart Proce

In [8]:
# Display the results: one row per fold, each row being the Pearson correlation
# of [geo distance, DEM difference, NDVI difference] with the absolute error.
fold_pearson

[[0.1629152369408916, 0.013211451777692786, 0.02895956579799682],
 [0.17684879332291165, 0.020482856069401752, 0.023767898945956042],
 [0.16117969829801523, 0.022204014820531602, 0.01771150510073388],
 [0.18040094526216574, 0.011374845692810416, 0.026877141145743525],
 [0.16013585322014295, 0.011004783014932655, 0.0259859970407614]]

In [None]:
# Old: fold_pearson values pasted from an earlier run, kept for comparison.
# NOTE(review): presumably produced with the per-fold bounds in
# `fold_max_min_old` — confirm. Each row is one fold: [geo, dem, ndvi].
[[0.16291523694081764, 0.013211451777692177, 0.028959565798005314],
 [0.17684879332280815, 0.020482856069392284, 0.02376789894594458],
 [0.16117969829774614, 0.022204014820559396, 0.01771150510072165],
 [0.18040094526208264, 0.01137484569280127, 0.026877141145749076],
 [0.16013585321999554, 0.011004783014941564, 0.025985997040750525]]

In [9]:
# New: pasted copy of the `fold_pearson` values shown above, i.e. the results
# obtained with the single global `fold_max_min` bounds.
# Each row is one fold: [geo, dem, ndvi].
[[0.1629152369408916, 0.013211451777692786, 0.02895956579799682],
 [0.17684879332291165, 0.020482856069401752, 0.023767898945956042],
 [0.16117969829801523, 0.022204014820531602, 0.01771150510073388],
 [0.18040094526216574, 0.011374845692810416, 0.026877141145743525],
 [0.16013585322014295, 0.011004783014932655, 0.0259859970407614]]

[15.708357648080208, 0, 1489.6967605590821, 0, 11, 0]