In [1]:
import pandas as pd
import math
import numpy as np
from timeit import default_timer as timer
import tensorflow as tf

print("tensorflow version:",tf.__version__)

tensorflow version: 2.3.0


In [2]:
# Fix the random seed so runs are repeatable: the same value seeds NumPy's
# global RNG and TensorFlow's global RNG, and train_model reuses it as the
# random_state of its train/test split.
seed=1920
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
# Station identifiers (hard-coded, not read from a file).
# The position of an id in this list is the station index (`id_index`) used by
# the generate_train* helpers, where each consecutive run of 15 indices forms
# one held-out fold; stations[id_index] is also the file-name prefix used by
# train_model when it writes reports and models.
stations = [70351, 70217, 65103, 66194, 68192, 75041, 66037, 63291,
 73138, 51049, 62100, 58198, 67113, 61078, 61363, 69148,
 61287, 51161, 74148, 58208, 66161, 62101, 47048, 65068,
 69139, 59007, 58214, 60141, 68262, 66137, 58012, 75019,
 56238, 63292, 49000, 67105, 63303, 58077, 68257, 66212,
 55202, 68242, 74258, 65111, 58212, 70330, 48245, 54038,
 72160, 72162, 72161, 50017, 60139, 61375, 68072, 68239,
 61425, 46012, 64017, 69128, 68228, 67108, 69137, 52088,
 61392, 67119, 55325, 61055, 50137, 69138, 61366, 65070,
 61260, 69147, 68241]

In [4]:
# Read the merged per-station table (one group of columns per station id)
base_str = "../BOM/spatial_int/merged_station.csv"
base_df = pd.read_csv(base_str)
# Rows containing NaN are NOT removed here (the two lines below are disabled);
# train_model drops NaN rows from each assembled training frame instead.
#base_df.dropna(inplace=True)
#base_df.reset_index(drop=True, inplace=True)
# Preview the first rows
base_df.head()

Unnamed: 0,70351Lon,70351Lat,70351DEM,70351ndvi,70351temp,70351dew,70351RH,70351Nwind,70351Ewind,70351MinTemp,...,68241Lon,68241Lat,68241DEM,68241ndvi,68241temp,68241dew,68241RH,68241Nwind,68241Ewind,68241MinTemp
0,149.2004,-35.3088,577.1,3514.0,15.8,15.0,95.0,8.375461,-4.267511,15.5,...,150.79,-34.5638,8.0,4649.0,21.5,20.7,95.0,0.0,0.0,21.5
1,149.2004,-35.3088,577.1,3514.0,15.8,15.0,95.0,8.140639,-4.7,15.5,...,150.79,-34.5638,8.0,4649.0,21.5,20.7,95.0,0.0,0.0,21.5
2,149.2004,-35.3088,577.1,3514.0,15.7,14.9,95.0,7.700029,-5.391619,15.5,...,150.79,-34.5638,8.0,4649.0,21.6,20.8,95.0,0.0,0.0,21.5
3,149.2004,-35.3088,577.1,3514.0,15.6,14.8,95.0,5.906309,-4.782835,15.5,...,150.79,-34.5638,8.0,4649.0,21.6,20.8,95.0,0.0,0.0,21.5
4,149.2004,-35.3088,577.1,3514.0,15.5,14.5,94.0,6.148529,-4.467168,15.5,...,150.79,-34.5638,8.0,4649.0,21.6,20.8,95.0,0.0,0.0,21.5


In [5]:
# Convert the DataFrame to a plain NumPy array; the generate_train* helpers
# address it purely by column position (station k -> columns k*10 .. k*10+9).
# NOTE(review): the head() output above lists "Unnamed: 0" before the first
# station column. If that is a real CSV column (and not just the displayed
# index), every positional slice downstream is shifted by one column - confirm,
# and drop that column before converting if so.
base_ar = base_df.to_numpy()

In [10]:
# Generate training data (relative static features)
def generate_train(base_ar, fold_index, id_index, fold_size=15):
    """Assemble the training matrix for one station using relative static features.

    ``base_ar`` is addressed by column position only: station ``k`` is expected
    to occupy the 10 consecutive columns starting at ``k*10`` - 4 static
    columns, 5 weather columns and 1 label column (Lon, Lat, DEM, ndvi, temp,
    dew, RH, Nwind, Ewind, MinTemp in the loaded table).

    For every station ``i`` that is neither ``id_index`` itself nor part of the
    held-out fold, a block with the same number of rows as ``base_ar`` is built:

    * columns 0-3: static columns of station ``id_index`` minus those of ``i``
    * columns 4-8: weather columns of station ``id_index``
    * column 9:    label column of station ``i``

    All blocks are stacked row-wise.

    Parameters
    ----------
    base_ar : numpy.ndarray
        2-D array holding all stations side by side.
    fold_index : int
        Index of the held-out fold; stations with index in
        ``[fold_index*fold_size, (fold_index+1)*fold_size)`` are excluded.
    id_index : int
        Index (position in the global ``stations`` list) of the target station.
    fold_size : int, optional
        Number of consecutive stations per fold (default 15, the value that
        was previously hard-coded).

    Returns
    -------
    numpy.ndarray or float
        The stacked 10-column training matrix, or ``np.nan`` when ``id_index``
        itself belongs to the held-out fold (callers test for this sentinel).
    """
    lower_bound = fold_index * fold_size
    upper_bound = (fold_index + 1) * fold_size
    # The target station is one of this fold's test stations: nothing to train.
    if lower_bound <= id_index < upper_bound:
        # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan

    # Slices of the target station do not depend on the loop variable.
    target_col = id_index * 10
    target_static = base_ar[:, target_col:target_col + 4]
    target_weather = base_ar[:, target_col + 4:target_col + 9]

    station_train_list = []
    for i in range(len(stations)):
        # Skip held-out stations and the target station itself.
        if lower_bound <= i < upper_bound or i == id_index:
            continue
        other_col = i * 10
        relatives = target_static - base_ar[:, other_col:other_col + 4]
        other_label = base_ar[:, other_col + 9:other_col + 10]
        station_train_list.append(
            np.concatenate((relatives, target_weather, other_label), axis=1)
        )
    # merge all stations together row by row
    return np.concatenate(station_train_list, axis=0)

# Generate training data (absolute static features of both stations)
def generate_train_more(base_ar, fold_index, id_index, fold_size=15):
    """Assemble the training matrix for one station using both stations' static features.

    ``base_ar`` is addressed by column position only: station ``k`` is expected
    to occupy the 10 consecutive columns starting at ``k*10`` - 4 static
    columns, 5 weather columns and 1 label column (Lon, Lat, DEM, ndvi, temp,
    dew, RH, Nwind, Ewind, MinTemp in the loaded table).

    For every station ``i`` that is neither ``id_index`` itself nor part of the
    held-out fold, a block with the same number of rows as ``base_ar`` is built:

    * columns 0-3:  static columns of station ``id_index``
    * columns 4-7:  static columns of station ``i``
    * columns 8-12: weather columns of station ``id_index``
    * column 13:    label column of station ``i``

    All blocks are stacked row-wise.

    Parameters
    ----------
    base_ar : numpy.ndarray
        2-D array holding all stations side by side.
    fold_index : int
        Index of the held-out fold; stations with index in
        ``[fold_index*fold_size, (fold_index+1)*fold_size)`` are excluded.
    id_index : int
        Index (position in the global ``stations`` list) of the target station.
    fold_size : int, optional
        Number of consecutive stations per fold (default 15, the value that
        was previously hard-coded).

    Returns
    -------
    numpy.ndarray or float
        The stacked 14-column training matrix, or ``np.nan`` when ``id_index``
        itself belongs to the held-out fold (callers test for this sentinel).
    """
    lower_bound = fold_index * fold_size
    upper_bound = (fold_index + 1) * fold_size
    # The target station is one of this fold's test stations: nothing to train.
    if lower_bound <= id_index < upper_bound:
        # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
        return np.nan

    # Slices of the target station do not depend on the loop variable.
    target_col = id_index * 10
    target_static = base_ar[:, target_col:target_col + 4]
    target_weather = base_ar[:, target_col + 4:target_col + 9]

    station_train_list = []
    for i in range(len(stations)):
        # Skip held-out stations and the target station itself.
        if lower_bound <= i < upper_bound or i == id_index:
            continue
        other_col = i * 10
        other_static = base_ar[:, other_col:other_col + 4]
        other_label = base_ar[:, other_col + 9:other_col + 10]
        station_train_list.append(
            np.concatenate(
                (target_static, other_static, target_weather, other_label), axis=1
            )
        )
    # merge all stations together row by row
    return np.concatenate(station_train_list, axis=0)

# Callback that records how long each training epoch takes
class TimingCallback(tf.keras.callbacks.Callback):
    """Keras callback that stores the wall-clock duration of every epoch.

    After training, ``logs`` holds one float per completed epoch: the elapsed
    time in seconds between ``on_epoch_begin`` and ``on_epoch_end``, measured
    with ``timeit.default_timer``.
    """

    def __init__(self, logs=None):
        # ``logs`` is accepted only for backward compatibility and is ignored.
        # Initialise the Keras base class so its own attributes are set up.
        super().__init__()
        self.logs = []
        self.starttime = None

    def on_epoch_begin(self, epoch, logs=None):
        # Remember when the epoch started.
        self.starttime = timer()

    def on_epoch_end(self, epoch, logs=None):
        # Append the duration of the epoch that just finished.
        self.logs.append(timer() - self.starttime)

# Training
def train_model(train_ar,fold_index,id_index):
    """Fit, evaluate and save one neural network for the station at ``id_index``.

    Parameters
    ----------
    train_ar : array-like with 14 columns
        Training matrix in the column order produced by ``generate_train_more``
        (the 10-column output of ``generate_train`` does not match the column
        names used here).
    fold_index : int
        Fold number; used as the sub-folder name for reports and the model.
    id_index : int
        Position of the station in the global ``stations`` list;
        ``stations[id_index]`` is the file-name prefix of everything written.

    Side effects
    ------------
    Writes ``<station>_ann_history.csv``, ``<station>_ann_train.csv`` and
    ``<station>_ann_loss.txt`` under ``../Reports/Spatial/ex2/<fold_index>/``
    and ``<station>_ann.h5`` under ``../Models/Spatial/ex2/<fold_index>/``.
    Both folders are created if they do not exist.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output folders

    # Resolve output locations and create them up front, so a missing folder
    # is not discovered only after the whole training run has finished.
    station_id = str(stations[id_index])
    report_dir = "../Reports/Spatial/ex2/" + str(fold_index) + '/'
    model_dir = "../Models/Spatial/ex2/" + str(fold_index) + '/'
    os.makedirs(report_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    train_df = pd.DataFrame(train_ar, columns = ['Lon','Lat','DEM','ndvi','bLon','bLat','bDEM','bndvi','temp','dew','RH','Nwind','Ewind','MinTemp'])

    # Bring both ndvi columns to a smaller numeric range
    train_df['ndvi'] = train_df['ndvi']/1000.0
    train_df['bndvi'] = train_df['bndvi']/1000.0

    # Remove rows with NaN data
    train_df.dropna(inplace=True)
    train_df.reset_index(drop=True, inplace=True)

    # Randomly allocate 80% of the rows to the training set ...
    train_dataset = train_df.sample(frac=0.8,random_state=seed)

    # ... and use the remaining 20% as the testing set
    test_dataset = train_df.drop(train_dataset.index)

    # Extract the MinTemp Series as labels (Expected Outputs)
    train_labels = train_dataset.pop('MinTemp')
    test_labels = test_dataset.pop('MinTemp')

    # Define a Neural Network
    def build_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(10, activation='relu', input_shape=[len(train_dataset.keys())]),
            tf.keras.layers.Dense(14, activation='relu'),
            tf.keras.layers.Dense(9, activation='relu'),
            tf.keras.layers.Dense(8, activation='relu'),
            tf.keras.layers.Dense(1, activation='linear')
        ])

        model.compile(optimizer='adam', loss='mse', metrics=['mse'])
        return model

    model = build_model()

    timetaken = TimingCallback()

    # Train for 50 epochs; Keras holds out 20% of the rows passed to fit() for validation
    history = model.fit(train_dataset.to_numpy(), train_labels.to_numpy(), batch_size=1024, epochs=50, validation_split = 0.2, callbacks = [timetaken])

    # Per-epoch metrics, with a 1-based epoch column
    hist_pd = pd.DataFrame(history.history)
    hist_pd['epoch'] = np.add(history.epoch, 1).tolist()

    # Per-epoch training time
    train_time = pd.DataFrame(timetaken.logs)
    train_time.columns = ['training seconds']

    # Testing: evaluate returns [loss, mse]; both are MSE here, only the loss is kept
    loss, _ = model.evaluate(test_dataset.to_numpy(), test_labels.to_numpy())

    # History
    hist_pd.to_csv(report_dir + station_id + '_ann_history.csv', index = False)

    # Training Time
    train_time.to_csv(report_dir + station_id + '_ann_train.csv', index = False)

    # Testing loss (context manager guarantees the file is closed even on error)
    with open(report_dir + station_id + '_ann_loss.txt', "w") as loss_file:
        loss_file.write(str(loss))

    # Save Model
    model.save(model_dir + station_id + '_ann.h5')
    

In [11]:
class bcolors:
    """ANSI escape sequences for styling text printed to the output.

    Prefix a string with one of these codes and append ``ENDC`` to switch
    the styling off again.
    """
    # Reset all styling
    ENDC = '\033[0m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colour codes
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

# Train one model per non-held-out station for each of the 5 folds
# (with 75 stations and 15-station folds that is 60 models per fold)
for fold_index in range(5):
    # Iterate over every station index instead of repeating the station count.
    for id_index in range(len(stations)):
        train_ar = generate_train_more(base_ar,fold_index,id_index)
        # generate_train_more returns a scalar NaN for stations in the held-out
        # fold. Check for that scalar directly rather than scanning the whole
        # training matrix with isnan on every iteration.
        if np.ndim(train_ar) == 0 and np.isnan(train_ar):
            continue
        print(bcolors.OKGREEN + "Start Training " + "fold " + str(fold_index) + " model " + str(id_index) + bcolors.ENDC)
        train_model(train_ar,fold_index,id_index)
    print(bcolors.FAIL + "Finished " + "fold " + str(fold_index) + bcolors.ENDC)

[92mStart Training fold 0 model 15[0m
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 