In [14]:
import pandas as pd
import utm
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from utils.from_latlon import from_latlon
from utils.preprocessing import preprocess
from utils.postprocessing import postprocess_data, location_averaging, calculate_error

seed = 38 # Random seed handed to H2OAutoML (seed=seed) when the models are trained

In [15]:
# Specify data paths (relative to the notebook's working directory).
# Forward slashes are used on purpose: they resolve on Windows, Linux and macOS,
# and they avoid the invalid escape sequences that backslash-separated, non-raw
# string literals contain (a SyntaxWarning on recent Python versions).
train_data = 'Example_data/Output/Train_test_data/Training_Test_Combined_for_final_run.xlsx'
test_data = 'Example_data/Output/Train_test_data/Testing_Tag_GPS_locations.xlsx'
radio_tower_xy_path = 'Example_data/Input/Radio_tower_locations/RTEastNorth_2groups.xlsx'
model_save_path = 'Example_data/Output/Trained_models/MovEcolPaper'

# Variable parameters
freq = '3min' # Frequency of data

# Fixed parameters
routine = 'training'
data_type = ['Simulated BTFS', 'BTFS'] # Simulation or Live BTF, or could do both 'Simulated BTFS', 'BTFS'
dimensions = ['xOffset', 'yOffset'] # One model is trained per dimension (and per tower group)
# predictors = ['ant1_mean', 'ant2_mean', 'ant3_mean', 'ant4_mean', 'ant1_count', 'ant2_count', 'ant3_count', 'ant4_count', 'ant1_std', 'ant2_std', 'ant3_std', 'ant4_std', 'mean_std', 'total_count']

In [16]:
def preprocess_sim_data(sim_data, data_type, freq, tower_locs, routine):
    """Filter, preprocess and tower-centre a table of tag detections.

    Parameters
    ----------
    sim_data : pandas.DataFrame
        Detection records. Must contain a 'Data_type' column; the frame returned
        by `preprocess` must contain 'TowerID', 'POINT_X' and 'POINT_Y'.
    data_type : list
        Values of 'Data_type' to keep.
    freq, routine
        Forwarded unchanged to `preprocess`.
    tower_locs : pandas.DataFrame
        One row per tower with 'TowerID', 'POINT_X', 'POINT_Y' and 'tower_group'.
        POINT_X / POINT_Y are subtracted from the computed easting / northing,
        so they are presumably in the same projected coordinates - TODO confirm.

    Returns
    -------
    tuple
        (preprocessed frame with 'easting', 'northing', 'zone_num', 'zone_letter',
        'xOffset', 'yOffset' and 'tower_group' columns added, predictors as
        returned by `preprocess`).

    Warns
    -----
    UserWarning
        When a row's TowerID has no entry in `tower_locs`. Such rows keep the
        historical fallback (tower origin 0 and tower_group 0), which leaves the
        raw easting / northing in xOffset / yOffset.
    """
    import warnings  # local import so the definition does not rely on another cell

    # Get data
    sim_dat_filt = sim_data[sim_data['Data_type'].isin(data_type)]

    sim_dat_filt, predictors = preprocess(sim_dat_filt, freq, routine)

    # Work on an explicit copy: the columns added below must not be written into
    # (or raise SettingWithCopyWarning about) a slice of another frame.
    sim_dat_filt = sim_dat_filt.copy()

    # Calculate easting and northing from lat long
    sim_dat_filt['easting'], sim_dat_filt['northing'], sim_dat_filt['zone_num'], sim_dat_filt['zone_letter'] = from_latlon(sim_dat_filt['POINT_Y'].values, sim_dat_filt['POINT_X'].values)

    # Create a dictionary of the coordinates of the towers
    offset_dict = tower_locs.set_index('TowerID').to_dict()
    point_x = offset_dict['POINT_X']
    point_y = offset_dict['POINT_Y']
    tower_g = offset_dict['tower_group']

    # Look every detection's tower up once; unknown towers come back as NaN
    tower_x = sim_dat_filt['TowerID'].map(point_x)
    tower_y = sim_dat_filt['TowerID'].map(point_y)
    tower_group = sim_dat_filt['TowerID'].map(tower_g)

    # Make the silent fallback visible instead of letting bad offsets slip through
    unmatched = tower_x.isna() | tower_y.isna() | tower_group.isna()
    if unmatched.any():
        unknown_ids = sorted(sim_dat_filt.loc[unmatched, 'TowerID'].astype(str).unique())
        warnings.warn(
            f"{int(unmatched.sum())} row(s) reference TowerID(s) missing from tower_locs "
            f"{unknown_ids}; using 0 as tower origin and tower_group for them.",
            stacklevel=2,
        )

    # Standardise the coordinates so that the tower location == 0 on both the x and y axes.
    sim_dat_filt['xOffset'] = sim_dat_filt['easting'] - tower_x.fillna(0)
    sim_dat_filt['yOffset'] = sim_dat_filt['northing'] - tower_y.fillna(0)

    # Add the model group
    sim_dat_filt['tower_group'] = tower_group.fillna(0)

    return sim_dat_filt, predictors


In [17]:
def _read_timestamped_excel(source):
    """Return a detections table with its 'DateAndTime' column parsed as datetimes.

    `source` is normally the Excel path set in the configuration cell. This cell
    rebinds `train_data` / `test_data` from that path to the loaded DataFrame, so
    running the cell a second time would pass a DataFrame to pd.read_excel and
    fail. A DataFrame is therefore accepted too and is reused instead of reloaded,
    which keeps the cell safe to re-run.
    """
    frame = source if isinstance(source, pd.DataFrame) else pd.read_excel(source)
    frame['DateAndTime'] = pd.to_datetime(frame['DateAndTime'])
    return frame


# Get training data
train_data = _read_timestamped_excel(train_data)

# Get testing data
test_data = _read_timestamped_excel(test_data)

# Get tower locations
tower_locs = pd.read_excel(radio_tower_xy_path)

In [18]:
# filtered_df_test = train_data_preproc[train_data_preproc['Data_type'] == 'Simulated BTFS']
# filtered_df_test = train_data_preproc[(train_data_preproc['Data_type'] == 'Simulated BTFS') & (train_data_preproc['Tag_type'] == 'Nanotag')]
# filtered_df_test

In [19]:
# Preprocess the training and testing data
train_data_preproc, predictors_train = preprocess_sim_data(train_data, data_type, freq, tower_locs, routine)
test_data_preproc, predictors_test = preprocess_sim_data(test_data, data_type, freq, tower_locs, routine)

# Only 'Nanotag' records are used to fit the models
train_data_preproc = train_data_preproc[train_data_preproc['Tag_type'] == 'Nanotag']

tower_groups = tower_locs['tower_group'].unique()

# Initialise h2o
h2o.init(nthreads = 2)

# The try/finally guarantees the cluster is stopped even when training or
# prediction raises; otherwise a failed run leaves an H2O instance behind that
# the next h2o.init() silently reconnects to.
try:
    # Train, save and test the models for each dimension for each model grouping
    for tower_group in tower_groups:
        training_group = train_data_preproc[train_data_preproc['tower_group'] == tower_group]
        test_mask = test_data_preproc['tower_group'] == tower_group

        # An empty frame cannot be turned into a useful H2OFrame, so skip the group
        if training_group.empty:
            print(f"No training data for tower group {tower_group}; skipping")
            continue

        for dimension in dimensions:
            print(f"Training model for {dimension} in tower group {tower_group}")
            # Train the model
            variables = predictors_train + [dimension]
            training_input = training_group[variables]
            train = h2o.H2OFrame(training_input)
            aml = H2OAutoML(max_models=20, seed=seed, stopping_metric='MAE', sort_metric='MAE')
            aml.train(x=predictors_train, y=dimension, training_frame=train)
            # print(aml.leaderboard)

            # Save the leader model
            h2o.save_model(aml.leader, path = model_save_path, force=True, filename=f'{dimension}_group_{tower_group}_model')

            # Nothing to predict for this group: the model is still saved above
            if not test_mask.any():
                print(f"No test data for tower group {tower_group}; skipping predictions")
                continue

            # Make predictions on the test data
            test_input = test_data_preproc[test_mask]
            test = h2o.H2OFrame(test_input)
            preds = aml.leader.predict(test)

            # Save predictions to a new column in the test dataframe.
            # The first prediction column is flattened to 1-D so it lines up
            # row-for-row with the masked rows of a single target column.
            pred_column_name = f"{dimension}_pred"
            test_data_preproc.loc[test_mask, pred_column_name] = preds.as_data_frame().iloc[:, 0].to_numpy()
finally:
    # Stop h2o
    h2o.cluster().shutdown()


Checking whether there is an H2O instance running at http://localhost:54321.

 connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,39 mins 10 secs
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,7 months and 6 days
H2O_cluster_name:,H2O_from_python_John_osanh2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.353 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,2


Training model for xOffset in tower group 1
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
17:24:03.101: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Training model for yOffset in tower group 1
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
17:37:58.292: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Training model for xOffset

In [20]:
# Post-process the test predictions using the radio tower locations
test_predictions_tower = postprocess_data(test_data_preproc, tower_locs)

# Location averaging, followed by the error calculation on the averaged estimates
test_location_estimates = calculate_error(location_averaging(test_predictions_tower))

# Summarise the 'error_m' column: its mean and the standard error of that mean
location_errors_m = test_location_estimates['error_m']
mean_error = np.mean(location_errors_m)
std_error = stats.sem(location_errors_m)

print(f'Mean error (+/-SE) = {mean_error} (+/- {std_error})')

Mean error (+/-SE) = 244.36771649297617 (+/- 17.213496561634344)


In [21]:
# Write both result tables to Excel workbooks in the current working directory
# (row index omitted from the sheets).
workbooks = {
    "test_tower_predictions.xlsx": test_predictions_tower,
    "UTM_predictions.xlsx": test_location_estimates,
}
for workbook_name, result_frame in workbooks.items():
    result_frame.to_excel(workbook_name, index=False)

In [22]:
# sorted_df = test_location_estimates.sort_values(by='error_m', ascending=False)
# sorted_df

In [23]:
# sorted_df.to_excel("sorted_df.xlsx", index=False)