In [38]:
import pandas as pd
import utm
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML

seed = 38 # Random seed; passed to H2OAutoML(seed=...) in the model-training cell

In [39]:
# Specify data paths
# Raw strings (r'...') keep the Windows backslashes literal. Without the prefix, sequences such as
# '\O' or '\T' are invalid escape sequences, which newer Python versions warn about.
# NOTE: `train_data` and `test_data` hold path strings here and are rebound to the loaded
# DataFrames by the data-loading cell further down.
train_data = r'Example_data\Output\Train_test_data\Training_Tag_GPS_locations.xlsx'
test_data = r'Example_data\Output\Train_test_data\Testing_Tag_GPS_locations.xlsx'
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
radio_tower_xy_path = r'H:\My Drive\Colab Notebooks\RadioTelemetry\Tower_data\RTEastNorth.xlsx'

# Variable parameters
freq = '5min' # Frequency of data (pandas offset alias used to floor the detection timestamps)

# Fixed parameters
data_type = ['Simulated BTFS', 'BTFS'] # Simulation or Live BTF, or could do both
dimensions = ['xOffset', 'yOffset'] # Target columns; one model is trained per dimension
# Feature columns produced by preprocess_sim_data and fed to the models
predictors = ['ant1_mean', 'ant2_mean', 'ant3_mean', 'ant4_mean', 'ant1_count', 'ant2_count', 'ant3_count', 'ant4_count', 'ant1_std', 'ant2_std', 'ant3_std', 'ant4_std', 'mean_std', 'total_count']

In [40]:
#Function to convert geographic to projected coordinates
def from_latlon(lat, lon):
    """Convert latitude/longitude to UTM via ``utm.from_latlon``.

    Returns the four values produced by ``utm.from_latlon`` as a tuple:
    ``(easting, northing, zone_number, zone_letter)``.
    """
    utm_result = utm.from_latlon(lat, lon)
    # Unpack explicitly so that anything other than a 4-item result fails here.
    east, north, zone_number, zone_char = utm_result
    return east, north, zone_number, zone_char

In [41]:
def preprocess_sim_data(sim_data, data_type, freq, tower_locs):
    """Aggregate raw detections into one feature row per time bin, tower and tag.

    Processing steps:
      1. Keep the rows whose 'Data_type' is listed in ``data_type``.
      2. Floor 'DateAndTime' to ``freq`` (stored in a new 'DateTime' column).
      3. Per DateTime / TowerID / TagID / Antenna, compute the mean, count and
         standard deviation of 'Power', then pivot so that each antenna gets its
         own ``ant<N>_mean`` / ``ant<N>_count`` / ``ant<N>_std`` columns.
      4. Add 'mean_std' and 'total_count', replace missing values with 0.
      5. Convert POINT_Y / POINT_X (passed as latitude / longitude) to UTM and
         express the position as an offset from the detecting tower.

    Parameters
    ----------
    sim_data : pandas.DataFrame
        Needs the columns 'Data_type', 'DateAndTime' (datetime-like), 'TowerID',
        'TagID', 'Antenna', 'POINT_X', 'POINT_Y' and 'Power'. Antenna labels are
        expected to render as 1-4, since only 'mean1'..'std4' are renamed.
    data_type : list
        Values of 'Data_type' to retain.
    freq : str
        Frequency string handed to ``Series.dt.floor``.
    tower_locs : pandas.DataFrame
        Needs the columns 'TowerID', 'POINT_X' and 'POINT_Y'.
        NOTE(review): these are subtracted directly from the UTM easting/northing,
        so they are presumably projected coordinates in the same zone - confirm.

    Returns
    -------
    pandas.DataFrame
        One row per DateTime / TowerID / TagID with the antenna features,
        'easting', 'northing', 'zone_num', 'zone_letter', 'xOffset' and 'yOffset'.

    Raises
    ------
    KeyError
        If the pivot produced none of the expected antenna feature columns
        (for example no rows survived the filter, or unexpected antenna labels).
    """
    antenna_ids = (1, 2, 3, 4)
    stat_names = ('mean', 'count', 'std')
    row_keys = ['DateTime', 'TowerID', 'TagID', 'Data_type', 'POINT_X', 'POINT_Y']

    # Get data
    sim_dat_filt = sim_data[sim_data['Data_type'].isin(data_type)]

    # make column with the datetime to nearest 'freq' value (e.g. 5min)
    sim_dat_filt = sim_dat_filt.assign(DateTime=sim_dat_filt['DateAndTime'].dt.floor(freq=freq))

    # group by datetime, tag, tower and antenna, compute mean power and std power, pivot to antennas.
    # The string 'std' is used instead of np.std: pandas already translated np.std to its own
    # groupby std here, but passing the callable is deprecated and would change meaning in future.
    sim_dat_filt = (
        sim_dat_filt.groupby(['DateTime', 'TowerID', 'TagID', 'Antenna', 'Data_type', 'POINT_X', 'POINT_Y'])['Power']
        .agg(['mean', 'count', 'std'])
        .reset_index()
        .pivot_table(index=row_keys, columns='Antenna', values=['mean', 'count', 'std'])
        .reset_index()
    )

    # Flatten the (statistic, antenna) column MultiIndex, e.g. ('mean', 1) -> 'mean1',
    # then rename to the 'ant<N>_<statistic>' convention used by the predictors list.
    sim_dat_filt.columns = [f"{col[0]}{col[1]}" if col[1] != "" else col[0] for col in sim_dat_filt.columns.values]
    rename_map = {f'{stat}{ant}': f'ant{ant}_{stat}' for stat in stat_names for ant in antenna_ids}
    sim_dat_filt = sim_dat_filt.rename(columns=rename_map)

    # An antenna that never detected a tag, or a statistic that is NaN everywhere (pivot_table
    # drops all-NaN columns), leaves feature columns missing. Add them as NaN so they are
    # zero-filled below like any other gap. If *none* of the columns exist the input is unusable.
    expected_cols = [f'ant{ant}_{stat}' for stat in stat_names for ant in antenna_ids]
    missing_cols = [col for col in expected_cols if col not in sim_dat_filt.columns]
    if len(missing_cols) == len(expected_cols):
        raise KeyError(f"No antenna feature columns were produced; expected some of {expected_cols}")
    for col in missing_cols:
        sim_dat_filt[col] = np.nan

    # Calculate the mean std and total count across the antennas
    # (done before the zero fill so that missing antennas do not drag the mean std down)
    sim_dat_filt['mean_std'] = sim_dat_filt[['ant1_std', 'ant2_std', 'ant3_std', 'ant4_std']].mean(axis=1)
    sim_dat_filt['total_count'] = sim_dat_filt[['ant1_count', 'ant2_count', 'ant3_count', 'ant4_count']].sum(axis=1)

    # Fill missing values with 0
    sim_dat_filt = sim_dat_filt.fillna(value=0)

    # Calculate easting and northing from lat long (POINT_Y is passed as latitude, POINT_X as longitude)
    sim_dat_filt['easting'], sim_dat_filt['northing'], sim_dat_filt['zone_num'], sim_dat_filt['zone_letter'] = from_latlon(sim_dat_filt['POINT_Y'].values, sim_dat_filt['POINT_X'].values)

    # Create a dictionary of the coordinates of the towers
    offset_dict = tower_locs.set_index('TowerID').to_dict()
    point_x = offset_dict['POINT_X']
    point_y = offset_dict['POINT_Y']

    # Standardise the coordinates so that the tower location == 0 on both the x and y axes.
    # NOTE(review): a TowerID missing from tower_locs maps to 0, so its offset equals the raw
    # easting/northing rather than raising - confirm this fallback is intended.
    sim_dat_filt['xOffset'] = sim_dat_filt['easting'] - sim_dat_filt['TowerID'].map(point_x).fillna(0)
    sim_dat_filt['yOffset'] = sim_dat_filt['northing'] - sim_dat_filt['TowerID'].map(point_y).fillna(0)

    return sim_dat_filt

In [42]:
#Convert locations predictions back to easting northings

def postprocess_data(prediction_data, tower_locs):
    """Turn predicted tower-relative offsets back into eastings/northings.

    Adds 'easting_pred' and 'northing_pred' to ``prediction_data`` (the frame is
    modified in place and also returned) by adding each row's tower coordinate
    to 'xOffset_pred' / 'yOffset_pred'. Rows whose TowerID is not present in
    ``tower_locs`` get a tower coordinate of 0.
    """
    # Column -> {TowerID: coordinate} lookup tables
    tower_coords = tower_locs.set_index('TowerID').to_dict()
    tower_x = tower_coords['POINT_X']
    tower_y = tower_coords['POINT_Y']

    # (output column, predicted offset column, tower coordinate lookup)
    axis_specs = (
        ('easting_pred', 'xOffset_pred', tower_x),
        ('northing_pred', 'yOffset_pred', tower_y),
    )
    for out_col, offset_col, tower_lookup in axis_specs:
        tower_origin = prediction_data['TowerID'].map(tower_lookup).fillna(0)
        prediction_data[out_col] = prediction_data[offset_col] + tower_origin

    return prediction_data

In [43]:
# Get training data
# `train_data` / `test_data` arrive here as path strings and are rebound to the loaded DataFrames.
# The isinstance guards make this cell safe to re-run: a second run would otherwise hand the
# already-loaded DataFrame to pd.read_excel and fail.
if not isinstance(train_data, pd.DataFrame):
    train_data = pd.read_excel(train_data)
train_data['DateAndTime'] = pd.to_datetime(train_data['DateAndTime'])

# Get testing data
if not isinstance(test_data, pd.DataFrame):
    test_data = pd.read_excel(test_data)
test_data['DateAndTime'] = pd.to_datetime(test_data['DateAndTime'])

# Get tower locations
tower_locs = pd.read_excel(radio_tower_xy_path)

In [44]:
# Preprocess the training and testing data
train_data_preproc = preprocess_sim_data(train_data, data_type, freq, tower_locs)
test_data_preproc = preprocess_sim_data(test_data, data_type, freq, tower_locs)

# Initialise h2o
h2o.init()

try:
    # The test predictors are identical for every dimension, so upload them once. Only the
    # predictor columns are sent: the models use nothing else, and this keeps prediction
    # columns added by earlier iterations out of the uploaded frame.
    test = h2o.H2OFrame(test_data_preproc[predictors])

    # Train, save and test the models for each dimension
    for dimension in dimensions:
        print(f"Training model for {dimension}")
        # Train the model
        variables = predictors + [dimension]
        training_input = train_data_preproc[variables]
        train = h2o.H2OFrame(training_input)
        aml = H2OAutoML(max_models=20, seed=seed, stopping_metric='MAE', sort_metric='MAE')
        aml.train(x=predictors, y=dimension, training_frame=train)
        print(aml.leaderboard)

        # Save the leader model (raw f-string: backslashes stay literal, {dimension} is still interpolated)
        h2o.save_model(aml.leader, path = rf"Example_data\Output\Trained_models\{dimension}", force=True)

        # Make predictions on the test data
        preds = aml.leader.predict(test)

        # Save predictions to a new column in the test dataframe. The values are assigned
        # positionally (row i of the predictions belongs to row i of the test data) rather than
        # relying on pandas index alignment between two frames.
        pred_column_name = f"{dimension}_pred"
        test_data_preproc[pred_column_name] = preds.as_data_frame()['predict'].to_numpy()
finally:
    # Stop h2o, even if training or prediction failed, so no orphaned local server is left running
    h2o.cluster().shutdown()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Zulu17.36+17-CA (build 17.0.4.1+1-LTS, mixed mode, sharing)
  Starting server from C:\Users\s5236256\Documents\GitHub\ml4rt\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\s5236256\AppData\Local\Temp\tmppciuhgu0
  JVM stdout: C:\Users\s5236256\AppData\Local\Temp\tmppciuhgu0\h2o_s5236256_started_from_python.out
  JVM stderr: C:\Users\s5236256\AppData\Local\Temp\tmppciuhgu0\h2o_s5236256_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Australia/Brisbane
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 5 days
H2O_cluster_name:,H2O_from_python_s5236256_m0wmqo
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.922 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Training model for xOffset
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
16:30:31.765: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                     mae     rmse     mse    rmsle    mean_residual_deviance
DeepLearning_grid_3_AutoML_1_20230602_163031_model_1     226.713  359.566  129288      nan                    129288
GBM_3_AutoML_1_20230602_163031                           228.134  356.116  126818      nan                    126818
GBM_grid_1_AutoML_1_20230602_163031_model_4              228.28   355.354  126276      nan                    126276
GBM_2_AutoML_1_20230602_163031                           228.387  355.532  126403      nan                    126403
GBM_4_AutoML_1_20230602_163031                           228.787  357.684  127938      nan                    127938
DeepLearning_grid_

In [60]:
# Calculate the accuracy of the predictions

# Convert the predicted offsets back to eastings/northings
test_predictions = postprocess_data(test_data_preproc, tower_locs)

# Per-row (one tower, one tag, one time bin) helper columns. `assign` returns a new frame, so
# `test_predictions` itself keeps its original columns.
antenna_count_cols = ['ant1_count', 'ant2_count', 'ant3_count', 'ant4_count']
per_tower = test_predictions.assign(
    # Pulses detected across the four antennas of this tower
    pulse_count=test_predictions[antenna_count_cols].sum(axis=1),
    # Euclidean distance between the actual location and this tower
    distance_from_tower=np.sqrt(test_predictions['xOffset'] ** 2 + test_predictions['yOffset'] ** 2),
)

# Combine the per-tower rows into one location estimate per time bin and tag
utm_predictions = (
    per_tower.groupby(['DateTime', 'TagID'], as_index=False)
    .agg(
        easting=('easting', 'first'),
        northing=('northing', 'first'),
        easting_pred=('easting_pred', 'mean'),
        northing_pred=('northing_pred', 'mean'),
        Tower_count=('TowerID', 'nunique'),  # Number of unique towers that detected the tag
        Data_type=('Data_type', 'first'),  # First value of Data_type
        xOffset=('xOffset', 'mean'),  # Average xOffset
        yOffset=('yOffset', 'mean'),  # Average yOffset
        pulse_count=('pulse_count', 'sum'),  # Total pulses over all antennas and towers
        # Mean of the per-tower distances. Taking the norm of the averaged offsets instead would
        # give the distance to the towers' centroid, which understates the distance whenever the
        # detecting towers lie on different sides of the tag.
        mean_distance_from_tower=('distance_from_tower', 'mean'),
    )
)

# Calculate the error of location predictions
utm_predictions['easting_error'] = utm_predictions['easting_pred'] - utm_predictions['easting']
utm_predictions['northing_error'] = utm_predictions['northing_pred'] - utm_predictions['northing']

# Calculate the Euclidean distance between the predicted and actual locations
utm_predictions['error_m'] = np.sqrt((utm_predictions['easting_error']) ** 2
                    + (utm_predictions['northing_error']) ** 2)

# Keep 'mean_distance_from_tower' as the last column of the table
utm_predictions['mean_distance_from_tower'] = utm_predictions.pop('mean_distance_from_tower')

# Calculate the mean of the location error ('error_m') and its standard error
mean_error = np.mean(utm_predictions['error_m'])
std_error = stats.sem(utm_predictions['error_m'])

print(f'Mean error (+/-SE) = {mean_error} (+/- {std_error})')

Mean error (+/-SE) = 224.17221415428668 (+/- 16.480864468271726)


In [59]:
# Write the per-tower predictions and the per-tag location estimates to Excel workbooks in the
# current working directory; the DataFrame index is not written.
excel_options = {"index": False}
test_predictions.to_excel("test_predictions.xlsx", **excel_options)
utm_predictions.to_excel("UTM_predictions.xlsx", **excel_options)