In [11]:
import pandas as pd
import utm
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import h2o
from h2o.automl import H2OAutoML
from utils.from_latlon import from_latlon
from utils.preprocessing import preprocess
from utils.postprocessing import postprocess_data, location_averaging, calculate_error

seed = 38 # Random seed; passed to H2OAutoML(seed=seed) in the training cell

In [12]:
# Specify data paths.
# Forward slashes are used on purpose: inside a normal string literal, sequences
# such as "\O", "\T", "\I" or "\R" are invalid escapes (they trigger a warning on
# recent Python versions), and forward-slash paths are accepted on Windows as
# well as on Linux/macOS.
train_data = 'Example_data/Output/Train_test_data/Training_Tag_GPS_locations.xlsx'
test_data = 'Example_data/Output/Train_test_data/Testing_Tag_GPS_locations.xlsx'
radio_tower_xy_path = 'Example_data/Input/Radio_tower_locations/RTEastNorth.xlsx'

# Variable parameters
freq = '3min' # Frequency of data; forwarded to preprocess()

# Fixed parameters
routine = 'training' # Forwarded to preprocess()
data_type = ['Simulated BTFS', 'BTFS'] # Simulation or Live BTF, or could do both
dimensions = ['xOffset', 'yOffset'] # One model is trained per dimension
# Columns used as model inputs
predictors = [
    'ant1_mean', 'ant2_mean', 'ant3_mean', 'ant4_mean',
    'ant1_count', 'ant2_count', 'ant3_count', 'ant4_count',
    'ant1_std', 'ant2_std', 'ant3_std', 'ant4_std',
    'mean_std', 'total_count',
]

In [13]:
def preprocess_sim_data(sim_data, data_type, freq, tower_locs, routine):
    """Filter, preprocess and tower-centre a table of tag locations.

    Rows of `sim_data` whose 'Data_type' is listed in `data_type` are handed to
    `preprocess` together with `freq` and `routine`. The frame that comes back
    gains the columns 'easting', 'northing', 'zone_num' and 'zone_letter'
    (from `from_latlon` applied to 'POINT_Y' / 'POINT_X') and then 'xOffset' /
    'yOffset': the easting / northing minus the 'POINT_X' / 'POINT_Y' value
    that `tower_locs` lists for the row's 'TowerID'.

    Returns the preprocessed frame with those six columns added.
    """
    # Keep only the requested data types, then run the shared preprocessing
    wanted_rows = sim_data['Data_type'].isin(data_type)
    located = preprocess(sim_data[wanted_rows], freq, routine)

    # Lat/long -> easting/northing; note from_latlon receives (POINT_Y, POINT_X)
    utm_parts = from_latlon(located['POINT_Y'].values, located['POINT_X'].values)
    located['easting'], located['northing'], located['zone_num'], located['zone_letter'] = utm_parts

    # {TowerID: coordinate} lookups for the tower positions
    tower_coords = tower_locs.set_index('TowerID').to_dict()
    tower_x, tower_y = (tower_coords[axis] for axis in ('POINT_X', 'POINT_Y'))

    # Shift coordinates so each row's tower sits at (0, 0). A TowerID missing from
    # tower_locs maps to NaN, which fillna(0) turns into "no shift" for that row.
    tower_ids = located['TowerID']
    located['xOffset'] = located['easting'] - tower_ids.map(tower_x).fillna(0)
    located['yOffset'] = located['northing'] - tower_ids.map(tower_y).fillna(0)

    return located


In [14]:
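# Convert location predictions back to eastings/northings.
# NOTE: reference copy only; the notebook calls postprocess_data imported from utils.postprocessing.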

# def postprocess_data(prediction_data, tower_locs):
#     # Create a dictionary of the coordinates of the towers
#     offset_dict = tower_locs.set_index('TowerID').to_dict()
#     point_x = offset_dict['POINT_X']
#     point_y = offset_dict['POINT_Y']

#     # Change predicted x/y offset values to their respective easting/northing considering the location of the tower
#     prediction_data['easting_pred'] = prediction_data['xOffset_pred'] + prediction_data['TowerID'].map(point_x).fillna(0)
#     prediction_data['northing_pred'] = prediction_data['yOffset_pred'] + prediction_data['TowerID'].map(point_y).fillna(0)
    
#     return prediction_data

In [15]:
# `train_data` / `test_data` hold the file paths when this cell is first reached,
# and this cell rebinds the same names to DataFrames. Stash the paths first so
# that re-running this cell on its own re-reads the files instead of passing a
# DataFrame to pd.read_excel.
if not isinstance(train_data, pd.DataFrame):
    train_data_path = train_data
if not isinstance(test_data, pd.DataFrame):
    test_data_path = test_data

# Get training data
train_data = pd.read_excel(train_data_path)
train_data['DateAndTime'] = pd.to_datetime(train_data['DateAndTime'])

# Get testing data
test_data = pd.read_excel(test_data_path)
test_data['DateAndTime'] = pd.to_datetime(test_data['DateAndTime'])

# Get tower locations
tower_locs = pd.read_excel(radio_tower_xy_path)

In [16]:
# Preprocess the training and testing data
train_data_preproc = preprocess_sim_data(train_data, data_type, freq, tower_locs, routine)
test_data_preproc = preprocess_sim_data(test_data, data_type, freq, tower_locs, routine)

# Initialise h2o
h2o.init()

try:
    # The test predictors are the same for every dimension, so upload them once.
    # Only the predictor columns are sent: the models are trained on `predictors`
    # alone, and this keeps earlier *_pred columns out of later uploads.
    test = h2o.H2OFrame(test_data_preproc[predictors])

    # Train, save and test the models for each dimension
    for dimension in dimensions:
        print(f"Training model for {dimension}")

        # Train the model on the predictors plus the current target column
        variables = predictors + [dimension]
        training_input = train_data_preproc[variables]
        train = h2o.H2OFrame(training_input)
        aml = H2OAutoML(max_models=20, seed=seed, stopping_metric='MAE', sort_metric='MAE')
        aml.train(x=predictors, y=dimension, training_frame=train)
        print(aml.leaderboard)

        # Save the leader model. Forward slashes avoid the invalid "\O", "\T" and
        # "\{" escapes of the backslash form and also work on non-Windows hosts.
        h2o.save_model(aml.leader, path=f"Example_data/Output/Trained_models/{dimension}", force=True)

        # Make predictions on the test data
        preds = aml.leader.predict(test)

        # Save predictions to a new column in the test dataframe. Assign by position
        # (plain array) rather than as a DataFrame, so the values cannot be
        # misaligned or turned into NaN if test_data_preproc's index is not 0..n-1.
        pred_column_name = f"{dimension}_pred"
        test_data_preproc[pred_column_name] = preds.as_data_frame().iloc[:, 0].to_numpy()
finally:
    # Stop h2o, even when training or prediction raised
    h2o.cluster().shutdown()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.321-b07, mixed mode)
  Starting server from C:\Users\John\Documents\GitHub\ml4rt\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\John\AppData\Local\Temp\tmpc2h5xupa
  JVM stdout: C:\Users\John\AppData\Local\Temp\tmpc2h5xupa\h2o_John_started_from_python.out
  JVM stderr: C:\Users\John\AppData\Local\Temp\tmpc2h5xupa\h2o_John_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Australia/Brisbane
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,H2O_from_python_John_51ul98
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.467 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Training model for xOffset
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
21:11:55.562: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                     mae     rmse     mse    rmsle    mean_residual_deviance
GBM_2_AutoML_1_20230625_211155                           263.044  411.992  169737      nan                    169737
GBM_3_AutoML_1_20230625_211155                           263.345  413.486  170971      nan                    170971
StackedEnsemble_BestOfFamily_1_AutoML_1_20230625_211155  263.867  411.639  169447      nan                    169447
StackedEnsemble_AllModels_1_AutoML_1_20230625_211155     264.086  409.853  167980      nan                    167980
GBM_grid_1_AutoML_1_20230625_211155_model_13             264.56   414.451  171769      nan                    171769
GBM_4_AutoML_1_202

In [17]:
# Run postprocess_data (imported from utils.postprocessing) on the test frame that now
# carries the xOffset_pred / yOffset_pred columns, then display the result.
# NOTE(review): judging by the displayed output this adds easting_pred / northing_pred;
# whether it also modifies test_data_preproc in place is not visible here - confirm.
test_predictions = postprocess_data(test_data_preproc, tower_locs)
test_predictions

Unnamed: 0,DateTime,TowerID,TagID,Data_type,POINT_X,POINT_Y,ant1_count,ant2_count,ant3_count,ant4_count,...,northing,zone_num,zone_letter,xOffset,yOffset,xOffset_pred,yOffset_pred,easting_pred,northing_pred,zone_number
0,2021-02-02 07:53:00,RT01,60,BTFS,146.256427,-21.919968,0.0,2.0,0.0,1.0,...,7.575845e+06,55,k,-135.520989,-73.105345,-143.803172,-11.289872,423194.928012,7.575907e+06,55
1,2021-02-02 07:53:00,RT04,60,BTFS,146.256427,-21.919968,1.0,0.0,0.0,0.0,...,7.575845e+06,55,k,257.092334,171.458690,37.687866,317.659869,422983.805727,7.575992e+06,55
2,2021-02-02 07:54:00,RT01,60,BTFS,146.256427,-21.919968,0.0,1.0,0.0,1.0,...,7.575845e+06,55,k,-135.520989,-73.105345,-123.018943,-43.874106,423215.712241,7.575875e+06,55
3,2021-02-02 07:54:00,RT04,60,BTFS,146.256427,-21.919968,1.0,0.0,2.0,0.0,...,7.575845e+06,55,k,257.092334,171.458690,13.854895,-63.330024,422959.972756,7.575611e+06,55
4,2021-02-02 07:55:00,RT01,60,BTFS,146.256427,-21.919968,0.0,1.0,0.0,1.0,...,7.575845e+06,55,k,-135.520989,-73.105345,-148.689038,-43.506954,423190.042146,7.575875e+06,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,2023-05-23 10:51:00,RT18,192,BTFS,146.237257,-21.938164,4.0,5.0,5.0,4.0,...,7.573822e+06,55,k,103.510404,-165.520759,32.910588,-77.470642,421162.648405,7.573910e+06,55
1171,2023-05-23 10:52:00,RT18,192,BTFS,146.237257,-21.938164,5.0,5.0,4.0,5.0,...,7.573822e+06,55,k,103.510404,-165.520759,37.565748,-33.611385,421167.303565,7.573953e+06,55
1172,2023-05-23 10:53:00,RT18,192,BTFS,146.237257,-21.938164,2.0,2.0,2.0,1.0,...,7.573822e+06,55,k,103.510404,-165.520759,42.984970,35.044649,421172.722787,7.574022e+06,55
1173,2023-05-23 10:54:00,RT18,192,BTFS,146.237257,-21.938164,4.0,5.0,4.0,0.0,...,7.573822e+06,55,k,103.510404,-165.520759,57.948284,-15.857102,421187.686101,7.573971e+06,55


In [18]:
# Turn the predicted offsets back into positions using the radio tower locations
test_predictions_tower = postprocess_data(test_data_preproc, tower_locs)

# Average the predictions into location estimates and attach the error columns
test_location_estimates = calculate_error(location_averaging(test_predictions_tower))

# Summarise the 'error_m' column: its mean and the standard error of that mean
location_errors = test_location_estimates['error_m']
mean_error, std_error = np.mean(location_errors), stats.sem(location_errors)

print(f'Mean error (+/-SE) = {mean_error} (+/- {std_error})')

Mean error (+/-SE) = 294.7075618167527 (+/- 11.858097892054985)


In [21]:
# Write the post-processed predictions and the averaged location estimates to Excel,
# leaving out the DataFrame index
for frame, workbook in (
    (test_predictions_tower, "test_predictions.xlsx"),
    (test_location_estimates, "UTM_predictions.xlsx"),
):
    frame.to_excel(workbook, index=False)

In [23]:
# Rank the location estimates from largest to smallest error and display them
sorted_df = test_location_estimates.sort_values('error_m', ascending=False)
sorted_df

Unnamed: 0,DateTime,TagID,easting,northing,easting_pred,northing_pred,Tower_count,Data_type,Signal_count,xOffset,yOffset,easting_error,northing_error,error_m,mean_distance_from_tower
252,2022-10-09 14:12:00,157,421946.516339,7.576117e+06,424356.642064,7.576144e+06,1,BTFS,1.0,-1392.214845,198.378076,2410.125725,27.042635,2410.277435,1406.277368
254,2022-10-09 14:14:00,157,421946.516339,7.576117e+06,424022.720149,7.576095e+06,1,BTFS,1.0,-1392.214845,198.378076,2076.203811,-21.416910,2076.314270,1406.277368
220,2021-04-22 14:56:00,8,430892.884204,7.572816e+06,432327.343273,7.573131e+06,1,Simulated BTFS,1.0,-648.257158,-197.396337,1434.459069,315.546664,1468.755431,677.644934
410,2023-05-11 11:05:00,204,425494.477409,7.576692e+06,424210.826038,7.576486e+06,3,BTFS,14.0,1510.889473,144.463018,-1283.651371,-205.723084,1300.031857,1517.780144
232,2021-04-23 15:11:00,8,422663.351956,7.575777e+06,423843.412139,7.575947e+06,2,Simulated BTFS,2.0,-479.072567,-19.532713,1180.060183,170.805579,1192.357573,479.470594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,2022-10-16 11:17:00,176,427338.872211,7.565474e+06,427359.043227,7.565479e+06,4,BTFS,44.0,-128.672836,45.513985,20.171017,4.675025,20.705694,136.485243
499,2023-05-18 14:31:00,208,421127.471479,7.573681e+06,421130.245404,7.573701e+06,1,BTFS,10.0,-2.266338,-305.965764,2.773924,20.044234,20.235266,305.974158
64,2021-04-16 10:28:00,70,431482.935289,7.573009e+06,431498.698552,7.573004e+06,1,Simulated BTFS,5.0,-58.206073,-4.439718,15.763263,-4.719370,16.454571,58.375149
169,2021-04-18 11:49:00,8,422948.423915,7.575798e+06,422932.600908,7.575794e+06,3,Simulated BTFS,8.0,-128.251147,-123.541672,-15.823008,-3.534011,16.212859,178.075550


In [24]:
# Save the error-ranked location estimates to Excel, without the index column
sorted_df.to_excel("sorted_df.xlsx", index=False)