In [9]:
import pandas as pd
import utm
import os
import numpy as np
from scipy import stats
import h2o
from h2o.automl import H2OAutoML
import tkinter as tk
from tkinter import filedialog
from tkinter.simpledialog import askstring
from utils.from_latlon import from_latlon
from utils.preprocessing import preprocess
from utils.postprocessing import postprocess_data, location_averaging, calculate_error

# Seed passed as `seed=` to H2OAutoML in the training loop further down
seed = 38 # value was randomly generated for the ML process

In [10]:
# Collect the input workbooks, the output folder and the temporal resolution from the user

# A Tk root has to exist to own the dialogs; withdraw() hides its empty window
root = tk.Tk()
root.withdraw()


def _ask_for_workbook(dialog_title):
    """Show a file picker restricted to .xlsx files and return the selection."""
    return filedialog.askopenfilename(
        title=dialog_title,
        filetypes=[("Excel files", "*.xlsx")],
    )


# Training data, testing data and radio tower XY workbooks, asked for in that order
train_data = _ask_for_workbook("Select training data")
test_data = _ask_for_workbook("Select testing data")
radio_tower_xy_path = _ask_for_workbook("Select radio tower location data")

# Folder the trained models are written to
model_save_path = filedialog.askdirectory(title="Select model save path")

# Function to get minutes from user
def get_minutes():
    """Prompt until the user enters a whole number of minutes, then return it.

    Cancelling the dialog, leaving it empty, or typing anything other than the
    digits 0-9 shows an error box and asks again.

    Returns:
        str: The text the user entered. It stays a string because the caller
        appends 'min' to it to build the frequency.
    """
    # Imported locally: the notebook's import cell only brings in `filedialog`
    # and `askstring`, so the error branch below used to raise NameError.
    from tkinter import messagebox

    while True:
        minutes = askstring("Time (in minutes) to compile location data (t)", "Enter time period (t) in minutes (must be an integer):")
        # isdigit() on its own also accepts characters such as '²'; requiring
        # ASCII limits the accepted input to plain 0-9.
        if minutes and minutes.isascii() and minutes.isdigit():
            return minutes
        messagebox.showerror("Error", "Invalid input. Please enter a number.")

# Prompt the user and get the validated input
try:
    minutes = get_minutes()
finally:
    # This is the last dialog in the cell: destroy the hidden Tk root so it
    # does not stay alive in the kernel, even if the prompt raised.
    root.destroy()

# Append the input number to 'min'
freq = minutes + 'min'

# Print freq to verify (optional)
print("Frequency:", freq)

Frequency: 3min


In [11]:
# Specify data paths (relative paths, resolved against the current working directory)
# NOTE(review): the dialog cell above assigns these same names (and `freq`), so
# running this cell replaces whatever the user picked there. Presumably only one
# of the two cells is meant to be run - confirm.
train_data = 'SBTF_data/input_data/train_test_data/train_data.xlsx'
test_data = 'SBTF_data/input_data/train_test_data/test_data.xlsx'
radio_tower_xy_path = 'SBTF_data/input_data/radio_tower_locations/RTEastNorth_1group.xlsx'
model_save_path = 'SBTF_data/ml4rt_output/trained_models'

# Variable parameters
freq = '3min' # Frequency of data; forwarded to preprocess_sim_data

# Fixed parameters
routine = 'training'  # forwarded to preprocess_sim_data for both the train and the test frame
dimensions = ['xOffset', 'yOffset']  # target columns; one model is trained per entry and tower group

In [12]:
def preprocess_sim_data(sim_data, freq, tower_locs, routine):
    """Run the shared preprocessing step and add tower-relative offset columns.

    Args:
        sim_data: Input data, handed unchanged to `preprocess`.
        freq: Frequency argument forwarded to `preprocess`.
        tower_locs: DataFrame with `TowerID`, `POINT_X`, `POINT_Y` and
            `tower_group` columns.
        routine: Routine argument forwarded to `preprocess`.

    Returns:
        tuple: The frame and predictor list produced by `preprocess`; the
        frame gains `easting`, `northing`, `zone_num`, `zone_letter`,
        `xOffset`, `yOffset` and `tower_group` columns.
    """
    sim_dat_filt, predictors = preprocess(sim_data, freq, routine)

    # Convert lat/long to easting/northing; POINT_Y is passed first, POINT_X second
    easting, northing, zone_num, zone_letter = from_latlon(
        sim_dat_filt['POINT_Y'].values,
        sim_dat_filt['POINT_X'].values,
    )
    sim_dat_filt['easting'] = easting
    sim_dat_filt['northing'] = northing
    sim_dat_filt['zone_num'] = zone_num
    sim_dat_filt['zone_letter'] = zone_letter

    # One {TowerID: value} mapping per tower_locs column
    tower_lookup = tower_locs.set_index('TowerID').to_dict()
    x_by_tower = tower_lookup['POINT_X']
    y_by_tower = tower_lookup['POINT_Y']
    group_by_tower = tower_lookup['tower_group']

    # Shift coordinates so each tower sits at (0, 0). A TowerID missing from
    # tower_locs maps to NaN and is filled with 0, leaving the coordinate as is.
    tower_x = sim_dat_filt['TowerID'].map(x_by_tower).fillna(0)
    sim_dat_filt['xOffset'] = sim_dat_filt['easting'] - tower_x
    tower_y = sim_dat_filt['TowerID'].map(y_by_tower).fillna(0)
    sim_dat_filt['yOffset'] = sim_dat_filt['northing'] - tower_y

    # Model group of each row's tower (0 when the tower is unknown)
    sim_dat_filt['tower_group'] = sim_dat_filt['TowerID'].map(group_by_tower).fillna(0)

    return sim_dat_filt, predictors


In [13]:
def _load_sim_frame(source):
    """Return simulation data as a DataFrame with `DateAndTime` parsed to datetimes.

    Args:
        source: Path of an Excel workbook, or a DataFrame left over from an
            earlier run of this cell.

    Returns:
        pandas.DataFrame: The loaded (or passed-in) frame, with its
        `DateAndTime` column converted by `pd.to_datetime`.
    """
    # `train_data` / `test_data` start as paths and are rebound to DataFrames
    # below; accepting both means re-running the cell no longer fails in
    # pd.read_excel.
    frame = source if isinstance(source, pd.DataFrame) else pd.read_excel(source)
    frame['DateAndTime'] = pd.to_datetime(frame['DateAndTime'])
    return frame


# Get training data
train_data = _load_sim_frame(train_data)

# Get testing data
test_data = _load_sim_frame(test_data)

# Get tower locations
tower_locs = pd.read_excel(radio_tower_xy_path)

In [14]:
# Preprocess the training and testing data
train_data_preproc, predictors_train = preprocess_sim_data(train_data, freq, tower_locs, routine)
test_data_preproc, predictors_test = preprocess_sim_data(test_data, freq, tower_locs, routine)

tower_groups = tower_locs['tower_group'].unique()

# Initialise h2o
h2o.init(nthreads = 2)

# Train, save and test the models for each dimension for each model grouping.
# The try/finally makes sure the cluster is shut down even when training raises
# or the job is cancelled part-way through.
try:
    for tower_group in tower_groups:
        train_rows = train_data_preproc[train_data_preproc['tower_group'] == tower_group]
        # Row mask reused for selecting the test rows and for writing predictions back
        test_mask = test_data_preproc['tower_group'] == tower_group

        if train_rows.empty:
            # Nothing to fit for this group; skipping avoids building an empty H2OFrame
            print(f"No training rows for tower group {tower_group}; skipping")
            continue

        for dimension in dimensions:
            print(f"Training model for {dimension} in tower group {tower_group}")
            # Train the model
            variables = predictors_train + [dimension]
            training_input = train_rows[variables]
            train = h2o.H2OFrame(training_input)
            aml = H2OAutoML(max_models=20, seed=seed, stopping_metric='MAE', sort_metric='MAE')
            aml.train(x=predictors_train, y=dimension, training_frame=train)

            # Save the leader model
            h2o.save_model(aml.leader, path = model_save_path, force=True, filename=f'{dimension}_group_{tower_group}_model')

            if not test_mask.any():
                # The model is still saved above; there is simply nothing to predict on
                print(f"No test rows for tower group {tower_group}; skipping predictions")
                continue

            # Make predictions on the test data
            test_input = test_data_preproc[test_mask]
            test = h2o.H2OFrame(test_input)
            preds = aml.leader.predict(test)

            # Save predictions to a new column in the test dataframe.
            # as_data_frame().values has shape (n, 1); ravel() gives the
            # 1-D array that a single-column assignment expects.
            pred_column_name = f"{dimension}_pred"
            test_data_preproc.loc[test_mask, pred_column_name] = preds.as_data_frame().values.ravel()
finally:
    # Stop h2o
    h2o.cluster().shutdown()


Checking whether there is an H2O instance running at http://localhost:54321.

  .agg(['mean', 'count', np.std])
  .agg(['mean', 'count', np.std])


.... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 17.0.10+11-LTS-240, mixed mode, sharing)
  Starting server from C:\Users\JohnvanOsta\Documents\GitHub\ml4rt\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\JOHNVA~1\AppData\Local\Temp\tmpzgezexy2
  JVM stdout: C:\Users\JOHNVA~1\AppData\Local\Temp\tmpzgezexy2\h2o_JohnvanOsta_started_from_python.out
  JVM stderr: C:\Users\JOHNVA~1\AppData\Local\Temp\tmpzgezexy2\h2o_JohnvanOsta_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Australia/Brisbane
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 3 days
H2O_cluster_name:,H2O_from_python_JohnvanOsta_wuajw5
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.961 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,2


Training model for xOffset in tower group 1
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
15:10:03.59: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████ (cancelled)


H2OJobCancelled: Job<$03017f00000132d4ffffffff$_81f2edc73535bbacc40906b5bbf34bf5> was cancelled by the user.

In [None]:
# Turn the tower-relative test predictions into locations using the radio tower coordinates
test_predictions_tower = postprocess_data(test_data_preproc, tower_locs)

# Average the location estimates, then run the error calculation on the result
test_location_estimates = calculate_error(location_averaging(test_predictions_tower))

# Mean of the `error_m` column and the standard error of that mean
location_errors = test_location_estimates['error_m']
mean_error = np.mean(location_errors)
std_error = stats.sem(location_errors)

print(f'Mean error (+/-SE) = {mean_error} (+/- {std_error})')

Mean error (+/-SE) = 256.1809716326903 (+/- 18.612738531367942)


In [None]:
# Write the test location estimates into the model output folder
predictions_path = os.path.join(model_save_path, "test_data_location_predictions.xlsx")
test_location_estimates.to_excel(predictions_path, index=False)