In [4]:
import pandas as pd
import h2o
import os
import utm
import numpy as np
from utils.to_latlon import to_latlon
from utils.preprocessing import preprocess
from utils.postprocessing import postprocess_data

In [5]:
# Filepaths to input data
all_tags_path = r'C:\Users\John\SynologyDrive\JOBS\~2019\QEJ19152\DATA ANALYSIS\MOVEMENT_ECOLOGY\MASTER_DATA\ART_DATA\All_TAGS'
radio_tower_xy_path = 'Example_data\Input\Radio_tower_locations\RTEastNorth_2groups.xlsx'

# Filepaths to trained models
model_save_path = 'Example_data\Output\Trained_models\MovEcolPaper'

# Variable parameters
freq = '3min' # Frequency of data
data_type = None

routine = 'prediction'
dimensions = ['xOffset', 'yOffset']
predictors = ['ant1_mean', 'ant2_mean', 'ant3_mean', 'ant4_mean', 'ant1_count', 'ant2_count', 'ant3_count', 'ant4_count', 'ant1_std', 'ant2_std', 'ant3_std', 'ant4_std', 'mean_std', 'total_count']

In [6]:
# Combine all tag data files into a single dataframe
all_dataframes = []

# Loop through each file in the folder
for filename in os.listdir(all_tags_path):
    if filename.endswith('.xlsx') or filename.endswith('.xls'):
        # Construct full file path
        file_path = os.path.join(all_tags_path, filename)
        # Read the excel file and append to the list
        df = pd.read_excel(file_path)
        all_dataframes.append(df)

# Concatenate all dataframes
pred_data = pd.concat(all_dataframes, ignore_index=True)
pred_data['DateAndTime'] = pd.to_datetime(pred_data['DateAndTime'])

In [7]:
# Get tower locations
tower_locs = pd.read_excel(radio_tower_xy_path)

In [8]:
# Preprocess the unlabelled data
pred_data_preproc, predictors_predict = preprocess(pred_data, freq, routine)

# Create a dictionary of the coordinates of the towers
offset_dict = tower_locs.set_index('TowerID').to_dict()
tower_g = offset_dict['tower_group']

# Add the model group
pred_data_preproc['tower_group'] = pred_data_preproc['TowerID'].map(tower_g).fillna(0)

up to preprocess
created DateTime
down routine prediction route
prediction route subprocess complete


In [32]:
tower_groups = tower_locs['tower_group'].unique()

# Initialise h2o
h2o.init(nthreads = 2)

# Make predictions for each tower group and dimension
for tower_group in tower_groups:
    for dimension in dimensions:
        # Load the trained model
        try:
            if dimension == 'xOffset':
                model = h2o.load_model(f"{model_save_path}\{dimension}_group_{tower_group}_model")
                
            elif dimension == 'yOffset':
                model = h2o.load_model(f"{model_save_path}\{dimension}_group_{tower_group}_model")
            else:
                raise ValueError(f"Error loading the model: {dimension}")
        except Exception as e:
            print(f"Error loading model for dimension '{dimension}': {str(e)}")
    
        # Make predictions on the test data
        data_input = pred_data_preproc[pred_data_preproc['tower_group'] == tower_group]
        data_input['unique_index'] = data_input.index ## Delete once bug tested
        unlabelled_data = h2o.H2OFrame(data_input, header=1)
        preds = model.predict(unlabelled_data)

        # Save predictions to a new column in the test dataframe
        pred_column_name = f"{dimension}_pred"
        pred_data_preproc.loc[pred_data_preproc['tower_group'] == tower_group, pred_column_name] = preds.as_data_frame().values
        

# Stop h2o
h2o.cluster().shutdown()

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,29 mins 11 secs
H2O_cluster_timezone:,Australia/Sydney
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,7 months and 11 days
H2O_cluster_name:,H2O_from_python_John_ovtgbs
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.054 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_input['unique_index'] = data_input.index ## Delete once bug tested


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
DateTime             TowerID      TagID    ant1_count    ant2_count    ant3_count    ant4_count    ant1_mean    ant2_mean    ant3_mean    ant4_mean    ant1_std    ant2_std    ant3_std    ant4_std    mean_std    total_count    tower_group    xOffset_pred    unique_index
2020-08-29 11:36:00  RT03            10             1             0             0             0          116        0              0            0        0          0            0         0            0                    1              1         59.5707             550
2020-08-29 11:54:00  RT03            10             1             0             0             0          107        0              0            0        0          0            0         0            0                    1              1         29.8262             580
2020-08-29 11:57:00  RT03            10             3             0             0             0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_input['unique_index'] = data_input.index ## Delete once bug tested


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
DateTime             TowerID      TagID    ant1_count    ant2_count    ant3_count    ant4_count    ant1_mean    ant2_mean    ant3_mean    ant4_mean    ant1_std    ant2_std    ant3_std    ant4_std    mean_std    total_count    tower_group    xOffset_pred    unique_index
2020-08-29 11:36:00  RT03            10             1             0             0             0          116        0              0            0        0          0            0         0            0                    1              1         59.5707             550
2020-08-29 11:54:00  RT03            10             1             0             0             0          107        0              0            0        0          0            0         0            0                    1              1         29.8262             580
2020-08-29 11:57:00  RT03            10             3             0             0             0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_input['unique_index'] = data_input.index ## Delete once bug tested


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
DateTime             TowerID      TagID    ant1_count    ant2_count    ant3_count    ant4_count    ant1_mean    ant2_mean    ant3_mean    ant4_mean    ant1_std    ant2_std    ant3_std    ant4_std    mean_std    total_count    tower_group    xOffset_pred    yOffset_pred    unique_index
2020-08-29 06:03:00  RT01            10             0             0             1             0            0            0      155                0           0           0     0                 0     0                    1              2             nan             nan               0
2020-08-29 06:06:00  RT01             7             0             0             1             0            0            0      116                0           0           0     0                 0     0                    1              2             nan             nan               1
2020-08-29 06:06:00  RT01            10        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_input['unique_index'] = data_input.index ## Delete once bug tested


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
DateTime             TowerID      TagID    ant1_count    ant2_count    ant3_count    ant4_count    ant1_mean    ant2_mean    ant3_mean    ant4_mean    ant1_std    ant2_std    ant3_std    ant4_std    mean_std    total_count    tower_group    xOffset_pred    yOffset_pred    unique_index
2020-08-29 06:03:00  RT01            10             0             0             1             0            0            0      155                0           0           0     0                 0     0                    1              2        293.657              nan               0
2020-08-29 06:06:00  RT01             7             0             0             1             0            0            0      116                0           0           0     0                 0     0                    1              2        267.966              nan               1
2020-08-29 06:06:00  RT01            10        

In [33]:
predictions = postprocess_data(pred_data_preproc, tower_locs)

# I added the below line on 06/01/2024, need to confirm that it works ok
predictions = location_averaging(predictions)

In [34]:
predictions[['latitude_pred', 'longitude_pred']] = predictions.apply(lambda row: pd.Series(to_latlon(row['easting_pred'], row['northing_pred'], row['zone_number'], row['zone_letter'])), axis=1)


In [35]:
predictions.to_excel("predictions.xlsx", index=False)