In [118]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [119]:
tower_data_path = 'Example_data\Input\Lotek_combined_csv\Lotek_combined_data.csv'
tag_locations_path = 'Example_data\Input\Training_data\Tag_GPS_locations.xlsx'
output_path = 'Example_data\Output\Train_test_data'

In [120]:
# Import data
dtypes = {'Datetime': 'str'}
parse_dates = ['Datetime']
RTdat = pd.read_csv(tower_data_path, dtype=dtypes, parse_dates=parse_dates, dayfirst=True)
# RTdat = RTdat.loc[RTdat['TagID']==114] # for testing, to be commented out
# RTdat

parse_dates = ['Start_time', 'End_time']
SIMdat = pd.read_excel(tag_locations_path, parse_dates=parse_dates)

In [121]:
#Train test split of simulation data
seed = 38 # Seed for train/test split
test_size = 0.2

SIMdat_train, SIMdat_test = train_test_split(SIMdat, test_size=test_size, random_state=seed)

Add Latitude and longitude to RTdat to identify simulation data

In [122]:
# def append_coordinates(RTdat, SIMdat):
#     progress_interval = 10000
#     try:
#         for i, (index, row) in enumerate(RTdat.iterrows()):
#             sim_query = SIMdat.query("Start_time <= @row.Datetime and End_time >= @row.Datetime and Tag_ID == @row.TagID")
#             if not sim_query.empty:
#                 d = sim_query.Data_type.iloc[0]
#                 x = sim_query.POINT_X.iloc[0]
#                 y = sim_query.POINT_Y.iloc[0]
#                 RTdat.loc[index, 'Data_type'] = d
#                 RTdat.loc[index, 'POINT_X'] = x
#                 RTdat.loc[index, 'POINT_Y'] = y

#             if i % progress_interval == 0 and i > 0:
#                 print("Progress: {} rows processed".format(i))    
#         # RTdat = RTdat.loc[~RTdat['Data_type'].isna()]
#         return RTdat
#     except Exception as e:
#         print("An error occurred:", str(e))
#         # Handle the error or raise it again to propagate it to the caller
#         raise


In [123]:
def append_coordinates(RTdat, SIMdat):
    progress_interval = 10000
    try:
        RTdat_sim = RTdat.copy()  # Create a separate copy to avoid modifying the original DataFrame

        # Extract relevant columns to improve performance
        sim_columns = ['Start_time', 'End_time', 'Tag_ID', 'Data_type', 'POINT_X', 'POINT_Y']
        sim_query = SIMdat[sim_columns]

        for i, (index, row) in enumerate(RTdat_sim.iterrows()):
            query_mask = (sim_query['Start_time'] <= row['Datetime']) & \
                         (sim_query['End_time'] >= row['Datetime']) & \
                         (sim_query['Tag_ID'] == row['TagID'])

            query_result = sim_query[query_mask]
            if not query_result.empty:
                RTdat_sim.at[index, 'Data_type'] = query_result.at[query_result.index[0], 'Data_type']
                RTdat_sim.at[index, 'POINT_X'] = query_result.at[query_result.index[0], 'POINT_X']
                RTdat_sim.at[index, 'POINT_Y'] = query_result.at[query_result.index[0], 'POINT_Y']

            if i % progress_interval == 0 and i > 0:
                print("Progress: {} rows processed".format(i))
        # RTdat_sim = RTdat_sim.loc[~RTdat_sim['Data_type'].isna()]
        return RTdat_sim
    except Exception as e:
        print("An error occurred:", str(e))
        # Handle the error or raise it again to propagate it to the caller
        raise


In [124]:
RTdat_sim_train = append_coordinates(RTdat, SIMdat_train)
RTdat_sim_test = append_coordinates(RTdat, SIMdat_test)

Progress: 10000 rows processed
Progress: 20000 rows processed
Progress: 30000 rows processed
Progress: 40000 rows processed
Progress: 50000 rows processed
Progress: 60000 rows processed
Progress: 70000 rows processed
Progress: 80000 rows processed
Progress: 90000 rows processed
Progress: 100000 rows processed
Progress: 110000 rows processed
Progress: 120000 rows processed
Progress: 130000 rows processed
Progress: 140000 rows processed
Progress: 150000 rows processed
Progress: 160000 rows processed
Progress: 170000 rows processed
Progress: 180000 rows processed
Progress: 190000 rows processed
Progress: 10000 rows processed
Progress: 20000 rows processed
Progress: 30000 rows processed
Progress: 40000 rows processed
Progress: 50000 rows processed
Progress: 60000 rows processed
Progress: 70000 rows processed
Progress: 80000 rows processed
Progress: 90000 rows processed
Progress: 100000 rows processed
Progress: 110000 rows processed
Progress: 120000 rows processed
Progress: 130000 rows proc

In [125]:
# Save the data into training and testing files

# Create the output directory if it doesn't exist
if not os.path.exists(output_path):
    os.makedirs(output_path, exist_ok=True)

# Save the data into training and testing files
filename = os.path.basename(tag_locations_path)

train_output_path = os.path.join(output_path, 'Training_' + filename)
test_output_path = os.path.join(output_path, 'Testing_' + filename)

RTdat_sim_train.to_excel(train_output_path, index=False)
RTdat_sim_test.to_excel(test_output_path, index=False)