In [61]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from pathlib import Path

In [62]:
tower_data_path = Path(r"C:\Users\s5236256\Google Drive\E2M\BTFRP\Data\Radio Telemetry\MATLAB\RT_MASTER_DATA\RTdatMaster20230625.csv")
tag_locations_path = 'Example_data\Input\Training_data\Tag_GPS_locations_time_filt.xlsx'
output_path = 'Example_data\Output\Train_test_data'

In [63]:
# Import data
dtypes = {'DateAndTime': 'str'}
parse_dates = ['DateAndTime']
RTdat = pd.read_csv(tower_data_path, dtype=dtypes, parse_dates=parse_dates, dayfirst=True)
# RTdat = RTdat.loc[RTdat['TagID']==70] # for testing, to be commented out

parse_dates = ['Start_time', 'End_time']
SIMdat = pd.read_excel(tag_locations_path, parse_dates=parse_dates)

In [64]:
#Train test split of simulation data
seed = 38 # Seed for train/test split
test_size = 0.2

SIMdat_train, SIMdat_test = train_test_split(SIMdat, test_size=test_size, random_state=seed)

Add Latitude and longitude to RTdat to identify simulation data

In [65]:
def append_coordinates(RTdat, SIMdat):
    progress_interval = 10000
    try:
        RTdat_sim = RTdat.copy()  # Create a separate copy to avoid modifying the original DataFrame

        # Extract relevant columns to improve performance
        sim_columns = ['Start_time', 'End_time', 'Tag_ID', 'Data_type', 'POINT_X', 'POINT_Y', 'Point_ID']
        sim_query = SIMdat[sim_columns]

        for i, (index, row) in enumerate(RTdat_sim.iterrows()):
            query_mask = (sim_query['Start_time'] <= row['DateAndTime']) & \
                         (sim_query['End_time'] >= row['DateAndTime']) & \
                         (sim_query['Tag_ID'] == row['TagID'])
            
            query_result = sim_query[query_mask]
            
            if not query_result.empty:
                RTdat_sim.at[index, 'Data_type'] = query_result.at[query_result.index[0], 'Data_type']
                RTdat_sim.at[index, 'POINT_X'] = query_result.at[query_result.index[0], 'POINT_X']
                RTdat_sim.at[index, 'POINT_Y'] = query_result.at[query_result.index[0], 'POINT_Y']
                RTdat_sim.at[index, 'Point_ID'] = query_result.at[query_result.index[0], 'Point_ID']

            if i % progress_interval == 0 and i > 0:
                print("Progress: {} rows processed".format(i))

        # Remove rows with missing coordinates
        RTdat_sim = RTdat_sim.dropna(subset=['POINT_X', 'POINT_Y'])

        return RTdat_sim
    except Exception as e:
        print("An error occurred:", str(e))
        # Handle the error or raise it again to propagate it to the caller
        raise


In [66]:
RTdat_sim_train = append_coordinates(RTdat, SIMdat_train)
RTdat_sim_test = append_coordinates(RTdat, SIMdat_test)

Progress: 10000 rows processed
Progress: 20000 rows processed
Progress: 30000 rows processed
Progress: 40000 rows processed
Progress: 50000 rows processed
Progress: 60000 rows processed
Progress: 70000 rows processed
Progress: 80000 rows processed
Progress: 90000 rows processed
Progress: 100000 rows processed
Progress: 110000 rows processed
Progress: 120000 rows processed
Progress: 130000 rows processed
Progress: 140000 rows processed
Progress: 150000 rows processed
Progress: 160000 rows processed
Progress: 170000 rows processed
Progress: 180000 rows processed
Progress: 190000 rows processed
Progress: 200000 rows processed
Progress: 210000 rows processed
Progress: 220000 rows processed
Progress: 230000 rows processed
Progress: 240000 rows processed
Progress: 250000 rows processed
Progress: 260000 rows processed
Progress: 270000 rows processed
Progress: 280000 rows processed
Progress: 290000 rows processed
Progress: 300000 rows processed
Progress: 310000 rows processed
Progress: 320000 

In [67]:
# Save the data into training and testing files

# Create the output directory if it doesn't exist
if not os.path.exists(output_path):
    os.makedirs(output_path, exist_ok=True)

# Save the data into training and testing files
filename = os.path.basename(tag_locations_path)

train_output_path = os.path.join(output_path, 'Training_' + filename)
test_output_path = os.path.join(output_path, 'Testing_' + filename)

RTdat_sim_train.to_excel(train_output_path, index=False)
RTdat_sim_test.to_excel(test_output_path, index=False)