# functions

In [1]:
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import time
from datetime import datetime, timedelta, timezone
import os
import matplotlib.pyplot as plt
import seaborn as sns

### def append_csv_files()
appends all the data files in the folder

In [2]:
def append_csv_files(folder_path, output_file):
    combined_df = pd.DataFrame()  # Initialize an empty DataFrame

    for filename in os.listdir(folder_path):  # Iterate through each file in the specified folder
        if filename.endswith('.csv'):  # Check if the file has a .csv extension
            file_path = os.path.join(folder_path, filename)  # Construct the full file path
            df = pd.read_csv(file_path)  # Read the CSV file into a DataFrame

            if combined_df.empty:  # If the combined DataFrame is empty, initialize it with the first DataFrame
                combined_df = df
            else:
                if list(combined_df.columns) == list(df.columns):  # Check if the column names are the same
                    combined_df = pd.concat([combined_df, df], ignore_index=True)  # Append the rows

    combined_df.to_csv(output_file, index=False) 

#examples: 
#folder_path = 'C:\\Users\\kmh\\Documents\\DATA\\2024-5\\RFID'  # Replace with your folder path
#output_file = 'RFID_24Dec11_04.csv'
#append_csv_files(folder_path, output_file) 

### def append_force_text_files()

In [3]:
def append_force_text_files(folder_path, output_file):
    # Initialize an empty DataFrame
    combined_df = pd.DataFrame()

    # Open the output file in write mode
    with open(output_file, 'w') as f_out:
        for filename in os.listdir(folder_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(folder_path, filename)
                df = pd.read_csv(file_path, delimiter=',')  # Read the text file into a DataFrame

                if combined_df.empty:
                    combined_df = df
                    combined_df.to_csv(f_out, index=False, sep=',')  # Write the header and first chunk
                else:
                    df.to_csv(f_out, index=False, sep=',', header=False)  # Append without writing the header

### def convert_time() for force, rfid, and IR


In [4]:
# Function to convert seconds since 1904-01-01 to datetime format for force data
def convert_time_force(seconds):
    base_time = datetime(1904, 1, 1)
    return (base_time + timedelta(seconds=seconds)).strftime('%Y-%m-%d %H:%M:%S:%f')


# Function to convert microseconds since 1970-01-01 to datetime format for rfid data
def convert_time_rfid(microseconds):
    if pd.isna(microseconds):
        return np.nan
    base_time = datetime(1970, 1, 1)
    return (base_time + timedelta(microseconds=microseconds)).strftime('%Y-%m-%d %H:%M:%S:%f')


# Function to convert microseconds since 1970-01-01 to datetime format for IR data
def convert_time_IR(milliseconds):
    base_time = datetime(1970, 1, 1)
    return (base_time + timedelta(milliseconds=milliseconds)).strftime('%Y-%m-%d %H:%M:%S:%f')



### def find_and_filter_sequence()

In [5]:
# Define the sequence to filter

# the df should have the following columns and format:

#Sensor               int32
#Time               float64
#datetime    datetime64[ns]

sequence = [1, 2, 3, 4, 5]
# Function to find and filter the sequence in the 'Sensor' column with possible repeated elements
def find_and_filter_sequence(df, seq):
    seq_len = len(seq)
    filtered_indices = []
    takeoff_event = []
    event_number = 1
    i = 0
    
    while i < len(df):
        if df['Sensor'].iloc[i] == seq[0]:
            match = True
            seq_index = 0
            for j in range(i, len(df)):
                if df['Sensor'].iloc[j] == seq[seq_index]:
                    seq_index += 1
                    if seq_index == seq_len:
                        # Include all subsequent repeated elements of the last sequence element
                        while j + 1 < len(df) and df['Sensor'].iloc[j + 1] == seq[-1]:
                            j += 1
                        filtered_indices.extend(range(i, j + 1))
                        takeoff_event.extend([event_number] * (j - i + 1))
                        event_number += 1
                        i = j
                        break
                elif df['Sensor'].iloc[j] != seq[seq_index - 1]:
                    match = False
                    break
        i += 1
    
    filtered_df = df.iloc[filtered_indices].copy()
    filtered_df['takeoff_event'] = takeoff_event
    
    return filtered_df

# use example
#filtered_IR_Sensor = find_and_filter_sequence(IR_unique, sequence)
#filtered_IR_Sensor: full dataset with 'Sensor' column filtered

# Display the filtered DataFrame A
#print(filtered_IR_Sensor.head(10))
#print(filtered_IR_Sensor.shape)

### def isolate_takeoff()

In [6]:
# the input df should have the following format, as the product of def find_and_filter_sequence()

#index                     int64
#Sensor                    int32
#Time                    float64
#datetime         datetime64[ns]
#takeoff_event             int64
#dtype: object

def isolate_takeoff(df):

    df = df.drop_duplicates().sort_values(by = 'datetime').reset_index() #drop duplicates that has same timestamp 

    i = 0
    keep_indices = []

    while i < len(df) - 1:
        first_sensor_reading = df['Sensor'].iloc[i]
        
        # Always keep the first reading of a new sensor value
        if i == 0 or df['Sensor'].iloc[i] != df['Sensor'].iloc[i - 1]:
            keep_indices.append(i)
        
        if df['Sensor'].iloc[i + 1] == first_sensor_reading:
            time_diff = (df['datetime'].iloc[i + 1] - df['datetime'].iloc[i]).total_seconds()
            
            if time_diff < 0.5:
                keep_indices.append(i + 1)
            else:
                # Skip all subsequent rows with the same sensor value
                while i < len(df) - 1 and df['Sensor'].iloc[i + 1] == first_sensor_reading:
                    i += 1
        else:
            keep_indices.append(i)
        
        i += 1

    # Add the last index if it wasn't added
    if i == len(df) - 1 and (df['datetime'].iloc[i] - df['datetime'].iloc[i - 1]).total_seconds() < 0.5:
        keep_indices.append(i)

    # Drop duplicates in keep_indices
    keep_indices = list(dict.fromkeys(keep_indices))

    return df.iloc[keep_indices]


# use example
# Apply the function and show the result
#final_filtered_df = isolate_takeoff(filtered_df)

### def process_row 
- matching ir and rfid one row at a time. 

In [7]:
#RFID_match should have the following format:
#  
#id                              object
#status                          object
#epoch_time_converted    datetime64[ns]
#dtype: object
#<class 'pandas.core.frame.DataFrame'>

#takeoff_match should have the following format:
# 
#takeoff_event	datetime	RFID
#0	1	2024-12-11 08:00:48.289	<NA>
#1	2	2024-12-11 08:02:31.780	<NA>
#2	3	2024-12-11 08:02:36.599	<NA>


def process_row(i, RFID_match, takeoff_match):
    print(f"Processing row: {i}")
    if RFID_match['status'].iloc[i] == "Arrive" and RFID_match['status'].iloc[i + 1] in ["Displace", "Depart"]:
        arrival_time = RFID_match['epoch_time_converted'].iloc[i]
        depart_time = RFID_match['epoch_time_converted'].iloc[i + 1]
        arrival_RFID = RFID_match['id'].iloc[i]
        #print(f"Arrive found at row {i} with RFID {arrival_RFID} from {arrival_time} to {depart_time}")

        # Vectorized operation to assign RFID
        mask = (takeoff_match['datetime'] > arrival_time) & (takeoff_match['datetime'] < depart_time)
        takeoff_match.loc[mask, 'RFID'] = arrival_RFID
        print(f"Assigned RFID {arrival_RFID} to {mask.sum()} rows in takeoff_match")

    elif RFID_match['status'].iloc[i] == "Displace" and RFID_match['status'].iloc[i + 1] == "Depart":
        arrival_time = RFID_match['epoch_time_converted'].iloc[i]
        depart_time = RFID_match['epoch_time_converted'].iloc[i + 1]
        arrival_RFID = RFID_match['id'].iloc[i]
        #print(f"Displace found at row {i} with RFID {arrival_RFID} from {arrival_time} to {depart_time}")

        # Vectorized operation to assign RFID
        mask = (takeoff_match['datetime'] > arrival_time) & (takeoff_match['datetime'] < depart_time)
        takeoff_match.loc[mask, 'RFID'] = arrival_RFID
        print(f"Assigned RFID {arrival_RFID} to {mask.sum()} rows in takeoff_match")

# Use example: Process rows sequentially
#for i in tqdm(range(len(RFID_match) - 1)):
#    process_row(i) #just 6.4 rows

### def firstbroken()

In [8]:
#the input df should be in this format
#index              int64
#Sensor             int32
#Time             float64
#datetime          object
#takeoff_event      int64
#RFID              object
#dtype: object

#returns results_df with firstbroken sensors in the format of: 
#RFID	firstbroken1	firstbroken2	firstbroken3	firstbroken4	firstbroken5
#0	3B0018C6F9	1.733904e+12	NaN	NaN	NaN	NaN
#1	<NA>	NaN	1.733904e+12	NaN	NaN	NaN
#2	<NA>	NaN	NaN	1.733904e+12	NaN	NaN
#3	<NA>	NaN	NaN	NaN	1.733904e+12	NaN
#4	<NA>	NaN	NaN	NaN	NaN	1.733904e+12


def firstbroken(df):
    # Initialize the results list
    results_list = []
    # Initialize start_index
    start_index = 0

    # Iterate over the DataFrame
    for i in range(len(df)):
        first_read = df['Time'].iloc[start_index]
        current_sensor = df['Sensor'].iloc[start_index]

        #rfid assignment
        rfid = df['RFID'].iloc[start_index] if current_sensor == 1 else pd.NA

        # Debugging print statements 
        print(f"Iteration {i}:")
        print(f"  start_index: {start_index}")
        print(f"  first_read: {first_read}")
        print(f"  current_sensor: {current_sensor}")
        print(f"  rfid: {rfid}")


        for j in range(start_index, len(df)):
            if df['Sensor'].iloc[j] != current_sensor:
                start_index = j
                print(f"  Sensor changed at index {j}, new start_index: {start_index}")
                break

        results_list.append({
            f'RFID': rfid,
            f'firstbroken{current_sensor}': first_read,
        })

    # Convert results_list to DataFrame
    results_df = pd.DataFrame(results_list)

    print(results_df)



### def irfid_allign()
- alligns the above results_df without NA 

In [9]:
def irfid_allign(results_df):
    # Initialize an empty DataFrame with specified columns
    irfid_alligned = pd.DataFrame(columns=['RFID', 'firstbroken1', 'firstbroken2', 'firstbroken3', 'firstbroken4', 'firstbroken5'])

    # Iterate through results_df
    for i in range(len(results_df) - 4):
        if pd.notna(results_df['RFID'].iloc[i]):
            rfid = results_df['RFID'].iloc[i]
            firstbroken1 = results_df['firstbroken1'].iloc[i]
            firstbroken2 = results_df['firstbroken2'].iloc[i+1]
            firstbroken3 = results_df['firstbroken3'].iloc[i+2]
            firstbroken4 = results_df['firstbroken4'].iloc[i+3]
            firstbroken5 = results_df['firstbroken5'].iloc[i+4]
            firstbroken_values = [firstbroken1, firstbroken2, firstbroken3, firstbroken4, firstbroken5]

            #print(f"Checking row {i}: RFID={rfid}, firstbroken_values={firstbroken_values}")

            if pd.notna(firstbroken_values).all():
                row = [rfid] + firstbroken_values
                irfid_alligned.loc[len(irfid_alligned)] = row

    print(irfid_alligned)


#output dataset: 
#RFID	firstbroken1	firstbroken2	firstbroken3	firstbroken4	firstbroken5
#0	3B0018C6F9	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12
#1	3B00185E23	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12

### def speed_calculation()

In [10]:
#the input irfid_new looks like this:
#RFID	firstbroken1	firstbroken2	firstbroken3	firstbroken4	firstbroken5	datetime	takeoff_event
#0	3B0018C6F9	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	2024-12-11 08:00:48.289	1
#1	3B00185E23	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	2024-12-11 08:02:36.599	3
#2	3B00185CB8	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	1.733904e+12	2024-12-11 08:05:55.755	6



#the output irfid_calc looks like this: 
#RFID  firstbroken1  firstbroken2  firstbroken3  firstbroken4  \
#0  3B0018C6F9      1.733904      1.733904      1.733904      1.733904   
#1  3B00185E23      1.733904      1.733904      1.733904      1.733904   

#   firstbroken5                 datetime  takeoff_event           t12  \
#0      1.733904  2024-12-11 08:00:48.289              1  2.830001e-10   
#1      1.733904  2024-12-11 08:02:36.599              3  1.340001e-10   

#            t23  ...           t45            v1            v2            v3  \
#0  1.310001e-10  ...  6.699996e-11  4.770317e+08  1.030534e+09  2.327593e+09   
#1  6.899992e-11  ...  7.399992e-11  1.007462e+09  1.956524e+09  2.288130e+09   

#             v4         avg.v          acc1          acc2          acc3  \
#0  2.014927e+09  1.001855e+09  2.673921e+18  1.372550e+19 -5.002674e+18   
#1  1.824326e+09  1.607142e+09  9.350367e+18  5.181344e+18 -6.974491e+18   
  
#        avg.acc  
#0 -1.310527e+18  
#1 -4.342505e+18  


def speed_calculation(irfid_new):
    irfid_calc = pd.DataFrame()
    irfid_calc = irfid_new
    #seconds
    irfid_calc['firstbroken1'] = irfid_calc['firstbroken1']/1000
    irfid_calc['firstbroken2'] = irfid_calc['firstbroken2']/1000
    irfid_calc['firstbroken3'] = irfid_calc['firstbroken3']/1000
    irfid_calc['firstbroken4'] = irfid_calc['firstbroken4']/1000
    irfid_calc['firstbroken5'] = irfid_calc['firstbroken5']/1000

    #time
    irfid_calc['t12'] = irfid_calc['firstbroken2'] - irfid_calc['firstbroken1']
    irfid_calc['t23'] = irfid_calc['firstbroken3'] - irfid_calc['firstbroken2']
    irfid_calc['t34'] = irfid_calc['firstbroken4'] - irfid_calc['firstbroken3']
    irfid_calc['t45'] = irfid_calc['firstbroken5'] - irfid_calc['firstbroken4']

    #speed
    irfid_calc['v1'] = 0.135/irfid_calc['t12']
    irfid_calc['v2'] = 0.135/irfid_calc['t23']
    irfid_calc['v3'] = 0.135/irfid_calc['t34']
    irfid_calc['v4'] = 0.135/irfid_calc['t45']
    irfid_calc['avg.v'] = 0.540/(irfid_calc['firstbroken5'] - irfid_calc['firstbroken1'])

    #acceleration
    irfid_calc['acc1'] = 2*(irfid_calc['v2'] - irfid_calc['v1'])/(irfid_calc['t12'] + irfid_calc['t23'])
    irfid_calc['acc2'] = 2*(irfid_calc['v3'] - irfid_calc['v2'])/(irfid_calc['t23'] + irfid_calc['t34'])
    irfid_calc['acc3'] = 2*(irfid_calc['v4'] - irfid_calc['v3'])/(irfid_calc['t34'] + irfid_calc['t45'])
    irfid_calc['avg.acc'] = irfid_calc['v4'] - irfid_calc['v1']/(irfid_calc['t23']+irfid_calc['t34']+1/2*(irfid_calc['t12']+irfid_calc['t45']))

    #filtering
    irfid_calc[(irfid_calc['t12'] <0.25 )& (irfid_calc['t23'] <0.25)] #flying bird should exit the section less than quarter of a second.
    print(irfid_calc.head(3))
    print(irfid_calc.shape)

### def match_keys_preparation()

In [11]:
def match_keys_preparation(F_df, RFID_match):

    F_df_uniq_ts = pd.DataFrame()
    RFID_match_uniq_ts = pd.DataFrame()

    #F_df preparation
    F_df_uniq_ts['hr'] = F_df['datetime'].dt.hour
    F_df_uniq_ts['min'] = F_df['datetime'].dt.minute
    F_df_uniq_ts['s'] = F_df['datetime'].dt.second

    #RFID_match preparation
    RFID_match_uniq_ts['hr'] = RFID_match['epoch_time_converted'].dt.hour
    RFID_match_uniq_ts['min'] = RFID_match['epoch_time_converted'].dt.minute
    RFID_match_uniq_ts['s'] = RFID_match['epoch_time_converted'].dt.second
    RFID_match_uniq_ts['rfid'] = RFID_match['id']
    RFID_match_uniq_ts['status'] = RFID_match['status']

    #unique combinations of hr,min, and s 
    F_match_keys = F_df_uniq_ts[['hr', 'min','s']].drop_duplicates()
    RFID_match_keys = RFID_match_uniq_ts[['hr', 'min','s','rfid', 'status']].drop_duplicates()
    #print(f'RFID_match_keys: {RFID_match_keys.head(5)}')
    # Create a timestamp column from hr, min, s columns
    # need to go through the pain of making it a datetime format in case 59 + 1 becomes 00 in seconds

    
    F_match_keys['timestamp'] = pd.to_datetime(F_match_keys[['hr', 'min', 's']]
                                            .astype(str).agg(':'.join, axis=1), format='%H:%M:%S')

    RFID_match_keys['timestamp'] = pd.to_datetime(RFID_match_keys[['hr', 'min', 's']]
                                            .astype(str).agg(':'.join, axis=1), format='%H:%M:%S')

    #print(f'RFID_match_keys in the function: {RFID_match_keys.head(5)}')


    # Initialize event counter and event list
    event_counter = 1
    events = [event_counter]

    # Iterate over the rows of the DataFrame to mark the event 
    # event = a set of consecutive seconds
    for i in range(len(F_match_keys) - 1):
        if F_match_keys['timestamp'].iloc[i + 1] == F_match_keys['timestamp'].iloc[i] + timedelta(seconds=1):
            events.append(event_counter)
        else:
            event_counter += 1
            events.append(event_counter)

    # Add the event column to the F_match_keys 
    F_match_keys['event'] = events
    F_match_keys['timestamp'] = F_match_keys['timestamp'].dt.time
    RFID_match_keys['timestamp'] = RFID_match_keys['timestamp'].dt.time

    # Reset index and make it a column named 'index'
    F_match_keys.reset_index(inplace=True)
    F_match_keys.rename(columns={'index': 'index'}, inplace=True)
    #print(F_match_keys)

    #add RFID column as a matchkey
    F_match_keys['RFID'] =pd.NA

    return F_match_keys, RFID_match_keys





### def assign()

In [12]:
def assign(RFID_match_keys, F_match_keys, F_df, start_index=0):  
    current_F_valid = pd.DataFrame()
    F_match_keys_in = pd.DataFrame()

    for i in range(len(RFID_match_keys) - 1):
        if RFID_match_keys.loc[i, 'status'] == "Arrive" and RFID_match_keys.loc[i + 1, 'status'] in ["Displace", "Depart"]:
            arrival_time = RFID_match_keys.loc[i, 'timestamp']  
            depart_time = RFID_match_keys.loc[i + 1, 'timestamp'] 
            arrival_RFID = RFID_match_keys.loc[i, 'rfid']

            # Adjust the mask to start from start_index
            mask = (F_match_keys['timestamp'] >= arrival_time) & (F_match_keys['timestamp'] <= depart_time) & (F_match_keys['index'] >= start_index)
                
            if not F_match_keys.loc[mask].empty:
                matching_event = F_match_keys.loc[mask, 'event'].iloc[0]
                matching_start_index = F_match_keys.loc[mask, 'index'].iloc[0]
                matching_end_index = F_match_keys.loc[mask, 'index'].iloc[-1]

                #directly stores assigns RFID to the raw F_df file in current_F_valid
                current_F_valid = F_df.loc[matching_start_index:(matching_end_index+999), :]
                current_F_valid['RFID'] = arrival_RFID
                current_F_valid['event'] = matching_event

                F_match_keys.loc[mask, 'RFID'] = arrival_RFID
                F_match_keys_in = F_match_keys[F_match_keys['RFID'].notna()]

                return F_match_keys_in, current_F_valid, matching_end_index

    return F_match_keys_in, current_F_valid, None