In [1]:
from geojson import LineString
import pandas as pd
import turfpy.measurement as turf_measurement
from turfpy.measurement import along
from typing import List, Tuple
from sqlalchemy import create_engine
import psycopg2

In [2]:
multiplier = 1

# database account
db_username = "postgres"
db_password = "postgres"
db_host = "localhost"
db_port = "5432"
db_name = "bus_trial"


In [113]:
def unique_route_steps(timestamps) :
    individual_steps = []
    for i in range(len(timestamps) - 1):
        difference = timestamps[i + 1] - timestamps[i]
        multiplied_value = round(difference * multiplier)
        individual_steps.append(multiplied_value)
    return individual_steps

def is_equal(coord1, coord2) :
    return coord1[0] == coord2[0] and coord1[1] == coord2[1]

def getShapeCoords(shape_id) :
    print('shape')
    # Create a database connection string
    connection = psycopg2.connect(user=db_username,
                                    password=db_password,
                                    host=db_host,
                                    port=db_port,
                                    database=db_name)

    # Create a cursor to perform database operations
    cursor = connection.cursor()
    #Get the shape data in the database
    query = "SELECT * FROM shapes WHERE shape_id = %s ORDER BY shape_pt_sequence"
    cursor.execute(query, (shape_id,))
    # Get all the results
    results = cursor.fetchall()
    # Close the connection
    cursor.close()
    connection.close()
    # Create a pandas dataframe
    shape_df = pd.DataFrame(results, columns = ['id', 'shape_id', 'lat', 'lon', 'shape_pt_sequence'])
    # Combine the coordinates together 
    shape_df['coordinates'] = shape_df.apply(lambda row: [row['lon'], row['lat']], axis=1)
    # Get all the coordinates into one list 
    shape_coordinates = shape_df['coordinates'].tolist()
    return(shape_coordinates)
    
def route_to_dict(route):
    return {tuple(coord): idx for idx, coord in enumerate(route)}

def interpolation_fps(subsetDataset) :
    subsetDataset = subsetDataset.reset_index(drop=True)
    unique_coordinates = subsetDataset["coordinates"]
    shape_id = str(subsetDataset["shape_id"].unique()[0])
    unique_timestamps = subsetDataset["timestamps"]
    unique_status = subsetDataset["status"]
    
    # Get the bus route coordinates
    unique_route = getShapeCoords(shape_id)
    route_dict = route_to_dict(unique_route)

    # Get the number of frames we want to allocate between each observatoin 
    unique_steps = unique_route_steps(unique_timestamps)
    interpolated_points = []
    status_conditions = [] 


    print(range(len(unique_timestamps) - 1))
    for i in range(len(unique_timestamps) - 1):
        # Check if the bus status has changed. If it has we need to change the colour of the bus 
        changed_bus = False
        first_status = unique_status[i]
    
        if unique_status[i] != unique_status[i + 1]:
            second_status = unique_status[i + 1]
            changed_bus = True
            status_difference = abs(second_status - first_status)

        # subset coordinates are the coordinates we are interested in 
        subset_coordinates = []
        start_point = unique_coordinates[i]
        end_point = unique_coordinates[i + 1]

        # J is a counter 
        j = 0

        # Want to get all the points in between the current value and the next
        while j < len(unique_route):
            # Get the current value 
            current_coordinates = unique_route[j]

            # If we find the value that we want (which is the current node)
            if is_equal(current_coordinates, start_point):
                # Put it into the coordinate system
                subset_coordinates.append(current_coordinates)
                # Collect all the points in between 
                while not is_equal(current_coordinates, end_point):
                    j += 1
                    current_coordinates = unique_route[j]
                    subset_coordinates.append(current_coordinates)
                break
            j += 1

        # Create a linestring object
        line = LineString(subset_coordinates)

        # Get the distance along the line 
        line_distance = turf_measurement.length(line)

        if not changed_bus:
            z = 0
            while z < line_distance:
                interpolated_point = along(line, z)
                interpolated_points.append(tuple(interpolated_point['geometry']['coordinates']))
                status_conditions.append(first_status)
                z += line_distance / unique_steps[i]
        else:
            z = 0
            while z < line_distance:
                interpolated_point = along(line, z)
                interpolated_points.append(tuple(interpolated_point['geometry']['coordinates']))

                interpolation_index = z / line_distance

                if status_difference == 1:
                    if interpolation_index < 0.5:
                        status = first_status
                    else:
                        status = second_status
                elif status_difference == 2:
                    if interpolation_index < 1/3:
                        status = first_status
                    elif 1/3 <= interpolation_index < 2/3:
                        if first_status < second_status:
                            status = first_status + 1
                        else:
                            status = first_status - 1
                    else:
                        status = second_status

                status_conditions.append(status)
                z += line_distance / unique_steps[i]

    # With all the data that we have, we need to input them all into the sql database
    startPoint = subsetDataset["timestamps"].min().item()
    route_name = subsetDataset['route_short_name'].unique()[0]
    
    # Create data for the DataFrame
    data = []
    for i, (point, status) in enumerate(zip(interpolated_points, status_conditions)):
        row = {
            'longitude': point[0],
            'latitude': point[1],
            'status_condition': status,
            'route_name': route_name,
            'timestamp': startPoint + i
        }
        data.append(row)

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)
    return(df)


In [114]:
trial_data = pd.read_csv("complete_data.csv")
trial_data['coordinates'] = trial_data.apply(lambda row: [row['stop_lon'], row['stop_lat']], axis=1)

In [73]:
trial_data

Unnamed: 0,trip_id,shape_id,timestamps,status,stop_lat,stop_lon,route_id,route_short_name,stop_sequence,coordinates
0,1010-98109-18000-2-fc6666b6,1010-98109-d221c68a,1683262820,1,-36.54528,174.70832,981-221,981,1,"[174.70832, -36.54528]"
1,1010-98109-18000-2-fc6666b6,1010-98109-d221c68a,1683263392,1,-36.57805,174.68896,981-221,981,7,"[174.68896, -36.57805]"
2,1010-98109-18000-2-fc6666b6,1010-98109-d221c68a,1683264039,1,-36.60968,174.68729,981-221,981,20,"[174.68729, -36.60968]"
3,1010-98109-18000-2-fc6666b6,1010-98109-d221c68a,1683264244,1,-36.62406,174.66652,981-221,981,21,"[174.66652, -36.62406]"
4,1010-98109-19800-2-fc6666b6,1010-98109-d221c68a,1683264623,1,-36.54528,174.70832,981-221,981,1,"[174.70832, -36.54528]"
...,...,...,...,...,...,...,...,...,...,...
81865,957-12602-68400-2-ea1af35a,957-12602-45320708,1683315763,1,-36.72212,174.71207,126-206,126,27,"[174.71207, -36.72212]"
81866,957-12603-19200-2-953b0419,957-12603-1bf4bceb,1683264001,1,-36.75751,174.59198,126-206,126,1,"[174.59198, -36.75751]"
81867,957-12603-19200-2-953b0419,957-12603-1bf4bceb,1683264109,1,-36.76618,174.58738,126-206,126,4,"[174.58738, -36.76618]"
81868,957-12603-19200-2-953b0419,957-12603-1bf4bceb,1683264581,1,-36.81375,174.60895,126-206,126,11,"[174.60895, -36.81375]"


In [115]:
from multiprocessing import Pool, cpu_count
# Split groups into a list so they can be processed in parallel
groups = [group for _, group in trial_data.groupby('trip_id')]

# Create a worker function to process each group
def worker(group):
    return interpolation_fps(group)

# Use all available CPU cores
num_cores = cpu_count()

# Create a multiprocessing Pool and map the worker function to the groups
with Pool(num_cores) as pool:
    result_dataframes = pool.map(worker, groups)

# Concatenate all resulting DataFrames
combined_dataframe = pd.concat(result_dataframes, ignore_index=True)


KeyboardInterrupt: 

In [34]:
trial_data.groupby('trip_id').apply(interpolation_fps).reset_index()

KeyError: 1

In [57]:
import pandas as pd

# Sample data
data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Charlie'],
    'Subject': ['Math', 'Math', 'Math', 'Physics', 'Physics', 'Physics'],
    'Score': [85, 89, 92, 70, 75, 78]
}

df = pd.DataFrame(data)

# Function to normalize scores
def normalize_scores(group):
    group['Normalized'] = (group['Score'] - group['Score'].mean()) / group['Score'].std()
    return group

# Group by 'Subject' and apply the normalization
df_normalized = df.groupby('Subject').apply(normalize_scores)

print(df_normalized)

   Student  Subject  Score  Normalized
0    Alice     Math     85   -1.044074
1      Bob     Math     89    0.094916
2  Charlie     Math     92    0.949158
3    Alice  Physics     70   -1.072222
4      Bob  Physics     75    0.164957
5  Charlie  Physics     78    0.907265


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_normalized = df.groupby('Subject').apply(normalize_scores)


In [59]:
print(df)

   Student  Subject  Score
0    Alice     Math     85
1      Bob     Math     89
2  Charlie     Math     92
3    Alice  Physics     70
4      Bob  Physics     75
5  Charlie  Physics     78
