<a href="https://colab.research.google.com/github/harry-erskine/PRBX-Travel-Mode-Identification/blob/main/Source%20Code/Pre-Processing/Pre_Processing_Steps_4_to_9_Organise_Trajectories_into_Groups%2C_Calculate_Motion_Values%2C_and_Export_a_Final_Pandas_DataFrame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Pre-Processing - Steps 4 to 9: Organise Trajectories into Groups, Calculate Motion Values, and Export a Final Pandas DataFrame**

This notebook will contain only the crucial blocks from steps 4 to 9 (so will not include some of the extra code that I have written to test or analyse my solution).

The purpose of this notebook is to organise the trajectories into group sizes of our choice (this can be done by changing the 'groupSize' variable that is visible below). This way we can create dataframes of varying group sizes, all from this notebook.

Also, where necessary, please change the variables for 'yourMountPath' and 'yourDrivePath' as well as uncommenting the installs in the 'Notebook Setup' section.

These steps will take up to 2 hours with a 'groupSize' of 16 and considerably less with higher group sizes.

In [None]:
### VARIABLES ###

groupSize = 8                  # indicates the number of trajectories that are grouped together

# The first 3 trajectories of every group are removed later on (a jerk value needs
# 3 prior points), so a group must hold more than 3 trajectories to leave any data.
if groupSize <= 3:
  raise ValueError(f"groupSize must be greater than 3, got {groupSize}")

newGroupSize = groupSize - 3    # Do not change this variable (trajectories left per group once the first 3 are removed)

### **Notebook Setup**

In [None]:
###
### If you were to run this notebook for yourself,
### after you have copied the whole 'PRBX' root folder correctly,
### the strings 'yourDrivePath' and 'yourMountPath'
### would be the only variables that you would need to change
### within this entire notebook for the rest of the code to work.
###

# Leave either string empty ('') to fall back on the matching 'main...' default below
yourMountPath = ''  # add where you are mounting your drive here
yourDrivePath = ''  # add your drive path here

# these are the drive paths which I have been using
mainMountPath = '/content/gdrive'
mainDrivePath = '/content/gdrive/MyDrive/Colab Notebooks/PRBX'

In [None]:
### INSTALLS ###

# !pip install --upgrade gspread
# !pip install pandas
# !pip install geopy

In [None]:
### IMPORTS ###

import os
from google.colab import drive
import pandas as pd
import time
import numpy as np
from geopy.distance import geodesic

In [None]:
### MOUNT GOOGLE DRIVE ###

# Mount at 'yourMountPath' when it has been filled in, otherwise at the default mount point
mountPoint = yourMountPath if yourMountPath != '' else mainMountPath
drive.mount(mountPoint, force_remount=True)

# 'mainDrivePath' is the path used from here on, so 'yourDrivePath' replaces it (if that has been filled in)
mainDrivePath = mainDrivePath if yourDrivePath == '' else yourDrivePath

Mounted at /content/gdrive


In [None]:
### FILE PATHS ###

# Every file read or written by this notebook sits in the 'Generated Files' folder
generatedFilesPath = f'{mainDrivePath}/Data Files/Generated Files'

# Input: the trajectories loaded at the start of Step 4
cleanedLabelledTrajectoriesPath = f'{generatedFilesPath}/cleanedLabelledTrajectories.csv'

# Optional intermediate files - uncomment a path together with the matching 'to_csv' / 'read_csv' line
# groupedTrajectoriesPath = generatedFilesPath + '/groupedTrajectoriesX' + str(groupSize) + '.csv'
# distancesDataPath = generatedFilesPath + '/distancesDataX' + str(groupSize) + '.csv'
# motionDataPath = generatedFilesPath + '/motionDataX' + str(groupSize) + '.csv'
# groupedMotionDataPath = generatedFilesPath + '/groupedMotionDataX' + str(groupSize) + '.csv'
# derivedMotionDataPath = generatedFilesPath + '/derivedMotionDataX' + str(groupSize) + '.csv'

# The file name carries the group size, e.g. 'cleanedMotionDataX8.csv'
cleanedMotionDataPath = f'{generatedFilesPath}/cleanedMotionDataX{groupSize}.csv'

### **Step 4: Pre-Processing - Group Trajectories into a DataFrame**

This step will take about 1 minute.

In [None]:
# Load the cleaned, labelled trajectories from the CSV at 'cleanedLabelledTrajectoriesPath'
cltdf = pd.read_csv(cleanedLabelledTrajectoriesPath)
cltdf

Unnamed: 0,UserId,Old Mode,Mode,Timestamp,Latitude,Longitude,Altitude
0,10,walk,walk,1207116566,39.477125,75.989985,1248.4608
1,10,walk,walk,1207116568,39.477083,75.989990,1247.8512
2,10,walk,walk,1207116569,39.477053,75.989980,1247.8512
3,10,walk,walk,1207116570,39.477033,75.989972,1247.8512
4,10,walk,walk,1207116608,39.476977,75.989965,1249.9848
...,...,...,...,...,...,...,...
5377864,179,subway,train,1227925767,40.029529,116.411977,88.6968
5377865,179,subway,train,1227925769,40.029320,116.411975,88.0872
5377866,179,subway,train,1227925771,40.029111,116.411963,83.8200
5377867,179,subway,train,1227925773,40.028904,116.411962,83.5152


In [None]:
# 'Time Difference' is each row's 'Timestamp' minus the 'Timestamp' of the row above it
# (the first row has no row above it, so diff() leaves it as NaN)
cltdf['Time Difference'] = cltdf['Timestamp'].diff()
cltdf

Unnamed: 0,UserId,Old Mode,Mode,Timestamp,Latitude,Longitude,Altitude,Time Difference
0,10,walk,walk,1207116566,39.477125,75.989985,1248.4608,
1,10,walk,walk,1207116568,39.477083,75.989990,1247.8512,2.0
2,10,walk,walk,1207116569,39.477053,75.989980,1247.8512,1.0
3,10,walk,walk,1207116570,39.477033,75.989972,1247.8512,1.0
4,10,walk,walk,1207116608,39.476977,75.989965,1249.9848,38.0
...,...,...,...,...,...,...,...,...
5377864,179,subway,train,1227925767,40.029529,116.411977,88.6968,2.0
5377865,179,subway,train,1227925769,40.029320,116.411975,88.0872,2.0
5377866,179,subway,train,1227925771,40.029111,116.411963,83.8200,2.0
5377867,179,subway,train,1227925773,40.028904,116.411962,83.5152,2.0


In [None]:
# change the first value of 'Time Difference' from 'NaN' to 0
cltdf.loc[0, 'Time Difference'] = 0

# store the column as integers (astype(int) cannot convert NaN, hence the replacement above)
cltdf['Time Difference'] = cltdf['Time Difference'].astype(int)

In [None]:
# Flag, for every row, whether it carries straight on from the row above it:
# a time step in the range (0, 6], the same user, and the same mode
timeDifference = cltdf['Time Difference']
cltdf['Time > 0s'] = timeDifference.gt(0)
cltdf['Time <= 6s'] = timeDifference.le(6)
cltdf['Same UserId'] = cltdf['UserId'].eq(cltdf['UserId'].shift())
cltdf['Same Mode'] = cltdf['Mode'].eq(cltdf['Mode'].shift())

# Preview the flags (change the slice start to inspect a different part of the data)
testingSpecificTrajectories = cltdf.iloc[0:]
testingSpecificTrajectories.head(25)

Unnamed: 0,UserId,Old Mode,Mode,Timestamp,Latitude,Longitude,Altitude,Time Difference,Time > 0s,Time <= 6s,Same UserId,Same Mode
0,10,walk,walk,1207116566,39.477125,75.989985,1248.4608,0,False,True,False,False
1,10,walk,walk,1207116568,39.477083,75.98999,1247.8512,2,True,True,True,True
2,10,walk,walk,1207116569,39.477053,75.98998,1247.8512,1,True,True,True,True
3,10,walk,walk,1207116570,39.477033,75.989972,1247.8512,1,True,True,True,True
4,10,walk,walk,1207116608,39.476977,75.989965,1249.9848,38,True,False,True,True
5,10,walk,walk,1207116610,39.476928,75.989937,1249.9848,2,True,True,True,True
6,10,walk,walk,1207116611,39.476913,75.989937,1249.9848,1,True,True,True,True
7,10,walk,walk,1207116612,39.476905,75.989933,1249.9848,1,True,True,True,True
8,10,walk,walk,1207116613,39.47689,75.989917,1249.9848,1,True,True,True,True
9,10,walk,walk,1207116614,39.476878,75.989903,1249.9848,1,True,True,True,True


In [None]:
def createGroups(groupSize, cltdf):
  """Collect runs of consecutive, unbroken trajectories into fixed-size groups.

  A row continues a run only when all four of its flag columns ('Time > 0s',
  'Time <= 6s', 'Same UserId', 'Same Mode') are truthy. Every time a run reaches
  'groupSize' rows those rows are emitted as one group and a new run begins.
  A row with any falsy flag is skipped, and the unfinished run before it is discarded.

  Args:
    groupSize: number of rows that make up one complete group.
    cltdf: DataFrame holding the eight value columns and four flag columns named below.

  Returns:
    A DataFrame with the eight value columns plus 'GroupId' (numbered from 1),
    containing only the rows that ended up in a complete group.
  """
  value_columns = ['UserId', 'Old Mode', 'Mode', 'Timestamp', 'Latitude', 'Longitude',
                   'Altitude', 'Time Difference']
  flag_columns = ['Time > 0s', 'Time <= 6s', 'Same UserId', 'Same Mode']
  number_of_values = len(value_columns)

  completed_rows = []   # rows belonging to groups that reached 'groupSize'
  pending_rows = []     # rows of the run currently being built
  groupId = 1

  # Walk the rows as plain tuples: the values first, followed by the flags
  for row in zip(*(cltdf[name] for name in value_columns + flag_columns)):
    values, flags = row[:number_of_values], row[number_of_values:]

    # a broken condition ends the current run, and this row is not kept either
    if not all(flags):
      pending_rows = []
      continue

    pending_rows.append(values + (groupId,))

    # once there are enough trajectories in the run, it becomes a finished group
    if len(pending_rows) >= groupSize:
      completed_rows.extend(pending_rows)
      pending_rows = []
      groupId += 1

  return pd.DataFrame(completed_rows, columns=value_columns + ['GroupId'])

In [None]:
# this process will take about 20 seconds
total_start_time = time.time()

# build the grouped DataFrame, with 'groupSize' rows in every group
groupsDf = createGroups(groupSize, cltdf)

# report how long the grouping took
print(f"Total processing time: {time.time() - total_start_time:.2f} seconds")

groupsDf

Total processing time: 22.18 seconds


Unnamed: 0,UserId,Old Mode,Mode,Timestamp,Latitude,Longitude,Altitude,Time Difference,GroupId
0,10,walk,walk,1207116610,39.476928,75.989937,1249.9848,2,1
1,10,walk,walk,1207116611,39.476913,75.989937,1249.9848,1,1
2,10,walk,walk,1207116612,39.476905,75.989933,1249.9848,1,1
3,10,walk,walk,1207116613,39.476890,75.989917,1249.9848,1,1
4,10,walk,walk,1207116614,39.476878,75.989903,1249.9848,1,1
...,...,...,...,...,...,...,...,...,...
4340907,179,subway,train,1227925759,40.030395,116.411975,92.0496,2,542614
4340908,179,subway,train,1227925761,40.030170,116.411990,91.1352,2,542614
4340909,179,subway,train,1227925763,40.029950,116.411985,90.2208,2,542614
4340910,179,subway,train,1227925765,40.029739,116.411980,89.6112,2,542614


In [None]:
# Reorder the columns (leaving out 'Timestamp', which is not needed anymore)
# and give 'Time Difference' its new name, 'Delta Time'
groupsDf = (
    groupsDf[['GroupId', 'Mode', 'Time Difference', 'Latitude', 'Longitude', 'Altitude', 'UserId', 'Old Mode']]
    .rename(columns={'Time Difference': 'Delta Time'})
)

In [None]:
# save groupsDf to the grouped trajectories file path (as a .csv)
# groupsDf.to_csv(groupedTrajectoriesPath, index=False)

### **Step 5: Pre-Processing - Calculate Distance Values for each Group**

This step will take about 15 minutes.

This step involves calculating distance and bearing values between each trajectory.

To find the distance we are using a mathematical function from our geodesic import which will use our latitude and longitude values (and assume altitude is zero) which will return the real world distance between the 2 points with near perfect accuracy.

To find the bearing we will be using another mathematical formula.

In [None]:
# Optional: reload the grouped trajectories from file instead of re-running Step 4
# (needs 'groupedTrajectoriesPath' to be uncommented in the FILE PATHS cell)
# groupsDf = pd.read_csv(groupedTrajectoriesPath)
groupsDf

Unnamed: 0,GroupId,Mode,Delta Time,Latitude,Longitude,Altitude,UserId,Old Mode
0,1,walk,2,39.476928,75.989937,1249.9848,10,walk
1,1,walk,1,39.476913,75.989937,1249.9848,10,walk
2,1,walk,1,39.476905,75.989933,1249.9848,10,walk
3,1,walk,1,39.476890,75.989917,1249.9848,10,walk
4,1,walk,1,39.476878,75.989903,1249.9848,10,walk
...,...,...,...,...,...,...,...,...
4340907,542614,train,2,40.030395,116.411975,92.0496,179,subway
4340908,542614,train,2,40.030170,116.411990,91.1352,179,subway
4340909,542614,train,2,40.029950,116.411985,90.2208,179,subway
4340910,542614,train,2,40.029739,116.411980,89.6112,179,subway


In [None]:
# this process will take about 15 minutes
total_start_time = time.time()

# Convert 'Latitude' and 'Longitude' columns to numeric
# (errors='coerce' turns any value that cannot be parsed into NaN)
groupsDf['Latitude'] = pd.to_numeric(groupsDf['Latitude'], errors='coerce')
groupsDf['Longitude'] = pd.to_numeric(groupsDf['Longitude'], errors='coerce')

# Shift the coordinates down by one row so that every row also holds the position of the
# row before it. The shift ignores group boundaries, so the first row of each group is
# paired with the last row of the group before it.
groupsDf['Previous Latitude'] = groupsDf['Latitude'].shift(1)
groupsDf['Previous Longitude'] = groupsDf['Longitude'].shift(1)

# (unused alternative: fill the missing previous position with 0)
# groupsDf['Previous Latitude'].fillna(0, inplace=True)
# groupsDf['Previous Longitude'].fillna(0, inplace=True)

# The very first row has no previous row, so use its own position (instead of NaN),
# which gives it a distance of 0
groupsDf.loc[groupsDf.index[0], 'Previous Latitude'] = groupsDf.loc[groupsDf.index[0], 'Latitude']
groupsDf.loc[groupsDf.index[0], 'Previous Longitude'] = groupsDf.loc[groupsDf.index[0], 'Longitude']

# Calculate the geodesic distance (in metres) between the current and previous points.
# geodesic() is called once per row through apply(), which is why this cell is slow.
groupsDf['Distance'] = groupsDf.apply(lambda row: geodesic(
    (row['Latitude'], row['Longitude']),
    (row['Previous Latitude'], row['Previous Longitude'])).meters, axis=1)

# (unused alternative: an attempt to compute every distance in a single call)
# groupsDf['Distance'] = geodesic(
#     zip(groupsDf['Latitude'], groupsDf['Longitude']),
#     zip(groupsDf['Previous Latitude'], groupsDf['Previous Longitude'])
# ).meters

# Calculate the bearing (in radians, as returned by arctan2) from the previous point to the current point
groupsDf['Bearing'] = np.arctan2(
    # y component of the bearing calculation
    np.sin(np.radians(groupsDf['Longitude'] - groupsDf['Previous Longitude'])) *
    np.cos(np.radians(groupsDf['Latitude'])),

    # x component of the bearing calculation
    np.cos(np.radians(groupsDf['Previous Latitude'])) *
    np.sin(np.radians(groupsDf['Latitude'])) -
    np.sin(np.radians(groupsDf['Previous Latitude'])) *
    np.cos(np.radians(groupsDf['Latitude'])) *
    np.cos(np.radians(groupsDf['Longitude'] - groupsDf['Previous Longitude']))
)

# report the elapsed time in minutes
time_in_mins = (time.time() - total_start_time) / 60
print(f"Total processing time: {time_in_mins:.2f} minutes")

# Display the updated dataframe
groupsDf


Total processing time: 15.12 minutes


Unnamed: 0,GroupId,Mode,Delta Time,Latitude,Longitude,Altitude,UserId,Old Mode,Previous Latitude,Previous Longitude,Distance,Bearing
0,1,walk,2,39.476928,75.989937,1249.9848,10,walk,39.476928,75.989937,0.000000,0.000000
1,1,walk,1,39.476913,75.989937,1249.9848,10,walk,39.476928,75.989937,1.665369,3.141593
2,1,walk,1,39.476905,75.989933,1249.9848,10,walk,39.476913,75.989937,0.952546,-2.773265
3,1,walk,1,39.476890,75.989917,1249.9848,10,walk,39.476905,75.989933,2.160712,-2.452781
4,1,walk,1,39.476878,75.989903,1249.9848,10,walk,39.476890,75.989917,1.796118,-2.408486
...,...,...,...,...,...,...,...,...,...,...,...,...
4340907,542614,train,2,40.030395,116.411975,92.0496,179,subway,40.030624,116.411946,25.547267,3.044928
4340908,542614,train,2,40.030170,116.411990,91.1352,179,subway,40.030395,116.411975,25.015710,3.090590
4340909,542614,train,2,40.029950,116.411985,90.2208,179,subway,40.030170,116.411990,24.431474,-3.124192
4340910,542614,train,2,40.029739,116.411980,89.6112,179,subway,40.029950,116.411985,23.432315,-3.123450


In [None]:
# Drop the temporary columns used for calculation.
# errors='ignore' lets this cell be re-run after the columns have already been removed.
groupsDf = groupsDf.drop(columns=['Previous Latitude', 'Previous Longitude'], errors='ignore')

In [None]:
# Take an independent copy, so that the in-place edits made to 'distancesDf' in Step 6
# do not also change 'groupsDf'
distancesDf = groupsDf.copy()

In [None]:
# save distancesDf to the distances data file path (as a .csv)
# distancesDf.to_csv(distancesDataPath, index=False)

### **Step 6: Pre-Processing - Calculate Motion Values for each Group**

This step will take just a minute.

This step involves calculating motion values for each group.

To do this we will add values for speed, acceleration, jerk, altitudal velocity, altitudal acceleration, and bearing rate at each of our trajectories.

We should still bear in mind that the first 3 points of each group will later need to be made redundant (as the jerk value is not calculable without 3 prior points).

In [None]:
# 'GroupId' counts up from 1, so the id on the last row is the number of groups
numberOfGroups = distancesDf.iloc[-1]['GroupId']

# This will remove the (often) inaccurate distance value on the first trajectory of each group:
# that distance was measured from the last point of the previous group.
# A row starts a group when its 'GroupId' differs from the row above it (row 0 always does),
# which avoids a Python loop and does not depend on the index labels or on every group
# having exactly 'groupSize' rows.
startsNewGroup = distancesDf['GroupId'] != distancesDf['GroupId'].shift()
distancesDf.loc[startsNewGroup, 'Distance'] = np.nan

In [None]:
# Add speed, acceleration, and jerk values:
# each one is the row-to-row change of the quantity before it, divided by 'Delta Time'
deltaTime = distancesDf['Delta Time']
distancesDf['Speed'] = distancesDf['Distance'].div(deltaTime)
distancesDf['Acceleration'] = distancesDf['Speed'].diff().div(deltaTime)
distancesDf['Jerk'] = distancesDf['Acceleration'].diff().div(deltaTime)
distancesDf.head(25)

# notice that the first 3 trajectories of each group will later need to be made redundant

Unnamed: 0,GroupId,Mode,Delta Time,UserId,Old Mode,Speed,Acceleration,Jerk,Alti Velocity,Alti Accel,Bearing Rate,Distance
0,1,walk,2,10,walk,,,,,,,
1,1,walk,1,10,walk,,,,0.0,,3.141593,
2,1,walk,1,10,walk,,,,0.0,0.0,0.368328,
3,1,walk,1,10,walk,,,,0.0,0.0,0.320484,
4,1,walk,1,10,walk,,,,0.0,0.0,0.044295,
5,1,walk,1,10,walk,,,,0.0,0.0,-0.733107,
6,1,walk,2,10,walk,,,,0.0,0.0,-3.027816,
7,1,walk,2,10,walk,,,,0.0,0.0,0.039154,
8,2,walk,1,10,walk,,,,0.0,0.0,-0.728935,
9,2,walk,1,10,walk,,,,0.0,0.0,-0.026416,


In [None]:
# add altitudal velocity ('Alti Velocity') and altitudal acceleration ('Alti Accel'):
# the row-to-row change in altitude, and then in altitudal velocity, divided by 'Delta Time'
altitudeChange = distancesDf['Altitude'].diff()
distancesDf['Alti Velocity'] = altitudeChange.div(distancesDf['Delta Time'])
distancesDf['Alti Accel'] = distancesDf['Alti Velocity'].diff().div(distancesDf['Delta Time'])

KeyError: 'Altitude'

In [None]:
# add bearing rate (the change in bearing between consecutive trajectories, per unit of 'Delta Time')
bearingChange = distancesDf['Bearing'].diff()

# 'Bearing' comes from arctan2, so it lies within -pi to pi and the raw change lies within -2*pi to 2*pi.
# The change must be wrapped back into -pi to pi BEFORE dividing by the time step: wrapping the
# already-divided rate misses every wrap-around whose 'Delta Time' is above 1, which turns a small
# turn across the +/-pi boundary into a rate of about pi.

# Add 2*pi (360 degrees) when the bearing change is below -pi (-180 degrees)
bearingChange = bearingChange.where(bearingChange >= -np.pi, bearingChange + 2 * np.pi)

# Subtract 2*pi (360 degrees) when the bearing change is above pi (180 degrees)
bearingChange = bearingChange.where(bearingChange <= np.pi, bearingChange - 2 * np.pi)

# 'Delta Time' is a whole number above 0 for every grouped row, so the rate also stays within -pi to pi
distancesDf['Bearing Rate'] = bearingChange / distancesDf['Delta Time']

In [None]:
# NOTE: this does not copy anything - 'motionDf' is a second name for the same DataFrame object
# as 'distancesDf', so an in-place change made through one name is visible through the other
motionDf = distancesDf

In [None]:
# we can now get rid of values for latitude, longitude, altitude, distance, and bearing.
# drop() returns a new DataFrame here instead of modifying in place, so 'distancesDf' keeps
# these columns and the earlier cells that read them (e.g. 'Altitude') can still be re-run.
motionDf = distancesDf.drop(columns=['Latitude', 'Longitude', 'Altitude', 'Distance', 'Bearing'])
motionDf

In [None]:
# save motionDf to the motion data file path (as a .csv)
# motionDf.to_csv(motionDataPath, index=False)

### **Step 7: Pre-Processing - Converting Groups of Trajectories Into Single Records of a DataFrame**

This step will take just a minute.

In [None]:
# Optional: reload the motion data from file instead of re-running Steps 4 to 6
# (needs 'motionDataPath' to be uncommented in the FILE PATHS cell)
# motionDf = pd.read_csv(motionDataPath)

# keep only the identifying columns and the motion values, in this order
motionDf = motionDf[['GroupId', 'UserId', 'Mode', 'Old Mode', 'Delta Time',
                     'Speed', 'Acceleration', 'Jerk', 'Alti Velocity','Alti Accel', 'Bearing Rate']]
motionDf

Unnamed: 0,GroupId,UserId,Mode,Old Mode,Delta Time,Speed,Acceleration,Jerk,Alti Velocity,Alti Accel,Bearing Rate
0,1,10,walk,walk,2,,,,,,
1,1,10,walk,walk,1,1.665369,,,0.0000,,3.141593
2,1,10,walk,walk,1,0.952546,-0.712822,,0.0000,0.0000,0.368328
3,1,10,walk,walk,1,2.160712,1.208165,1.920988,0.0000,0.0000,0.320484
4,1,10,walk,walk,1,1.796118,-0.364594,-1.572759,0.0000,0.0000,0.044295
...,...,...,...,...,...,...,...,...,...,...,...
4340907,542614,179,train,subway,2,12.773633,0.154231,0.031022,-0.3048,0.0762,0.009776
4340908,542614,179,train,subway,2,12.507855,-0.132889,-0.143560,-0.4572,-0.0762,0.022831
4340909,542614,179,train,subway,2,12.215737,-0.146059,-0.006585,-0.4572,0.0000,-3.107391
4340910,542614,179,train,subway,2,11.716158,-0.249790,-0.051865,-0.3048,0.0762,0.000371


In [None]:
# Start timing the entire process
total_start_time = time.time()

# Remove the first 3 rows of every block of 'groupSize' rows (their motion values rely on points
# from before the group began). The rows are picked by position with a single modulo, so this
# works for any index labels, for an empty frame, and when the last block is incomplete.
row_position_in_group = np.arange(len(motionDf)) % groupSize
indices_to_remove = np.flatnonzero(row_position_in_group < 3)
motionDf_cleaned = motionDf.iloc[row_position_in_group >= 3].reset_index(drop=True)

# Create a group index that counts each row within its group
motionDf_cleaned['group_index'] = motionDf_cleaned.groupby('GroupId').cumcount()

# Filter to ensure each group has at least 'newGroupSize' entries
filtered_df = motionDf_cleaned.groupby('GroupId').filter(lambda x: len(x) >= newGroupSize)

# Separate numeric data for reshaping
numeric_columns = ['Delta Time', 'Speed', 'Acceleration', 'Jerk', 'Alti Velocity', 'Alti Accel', 'Bearing Rate']
numeric_data = filtered_df.set_index(['GroupId', 'group_index'])[numeric_columns]

# Pivot the numeric data to a wide format: one row per group and one column per
# (measurement, position in group) pair, named e.g. 'Speed0', 'Speed1', ...
reshaped_data = numeric_data.unstack(fill_value=0)
reshaped_data.columns = [f'{col[0]}{col[1]}' for col in reshaped_data.columns]

# Include non-numeric data by merging it back (taken from the first row of each group)
group_info = motionDf[['GroupId', 'UserId', 'Mode', 'Old Mode']].drop_duplicates('GroupId').set_index('GroupId')
gmdf = pd.concat([group_info, reshaped_data], axis=1).reset_index()

print(f"Total reshaping time: {time.time() - total_start_time:.2f} seconds")
gmdf.head()

Total reshaping time: 29.70 seconds


Unnamed: 0,GroupId,UserId,Mode,Old Mode,Delta Time0,Delta Time1,Delta Time2,Delta Time3,Delta Time4,Speed0,...,Alti Accel0,Alti Accel1,Alti Accel2,Alti Accel3,Alti Accel4,Bearing Rate0,Bearing Rate1,Bearing Rate2,Bearing Rate3,Bearing Rate4
0,1,10,walk,walk,1,1,1,2,2,2.160712,...,0.0,0.0,0.0,0.0,0.0,0.320484,0.044295,-0.733107,-3.027816,0.039154
1,2,10,walk,walk,1,1,3,2,1,0.999221,...,0.0,0.0,0.0,0.0,-0.9144,0.390527,0.281756,1.829966,-0.154315,-0.470319
2,3,10,walk,walk,1,1,1,1,1,0.924945,...,0.3048,-0.3048,0.0,0.0,0.6096,0.086573,-0.151963,-0.119474,0.362565,0.343796
3,4,10,walk,walk,1,1,1,1,1,0.581184,...,0.0,0.0,0.0,-0.6096,0.3048,0.068861,0.047636,-0.547945,-0.090388,0.055967
4,5,10,walk,walk,1,1,1,1,1,1.023958,...,0.3048,0.9144,-0.3048,-0.3048,1.524,-0.161796,-0.16715,0.39563,0.21432,0.017377


In [None]:
# Short codes that stand in for the full measurement names in the column headers
abbreviations = {
    'Delta Time': 'DT',
    'Speed': 'S',
    'Acceleration': 'A',
    'Jerk': 'J',
    'Alti Velocity': 'AV',
    'Alti Accel': 'AA',
    'Bearing Rate': 'BR',
}

# Drop any numbered copies of the non-numeric columns that the reshaping may have produced
# (errors='ignore' because they are not necessarily present)
columns_to_remove = []
for name in ['Mode', 'UserId', 'Old Mode']:
    columns_to_remove.extend(f'{name}{i}' for i in range(newGroupSize))
gmdf.drop(columns=columns_to_remove, inplace=True, errors='ignore')

# Swap every full measurement name found in a column header for its abbreviation
renamed_columns = []
for col in gmdf.columns:
    for full_name, abbr in abbreviations.items():
        col = col.replace(full_name, abbr)
    renamed_columns.append(col)
gmdf.columns = renamed_columns

# Confirm the changes
gmdf

Unnamed: 0,GroupId,UserId,Mode,Old Mode,DT0,DT1,DT2,DT3,DT4,S0,...,AA0,AA1,AA2,AA3,AA4,BR0,BR1,BR2,BR3,BR4
0,1,10,walk,walk,1,1,1,2,2,2.160712,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.320484,0.044295,-0.733107,-3.027816,0.039154
1,2,10,walk,walk,1,1,3,2,1,0.999221,...,0.0000,0.0000,0.0000,0.0000,-0.9144,0.390527,0.281756,1.829966,-0.154315,-0.470319
2,3,10,walk,walk,1,1,1,1,1,0.924945,...,0.3048,-0.3048,0.0000,0.0000,0.6096,0.086573,-0.151963,-0.119474,0.362565,0.343796
3,4,10,walk,walk,1,1,1,1,1,0.581184,...,0.0000,0.0000,0.0000,-0.6096,0.3048,0.068861,0.047636,-0.547945,-0.090388,0.055967
4,5,10,walk,walk,1,1,1,1,1,1.023958,...,0.3048,0.9144,-0.3048,-0.3048,1.5240,-0.161796,-0.167150,0.395630,0.214320,0.017377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542609,542610,179,train,subway,2,2,2,2,2,16.219073,...,-0.2286,0.0000,-0.1524,0.0000,-0.3048,0.010296,0.013752,0.009288,-0.018712,0.010066
542610,542611,179,train,subway,2,2,2,2,2,16.990752,...,-0.2286,-0.0762,-0.0762,0.1524,-0.1524,-0.002986,-0.027964,-0.040122,-0.029809,-0.028052
542611,542612,179,train,subway,2,2,2,2,2,14.432845,...,0.0762,0.0762,-0.0762,0.0000,-0.0762,-0.000193,0.016108,0.022229,0.018219,0.022398
542612,542613,179,train,subway,2,2,2,2,2,14.441516,...,0.0000,0.0000,0.0000,-0.2286,0.1524,-0.000223,-0.002537,-0.003111,0.022414,0.023369


In [None]:
# Reorder the columns: the four identifying columns first, then every other column in its existing order
top_columns = ['GroupId', 'UserId', 'Mode', 'Old Mode']
ordered_columns = top_columns + [col for col in gmdf.columns if col not in top_columns]
gmdf = gmdf[ordered_columns]

# print the full column list (the table preview abbreviates the middle columns with '...')
columns_gmdf = gmdf.columns.tolist()
print(columns_gmdf)

['GroupId', 'UserId', 'Mode', 'Old Mode', 'DT0', 'DT1', 'DT2', 'DT3', 'DT4', 'S0', 'S1', 'S2', 'S3', 'S4', 'A0', 'A1', 'A2', 'A3', 'A4', 'J0', 'J1', 'J2', 'J3', 'J4', 'AV0', 'AV1', 'AV2', 'AV3', 'AV4', 'AA0', 'AA1', 'AA2', 'AA3', 'AA4', 'BR0', 'BR1', 'BR2', 'BR3', 'BR4']


In [None]:
# save gmdf to the grouped motion file path (as a .csv)
# gmdf.to_csv(groupedMotionDataPath, index=False)

### **Step 8: Pre-Processing - Add Values for Mean, Median, Range and More**

This step will take about 60 minutes with a 'groupSize' of 16 and considerably less with higher group sizes.

For this I will be adding derived values from my data (Delta Time, Speed, Acceleration, Jerk, Altitudal Velocity, Altitudal Acceleration, and Bearing Rate), those derived values being:


*   Minimum (0% Quartile)
*   25% Quartile
*   Mean
*   Median (50% Quartile)
*   75% Quartile
*   Maximum (100% Quartile)
*   Range
*   Inter-Quartile Range
*   Range Counts (will explain later)

Also at the end of this notebook I show that the data with mode 'airplane' may need to be made redundant and that there are some impossible outliers still within the data.

In [None]:
# Optional: reload the grouped motion data from file instead of re-running Steps 4 to 7
# (needs 'groupedMotionDataPath' to be uncommented in the FILE PATHS cell)
# gmdf = pd.read_csv(groupedMotionDataPath)
gmdf

Unnamed: 0,GroupId,UserId,Mode,Old Mode,DT0,DT1,DT2,DT3,DT4,S0,...,AA0,AA1,AA2,AA3,AA4,BR0,BR1,BR2,BR3,BR4
0,1,10,walk,walk,1,1,1,2,2,2.160712,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.320484,0.044295,-0.733107,-3.027816,0.039154
1,2,10,walk,walk,1,1,3,2,1,0.999221,...,0.0000,0.0000,0.0000,0.0000,-0.9144,0.390527,0.281756,1.829966,-0.154315,-0.470319
2,3,10,walk,walk,1,1,1,1,1,0.924945,...,0.3048,-0.3048,0.0000,0.0000,0.6096,0.086573,-0.151963,-0.119474,0.362565,0.343796
3,4,10,walk,walk,1,1,1,1,1,0.581184,...,0.0000,0.0000,0.0000,-0.6096,0.3048,0.068861,0.047636,-0.547945,-0.090388,0.055967
4,5,10,walk,walk,1,1,1,1,1,1.023958,...,0.3048,0.9144,-0.3048,-0.3048,1.5240,-0.161796,-0.167150,0.395630,0.214320,0.017377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542609,542610,179,train,subway,2,2,2,2,2,16.219073,...,-0.2286,0.0000,-0.1524,0.0000,-0.3048,0.010296,0.013752,0.009288,-0.018712,0.010066
542610,542611,179,train,subway,2,2,2,2,2,16.990752,...,-0.2286,-0.0762,-0.0762,0.1524,-0.1524,-0.002986,-0.027964,-0.040122,-0.029809,-0.028052
542611,542612,179,train,subway,2,2,2,2,2,14.432845,...,0.0762,0.0762,-0.0762,0.0000,-0.0762,-0.000193,0.016108,0.022229,0.018219,0.022398
542612,542613,179,train,subway,2,2,2,2,2,14.441516,...,0.0000,0.0000,0.0000,-0.2286,0.1524,-0.000223,-0.002537,-0.003111,0.022414,0.023369


In [None]:
# list every column name of 'gmdf' (the table preview abbreviates the middle columns with '...')
columns_gmdf = gmdf.columns.tolist()
print(columns_gmdf)

['GroupId', 'UserId', 'Mode', 'Old Mode', 'DT0', 'DT1', 'DT2', 'DT3', 'DT4', 'S0', 'S1', 'S2', 'S3', 'S4', 'A0', 'A1', 'A2', 'A3', 'A4', 'J0', 'J1', 'J2', 'J3', 'J4', 'AV0', 'AV1', 'AV2', 'AV3', 'AV4', 'AA0', 'AA1', 'AA2', 'AA3', 'AA4', 'BR0', 'BR1', 'BR2', 'BR3', 'BR4']


In [None]:
# The measurements (by abbreviation) that get per-group summary statistics
columns_to_derive = ['DT', 'S', 'A', 'J', 'AV', 'AA', 'BR']

for column in columns_to_derive:
    # the per-point columns of this measurement, e.g. 'S0' ... 'S4'
    col_indices = [f'{column}{i}' for i in range(newGroupSize)]
    group_values = gmdf.loc[:, col_indices]

    lowest = group_values.min(axis=1)
    lower_quartile = group_values.quantile(0.25, axis=1)
    upper_quartile = group_values.quantile(0.75, axis=1)
    highest = group_values.max(axis=1)

    # listed in the order in which the new columns are appended to the DataFrame
    derived = {
        'Min': lowest,
        '25% Q': lower_quartile,
        'Mean': group_values.mean(axis=1),
        'Median': group_values.median(axis=1),
        '75% Q': upper_quartile,
        'Max': highest,
        'Range': highest - lowest,
        'IQR': upper_quartile - lower_quartile,
    }

    # .loc keeps the assignment directly on the DataFrame
    for suffix, series in derived.items():
        gmdf.loc[:, f'{column} {suffix}'] = series


# Display the dataframe with derived values
gmdf

Unnamed: 0,GroupId,UserId,Mode,Old Mode,DT0,DT1,DT2,DT3,DT4,S0,...,AA Range,AA IQR,BR Min,BR 25% Q,BR Mean,BR Median,BR 75% Q,BR Max,BR Range,BR IQR
0,1,10,walk,walk,1,1,1,2,2,2.160712,...,0.0000,0.0000,-3.027816,-0.733107,-0.671398,0.039154,0.044295,0.320484,3.348300,0.777402
1,2,10,walk,walk,1,1,3,2,1,0.999221,...,0.9144,0.0000,-0.470319,-0.154315,0.375523,0.281756,0.390527,1.829966,2.300286,0.544842
2,3,10,walk,walk,1,1,1,1,1,0.924945,...,0.9144,0.3048,-0.151963,-0.119474,0.104299,0.086573,0.343796,0.362565,0.514529,0.463269
3,4,10,walk,walk,1,1,1,1,1,0.581184,...,0.9144,0.0000,-0.547945,-0.090388,-0.093174,0.047636,0.055967,0.068861,0.616805,0.146354
4,5,10,walk,walk,1,1,1,1,1,1.023958,...,1.8288,1.2192,-0.167150,-0.161796,0.059676,0.017377,0.214320,0.395630,0.562780,0.376116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542609,542610,179,train,subway,2,2,2,2,2,16.219073,...,0.3048,0.2286,-0.018712,0.009288,0.004938,0.010066,0.010296,0.013752,0.032464,0.001008
542610,542611,179,train,subway,2,2,2,2,2,16.990752,...,0.3810,0.0762,-0.040122,-0.029809,-0.025786,-0.028052,-0.027964,-0.002986,0.037136,0.001845
542611,542612,179,train,subway,2,2,2,2,2,14.432845,...,0.1524,0.1524,-0.000193,0.016108,0.015753,0.018219,0.022229,0.022398,0.022591,0.006121
542612,542613,179,train,subway,2,2,2,2,2,14.441516,...,0.3810,0.0000,-0.003111,-0.002537,0.007982,-0.000223,0.022414,0.023369,0.026480,0.024950


In [None]:
# The only Delta Time columns that stay in the DataFrame
dt_keep_columns = ['DT Min', 'DT Max', 'DT Mean', 'DT Median']

# Every other column whose name starts with 'DT' (the per-point values and the
# remaining derived statistics) is dropped
dt_all_columns = [name for name in gmdf.columns
                  if name.startswith('DT') and name not in dt_keep_columns]
dt_drop_columns = dt_all_columns.copy()

gmdf = gmdf.drop(columns=dt_drop_columns)

print("removing columns:", dt_drop_columns)

removing columns: ['DT0', 'DT1', 'DT2', 'DT3', 'DT4', 'DT 25% Q', 'DT 75% Q', 'DT Range', 'DT IQR']


In [None]:
def getRangeCount(row, value_range):
    """Count how many values of 'row' fall inside 'value_range'.

    'value_range' is a (low, high) pair; 'low' is included and 'high' is excluded.
    """
    lower_bound, upper_bound = value_range
    at_or_above_low = row >= lower_bound
    below_high = row < upper_bound
    return (at_or_above_low & below_high).sum()

In [None]:
def getBearingRateRangeCount(row, value_range):
    """Count the entries of `row` whose absolute value lies in [low, high).

    `value_range` is a (low, high) pair; the sign of each entry is ignored, so
    -3.0 and 3.0 are counted in the same range.
    """
    lower_bound, upper_bound = value_range
    magnitude = abs(row)
    return ((magnitude >= lower_bound) & (magnitude < upper_bound)).sum()

In [None]:
# Range-count bins for each motion variable, stored as (min, max) tuples.
# Each list is built from consecutive bin edges, which yields 7 contiguous bins.

def _bins_from_edges(edges):
    """Pair up consecutive edges into a list of (min, max) tuples."""
    return list(zip(edges[:-1], edges[1:]))

speed_ranges = _bins_from_edges([0.0, 0.25, 1.0, 2.5, 5.0, 10.0, 20.0, 999.9])
acceleration_ranges = _bins_from_edges([-999.9, -5.0, -2.5, -0.5, 0.5, 2.0, 4.0, 999.9])
jerk_ranges = _bins_from_edges([-999.9, -5.0, -2.5, -0.5, 0.5, 2.0, 4.0, 999.9])
alti_velocity_ranges = _bins_from_edges([-999.9, -2.0, -1.0, -0.25, 0.25, 1.0, 3.0, 999.9])
alti_accel_ranges = _bins_from_edges([-999.9, -2.0, -0.75, -0.25, 0.25, 0.5, 1.5, 999.9])

# Bearing rates (BRs) are rotational: a BR of -3.0 is close to 3.0
# (all BRs fall within -3.142 to 3.142), so these bins are defined on the
# absolute value of the bearing rate (0.0 to 3.142).
bearing_rate_ranges = _bins_from_edges([0.0, 0.125, 0.25, 0.5, 1.0, 1.75, 2.5, 3.142])

In [None]:
# Add the range counts (RCs): for every group (row), count how many of its
# values fall into each bin defined for that motion variable.
#
# The counts are computed with vectorised comparisons over the whole column
# block followed by a row-wise sum, instead of calling a Python function once
# per row via DataFrame.apply(axis=1). The resulting columns are the same,
# but the work is done in bulk rather than row by row.

# Start timing this process
total_start_time = time.time()

# One entry per motion variable, listed in the same order as their column blocks in gmdf:
# (column prefix, list of (low, high) bins, count on absolute values?, label for the progress message)
range_count_specs = [
    ('S',  speed_ranges,         False, 'finished speed'),
    ('A',  acceleration_ranges,  False, 'finished acceleration'),
    ('J',  jerk_ranges,          False, 'finished jerk'),
    ('AV', alti_velocity_ranges, False, 'altitudinal velocity'),
    ('AA', alti_accel_ranges,    False, 'altitudinal acceleration'),
    # bearing rates are counted on their absolute value (see bearing_rate_ranges)
    ('BR', bearing_rate_ranges,  True,  'Bearing Rate'),
]

# Position of the first motion column; each variable then occupies newGroupSize consecutive columns
first_motion_col = 4

for block_index, (prefix, ranges, use_abs, label) in enumerate(range_count_specs):
    block_start = first_motion_col + block_index * newGroupSize
    # Slice the block once per variable; the new RC columns are appended at the
    # end of gmdf, so these positions are not shifted by earlier assignments
    values = gmdf.iloc[:, block_start : block_start + newGroupSize]
    if use_abs:
        values = values.abs()

    for bin_index, (low, high) in enumerate(ranges):
        # low is inclusive, high is exclusive; sum(axis=1) counts the hits per row
        gmdf[f'{prefix} RC{bin_index}'] = ((values >= low) & (values < high)).sum(axis=1)

    print(f"Current processing time ({label}): {time.time() - total_start_time:.2f} seconds")

time_in_mins = (time.time() - total_start_time) / 60
print(f"Total processing time: {time_in_mins:.2f} minutes")

Current processing time (finished speed): 1585.96 seconds
Current processing time (finished acceleration): 3152.94 seconds
Current processing time (finished jerk): 4741.45 seconds
Current processing time (altitudinal velocity): 6324.71 seconds
Current processing time (altitudinal acceleration): 7902.90 seconds
Current processing time (Bearing Rate): 9839.01 seconds
Total processing time: 163.98 minutes


In [None]:
# dmdf stands for Derived Motion DataFrame
# NOTE: plain assignment - dmdf is another name for the same object as gmdf, not a copy
dmdf = gmdf

# save dmdf to the derived motion file path (as a .csv)
# dmdf.to_csv(derivedMotionDataPath, index=False)

### **Step 9: Pre-Processing - Remove Groups Containing Impossible Trajectories**

This step will take just a minute.

This step will flag data that may not be suitable for machine learning, namely:

'Impossible' groups - will be 'True' where groups contain trajectories that are likely to have been miscollected because they're not realistic

'Stationary' groups - will be 'True' where all trajectories in a group have no (or near zero) speed

It's also worth noting that we might want to exclude the data for 'airplane' and 'boat' in future, as both modes have very little data. Additionally, the data for 'airplane' does not appear to show an airplane in flight.

In [None]:
# dmdf = pd.read_csv(derivedMotionDataPath)

# Start with both quality flags cleared for every group
for flag_column in ('Impossible', 'Stationary'):
    dmdf[flag_column] = False

dmdf

Unnamed: 0,GroupId,UserId,Mode,Old Mode,S0,S1,S2,S3,S4,A0,...,AA RC6,BR RC0,BR RC1,BR RC2,BR RC3,BR RC4,BR RC5,BR RC6,Impossible,Stationary
0,1,10,walk,walk,2.160712,1.796118,3.330738,1.139857,3.843534,1.208165,...,0,2,0,1,1,0,0,1,False,False
1,2,10,walk,walk,0.999221,0.924945,0.934597,1.468060,0.805586,-0.802794,...,0,0,1,3,0,0,1,0,False,False
2,3,10,walk,walk,0.924945,0.612202,0.653157,0.452357,0.561752,-1.218286,...,0,2,1,2,0,0,0,0,False,False
3,4,10,walk,walk,0.581184,0.344008,0.478947,0.884291,1.159450,0.104910,...,0,4,0,0,1,0,0,0,False,False
4,5,10,walk,walk,1.023958,1.404956,1.236617,1.306314,1.680328,-0.135492,...,1,1,3,1,0,0,0,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542609,542610,179,train,subway,16.219073,17.377757,17.833244,16.757143,15.631111,0.278373,...,0,5,0,0,0,0,0,0,False,False
542610,542611,179,train,subway,16.990752,16.984657,17.896499,17.651621,17.014180,0.259203,...,0,5,0,0,0,0,0,0,False,False
542611,542612,179,train,subway,14.432845,14.213134,13.944071,14.341203,14.242879,-0.246348,...,0,5,0,0,0,0,0,0,False,False
542612,542613,179,train,subway,14.441516,14.116439,13.737667,13.778328,13.450009,-0.311967,...,0,5,0,0,0,0,0,0,False,False


In [None]:
# Flag walking groups as 'Impossible' when their max speed is greater than 12 m/s
walk_too_fast = (dmdf['Mode'] == 'walk') & (dmdf['S Max'] > 12.0)
dmdf.loc[walk_too_fast, 'Impossible'] = True

In [None]:
# Flag car groups as 'Impossible' when their max speed is greater than 90.0 m/s
car_too_fast = (dmdf['Mode'] == 'car') & (dmdf['S Max'] > 90.0)
dmdf.loc[car_too_fast, 'Impossible'] = True

In [None]:
# Flag bus groups as 'Impossible' when the max speed exceeds 90, or when the
# acceleration goes both above 15 and below -15 within the same group
bus_speed_exceeded = dmdf['S Max'] > 90
bus_accel_exceeded = (dmdf['A Max'] > 15) & (dmdf['A Min'] < -15)
bus_impossible = (dmdf['Mode'] == 'bus') & (bus_speed_exceeded | bus_accel_exceeded)
dmdf.loc[bus_impossible, 'Impossible'] = True

In [None]:
# Flag train groups as 'Impossible' when the max speed exceeds 150, or when the
# acceleration goes both above 3 and below -3 within the same group
train_speed_exceeded = dmdf['S Max'] > 150
train_accel_exceeded = (dmdf['A Max'] > 3) & (dmdf['A Min'] < -3)
train_impossible = (dmdf['Mode'] == 'train') & (train_speed_exceeded | train_accel_exceeded)
dmdf.loc[train_impossible, 'Impossible'] = True

In [None]:
# Flag bike groups as 'Impossible' when the max speed exceeds 30, or when the
# acceleration goes both above 10 and below -10 within the same group
bike_speed_exceeded = dmdf['S Max'] > 30
bike_accel_exceeded = (dmdf['A Max'] > 10) & (dmdf['A Min'] < -10)
bike_impossible = (dmdf['Mode'] == 'bike') & (bike_speed_exceeded | bike_accel_exceeded)
dmdf.loc[bike_impossible, 'Impossible'] = True

In [None]:
# Flag a group as 'Stationary' when mean speed x max delta time x group size is less than 3
movement_score = dmdf['S Mean'] * dmdf['DT Max'] * newGroupSize
dmdf.loc[movement_score < 3, 'Stationary'] = True

In [None]:
# cmdf stands for Cleaned Motion DataFrame
# NOTE: plain assignment - cmdf is another name for the same object as dmdf, not a copy
cmdf = dmdf

In [None]:
# save cmdf to the cleaned motion file path (as a .csv)
# NOTE(review): unlike the commented-out dmdf export, this call does not pass index=False,
# so the row index is written as an extra unnamed first column - confirm this is intended
cmdf.to_csv(cleanedMotionDataPath)

Here is some information about our final dataframe

In [None]:
# Same function as used in 'Step 3'
def count_unique_modes(dataframe):
    """Print a per-mode summary of `dataframe` and return the counts.

    Prints the distinct values of the 'Mode' column, one line per mode with
    its number of rows, and the overall total. Returns the result of
    `value_counts()` on the 'Mode' column.

    Same function as used in 'Step 3'.
    """
    mode_column = dataframe['Mode']

    # Distinct modes, in order of first appearance
    distinct_modes = mode_column.unique()
    print("Unique Modes:", distinct_modes, "\n")

    # Rows per mode
    per_mode_counts = mode_column.value_counts()

    # Report each mode next to its count (0 if the mode has no counted rows)
    for label in distinct_modes:
        print("Mode:", label, "        - Count:", per_mode_counts.get(label, 0))

    print("\nTotal count:", per_mode_counts.sum())

    return per_mode_counts

In [None]:
# Mode counts across every group in cmdf
cmdf_total_counts = count_unique_modes(cmdf)

Unique Modes: ['walk' 'car' 'bus' 'train' 'airplane' 'bike' 'boat'] 

Mode: walk         - Count: 147746
Mode: car         - Count: 76960
Mode: bus         - Count: 137768
Mode: train         - Count: 92570
Mode: airplane         - Count: 468
Mode: bike         - Count: 86684
Mode: boat         - Count: 418

Total count: 542614


In [None]:
# Mode counts for the groups flagged as 'Impossible'
cmdf_impossible_counts = count_unique_modes(cmdf.loc[cmdf['Impossible']])

Unique Modes: ['bus' 'train' 'walk' 'bike' 'car'] 

Mode: bus         - Count: 71
Mode: train         - Count: 1136
Mode: walk         - Count: 2421
Mode: bike         - Count: 432
Mode: car         - Count: 4

Total count: 4064


In [None]:
# Mode counts for the groups flagged as 'Stationary'
cmdf_stationary_counts = count_unique_modes(cmdf.loc[cmdf['Stationary']])

Unique Modes: ['walk' 'bus' 'car' 'train' 'airplane' 'bike' 'boat'] 

Mode: walk         - Count: 13295
Mode: bus         - Count: 9430
Mode: car         - Count: 3037
Mode: train         - Count: 1458
Mode: airplane         - Count: 63
Mode: bike         - Count: 3552
Mode: boat         - Count: 3

Total count: 30838


In [None]:
# Mode counts for the 'airplane' and 'boat' groups only
cmdf_airplane_and_boat_counts = count_unique_modes(cmdf[cmdf['Mode'].isin(['airplane', 'boat'])])

Unique Modes: ['airplane' 'boat'] 

Mode: airplane         - Count: 468
Mode: boat         - Count: 418

Total count: 886


In [None]:
# A group is kept as "clean" only if it is neither flagged ('Impossible' or
# 'Stationary') nor labelled with the 'airplane' or 'boat' mode
flagged_groups = cmdf['Impossible'] | cmdf['Stationary']
airplane_or_boat = cmdf['Mode'].isin(['airplane', 'boat'])
cmdf_clean_groups = cmdf[~(flagged_groups | airplane_or_boat)]

cmdf_cleaned_counts = count_unique_modes(cmdf_clean_groups)

Unique Modes: ['walk' 'car' 'bus' 'train' 'bike'] 

Mode: walk         - Count: 132030
Mode: car         - Count: 73919
Mode: bus         - Count: 128267
Mode: train         - Count: 89976
Mode: bike         - Count: 82700

Total count: 506892


In [None]:
# Number of groups excluded from cmdf_clean_groups
unclean_groups_count = len(cmdf) - len(cmdf_clean_groups)
print("Total count of unclean groups:", unclean_groups_count)

# Share of groups that remain in cmdf_clean_groups. This is the *clean*
# percentage (100% minus the unclean share), so the variable is named accordingly.
percentage_clean = (1 - unclean_groups_count / len(cmdf)) * 100
print("Percentage of groups that only hold clean trajectories:", percentage_clean, "%")

Total count of unclean groups: 35722
Percentage of groups that only hold clean trajectories: 93.41668294588787 %


In [None]:
# Groups whose maximum delta time is exactly 1
dt_max_is_one = cmdf['DT Max'] == 1
cmdf_dt_max_one = cmdf[dt_max_is_one]

cmdf_dt_max_one_counts = count_unique_modes(cmdf_dt_max_one)
percent_dt_max_one = len(cmdf_dt_max_one) / len(cmdf) * 100
print("\nPercentage of groups where all trajectories are 1 second apart:", percent_dt_max_one, "%")

Unique Modes: ['walk' 'car' 'bus' 'train' 'airplane' 'bike'] 

Mode: walk         - Count: 17457
Mode: car         - Count: 9174
Mode: bus         - Count: 13670
Mode: train         - Count: 44574
Mode: airplane         - Count: 73
Mode: bike         - Count: 17318

Total count: 102266

Percentage of groups where all trajectories are 1 second apart: 18.846915118297723 %


In [None]:
# "Squeaky clean" groups: not flagged, not 'airplane'/'boat', and with a maximum delta time of 1
unclean_mask = (
    cmdf['Impossible']
    | cmdf['Stationary']
    | cmdf['Mode'].isin(['airplane', 'boat'])
)
dt_max_one_mask = cmdf['DT Max'] == 1
squeaky_clean_groups = cmdf[~unclean_mask & dt_max_one_mask]

squeaky_clean_counts = count_unique_modes(squeaky_clean_groups)
percent_squeaky_clean = len(squeaky_clean_groups) / len(cmdf) * 100
print("\nPercentage of groups that only hold clean trajectories that are all 1 second apart:", percent_squeaky_clean, "%")

Unique Modes: ['walk' 'car' 'bus' 'train' 'bike'] 

Mode: walk         - Count: 16220
Mode: car         - Count: 8634
Mode: bus         - Count: 13167
Mode: train         - Count: 42938
Mode: bike         - Count: 16857

Total count: 97816

Percentage of groups that only hold clean trajectories that are all 1 second apart: 18.02681095585444 %


In [None]:
columns_cmdf = list(cmdf.columns)

# Print the column names 10 per line
columns_per_line = 10
for start in range(0, len(columns_cmdf), columns_per_line):
    print(columns_cmdf[start:start + columns_per_line])

['GroupId', 'UserId', 'Mode', 'Old Mode', 'S0', 'S1', 'S2', 'S3', 'S4', 'A0']
['A1', 'A2', 'A3', 'A4', 'J0', 'J1', 'J2', 'J3', 'J4', 'AV0']
['AV1', 'AV2', 'AV3', 'AV4', 'AA0', 'AA1', 'AA2', 'AA3', 'AA4', 'BR0']
['BR1', 'BR2', 'BR3', 'BR4', 'DT Min', 'DT Mean', 'DT Median', 'DT Max', 'S Min', 'S 25% Q']
['S Mean', 'S Median', 'S 75% Q', 'S Max', 'S Range', 'S IQR', 'A Min', 'A 25% Q', 'A Mean', 'A Median']
['A 75% Q', 'A Max', 'A Range', 'A IQR', 'J Min', 'J 25% Q', 'J Mean', 'J Median', 'J 75% Q', 'J Max']
['J Range', 'J IQR', 'AV Min', 'AV 25% Q', 'AV Mean', 'AV Median', 'AV 75% Q', 'AV Max', 'AV Range', 'AV IQR']
['AA Min', 'AA 25% Q', 'AA Mean', 'AA Median', 'AA 75% Q', 'AA Max', 'AA Range', 'AA IQR', 'BR Min', 'BR 25% Q']
['BR Mean', 'BR Median', 'BR 75% Q', 'BR Max', 'BR Range', 'BR IQR', 'S RC0', 'S RC1', 'S RC2', 'S RC3']
['S RC4', 'S RC5', 'S RC6', 'A RC0', 'A RC1', 'A RC2', 'A RC3', 'A RC4', 'A RC5', 'A RC6']
['J RC0', 'J RC1', 'J RC2', 'J RC3', 'J RC4', 'J RC5', 'J RC6', 'AV 

In [None]:
# Display the final Cleaned Motion DataFrame
cmdf

Unnamed: 0,GroupId,UserId,Mode,Old Mode,S0,S1,S2,S3,S4,A0,...,AA RC6,BR RC0,BR RC1,BR RC2,BR RC3,BR RC4,BR RC5,BR RC6,Impossible,Stationary
0,1,10,walk,walk,2.160712,1.796118,3.330738,1.139857,3.843534,1.208165,...,0,2,0,1,1,0,0,1,False,False
1,2,10,walk,walk,0.999221,0.924945,0.934597,1.468060,0.805586,-0.802794,...,0,0,1,3,0,0,1,0,False,False
2,3,10,walk,walk,0.924945,0.612202,0.653157,0.452357,0.561752,-1.218286,...,0,2,1,2,0,0,0,0,False,False
3,4,10,walk,walk,0.581184,0.344008,0.478947,0.884291,1.159450,0.104910,...,0,4,0,0,1,0,0,0,False,False
4,5,10,walk,walk,1.023958,1.404956,1.236617,1.306314,1.680328,-0.135492,...,1,1,3,1,0,0,0,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542609,542610,179,train,subway,16.219073,17.377757,17.833244,16.757143,15.631111,0.278373,...,0,5,0,0,0,0,0,0,False,False
542610,542611,179,train,subway,16.990752,16.984657,17.896499,17.651621,17.014180,0.259203,...,0,5,0,0,0,0,0,0,False,False
542611,542612,179,train,subway,14.432845,14.213134,13.944071,14.341203,14.242879,-0.246348,...,0,5,0,0,0,0,0,0,False,False
542612,542613,179,train,subway,14.441516,14.116439,13.737667,13.778328,13.450009,-0.311967,...,0,5,0,0,0,0,0,0,False,False


### **Next Steps: Machine Learning Models**