In [22]:
import tensorflow as tf
import os
import pandas as pd
import strym
from strym import strymread
import matplotlib.pyplot as plt
from pylab import rcParams
import strym.DBC_Read_Tools as dbc
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

# Load in Data
I load in data from the .csv files in the data directory (one dataframe per file), each with the columns `['Time', 'Bus', 'MessageID', 'Message', 'MessageLength', 'Latitude', 'Longitude']`. The columns for the training data are `['Time', 'Bus', 'MessageID', 'Message', 'MessageLength']` while the columns for the labels are `['Time', 'Latitude', 'Longitude']`.

In [23]:
# Pipeline switches read by the functions and cells below.
use_decoded = False       # True: read CSVs from outputs/ with pandas; False: read encoded_data/ via strymread
apply_window = True       # run window_data() to flatten sliding windows of rows into single samples
use_delta_coords = True   # run convert_coords() so labels hold changes in latitude/longitude
filter_ids = True         # run filter_IDs() on raw (non-decoded) data
use_histogram = True      # run to_histogram() to append per-MessageID count columns
use_kfold = False         # read by split_dataframe(); it was referenced there but never defined

## Load the Dataframe

In [24]:
def load_dataframe():
    '''Loads one dataframe per file found in the configured data directory.

    When the module-level flag ``use_decoded`` is true, every file in
    ``outputs/`` is read with ``pd.read_csv``; otherwise every file in
    ``../../libpanda/encoded_data/`` is opened with ``strymread`` and its
    ``.dataframe`` attribute is used.

    Returns:
        list: one dataframe per regular file, in sorted file-name order.
    '''
    if use_decoded:
        data_path = 'outputs/'
        read_frame = pd.read_csv
    else:
        data_path = '../../libpanda/encoded_data/'
        read_frame = lambda path: strymread(path).dataframe

    # os.listdir() returns entries in arbitrary order and also lists
    # sub-directories (for example Jupyter's .ipynb_checkpoints). Sort so runs
    # are reproducible and keep regular files only so the readers do not crash.
    file_paths = [os.path.join(data_path, name) for name in sorted(os.listdir(data_path))]
    return [read_frame(path) for path in file_paths if os.path.isfile(path)]

Next, we split each dataframe into training data and labels and testing data and labels.

In [25]:
def split_dataframe(df, debug_num_samples, num_test_samples):
    '''Splits each dataframe into training and testing data and labels.

    Args:
        df: an iterable of dataframes (one per recording), or a single
            dataframe, which is treated as a one-element list.
        debug_num_samples: number of leading rows used for training.
        num_test_samples: number of trailing rows used for testing.

    Returns:
        list: one ``[can_data_df, labels_df, test_data_df, test_labels_df]``
        entry per input frame, each with its index reset to 0..n-1.

    The feature columns depend on the module-level flag ``use_decoded``.
    If a module-level ``use_kfold`` flag exists and is true, the whole frame
    is kept as training data and the test frames are empty.
    '''
    if isinstance(df, pd.DataFrame):
        df = [df]

    # use_kfold may not be defined in the notebook; default to a plain
    # head/tail split instead of raising NameError.
    kfold = globals().get('use_kfold', False)

    if use_decoded:
        features = ['Time', 'Speed', 'LatAcceleration', 'LongAcceleration', 'ZAcceleration', 'YawRate', 'SteerTorque', 'SteerRate',
                    'SteerAngle', 'SteerFraction', 'FLWheelSpeed', 'FRWheelSpeed', 'RRWheelSpeed', 'RLWheelSpeed']
    else:
        features = ['Time', 'MessageID', 'Message']

    labels = ['Time', 'Latitude', 'Longitude']

    datalist = []
    for frame in df:
        if kfold:
            train_df = frame
            test_df = frame.iloc[0:0]
        else:
            train_df = frame.iloc[:debug_num_samples]
            # Explicit start position: a plain [-n:] slice returns the whole
            # frame when n == 0.
            test_start = max(len(frame) - num_test_samples, 0)
            test_df = frame.iloc[test_start:]

            # The head and tail slices are disjoint exactly when together
            # they need no more rows than the frame has.
            if debug_num_samples + num_test_samples <= len(frame):
                print('training and test data do not intersect :)')
            else:
                print('training and test data intersect :(')

        can_data_df = train_df[features].reset_index(drop=True)
        labels_df = train_df[labels].reset_index(drop=True)

        test_data_df = test_df[features].reset_index(drop=True)
        test_labels_df = test_df[labels].reset_index(drop=True)

        datalist.append([can_data_df, labels_df, test_data_df, test_labels_df])
    return datalist

The next cell is used to convert the Messages from strings representing hexadecimal values to integers.

In [26]:
def convert_messages(can_data_df, test_data_df):
    '''Converts the 'Message' column from hexadecimal strings to integers.

    Both frames are modified in place (their 'Message' column is replaced)
    and returned as ``(can_data_df, test_data_df)``.
    '''
    for frame in (can_data_df, test_data_df):
        # Replace the whole column at once. Element-wise chained assignment
        # (frame['Message'][i] = ...) depends on a 0..n-1 index and does not
        # write through under pandas copy-on-write.
        frame['Message'] = [int(message, 16) for message in frame['Message']]
    return can_data_df, test_data_df

The cell below changes the values in the 'Latitude' and 'Longitude' columns from absolute latitude and longitude to the change in latitude and longitude, respectively.

In [27]:
def convert_coords(labels_df, test_labels_df):
    '''Converts the GPS coordinates from absolute latitude and longitude to delta latitude and longitude.

    Each 'Latitude'/'Longitude' value is replaced by its difference from the
    previous row; the first row is set to 0. Both frames are modified in
    place and returned as ``(labels_df, test_labels_df)``. Empty frames are
    returned unchanged.
    '''
    for frame in (labels_df, test_labels_df):
        for column in ('Latitude', 'Longitude'):
            deltas = frame[column].diff()
            if len(deltas) > 0:
                # diff() leaves NaN in the first row; that row has no
                # predecessor, so its delta is defined as 0.
                deltas.iloc[0] = 0
            frame[column] = deltas
    return labels_df, test_labels_df

Now we are filtering out irrelevant message IDs based on the information given in the DBC file.

In [28]:
def filter_IDs(can_data_df, labels_df, test_data_df, test_labels_df, irrelevant_ids=(1568, 1570)):
    '''Filters out the IDs that are not relevant to the vehicle's location.

    Rows whose 'MessageID' is in ``irrelevant_ids`` are removed from each
    data frame, and the rows at the same positions are removed from the
    matching labels frame. Remaining rows keep their original index labels.

    Args:
        can_data_df, labels_df: training data and labels, same length.
        test_data_df, test_labels_df: testing data and labels, same length.
        irrelevant_ids: MessageID values to drop.

    Returns:
        tuple: ``(can_data_df, labels_df, test_data_df, test_labels_df)``.
    '''
    def _drop_irrelevant(data_df, label_df):
        # A positional boolean mask keeps data and labels aligned whatever
        # their index labels are, and filters in one pass rather than
        # rebuilding the frame for every dropped row.
        keep = ~data_df['MessageID'].isin(list(irrelevant_ids)).to_numpy()
        return data_df[keep], label_df[keep]

    can_data_df, labels_df = _drop_irrelevant(can_data_df, labels_df)
    test_data_df, test_labels_df = _drop_irrelevant(test_data_df, test_labels_df)
    return can_data_df, labels_df, test_data_df, test_labels_df

In [29]:
def to_histogram(interval, can_data_df, test_data_df, message_ids, test_message_ids):
    '''Converts the dataframe to a histogram representation.

    For every row, one count column per message ID is appended. Each holds
    how many rows with that 'MessageID' have a 'Time' within
    ``[row_time - interval, row_time]``, counting up to and including the
    current row.

    Args:
        interval: window length, in the units of the 'Time' column.
        can_data_df, test_data_df: frames with 'Time' and 'MessageID'
            columns. Assumes 'Time' is numeric and sorted ascending
            (TODO confirm against the loader). The inputs are not modified.
        message_ids, test_message_ids: iterables of IDs that get a count
            column in the respective frame. Duplicates are ignored, and
            columns follow first-appearance order. IDs that occur in a frame
            but not in its list are not counted.

    Returns:
        tuple: ``(can_data_df, test_data_df)`` as new frames with a fresh
        0..n-1 index and the count columns appended.
    '''
    def _with_histogram(frame, ids):
        # Appends the per-ID trailing-window count columns to one frame.
        frame = frame.reset_index(drop=True)
        times = frame['Time'].to_numpy()
        row_ids = frame['MessageID'].to_numpy()

        # Search the Time values themselves for the first row inside each
        # window; the positional index carries no time information.
        window_start = np.searchsorted(times, times - interval, side='left')
        window_end = np.arange(len(frame)) + 1

        counts = {}
        for message_id in dict.fromkeys(ids):
            # running[k] = number of rows with this ID among the first k rows,
            # so a window count is a difference of two prefix sums.
            running = np.concatenate(([0], np.cumsum(row_ids == message_id)))
            counts[message_id] = running[window_end] - running[window_start]

        histogram = pd.DataFrame(counts, index=frame.index)
        return pd.concat([frame, histogram], axis=1)

    return (_with_histogram(can_data_df, message_ids),
            _with_histogram(test_data_df, test_message_ids))

In [30]:
def to_np(can_data_df, labels_df, test_data_df, test_labels_df):
    '''Builds a numpy array from each of the four dataframes.

    Returns the arrays in argument order: training data, training labels,
    testing data, testing labels.
    '''
    frames = (can_data_df, labels_df, test_data_df, test_labels_df)
    train_x, train_y, test_x, test_y = (np.array(frame) for frame in frames)
    return train_x, train_y, test_x, test_y

This cell creates a window that slides over the data, creating subsamples that provide the model with context of previous changes in latitude and longitude.

In [31]:
def window_data(data, labels, testData, testLabels, sequenceLength=5):
    '''Segments the data into chunks by sliding a window over the array.

    Each output row is ``sequenceLength`` consecutive input rows flattened
    into one vector, with the window advancing one row at a time.

    Args:
        data, labels, testData, testLabels: arrays whose first axis is the
            row (sample) axis.
        sequenceLength: number of consecutive rows per window.

    Returns:
        tuple: ``(data, labels, testData, testLabels)``, each of shape
        ``(max(n_rows - sequenceLength, 0), sequenceLength * n_columns)``.
        An input with too few rows yields an empty array of that shape
        instead of raising.
    '''
    def _flatten_windows(rows):
        # Stacks every window of `rows` and flattens each into one vector.
        rows = np.asarray(rows)
        # NOTE(review): n - sequenceLength windows means the final row never
        # appears in any window; kept as in the original so output shapes do
        # not change. Confirm whether n - sequenceLength + 1 was intended.
        count = max(rows.shape[0] - sequenceLength, 0)
        width = sequenceLength * int(np.prod(rows.shape[1:]))
        if count == 0:
            return np.empty((0, width), dtype=rows.dtype)
        stacked = np.stack([rows[start:start + sequenceLength] for start in range(count)])
        return stacked.reshape(count, width)

    return (_flatten_windows(data), _flatten_windows(labels),
            _flatten_windows(testData), _flatten_windows(testLabels))

In [32]:
# Rows taken from the start of each recording for training and from the end
# for testing.
debug_num_samples = 50000
test_num_samples = 15000

dataframes = load_dataframe()

datalist = split_dataframe(dataframes, debug_num_samples, test_num_samples)

# Collect every distinct MessageID seen in any recording. Picking the longest
# raw 'MessageID' column only covered the IDs of one recording and carried
# one entry per row instead of one per ID.
messageIDs = sorted(set().union(*(lst[0]['MessageID'].unique() for lst in datalist)))

test_messageIDs = sorted(set().union(*(lst[2]['MessageID'].unique() for lst in datalist)))

[2021_07_23_16_48_55] (root) INFO: Vehicle model infered is toyota-rav4-2019


KeyboardInterrupt: 

In [None]:
# Per-recording arrays are collected here and concatenated in the next cell.
all_data = []
all_labels = []
all_test_data = []
all_test_labels = []

# Run the preprocessing steps selected by the config flags on each recording.
for can_data_df, labels_df, test_data_df, test_labels_df in datalist:
    # Raw logs carry hexadecimal message strings; decoded data skips this step.
    if not use_decoded:
        can_data_df, test_data_df = convert_messages(can_data_df, test_data_df)
    if use_delta_coords: 
        labels_df, test_labels_df = convert_coords(labels_df, test_labels_df)
    if not use_decoded and filter_ids:
        can_data_df, labels_df, test_data_df, test_labels_df = filter_IDs(can_data_df, labels_df, test_data_df, test_labels_df)
    if use_histogram:
        # Interval of 3 (units of the 'Time' column; presumably seconds, TODO confirm).
        # messageIDs is passed for both the training and the testing ID list;
        # test_messageIDs is not used here.
        can_data_df, test_data_df = to_histogram(3, can_data_df, test_data_df, messageIDs, messageIDs)
    data, labels, testData, testLabels = to_np(can_data_df, labels_df, test_data_df, test_labels_df)
    if apply_window:
        data, labels, testData, testLabels = window_data(data, labels, testData, testLabels)
        
    # Shapes of this recording's arrays.
    print(data.shape, labels.shape, testData.shape, testLabels.shape)
    all_data.append(data)
    all_labels.append(labels)
    all_test_data.append(testData)
    all_test_labels.append(testLabels)

In [None]:
# Join the per-recording arrays along the row axis into one array per role.
data, labels, test_data, test_labels = (
    np.concatenate(chunks, axis=0)
    for chunks in (all_data, all_labels, all_test_data, all_test_labels)
)
print(data.shape, labels.shape, test_data.shape, test_labels.shape)

# The Model

In [None]:
# Fixed random_state so repeated runs build the same forest and the score
# below is comparable between runs.
model = RandomForestRegressor(n_estimators=32, max_depth=50, random_state=0)

In [None]:
# Train the forest on the concatenated training arrays built above.
model.fit(data, labels)

# Results

In [None]:
# Score the fitted model on the held-out test arrays; as the cell's last
# expression, the value is displayed as the cell output.
# NOTE(review): for scikit-learn regressors this should be R^2 - confirm in the docs.
model.score(test_data, test_labels)