In [1]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from sklearn import preprocessing
import sklearn
from datetime import time
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle
from scipy.ndimage.filters import gaussian_filter
import plotly.graph_objects as go

In [2]:
# Collect the per-drive CSV exports found anywhere below the working directory.
# Path.rglob yields entries in an arbitrary, filesystem-dependent order, so the
# matches are sorted before truncating; otherwise the subset of drives that
# ends up in the dataset could differ between machines or runs.
maxDrives = 50  # upper bound on how many drive files are loaded
drivePaths = sorted(str(path) for path in Path(".").rglob("outputs/dataByLocation*.csv"))[:maxDrives]

In [3]:
# Keep every `subsamplingPeriod`-th row of each sufficiently long drive.
subsamplingPeriod = 16

drivesWithLocation = []     # subsampled drives, all columns retained
drivesWithoutLocation = []  # same drives minus Time / Longitude / Latitude
for drivePath in drivePaths:
    fullDrive = pd.read_csv(drivePath)
    # Drives with 1200 rows or fewer (before subsampling) are skipped.
    if not len(fullDrive) > 1200:
        continue
    subsampledDrive = fullDrive.iloc[::subsamplingPeriod]
    drivesWithLocation.append(subsampledDrive)
    drivesWithoutLocation.append(
        subsampledDrive.drop(columns=["Time", "Longitude", "Latitude"])
    )
    # Optional extra feature pruning, currently disabled:
    # .drop(columns=["ZAcceleration", "YawRate", "SteerTorque", "SteerRate", "SteerFraction", "FLWheelSpeed", "FRWheelSpeed", "RRWheelSpeed", "RLWheelSpeed"])
print("Found", len(drivesWithoutLocation), "drives that meet specifications")

Found 41 drives that meet specifications


## Normalize Data

In [4]:
# Standardize every drive independently: a fresh StandardScaler is fitted on
# each drive's own values, so statistics are not shared between drives.
# The result is wrapped in a DataFrame with default integer column labels.
normalizedDrives = []
for rawDrive in drivesWithoutLocation:
    standard_scaler = preprocessing.StandardScaler()
    data_normalized = pd.DataFrame(standard_scaler.fit_transform(rawDrive.values[:]))
    normalizedDrives.append(data_normalized)

## Smooth Data

In [5]:
# Gaussian-smooth every feature column of every normalized drive.
#
# Each smoothed drive keeps the layout of its input: one row per time step,
# one column per feature. Building the frame from a plain list of 1-D arrays
# would instead turn every feature into a ROW (a transposed frame), so the
# columns are assembled as a {column label: smoothed values} mapping.
#
# NOTE(review): the windowing cell further down iterates `normalizedDrives`,
# so `smoothDrives` is not consumed by the cells visible here - confirm
# whether the windows were meant to be built from the smoothed data.
smoothDrives = []
for drive in normalizedDrives:
    smoothDriveSeries = {
        # Select by label (not position) so this stays correct even if the
        # column labels are not 0..n-1.
        feature: gaussian_filter(drive[feature].to_numpy(), sigma=2)
        for feature in drive.columns
    }
    smoothDrives.append(pd.DataFrame(smoothDriveSeries, index=drive.index))

# Debug aid - overlay one original feature and its smoothed version:
#     fig1 = go.Figure()
#     fig1.add_trace(go.Scatter(x=drive.index, y=smoothDrives[-1][feature], name="gaussian smoothed data"))
#     fig1.add_trace(go.Scatter(x=drive.index, y=drive[feature], name="original data"))
#     fig1.show()

## Window Data

In [6]:
# Number of consecutive (subsampled) rows in one window.
sequenceLength = 10
# Feature names kept for reference; not used by the windowing loop below.
features = ['Speed',
            'LatAcceleration',
            'LongAcceleration',
            'SteerTorque',
            'SteerRate',
            'SteerAngle',
            'FLWheelSpeed',
            'FRWheelSpeed',
            'RRWheelSpeed',
            'RLWheelSpeed']

# Slide a window of `sequenceLength` rows over each normalized drive with a
# stride of one row. Start offsets run over range(len(drive) - sequenceLength).
# Every drive becomes one stacked array of its windows.
windowedDrives = []
for drive in normalizedDrives:
    windowCount = len(drive) - sequenceLength
    driveWindows = [
        drive.iloc[start:start + sequenceLength]
        for start in range(windowCount)
    ]
    windowedDrives.append(np.array(driveWindows))

## Generate Labels

In [7]:
# Pair each window with a [delta longitude, delta latitude] label: the
# position change between row i and row i + 5 of the matching subsampled
# drive. The last window of every drive is left out (drive[:-1]).
sampleWindows = []
displacementLabels = []
for k, drive in enumerate(windowedDrives):
    locations = drivesWithLocation[k]
    for i, window in enumerate(drive[:-1]):
        last = locations.iloc[i]
        cur = locations.iloc[i + 5]
        sampleWindows.append(window)
        displacementLabels.append(
            [cur.Longitude - last.Longitude, cur.Latitude - last.Latitude]
        )
dataset = pd.DataFrame({"samples": sampleWindows, "labels": displacementLabels})

### Normalize labels

In [8]:
# Rescale both label components with a MinMaxScaler fitted on all labels.
# `scaler` is kept so the transform can be inverted later.
originalLabels = dataset["labels"].tolist()

scaler = preprocessing.MinMaxScaler()
scaler.fit(originalLabels)
labels_normalized = scaler.transform(originalLabels)

(34374, 2)

## Smooth Labels

In [18]:
# Gaussian-smooth the normalized labels one drive at a time.
#
# `labels_normalized` holds the labels of all drives back to back. Filtering
# that array in a single pass would blend the last labels of one drive with
# the first labels of the next, unrelated drive, so each drive's segment is
# smoothed separately and the results are stacked back in the same order.

# The label cell emits one label per window except the last window of a drive.
labelsPerDrive = [max(len(drive) - 1, 0) for drive in windowedDrives]
assert sum(labelsPerDrive) == len(labels_normalized), (
    "windowedDrives and labels_normalized are out of sync - re-run the cells above"
)

smoothedSegments = []
segmentStart = 0
for labelCount in labelsPerDrive:
    segment = labels_normalized[segmentStart:segmentStart + labelCount]
    segmentStart += labelCount
    if labelCount == 0:
        continue  # this drive produced no labels; nothing to smooth
    smoothedSegments.append(np.vstack((
        gaussian_filter(segment[:, 0], sigma=2),
        gaussian_filter(segment[:, 1], sigma=2),
    )).T)

# Fall back to an empty (0, 2) slice when no drive contributed any label.
smoothedLabels = np.vstack(smoothedSegments) if smoothedSegments else labels_normalized[:0]

# Per-component views, kept under their original names.
normalizedLongitudeLabels = smoothedLabels[:, 0]
normalizedLatitudeLabels = smoothedLabels[:, 1]

In [19]:
# Stack the per-window samples into one array and pair it with the smoothed
# labels. `dataset` is rebound here from a DataFrame to a plain dict of arrays.
samples, labels = np.stack(dataset["samples"]), smoothedLabels
dataset = dict(samples=samples, labels=labels)

### Pickle

In [22]:
# Persist the dataset dict as a single pickle.
#
# The file is opened in binary *write* mode ('wb'). Append mode ('ab') would
# add one more pickle to the file on every re-run of this cell, while
# pickle.load only returns the first object in the file - i.e. the stale one.
outputPath = Path('otherLargeFiles/CNN-dataset.pkl')
outputPath.parent.mkdir(parents=True, exist_ok=True)  # create the folder on first run

# The context manager closes the file even if pickle.dump raises.
with outputPath.open('wb') as dbfile:
    # source, destination
    pickle.dump(dataset, dbfile)

In [21]:
# Sanity check: display the shape of the stacked sample array.
dataset["samples"].shape

(34374, 10, 13)