In [72]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from sklearn import preprocessing
import sklearn
from datetime import time
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle
from scipy.ndimage.filters import gaussian_filter
import plotly.graph_objects as go

In [85]:
# Run configuration for the dataset build.
# nDrives caps how many drive CSV paths are kept when the files are collected.
nDrives = 50
# True  -> read drives from outputs/weightedInterpolation
# False -> read drives from outputs/unweightedInterpolation
use_weighted_average_interpolation = True

In [86]:
# Collect the per-drive CSV files for the selected interpolation variant.
# The two variants differ only in the directory name, so pick that first.
interpolationDir = (
    "weightedInterpolation"
    if use_weighted_average_interpolation
    else "unweightedInterpolation"
)

# rglob yields files in whatever order the filesystem returns them, so the
# matches are sorted before truncating: the first nDrives paths are then the
# same files on every run and on every machine.
drivePaths = sorted(
    str(path)
    for path in Path(".").rglob(f"outputs/{interpolationDir}/dataByLocation*.csv")
)[:nDrives]

In [74]:
# Load every drive CSV and keep two parallel lists:
#   drivesWithLocation    - the full frame (including Time/Longitude/Latitude)
#   drivesWithoutLocation - the same rows with those three columns removed
# Every subsamplingPeriod-th row is kept; a period of 1 keeps all rows.
subsamplingPeriod = 1

drivesWithLocation = []
drivesWithoutLocation = []
for drivePath in drivePaths:
    drive = pd.read_csv(drivePath)
    # Recordings with 1200 rows or fewer (counted before subsampling) are skipped.
    if len(drive) <= 1200:
        continue
    drive = drive.iloc[::subsamplingPeriod]
    drivesWithLocation.append(drive)
    # Alternative kept for reference: additionally drop the acceleration
    # channels ["ZAcceleration", "LongAcceleration", "LatAcceleration"].
    drivesWithoutLocation.append(
        drive.drop(columns=["Time", "Longitude", "Latitude"])
    )
print("Found", len(drivesWithoutLocation), "drives that meet specifications")

Found 41 drives that meet specifications


## Normalize Data

In [75]:
# Standardize each drive on its own: a fresh StandardScaler is fitted per
# drive, so no statistics are shared between drives.
# The result is rebuilt from a bare array, so the column labels of every
# normalized frame are the integer positions 0..n-1, not the CSV names.
normalizedDrives = [
    pd.DataFrame(preprocessing.StandardScaler().fit_transform(rawDrive.values[:]))
    for rawDrive in drivesWithoutLocation
]

## Smooth Data

In [76]:
# Gaussian-smooth (sigma=2) every column of every normalized drive along the
# sample axis.
# NOTE(review): the windowing cell further down reads normalizedDrives, not
# smoothDrives, so this result is not consumed by the visible pipeline.
smoothDrives = []
for drive in normalizedDrives:
    # Build the frame column-by-column, keyed by the original column label, so
    # the smoothed drive keeps the (samples x features) layout of `drive`.
    # Handing the list of 1-D arrays straight to pd.DataFrame would turn every
    # feature into a ROW, i.e. silently transpose the drive.
    smoothedColumns = {
        feature: gaussian_filter(drive[feature].to_numpy(), sigma=2)
        for feature in drive.columns
    }
    # To inspect the effect, plot drive[feature] against
    # smoothedColumns[feature] (e.g. two go.Scatter traces on one figure).
    smoothDrives.append(pd.DataFrame(smoothedColumns, index=drive.index))

## Window Data

In [77]:
# Number of consecutive rows in one model input window.
sequenceLength = 10

# Signal names; this list is not referenced by the windowing loop below.
features = [
    'Speed',
    'LatAcceleration',
    'LongAcceleration',
    'SteerTorque',
    'SteerRate',
    'SteerAngle',
    'FLWheelSpeed',
    'FRWheelSpeed',
    'RRWheelSpeed',
    'RLWheelSpeed',
]

# Cut each normalized drive into overlapping windows with a stride of one row.
# Window starts run over range(len(drive) - sequenceLength), so the last
# possible full window (starting at len(drive) - sequenceLength) is not emitted.
# NOTE(review): windows are taken from normalizedDrives, not smoothDrives.
windowedDrives = []
for drive in normalizedDrives:
    windowStarts = range(len(drive) - sequenceLength)
    windowedDrives.append(
        np.array([drive[start:start + sequenceLength] for start in windowStarts])
    )

## Generate Labels

In [78]:
# Pair every window with a 2-element label [delta longitude, delta latitude].
# The label for the window starting at row i is the position at row
# i + labelOffset minus the position at row i of the same drive.
labelOffset = 5

dataset = {"samples": [], "labels": []}
for k, drive in enumerate(windowedDrives):
    # Pull the coordinate columns out once per drive. Indexing a whole
    # DataFrame row with .iloc for every sample builds a new Series each time,
    # which dominates the runtime for hundreds of thousands of samples.
    longitudes = drivesWithLocation[k]["Longitude"].to_numpy()
    latitudes = drivesWithLocation[k]["Latitude"].to_numpy()

    # The final window of each drive is skipped (drive[:-1]).
    for i, window in enumerate(drive[:-1]):
        dataset["samples"].append(window)
        dataset["labels"].append([
            longitudes[i + labelOffset] - longitudes[i],
            latitudes[i + labelOffset] - latitudes[i],
        ])
# dataset = pd.DataFrame(dataset)

### Normalize labels

In [79]:
# Rescale the raw coordinate differences by a constant factor of 10**5.
# (Fitting a preprocessing.MinMaxScaler on the labels was an earlier
# alternative; the fixed factor is what is used.)
originalLabels = dataset["labels"]
labels_normalized = (10**5) * np.array(originalLabels)

# Quick sanity output: array shape and container type.
print(labels_normalized.shape)
print(type(labels_normalized))

(556450, 2)
<class 'numpy.ndarray'>


## Smooth Labels

In [84]:
# Gaussian-smooth the scaled labels along the sample axis, one drive at a time.
sigma = 8

# labels_normalized is the concatenation of all drives' labels, in the order of
# windowedDrives, with len(drive[:-1]) labels per drive. Filtering the whole
# array in one go would blend the end of each drive with the start of the next
# (unrelated) drive, so the array is split at the drive boundaries first.
labelsPerDrive = [len(drive[:-1]) for drive in windowedDrives]
assert sum(labelsPerDrive) == len(labels_normalized), \
    "labels_normalized does not line up with windowedDrives"
driveBoundaries = np.cumsum(labelsPerDrive, dtype=int)[:-1]

smoothedSegments = []
for segment in np.split(labels_normalized, driveBoundaries):
    if len(segment) == 0:
        # A drive that produced no labels has nothing to smooth.
        continue
    smoothedSegments.append(np.column_stack((
        gaussian_filter(segment[:, 0], sigma=sigma),
        gaussian_filter(segment[:, 1], sigma=sigma),
    )))

# Shape (n_labels, 2): column 0 is longitude, column 1 is latitude.
smoothedLabels = np.vstack(smoothedSegments)
smoothedLongitudeLabels = smoothedLabels[:, 0]
smoothedLatitudeLabels = smoothedLabels[:, 1]

# scatterData = pd.DataFrame({
# "index":range(len(smoothedLongitudeLabels)),
# "smoothLongLabels": smoothedLongitudeLabels,
# "originalLongLabels": labels_normalized[:,0],
# "smoothLatLabels": smoothedLatitudeLabels,
# "originalLatLabels": labels_normalized[:,1],
# })
# scatterData.describe()

# fig1 = go.Figure()
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.smoothLongLabels, name="gaussian smoothed long"))
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.originalLongLabels, name="original long"))
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.smoothLatLabels, name="gaussian smoothed lat"))
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.originalLatLabels, name="original lat"))

# fig1.show()

In [81]:
# Turn the list of windows into one array and pair it with the smoothed labels.
# `dataset` is rebound here: it goes from a dict of Python lists to a dict of
# numpy arrays with the same two keys.
stackedSamples = np.stack(dataset["samples"])
samples, labels = stackedSamples, smoothedLabels
dataset = dict(samples=samples, labels=labels)

### Pickle

In [83]:
# Persist the dataset with pickle. The file must be opened in binary mode.
# It is opened with 'wb' (truncate), not 'ab' (append): appending would stack a
# new pickle behind the old one on every run, and a single pickle.load() on
# that file would keep returning the first, stale dataset.
# The `with` block closes the file even if dumping fails.
with open('otherLargeFiles/CNN-dataset.pkl', 'wb') as dbfile:
    # pickle.dump(source object, destination file)
    pickle.dump(dataset, dbfile)