In [1]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from sklearn import preprocessing
import sklearn
from datetime import time
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle

In [2]:
# Collect at most 25 per-drive CSV exports found anywhere below the working directory.
# rglob yields entries in whatever order the filesystem returns them, so the paths are
# sorted before the cap is applied; otherwise a different subset of drives could be
# selected from one machine (or run) to the next.
drivePaths = sorted(str(path) for path in Path(".").rglob("outputs/dataByLocation*.csv"))[:25]

In [3]:
# Keep every `subsamplingPeriod`-th row of each drive (1 keeps all rows).
subsamplingPeriod = 1

# Full frames, including the Time/Longitude/Latitude columns.
drivesWithLocation = [
    pd.read_csv(drivePath).iloc[::subsamplingPeriod]
    for drivePath in drivePaths
]

# Same frames with the Time/Longitude/Latitude columns dropped.
drivesWithoutLocation = [
    locatedDrive.drop(columns=["Time", "Longitude", "Latitude"])
    for locatedDrive in drivesWithLocation
]

## Normalize Data

In [4]:
def _standardizeDrive(drive):
    """Scale one drive column-wise with a StandardScaler fitted on that drive only.

    The result is a new DataFrame built from the scaled array, so it carries
    default integer column labels rather than the original column names.
    """
    scaledValues = preprocessing.StandardScaler().fit_transform(drive.values[:])
    return pd.DataFrame(scaledValues)


# Each drive gets its own, independently fitted scaler.
normalizedDrives = [_standardizeDrive(drive) for drive in drivesWithoutLocation]

## Window Data

In [5]:
# Number of consecutive rows in each window.
sequenceLength = 32

# Signal names; not referenced by the windowing code below.
features = [
    'Speed',
    'LatAcceleration',
    'LongAcceleration',
    'SteerTorque',
    'SteerRate',
    'SteerAngle',
    'FLWheelSpeed',
    'FRWheelSpeed',
    'RRWheelSpeed',
    'RLWheelSpeed',
]

# Slide a window of `sequenceLength` rows over every normalized drive, one row at a time.
windowedDrives = []
for normalizedDrive in normalizedDrives:
    # Window starts run from 0 up to (but excluding) len - sequenceLength.
    windowStarts = range(len(normalizedDrive) - sequenceLength)
    driveWindows = [
        normalizedDrive[start:start + sequenceLength]
        for start in windowStarts
    ]
    # Stack the per-window frames into a single array for this drive.
    windowedDrives.append(np.array(driveWindows))

## Generate Labels

In [6]:
# Row offset between the two GPS fixes whose difference forms the label.
labelOffset = 5

# Pair every window (except the last one of each drive) with the change in
# Longitude/Latitude between row i and row i + labelOffset of the same drive,
# where i is the window's index within the drive.
# NOTE(review): both rows lie inside the 32-row window that starts at row i, so the
# label describes motion during the window's first rows — confirm this is the
# intended target rather than a displacement measured from the window's end.
dataset = {"samples": [], "labels": []}
for driveWindows, locatedDrive in zip(windowedDrives, drivesWithLocation):
    windowCount = max(len(driveWindows) - 1, 0)  # the final window is skipped
    if windowCount == 0:
        continue

    # Pull the coordinate columns out once per drive instead of building a row
    # Series with .iloc for every window, which dominates runtime on large drives.
    longitude = locatedDrive["Longitude"].to_numpy()
    latitude = locatedDrive["Latitude"].to_numpy()
    deltaLong = longitude[labelOffset:labelOffset + windowCount] - longitude[:windowCount]
    deltaLat = latitude[labelOffset:labelOffset + windowCount] - latitude[:windowCount]

    dataset["samples"].extend(driveWindows[:windowCount])
    dataset["labels"].extend(np.column_stack([deltaLong, deltaLat]).tolist())
dataset = pd.DataFrame(dataset)

### Normalize labels

In [7]:
# Each entry of the "labels" column is a [dLongitude, dLatitude] pair; stack them
# into a single (n, 2) array for the scaler.
originalLabels = np.array(dataset["labels"].tolist())

# Standardize the two label components independently (zero mean, unit variance).
scaler = preprocessing.StandardScaler()
labels_normalized = scaler.fit_transform(originalLabels)

# Write the normalized labels back one length-2 row per entry. Assigning the 2-D
# array itself to a single column is not portable across pandas versions (recent
# releases reject a 2-D value for a single column).
dataset["labels"] = list(labels_normalized)
labels_normalized.shape

(338930, 2)

### Formalize dataset

In [8]:
# Replace the DataFrame with a plain dict of arrays: the per-row windows are stacked
# into one array and paired with the normalized label array.
labels = labels_normalized
samples = np.stack(list(dataset["samples"]))
dataset = dict(samples=samples, labels=labels)

### Pickle

In [9]:
# Pickle requires binary mode. The file is opened with "wb" (truncate) rather than
# "ab": in append mode every re-run of this cell adds another pickle to the end of the
# file, while a single pickle.load() only returns the first (oldest) object in it.
# The `with` block also guarantees the handle is closed if dump() raises.
# NOTE(review): the fitted label `scaler` is not saved here; persist it too if
# predictions need to be mapped back to raw longitude/latitude deltas.
with open('otherLargeFiles/CNN-dataset.pkl', 'wb') as dbfile:
    # source, destination
    pickle.dump(dataset, dbfile)

In [None]:
# Sanity check on the exported dataset: there must be exactly one label row per
# sample window. (This cell previously evaluated a bare `x`, a name that is not
# defined in the notebook and therefore raised NameError on a fresh Run All.)
assert len(dataset["samples"]) == len(dataset["labels"])
dataset["samples"].shape, dataset["labels"].shape