In [1]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from sklearn import preprocessing
import sklearn
from datetime import time
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle
from scipy.ndimage.filters import gaussian_filter
import plotly.graph_objects as go
from collections import Counter
import random

In [2]:
# Seed Python's global RNG first so the drive sampling done later in the
# notebook is repeatable from a fresh kernel.
random.seed(1)

# Number of drive CSV files to draw from the pool of candidates.
nDrives = 150

In [3]:
# Collect every candidate drive CSV below the working directory.
# rglob() yields paths in a filesystem-dependent order, so the list is sorted
# first; otherwise the seeded random.sample() below would pick different
# drives on different machines even with the same seed.
candidateDrivePaths = sorted(
    str(path) for path in Path(".").rglob("outputs/withNewFeatures/dataByLocation*.csv")
)
if len(candidateDrivePaths) < nDrives:
    # random.sample would raise ValueError here anyway; say why.
    raise ValueError(
        f"Requested {nDrives} drives but only found {len(candidateDrivePaths)} candidate CSV files"
    )

# Uses the global `random` state seeded in the configuration cell.
drivePaths = random.sample(candidateDrivePaths, nDrives)

# Filled by the loading cell with the paths that pass its filters.
drivesUsed = []

In [4]:
# Keep every `subsamplingPeriod`-th row of each drive (1 = keep all rows).
subsamplingPeriod = 1
# Drives must have MORE than this many rows (before subsampling) to be used.
minDriveLength = 1200
# A single drive file that is explicitly excluded from the dataset.
excludedDriveFile = "dataByLocation_2021-06-03-19-38-35_2T3Y1RFV8KC014025.csv"

# Reset all accumulators here so that re-running this cell does not append
# duplicates to lists created in an earlier cell.
drivesUsed = []
drivesWithLocation = []
drivesWithoutLocation = []
driveIDs = []

for drivePath in drivePaths:
    # Skip the excluded drive before paying for the CSV read.
    if excludedDriveFile in drivePath:
        continue

    drive = pd.read_csv(drivePath)
    if len(drive) <= minDriveLength:
        continue

    # Drive ID = the file name's underscore-separated parts without the first
    # and last ones. Path(...).name is used instead of splitting on "/" so
    # this also works with Windows path separators.
    fileName = Path(drivePath).name
    driveIDs.append("_".join(fileName.split("_")[1:-1]))
    drivesUsed.append(drivePath)

    drive = drive.iloc[::subsamplingPeriod]
    # Model inputs exclude the timestamp and the absolute position; the
    # location-bearing frame is kept separately for label generation.
    # (To also ablate the accelerometer features, additionally drop
    # "ZAcceleration", "LongAcceleration" and "LatAcceleration" here.)
    driveWithoutLocation = drive.drop(columns=["Time", "Longitude", "Latitude"])

    drivesWithLocation.append(drive)
    drivesWithoutLocation.append(driveWithoutLocation)

print("Found", len(drivesWithoutLocation), "drives that meet specifications")

Found 128 drives that meet specifications


## Normalize Data

In [5]:
# Fit ONE scaler on all drives stacked together so that every drive is
# standardized with the same per-feature mean and standard deviation.
combinedDrivesForScalerFitting = pd.concat(drivesWithoutLocation, ignore_index=True)
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(combinedDrivesForScalerFitting)

normalizedDrives = []
for drive in drivesWithoutLocation:
    # Transform the DataFrame itself (not a bare ndarray) so the input matches
    # what the scaler was fitted on; the result is re-wrapped in a DataFrame
    # with integer column labels and a fresh 0..n-1 index.
    data_normalized = pd.DataFrame(standard_scaler.transform(drive))
    # The first feature column is deliberately restored to its raw,
    # un-normalized values. to_numpy() avoids index alignment against the
    # drive's original (possibly subsampled) index.
    data_normalized[0] = drive.iloc[:, 0].to_numpy()
    normalizedDrives.append(data_normalized)

# 'wb' (not 'ab'): appending would stack a new pickle after the old one on
# every run, and pickle.load() only returns the first, stale object.
with open('otherLargeFiles/scalerUsedForTrainingInputs.pkl', 'wb') as scalerFile:
    pickle.dump(standard_scaler, scalerFile)

## Generate Labels

In [None]:
# Build the training set with vectorized NumPy indexing instead of row-wise
# iterrows()/iloc lookups.
#
# For every drive and every start row i with i < len(drive) - sequenceLength - 1:
#   sample = the `sequenceLength` consecutive normalized rows i .. i+sequenceLength-1
#   label  = [Longitude, Latitude] at row i+sequenceLength minus the same at row i
sequenceLength = 10

# Number of windows each drive contributes; clamped at 0 so a drive shorter
# than sequenceLength + 1 rows cannot make the total inconsistent.
windowsPerDrive = [max(len(drive) - sequenceLength - 1, 0) for drive in normalizedDrives]
datasetLength = sum(windowsPerDrive)
nFeatures = len(normalizedDrives[0].columns)

# Pre-allocate; -1 is only a placeholder and every row is overwritten below.
dataset = {
    "samples": np.full((datasetLength, sequenceLength, nFeatures), -1.),
    "labels": np.full((datasetLength, 2), -1.),
}
datasetIndex = 0
windowOffsets = np.arange(sequenceLength)

for k, drive in enumerate(normalizedDrives):
    print("on drive", k)
    nWindows = windowsPerDrive[k]
    if nWindows == 0:
        continue

    features = drive.to_numpy()
    positions = drivesWithLocation[k][["Longitude", "Latitude"]].to_numpy()

    # windowRows[i, n] == i + n, so features[windowRows] has shape
    # (nWindows, sequenceLength, nFeatures): one window per start row.
    windowRows = np.arange(nWindows)[:, None] + windowOffsets
    destination = slice(datasetIndex, datasetIndex + nWindows)
    dataset["samples"][destination] = features[windowRows]

    # Displacement between the row just after each window and the window's
    # first row, as (deltaLongitude, deltaLatitude).
    dataset["labels"][destination] = (
        positions[sequenceLength:sequenceLength + nWindows] - positions[:nWindows]
    )
    datasetIndex += nWindows

### Normalize labels

In [20]:
# Standardize the label columns and persist the fitted scaler so predictions
# can be mapped back to the original label units later.
#
# NOTE: this cell overwrites dataset["labels"] with the normalized values, so
# it must be run exactly once per freshly generated dataset. Re-running it
# would refit the scaler on already-normalized labels and save a scaler that
# no longer inverts to the original units.
originalLabels = dataset["labels"]

scaler = preprocessing.StandardScaler()
labels_normalized = scaler.fit_transform(originalLabels)
dataset["labels"] = labels_normalized

# 'wb' (not 'ab'): appending would leave an older scaler at the front of the
# file, which is the one pickle.load() would return.
with open('otherLargeFiles/scalerUsedForTrainingLabels.pkl', 'wb') as scalerFile:
    pickle.dump(scaler, scalerFile)

### Pickle

In [21]:
# Persist the full dataset dict ("samples" and "labels" arrays).
# Opened with 'wb' rather than 'ab': append mode would grow the file on every
# run while pickle.load() keeps returning the first (oldest) dataset in it.
# The `with` block guarantees the file is closed even if dumping fails.
with open('otherLargeFiles/CNN-dataset.pkl', 'wb') as dbfile:
    pickle.dump(dataset, dbfile)

In [46]:
# Record which drive files went into the dataset.
# Opened with 'wb' rather than 'ab' so a re-run replaces the list instead of
# appending a second pickle that pickle.load() would never reach.
with open('otherLargeFiles/drives-used.pkl', 'wb') as dbfile:
    pickle.dump(drivesUsed, dbfile)