In [1]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from sklearn import preprocessing
import sklearn
from datetime import time
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle
from scipy.ndimage.filters import gaussian_filter
import plotly.graph_objects as go
from collections import Counter

In [2]:
# --- Run configuration ---------------------------------------------------
# Upper bound on how many per-drive CSV files are loaded.
nDrives = 50

# Data-source switches read by the path-discovery cell below.
# use_new_features takes priority: when it is True the drives come from
# "outputs/withNewFeatures" and the interpolation flag has no effect.
use_new_features = True
# True  -> "outputs/weightedInterpolation"
# False -> "outputs/unweightedInterpolation"
use_weighted_average_interpolation = True

In [3]:
# Choose which output directory the per-drive CSVs are read from.
# "withNewFeatures" takes precedence over both interpolation variants, so
# use_weighted_average_interpolation only matters when use_new_features is False.
if use_new_features:
    driveSubdirectory = "withNewFeatures"
elif use_weighted_average_interpolation:
    driveSubdirectory = "weightedInterpolation"
else:
    driveSubdirectory = "unweightedInterpolation"

# rglob yields matches in filesystem order, which is not stable across machines
# or runs. Sort before truncating so the same nDrives files are picked every time.
drivePaths = sorted(
    str(path)
    for path in Path(".").rglob("outputs/" + driveSubdirectory + "/dataByLocation*.csv")
)[:nDrives]

In [4]:
# Keep every subsamplingPeriod-th row of each drive (1 = keep all rows).
subsamplingPeriod = 1
# Drives with this many rows or fewer are discarded.
minDriveLength = 1200
# A single drive file that is explicitly excluded from the dataset.
excludedDriveFile = "dataByLocation_2021-06-03-19-38-35_2T3Y1RFV8KC014025.csv"

drivesWithLocation = []      # full frames, including Time/Longitude/Latitude
drivesWithoutLocation = []   # same rows with Time/Longitude/Latitude removed
driveIDs = []
for drivePath in drivePaths:
    # Skip the excluded file before paying the cost of parsing it.
    if excludedDriveFile in drivePath:
        continue
    drive = pd.read_csv(drivePath)
    if len(drive) <= minDriveLength:
        continue
    # Use Path(...).name instead of splitting on "/" so the ID is also correct
    # on platforms whose path separator is not "/". The ID is the file name
    # with its first and last "_"-separated fields removed.
    driveIDs.append("_".join(Path(drivePath).name.split("_")[1:-1]))
    drive = drive.iloc[::subsamplingPeriod]
    driveWithoutLocation = drive.drop(columns=["Time", "Longitude", "Latitude"])
    # Disabled: also drop the acceleration channels.
    # driveWithoutLocation = driveWithoutLocation.drop(columns=["ZAcceleration", "LongAcceleration", "LatAcceleration"])
    drivesWithLocation.append(drive)
    drivesWithoutLocation.append(driveWithoutLocation)
print("Found", len(drivesWithoutLocation), "drives that meet specifications")

Found 40 drives that meet specifications


## Normalize Data

In [5]:
# Fit a single StandardScaler on the rows of all drives stacked together, so
# every drive is scaled with the same per-column statistics.
combinedDrivesForScalerFitting = pd.concat(drivesWithoutLocation, ignore_index=True)
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(combinedDrivesForScalerFitting)

normalizedDrives = []
for drive in drivesWithoutLocation:
    # Transform the DataFrame itself (not its bare values) so the input matches
    # what the scaler was fitted on; the result is re-wrapped in a DataFrame
    # with integer column labels and a fresh 0..n-1 row index.
    data_normalized = pd.DataFrame(standard_scaler.transform(drive))
    # Column 0 is restored to its original, un-normalized values.
    data_normalized[0] = drive.to_numpy()[:, 0]
    normalizedDrives.append(data_normalized)

# Write with 'wb', not 'ab': appending would leave scalers from earlier runs at
# the start of the file, and a single pickle.load() would return the stale one.
# The context manager also closes the file if pickle.dump raises.
with open('otherLargeFiles/scalerUsedForTrainingInputs.pkl', 'wb') as scalerFile:
    pickle.dump(standard_scaler, scalerFile)

## Generate Labels

In [6]:
# Build the training windows.
#   samples[j] : `sequenceLength` consecutive rows of a normalized drive,
#                starting at row i
#   labels[j]  : (Longitude, Latitude) at row i + sequenceLength minus
#                (Longitude, Latitude) at row i, taken from the matching
#                drive in drivesWithLocation
sequenceLength = 10

# Each drive contributes len(drive) - sequenceLength - 1 windows. Clamp at 0 so
# a drive that is too short contributes nothing instead of shrinking the array.
windowsPerDrive = [max(len(drive) - sequenceLength - 1, 0) for drive in normalizedDrives]
datasetLength = sum(windowsPerDrive)
dataset = {
    "samples": np.full((datasetLength, sequenceLength, len(normalizedDrives[0].columns)), -1.),
    "labels": np.full((datasetLength, 2), -1.),
}
datasetIndex = 0

for k, drive in enumerate(normalizedDrives):
    print("on drive", k)
    nWindows = windowsPerDrive[k]
    if nWindows == 0:
        continue

    # Convert to NumPy once per drive; row-by-row iterrows()/.iloc access is
    # far slower and produces exactly the same numbers.
    driveValues = drive.to_numpy()
    positions = drivesWithLocation[k][["Longitude", "Latitude"]].to_numpy()

    for i in range(nWindows):
        dataset["samples"][datasetIndex + i] = driveValues[i:i + sequenceLength]

    # All labels of this drive at once: position `sequenceLength` rows ahead
    # minus position at the window start.
    dataset["labels"][datasetIndex:datasetIndex + nWindows] = (
        positions[sequenceLength:sequenceLength + nWindows] - positions[:nWindows]
    )
    datasetIndex += nWindows

on drive 0
on drive 1
on drive 2
on drive 3
on drive 4
on drive 5
on drive 6
on drive 7
on drive 8
on drive 9
on drive 10
on drive 11
on drive 12
on drive 13
on drive 14
on drive 15
on drive 16
on drive 17
on drive 18
on drive 19
on drive 20
on drive 21
on drive 22
on drive 23
on drive 24
on drive 25
on drive 26
on drive 27
on drive 28
on drive 29
on drive 30
on drive 31
on drive 32
on drive 33
on drive 34
on drive 35
on drive 36
on drive 37
on drive 38
on drive 39


### Normalize labels

In [7]:
# Standardize the (deltaLongitude, deltaLatitude) labels and persist the scaler.
#
# NOTE: this cell is not idempotent. It replaces dataset["labels"] with the
# normalized values, so running it a second time without first re-running the
# label-generation cell would fit (and save) a scaler on already-normalized data.
originalLabels = dataset["labels"]

scaler = preprocessing.StandardScaler()
labels_normalized = scaler.fit_transform(originalLabels)
dataset["labels"] = labels_normalized

# Write with 'wb', not 'ab': appending would leave scalers from earlier runs at
# the start of the file, and a single pickle.load() would return the stale one.
# The context manager also closes the file if pickle.dump raises.
with open('otherLargeFiles/scalerUsedForTrainingLabels.pkl', 'wb') as scalerFile:
    pickle.dump(scaler, scalerFile)

### Pickle

In [9]:
# Persist the dataset dict ("samples" and "labels") for the training notebook.
# Pickle files must be opened in binary mode. Use 'wb' rather than 'ab' so each
# run replaces the file: with append mode, earlier datasets stay at the start of
# the file, pickle.load() returns the oldest one, and the file grows every run.
with open('otherLargeFiles/CNN-dataset.pkl', 'wb') as dbfile:
    pickle.dump(dataset, dbfile)