In [84]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from sklearn import preprocessing
import sklearn
from datetime import time
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle
from scipy.ndimage.filters import gaussian_filter
import plotly.graph_objects as go
from collections import Counter

In [11]:
# Selects which interpolation output is loaded: True reads the CSVs under
# outputs/weightedInterpolation, False those under outputs/unweightedInterpolation.
use_weighted_average_interpolation = True
# Upper bound on how many matching CSV paths are kept (applied to the path list,
# i.e. before drives are filtered by length).
nDrives = 5

In [12]:
# Collect the per-drive CSVs for the selected interpolation variant.
# rglob does not guarantee any ordering of its results, so the paths are sorted
# before truncating to nDrives; otherwise the subset of drives that gets loaded
# could change from one machine or run to the next.
interpolationDir = "weightedInterpolation" if use_weighted_average_interpolation else "unweightedInterpolation"
drivePaths = sorted(
    str(path)
    for path in Path(".").rglob("outputs/" + interpolationDir + "/dataByLocation*.csv")
)[:nDrives]

In [31]:
subsamplingPeriod = 1   # keep every Nth row of each drive (1 keeps all rows)
minDriveLength = 1200   # drives with this many rows or fewer are skipped

drivesWithLocation = []
drivesWithoutLocation = []
driveIDs = []
for drivePath in drivePaths:
    drive = pd.read_csv(drivePath)
    if len(drive) <= minDriveLength:
        continue
    # The drive ID is every "_"-separated token of the file name except the first
    # and the last. Path(...).name is used instead of splitting on "/" so the
    # extraction does not depend on the platform's path separator.
    driveIDs.append("_".join(Path(drivePath).name.split("_")[1:-1]))
    drive = drive.iloc[::subsamplingPeriod]
    # The copy with coordinates is kept separately from the model inputs, which
    # exclude the GPS columns.
    drivesWithLocation.append(drive)
    driveWithoutLocation = drive.drop(columns=["Longitude", "Latitude"])
#     driveWithoutLocation = driveWithoutLocation.drop(columns=["ZAcceleration", "LongAcceleration", "LatAcceleration"])
    drivesWithoutLocation.append(driveWithoutLocation)
print("Found", len(drivesWithoutLocation), "drives that meet specifications")

Found 3 drives that meet specifications


## Normalize Data

In [73]:
# Standardise every drive independently: a fresh StandardScaler is fitted per drive.
# Column 0 is then overwritten with its raw, unscaled values
# (NOTE(review): later cells convert column 0 to datetimes, so it is presumably
# the timestamp column - confirm against the CSV schema).
normalizedDrives = []
for rawDrive in drivesWithoutLocation:
    rawValues = rawDrive.values[:]
    scaledFrame = pd.DataFrame(preprocessing.StandardScaler().fit_transform(rawValues))
    scaledFrame[0] = rawValues[:, 0]
    normalizedDrives.append(scaledFrame)

## Smooth Data

In [74]:
# Gaussian-smooth every column of every normalised drive.
# The smoothed columns are reassembled column-wise so each result keeps the same
# orientation (rows = samples, columns = features), index and column labels as
# its input. Building the frame from a plain list of 1-D arrays would instead
# make every feature a *row*, i.e. return the drive transposed.
# NOTE(review): column 0 (restored to raw values during normalisation) is
# smoothed like any other column - confirm that is intended.
smoothDrives = []
for drive in normalizedDrives:
    smoothedColumns = {
        feature: gaussian_filter(drive[feature].to_numpy(), sigma=2)
        for feature in drive.columns
    }
    smoothDrives.append(pd.DataFrame(smoothedColumns, index=drive.index, columns=drive.columns))

## Window Data

In [75]:
sequenceLength = 10  # number of consecutive rows per window

# Cut each normalised drive into overlapping windows of sequenceLength rows,
# advancing one row at a time. Window starts run over
# range(len(drive) - sequenceLength), so a drive shorter than sequenceLength
# yields an empty array.
windowedDrives = []
for normalizedDrive in normalizedDrives:
    nWindows = len(normalizedDrive) - sequenceLength
    windows = [normalizedDrive[start:start + sequenceLength] for start in range(nWindows)]
    windowedDrives.append(np.array(windows))

## Add histogram features

In [76]:
# Paths of all raw CAN message logs found under ./raw_data. Sorted because rglob
# does not guarantee an ordering, and lookups against this list return the first
# match - the result should not depend on filesystem listing order.
CANFiles = sorted(str(canPath) for canPath in Path("./raw_data").rglob("*_CAN_Messages.csv"))

In [77]:
def getAssociatedCANFile(i, drive_ids=None, can_files=None):
    """Return the first CAN log path that contains the ID of drive ``i``.

    Parameters
    ----------
    i : int
        Position of the drive in ``drive_ids``.
    drive_ids : sequence of str, optional
        Drive identifiers. Defaults to the notebook-level ``driveIDs``.
    can_files : iterable of str, optional
        Candidate CAN log paths. Defaults to the notebook-level ``CANFiles``.

    Returns
    -------
    str
        The first entry of ``can_files`` that contains the drive ID as a substring.

    Raises
    ------
    FileNotFoundError
        If no candidate path contains the drive ID. Raising here replaces an
        implicit ``None`` return, which would only fail later with a less
        helpful error when the result is handed to ``pd.read_csv``.
    """
    if drive_ids is None:
        drive_ids = driveIDs
    if can_files is None:
        can_files = CANFiles
    driveID = drive_ids[i]
    for path in can_files:
        if driveID in path:
            return path
    raise FileNotFoundError("No CAN file found for drive " + repr(driveID))

In [89]:
# CAN message IDs used as histogram bins: the histogram cell counts, for each ID
# listed here and in this order, how many messages with that MessageID fall
# inside a window (0 when the ID does not occur).
ids = (1541,387,1552,1041,1553,1042,1556,1557,1044,544,1568,1056,1571,36,37,550,1059,552,1063,1570,1572,388,1941,562,1076,565,1077,1592,1082,1595,1084,1085,1086,1594,576,577,578,579,580,581,582,583,584,1088,1089,1948,1949,1104,1114,1952,608,610,1132,1649,1956,643,1161,1162,1163,1164,1165,1166,1167,658,1172,1696,1940,170,180,186,705,1973,1228,1745,1235,1237,728,740,742,743,1775,1264,1779,761,1786,1787,764,765,1788,1279,1789,1808,1809,1816,1817,800,291,296,810,812,814,304,1841,818,1840,1846,824,1848,829,830,835,836,352,865,353,869,870,871,877,1904,881,882,885,1912,889,896,384,898,385,900,386,902,391,392,905,393,394,395,396,397,398,399,401,400,402,403,404,918,405,406,921,407,408,409,410,411,412,413,414,415,416,417,933,934,935,418,419,426,389,390,1960,1964,944,945,436,437,438,951,439,440,441,955,956,442,443,1981,1986,452,1990,1994,1998,464,976,466,467,977,978,979,2004,2000,2008,987,2012,998,999,1000,1001,1002,1014,1017,1020)

In [91]:
debugNum = 10  # only the first debugNum windows of each drive are processed

# histogramFeatures[k] holds one count vector per processed window of drive k;
# each vector is ordered like `ids`. The counts are collected here because
# otherwise each window's vector would be overwritten by the next and never kept.
histogramFeatures = []
for i,drive in enumerate(windowedDrives):
    associatedCANPath = getAssociatedCANFile(i)
    associatedCAN = pd.read_csv(associatedCANPath)
    # Time values are scaled by 1e9 before conversion, i.e. treated as seconds
    # turned into nanoseconds (presumably Unix epoch seconds - confirm).
    datetimes = pd.to_datetime(associatedCAN.Time * (10**9))
    # Sort by timestamp: label-based slicing of a DatetimeIndex with bounds that
    # are not exact index entries requires a monotonic index.
    associatedCAN = associatedCAN.set_index(datetimes).sort_index()
    driveHistograms = []
    for window in drive[:debugNum]:
        # Column 0 of the first and last row of the window gives its time span.
        windowStartTime = pd.to_datetime(window[0][0] * (10**9))
        windowEndTime = pd.to_datetime(window[-1][0] * (10**9))
        counts = Counter(associatedCAN.loc[windowStartTime:windowEndTime].MessageID)
        # Counter returns 0 for IDs that did not occur in the window.
        driveHistograms.append([counts[messageID] for messageID in ids])
    histogramFeatures.append(np.array(driveHistograms))
        
        

211
211
211
211
211
211
211
211
211
211
211
211
211
211
211
211
211
211
211
211
213
213
213
213
213
213
213
213
213
213


## Generate Labels

In [8]:
labelOffset = 5  # rows after the window's first row at which the target position is read

# Pair every window with a label: the change in (Longitude, Latitude) between
# the window's first row and the row labelOffset positions later.
# The last window of each drive is skipped (drive[:-1]).
dataset = {"samples":[], "labels":[]}
for k,drive in enumerate(windowedDrives):
    # Extract the coordinate columns once per drive instead of materialising
    # two row Series with .iloc for every single window.
    longitudes = drivesWithLocation[k].Longitude.to_numpy()
    latitudes = drivesWithLocation[k].Latitude.to_numpy()
    for i,window in enumerate(drive[:-1]):
        dataset["samples"].append(window)
        dataset["labels"].append([
            longitudes[i + labelOffset] - longitudes[i],
            latitudes[i + labelOffset] - latitudes[i],
        ])

### Normalize labels

In [9]:
# Scale the raw coordinate deltas by a fixed factor of 10**5.
# (A fitted MinMaxScaler was an earlier alternative; the constant factor is
# what is in use.)
originalLabels = dataset["labels"]
labels_normalized = (10**5) * np.asarray(originalLabels)

## Smooth Labels

In [10]:
sigma = 8  # standard deviation of the Gaussian kernel, in label rows

# The labels of all drives are stored back to back in labels_normalized.
# Smooth each drive's segment separately; filtering the concatenated array in
# one pass would blend the last labels of one drive with the first labels of
# the next, although the drives are unrelated recordings.
# Each drive contributed one label per window except its last (drive[:-1]).
labelsPerDrive = [max(len(drive) - 1, 0) for drive in windowedDrives]
assert sum(labelsPerDrive) == len(labels_normalized), "labels do not match windowedDrives"

smoothedSegments = []
segmentStart = 0
for nLabels in labelsPerDrive:
    segment = labels_normalized[segmentStart:segmentStart + nLabels]
    segmentStart += nLabels
    if nLabels == 0:
        continue
    smoothedSegments.append(np.column_stack((
        gaussian_filter(segment[:,0], sigma=sigma),
        gaussian_filter(segment[:,1], sigma=sigma),
    )))

smoothedLabels = np.vstack(smoothedSegments)
smoothedLongitudeLabels = smoothedLabels[:,0]
smoothedLatitudeLabels = smoothedLabels[:,1]

# scatterData = pd.DataFrame({
# "index":range(len(smoothedLongitudeLabels)),
# "smoothLongLabels": smoothedLongitudeLabels,
# "originalLongLabels": labels_normalized[:,0],
# "smoothLatLabels": smoothedLatitudeLabels,
# "originalLatLabels": labels_normalized[:,1],
# })
# scatterData.describe()

# fig1 = go.Figure()
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.smoothLongLabels, name="gaussian smoothed long"))
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.originalLongLabels, name="original long"))
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.smoothLatLabels, name="gaussian smoothed lat"))
# fig1.add_trace(go.Scatter(x=scatterData.index, y=scatterData.originalLatLabels, name="original lat"))

# fig1.show()

In [11]:
# Final dataset: the windows stacked into a single array, paired with the
# smoothed labels.
samples, labels = np.stack(dataset["samples"]), smoothedLabels
dataset = dict(samples=samples, labels=labels)

### Pickle

In [12]:
# Binary mode is required for pickle. The file is opened with 'wb' rather than
# 'ab': in append mode every re-run adds another pickle to the end of the file,
# while a single pickle.load only returns the first (oldest) object stored in it.
# The with-block closes the file even if dumping fails.
with open('otherLargeFiles/CNN-dataset.pkl', 'wb') as dbfile:
    # source, destination
    pickle.dump(dataset, dbfile)

## Other