In [None]:
# Experiments with UMAP  21-Oct-2022 J.Beale
# based on https://umap-learn.readthedocs.io/en/latest/basic_usage.html

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

%matplotlib inline


In [None]:
# Notebook-wide plotting defaults: white seaborn style and large default figures.
figure_defaults = {'figure.figsize': (14, 10)}
sns.set(style='white', context='notebook', rc=figure_defaults)

In [None]:
# --- Training-data selection -------------------------------------------------
# The CSV is consumed in consecutive batches of `bsize` rows; `group` picks
# which batch (1-based) becomes the training set.
group = 3
bsize = 15000
stdThresh = 8                 # keep only rows whose "std" column is below this
maxlines = group * bsize      # read no further than the end of the chosen batch

# Alternative training file used previously: "cars-7390.csv"
carsR = pd.read_csv("data/cars-2021.csv", nrows=maxlines)  # raw rows up to the end of the batch

first_row = (group - 1) * bsize   # rows before this belong to earlier batches
cars = (
    carsR.iloc[first_row:]                                   # select out this subset
    .loc[lambda frame: frame["std"] < stdThresh]             # remove rows with std.dev. too high
    .assign(direction=lambda frame: np.sign(frame["v2"]))    # new column: sign of v2 (-1, +1)
)
# cars.head()

In [None]:
# cars = cars.dropna()      # get rid of any N/A values
# cars.hour.value_counts()  # display how many events at each hour of day

In [None]:
# sns.pairplot(cars, vars=["v2", "std", "minY", "pixels"], hue='v2');  # slow

In [None]:
import umap                 # this cell takes a while
import umap.plot

In [None]:

# Columns that make up the feature matrix handed to UMAP.
# ("day", "hour" and "minute" are currently left out.)
feature_columns = [
    "frames",
    "v1",
    "v2",
    "v3",
    "std",
    "minY",
    "pixels",
    "direction",
]
cars_data = cars[feature_columns].values

# StandardScaler.fit(X) only learns the per-column parameters and keeps them
# on the scaler object; transform(X) then applies them. The fitted scaler is
# kept in `Fit` so the same parameters can be applied to other data later.
Fit = StandardScaler()
Fit.fit(cars_data)
scaled_cars_data = Fit.transform(cars_data)


In [None]:
def ishow(dmap, labels, n):
    """Display an interactive UMAP plot with a hover tooltip for each point.

    Parameters
    ----------
    dmap : fitted mapper exposing ``embedding_`` (indexed here as a 2-D array
        whose first two columns are used as x and y).
    labels : sequence of per-point values; used to colour the plot and shown
        in the tooltip.
    n : int
        Number of leading points for which tooltip rows are built.

    The tooltip 'index' column counts upward from 2.
    NOTE(review): presumably this offset is meant to line up with 1-based
    line numbers of a CSV file that has a header row -- confirm.
    """
    xy = dmap.embedding_[:n]
    tooltip_columns = {
        'index': np.arange(2, n + 2),
        'label': labels[:n],
        'x': xy[:, 0],
        'y': xy[:, 1],
    }
    hover_data = pd.DataFrame(tooltip_columns)

    plot = umap.plot.interactive(
        dmap,
        labels=labels,
        hover_data=hover_data,
        point_size=8,
        theme='fire',
    )
    umap.plot.show(plot)
    

In [None]:
# Train the UMAP model on the standardized training features.
# A fixed random_state is passed so repeated runs use the same seed.
mapper = umap.UMAP(
    n_neighbors=45,
    min_dist=0.05,
    init='spectral',
    random_state=42,
)
mapper.fit(scaled_cars_data)  # this is a little slow

In [None]:
carA = cars.to_numpy()      # whole table as a NumPy array (other cells index into it)
img_count = carA.shape[0]   # how many total cars in data set

# Pick the colouring variables by column NAME rather than by position in
# carA, so the plots stay correct if the CSV column order ever changes.
# Other columns that can be used the same way: "v1", "v3", "std", "minY".
v2 = cars["v2"].to_numpy()      # v2 (velocity; its sign is the "direction" column)
sz = cars["pixels"].to_numpy()  # pixels (size of detected motion area)

ishow(mapper, v2, img_count)  # interactive plot of the training data, coloured by v2
ishow(mapper, sz, img_count)  # interactive plot of the training data, coloured by size

In [None]:
orig_embedding = mapper.transform(scaled_cars_data)   # original training data in map

# The background colour must be configured BEFORE the axes are created:
# rcParams are read when an Axes is constructed, so setting it afterwards
# leaves this figure with the default background.
plt.rcParams['axes.facecolor'] = 'black'  # set matplotlib background color
fig, ax = plt.subplots(figsize=(12, 12))

# plot training data, coloured by column 6 of carA
ax.scatter(orig_embedding[:, 0], orig_embedding[:, 1], c=carA[:, 6], s=2, cmap='Spectral')
ax.set_xlim([-8, 16])
ax.set_ylim([-3, 16]);   # trailing ';' keeps the limits tuple out of the cell output


In [None]:
# === Now, let's load new test data, and see how it fits into the map

In [None]:
# %%script false --no-raise-error   # temporarily disable this cell

# Test data from CSV file. Other files that have been tried here:
#   "data/mailman.csv", "data/cars-2310.csv"
cars2 = (
    pd.read_csv("data/cars-7390.csv")
    .loc[lambda frame: frame["std"] < stdThresh]             # remove rows with std.dev. too high
    .assign(direction=lambda frame: np.sign(frame["v2"]))    # new column: sign of v2 (-1, +1)
)


In [None]:
# %%script false --no-raise-error   # temporarily disable this cell

# Feature matrix for the test set ("day", "hour", "minute" are left out).
cars2_data = cars2.loc[
    :,
    [
        "frames",
        "v1",
        "v2",
        "v3",
        "std",
        "minY",
        "pixels",
        "direction",
    ],
].to_numpy()

# Standardize with the already-fitted scaler `Fit` (no re-fitting on test data).
sc2d = Fit.transform(cars2_data)
scaled_cars2_data = sc2d  # long and short name refer to the same array

In [None]:
%%script false --no-raise-error   # temporarily disable this cell

# Synthesize a random dataset resembling standardized data: every generated
# column is drawn from a normal distribution with mean 0 and std.dev. 1.
n_random_rows = 15000
random_features = np.random.normal(0, 1.0, size=(n_random_rows, 7))  # 7 random feature columns

# 8th column: direction of motion (-1, +1), derived from the sign of column 2
carDir = np.sign(random_features[:, 2]).reshape((n_random_rows, 1))
sc2d = np.append(random_features, carDir, 1)  # add the new column onto the array

# Both names must refer to the full 8-column array: `scaled_cars2_data` is
# what gets passed to mapper.transform(), and the mapper was fitted on 8 features.
scaled_cars2_data = sc2d


In [None]:
# Project the standardized test rows into the already-trained map (slow).
test_embedding = mapper.transform(X=scaled_cars2_data)

In [None]:
# Display the new data based on the trained UMAP embedding

# index: 1 2 3 4 5  6  7  8   9    10
# value: D H M S v1 v2 v3 std minY size

# Black plot background; set before the axes are created.
plt.rcParams['axes.facecolor'] = 'black'

fig, ax = plt.subplots(figsize=(12, 12))

# Points are coloured by column 2 of the standardized test features.
# Alternative colourings tried earlier (these need car2A = cars2.to_numpy()):
#   c=car2A[:,10], c=car2A[:,9], c=car2A[:,8]
test_x = test_embedding[:, 0]
test_y = test_embedding[:, 1]
ax.scatter(test_x, test_y, c=sc2d[:, 2], s=2, cmap='Spectral')
ax.set_xlim([-8, 16])
ax.set_ylim([-3, 16])

In [None]:
# Find the closest example to a specific point in data mapping
from scipy import spatial

def showNearest(A,pt):
    """Print the point of ``A`` closest to ``pt``.

    A  : array of points (one row per point) to search.
    pt : query coordinates.

    Prints the coordinates of the nearest row of ``A``, its distance from
    ``pt`` and its row index in ``A``. Nothing is returned.
    NOTE(review): the row index only matches a CSV line number if ``A`` has
    one row per CSV line in the original order -- confirm before relying on it.
    """
    tree = spatial.KDTree(A)
    nearest_dist, nearest_idx = tree.query(pt)   # distance and row index of the closest point
    print("Map coords: ", A[nearest_idx], end="")   # coordinates of that point
    summary = " Dist: %5.3f  Index %d" % (nearest_dist, nearest_idx)
    print(summary)

# For each of the first (up to) 6000 test points, find the training point that
# lies closest to it in the 2-D map, then compare the two rows' standardized
# features column by column.

# pt = [4.94, 0.724]  # choose a point in the output space
n_query = min(6000, test_embedding.shape[0])   # a small test set must not raise IndexError
indexIn = np.arange(n_query)
pt = test_embedding[indexIn, :]
dist, indexOut = spatial.KDTree(orig_embedding).query(pt)  # distance & index of nearest point

# Print floats with an explicit sign and 3 decimals (this changes NumPy's
# print options for the rest of the session).
float_formatter = "{:+0.3f}".format
np.set_printoptions(formatter={'float_kind': float_formatter})

print(" frames,   v1,    v2,    v3,  stdev,  minY,    sz,   dir")

diff = sc2d[indexIn, :] - scaled_cars_data[indexOut]   # test row minus its nearest training row
d2 = diff ** 2                                          # squared difference per feature
d2s = np.sum(d2, axis=1)                                # summed squared difference per test point

# NOTE: the printed numbers are per-column means of the SQUARED differences.
print((np.average(d2, axis=0)), " mean difference")

#  frames,   v1,    v2,    v3,  stdev,  minY, size,   dir    (mean differences)
# [+0.134 +0.074 +0.048 +0.067 +0.111 +0.212 +0.140 +0.003]  real data pair #1
# [+0.109 +0.065 +0.044 +0.069 +0.082 +0.198 +0.159 +0.002]  real data pair #2
# [+0.153 +0.071 +0.046 +0.066 +0.112 +0.236 +0.144 +0.001]  real data pair #3
# [+0.131 +0.072 +0.044 +0.067 +0.102 +0.227 +0.144 +0.000]  real data pair #4
# [+0.147 +0.071 +0.045 +0.067 +0.095 +0.205 +0.134 +0.001]  real data pair #5
# [+0.650 +1.004 +0.948 +0.963 +0.519 +0.708 +0.489 +1.381]  real paired with random
# [+0.707 +1.040 +0.997 +0.952 +0.546 +0.796 +0.633 +1.387]  random try #2
# [+0.591 +0.988 +0.972 +0.946 +0.530 +0.727 +0.568 +1.399]  random try #3