In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import lstsq

In [2]:
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from scipy.stats import norm
from sklearn.cluster import OPTICS

# Generate Dataset

The data-generating process is:

$y=ax$

where **a** stands for pose, **y** for camera coordinates (camera bearings), and **x** for predicted world coordinates

However, we don't know **x**; we only get a distribution over **x** from some upstream system of the pipeline. So **x** is modeled with a GMM:

$p(x)=\sum^{K}_{k=1}\pi_k N(\mu_k, \sigma_k)$

For simplicity, we assume that every **x** has $K=3$ components. Each datapoint then has $K*\{\pi,\mu,\sigma\}$ parameters, i.e. 9 per datapoint. Therefore the dataset will be:
* **y.shape = (dataset_size,)** - associated camera coordinate (which we know)
* **x.shape = (dataset_size, 9)**

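The image which generated the data was ambiguous, and there are two modes which generated the data (i.e. if our upstream input was an image, then there would be two meaningful poses that we should be able to predict). Note that the code cell below sets `num_modes = 3`, so the generated dataset actually contains three modes.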

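Therefore there will be some datapoints associated with each mode (each mode having its own pose **a**), plus a small fraction of outlier points that belong to no mode.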

In [None]:
# Build a synthetic multi-modal dataset.
# Every inlier is assigned a random mode k and follows y = a[k] * x + b[k] plus
# Gaussian noise; uniformly scattered outliers are appended afterwards and are
# labelled with the extra mode id `num_modes`.
np.random.seed(seed=13)  # fixed seed so the dataset is the same whenever this notebook is run

dataset_size = 3000
num_modes = 3
noise = 0.005  # standard deviation of the Gaussian noise added to the inlier y values
outlier_fraction = 0.1  # number of outliers, expressed relative to dataset_size
# Single source of truth for the outlier count, so x, y and x_mode always stay
# the same length (the labels used to be built with a hard-coded 300).
num_outliers = int(outlier_fraction * dataset_size)

x = np.random.uniform(-1, 1, dataset_size)
x_mode = np.random.choice(num_modes, dataset_size)  # mode id of every inlier

a = np.random.uniform(-1, 1, num_modes)  # one slope per mode
# The range [-0, 0] makes every intercept 0; the call is kept as-is so the
# sequence of random draws (and therefore the seeded dataset) does not change.
b = np.random.uniform(-0, 0, num_modes)

y = np.empty(x.shape)
# get modes
for i in range(num_modes):
    in_mode = x_mode == i  # boolean mask of the points generated by mode i
    y[in_mode] = x[in_mode] * a[i] + b[i] + np.random.normal(scale=noise, size=x[in_mode].shape)

# add some outlier points: uniform in x over [-1, 1) and in y over the inlier y-range
y_tmp = np.random.uniform(y.min(), y.max(), size=num_outliers)
x_tmp = np.random.uniform(-1, 1, num_outliers)
# np.ones yields floats, so x_mode becomes a float array after the concatenation below.
x_mode_tmp = np.ones(num_outliers) * num_modes

x = np.concatenate((x, x_tmp))
y = np.concatenate((y, y_tmp))
x_mode = np.concatenate((x_mode, x_mode_tmp))