In [3]:
import numpy as np
import matplotlib.pyplot as plt


from optigrid import optigrid



# Build the synthetic data set: two 3-D normal populations plus uniform noise.
# Population 1: 10000 points around (-5, -5, 1), narrow along the third axis.
normal1_mean = [-5, -5, 1]
normal1_cov = [[1, 0, 0], [0, 1, 0], [0, 0, 0.05]]
normal1_samples = 10000
normal1 = np.random.multivariate_normal(normal1_mean, normal1_cov, normal1_samples)

# Population 2: 20000 points around (5, 0, -1) with the same covariance.
normal2_mean = [5, 0, -1]
normal2_cov = [[1, 0, 0], [0, 1, 0], [0, 0, 0.05]]
normal2_samples = 20000
normal2 = np.random.multivariate_normal(normal2_mean, normal2_cov, normal2_samples)

# Uniform background noise over the cube [-10, 10)^3.
noise_low = [-10, -10, -10]
noise_high = [10, 10, 10]
noise_samples = 10000
noise = np.random.uniform(noise_low, noise_high, (noise_samples, 3))

# Only the two normal populations are clustered; `noise` is drawn above but is
# deliberately left out of `data` (and therefore has no entry in `weights`).
data = np.concatenate((normal1, normal2))

# Weight the samples from the first population twice as high
weights = np.repeat([2, 1], [normal1_samples, normal2_samples])

# Standard-scale every dimension (zero mean, unit standard deviation per column).
# Not strictly necessary, but recommended for better selection of the parameters
# and uniform importance of the dimensions.
data_scaled = (data - data.mean(axis=0)) / data.std(axis=0)

# Next, choose the Optigrid parameters
d = 3  # Number of dimensions
q = 1  # Number of cutting planes per step
max_cut_score = 0.3
noise_level = 0.1
bandwidth = 0.1  # Handed to Optigrid as `kde_bandwidth`

# Fit Optigrid to the scaled data, using the per-sample weights
optigrid_test = optigrid.Optigrid(
    d=d,
    q=q,
    max_cut_score=max_cut_score,
    noise_level=noise_level,
    kde_bandwidth=bandwidth,
    verbose=True,
)
optigrid_test.fit(data_scaled, weights=weights)
### Output (recorded from one run; exact numbers vary with the random draw):
###     Found following cuts: [(-0.41783537483156374, 0, 2.236518682454137e-05)]
###     Evaluating subgrid: 50.00% of datapoints
###     Found cluster 0: 50.00% of datapoints
###     Evaluating subgrid: 50.00% of datapoints
###     Found cluster 1: 50.00% of datapoints
###     Optigrid found 2 clusters.

# Report per-dimension statistics for every cluster that was found.
# Clusters are stored as indices pointing to the original data, so the rows are
# looked up in the unscaled `data` and the statistics are in original units.
for i, cluster in enumerate(optigrid_test.clusters):
    cluster_data = data.take(cluster, axis=0)
    cluster_mean = cluster_data.mean(axis=0)
    cluster_std = cluster_data.std(axis=0)
    print("Cluster {}: Mean={}, Std={}".format(i, cluster_mean, cluster_std))
### Output (recorded from one run; exact numbers vary with the random draw):
###     Cluster 0: Mean=[-5.00335437 -4.98570265  0.9999046 ], Std=[0.99891199 0.99446087 0.22298978]
###     Cluster 1: Mean=[ 5.00077602e+00  1.15342401e-03 -9.97732119e-01], Std=[1.00359211 0.9982854  0.22250921]

# Draw 10 values from each of the two normals and score them with Optigrid.
sample_size = 10
sample1 = np.random.multivariate_normal(normal1_mean, normal1_cov, sample_size)
sample2 = np.random.multivariate_normal(normal2_mean, normal2_cov, sample_size)
sample = np.concatenate((sample1, sample2))

# Normalize the new samples exactly like the training data: per dimension
# (axis=0), using the mean and standard deviation of the original `data`.
# Without axis=0 a single scalar mean/std over all dimensions would be used,
# which puts the samples in a different coordinate system than the fitted model.
sample = (sample - np.mean(data, axis=0)) / np.std(data, axis=0)

result = optigrid_test.score_samples(sample)
print(result)
### Output:
###     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
### The first ten values belong to cluster 0 and the last ten to cluster 1, as expected

Found following cuts: [(-0.41783537483156374, 0, 2.236518682454137e-05)]
Evaluating subgrid: 50.00% of datapoints
Found cluster 0: 50.00% of datapoints
Evaluating subgrid: 50.00% of datapoints
Found cluster 1: 50.00% of datapoints
Optigrid found 2 clusters.
Cluster 0: Mean=[-5.00335437 -4.98570265  0.9999046 ], Std=[0.99891199 0.99446087 0.22298978]
Cluster 1: Mean=[ 5.00077602e+00  1.15342401e-03 -9.97732119e-01], Std=[1.00359211 0.9982854  0.22250921]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
