To generate the dataset,
we sampled k centers from a 15-dimensional spherical Gaussian
distribution with mean at the origin and variance R ∈
{1, 10, 100}. We then added points from Gaussian distributions
of unit variance around each center. Given the k centers,
this is a mixture of k spherical Gaussians with equal
weights. Note that the Gaussians are separated in terms
of probability mass — even if only marginally for the case
R = 1 — and therefore the value of the optimal k-clustering
can be well approximated using the centers of these Gaussians.
The number of sampled points from this mixture of
Gaussians is n = 10,000.

In [1]:
%%file simulatedData.py
import numpy as np
def generate_centers(k, var, dim=15):
    """Draw k cluster centers from a zero-mean spherical Gaussian.

    Parameters
    ----------
    k : int
        Number of centers to draw.
    var : float
        Variance of every coordinate; the covariance matrix is ``var * I``.
    dim : int, optional
        Dimensionality of the centers. Defaults to 15, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, dim)`` with one center per row.
    """
    mean = np.zeros(dim)
    cov = np.eye(dim) * var
    return np.random.multivariate_normal(mean, cov, k)

def generate_data(k, var, n=10000):
    """Sample a data set from an equal-weight mixture of k spherical Gaussians.

    The k centers are drawn with ``generate_centers(k, var)``. Every other
    point is produced by picking one of the centers uniformly at random and
    adding unit-variance Gaussian noise to it.

    Parameters
    ----------
    k : int
        Number of mixture components (centers).
    var : float
        Variance of the Gaussian the centers are drawn from.
    n : int, optional
        Total number of rows returned, centers included. Defaults to 10000.

    Returns
    -------
    numpy.ndarray
        Array with ``n`` rows. The first ``k`` rows are the centers
        themselves and the remaining ``n - k`` rows are the sampled points.

    Raises
    ------
    ValueError
        If ``k`` is not in the range ``1..n``.
    """
    if not 1 <= k <= n:
        raise ValueError(
            "k must satisfy 1 <= k <= n (got k=%r, n=%r)" % (k, n)
        )

    centers = generate_centers(k, var)
    n_points = n - k

    # Sample the mixture directly: choose a component for each point, then
    # perturb that component's center. This avoids building a 10000*k pool and
    # subsampling it with replacement, which produced duplicate rows and (via
    # the placeholder-row deletion inside the loop) silently dropped samples.
    labels = np.random.randint(k, size=n_points)
    noise = np.random.standard_normal((n_points, centers.shape[1]))
    points = centers[labels] + noise

    # Centers come first so the result has exactly n rows in total.
    sampData = np.concatenate((centers, points), axis=0)
    return sampData

Overwriting simulatedData.py


In [3]:
import simulatedData
from simulatedData import generate_data
# Smoke test: build one data set with k=5 centers drawn with variance 1.
dataTest = generate_data(5, 1)
# Number of rows in the result (echoed by the notebook as the cell output).
len(dataTest)

10000

In [4]:
# Echo the generated array so its contents can be inspected.
dataTest

array([[ 0.094245  ,  1.24473561, -1.47416355, ..., -0.44821048,
         1.07509292,  1.23586155],
       [-0.00389113,  1.93824107,  1.29914844, ..., -0.21635323,
         0.52604057, -0.26825916],
       [ 0.89306542,  0.29696635,  0.58920778, ..., -1.85260025,
        -0.53139692, -0.03838707],
       ..., 
       [ 0.25025571,  0.39828788,  1.19196379, ..., -1.63085116,
        -0.33179029, -0.58100716],
       [-0.77403546,  2.55487751, -2.87273788, ...,  0.31526968,
        -1.15896694,  0.32706329],
       [-1.03579483,  1.41657981, -1.47368661, ..., -1.10863183,
        -0.0482546 , -0.08539769]])