In [205]:
%matplotlib inline
# %matplotlib notebook
%config Completer.use_jedi = False
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


from scipy.special import logsumexp
from scipy.special import softmax
from scipy.special import betaln
from scipy.special import beta

import numpy.linalg as linalg
from sklearn.cluster import KMeans
from sklearn import metrics
import scipy.sparse as sparse
import random

from collections import Counter

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_blobs

In [3]:
# Fixed seed and the NumPy Generator built from it.
random_seed = 123
rng = np.random.default_rng(seed=random_seed)

# Shapes

In [221]:
from skimage.draw import disk, rectangle, polygon

def generate_shape(shape, std=0.1, size=10, rng=None):
    """Render a noisy ``size`` x ``size`` image containing one geometric shape.

    The shape is drawn with value 1 on a zero background around a slightly
    jittered centre ``(center1, center2)`` (row, column), and Gaussian noise
    with standard deviation ``std`` is added to every pixel.

    Parameters
    ----------
    shape : str
        One of ``'circle'``, ``'square'``, ``'triangle'``, ``'plus'``, ``'bar'``.
    std : float
        Standard deviation of the additive pixel noise.
    size : int
        Side length of the square image.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted the legacy global ``np.random``
        state is used, so ``np.random.seed`` keeps controlling the output.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(size, size)``: the clean image plus noise.

    Raises
    ------
    ValueError
        If ``shape`` is not one of the supported names.
    """
    # Both ``np.random`` and a Generator expose ``uniform`` and ``normal``.
    source = np.random if rng is None else rng

    img = np.zeros((size, size), dtype=np.float32)
    # NOTE(review): ``-size//15`` parses as ``(-size)//15`` (floor division), so
    # for size=10 the jitter range is [-1, 0) rather than a symmetric interval.
    # Left unchanged because the intended range is unclear -- confirm.
    center1 = np.round(size // 2 + source.uniform(-size//15, size//15)).astype(np.int16)
    center2 = np.round(size // 2 + source.uniform(-size//15, size//15)).astype(np.int16)

    if shape == 'circle':
        # Use both coordinates; the column jitter was previously ignored.
        rr, cc = disk((center1, center2), radius=size//3, shape=img.shape)
        img[rr, cc] = 1
    elif shape == 'square':
        s = size // 3
        rr, cc = rectangle((center1-s, center2-s), extent=(2*s, 2*s), shape=img.shape)
        img[rr, cc] = 1
    elif shape == 'triangle':
        s = size // 3
        points = [(center1, center2 - s), (center1 - s, center2 + s), (center1 + s, center2 + s)]
        rr, cc = polygon([p[0] for p in points], [p[1] for p in points], shape=img.shape)
        img[rr, cc] = 1
    elif shape == 'plus':
        t = 1
        # Clamp the start index: a negative start would wrap around to the
        # far edge of the image instead of being clipped.
        img[max(center1 - t, 0):center1 + t + 1, :] = 1
        img[:, max(center2 - t, 0):center2 + t + 1] = 1
    elif shape == "bar":
        t = 1
        # Vertical bar of constant width 2*t + 1 around the column centre.
        # Mixing center1 and center2 here made the width vary between draws.
        img[:, max(center2 - t, 0):center2 + t + 1] = 1
    else:
        raise ValueError("Unknown shape")

    return img + source.normal(0, std, size=img.shape)

In [224]:
# Shape classes to render; a class label is the index of its name in this list.
shapes = ['square', 'triangle', 'bar', 'plus']

# 1000 noisy images per class, appended class by class.
X, y = [], []
for idx, shape in enumerate(shapes):
    X.extend(generate_shape(shape) for _ in range(1000))
    y.extend([idx] * 1000)

In [225]:
# Stack the images and flatten each one into a single feature row;
# convert the labels to an array as well.
X = np.array(X)
X = X.reshape(X.shape[0], -1)
y = np.array(y)

In [226]:
# Hold out 20% of the samples, stratified on the labels, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [227]:
# Boolean features: True where the pixel value exceeds 0.5.
X_train_bin, X_test_bin = (split > 0.5 for split in (X_train, X_test))

In [228]:
# Single-step preprocessing pipeline. The step is labelled 'scaler', but
# Normalizer rescales each sample (row) on its own rather than each feature.
preprocessing_steps = [('scaler', Normalizer())]
pipeline = Pipeline(steps=preprocessing_steps)

# Fit on the training split only, then apply the same transform to the test split.
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

In [229]:
name="shapes"

# (destination, array) pairs, written in order: the normalised arrays go to the
# Gaussian dataset folder, the thresholded arrays to the Bernoulli folder.
# Both folders get the same label arrays.
outputs = [
    (f"./../Datasets/Gaussian/Processed/{name}/X_train_{name}.npy", X_train_prepared),
    (f"./../Datasets/Gaussian/Processed/{name}/X_test_{name}.npy", X_test_prepared),
    (f"./../Datasets/Gaussian/Processed/{name}/y_train_{name}.npy", y_train),
    (f"./../Datasets/Gaussian/Processed/{name}/y_test_{name}.npy", y_test),
    (f"./../Datasets/Bernoulli/Processed/{name}/X_train_{name}.npy", X_train_bin),
    (f"./../Datasets/Bernoulli/Processed/{name}/X_test_{name}.npy", X_test_bin),
    (f"./../Datasets/Bernoulli/Processed/{name}/y_train_{name}.npy", y_train),
    (f"./../Datasets/Bernoulli/Processed/{name}/y_test_{name}.npy", y_test),
]

for path, array in outputs:
    np.save(path, array)

# Clusters

In [232]:
from scipy.stats import wishart, multivariate_normal

In [242]:
def generate_wishart_covariances(K, D, df=None, rng=None):
    """Sample ``K`` random ``D`` x ``D`` covariance matrices.

    Each matrix is drawn independently from a Wishart distribution with
    ``df`` degrees of freedom and the identity as scale matrix.

    Parameters
    ----------
    K : int
        Number of matrices to draw.
    D : int
        Dimension of each matrix.
    df : int or float, optional
        Degrees of freedom; defaults to ``D + 2``. SciPy rejects values
        with ``df <= D - 1``.
    rng : numpy.random.Generator, optional
        Source of randomness; a fresh, unseeded generator is created when
        omitted.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(K, D, D)``, including for ``K == 0`` and ``D == 1``.
    """
    if rng is None:
        rng = np.random.default_rng()
    if df is None:
        df = D + 2

    # The distribution is the same for every draw, so freeze it once instead
    # of rebuilding it (and the identity scale) on each iteration.
    dist = wishart(df=df, scale=np.eye(D), seed=rng)

    # ``rvs`` squeezes its output (a 1x1 draw comes back as a scalar), so put
    # every sample back into an explicit (D, D) matrix.
    covs = [np.reshape(dist.rvs(), (D, D)) for _ in range(K)]

    # The final reshape also gives the empty case a well-defined (0, D, D) shape.
    return np.array(covs, dtype=float).reshape(K, D, D)

def generate_means(covariances, k=0.01, rng=None):
    """Sample one mean vector per covariance matrix.

    For every covariance ``cov`` the mean is drawn from a zero-centred
    multivariate normal with covariance ``cov / k``.

    Parameters
    ----------
    covariances : array-like of shape (K, D, D)
        Covariance matrices, one per cluster.
    k : float, optional
        Positive divisor applied to each covariance; smaller values spread
        the means further apart. Defaults to ``0.01``.
    rng : numpy.random.Generator, optional
        Source of randomness; a fresh, unseeded generator is created when
        omitted.

    Returns
    -------
    list of numpy.ndarray
        ``K`` mean vectors, each of shape ``(D,)``.

    Raises
    ------
    ValueError
        If ``covariances`` is not of shape ``(K, D, D)`` or ``k`` is not
        positive.
    """
    covariances = np.asarray(covariances)
    if covariances.ndim != 3 or covariances.shape[1] != covariances.shape[2]:
        raise ValueError("covariances must have shape (K, D, D)")
    if k <= 0:
        raise ValueError("k must be positive")
    if rng is None:
        rng = np.random.default_rng()

    m_0 = np.zeros(covariances.shape[-1])

    return [rng.multivariate_normal(m_0, cov / k) for cov in covariances]



In [243]:
# Ten clusters in ten dimensions: draw the covariances first, then one mean
# per covariance.
covariances = generate_wishart_covariances(K=10, D=10)
means = generate_means(covariances=covariances)

In [256]:
# Number of points drawn from every Gaussian component.
n_per_cluster = 500

# Build the generator from the notebook-level `random_seed` so the sampled
# dataset is reproducible; an unseeded generator here would produce a
# different dataset on every run.
rng = np.random.default_rng(random_seed)

X = []
y = []

for idx, (cov, mean) in enumerate(zip(covariances, means)):
    X.append(rng.multivariate_normal(mean, cov, size=n_per_cluster))
    y.append(np.full(n_per_cluster, idx))

X = np.vstack(X)  # shape: (K * n_per_cluster, D)
y = np.concatenate(y)  # shape: (K * n_per_cluster,)

In [257]:
# Hold out 20% of the samples, stratified on the cluster labels, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [258]:
name = "synthetic"

# (destination, array) pairs for the Gaussian dataset folder, written in order.
outputs = [
    (f"./../Datasets/Gaussian/Processed/{name}/X_train_{name}.npy", X_train),
    (f"./../Datasets/Gaussian/Processed/{name}/X_test_{name}.npy", X_test),
    (f"./../Datasets/Gaussian/Processed/{name}/y_train_{name}.npy", y_train),
    (f"./../Datasets/Gaussian/Processed/{name}/y_test_{name}.npy", y_test),
]

for path, array in outputs:
    np.save(path, array)

# Synthetic Bernoulli

In [None]:
def generate_bernoulli_parameters(K, D, rng=None):
    """Draw a ``(K, D)`` matrix of Bernoulli success probabilities.

    Every entry is sampled independently and uniformly from ``[0.1, 0.9)``.
    ``rng`` is the generator to draw from; a fresh, unseeded
    ``np.random.default_rng()`` is used when it is not supplied.
    """
    generator = rng if rng else np.random.default_rng()
    probabilities = generator.uniform(low=0.1, high=0.9, size=(K, D))
    return probabilities
def generate_bernoulli_mixture_data(θ, n_per_cluster=1000, rng=None):
    """Sample binary data from a mixture of independent Bernoulli components.

    ``θ`` is a ``(K, D)`` array whose row ``k`` holds the per-feature success
    probabilities of cluster ``k``. ``n_per_cluster`` rows are drawn for each
    cluster, in cluster order, from ``rng`` (a fresh, unseeded generator when
    it is not supplied).

    Returns ``(X, y)``: ``X`` has shape ``(K * n_per_cluster, D)`` with 0/1
    entries and ``y`` has shape ``(K * n_per_cluster,)`` with cluster indices.
    """
    generator = rng if rng else np.random.default_rng()
    n_clusters, n_features = θ.shape

    draws = []
    cluster_ids = []
    for cluster in range(n_clusters):
        # A single trial per entry (n=1) makes each binomial draw a Bernoulli one.
        block = generator.binomial(n=1, p=θ[cluster], size=(n_per_cluster, n_features))
        draws.append(block)
        cluster_ids.append(np.full(n_per_cluster, cluster))

    return np.vstack(draws), np.concatenate(cluster_ids)
