In [3]:
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.stats import multivariate_normal, invwishart
def random_data_gen(n_samples=1000, n_feats=10, maha=1.0, psi_diag=1.0, psi_offdiag=0., ddof=150, class_ratio=0.5, seed=None):
    if seed:
        np.random.seed(seed)
    ## initialize multivariate normal dist with normally distributed means and covariance
    ## drawn from an inverse wishart distribution (conjugate prior for MVN)
    norm_means_a = np.random.randn(n_feats)
    norm_means_b = np.zeros_like(norm_means_a)
    psi = psi_diag * np.eye(n_feats) + psi_offdiag * ~np.eye(n_feats).astype(bool)
    nu = n_feats + ddof
    wishart_cov = invwishart(nu, psi).rvs()
    ## specify the mahalanobis distance between the two distributions
    dist = mahalanobis(norm_means_a, norm_means_b, wishart_cov)
    norm_means_a = norm_means_a * (maha / dist)
    assert np.isclose(mahalanobis(norm_means_a, norm_means_b, wishart_cov), maha)
    ## multivariate normal distributions with different means and equal variances
    mvn_a = multivariate_normal(mean=norm_means_a, cov=wishart_cov)
    mvn_b = multivariate_normal(mean=norm_means_b, cov=wishart_cov)
    ## not used, but compute correlations
    corr = (D:=np.diag(1/np.sqrt(np.diag(wishart_cov)))) @ wishart_cov @ D
    ## generate data samples from a multivariate normal
    data = np.vstack([mvn_a.rvs(int(n_samples*class_ratio)), mvn_b.rvs(n_samples - int(n_samples*class_ratio))])
    labels = np.arange(len(data))<int(n_samples*class_ratio)
    return data, labels

In [4]:
from numpy import __version__ as v
v
from scipy import __version__ as v
v

'1.7.3'

In [5]:
data, labels = random_data_gen(n_samples=1000, n_feats=10, maha=1., psi_diag=1.)
data, labels

(array([[-9.15910475e+00, -4.89070990e-02,  2.98483777e+00, ...,
          6.33191789e-01,  1.22825385e+00,  7.74653242e-01],
        [-9.28514918e+00, -1.74422581e-01,  3.13601087e+00, ...,
          7.96529923e-01,  1.05613031e+00,  8.37606569e-01],
        [-9.30889080e+00, -1.38971077e-01,  3.21211546e+00, ...,
          6.46224059e-01,  9.60843439e-01,  7.78035999e-01],
        ...,
        [ 3.03177664e-02, -1.39181033e-01,  3.98278450e-02, ...,
          1.25858704e-01, -1.80826564e-01, -3.23349127e-03],
        [ 7.21363601e-02,  1.13799991e-01, -4.87543905e-02, ...,
         -1.19678948e-01,  8.00650982e-02,  3.45938422e-02],
        [-2.26635283e-02,  7.59515415e-02, -1.79267798e-01, ...,
          2.19843202e-02,  1.34797645e-02,  2.35456576e-02]]),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  

In [6]:
np.save("random_data_X.npy", data)
np.save("random_data_y", labels)

In [7]:
def E_invwish(psi, dof):
    n_feats = len(psi)
    return psi / (dof-n_feats-1)

def Var_invwish(psi, dof):
    p = len(psi)
    Var = np.empty((p, p))
    for i in range(p):
        for j in range(p):
            Var[i][j] = (dof-p+1) * psi[i][j]**2 + (dof-p-1) * psi[i][i]*psi[j][j] 
    Var /= (dof-p)*(dof-p-1)**2*(dof-p-3)
    return Var

In [8]:
def covariance_invwishart(diag=1., offdiag=0., ddof=4, p=5):
    psi = diag*np.eye(5) + offdiag * np.ones((p, p))
    expected = E_invwish(psi, p+ddof)
    variance = Var_invwish(psi, p+ddof)
    return expected, variance 

expected, variance = covariance_invwishart(2, 0, ddof=100)
print(f"Expected Cov:\n{expected}")
corr = (D:=np.diag(1/np.sqrt(np.diag(expected)))) @ expected @ D
print(f"Expected Corr:\n{corr}")
print(f"Variance of Cov:\n{variance}")

Expected Cov:
[[0.02020202 0.         0.         0.         0.        ]
 [0.         0.02020202 0.         0.         0.        ]
 [0.         0.         0.02020202 0.         0.        ]
 [0.         0.         0.         0.02020202 0.        ]
 [0.         0.         0.         0.         0.02020202]]
Expected Corr:
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
Variance of Cov:
[[8.41487877e-06 4.16536499e-06 4.16536499e-06 4.16536499e-06
  4.16536499e-06]
 [4.16536499e-06 8.41487877e-06 4.16536499e-06 4.16536499e-06
  4.16536499e-06]
 [4.16536499e-06 4.16536499e-06 8.41487877e-06 4.16536499e-06
  4.16536499e-06]
 [4.16536499e-06 4.16536499e-06 4.16536499e-06 8.41487877e-06
  4.16536499e-06]
 [4.16536499e-06 4.16536499e-06 4.16536499e-06 4.16536499e-06
  8.41487877e-06]]


In [9]:
iw_dist = invwishart(df=9, scale=psi)
iw_dist.rvs(100000).var(0)

NameError: name 'psi' is not defined

In [10]:
data, labels = random_data_gen(n_samples=1000, n_feats=10, maha=1., psi_diag=1.)

In [11]:
data_covar= np.stack([np.cov(random_data_gen(n_samples=200, n_feats=5, maha=1., psi_diag=1, ddof=150)[0][:100].T) for i in range(1000)])

In [12]:
data_corr = np.stack([(D:=np.diag(1/np.sqrt(np.diag(wishart_cov)))) @ wishart_cov @ D for wishart_cov in data_covar])

In [13]:
np.quantile(data_corr, q=[.025, .975], axis=0)

array([[[ 1.        , -0.24966474, -0.26022604, -0.239892  ,
         -0.24157122],
        [-0.24966474,  1.        , -0.24334487, -0.25067608,
         -0.24362033],
        [-0.26022604, -0.24334487,  1.        , -0.25619293,
         -0.2404409 ],
        [-0.239892  , -0.25067608, -0.25619293,  1.        ,
         -0.2653355 ],
        [-0.24157122, -0.24362033, -0.2404409 , -0.2653355 ,
          1.        ]],

       [[ 1.        ,  0.24923633,  0.24398142,  0.25127786,
          0.24859063],
        [ 0.24923633,  1.        ,  0.24425387,  0.23711631,
          0.23920541],
        [ 0.24398142,  0.24425387,  1.        ,  0.24440722,
          0.26522726],
        [ 0.25127786,  0.23711631,  0.24440722,  1.        ,
          0.24144272],
        [ 0.24859063,  0.23920541,  0.26522726,  0.24144272,
          1.        ]]])

In [14]:
np.mean(data_covar, axis=0).round(2)

array([[ 0.01,  0.  ,  0.  ,  0.  , -0.  ],
       [ 0.  ,  0.01, -0.  ,  0.  , -0.  ],
       [ 0.  , -0.  ,  0.01, -0.  ,  0.  ],
       [ 0.  ,  0.  , -0.  ,  0.01, -0.  ],
       [-0.  , -0.  ,  0.  , -0.  ,  0.01]])

### BROKEN: Kronecker product version

In [169]:
import numpy as np

def comm_mat(m, n):
    # determine permutation applied by K
    w = np.arange(m * n).reshape((m, n), order="F").T.ravel(order="F")
    # apply this permutation to the rows (i.e. to each column) of identity matrix and return result
    return np.eye(m * n)[w, :]

def vec(X):
    return np.ravel(X, order='F')

def kron_Var_invwish(psi, dof):
    p = len(psi)
    c2 = ((dof - p)*(dof - p - 1)*(dof - p - 3))**(-1)
    c1 = (dof-p-2)*c2
    c3 = (dof-p-1)**(-2)
    K_pp = comm_mat(p, p)
    return c1 * np.kron(psi, psi) + c2*vec(psi) @ vec(psi).T + c2 * K_pp @ np.kron(psi, psi) 
    