In [137]:
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.stats import multivariate_normal, invwishart
def random_data_gen(n_samples=1000, n_feats=10, maha=1.0, psi_diag=1.0, psi_offdiag=0., ddof=150, class_ratio=0.5, seed=None):
    if seed:
        np.random.seed(seed)
    ## initialize multivariate normal dist with normally distributed means and covariance
    ## drawn from an inverse wishart distribution (conjugate prior for MVN)
    norm_means_a = np.random.randn(n_feats)
    norm_means_b = np.zeros_like(norm_means_a)
    psi = psi_diag * np.eye(n_feats) + psi_offdiag * ~np.eye(n_feats).astype(bool)
    nu = n_feats + ddof
    wishart_cov = invwishart(nu, psi).rvs()
    ## specify the mahalanobis distance between the two distributions
    dist = mahalanobis(norm_means_a, norm_means_b, wishart_cov)
    norm_means_a = norm_means_a * (maha / dist)
    assert np.isclose(mahalanobis(norm_means_a, norm_means_b, wishart_cov), maha)
    ## multivariate normal distributions with different means and equal variances
    mvn_a = multivariate_normal(mean=norm_means_a, cov=wishart_cov)
    mvn_b = multivariate_normal(mean=norm_means_b, cov=wishart_cov)
    ## not used, but compute correlations
    corr = (D:=np.diag(1/np.sqrt(np.diag(wishart_cov)))) @ wishart_cov @ D
    ## generate data samples from a multivariate normal
    data = np.vstack([mvn_a.rvs(int(n_samples*class_ratio)), mvn_b.rvs(n_samples - int(n_samples*class_ratio))])
    labels = np.arange(len(data))<int(n_samples*class_ratio)
    return data, labels
#     idx = np.random.choice(np.arange(n_samples), n_samples, replace=False)
#     return data[idx], labels[idx]

In [116]:
data, labels = random_data_gen(n_samples=1000, n_feats=10, maha=1., psi_diag=1., seed=1)
data, labels

(array([[ 5.60382192e+00, -2.00184839e+00, -1.76000226e+00, ...,
         -2.49541696e+00,  1.07216939e+00, -9.22616432e-01],
        [ 5.49165612e+00, -2.05865832e+00, -1.84694233e+00, ...,
         -2.55926190e+00,  1.06162122e+00, -8.08179665e-01],
        [ 5.63523162e+00, -2.13963632e+00, -1.86029316e+00, ...,
         -2.57952799e+00,  1.12648119e+00, -8.66350227e-01],
        ...,
        [-5.07071616e-02,  3.53242082e-03,  4.07076712e-02, ...,
         -8.49180686e-02, -8.76625594e-02,  2.72813781e-02],
        [ 3.34065677e-02, -1.09287952e-01, -4.01749074e-02, ...,
         -2.99504685e-02, -7.43017354e-02, -3.50626539e-02],
        [-1.04111421e-01, -4.46627065e-02,  3.69168349e-03, ...,
          1.23519444e-01,  4.22717064e-02, -3.64803443e-03]]),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  

In [6]:
np.save("random_data_X.npy", data)
np.save("random_data_y", labels)

In [73]:
def E_invwish(psi, dof):
    n_feats = len(psi)
    return psi / (dof-n_feats-1)

def Var_invwish(psi, dof):
    p = len(psi)
    Var = np.empty((p, p))
    for i in range(p):
        for j in range(p):
            Var[i][j] = (dof-p+1) * psi[i][j]**2 + (dof-p-1) * psi[i][i]*psi[j][j] 
    Var /= (dof-p)*(dof-p-1)**2*(dof-p-3)
    return Var

In [74]:
def covariance_invwishart(diag=1., offdiag=0., ddof=4, p=5):
    psi = diag*np.eye(5) + offdiag * np.ones((p, p))
    expected = E_invwish(psi, p+ddof)
    variance = Var_invwish(psi, p+ddof)
    return expected, variance 

expected, variance = covariance_invwishart(2, 0, ddof=10)
print(f"Expected Cov:\n{expected}")
corr = (D:=np.diag(1/np.sqrt(np.diag(expected)))) @ expected @ D
print(f"Expected Corr:\n{corr}")
print(f"Variance of Cov:\n{variance}")

Expected Cov:
[[0.22222222 0.         0.         0.         0.        ]
 [0.         0.22222222 0.         0.         0.        ]
 [0.         0.         0.22222222 0.         0.        ]
 [0.         0.         0.         0.22222222 0.        ]
 [0.         0.         0.         0.         0.22222222]]
Expected Corr:
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
Variance of Cov:
[[0.01410935 0.00634921 0.00634921 0.00634921 0.00634921]
 [0.00634921 0.01410935 0.00634921 0.00634921 0.00634921]
 [0.00634921 0.00634921 0.01410935 0.00634921 0.00634921]
 [0.00634921 0.00634921 0.00634921 0.01410935 0.00634921]
 [0.00634921 0.00634921 0.00634921 0.00634921 0.01410935]]


In [9]:
iw_dist = invwishart(df=9, scale=psi)
iw_dist.rvs(100000).var(0)

NameError: name 'psi' is not defined

In [75]:
data, labels = random_data_gen(n_samples=1000, n_feats=10, maha=1., psi_diag=1.)

In [76]:
np.cov(data.T)

array([[ 2.11337559e+01, -3.08223678e+00,  7.00927508e+00,
        -1.09136845e-01, -2.62665209e+00,  1.57397862e+01,
        -2.12814900e+00, -2.61325522e+00, -2.19929252e-01,
         3.81866761e+00],
       [-3.08223678e+00,  4.56311417e-01, -1.02238632e+00,
         1.62698098e-02,  3.83464766e-01, -2.29681948e+00,
         3.09756520e-01,  3.81738315e-01,  3.24989558e-02,
        -5.57379061e-01],
       [ 7.00927508e+00, -1.02238632e+00,  2.33222478e+00,
        -3.66479657e-02, -8.69779746e-01,  5.22311878e+00,
        -7.07011244e-01, -8.67886770e-01, -7.28119600e-02,
         1.26627167e+00],
       [-1.09136845e-01,  1.62698098e-02, -3.66479657e-02,
         5.85556937e-03,  1.31990962e-02, -8.20331257e-02,
         1.01523323e-02,  1.41373474e-02,  1.20997425e-03,
        -1.95500377e-02],
       [-2.62665209e+00,  3.83464766e-01, -8.69779746e-01,
         1.31990962e-02,  3.33782405e-01, -1.95560659e+00,
         2.63479777e-01,  3.24377059e-01,  2.87234823e-02,
        -4.

In [68]:
data_covar[0]

array([[ 4.96151568,  7.0445802 , -1.29205562,  1.92744237,  5.91554689],
       [ 7.0445802 , 10.03374466, -1.83804783,  2.74286713,  8.42024327],
       [-1.29205562, -1.83804783,  0.34757278, -0.5015138 , -1.54477097],
       [ 1.92744237,  2.74286713, -0.5015138 ,  0.76392008,  2.30213762],
       [ 5.91554689,  8.42024327, -1.54477097,  2.30213762,  7.08354046]])

In [70]:
data_covar[0]

array([[ 1.32461816e-02, -7.42721852e-04, -9.14612845e-04,
         1.10285195e-03,  4.55117617e-05],
       [-7.42721852e-04,  1.31646218e-02,  1.34630179e-03,
         2.31997711e-03,  1.35394691e-04],
       [-9.14612845e-04,  1.34630179e-03,  7.69575346e-03,
         1.62357035e-03, -1.15958329e-03],
       [ 1.10285195e-03,  2.31997711e-03,  1.62357035e-03,
         1.04366977e-02, -5.15768534e-04],
       [ 4.55117617e-05,  1.35394691e-04, -1.15958329e-03,
        -5.15768534e-04,  8.58132633e-03]])

In [132]:
data_covar= np.stack([np.cov(random_data_gen(n_samples=200, n_feats=5, maha=1., psi_diag=1., ddof=100)[0][:100].T) for i in range(1000)])

In [133]:
data_corr = np.stack([(D:=np.diag(1/np.sqrt(np.diag(wishart_cov)))) @ wishart_cov @ D for wishart_cov in data_covar])

In [134]:
np.quantile(data_corr, q=[.025, .975], axis=0)

array([[[ 1.        , -0.26030399, -0.28292655, -0.26624259,
         -0.27525976],
        [-0.26030399,  1.        , -0.26971977, -0.26895583,
         -0.26083137],
        [-0.28292655, -0.26971977,  1.        , -0.27437894,
         -0.27280784],
        [-0.26624259, -0.26895583, -0.27437894,  1.        ,
         -0.27590618],
        [-0.27525976, -0.26083137, -0.27280784, -0.27590618,
          1.        ]],

       [[ 1.        ,  0.27354688,  0.27247805,  0.26425711,
          0.25993115],
        [ 0.27354688,  1.        ,  0.2581902 ,  0.26684015,
          0.27829434],
        [ 0.27247805,  0.2581902 ,  1.        ,  0.27000853,
          0.27974811],
        [ 0.26425711,  0.26684015,  0.27000853,  1.        ,
          0.26837825],
        [ 0.25993115,  0.27829434,  0.27974811,  0.26837825,
          1.        ]]])

In [135]:
np.quantile(data_corr, q=[.1, .9], axis=0)

array([[[ 1.        , -0.16903361, -0.18102743, -0.17935758,
         -0.17772066],
        [-0.16903361,  1.        , -0.17282183, -0.18994219,
         -0.18048465],
        [-0.18102743, -0.17282183,  1.        , -0.17753612,
         -0.18197967],
        [-0.17935758, -0.18994219, -0.17753612,  1.        ,
         -0.18635543],
        [-0.17772066, -0.18048465, -0.18197967, -0.18635543,
          1.        ]],

       [[ 1.        ,  0.1825135 ,  0.1924824 ,  0.17781244,
          0.17790806],
        [ 0.1825135 ,  1.        ,  0.17543695,  0.17805637,
          0.18555939],
        [ 0.1924824 ,  0.17543695,  1.        ,  0.17391113,
          0.18311711],
        [ 0.17781244,  0.17805637,  0.17391113,  1.        ,
          0.1822637 ],
        [ 0.17790806,  0.18555939,  0.18311711,  0.1822637 ,
          1.        ]]])

In [136]:
np.mean(data_covar, axis=0).round(2)

array([[ 0.01,  0.  , -0.  , -0.  ,  0.  ],
       [ 0.  ,  0.01,  0.  ,  0.  ,  0.  ],
       [-0.  ,  0.  ,  0.01, -0.  ,  0.  ],
       [-0.  ,  0.  , -0.  ,  0.01, -0.  ],
       [ 0.  ,  0.  ,  0.  , -0.  ,  0.01]])

### BROKEN: Kronecker product version

In [169]:
import numpy as np

def comm_mat(m, n):
    # determine permutation applied by K
    w = np.arange(m * n).reshape((m, n), order="F").T.ravel(order="F")
    # apply this permutation to the rows (i.e. to each column) of identity matrix and return result
    return np.eye(m * n)[w, :]

def vec(X):
    return np.ravel(X, order='F')

def kron_Var_invwish(psi, dof):
    p = len(psi)
    c2 = ((dof - p)*(dof - p - 1)*(dof - p - 3))**(-1)
    c1 = (dof-p-2)*c2
    c3 = (dof-p-1)**(-2)
    K_pp = comm_mat(p, p)
    return c1 * np.kron(psi, psi) + c2*vec(psi) @ vec(psi).T + c2 * K_pp @ np.kron(psi, psi) 
    