In [1]:
import numpy as np
import pandas as pd

In [2]:
def generate_dataset(n_samples, mean, cov, seed=42):
    """
    Generate a synthetic dataset using multivariate normal distribution.

    Parameters:
    - n_samples: int, number of samples to generate
    - mean: list, mean vector for the dataset
    - cov: list of lists, covariance matrix for the dataset
    - seed: int or None, optional random seed for reproducibility.
      Any integer (including 0) reseeds NumPy's global random generator
      before sampling; pass None to leave the global generator state untouched.

    Returns:
    - dataset: np.ndarray of shape (n_samples, len(mean)), generated dataset
    """
    # Test against None instead of truthiness: `if seed:` would silently
    # ignore the perfectly valid seed 0 and make the call non-reproducible.
    if seed is not None:
        np.random.seed(seed)
    return np.random.multivariate_normal(mean, cov, size=n_samples)

In [3]:
# Dataset X: sample size, generating mean vector and generating covariance matrix
n_X = 10
mu_X = [1, 2]
cov_X = [
    [1, 0.5],
    [0.5, 1],
]
# Draw the samples; the explicit seed makes the draw repeatable
dataset_X = generate_dataset(n_samples=n_X, mean=mu_X, cov=cov_X, seed=42)

In [4]:
# Dataset Y: sample size, generating mean vector and generating covariance matrix
n_Y = 20
mu_Y = [3, 4]
cov_Y = [
    [2, 0.2],
    [0.2, 3],
]
# Draw the samples; note this reuses the same seed value (42) as dataset X
dataset_Y = generate_dataset(n_samples=n_Y, mean=mu_Y, cov=cov_Y, seed=42)

In [69]:
# Empirical statistics of dataset X (per-feature mean and population covariance).
# NOTE: compute_feature_covariance is defined in a later cell of this notebook
# (In [62]); that cell must have been executed before this one.
# NOTE: cov_X is rebound here, replacing the generating covariance assigned
# when dataset X was created with the covariance measured from the samples.
mean_X = dataset_X.mean(axis=0)
cov_X = compute_feature_covariance(dataset_X)

In [73]:
# Show the empirical mean and covariance of dataset X, one labelled block each
print(f"Mean X: \n{mean_X}", f"Cov X: \n{cov_X}", sep="\n")

Mean X: 
[1.23891226 2.08103921]
Cov X: 
[[1.13531412 0.38161625]
 [0.38161625 0.55938953]]


In [74]:
# Calculate mean and covariance for dataset Y
mean_Y = np.mean(dataset_Y, axis=0)
cov_Y = compute_feature_covariance(dataset_Y)

In [75]:
# Show the empirical mean and covariance of dataset Y, one labelled block each
print(f"Mean Y: \n{mean_Y}", f"Cov Y: \n{cov_Y}", sep="\n")

Mean Y: 
[2.5827721  3.76773895]
Cov Y: 
[[ 1.97640754 -0.04528297]
 [-0.04528297  2.29103673]]


In [76]:
# Pool the two datasets by stacking the rows of Y underneath the rows of X
result = np.concatenate((dataset_X, dataset_Y), axis=0)

In [78]:
# Reference statistics computed directly on the pooled samples; these are the
# values the closed-form combination further down is expected to reproduce.
mean_result = result.mean(axis=0)
cov_result = compute_feature_covariance(result)

In [79]:
# Show the mean and covariance measured on the pooled dataset
print(f"Mean result: \n{mean_result}", f"Cov result: \n{cov_result}", sep="\n")

Mean result: 
[2.13481882 3.20550571]
Cov result: 
[[2.09736734 0.60072522]
 [0.60072522 2.34603345]]


In [80]:
import numpy as np

def compute_combined_stats(n_X, mu_X, Sigma_X, n_Y, mu_Y, Sigma_Y):
    """
    Compute the mean vector and covariance matrix of the combined dataset,
    ensuring proper handling of population covariance.

    The inputs are the per-dataset sample counts, mean vectors and
    *population* covariance matrices (normalised by n, not n - 1). The
    combined covariance is the count-weighted average of the two covariances
    plus a rank-one correction for the offset between the two means:

        Sigma = (n_X * Sigma_X + n_Y * Sigma_Y) / (n_X + n_Y)
                + n_X * n_Y / (n_X + n_Y)**2 * (mu_X - mu_Y)(mu_X - mu_Y)^T

    Parameters:
    - n_X: int, number of instances in dataset X
    - mu_X: array-like, mean vector of dataset X
    - Sigma_X: array-like, population covariance matrix of dataset X
    - n_Y: int, number of instances in dataset Y
    - mu_Y: array-like, mean vector of dataset Y
    - Sigma_Y: array-like, population covariance matrix of dataset Y

    Returns:
    - mu_combined: np.ndarray, mean vector of the combined dataset
    - Sigma_combined: np.ndarray, covariance matrix of the combined dataset

    Raises:
    - ValueError: if n_X + n_Y is zero (the combined dataset would be empty).
    """
    n_total = n_X + n_Y
    if n_total == 0:
        raise ValueError("n_X + n_Y must be non-zero to combine statistics")

    # Coerce to arrays so plain Python lists work too; without this,
    # `n_X * mu_X` on a list repeats the list instead of scaling its values.
    mu_X = np.asarray(mu_X)
    mu_Y = np.asarray(mu_Y)
    Sigma_X = np.asarray(Sigma_X)
    Sigma_Y = np.asarray(Sigma_Y)

    # Compute combined mean vector (count-weighted average of the two means)
    mu_combined = (n_X * mu_X + n_Y * mu_Y) / n_total

    # Compute the mean difference term: extra spread caused by the two
    # datasets being centred at different points
    mean_diff = mu_X - mu_Y
    mean_adjustment = (n_X * n_Y) / n_total**2 * np.outer(mean_diff, mean_diff)

    # Compute combined covariance matrix
    Sigma_combined = (n_X * Sigma_X + n_Y * Sigma_Y) / n_total + mean_adjustment

    return mu_combined, Sigma_combined


In [81]:
# Example usage: combine the empirical statistics of X and Y in closed form
if __name__ == "__main__":
    # Sample counts together with the empirical mean / covariance of each dataset
    n_X, mu_X, Sigma_X = 10, mean_X, cov_X
    n_Y, mu_Y, Sigma_Y = 20, mean_Y, cov_Y

    # Mean and covariance of the pooled data, derived from the summaries only
    mu_combined, Sigma_combined = compute_combined_stats(
        n_X, mu_X, Sigma_X, n_Y, mu_Y, Sigma_Y
    )

    # Report the combined mean first, then the combined covariance
    print("Combined Mean Vector:")
    print(mu_combined)

    print("\nCombined Covariance Matrix:")
    print(Sigma_combined)
    


Combined Mean Vector:
[2.13481882 3.20550571]

Combined Covariance Matrix:
[[2.09736734 0.60072522]
 [0.60072522 2.34603345]]


In [62]:
import numpy as np

def compute_feature_covariance(data):
    """
    Return the population covariance matrix of a dataset's features.

    The covariance is normalised by the number of samples (n), not n - 1.

    Parameters:
    - data: np.ndarray, shape (n_samples, n_features)
      One row per instance, one column per feature.

    Returns:
    - covariance_matrix: np.ndarray, shape (n_features, n_features)
      Entry (i, j) is the covariance between feature i and feature j.
    """
    # Work on a NumPy copy of the input so array arithmetic is available
    samples = np.array(data)

    # Centre every feature column on its own mean
    deviations = samples - samples.mean(axis=0)

    # Sum of outer products of the deviations, averaged over the sample count
    return np.dot(deviations.T, deviations) / samples.shape[0]

# Example Usage: population covariance of the pooled dataset `result`
if __name__ == "__main__":
    # Covariance of the features across all pooled rows
    covariance_matrix = compute_feature_covariance(result)

    # Heading on its own line, followed by the matrix
    print("Covariance Matrix:", covariance_matrix, sep="\n")


Covariance Matrix:
[[2.09736734 0.60072522]
 [0.60072522 2.34603345]]
