# Generate baseline/control dataset
#### -5000 x 201 dataset (200 features + 1 target column)
#### -multivariate normal features
#### -5 true features, 195 noise features
#### -balanced 4-class target: quartile bins of a noisy linear score built from the 5 true features
#### -exchangeable covariance matrix for collinearity: variance = 1 on the diagonal, correlation = 0.1 off the diagonal

In [None]:
import numpy as np
import pandas as pd

#Fix the global NumPy seed so every run reproduces the same dataset
np.random.seed(42)

#Dataset dimensions: n observations (rows) by p features (columns)
n, p = 5000, 200

#Exchangeable covariance: unit variance on the diagonal, correlation rho everywhere else
rho = 0.1
Sigma = np.where(np.eye(p, dtype=bool), 1.0, rho)

#Draw all n feature vectors in one call from a zero-mean multivariate normal
X = np.random.multivariate_normal(mean=np.zeros(p), cov=Sigma, size=n)

#One coefficient per true predictor; only the first 5 columns of X carry signal
beta_true = np.array([1.0, -1.0, 0.5, 2.0, -0.5])

#Noisy linear score: the 5 true predictors weighted by beta_true plus N(0, 1) noise
#(the noise is drawn after X, so the seeded random stream is consumed in a fixed order)
signal = np.dot(X[:, :5], beta_true) + np.random.normal(0, 1, n)

#Quartile cut points of the score, which split the observations into four equal groups
q1, q2, q3 = np.quantile(signal, [0.25, 0.50, 0.75])

#Bin each score into class 0-3; right=True keeps a score equal to a cut point in the lower class
Y = np.digitize(signal, [q1, q2, q3], right=True)

#Build the DataFrame with features named X1..Xp and append the target as the last column
columns = [f'X{i}' for i in range(1, p + 1)]
df = pd.DataFrame(X, columns=columns)
df.insert(len(df.columns), 'Target', Y)

#Report the overall shape and the number of observations per class
print('Dataset shape:', df.shape)
print('Target class distribution:')
print(df['Target'].value_counts().sort_index())

Dataset shape: (5000, 201)
Target class distribution:
Target
0    1250
1    1250
2    1250
3    1250
Name: count, dtype: int64
