In [1]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# It is a secret, so it is read from the environment instead of being stored in the notebook.
# NOTE(review): the token that used to be hard-coded in this cell should be treated as leaked and rotated.
import os

from project_lib import Project

# PROJECT_ID and PROJECT_ACCESS_TOKEN must be set in the kernel's environment before this cell runs;
# a KeyError raised here means one of them is missing.
project = Project(
    project_id=os.environ['PROJECT_ID'],
    project_access_token=os.environ['PROJECT_ACCESS_TOKEN'],
)
pc = project.project_context


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [3]:
# n: number of centroids
# sigma: number of sigma deviations from each centroid
# d: number of dimensions
# num_r: number of rows/samples per category

# This multivariate Gaussian class generator gives each class a mean that is non-zero on a single coordinate.
# For example, with variables living in R^10 and two classes, the mean of class 1 lies on coordinate X0 and the mean of class 2 on coordinate X1.
# All other dimensions are just noise centered at 0.

def gen_classes(n,sigma,d,num_r):
    """Draw labelled samples from ``n`` identity-covariance Gaussian classes in R^d.

    Class ``i`` has mean ``sigma / sqrt(d)`` on coordinate ``i`` and 0 on every
    other coordinate. Samples are drawn from the global ``np.random`` state, one
    ``multivariate_normal`` call per class in class order.

    Parameters
    ----------
    n : int
        Number of classes (centroids). Must not exceed ``d``.
    sigma : float
        Scale of the class means; the single non-zero mean coordinate of each
        class equals ``sigma / sqrt(d)``.
    d : int
        Number of dimensions (feature columns).
    num_r : int
        Number of rows drawn per class.

    Returns
    -------
    obs_matrix : numpy.ndarray
        Float array of shape ``(n * num_r, d + 1)``. The first ``d`` columns are
        the features; the last column holds the class index (0.0 .. n-1). Rows
        are grouped by class, in increasing class order.
    col_names : list of str
        ``["X0", ..., "X{d-1}", "cat"]``, matching the columns of ``obs_matrix``.

    Raises
    ------
    IndexError
        If ``n > d``: every class needs its own coordinate for its mean.
    """
    # Fail before allocating or sampling anything; the original only failed
    # (with the same exception type) once the loop reached class index d.
    if n > d:
        raise IndexError(
            "gen_classes needs one dimension per class: got n=%d classes but d=%d dimensions" % (n, d)
        )

    x = sigma / np.sqrt(d)
    var_matrix = np.identity(d)

    # Preallocate the result once instead of re-stacking (and re-copying) the
    # growing matrix for every class; also yields a proper (0, d+1) array for n == 0.
    obs_matrix = np.empty((n * num_r, d + 1))
    for i in range(n):
        mean = np.zeros(d)
        mean[i] = x
        rows = slice(i * num_r, (i + 1) * num_r)
        obs_matrix[rows, :d] = np.random.multivariate_normal(mean, var_matrix, num_r)
        obs_matrix[rows, d] = i  # class label stored as a float in the last column

    col_names = ["X" + str(j) for j in range(d)]
    col_names.append("cat")
    return obs_matrix, col_names

# This multivariate Gaussian class generator is ad hoc for exactly two classes in d dimensions. To avoid excessive noise, the mean is
# described by more than just one coordinate.

# Added pr parameter: expected fraction of significant covariates. For example, if pr=0.9, then about 90% of the covariates will be centered at the mean,
# and the remaining ones will be centered at 0, being pure noise.

def gen_classes_two(sigma,d,num_r,pr):
    """Draw labelled samples from two identity-covariance Gaussian classes in R^d.

    A 0/1 mask over the ``d`` coordinates is drawn with ``np.random.binomial(1, pr, d)``.
    On the masked coordinates class 0 has mean ``+sigma / sqrt(d)`` and class 1
    has mean ``-sigma / sqrt(d)``; on the remaining coordinates both classes
    have mean 0. Because each coordinate is selected independently, the share
    of non-zero mean coordinates is ``pr`` only in expectation.

    Random numbers come from the global ``np.random`` state, in this order:
    the mask, then the class 0 samples, then the class 1 samples.

    Parameters
    ----------
    sigma : float
        Scale of the class means; non-zero mean coordinates are ``+/- sigma / sqrt(d)``.
    d : int
        Number of dimensions (feature columns).
    num_r : int
        Number of rows drawn per class.
    pr : float
        Probability that a coordinate carries a non-zero mean.

    Returns
    -------
    obs_matrix : numpy.ndarray
        Float array of shape ``(2 * num_r, d + 1)``. The first ``d`` columns are
        the features; the last column holds the class label (0.0 for the first
        ``num_r`` rows, 1.0 for the rest).
    col_names : list of str
        ``["X0", ..., "X{d-1}", "cat"]``, matching the columns of ``obs_matrix``.
    """
    x = sigma / np.sqrt(d)
    ones_vec = np.random.binomial(1, pr, d)
    means = (ones_vec * x, ones_vec * -x)
    var_matrix = np.identity(d)

    # Preallocate the result once instead of stacking onto a dummy zero row,
    # which copied the (potentially very large) matrix again for each class.
    obs_matrix = np.empty((2 * num_r, d + 1))
    for i, mean in enumerate(means):
        rows = slice(i * num_r, (i + 1) * num_r)
        obs_matrix[rows, :d] = np.random.multivariate_normal(mean, var_matrix, num_r)
        obs_matrix[rows, d] = i  # class label stored as a float in the last column

    col_names = ["X" + str(j) for j in range(d)]
    col_names.append("cat")
    return obs_matrix, col_names

In [4]:
# Matrix of covariates and responses for classes >> 2 (near d)

#test,names=gen_classes(2,5,2500,30000)

In [5]:
# Matrix of covariates and responses for 2 classes


# Seed the global NumPy RNG so the generated dataset is the same on every Restart & Run All.
np.random.seed(42)
# Two classes: sigma=5, d=1500 covariates, 25000 rows per class, pr=0.8 chance that a covariate is informative.
test,names=gen_classes_two(sigma=5,d=1500,num_r=25000,pr=0.8)

In [6]:
# Wrap the generated matrix in a DataFrame whose columns carry the generated names.
df = pd.DataFrame(data=test, columns=names)

In [7]:
# Serialise the frame to CSV text (without the index column) and hand it to the project's
# save_data call with overwrite=True. Kept as the cell's last expression so the returned
# status is displayed as the cell output.
project.save_data(
    file_name='big_dataset.csv',
    data=df.to_csv(index=False),
    overwrite=True,
)

{'file_name': 'big_dataset.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'sbssparktest-donotdelete-pr-o2cqjfwqogiwri',
 'asset_id': 'b05b04a1-60c4-405f-ba9d-8b420f1e922d'}