In [1]:
from datetime import datetime
from os import makedirs, mkdir
from os.path import isdir, join

from pandas import DataFrame
from sklearn.datasets import make_blobs

In [2]:
# Seed handed to make_blobs as random_state by generate_dataset_data_frame.
RANDOM_STATE = 435

def generate_dataset_data_frame(n_samples: int, n_features: int, centers: int) -> DataFrame:
    """Build a DataFrame of synthetic points produced by ``make_blobs``.

    Args:
        n_samples: Forwarded to ``make_blobs`` as ``n_samples``.
        n_features: Forwarded to ``make_blobs`` as ``n_features``; also the
            number of column names generated.
        centers: Forwarded to ``make_blobs`` as ``centers``.

    Returns:
        A DataFrame holding the generated samples, with columns named
        ``'Column 1'`` through ``'Column <n_features>'``. The labels that
        ``make_blobs`` returns alongside the samples are not included.
    """
    # make_blobs returns a (samples, labels) pair; only the samples are kept.
    blob_points, _ = make_blobs(
        n_samples=n_samples,
        centers=centers,
        n_features=n_features,
        random_state=RANDOM_STATE,
    )

    # Column names are 1-based: 'Column 1', 'Column 2', ...
    column_names = ['Column {}'.format(position) for position in range(1, n_features + 1)]

    return DataFrame(data=blob_points, columns=column_names)

In [3]:
# Directory (relative path) into which save_data_frame_to_csv writes its files.
SAMPLES_OUTPUT_DIR = './generated-datasets'
# Suffix appended to the file name of every saved dataset.
CSV_EXTENSION = '.csv'
# Field delimiter passed to DataFrame.to_csv as `sep`.
CSV_SEPARATOR = ','
# Text encoding passed to DataFrame.to_csv as `encoding`.
DEFAULT_CSV_ENCODING = 'utf-8'

def create_directory_if_not_exist(directory_path):
    """Ensure that ``directory_path`` exists as a directory.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # exist_ok=True replaces the separate isdir() check, which could race with
    # another process creating the directory between the check and the mkdir.
    makedirs(directory_path, exist_ok=True)

def save_data_frame_to_csv(data_frame: DataFrame, file_name: str):
    """Write ``data_frame`` as a CSV file inside ``SAMPLES_OUTPUT_DIR``.

    The output directory is created first if it is missing. The row index is
    written to the file along with the data columns.

    Args:
        data_frame: Frame to serialize.
        file_name: Base file name without extension; ``CSV_EXTENSION`` is
            appended to it.
    """
    create_directory_if_not_exist(SAMPLES_OUTPUT_DIR)

    csv_name = file_name + CSV_EXTENSION
    destination = join(SAMPLES_OUTPUT_DIR, csv_name)

    data_frame.to_csv(
        destination,
        sep=CSV_SEPARATOR,
        encoding=DEFAULT_CSV_ENCODING,
        index=True,
    )

In [4]:
# Number of columns and number of blob centers used for every generated dataset.
AMOUNT_OF_FEATURES = 20
AMOUNT_OF_CENTERS = 5

# Row counts to generate; one CSV file is written per entry.
DESIRED_SAMPLES_SIZES = [100, 1000, 5000, 10000, 50000, 100000, 250000, 500000, 1000000]

# The timestamp is taken once, so every file written by this run shares the same suffix.
current_time = datetime.now()
formatted_time = current_time.strftime("%d%b%Y%H%M%S")

for n_samples in DESIRED_SAMPLES_SIZES:
    data_frame = generate_dataset_data_frame(
        n_samples=n_samples,
        n_features=AMOUNT_OF_FEATURES,
        centers=AMOUNT_OF_CENTERS,
    )

    # File name pattern: dataset_<rows>x<columns>_<timestamp>
    file_name = 'dataset_{}x{}_{}'.format(n_samples, AMOUNT_OF_FEATURES, formatted_time)
    save_data_frame_to_csv(data_frame=data_frame, file_name=file_name)

    print('Generated a dataset with {} rows and {} columns.'.format(n_samples, AMOUNT_OF_FEATURES))


Generated a dataset with 100 rows and 20 columns.
Generated a dataset with 1000 rows and 20 columns.
Generated a dataset with 5000 rows and 20 columns.
Generated a dataset with 10000 rows and 20 columns.
Generated a dataset with 50000 rows and 20 columns.
Generated a dataset with 100000 rows and 20 columns.
Generated a dataset with 250000 rows and 20 columns.
Generated a dataset with 500000 rows and 20 columns.
Generated a dataset with 1000000 rows and 20 columns.
