### 1. Generating Test Data

In [3]:
import os
import datetime
import random
import string
import pyarrow as pa
import pyarrow.parquet as pq

In [31]:
def generate_dummy_data(n_rows: int, n_columns: int) -> dict:
    '''Generate a dictionary of dummy columns with mixed data types.

    Keys are column names of the form "col_<position>_<type>" and values
    are pyarrow arrays holding n_rows randomly generated values each.
    The first column is always a datetime column; the remaining columns
    cycle through float, bool, exponential float, string, normal float,
    signed integer and unsigned integer data based on their position.

    The data is produced by a private random generator with a fixed
    seed, so repeated calls with the same arguments return the same
    values and the state of the global `random` module is left untouched.

    Parameters:
        n_rows (int): the number of rows for the data being generated
        n_columns (int): the number of columns for the data

    Returns:
        dict: a dictionary with exactly n_columns keys and n_rows
        values per key. Each key represents a data type and the values
        are randomly generated data for that type.
    '''

    # A dedicated generator keeps the output reproducible without
    # reseeding the module-level generator that other code may share.
    rng = random.Random(10)

    letters = string.ascii_letters

    # Timestamps are built as offsets from the epoch rather than with
    # datetime.fromtimestamp(), which depends on the local time zone and
    # would make the seeded output differ between machines.
    epoch = datetime.datetime(1970, 1, 1)

    data = {}

    for i_col in range(n_columns):
        col_name = f"col_{i_col + 1}"
        kind = i_col % 7

        # Exactly one branch runs per column, so the result always has
        # n_columns keys.
        if i_col == 0:
            # Random datetime column
            timestamps = [
                epoch + datetime.timedelta(seconds=rng.randint(0, 2**30))
                for _ in range(n_rows)]
            data[col_name + '_datetime'] = pa.array(
                timestamps, type=pa.timestamp('ns'))

        elif kind == 0:
            # Random non-negative integer column. Values are drawn with
            # replacement so that n_rows may exceed the size of the
            # value range.
            upper_bound = rng.randint(1, 1000)
            data[col_name + '_int'] = pa.array(
                rng.choices(range(upper_bound), k=n_rows),
                type=pa.uint32())

        elif kind == 1:
            # Random float column with values between 0 and 1
            floatlist = [rng.random() for _ in range(n_rows)]
            data[col_name + '_float'] = pa.array(
                floatlist, type=pa.float32())

        elif kind == 2:
            # Random boolean column
            boollist = [bool(rng.getrandbits(1)) for _ in range(n_rows)]
            data[col_name + '_bool'] = pa.array(boollist, type=pa.bool_())

        elif kind == 3:
            # Random exponential distribution float column; the rate is
            # redrawn for every value
            expolist = [rng.expovariate(rng.randint(1, 20))
                        for _ in range(n_rows)]
            data[col_name + '_float'] = pa.array(
                expolist, type=pa.float32())

        elif kind == 4:
            # Random string column with 5 to 8 letters per value
            stringlist = [
                ''.join(rng.choice(letters)
                        for _ in range(rng.randint(5, 8)))
                for _ in range(n_rows)]
            data[col_name + '_str'] = pa.array(
                stringlist, type=pa.string())

        elif kind == 5:
            # Random normal distribution float column; the mean and the
            # standard deviation are redrawn for every value
            normlist = [rng.gauss(mu=rng.randint(15, 50),
                                  sigma=rng.randint(1, 25))
                        for _ in range(n_rows)]
            data[col_name + '_float'] = pa.array(
                normlist, type=pa.float32())

        else:
            # Random signed integer column (kind == 6). The lower bound
            # is at most -1 and the upper bound at least 0, so the range
            # is never empty.
            value_range = range(rng.randint(-10, -1), rng.randint(0, 10))
            data[col_name + '_int'] = pa.array(
                rng.choices(value_range, k=n_rows), type=pa.int32())

    return data

def generate_parquet_file(n_rows: int, n_columns: int,
                          save_path: str, filename: str) -> None:
    '''Generate a table of dummy data and write it to a Parquet file.

    Parameters:
        n_rows (int): the number of rows for the table being generated;
            must be an integer greater than 0 and less than 10e6
        n_columns (int): the number of columns for the table; must be
            an integer greater than 0 and less than 10e6
        save_path (str): the existing location in which the file is
            created
        filename (str): the file name for the Parquet file created from
            the table, without extension ('.parquet' is appended)

    Returns:
        None: writes a Parquet file containing a table with the given
        dimensions, filled with randomly generated dummy data and
        various data types, and prints a confirmation message

    Raises:
        ValueError: if n_rows or n_columns is not an integer in the
        accepted range, or if save_path does not exist
    '''
    # Validity checks. The type test comes first so that a non-numeric
    # argument is rejected with a ValueError instead of failing inside
    # the range comparison.
    for dimension in (n_rows, n_columns):
        if not (isinstance(dimension, int) and 0 < dimension < 10e6):
            raise ValueError('Invalid input dimensions!')

    # Check that the save path exists before doing any expensive work
    if not os.path.exists(save_path):
        raise ValueError('Save path does not exist')

    # Generate dummy data
    dummy_data = generate_dummy_data(n_rows, n_columns)

    # Convert the dictionary of columns to an Arrow Table
    table = pa.Table.from_pydict(dummy_data)

    # Write Arrow Table to Parquet file
    # .parquet added to file name automatically
    file_path = os.path.join(save_path, filename + '.parquet')
    pq.write_table(table, file_path)
    print(f"Parquet file '{filename}' generated successfully.")

    # Sample run
    #
    # import os
    # import datetime
    # import random
    # import string
    # import pyarrow as pa
    # import pyarrow.parquet as pq
    #
    # generate_parquet_file(10, 10, 'C:\\Users\\user', 'dummy_data')