<a href="https://colab.research.google.com/github/kalmikko/Data-Randomizer-for-Confidentiality/blob/main/shuffler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
"""known issues:
1. Floating-point inaccuracy can leave excess decimals in randomized values.
2. Empty header names in Excel files are replaced with an "Unnamed: x" header name.
3. Not comprehensively tested with different data structures.
"""

import pandas as pd
import numpy as np
import os
import random

def _decimal_places(value):
    """Return the number of digits after the decimal point in ``str(value)``.

    Scientific notation is taken into account, so ``1e-07`` yields 7 and
    ``1.5e-07`` yields 8, while ``1e+20`` yields 0.
    """
    mantissa, _, exponent = str(value).lower().partition('e')
    fraction = mantissa.partition('.')[2]
    places = len(fraction) - (int(exponent) if exponent else 0)
    return max(places, 0)


def randomize_value(value):
    """Return a random replacement for a single cell value.

    Args:
        value: A scalar taken from a DataFrame cell.

    Returns:
        * bool -> a random bool.
        * int -> a random int in ``[0, 10 ** number_of_digits]``.
        * float -> a random float between 0 and ``2 * value``, rounded to the
          same number of decimal places as the original; NaN is returned as-is.
        * anything else (strings, None, lists, dicts, ...) -> returned unchanged.
    """
    # bool must be tested before int: bool is a subclass of int, so testing
    # int first would turn True/False into arbitrary integers.
    if isinstance(value, (bool, np.bool_)):
        return random.choice([True, False])

    if isinstance(value, (int, np.integer)):
        # Count digits of the magnitude only, so a minus sign does not
        # inflate the range by a factor of ten.
        digits = len(str(abs(int(value))))
        return random.randint(0, 10 ** digits)

    if isinstance(value, (float, np.floating)):
        if pd.isna(value):
            return value  # Keep the value as-is if it is NaN (empty)
        return round(random.uniform(0, 2 * float(value)), _decimal_places(value))

    # Strings and unsupported types are kept. The NaN check is limited to
    # floats because pd.isna() on a list-like returns an array, which cannot
    # be used in a boolean context.
    return value

def randomize_data(input_file):
    """Reads a data file, randomizes its values (except strings),
    and saves it with the same format.

    The result is written to the current working directory as
    ``randomized_<original file name>``. For XLSX input every sheet is
    randomized and written under its original sheet name.

    Args:
        input_file: Path to the input data file (.csv, .xlsx or .json).

    Raises:
        ValueError: If the file extension is not .csv, .xlsx or .json.
    """
    # Determine file type
    file_extension = os.path.splitext(input_file)[1].lower()
    if file_extension not in ('.csv', '.xlsx', '.json'):
        raise ValueError("Unsupported file format. Please provide a CSV, XLSX, or JSON file.")

    output_file = "randomized_" + os.path.basename(input_file)

    def randomize_frame(df):
        # DataFrame.applymap is deprecated in favour of DataFrame.map;
        # use whichever this pandas version provides.
        elementwise = df.map if hasattr(df, "map") else df.applymap
        return elementwise(randomize_value)

    if file_extension == '.csv':
        df = pd.read_csv(input_file, keep_default_na=True)
        randomize_frame(df).to_csv(output_file, index=False, na_rep='')

    elif file_extension == '.xlsx':
        # Both the reader and the writer are context-managed so the input
        # workbook handle is released even if a sheet fails to process.
        with pd.ExcelFile(input_file) as xls, pd.ExcelWriter(output_file) as writer:
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name, keep_default_na=True)
                randomize_frame(df).to_excel(
                    writer, sheet_name=sheet_name, index=False, na_rep=''
                )

    else:  # '.json'
        df = pd.read_json(input_file)
        randomize_frame(df).to_json(output_file, orient='records')

    print(f"Randomized data saved to: {output_file}")

# Example usage. The guard keeps the call from running when this file is
# imported as a module; it still runs in a notebook cell or as a script.
if __name__ == "__main__":
    input_file = "/your/folder/your_file.csv"  # Replace with your file path
    randomize_data(input_file)

Randomized data saved to: randomized_xlsx_test.xlsx


In [11]:
ls

[0m[01;34mdrive[0m/                   randomized_excel_test.xlsx  randomized_xlsx_test.xlsx
randomized_csv_test.csv  randomized_json_test.json   [01;34msample_data[0m/
