In [28]:
# necessary imports
import pandas as pd
import numpy as np
import os

In [29]:
# Directory that receives the generated CSV files (relative to the working directory)
output_dir = 'processed_data'
# exist_ok=True keeps this cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

# Load the cleaned dataset (path is relative to the working directory)
data = pd.read_csv("Cleaned_data.csv")

# Names of the columns that will receive noise
numerical_cols = [
    "Thorax_length",
    "l2", "l3p", "l3d", "lpd", "l3",
    "w1", "w2", "w3",
    "wing_loading",
]

# Noise magnitudes to generate, expressed in percent
noise_levels = [0, 1, 2.5, 5, 10, 20]

In [30]:
# define function to add noise to data
def add_noise(
    data: pd.DataFrame,
    columns: list[str],
    noise_percent: float,
    rng: "np.random.Generator | None" = None,
) -> pd.DataFrame:
    """Return a copy of ``data`` with zero-mean Gaussian noise added to ``columns``.

    For every listed column the noise is drawn from a normal distribution with
    mean 0 and standard deviation equal to ``noise_percent`` percent of that
    column's sample standard deviation (``Series.std()``). The input frame is
    never modified.

    Parameters
    ----------
    data : pd.DataFrame
        Source data; left untouched.
    columns : list[str]
        Names of the columns to perturb. Columns not listed are copied as-is.
    noise_percent : float
        Noise standard deviation as a percentage of each column's standard
        deviation. ``0`` yields an unperturbed copy.
    rng : np.random.Generator, optional
        Random generator to draw from. When omitted, NumPy's global random
        state (``np.random.normal``) is used, so ``np.random.seed`` still
        controls reproducibility.

    Returns
    -------
    pd.DataFrame
        A new frame with the same shape as ``data``.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``data``.
    ValueError
        Raised by NumPy if the resulting noise scale is negative
        (i.e. ``noise_percent`` is negative).
    """
    data_noisy = data.copy()

    # Pick the sampler once: caller-supplied generator, or the global state.
    draw_normal = np.random.normal if rng is None else rng.normal

    for column in columns:
        # calculate std deviation of column
        std = data[column].std()
        scale = std * (noise_percent / 100)

        # std is NaN when the column has fewer than two non-missing values.
        # A NaN scale would turn the whole column into NaN, so treat an
        # undefined spread as "no noise". The draw still happens, which keeps
        # the random stream aligned with the other columns.
        if pd.isna(scale):
            scale = 0.0

        # create noise with normal distribution
        noise = draw_normal(0, scale, size=data[column].shape)

        data_noisy[column] += noise

    return data_noisy


In [31]:
# generate noisy data and save to csv
# add_noise draws from NumPy's global random state, so seed it here: a fresh
# "Restart & Run All" then regenerates exactly the same files instead of
# silently overwriting them with different noise.
NOISE_SEED = 42
np.random.seed(NOISE_SEED)

for percent in noise_levels:
    noisy_data = add_noise(data, numerical_cols, percent)
    # One CSV per noise level, written into output_dir
    file_path = os.path.join(output_dir, f"{percent}%_noisy_data.csv")
    noisy_data.to_csv(file_path, index=False)
    print(f"Data with {percent}% noise saved to {file_path}")

Data with 0% noise saved to processed_data\0%_noisy_data.csv
Data with 1% noise saved to processed_data\1%_noisy_data.csv
Data with 2.5% noise saved to processed_data\2.5%_noisy_data.csv
Data with 5% noise saved to processed_data\5%_noisy_data.csv
Data with 10% noise saved to processed_data\10%_noisy_data.csv
Data with 20% noise saved to processed_data\20%_noisy_data.csv
