In [33]:
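# Required imports.
# NOTE(review): if `parameters` is a notebook (.ipynb) rather than a .py module,
# `import import_ipynb` has to run before `from parameters import *` — confirm.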
import pandas as pd
import numpy as np
import os
from parameters import *
import import_ipynb

In [34]:
# Directory that receives the generated noisy CSV files.
output_dir = 'processed_data'
# exist_ok=True creates the directory only when it is missing, without the
# separate exists() check (which can race with another process creating it).
os.makedirs(output_dir, exist_ok=True)

# load the cleaned dataset that the noise will be applied to
data = pd.read_csv(filepath_or_buffer="Cleaned_data.csv")

# numeric columns that receive noise (see add_noise)
numerical_cols = [
    "Thorax_length",
    "l2", "l3p", "l3d", "lpd", "l3",
    "w1", "w2", "w3",
    "wing_loading",
]

# noise magnitudes to generate, expressed as a percentage of each column's
# standard deviation; 0 produces a noise-free baseline file
noise_levels = [0, 1, 2.5, 5, 10]

In [35]:
# Define the function that adds noise to the data.

# Seed NumPy's global (legacy) random number generator so that the noise drawn
# through np.random.normal in add_noise is repeatable from run to run.
# NOTE(review): RANDOM_SEED is not defined in the visible cells; presumably it
# is provided by `from parameters import *` — confirm.
# The seed is set once, in this cell: re-running only the generation cell
# without re-running this one continues the RNG stream and yields different noise.
np.random.seed(RANDOM_SEED)

def add_noise(data: pd.DataFrame, noise_percent: float, columns=None) -> pd.DataFrame:
    """Return a copy of ``data`` with zero-mean Gaussian noise added to selected columns.

    For each selected column the noise is drawn from a normal distribution
    with mean 0 and standard deviation ``column.std() * noise_percent / 100``.
    The input frame is not modified.

    The noise comes from NumPy's global legacy RNG (``np.random.normal``), so
    the result depends on the current global seed/state. One draw per row is
    made for every column, in the order the columns are listed, even when
    ``noise_percent`` is 0.

    Parameters
    ----------
    data : pd.DataFrame
        Source data; must contain every column that is to be perturbed.
    noise_percent : float
        Noise standard deviation as a percentage of each column's own
        standard deviation. Must not be negative.
    columns : iterable of str, optional
        Columns to perturb. Defaults to the module-level ``numerical_cols``.

    Returns
    -------
    pd.DataFrame
        A new frame with the same shape, index and columns as ``data``.

    Raises
    ------
    ValueError
        If ``noise_percent`` is negative.
    KeyError
        If a requested column is missing from ``data``.
    """
    if noise_percent < 0:
        raise ValueError(f"noise_percent must be non-negative, got {noise_percent}")

    if columns is None:
        # Backward-compatible default: the notebook-level column list.
        columns = numerical_cols

    data_noisy = data.copy()
    for column in columns:

        # spread of the original (un-noised) column
        std = data[column].std()

        # std is NaN when it is undefined (e.g. fewer than two non-null
        # values); treat that as "no spread" so the column is left unchanged
        # instead of being overwritten with NaN.
        if pd.isna(std):
            std = 0.0

        # zero-mean normal noise, one value per row
        noise = np.random.normal(0, std * (noise_percent / 100), size=data[column].shape)

        # plain assignment (rather than +=) so integer columns upcast to float cleanly
        data_noisy[column] = data_noisy[column] + noise

    return data_noisy

In [36]:
# generate noisy data and save to csv

# Re-seed at the top of this cell so that re-running it on its own writes the
# same files every time; add_noise draws from the global np.random state, which
# would otherwise carry over from any earlier run of this loop.
np.random.seed(RANDOM_SEED)

for percent in noise_levels:
    noisy_data = add_noise(data, percent)
    # one file per noise level, named after the percentage
    file_path = os.path.join(output_dir, f"{percent}%_noisy_data.csv")
    noisy_data.to_csv(file_path, index=False)
    print(f"Data with {percent}% noise saved to {file_path}")

[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
Data with 0% noise saved to processed_data\0%_noisy_data.csv
[ 0.00016115  0.0003666   0.00038768 -0.00092475  0.00054087]
[ 0.00123698  0.00028083 -0.00233245 -0.00040615 -0.00051904]
[ 0.00045448 -0.00063789  0.00012956 -0.00048554 -0.00087415]
[ 0.00239053  0.00031734 -0.00048782 -0.00035201 -0.00060274]
[-0.00196361  0.00102168 -0.00061172 -0.00241029  0.00024264]
[ 0.00052872  0.00219279 -0.00123634  0.0037459   0.00091566]
[ 0.00019523 -0.00081143 -0.00044496 -0.00149029  0.00040823]
[ 1.11491377e-03 -3.56601991e-04  1.16252508e-03  4.02731218e-05
  1.16029736e-04]
[-1.27076606e-03  6.85033323e-04 -2.88827553e-04 -1.58829745e-03
 -5.05892747e-05]
[ 0.00039287 -0.00054377 -0.00016838  0.0006127   0.00016856]
Data with 1% noise saved to processed_data\1%_noisy_data.csv
[-0.0005955  -0.00110533 -0.00239382 -0.004437