In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

def data_augmentation(X, y, target_size):
    """
    Grow a dataset to ``target_size`` rows by appending perturbed copies of
    randomly chosen original samples.

    Each synthetic row is built by drawing one original sample at random
    (independently for every row, so the same sample may be drawn repeatedly),
    passing it through ``apply_augmentation``, and reusing the drawn sample's
    label unchanged. The original rows come first in the output, followed by
    the synthetic rows.

    Args:
        X (numpy array): Feature matrix of shape (n_samples, n_features).
        y (numpy array): Target labels of shape (n_samples,).
        target_size (int): Desired total number of rows after augmentation.
            Must be at least ``n_samples``; this function never removes rows.

    Returns:
        X_augmented (numpy array): Feature matrix of shape (target_size, n_features).
            Synthetic rows are stored in a float buffer, so integer-valued
            ``X`` is promoted to a floating-point dtype by the concatenation.
        y_augmented (numpy array): Target labels of shape (target_size,), with
            the same dtype as ``y``.

    Raises:
        ValueError: If ``target_size`` is smaller than the number of existing
            samples, or if rows must be added but ``X`` has no samples to
            draw from.
    """

    num_samples = X.shape[0]
    num_augmentations = target_size - num_samples

    # Fail early with an actionable message instead of letting np.zeros
    # complain about a negative dimension.
    if num_augmentations < 0:
        raise ValueError(
            "target_size ({}) must be at least the number of existing "
            "samples ({}); this function only adds rows.".format(
                target_size, num_samples
            )
        )

    # With no originals there is nothing to perturb.
    if num_augmentations > 0 and num_samples == 0:
        raise ValueError(
            "Cannot augment an empty dataset: there are no samples to draw from."
        )

    # Pre-allocate the synthetic part; it is filled row by row below.
    X_new = np.zeros((num_augmentations, X.shape[1]))
    y_new = np.zeros(num_augmentations, dtype=y.dtype)

    for i in range(num_augmentations):
        # Randomly select an original sample
        index = np.random.choice(num_samples)

        # Perturb the features; the label is carried over as-is
        X_new[i] = apply_augmentation(X[index])
        y_new[i] = y[index]

    # Original rows first, synthetic rows after
    X_augmented = np.concatenate((X, X_new), axis=0)
    y_augmented = np.concatenate((y, y_new), axis=0)

    return X_augmented, y_augmented

def apply_augmentation(sample, noise_scale=0.1):
    """
    Return a perturbed copy of a single sample.

    The only technique applied is additive Gaussian noise: an independent
    draw from a zero-mean normal distribution is added to every element.
    The input is not modified.

    Args:
        sample (numpy array): Single sample to be augmented.
        noise_scale (float): Standard deviation of the Gaussian noise added
            to each element. Defaults to 0.1. The same scale is used for every
            element, so it is most meaningful when features share a comparable
            range. A value of 0 returns the sample's values unchanged.

    Returns:
        augmented_sample (numpy array): Augmented sample, same shape as ``sample``.
    """

    # np.shape accepts any array-like, not just objects exposing `.shape`
    noise = np.random.normal(loc=0, scale=noise_scale, size=np.shape(sample))

    return sample + noise


In [2]:
# Settings for this augmentation run
INPUT_PATH = './data/encoded_data.csv'
OUTPUT_PATH = './data/augmented_data.csv'
TARGET_COLUMN = 'success'
TARGET_SIZE = 1000  # total number of rows wanted after augmentation
SEED = 42

# The augmentation helpers draw from NumPy's global RNG; seeding it makes
# re-running this cell produce the same augmented file.
np.random.seed(SEED)

# Load the dataset
data = pd.read_csv(INPUT_PATH)

# Split into a feature frame and the target vector
features = data.drop(TARGET_COLUMN, axis=1)
X = features.to_numpy()
y = data[TARGET_COLUMN].to_numpy()

X_augmented, y_augmented = data_augmentation(X, y, TARGET_SIZE)

# Label the columns from the feature frame itself rather than assuming the
# target is the last column of the CSV; otherwise features get mislabelled
# whenever the target sits anywhere else.
augmented_data = pd.DataFrame(X_augmented, columns=features.columns)
augmented_data[TARGET_COLUMN] = y_augmented

# Export the augmented data to a CSV file
augmented_data.to_csv(OUTPUT_PATH, index=False)