In [None]:
import os
import torch
import pandas as pd
import nlpaug.augmenter.word as naw
from sklearn.model_selection import KFold
from config import load_config
from data_prep import read_data
from augment import balance_dataset

In [None]:
# Load the augmentation settings from the YAML config file
augmentation_config = load_config("configs/augmentation_config.yaml")

# Resolve the compute device: an explicit "cuda"/"cpu" in the config wins;
# any other value means auto-detect (CUDA when torch reports it, else CPU).
if augmentation_config["device"] in ("cuda", "cpu"):
    device = augmentation_config["device"]
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the dataset that will be augmented
df = read_data(augmentation_config["data_path"])

# Ensure the output directory exists (no error if it is already there)
os.makedirs(augmentation_config["output_dir"], exist_ok=True)

In [None]:
# Define augmentation model (contextual word-embedding augmenter from nlpaug),
# configured entirely from the augmentation config and the resolved device.
augmenter = naw.ContextualWordEmbsAug(
    model_path=augmentation_config["augmenter_model"],
    action=augmentation_config["augment_action"],
    device=device,
)

# K-Fold splitting & augmentation.
# Only the *test* indices of each split are used below: KFold's test folds are
# disjoint and together cover every row, so this partitions `df` into
# `n_splits` shuffled subsets that are balanced independently.
kf = KFold(
    n_splits=augmentation_config["n_splits"],
    shuffle=True,
    random_state=augmentation_config["random_seed"],
)

# Read the flag once, before the loop, so a missing config key fails before any
# augmentation work is done. The equality test is kept on purpose: only
# True (or 1) enables saving, so a stray string such as "false" does not.
save_intermediate = augmentation_config["save_intermediate_file"] == True  # noqa: E712

balanced_datasets = []
for i, (_, test_index) in enumerate(kf.split(df), start=1):
    subset = df.iloc[test_index]
    # NOTE(review): presumably balances the "numeric_label" classes by augmenting
    # the "quote" text column — confirm against augment.balance_dataset.
    balanced_subset = balance_dataset(subset, "numeric_label", "quote", augmenter)
    if save_intermediate:
        balanced_path = os.path.join(
            augmentation_config["output_dir"], f"df_balanced{i}.csv"
        )
        balanced_subset.to_csv(balanced_path, index=False)
        print(f"Augmented subset {i} saved to {balanced_path}")
    else:
        # Nothing was written, so there is no path to report. Previously this
        # branch fell through to a print that referenced `balanced_path`,
        # raising NameError (or reporting a stale path from an earlier run).
        print(f"Augmented subset {i} created (intermediate file not saved)")
    balanced_datasets.append(balanced_subset)


In [None]:
# Stack every balanced subset into a single frame with a fresh row index
combined_balanced = pd.concat(
    balanced_datasets,
    ignore_index=True,
)

# Destination path for the combined balanced dataset inside the output directory
output_path = os.path.join(
    augmentation_config["output_dir"],
    "aug_balanced_train.csv",
)