In [1]:
import random
import pandas as pd
from sklearn.utils import shuffle

# Define common extra keywords to use when perturbing or padding a sequence
common_keywords = ["Set", "Send", "Reset", "Verify", "Check", "Power", "Switch", "Read", "Write"]

# How many synthetic samples per original?
AUGMENT_PER_SAMPLE = 3


def augment_tokens(tokens, num_steps, keywords):
    """Return a randomly perturbed copy of *tokens* holding ``num_steps`` items.

    The copy is shuffled, then with probability 0.5 a random keyword is
    appended, then with probability 0.3 one random token is removed (only
    when more than one token is present). Finally the list is padded with
    random keywords or truncated from the end until its length matches
    ``num_steps``.

    Args:
        tokens: Sequence of step keyword strings; it is not modified.
        num_steps: Target length of the returned list.
        keywords: Non-empty sequence of keywords to draw extra tokens from.

    Returns:
        A new list of tokens.
    """
    augmented = list(tokens)

    # Randomly shuffle or modify
    random.shuffle(augmented)
    if random.random() < 0.5:
        augmented.append(random.choice(keywords))
    if len(augmented) > 1 and random.random() < 0.3:
        augmented.pop(random.randint(0, len(augmented) - 1))

    # Truncate or pad to match num_steps. Comparison-based loops are kept on
    # purpose: they also work when num_steps was parsed as a float.
    while len(augmented) < num_steps:
        augmented.append(random.choice(keywords))
    while len(augmented) > num_steps:
        augmented.pop()
    return augmented


def augment_rows(frame, per_sample, keywords):
    """Build ``per_sample`` synthetic rows for every row of *frame*.

    Args:
        frame: DataFrame with the columns ``test_id``, ``num_steps``,
            ``step_tokens`` (list of strings), ``duration`` and ``result``.
        per_sample: Number of synthetic rows to create per original row.
        keywords: Keywords passed through to :func:`augment_tokens`.

    Returns:
        A list of dicts, one per synthetic row, with the keys ``test_id``,
        ``num_steps``, ``step_keywords``, ``duration`` and ``result``.
    """
    rows = []
    for _, row in frame.iterrows():
        for copy_idx in range(1, per_sample + 1):
            tokens = augment_tokens(row["step_tokens"], row["num_steps"], keywords)
            rows.append({
                # Number each copy so synthetic IDs stay unique; the f-string
                # also copes with IDs that were parsed as numbers.
                "test_id": f"{row['test_id']}_aug{copy_idx}",
                "num_steps": row["num_steps"],
                "step_keywords": " ".join(tokens),
                "duration": row["duration"] * random.uniform(0.9, 1.1),  # slight variation
                "result": row["result"],
            })
    return rows


# The guard keeps the file I/O out of the way when the helpers are imported;
# run as a notebook cell or script, the names below are still module globals.
if __name__ == "__main__":
    # Load the original parsed dataset
    df = pd.read_csv("parsed_test_cases.csv")
    df.dropna(subset=["step_keywords", "num_steps", "duration", "result"], inplace=True)

    # Split step keywords into tokens
    df["step_tokens"] = df["step_keywords"].apply(lambda x: x.split())

    augmented_rows = augment_rows(df, AUGMENT_PER_SAMPLE, common_keywords)

    # Combine with original. The helper column "step_tokens" is left out so
    # the saved CSV has the same columns for original and synthetic rows.
    df_aug = pd.DataFrame(augmented_rows)
    df_combined = pd.concat([df.drop(columns="step_tokens"), df_aug], ignore_index=True)
    df_combined = shuffle(df_combined).reset_index(drop=True)

    # Save
    df_combined.to_csv("parsed_test_cases_augmented.csv", index=False)
    print("✅ Augmented dataset saved as parsed_test_cases_augmented.csv")


✅ Augmented dataset saved as parsed_test_cases_augmented.csv
