In [1]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet

# Project root. Set the INVOICE_CLF_BASE_PATH environment variable to run the
# notebook on another machine; the default below is the original local checkout.
# The trailing "\\" is two literal backslashes (a raw string cannot end in a
# single backslash); os.path.join accepts the value as-is.
BASE_PATH = os.environ.get(
    "INVOICE_CLF_BASE_PATH",
    r"C:\Users\Impana\Downloads\invoice-classification\\",
)

# Fetch the WordNet corpus that nltk.corpus.wordnet reads from.
nltk.download('wordnet')

d3_path = os.path.join(BASE_PATH, "data", "D3train.csv")
if not os.path.isfile(d3_path):
    # Fail early with a hint instead of a bare pandas error.
    raise FileNotFoundError(
        f"Training file not found: {d3_path!r}. "
        "Set INVOICE_CLF_BASE_PATH to the project root."
    )
d3 = pd.read_csv(d3_path)

# The augmentation cells below read exactly these two columns.
missing_cols = {"description", "label"} - set(d3.columns)
if missing_cols:
    raise KeyError(f"{d3_path!r} is missing required column(s): {sorted(missing_cols)}")

print(d3.shape, d3['label'].nunique())

(7508, 2) 34


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Impana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def get_synonym(word):
    """Return a randomly chosen WordNet lemma for ``word``.

    Every lemma of every synset found for ``word`` is a candidate, with
    underscores in the lemma name replaced by spaces. Candidates equal to
    ``word`` (compared case-insensitively) are dropped. Duplicates are kept,
    so a lemma that occurs in several synsets is proportionally more likely
    to be drawn.

    The draw uses ``np.random.choice`` and therefore depends on NumPy's
    global random state.

    Returns ``word`` unchanged when WordNet has no synsets for it or when no
    candidate differs from it.
    """
    matches = wordnet.synsets(word)
    if not matches:
        return word

    target = word.lower()
    candidates = []
    for synset in matches:
        for lemma in synset.lemmas():
            name = lemma.name().replace('_', ' ')
            if name.lower() != target:
                candidates.append(name)

    if not candidates:
        return word
    return np.random.choice(candidates)

def augment_sentence_wn(text, aug_frac=0.1):
    """Replace a random subset of whitespace-separated words with WordNet synonyms.

    Parameters
    ----------
    text : object
        The sentence to augment. Non-missing values are converted with
        ``str()`` and split on whitespace.
    aug_frac : float, default 0.1
        Fraction of words to replace. At least one word is always selected,
        and never more than the number of words available.

    Returns
    -------
    The augmented sentence with words re-joined by single spaces. ``text`` is
    returned unchanged when it is missing (``None`` / NaN) or contains no words.

    Notes
    -----
    Word positions are drawn with ``np.random.choice`` (global NumPy random
    state). A selected word may come back unchanged if ``get_synonym`` finds
    no replacement for it.
    """
    # str(NaN) is "nan" and str(None) is "None"; without this guard a missing
    # description would be treated as a real one-word sentence and "augmented".
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return text

    words = str(text).split()
    if not words:
        return text

    # Cap at len(words): sampling without replacement raises ValueError when
    # asked for more positions than exist (e.g. aug_frac > 1).
    n_to_aug = min(len(words), max(1, int(len(words) * aug_frac)))
    idxs = np.random.choice(len(words), n_to_aug, replace=False)
    for i in idxs:
        words[i] = get_synonym(words[i])
    return ' '.join(words)

In [3]:
# Grow the D3 training set to `target_size` rows by appending WordNet-augmented
# copies of randomly re-sampled rows, then write the result to CSV.
SEED = 42
target_size = 100_000
current = len(d3)
needed = target_size - current
print("Current:", current, "Needed:", needed)

# augment_sentence_wn / get_synonym draw from NumPy's global random state, which
# `random_state` on DataFrame.sample does not control. Seed it so the saved file
# is reproducible. Note: this resets the global NumPy RNG for later cells too.
np.random.seed(SEED)

aug_rows = []
if needed > 0:
    # Random sample (with replacement) from D3 to use as augmentation sources.
    base_sample = d3.sample(needed, replace=True, random_state=SEED)
    # Iterate the two columns directly; avoids building a Series per row.
    aug_rows = [
        {'description': augment_sentence_wn(description), 'label': label}
        for description, label in zip(base_sample['description'], base_sample['label'])
    ]

# Explicit columns keep the frame well-formed even when nothing was generated.
aug_df = pd.DataFrame(aug_rows, columns=['description', 'label'])
d3_wn100k = pd.concat([d3, aug_df], ignore_index=True)
assert len(d3_wn100k) == max(current, target_size), "unexpected augmented dataset size"
print("Final size:", len(d3_wn100k))

out_path = os.path.join(BASE_PATH, "data", "D3_WNtrain100k.csv")
d3_wn100k.to_csv(out_path, index=False)
print("Saved:", out_path)

Current: 7508 Needed: 92492
Final size: 100000
Saved: C:\Users\Impana\Downloads\invoice-classification\\data\D3_WNtrain100k.csv
