In [1]:
import pandas as pd
import os
import numpy as np
import re
import nltk
from nltk.corpus import stopwords, wordnet

# Project root directory. Defaults to the original author's local checkout;
# set the INVOICE_BASE_PATH environment variable to run this notebook on
# another machine without editing the cell.
# NOTE: the raw-string default ends in two literal backslashes. It is kept
# unchanged because os.path.join() still produces a usable path from it.
BASE_PATH = os.environ.get(
    "INVOICE_BASE_PATH",
    r"C:\Users\Impana\Downloads\invoice-classification\\",
)

# Fetch the NLTK corpora imported above (stop-word list and WordNet).
# nltk.download() is a no-op when the package is already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Impana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Impana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# English stop words from NLTK, held in a set so the per-token
# `w not in stop_words` check in clean_text is a constant-time lookup.
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Reduce free text to a space-separated string of dictionary words.

    The text is lower-cased, digit runs and punctuation are replaced by
    spaces, and a token is kept only if it is longer than one character,
    is not an English stop word, and has at least one WordNet synset.

    Args:
        text: Raw text. Non-string values are coerced with ``str()``;
            ``None`` and float NaN (the missing-value marker pandas puts
            in object columns) are treated as empty input.

    Returns:
        The cleaned text, or ``""`` when no token survives the filters.
    """
    # Missing values must not be stringified: str(None) / str(nan) would
    # otherwise be cleaned as if "none" / "nan" were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'\d+', ' ', text)      # drop digits
    text = re.sub(r'[^\w\s]', ' ', text)  # drop punctuation (underscore is \w and stays)

    # Single pass over the tokens; the cheap checks run first so the
    # WordNet lookup is only made for tokens that passed them.
    kept = [
        w for w in text.split()
        if len(w) > 1 and w not in stop_words and wordnet.synsets(w)
    ]
    return ' '.join(kept)

def augment_label(label_text: str, rng: "np.random.RandomState | None" = None) -> str:
    """Return a lightly perturbed version of a cleaned label.

    The label is cleaned with ``clean_text`` and split into words. Labels
    with zero or one word are returned as-is (cleaned). Otherwise one
    random operation is applied: either two distinct words are swapped,
    or one word is dropped.

    Args:
        label_text: Raw label text.
        rng: Optional ``np.random.RandomState`` (or any object exposing
            ``choice`` and ``randint`` with the same signatures) to draw
            from. When ``None`` the global ``np.random`` state is used,
            which matches the previous behaviour.

    Returns:
        The augmented label as a space-separated string; ``""`` if the
        cleaned label has no words.
    """
    words = clean_text(label_text).split()

    # Nothing to swap or drop with fewer than two words.
    if len(words) <= 1:
        return words[0] if words else ""

    rand = np.random if rng is None else rng

    # simple random op: swap or drop one word
    op = rand.choice(['swap', 'drop'])
    if op == 'swap':
        i, j = rand.choice(len(words), 2, replace=False)
        words[i], words[j] = words[j], words[i]
    else:
        k = rand.randint(len(words))
        words.pop(k)

    return ' '.join(words)


In [3]:
# Load original cleaned train set
train_path = os.path.join(BASE_PATH, "data", "D2train.csv")
train_df = pd.read_csv(train_path)

print("D2train:", train_df.shape)

# The augmentation below draws from the global NumPy RNG (inside
# augment_label and when picking the token), so seed it here: without a
# fixed seed D3train.csv would differ on every Restart & Run All.
np.random.seed(42)

# Generate one auxiliary word from label
aug_words = []
for lbl in train_df['label']:
    aug = augment_label(lbl)
    tokens = aug.split()
    if len(tokens) == 0:
        # Label had no usable words after cleaning: no auxiliary word.
        aug_words.append("")
    else:
        aug_words.append(tokens[np.random.randint(len(tokens))])

train_df['aux_word'] = aug_words

# Enriched description = original description + auxiliary label word.
# When aux_word is empty, the trailing space is stripped and the enriched
# text equals the (stripped) original description.
train_df['description_enriched'] = (
    train_df['description'].astype(str) + " " + train_df['aux_word'].astype(str)
).str.strip()

# D3train = both original and enriched descriptions (doubles size), as in the paper.
d3train = pd.concat(
    [
        train_df[['description', 'label']],
        train_df[['description_enriched', 'label']].rename(columns={'description_enriched': 'description'})
    ],
    ignore_index=True
)

print("D3train shape:", d3train.shape)

out_path = os.path.join(BASE_PATH, "data", "D3train.csv")
d3train.to_csv(out_path, index=False)
print("Saved:", out_path)


D2train: (3754, 2)
D3train shape: (7508, 2)
Saved: C:\Users\Impana\Downloads\invoice-classification\\data\D3train.csv
