In [None]:
import os, pandas as pd

# Root folder of the project on the author's machine.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
# The raw literal ends in `\\` because a raw string cannot end in a single
# backslash; the resulting value therefore ends with two backslashes.
BASE_PATH = r"C:\Users\Impana\Downloads\invoice-classification\\"
OUT_PATH = os.path.join(BASE_PATH, "data", "sroie", "D01_sroie_sectors.csv")

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

# NOTE(review): `df` is not created in this cell — presumably it is built by an
# earlier cell that is not visible here; confirm this survives Restart & Run All.
df.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


In [None]:
import re
import nltk
from nltk.corpus import stopwords, wordnet

# Fetch the NLTK resources used by the text-cleaning step below:
# the stop-word lists and the WordNet corpus.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# English stop words as a set, so membership checks during cleaning are cheap.
stop_words = {*stopwords.words('english')}

def clean_text(text: str) -> str:
    """Normalise raw text into a space-separated string of recognised words.

    Steps, in order:
      1. lower-case and strip the input;
      2. replace digit runs and every character that is neither a word
         character nor whitespace with a space;
      3. keep only tokens that are longer than one character, are not in
         ``stop_words``, and have at least one WordNet synset.

    Args:
        text: Raw text. Non-string values are stringified; ``None`` and float
            NaN (pandas' marker for an empty cell) are treated as missing.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when nothing
        survives or the input is missing.
    """
    # Missing values would otherwise be stringified ("nan" / "none") and then
    # processed as if they were ordinary words from the document.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower().strip()
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)

    # str.split() already collapses whitespace runs, so one pass over the
    # tokens is enough. The cheap checks come first so the WordNet lookup only
    # runs for tokens that passed them.
    kept = [
        w for w in text.split()
        if len(w) > 1 and w not in stop_words and wordnet.synsets(w)
    ]
    return ' '.join(kept)

# Reload the CSV written to OUT_PATH and attach a cleaned copy of each row's
# `text` column as `text_clean`.
df = pd.read_csv(OUT_PATH)
df = df.assign(text_clean=df['text'].apply(clean_text))

# Drop rows whose cleaned text is empty, then drop fully duplicated rows.
has_text = df['text_clean'].str.len() > 0
df = df.loc[has_text].drop_duplicates()

print(df.shape, df['category'].value_counts())

# Persist the cleaned frame next to the input file.
clean_path = os.path.join(BASE_PATH, "data", "sroie", "D02_sroie_cleaned.csv")
df.to_csv(clean_path, index=False)
print("Saved:", clean_path)


In [None]:
from sklearn.model_selection import train_test_split

# Reload the cleaned dataset from disk.
df = pd.read_csv(clean_path)

# Output locations for the two splits.
train_path = os.path.join(BASE_PATH, "data", "sroie", "D2_sroie_train.csv")
test_path  = os.path.join(BASE_PATH, "data", "sroie", "D2_sroie_test.csv")

# Keep only categories that occur at least twice.
# NOTE(review): presumably needed because the stratified split below cannot
# place a single-row class in both partitions — confirm.
df = df.groupby('category').filter(lambda grp: len(grp) >= 2)
print("After dropping single-instance classes:", df['category'].value_counts())

# 85/15 split, stratified on the label column, with a fixed seed.
train_df, test_df = train_test_split(
    df, test_size=0.15, random_state=42, stratify=df['category']
)

print("Train:", train_df.shape, "Test:", test_df.shape)
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)
print("Saved train/test")


In [None]:
import numpy as np

def augment_label(label_text: str) -> str:
    """Return a randomly perturbed version of a cleaned label.

    The label is passed through ``clean_text`` and split into words. With no
    words the result is ``""``; with one word that word is returned unchanged.
    Otherwise one of two operations is chosen at random: swap two distinct
    word positions, or remove one word.

    Args:
        label_text: The label to perturb.

    Returns:
        The perturbed words joined by single spaces.
    """
    tokens = clean_text(label_text).split()

    # Nothing to perturb when there are fewer than two words.
    if len(tokens) <= 1:
        return tokens[0] if tokens else ""

    action = np.random.choice(['swap', 'drop'])
    if action == 'swap':
        # Two distinct positions, exchanged in place.
        a, b = np.random.choice(len(tokens), 2, replace=False)
        tokens[a], tokens[b] = tokens[b], tokens[a]
    else:
        # 'drop': remove one word at a random position.
        del tokens[np.random.randint(len(tokens))]

    return ' '.join(tokens)

# The augmentation below draws from NumPy's global RNG. Seed it so the
# generated D3 file is identical on every run (same value as the
# random_state used for the train/test split).
np.random.seed(42)

train_df = pd.read_csv(train_path)


def _pick_aux_word(label) -> str:
    """Return one random word from an augmented copy of `label` ("" if none)."""
    tokens = augment_label(label).split()
    return tokens[np.random.randint(len(tokens))] if tokens else ""


# One auxiliary word per training row, derived from that row's category label.
# NOTE(review): the auxiliary word comes from the label itself, so the
# enriched text carries label information — confirm this is intentional.
aux_words = [_pick_aux_word(lbl) for lbl in train_df['category']]

train_df['aux_word'] = aux_words
train_df['text_enriched'] = (train_df['text_clean'] + " " + train_df['aux_word']).str.strip()

# D3 stacks the original cleaned rows on top of their enriched counterparts,
# both exposed under a common `text` column (twice the number of training rows).
d3 = pd.concat(
    [
        train_df[['text_clean', 'category']].rename(columns={'text_clean': 'text'}),
        train_df[['text_enriched', 'category']].rename(columns={'text_enriched': 'text'})
    ],
    ignore_index=True
)

print("D3 shape:", d3.shape)
d3_path = os.path.join(BASE_PATH, "data", "sroie", "D3_sroie_train.csv")
d3.to_csv(d3_path, index=False)
print("Saved:", d3_path)
