In [1]:
import os
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet

# Root folder of the project; the data paths in the cells below are built from it
# with os.path.join(BASE_PATH, "data", ...).
# A raw string literal cannot end in a single backslash, hence the escaped pair at the
# end -- the resulting value carries two trailing separators, which then show up in
# every joined path.
# NOTE(review): absolute, user-specific location -- consider making it configurable
# before sharing the notebook.
BASE_PATH = r"C:\Users\Impana\Downloads\invoice-classification\\"

# Fetch the two NLTK corpora imported above (the stopword lists and WordNet).
# The final call is left as a bare expression, so its return value is what the
# notebook displays as the cell result.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Impana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Impana\AppData\Roaming\nltk_data...


True

In [2]:
# English stopword vocabulary from NLTK, held as a set for fast membership tests.
stop_words = {*stopwords.words('english')}

# A run of anything that is not a word character, plus digits and underscores,
# acts as a single token separator (whitespace is covered by \W).
_SEPARATOR_RUN = re.compile(r'[\W\d_]+')


@lru_cache(maxsize=None)
def _in_wordnet(word: str) -> bool:
    """Return True when WordNet has at least one synset for ``word``.

    Memoised: the same tokens recur across many rows, and a WordNet lookup
    is far more expensive than a cache hit.
    """
    return bool(wordnet.synsets(word))


def clean_text(text: str, *, stopword_set=None, is_known_word=None) -> str:
    """Normalise a free-text description into a space-separated string of known words.

    Steps:
      1. Missing values (``None``, float NaN, ``pd.NA``) yield ``''``. Without this
         guard ``str(NaN)`` turns into the literal token "nan", which may survive
         the vocabulary filter and masquerade as real text.
      2. The text is lower-cased and split on runs of digits, punctuation/symbols,
         underscores and whitespace. Underscores are treated as separators too,
         because ``\\w`` counts ``_`` as a word character and would otherwise leave
         ``foo_bar`` glued together.
      3. Tokens are kept only if they are longer than one character, are not
         stopwords, and are accepted by the vocabulary check.

    Args:
        text: Raw description. Non-string values are converted with ``str()``.
        stopword_set: Optional container of tokens to drop. Defaults to the
            notebook-level ``stop_words`` set.
        is_known_word: Optional predicate ``token -> bool`` deciding whether a token
            is kept. Defaults to a cached WordNet synset lookup.

    Returns:
        The cleaned text, or ``''`` when nothing survives the filters.
    """
    # Float NaN is the only float that is not equal to itself.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''

    blocked = stop_words if stopword_set is None else stopword_set
    known = _in_wordnet if is_known_word is None else is_known_word

    tokens = _SEPARATOR_RUN.sub(' ', str(text).lower()).split()
    # Cheapest checks first so the vocabulary lookup only runs on plausible tokens.
    return ' '.join(
        tok for tok in tokens
        if len(tok) > 1 and tok not in blocked and known(tok)
    )


In [3]:
# Stage 1 -> stage 2: clean the raw descriptions and persist the result.
in_path = os.path.join(BASE_PATH, "data", "D01_raw.csv")
out_path = os.path.join(BASE_PATH, "data", "D02_cleaned.csv")

df = pd.read_csv(in_path)
print("Before cleaning:", df.shape)

# Clean every description, keep only rows that still have text afterwards,
# then discard rows that exactly repeat an earlier row.
df = (
    df.assign(description=df['description'].apply(clean_text))
      .loc[lambda frame: frame['description'].str.len().gt(0)]
      .drop_duplicates()
)
print("After cleaning:", df.shape, "labels:", df['label'].nunique())

df.to_csv(out_path, index=False)
print("Saved:", out_path)


Before cleaning: (5566, 2)
After cleaning: (4419, 2) labels: 36
Saved: C:\Users\Impana\Downloads\invoice-classification\\data\D02_cleaned.csv


In [4]:
from sklearn.model_selection import train_test_split

cleaned_path = os.path.join(BASE_PATH, "data", "D02_cleaned.csv")
df = pd.read_csv(cleaned_path)

# CSV round-trip guard: read_csv's default NA parsing turns a cell whose whole text is
# a token such as "nan" or "null" into a float NaN. Such rows (and rows without a
# label) cannot be used for training, so drop them here instead of letting NaN leak
# into the train/test files.
df = df.dropna(subset=['description', 'label'])

# drop labels that appear only once -- a stratified split needs at least two rows
# per label. Counting via value_counts keeps the original row order and avoids
# calling a Python lambda once per group.
label_counts = df['label'].map(df['label'].value_counts())
df = df[label_counts > 1]
print("After removing single-instance labels:", df.shape)

# Fixed seed so the same rows land in train/test on every run; stratifying on the
# label keeps the label proportions similar in both parts.
train_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df['label'],
    random_state=42
)

print("Train:", train_df.shape, "Test:", test_df.shape)

train_df.to_csv(os.path.join(BASE_PATH, "data", "D2train.csv"), index=False)
test_df.to_csv(os.path.join(BASE_PATH, "data", "D2test.csv"), index=False)
print("Saved D2train and D2test")


After removing single-instance labels: (4417, 2)
Train: (3754, 2) Test: (663, 2)
Saved D2train and D2test
