In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Project root. Set the INVOICE_BASE_PATH environment variable to run the
# notebook on another machine; when it is unset the original local default
# below is used, so existing behavior is unchanged.
BASE_PATH = os.environ.get(
    "INVOICE_BASE_PATH",
    r"C:\Users\Impana\Downloads\invoice-classification\\",
)
DATA_DIR = os.path.join(BASE_PATH, "data")

# Training and held-out test sets, both read from <BASE_PATH>/data.
train_df = pd.read_csv(os.path.join(DATA_DIR, "D3_WNtrain100k.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "D2test.csv"))

# Fail fast with a clear message if a file lacks the two columns that the
# following cells read ('description' as text input, 'label' as target).
for _split_name, _frame in (("train", train_df), ("test", test_df)):
    _missing = {"description", "label"} - set(_frame.columns)
    if _missing:
        raise KeyError(f"{_split_name} CSV is missing required columns: {sorted(_missing)}")

print("Train:", train_df.shape, "Test:", test_df.shape)

Train: (100000, 2) Test: (663, 2)


In [3]:
# ---- Raw text inputs ------------------------------------------------------
# Every description is cast to str before it is handed to the tokenizer.
X_train_text = train_df['description'].astype(str).to_list()
X_test_text  = test_df['description'].astype(str).to_list()

# ---- Integer-encoded targets ----------------------------------------------
# The encoder is fitted on the training labels only; the test labels are
# mapped with that same fitted encoder.
train_labels = train_df['label'].astype(str).values
test_labels  = test_df['label'].astype(str).values

le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test  = le.transform(test_labels)
num_classes = len(le.classes_)
print("Classes:", num_classes)

# ---- Tokenisation ---------------------------------------------------------
max_words = 30000   # upper bound on the tokenizer vocabulary
max_len = 40        # invoice descriptions are short

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)  # vocabulary comes from training text only


def _to_padded_ids(texts):
    """Turn a list of strings into a post-padded id matrix of width max_len."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=max_len, padding='post')


X_train_seq = _to_padded_ids(X_train_text)
X_test_seq  = _to_padded_ids(X_test_text)

# Size of the embedding table: number of known words plus one extra index,
# capped at max_words.
known_words = len(tokenizer.word_index)
vocab_size = min(max_words, known_words + 1)
print("Vocab size:", vocab_size)

Classes: 34
Vocab size: 3471


In [4]:
embedding_dim = 128

# Bidirectional-LSTM text classifier:
#   token ids -> embedding -> BiLSTM -> dense head -> softmax over classes.
#
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument: that argument is deprecated (and ignored)
# in Keras 3, which leaves the model unbuilt when `model.summary()` is called.
# An explicit Input works on both Keras 2 and Keras 3.
model = Sequential([
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim),
    Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Sparse categorical cross-entropy because the targets are integer class ids
# (LabelEncoder output), not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

model.summary()

batch_size = 256
epochs = 10  # upper bound; early stopping below may end training sooner

# Stop once validation loss no longer improves meaningfully and keep the best
# weights seen, instead of always paying for all `epochs` passes.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=2,
    restore_best_weights=True,
)

# NOTE(review): `validation_split` holds out the *last* 10% of the rows as
# given (before shuffling). If the training CSV is ordered (e.g. by label),
# shuffle it first so the validation set is representative — confirm.
history = model.fit(
    X_train_seq, y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=1
)




Epoch 1/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 200ms/step - accuracy: 0.8790 - loss: 0.4663 - val_accuracy: 0.9956 - val_loss: 0.0172
Epoch 2/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 203ms/step - accuracy: 0.9957 - loss: 0.0181 - val_accuracy: 0.9999 - val_loss: 0.0012
Epoch 3/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 202ms/step - accuracy: 0.9990 - loss: 0.0054 - val_accuracy: 1.0000 - val_loss: 1.9831e-04
Epoch 4/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 193ms/step - accuracy: 0.9994 - loss: 0.0031 - val_accuracy: 1.0000 - val_loss: 8.2994e-05
Epoch 5/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 192ms/step - accuracy: 0.9995 - loss: 0.0022 - val_accuracy: 1.0000 - val_loss: 2.3704e-05
Epoch 6/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 194ms/step - accuracy: 0.9998 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 2.481

In [None]:
# Class probabilities for every test row, then the most likely class id.
y_pred_prob = model.predict(X_test_seq, verbose=0)
y_pred = np.argmax(y_pred_prob, axis=1)

# Only the classes that actually occur in the test labels are reported, so
# `labels` and `target_names` are restricted to (and aligned on) those ids.
labels_test = np.unique(y_test)

# `classification_report` is already imported in the notebook's import cell,
# so it is not re-imported here.
print(classification_report(
    y_test,
    y_pred,
    labels=labels_test,
    target_names=le.classes_[labels_test]
))

              precision    recall  f1-score   support

  CLASS-1248       1.00      1.00      1.00         1
  CLASS-1249       1.00      1.00      1.00        18
  CLASS-1250       1.00      1.00      1.00        27
  CLASS-1274       1.00      1.00      1.00       142
  CLASS-1294       1.00      1.00      1.00         4
  CLASS-1309       1.00      1.00      1.00         5
  CLASS-1322       1.00      1.00      1.00         4
  CLASS-1376       1.00      1.00      1.00        52
  CLASS-1429       1.00      1.00      1.00         5
  CLASS-1477       1.00      1.00      1.00         7
  CLASS-1522       1.00      1.00      1.00       104
  CLASS-1567       1.00      1.00      1.00        10
  CLASS-1652       1.00      1.00      1.00         2
  CLASS-1688       1.00      1.00      1.00         1
  CLASS-1721       1.00      1.00      1.00        15
  CLASS-1758       1.00      1.00      1.00       144
  CLASS-1770       1.00      1.00      1.00         3
  CLASS-1805       1.00    