# Invoice / Not Invoice Document Classifier
***
### Problem Statement
1. Generate a solution to identify if a document is an invoice
2. Categorize documents that are invoices or not invoices

### Dataset
* Labels &rarr; documents, including invoices
* Important features &rarr; text content (optionally: layout)

### Solution
* Combination of OCR (Optical Character Recognition) and a machine learning model (CNN) for image classification
    * OCR &rarr; extract text and layout information
    * ML model &rarr; classify based on extracted features
    * TensorFlow &rarr; CNN architecture

### Model Evaluation
* Split the dataset into sets &rarr; Training, Validation, Test
* Metrics &rarr; Accuracy, Precision, Recall, F1-score

### Workflow Steps
| Step                | Description                                               | Tool       |
|---------------------|-----------------------------------------------------------|:-----------|
| Pre-Process         | Use OCR to extract text and features from documents       | PyPDF      |
| Feature Engineering | Create features based on text and layout                  | PyPDF      |
| Data Split          | Split data into sets: Training, Testing, Validation       | 70/15/15   |
| Model Training      | Train the model on labeled data                           | TensorFlow |
| Validation          | Tune model parameters based on validation set performance | TensorFlow |
| Testing             | Evaluate the model using test data                        | TensorFlow |
| Deployment          | Integrate into document processing pipeline               | TensorFlow |

### Machine Learning Model Methods
| Step                   | Method                             |
|------------------------|------------------------------------|
| Model Method           | Convolutional Neural Network (CNN) |
| Feature Engineering    | CNN (handles feature engineering)  |
| Optimizer              | Adam                               |
| Learning Rate          | 0.001                              |
| Betas                  | Default                            |
| Loss Function          | MeanSquaredError                   |
| Evaluation Metrics     | RootMeanSquaredError               |
| Hyperparameter Tuning  | BayesianOptimization               |

***

## Extract, Load, Review Images

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import os
import matplotlib.pyplot as plt
from PIL import Image
import re
from sklearn.metrics import mean_squared_error
import numpy as np
import tarfile
import PyPDF2

In [None]:
##-- Extract Images

with tarfile.open('images.tar.gz', 'r:gz') as f:
  f.extractall('images/')

In [4]:
##-- Review Images

def show_imgs(image_files):
    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        img = Image.open(img_path)   
        plt.imshow(img)
        plt.axis('off')
        plt.show()
    return

# show_imgs(image_files[:5])

In [5]:
##-- Review Image Dimensions

def show_img_dims(image_files):
    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        with Image.open(img_path) as img:
            img_size = img.size
        print(f"{img_file}: {img_size}")
    return

# show_img_dims(image_files[:5])

In [None]:
from tensorflow.keras import backend as K
K.clear_session()

image_folder = 'images/images'
image_files = os.listdir(image_folder)

##-- Rescale/standardize images
def rescale_imgs(image_files, standard_size=(224, 224), new_folder="images_rescaled"):

     # Create new directory
    if not os.path.exists(new_folder):
        os.makedirs(new_folder)
        
    rescaled_images={}
    
    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        with Image.open(img_path) as img:
            img_rescaled = img.resize(standard_size).convert("RGB")
            rescaled_images[img_file] = img_rescaled
            save_path = os.path.join(new_folder, img_file)
            img_rescaled.save(save_path)

            
##-- Get list of rescaled image paths
rescaled_image_folder = 'images_rescaled'
rescaled_image_files = [os.path.join(rescaled_image_folder, fname) for fname in os.listdir(rescaled_image_folder)]

##-- Invoice categorization based on file name or text contents from file
def categorize_inv(filename):
    match = re.search(r'_([a-zA-Z]+).jpg$', filename)
    if match:
        try:
            invoice = match.group(1).char.isalpha().lower()==("invoice")
            return invoice
        except:
            with open(filename, 'rb') as f:
                pdf_reader = PyPDF2.PdfFileReader(f)
                text = ""
                for page_num in range(pdf_reader.numPages):
                    page = pdf_reader.getPage(page_num)
                    text += page.extractText()
                invoice = "invoice" in text.lower()
                return invoice
    else:
        return None

##-- Categorize Invoices and assign labels
inv_labels = [categorize_inv(os.path.basename(file)) for file in rescaled_image_files if categorize_inv(os.path.basename(file)) is not None]

dataset = tf.data.Dataset.from_tensor_slices((rescaled_image_files, inv_labels))

def load_and_preprocess_image(image_path, label):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [224, 224])
    image = tf.cast(image, tf.float32) / 255.0  # normalize to [0,1] range
    return image, label

dataset = dataset.map(load_and_preprocess_image)

dataset = dataset.shuffle(buffer_size=len(rescaled_image_files))

batch_size = 32

##-- Count samples & calculate splits
total = len(dataset)
train_size,test_size = int(0.7 * total),int(0.15 * total)
val_size = total - train_size - test_size

##-- Split & batch dataset
train_data = dataset.take(train_size).batch(batch_size)
test_data = dataset.skip(train_size).take(test_size).batch(batch_size)
val_data = dataset.skip(train_size + test_size).batch(batch_size)

## Build CNN Model

### Model Selection, Training, Testing, Validation, & Evaluation

In [None]:
##-- Define CNN architecture & initialize sequential model
def CNN_model(dropout_rate):

    model = Sequential([
        ##-- 1st Convolution Layer: 32 filters of size (3,3) w/ ReLU activation
        Conv2D(32, (3, 3), activation='relu'),

        ##-- 1st Max Pooling Layer
        ##-- Pooling size (2,2)
        MaxPooling2D(2, 2),

        ##-- 2nd Convolution Layer
        ##-- 64 filters of size (3,3) w/ ReLU activation
        Conv2D(64, (3, 3), activation='relu'),

        ##-- 2nd Max Pooling Layer
        ##-- Pooling size (2,2)
        MaxPooling2D(2, 2),

        ##-- 3rd Convolution Layer
        ##-- 128 filters of size (3,3) w/ ReLU activation
        Conv2D(128, (3, 3), activation='relu'),

        ##-- 3rd Max Pooling Layer
        ##-- Pooling size (2,2)
        MaxPooling2D(2, 2),

        ##-- Flatten Layer
        ##-- Flatten 3D output to 1D tensor
        Flatten(),

        ##-- Fully Connected Layer
        ##-- 512 neurons with ReLU activation
        Dense(512, activation='relu'),

        ##-- Dropout Layer
        ##-- Drop to avoid overfitting
        Dropout(dropout_rate),

        ##-- Output Layer
        ##-- 1 neuron with linear activation for regression
        Dense(1, activation='linear')  
    ])

    return model

In [70]:
from bayes_opt import BayesianOptimization

def train_model(dropout_rate):
    model = CNN_model(dropout_rate)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])
    ## model.summary() ##--Summarize model
    history = model.fit(train_data, epochs=10, verbose=0, validation_data=val_data)
    val_rmse = history.history['val_root_mean_squared_error'][-1]
    return val_rmse

# Bounded region of parameter space
pbounds = {'dropout_rate': (0.2, 0.5)}

optimizer = BayesianOptimization(
    f=train_model,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

optimizer.maximize(init_points=10, n_iter=50)

# Best hyperparameters
print(optimizer.max)

|   iter    |  target   | dropou... |
-------------------------------------
| [0m1        [0m | [0m14.03    [0m | [0m0.3251   [0m |
| [95m2        [0m | [95m20.18    [0m | [95m0.4161   [0m |
| [95m3        [0m | [95m23.66    [0m | [95m0.2      [0m |
| [95m4        [0m | [95m35.43    [0m | [95m0.2907   [0m |
| [0m5        [0m | [0m14.78    [0m | [0m0.244    [0m |
| [0m6        [0m | [0m18.96    [0m | [0m0.2277   [0m |
| [0m7        [0m | [0m14.59    [0m | [0m0.2559   [0m |
| [0m8        [0m | [0m14.75    [0m | [0m0.3037   [0m |
| [0m9        [0m | [0m35.05    [0m | [0m0.319    [0m |
| [0m10       [0m | [0m26.72    [0m | [0m0.3616   [0m |
| [0m11       [0m | [0m18.59    [0m | [0m0.2907   [0m |
| [0m12       [0m | [0m20.61    [0m | [0m0.319    [0m |
| [0m13       [0m | [0m21.96    [0m | [0m0.2907   [0m |
| [0m14       [0m | [0m35.33    [0m | [0m0.319    [0m |
| [0m15       [0m | [0m17.65    [0m | [0m0

In [2]:
##-- Train model using optimal dropout rate
best_dropout_rate = optimizer.max['params']['dropout_rate']
final_model = CNN_model(best_dropout_rate)
final_model.compile(optimizer='adam', loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])
final_model.fit(train_data, epochs=10, verbose=1, validation_data=val_data)

##-- Test model to make predictions
def make_prediction(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [224, 224])
    image = tf.cast(image, tf.float32) / 255.0  # normalize to [0,1] range
    image = tf.expand_dims(image, axis=0)  # Add batch dimension
    inv_prediction = final_model.predict(image) 
    return inv_prediction[0][0]

In [None]:
##-- EXAMPLE: Make a prediction for a new image
# print(f"Invoice Prediction: {make_prediction('images_rescaled/new_image.jpg')}")

## Model Monitoring & _pro re nata_ Retraining 

In [None]:
##-- Initialize variables
prev_test_rmse = None
model = None

##-- Set initial drift threshold
drift_threshold = 0.1  

##-- Model drift check: Compares current RMSE to previous RMSE
def check_model_drift(current_test_rmse, prev_test_rmse):
    if prev_test_rmse is None:
        return False
    
    drift_ratio = abs(current_test_rmse - prev_test_rmse) / prev_test_rmse
    return drift_ratio >= drift_threshold

##-- Data drift check: Compares statistics of new data to old data
def check_data_drift(new_data_stats, old_data_stats, threshold=0.05):
    # Calculate drift based on statistics
    drift = np.abs(new_data_stats - old_data_stats) > threshold
    return np.any(drift)

while True:
    
    ##-- Train model
    if model is None:
        
        best_params = optimizer.max['params']
        model = CNN_model(best_params['dropout_rate'])
        model.compile(optimizer='adam', loss='mean_squared_error', metrics=[tf.keras.metrics.RootMeanSquaredError()])
        model.fit(train_data, epochs=10, verbose=0, validation_data=val_data)

    ##-- Test for model drift
    test_predictions = model.predict(test_data)
    test_rmse = np.sqrt(mean_squared_error(test_predictions, test_data))
    
    if check_model_drift(test_rmse, prev_test_rmse):
        print("Model drift detected.")
        model = None  # Set model to None to retrain next iteration
    prev_test_rmse = test_rmse

    ##-- Test for data drift
    new_data_stats = np.mean([data for data, _ in train_data], axis=0)
    if check_data_drift(new_data_stats, old_data_stats):
        print("Data drift detected.")
        model = None  # Set model to None to retrain next iteration
    old_data_stats = new_data_stats

    ##-- Optionally: Add termination condition OR integrate scheduled checks
