In [1]:
import os
%pwd

'C:\\Users\\kbged\\Downloads\\mlprojects\\Document_classifier_with_GithubCICD_FASTAPI_AWS\\research'

In [2]:
os.chdir("../")
%pwd

'C:\\Users\\kbged\\Downloads\\mlprojects\\Document_classifier_with_GithubCICD_FASTAPI_AWS'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTrainingValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    unzip_dir: str

In [4]:
from docClassify.constants import *
from docClassify.utils.common import read_yaml, create_directories, compute_metrics, scale_bounding_box, create_bounding_box
from docClassify.utils.common import DocumentClassificationDataset

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_training_validation_config(self) -> DataTrainingValidationConfig:
        config = self.config.data_training_validation

        create_directories([config.root_dir])

        data_training_validation_config = DataTrainingValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            unzip_dir=config.unzip_dir
        )

        return data_training_validation_config

In [6]:
import os
from docClassify.logger import logger

In [13]:
from tqdm.notebook import tqdm
import torch
import pandas as pd
import json
from PIL import Image
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import Dataset, DataLoader


class TrainAndValidate:
    def __init__(self, config: DataTrainingValidationConfig):
        self.feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
        self.tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
        self.processor = LayoutLMv3Processor(self.feature_extractor, self.tokenizer)
        self.config = config
        self.DOCUMENT_CLASSES = sorted(list(map(lambda p: p.name,Path(self.config.unzip_dir).glob("*"))))    

    def get_train_test_path(self):
        # Convert PosixPath objects to strings
        image_paths=sorted(list(Path(self.config.unzip_dir).glob("*/*.png")))
        image_paths_str = [str(path) for path in image_paths]
        
        # Define labels based on whether the paths contain specific strings
        income_labels = ["income" in path for path in image_paths_str]
        balance_labels = ["balance" in path for path in image_paths_str]
        cashflow_labels = ["cashflow" in path for path in image_paths_str]
        
        # Use any one of the labels as the target for stratified split
        # Here, I'm using income_labels, but you can choose based on your requirements
        train_images_str, test_images_str = train_test_split(image_paths_str, test_size=0.2, stratify=income_labels, random_state=42)
        
        # Convert back to PosixPath objects
        train_images = [Path(path) for path in train_images_str]
        test_images = [Path(path) for path in test_images_str]

        return train_images, test_images
    
    def train(self, train_images, test_images):
        train_dataset = DocumentClassificationDataset(train_images, self.processor)
        valid_dataset = DocumentClassificationDataset(test_images, self.processor)
        
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=1,
            shuffle=True,
            #num_workers=10
        )
        
        valid_dataloader = DataLoader(
            valid_dataset,
            batch_size=1,
            shuffle=False,
            #num_workers=10
        )

        device = "cuda:0" if torch.cuda.is_available() else "cpu"

        n_classes = len(self.DOCUMENT_CLASSES)
        
        model = LayoutLMv3ForSequenceClassification.from_pretrained(
                    "microsoft/layoutlmv3-base",
                    num_labels=n_classes
                )
        model.to(device)

        # load seqeval metric
        #metric = evaluate.load("seqeval")
        model.config.id2label = {k: v for k, v in enumerate(self.DOCUMENT_CLASSES)}
        model.config.label2id = {v: k for k, v in enumerate(self.DOCUMENT_CLASSES)}
        # labels of the model
        ner_labels = list(model.config.id2label.values())
        
        num_epochs = 1
        optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
        
        # Initialize an empty DataFrame to store the metrics
        columns = ["Epoch", "Training Loss", "Validation Loss", "Precision", "Recall", "F1", "Accuracy"]
        df_metrics = pd.DataFrame(columns=columns)
        
        # Early stopping parameters
        patience = 3 # Number of epochs to wait for improvement
        best_validation_loss = float('inf')
        current_patience = 0
        
        for epoch in range(num_epochs):
            print("Epoch:", epoch)
        
            # Training
            model.train()
            training_loss = 0.0
            num = 0
            for batch in tqdm(train_dataloader):
                labels = torch.Tensor(batch["labels"]).unsqueeze_(0).long().to(device)
                outputs = model(
                    input_ids=batch["input_ids"].to(device),
                    attention_mask=torch.tensor(batch["attention_mask"]).to(device),
                    bbox=torch.tensor(batch["bbox"]).to(device),
                    pixel_values=torch.tensor(batch["pixel_values"]).to(device),
                    labels=batch["labels"].to(device)
                )
                loss = outputs.loss
                training_loss += loss.item()
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                num += 1
        
            print("Training Loss:", training_loss / num)
        
            # Validation
            model.eval()
            preds = []
            labs = []
            validation_loss = 0.0
            num = 0
            for batch in tqdm(valid_dataloader):
                labels = torch.Tensor(batch["labels"]).to(device)
                outputs = model(
                    input_ids=batch["input_ids"].to(device),
                    attention_mask=torch.tensor(batch["attention_mask"]).to(device),
                    bbox=torch.tensor(batch["bbox"]).to(device),
                    pixel_values=torch.tensor(batch["pixel_values"]).to(device),
                    labels=labels
                )
                loss = outputs.loss
                preds_idx = outputs.logits.argmax(axis=1)
                labs.append(labels.tolist())
                preds.append(preds_idx.tolist())
                validation_loss += loss.item()
                num += 1
        
            print("Validation Loss:", validation_loss / num)
            print(preds)
            print(labs)
        
            overall_precision, overall_recall, overall_f1, overall_accuracy = compute_metrics([preds, labs])
            print("Overall Precision:", overall_precision)
            print("Overall Recall:", overall_recall)
        
            # Store metrics in the DataFrame
            metrics_data = {
                "Epoch": epoch,
                "Training Loss": training_loss,
                "Validation Loss": validation_loss,
                "Precision": overall_precision,
                "Recall": overall_recall,
                "F1": overall_f1,
                "Accuracy": overall_accuracy
            }
            #df_metrics = df_metrics.append(metrics_data, ignore_index=True)
            df_metrics.loc[len(df_metrics)] = metrics_data
        
            # Early stopping check
            if validation_loss < best_validation_loss:
                best_validation_loss = validation_loss
                current_patience = 0
            else:
                current_patience += 1
                if current_patience >= patience:
                    print(f"Early stopping! No improvement in validation loss for {patience} consecutive epochs.")
                    break
        
        # Save the DataFrame to a CSV file or do any further analysis
        df_metrics.to_csv("metrics.csv", index=False)
        print(df_metrics)

        return df_metrics
        
        # Convert DataFrame to markdown
        #markdown_table = df_metrics.to_markdown()
        
        # Print the markdown table
        #print(markdown_table)


        


In [14]:
try:
    config = ConfigurationManager()
    data_training_validation_config = config.get_data_training_validation_config()
    #data_training_validation = DataTrainingValidationConfig(config=data_training_validation_config)
    train_and_validate = TrainAndValidate(data_training_validation_config)
    train_images, test_images = train_and_validate.get_train_test_path()
    df = train_and_validate.train(train_images, test_images)
    #data_training_validation.prepare_all_files()
except Exception as e:
    raise e

[2024-02-06 12:03:27,659: INFO: common: yaml file: src\docClassify\config\config.yaml loaded successfully]
[2024-02-06 12:03:27,665: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-06 12:03:27,668: INFO: common: created directory at: artifacts]
[2024-02-06 12:03:27,672: INFO: common: created directory at: artifacts/data_training_validation]


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0


  0%|          | 0/40 [00:00<?, ?it/s]

  attention_mask=torch.tensor(batch["attention_mask"]).to(device),
  bbox=torch.tensor(batch["bbox"]).to(device),
  pixel_values=torch.tensor(batch["pixel_values"]).to(device),


Training Loss: 1.1226830914616586


  0%|          | 0/10 [00:00<?, ?it/s]

  attention_mask=torch.tensor(batch["attention_mask"]).to(device),
  bbox=torch.tensor(batch["bbox"]).to(device),
  pixel_values=torch.tensor(batch["pixel_values"]).to(device),


Validation Loss: 1.0879846274852754
[[0], [0], [2], [2], [2], [2], [2], [0], [2], [2]]
[[0], [1], [2], [0], [0], [2], [2], [1], [1], [0]]
Overall Precision: 0.26190476190476186
Overall Recall: 0.4
   Epoch  Training Loss  Validation Loss  Precision  Recall        F1  \
0      0      44.907324        10.879846   0.261905     0.4  0.294286   

   Accuracy  
0       0.4  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
test_images
sorted(list(map(lambda p: p.name,test_images[0].parent.parent.glob("*"))))

['balance sheet', 'cashflow', 'income statement']

In [13]:
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(feature_extractor, tokenizer)



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from pathlib import Path

# Convert PosixPath objects to strings
image_paths=sorted(list(Path("artifacts/data_ingestion/data/").glob("*/*.png")))
image_paths_str = [str(path) for path in image_paths]

# Define labels based on whether the paths contain specific strings
income_labels = ["income" in path for path in image_paths_str]
balance_labels = ["balance" in path for path in image_paths_str]
cashflow_labels = ["cashflow" in path for path in image_paths_str]

# Use any one of the labels as the target for stratified split
# Here, I'm using income_labels, but you can choose based on your requirements
train_images_str, test_images_str = train_test_split(image_paths_str, test_size=0.2, stratify=income_labels, random_state=42)

# Convert back to PosixPath objects
train_images = [Path(path) for path in train_images_str]
test_images = [Path(path) for path in test_images_str]

DOCUMENT_CLASSES = sorted(list(map(
    lambda p: p.name,
    Path("artifacts/data_ingestion/data/").glob("*")
)))
DOCUMENT_CLASSES
test_images

[WindowsPath('artifacts/data_ingestion/data/balance sheet/bs7.png'),
 WindowsPath('artifacts/data_ingestion/data/cashflow/cf9.png'),
 WindowsPath('artifacts/data_ingestion/data/income statement/is4.png'),
 WindowsPath('artifacts/data_ingestion/data/balance sheet/bs16.png'),
 WindowsPath('artifacts/data_ingestion/data/balance sheet/bs15.png'),
 WindowsPath('artifacts/data_ingestion/data/income statement/is8.png'),
 WindowsPath('artifacts/data_ingestion/data/income statement/is2.png'),
 WindowsPath('artifacts/data_ingestion/data/cashflow/cf5.png'),
 WindowsPath('artifacts/data_ingestion/data/cashflow/cf12.png'),
 WindowsPath('artifacts/data_ingestion/data/balance sheet/bs3.png')]

In [26]:
train_dataset = DocumentClassificationDataset(train_images, processor)
valid_dataset = DocumentClassificationDataset(test_images, processor)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    #num_workers=10
)

valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=1,
    shuffle=False,
    #num_workers=10
)

In [30]:
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor, LayoutLMv3ForSequenceClassification

device = "cuda:0" if torch.cuda.is_available() else "cpu"

n_classes = len(DOCUMENT_CLASSES)

model = LayoutLMv3ForSequenceClassification.from_pretrained(
            "microsoft/layoutlmv3-base",
            num_labels=n_classes
        )
model.to(device)

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LayoutLMv3ForSequenceClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Enco

In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tqdm.notebook import tqdm
import torch
import pandas as pd
import json
from PIL import Image


# load seqeval metric
#metric = evaluate.load("seqeval")
model.config.id2label = {k: v for k, v in enumerate(DOCUMENT_CLASSES)}
model.config.label2id = {v: k for k, v in enumerate(DOCUMENT_CLASSES)}
# labels of the model
ner_labels = list(model.config.id2label.values())

num_epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)

# Initialize an empty DataFrame to store the metrics
columns = ["Epoch", "Training Loss", "Validation Loss", "Precision", "Recall", "F1", "Accuracy"]
df_metrics = pd.DataFrame(columns=columns)

# Early stopping parameters
patience = 3 # Number of epochs to wait for improvement
best_validation_loss = float('inf')
current_patience = 0

for epoch in range(num_epochs):
    print("Epoch:", epoch)

    # Training
    model.train()
    training_loss = 0.0
    num = 0
    for batch in tqdm(train_dataloader):
        labels = torch.Tensor(batch["labels"]).unsqueeze_(0).long().to(device)
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=torch.tensor(batch["attention_mask"]).to(device),
            bbox=torch.tensor(batch["bbox"]).to(device),
            pixel_values=torch.tensor(batch["pixel_values"]).to(device),
            labels=batch["labels"].to(device)
        )
        loss = outputs.loss
        training_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        num += 1

    print("Training Loss:", training_loss / num)

    # Validation
    model.eval()
    preds = []
    labs = []
    validation_loss = 0.0
    num = 0
    for batch in tqdm(valid_dataloader):
        labels = torch.Tensor(batch["labels"]).to(device)
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=torch.tensor(batch["attention_mask"]).to(device),
            bbox=torch.tensor(batch["bbox"]).to(device),
            pixel_values=torch.tensor(batch["pixel_values"]).to(device),
            labels=labels
        )
        loss = outputs.loss
        preds_idx = outputs.logits.argmax(axis=1)
        labs.append(labels.tolist())
        preds.append(preds_idx.tolist())
        validation_loss += loss.item()
        num += 1

    print("Validation Loss:", validation_loss / num)
    print(preds)
    print(labs)

    overall_precision, overall_recall, overall_f1, overall_accuracy = compute_metrics([preds, labs])
    print("Overall Precision:", overall_precision)
    print("Overall Recall:", overall_recall)

    # Store metrics in the DataFrame
    metrics_data = {
        "Epoch": epoch,
        "Training Loss": training_loss,
        "Validation Loss": validation_loss,
        "Precision": overall_precision,
        "Recall": overall_recall,
        "F1": overall_f1,
        "Accuracy": overall_accuracy
    }
    df_metrics = df_metrics.append(metrics_data, ignore_index=True)

    # Early stopping check
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        current_patience = 0
    else:
        current_patience += 1
        if current_patience >= patience:
            print(f"Early stopping! No improvement in validation loss for {patience} consecutive epochs.")
            break

# Save the DataFrame to a CSV file or do any further analysis
df_metrics.to_csv("metrics.csv", index=False)
df_metrics

# Convert DataFrame to markdown
markdown_table = df_metrics.to_markdown()

# Print the markdown table
print(markdown_table)

Epoch: 0


  0%|          | 0/40 [00:00<?, ?it/s]

  attention_mask=torch.tensor(batch["attention_mask"]).to(device),
  bbox=torch.tensor(batch["bbox"]).to(device),
  pixel_values=torch.tensor(batch["pixel_values"]).to(device),


Training Loss: 1.086370076239109


  0%|          | 0/10 [00:00<?, ?it/s]

  attention_mask=torch.tensor(batch["attention_mask"]).to(device),
  bbox=torch.tensor(batch["bbox"]).to(device),
  pixel_values=torch.tensor(batch["pixel_values"]).to(device),


Validation Loss: 1.083499014377594
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [1], [2], [0], [0], [2], [2], [1], [1], [0]]


NameError: name 'compute_metrics' is not defined