#### Dependencies

In [1]:
# !pip install pandas
# !pip install numpy
# !pip install opencv-python
# !pip install pytesseract
# !dnf install tesseract-ocr
# !dnf install poppler-utils
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !pip install PyMuPDF  # provides the `fitz` module; the PyPI package literally named "fitz" is an unrelated project
# !pip install PyPDF2 pdf2image


# Creating a model that takes in the images and processes them


## Image Processing

### Collect all PDFs and convert them to images


In [2]:
import os
import glob
import PyPDF2
from pdf2image import convert_from_path

# Folder that holds the source PDFs
directory_path = './Prod files/'

# Glob pattern selecting the PDFs inside that folder
pdf_extension = '*.pdf'

# Folder that receives one PNG per PDF page; created if it does not exist yet
output_directory = './extracted_images/'
os.makedirs(output_directory, exist_ok=True)

def extract_images_from_pdf(pdf_path, output_dir):
    """Render every page of one PDF to a PNG file inside ``output_dir``.

    Pages are rasterised with ``pdf2image.convert_from_path`` and written as
    ``<pdf file stem>page<N>.png`` with ``N`` starting at 1.

    Args:
        pdf_path: Path of the PDF to render.
        output_dir: Directory that receives the PNG files.

    Returns:
        None. Any failure is reported with ``print`` and swallowed so that a
        single bad PDF does not abort the surrounding batch loop.
    """
    try:
        stem = os.path.splitext(os.path.basename(pdf_path))[0]

        # pdf2image reads the file itself, so no separate PyPDF2 pass over the
        # document is needed just to iterate its pages.
        for page_num, image in enumerate(convert_from_path(pdf_path), start=1):
            image_filename = os.path.join(output_dir, f'{stem}page{page_num}.png')
            image.save(image_filename, 'PNG')

    except Exception as e:
        print(f"Error extracting images from {pdf_path}: {str(e)}")

# Find PDF files in the directory
pdf_files = glob.glob(os.path.join(directory_path, pdf_extension))

# Render each PDF in turn; per-file errors are printed by the function itself
for pdf_path in pdf_files:
    extract_images_from_pdf(pdf_path, output_directory)

print("Image extraction completed.")


Image extraction completed.


## Extract the text from the images

In [3]:
import pytesseract
import cv2
import os
import glob

# Folder holding the page images to run OCR on
directory_path = "./extracted_images/"

# Glob patterns for the image types to be captured
image_extensions = ['*.jpg', '*.png']

# One [image_name, ocr_text] entry per processed image
image_data = []

# Running count of processed images, printed as a progress indicator
i = 0

# Tesseract configuration options (you can customize these)
custom_config = r'--oem 3 --psm 6'

for extension in image_extensions:
    # glob returns files in arbitrary order; sorting keeps image_data reproducible
    for img_path in sorted(glob.glob(os.path.join(directory_path, extension))):
        name = os.path.splitext(os.path.basename(img_path))[0]

        # cv2.imread signals an unreadable/corrupt file by returning None
        # rather than raising, which would make cvtColor fail further down.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Skipping unreadable image: {img_path}")
            continue

        # Grayscale + Otsu threshold to hand Tesseract a clean binary image
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

        # Additional pre-processing steps can be added here (e.g., resizing, denoising)

        text = pytesseract.image_to_string(threshold_img, config=custom_config, lang='eng')

        image_data.append([name, text])
        i += 1
        print('Image', i)


Image 1
Image 2
Image 3
Image 4
Image 5
Image 6
Image 7
Image 8
Image 9
Image 10


## Save the extracted text in a DataFrame

In [4]:
import pandas as pd
# Tabulate the OCR results (one row per image) and persist the table twice,
# as CSV and as JSON, next to the source PDFs.
text = pd.DataFrame(data=image_data, columns=("image_name", "text"))
text.to_csv(path_or_buf="./Prod files/extracted_text.csv")
text.to_json(path_or_buf="./Prod files/extracted_text.json")

## Begin the Training process

### This also entails analysing your data and understanding it

### The formats of the extracted photos look as follows

## Creating mock entries for the data

In [5]:
# Notebook-wide accumulator; starts empty and receives [id, invoice_text]
# pairs from the mock-data generation further down.
full_list = list()

### Dependencies

In [6]:
import random
from faker import Faker
# Module-level Faker instance; companygen() reads it to produce company names
faker = Faker()

### Chandarana

#### Chandarana Format:
Doc No: Numbers

Supplier Ref No: Numbers

Gross: KES +/- Numbers

Net: KES - Numbers


#### usable functions

In [7]:
def supplierrefgen():
    """Return a mock supplier reference as a digit string.

    A year (2000-2023), a month (1-12) and a serial number (1-100000000) are
    drawn in that order and concatenated without separators or zero padding.
    """
    parts = (
        random.randint(2000, 2023),     # year
        random.randint(1, 12),          # month
        random.randint(1, 100000000),   # serial number
    )
    return "".join(str(part) for part in parts)

In [8]:
def docnogen():
    """Return a mock document number: a random integer in 1..1000000, as text."""
    return f"{random.randint(1, 1000000)}"

In [9]:
def grossgen():
    """Return a mock gross amount such as ``KES 5321`` or ``KES -5321``.

    The sign is picked first (negative or none), then a whole amount in
    100..100000.
    """
    sign = random.choice(["-", ""])
    value = random.randint(100, 100000)
    return f"KES {sign}{value}"

In [10]:
def netgen():
    """Return a mock net amount; always negative, e.g. ``KES -5321`` (100..100000)."""
    return "KES -" + str(random.randint(100, 100000))

#### Creating mock data

In [11]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker (the generation loop below does not use this instance)
fake = Faker()

# Generated invoice texts: one [id, text] entry per invoice
mock_invoices = []

# Field values of every generated row: [id, docno, supplier_ref, gross, net]
labels = []

# Separator placed between the generated table rows inside the invoice body
newline = "\n\t"

# Generate and store 1000 mock invoices with ids 1..1000
for i in range(1000):
    invoice_id = i + 1

    # 50 table rows per invoice; each row is also recorded as a label record
    rows = []
    for _ in range(50):
        docno = docnogen()
        supplier_ref = supplierrefgen()
        gross = grossgen()
        net = netgen()
        rows.append(f"{docno} {supplier_ref} {gross} {net}")
        labels.append([invoice_id, docno, supplier_ref, gross, net])

    invoice_text = f"""
    Remittance Advice - Chandarana Dec 21page1,"CHANDARANA SUPERMARKET LTD
    30386 Paid By: Accounts
    KENYA

    REMITTANCE ADVICE 1149720 Original
    Ht [Date [Paid Doc. | Doc. No. Supplier Ref No Gross Amount Net Amount
    {newline.join(rows)} 
    KES 11,891,067.58 KES 9,030.65] KES 11,882,036.93
    Transfer Date: Bank Trans. Total: KES 11,882,036.93
    18/01/2022
    Total Amount: KES 11,882,036.93
    Name Of Collector / ID Date Collected Signature of Vendor |
    """

    mock_invoices.append([invoice_id, invoice_text])
    # full_list is created in another cell, so re-running this cell on a live
    # kernel appends a further 1000 entries to it.
    full_list.append([invoice_id, invoice_text])

# Print the first mock invoice for reference (will use the list index for id)
# print(mock_invoices[0])
print("Finished")

# One row per invoice: id and full invoice text
chandaranamockdf = pd.DataFrame(mock_invoices, columns=("id", "sentence"))

# One row per generated table line, keyed by the invoice id
chandaranalabel = pd.DataFrame(labels,columns=("id","document_no","supplier_ref","gross","net"))


Finished


In [12]:
# Persist the Chandarana mock invoices and their row labels, without the index column
chandaranamockdf.to_csv(path_or_buf='./Prod files/chandaranamock.csv', index=False)
chandaranalabel.to_csv(path_or_buf="./Prod files/chandaranalabels.csv", index=False)

### Carrefour

#### Carrefour Format:
Company Name : Name

Inv Nos : Numbers, or Numbers/Letters+Digit/Letters (e.g. 12345678/AB1/XYZ)

Date: dd-mm-yy

Remarks : Numbers/DEFAULT AGREEMENT

Amt: Number +/-

Company_name Total : Numbers
            

In [13]:
from faker import Faker
import random
# Rebinds the module-level Faker instance that companygen() below reads
faker = Faker()

def companygen():
    """Return a random company name from the module-level ``faker`` instance."""
    return faker.company()


In [14]:
def invgen():
    """Return a mock invoice number in one of two shapes, chosen at random.

    * structured: ``<8 digits>/<2 uppercase letters><1 digit>/<3 uppercase letters>``
    * plain:      ``<8 digits>``

    Both candidates are always generated before one of them is picked.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def eight_digits():
        return str(random.randint(10000000, 99999999))

    def letters(count):
        return ''.join(random.choice(alphabet) for _ in range(count))

    # Structured candidate, built left to right
    prefix = eight_digits()
    middle = letters(2)
    digit = str(random.randint(0, 9))  # a single digit, 0-9
    suffix = letters(3)
    structured = f"{prefix}/{middle}{digit}/{suffix}"

    # Plain candidate
    plain = eight_digits()

    return random.choice([structured, plain])



In [15]:
def dategen():
    """Return a mock Carrefour invoice date formatted ``dd-mm-yy``.

    Day and month are zero-padded and the year is reduced to two digits so the
    value matches the documented Carrefour format and the ``28-02-22`` style
    date in the invoice template. Days stop at 28 so every month/day
    combination is valid.

    Note: a later cell in this notebook defines another ``dategen`` (with
    ``/`` separators); once that cell runs, it replaces this function.
    """
    day = random.randint(1, 28)
    month = random.randint(1, 12)
    year = random.randint(2000, 2040)
    return f"{day:02d}-{month:02d}-{year % 100:02d}"


In [16]:
def remarksgen():
    """Return a mock remarks field, which may be empty.

    First one of two agreement texts is chosen -- a percentage variant
    (``DEFAULT AGREEMENT (<0-100>%) 00<1-3> <100-199>``) or the fixed
    ``DEFAULT AGREEMENT (EXEMPTED)`` -- then that text or an empty string is
    returned with equal chance.
    """
    pct = random.randint(0, 100)
    code = random.randint(1, 3)
    serial = random.randint(100, 199)
    candidates = [
        f"DEFAULT AGREEMENT ({pct}%) 00{code} {serial}",
        "DEFAULT AGREEMENT (EXEMPTED)",
    ]
    chosen = random.choice(candidates)
    return random.choice([chosen, ""])


 

In [17]:
def amountgen():
    """Return a mock line amount: a random integer in 100..10000, as text."""
    return f"{random.randint(100, 10000)}"

In [18]:
def totalgen():
    """Return a mock total: a random integer in 10000..999999, as text."""
    return f"{random.randint(10000, 999999)}"

In [19]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker (the generation loop below does not use this instance)
fake = Faker()

# Generated invoice texts: one [id, text] entry per invoice
mock_invoices = []

# Per invoice line: [id, company, invoice no, date, remark, amount]
invlabels = []
# Per company section: [id, company, total]
totallabels = []

# Separator placed between the generated rows inside the invoice body
newline = "\n\t"

# Generate and store 1000 mock invoices with ids 1..1000
for i in range(1000):
    invoice_id = i + 1

    rows = []
    for _ in range(50):  # 50 company sections per invoice
        companyname = companygen()
        total = totalgen()
        rows.append(f"{companyname}")
        for _ in range(10):  # 10 invoice lines per company section
            invnos = invgen()
            date = dategen()
            remark = remarksgen()
            amt = amountgen()
            rows.append(f"{invnos} {date} {remark} {amt}")
            invlabels.append([invoice_id, companyname, invnos, date, remark, amt])
        rows.append(f"{companyname} Total : {total}")
        totallabels.append([invoice_id, companyname, total])

    invoice_text = f"""
    Payment Attachment Page - 1
    Payment No. : 130595
    AICNo: 2298452 UNGALIMITED Payment Dt.: 28-02-22
    ee AMoUNtPaid =
    Company Invoice Number Invoice Date Remarks
    (KES)
    {newline.join(rows)} 
    UNGA LIMITED Total : $,898,752.53
    """

    mock_invoices.append([invoice_id, invoice_text])

# Print the first mock invoice for reference (will use the list index for id)
# print(mock_invoices[0])
print("Finished")

# One row per invoice: id and full invoice text
carrefourmockdf = pd.DataFrame(mock_invoices, columns=("id", "sentence"))

# NOTE(review): this table is built from `labels`, a list this cell never
# fills -- it holds whatever an earlier cell left under that name (the
# Chandarana rows when the notebook is run top to bottom), hence the
# Chandarana column names. It is left unchanged here because the
# pattern-building cell further down indexes exactly these columns; switching
# it to the Carrefour rows needs a matching change there.
carrefourlabel = pd.DataFrame(labels,columns=("id","document_no","supplier_ref","gross","net"))

# The invoice-line labels actually generated by the loop above
carrefourinvlabel = pd.DataFrame(
    invlabels, columns=("id", "name", "invoice_no", "date", "remark", "amount")
)

# One row per company section: id, company name and its total
carrefourtotals = pd.DataFrame(totallabels,columns=('id',"name","totals"))


Finished


In [20]:
# Persist the Carrefour mock invoices and the label table, without the index column
carrefourmockdf.to_csv(path_or_buf='./Prod files/carrefourmock.csv', index=False)
carrefourlabel.to_csv(path_or_buf="./Prod files/carrefourlabels.csv", index=False)

### Jumra

#### Jumra Format:
Paid doc : A/P

Doc No : Credit memo/Invoice

Date: dd/mm/yyyy

Invoice amount : +/- Numbers

Withheld: +/- Numbers

Amt : +/- Numbers

outstanding: +/- Numbers


Document amount due: KES Numbers

Total amount due: KES Numbers


`To be done later`

Account Name: Bank name

Transfer date: dd/mm/yy

reference: rtgs

Amount: Numbers


Bank transfer total: KES Numbers


In [21]:
import random
from faker import Faker
# Rebinds the module-level Faker instance (same name as in earlier cells)
faker = Faker()

def docgen():
    """Return the document type of a mock row: ``Credit memo`` or ``Invoice``."""
    kinds = ["Credit memo", "Invoice"]
    return random.choice(kinds)


In [22]:
def invnosgen():
    """Return a mock document number: ``N`` followed by an integer in 20000000..29999999."""
    return "N" + str(random.randint(20000000, 29999999))

# Show one generated value as a quick visual check
x = invnosgen()
print(x)

N28468421


In [23]:
def dategen():
    """Return a mock invoice date formatted ``dd/mm/yyyy``.

    Day and month are zero-padded so generated dates have the same shape as
    the ``10/02/2022`` dates in the invoice template. Days stop at 28 so every
    month/day combination is valid.

    Note: an earlier cell in this notebook defines a ``dategen`` that uses
    ``-`` separators; executing this cell replaces that definition.
    """
    day = random.randint(1, 28)
    month = random.randint(1, 12)
    year = random.randint(2000, 2040)
    return f"{day:02d}/{month:02d}/{year}"


In [24]:
def invamtgen():
    """Return a mock invoice amount as text, e.g. ``-41263.51`` or ``41263.50``.

    The sign (negative or none) is picked first, then a value in
    10000..99999. The value is always rendered with exactly two decimals;
    ``str(round(x, 2))`` would drop trailing zeros (``5047.6``).
    """
    sign = random.choice(["-", ""])
    amount = random.uniform(10000, 99999)
    return f"{sign}{amount:.2f}"

# Show one generated value as a quick visual check
x = invamtgen()
print(x)



-41263.51


In [25]:
def witheldamtgen():
    """Return a mock withheld amount as text, e.g. ``-441.35`` or ``12.50``.

    The sign (negative or none) is picked first, then a value in 1..1000.
    The value is always rendered with exactly two decimals so trailing zeros
    are kept.
    """
    sign = random.choice(["-", ""])
    amount = random.uniform(1, 1000)
    return f"{sign}{amount:.2f}"

# Show one generated value as a quick visual check
x = witheldamtgen()
print(x)

-441.35


In [26]:
def amtgen():
    """Return a mock paid amount as text, e.g. ``89479.74`` or ``-89479.70``.

    The sign (negative or none) is picked first, then a value in
    10000..99999. The value is always rendered with exactly two decimals so
    trailing zeros are kept.
    """
    sign = random.choice(["-", ""])
    amount = random.uniform(10000, 99999)
    return f"{sign}{amount:.2f}"

# Show one generated value as a quick visual check
x = amtgen()
print(x)

89479.74


In [27]:
def outstandingamtgen():
    """Return a mock outstanding amount as text, e.g. ``-5047.60``.

    The sign (negative or none) is picked first, then a value in 1000..9999.
    The value is always rendered with exactly two decimals;
    ``str(round(x, 2))`` would print ``5047.6`` for such a value.
    """
    sign = random.choice(["-", ""])
    amount = random.uniform(1000, 9999)
    return f"{sign}{amount:.2f}"

# Show one generated value as a quick visual check
x = outstandingamtgen()
print(x)

-5047.6


In [28]:
def amountdue():
    """Return a mock amount due as text: a value in 100000..9999999.

    The value is always rendered with exactly two decimals so trailing zeros
    are kept.
    """
    amount = random.uniform(100000, 9999999)
    return f"{amount:.2f}"

# Show one generated value as a quick visual check
x = amountdue()
print(x)


5197525.05


In [29]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker (the generation loop below does not use this instance)
fake = Faker()

# Generated invoice texts: one [id, text] entry per invoice
mock_invoices = []

# Per table row: [id, paid, doc, nos, date, invamt, withheld, amt, outstanding]
invlabels = []
# Per invoice: [id, company, total]
totallabels = []

# Separator placed between the generated rows inside the invoice body
newline = "\n\t"

# Generate and store 1000 mock invoices with ids 1..1000
for i in range(1000):
    invoice_id = i + 1

    total = totalgen()
    companyname = companygen()

    rows = []
    for _ in range(50):  # 50 table rows per invoice
        paid = "A/P"
        doc = docgen()
        nos = invnosgen()
        date = dategen()
        invamt = invamtgen()
        withheld = witheldamtgen()
        amt = amtgen()
        outstanding = outstandingamtgen()
        rows.append(f"{paid} {doc} {nos} {date} {invamt} {withheld} {amt} {outstanding}")
        # Rows go into this cell's own list. Appending to the shared `labels`
        # list mixes them with the 5-field rows an earlier cell stored there,
        # which then end up in jumralabel under the wrong column names.
        invlabels.append([invoice_id, paid, doc, nos, date, invamt, withheld, amt, outstanding])

    totallabels.append([invoice_id, companyname, total])

    invoice_text = f"""
    Jumbo complex, Mogadishu Road 7, 1523 10/02/2022
    P.O Box 543 - 00606 _— TT
    , Kenya Account. PIN Number - Supplier
    PIN: P051414072z S086
    Cheque Ne.
    Haco Tiger Brands (EA) Ltd
    ""P O Box 43903, 00100"" ;
    Haco Tiger Brands (EA) Ltd
    ""P O Box 43903, 00100""
    Payments Currency: KES
    # Paid Doc Doc. No. Date Invoice Amount Withheld Amnt Amount Outstanding
    {newline.join(rows)} 
    Bank Transfer
    Account Name Transfer Date Reference Amount
    VICTORIA COMMERCIAL BANK1 10/02/2022 rtgs 4,072,474.25
    Bank Transfer Total: KES 4,072,474.25
    Outgoing Payments - S086
    Signature: Date: Total Payment Amount: KES 4,072,474.25
    """

    mock_invoices.append([invoice_id, invoice_text])

# Print the first mock invoice for reference (will use the list index for id)
# print(mock_invoices[0])
print("Finished")

# One row per invoice: id and full invoice text
jumradf = pd.DataFrame(mock_invoices, columns=("id", "sentence"))

# One row per generated table line, keyed by the invoice id
jumralabel = pd.DataFrame(invlabels,columns=("id","paid","doc","nos","date","invamt","withheld","amt","outstanding"))

# One row per invoice: id, company name and total
jumratotals = pd.DataFrame(totallabels,columns=("id","name","totals"))


Finished


In [30]:
# Persist the Jumra mock invoices and their row labels, without the index column
jumradf.to_csv(path_or_buf='./Prod files/jumramock.csv', index=False)
jumralabel.to_csv(path_or_buf="./Prod files/jumralabels.csv", index=False)

## PREPARATION FOR TRAINING

In [31]:
# Display the column dtypes of the Carrefour totals table
carrefourtotals.dtypes

id         int64
name      object
totals    object
dtype: object

In [32]:
# The df that are to be trained 
# jumradf
# chandaranamockdf
# carrefourmockdf
# jumratotals
# carrefourtotals
# jumralabel
# chandaranalabel
# carrefourlabel

import spacy

# Blank English pipeline whose only component is a rule-based entity ruler
nlp = spacy.blank("en")

ruler = nlp.add_pipe("entity_ruler")


def _phrase_patterns(frame, column_labels):
    """Build entity-ruler phrase patterns from the columns of a label table.

    Args:
        frame: DataFrame holding the generated field values.
        column_labels: Mapping of column name -> entity label. Columns are
            visited in mapping order for each row, so patterns come out
            row by row.

    Returns:
        A list of ``{"label": ..., "pattern": ...}`` dicts. Only non-empty
        string cells are used (anything else is not a phrase pattern), and
        each (label, text) pair is emitted once, in first-seen order.
    """
    entity_labels = list(column_labels.values())
    columns = [frame[name].tolist() for name in column_labels]
    seen = set()
    patterns = []
    for row in zip(*columns):
        for label, value in zip(entity_labels, row):
            if not isinstance(value, str) or not value:
                continue
            key = (label, value)
            if key in seen:
                continue
            seen.add(key)
            patterns.append({"label": label, "pattern": value})
    return patterns


jumrapatterns = _phrase_patterns(jumralabel, {
    "paid": "Paid",
    "doc": "Document type",
    "nos": "Invoice Number",
    "date": "Invoice Date",
    "invamt": "Invoice amount",
    "withheld": "Withheld amount",
    "amt": "Amount",
    "outstanding": "Outstanding Amount",
})
jumrapatterns += _phrase_patterns(jumratotals, {"totals": "Grand Total amount"})

chandaranapatterns = _phrase_patterns(chandaranalabel, {
    "document_no": "Document Number",
    "supplier_ref": "Supplier reference",
    "gross": "Gross amount",
    "net": "Net amount",
})

carrefourpatterns = _phrase_patterns(carrefourlabel, {
    "document_no": "Document Number",
    "supplier_ref": "Supplier reference",
    "gross": "Gross amount",
    "net": "Net amount",
})
carrefourpatterns += _phrase_patterns(carrefourtotals, {"totals": "Grand total amount"})

# Registration order is kept: Carrefour, then Chandarana, then Jumra
ruler.add_patterns(carrefourpatterns)
ruler.add_patterns(chandaranapatterns)
ruler.add_patterns(jumrapatterns)




### Creating training and entity data

In [33]:
def _annotate(frame):
    """Run the rule-based pipeline over every invoice text in ``frame``.

    Args:
        frame: DataFrame with a ``sentence`` column of invoice texts.

    Returns:
        A list of ``[text, {"entities": [(label, start_char, end_char), ...]}]``
        entries, one per row. Entities are de-duplicated and sorted by
        position so the result is the same on every run (iterating a set of
        tuples that contain strings is not order-stable across processes).
    """
    examples = []
    for sentence in frame['sentence'].tolist():
        doc = nlp(sentence)
        spans = {(ent.label_, ent.start_char, ent.end_char) for ent in doc.ents}
        ordered = sorted(spans, key=lambda span: (span[1], span[2], span[0]))
        examples.append([sentence, {"entities": ordered}])
    return examples


# Same source order as before: Jumra, then Chandarana, then Carrefour
TRAIN_DATA = [
    example
    for frame in (jumradf, chandaranamockdf, carrefourmockdf)
    for example in _annotate(frame)
]

# ent.text,ent.label_
# print (TRAIN_DATA)
print('finished')

finished


In [34]:
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    """Serialise annotated examples to a spaCy ``DocBin`` file.

    Args:
        lang: Language code for the blank pipeline used to tokenise the texts.
        TRAIN_DATA: Iterable of ``(text, {"entities": [(label, start, end), ...]})``
            pairs; ``start``/``end`` are character offsets into ``text``.
        output_path: Destination file, given as ``Path`` or ``str``. Missing
            parent folders are created.

    Entities whose character span does not line up with token boundaries are
    skipped with a warning instead of aborting the conversion.
    """
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        for label, start, end in annot["entities"]:
            start = int(start)
            end = int(end)
            # char_span returns None when the offsets cut through a token
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    # DocBin.to_disk opens the target file directly and does not create
    # folders, so make sure the destination folder exists first.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    db.to_disk(output_path)

In [35]:
import random

# Hold out 20% of the examples for evaluation. Writing the same examples to
# both files would score the model on data it was trained on, which says
# nothing about how well it generalises. A fixed seed keeps the split
# reproducible.
_examples = list(TRAIN_DATA)
random.Random(42).shuffle(_examples)
_cut = max(1, int(len(_examples) * 0.8))

convert("en", _examples[:_cut], "./proddata/train.spacy")
convert("en", _examples[_cut:], "./proddata/valid.spacy")

In [37]:
# Expand base_config.cfg into a complete training config, written to config.cfg.
# NOTE(review): base_config.cfg is not created anywhere in the visible notebook;
# presumably it was exported beforehand and sits next to this notebook -- confirm.
!python -m spacy init fill-config base_config.cfg config.cfg


[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [38]:
# Train with config.cfg: the training and dev corpora are read from ./proddata/
# and the resulting pipeline is written to ./prodmodel/.
!python -m spacy train config.cfg --paths.train ./proddata/train.spacy --paths.dev ./proddata/valid.spacy --output ./prodmodel/

[38;5;4mℹ Saving to output directory: prodmodel[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
