#### Dependencies

In [1]:
# !pip install pandas
# !pip install numpy
# !pip install opencv-python
# !pip install pytesseract
# !dnf install tesseract-ocr
# !dnf install poppler-utils
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !pip install PyMuPDF  # provides the `fitz` module; the PyPI package literally named "fitz" is unrelated
# !pip install PyPDF2 pdf2image


# Creating a model that takes in the images and processes them


## Image Processing

### Collect all PDFs and convert them to image format


In [2]:
import os
import glob
import PyPDF2
from pdf2image import convert_from_path

# Folder that holds the source PDFs (relative to the notebook's working directory)
directory_path = './Prod files/'

# Glob pattern selecting which files in that folder are converted
pdf_extension = '*.pdf'

# Destination folder for the page images; created here if it does not exist yet
# (exist_ok=True makes re-running this cell harmless)
output_directory = './extracted_images/'
os.makedirs(output_directory, exist_ok=True)

def extract_images_from_pdf(pdf_path, output_dir):
    """Render every page of one PDF to a PNG file inside ``output_dir``.

    Pages are rasterised with pdf2image's ``convert_from_path`` and written as
    ``<pdf stem>page<N>.png``, where N is the 1-based page number.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the PNG files.

    Returns:
        None. Any failure is caught and reported on stdout so that one bad
        file does not abort a batch run.
    """
    try:
        # Only the stem (file name without extension) is needed for naming.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]

        # convert_from_path reads the file itself, so the PDF is neither
        # opened nor parsed a second time here.
        images = convert_from_path(pdf_path)

        for page_num, image in enumerate(images, start=1):
            # Save the page as PNG
            image_filename = os.path.join(output_dir, f'{stem}page{page_num}.png')
            image.save(image_filename, 'PNG')

    except Exception as e:
        print(f"Error extracting images from {pdf_path}: {str(e)}")

# Collect every file in directory_path that matches pdf_extension
# (top level of the folder only; the pattern is not recursive)
pdf_files = glob.glob(os.path.join(directory_path, pdf_extension))

# Convert the PDFs one at a time; extract_images_from_pdf reports its own
# failures, so a bad file does not stop the loop
for pdf_path in pdf_files:
    extract_images_from_pdf(pdf_path, output_directory)

# Printed unconditionally once the loop has finished
print("Image extraction completed.")


Image extraction completed.


## Extract the text from the images

In [3]:
import pytesseract
import cv2
import os
import glob

# Directory that holds the page images to run OCR on
directory_path = "./extracted_images/"

# Extensions to be captured
image_extensions = ['*.jpg', '*.png']

# Collected results: one [image_name, text] pair per successfully read image
image_data = []

# Running count of images that have been processed
i = 0

# Tesseract configuration options:
#   --oem 3 : default OCR engine mode
#   --psm 6 : treat the page as a single uniform block of text
custom_config = r'--oem 3 --psm 6'

for extension in image_extensions:
    pattern = os.path.join(directory_path, extension)
    # glob returns files in an unspecified order; sorting keeps the row order
    # of the results identical from run to run.
    image_files = sorted(glob.glob(pattern))
    for img_path in image_files:
        # Base name without extension identifies the image in the results
        name = os.path.splitext(os.path.basename(img_path))[0]

        # Load image; cv2.imread signals an unreadable or corrupt file by
        # returning None instead of raising, so check before going on.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Skipping unreadable image: {img_path}")
            continue

        # Convert image to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Binarise with Otsu's method (the threshold value 0 is ignored when
        # THRESH_OTSU is set); [1] selects the image from the (value, image) pair
        threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

        # Additional pre-processing steps can be added here (e.g., resizing, denoising)

        # Pass the pre-processed image through pytesseract with custom config
        text = pytesseract.image_to_string(threshold_img, config=custom_config, lang='eng')

        # Store the extracted text and report progress
        image_data.append([name, text])
        i += 1
        print('Image', i)


Image 1
Image 2
Image 3
Image 4
Image 5
Image 6
Image 7
Image 8
Image 9
Image 10


## Save the extracted text in a DataFrame

In [4]:
import pandas as pd
# Tabulate the OCR results: one row per image, holding its base name and the
# text read from it. Note that `text` now refers to this table rather than to
# the OCR string of the same name used in the extraction loop.
text = pd.DataFrame(data=image_data, columns=("image_name", "text"))

# Keep a CSV and a JSON copy alongside the source PDFs
text.to_csv(path_or_buf="./Prod files/extracted_text.csv")
text.to_json(path_or_buf="./Prod files/extracted_text.json")

## Begin the Training process

### This also entails analysing your data and understanding it

### The formats of the extracted photos look as follows

#### Jumra Format:
Paid doc : A/P

Doc No : Credit memo/Invoice

Date: dd/mm/yy

Invoice amount : +/- Numbers

Withheld: +/- Numbers

Amt : +/- Numbers

outstanding: +/- Numbers


Document amount due: KES Numbers

Total amount due: KES Numbers


Account Name: Bank name

Transfer date: dd/mm/yy

reference: rtgs

Amount: Numbers


Bank transfer total: KES Numbers


## Creating mock entries for the data

### Dependencies

In [7]:
import random
from faker import Faker

# Fixed seed so that a fresh "Restart & Run All" regenerates the same mock data.
# Both sources of randomness are seeded: the stdlib generator used by the
# *gen() helpers and Faker's shared generator.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

faker = Faker()

### Chandarana

#### Chandarana Format:
Doc No : Numbers

Supplier Ref No : Numbers

Gross : KES +/- Numbers

Net: KES +/- Numbers


#### usable functions

In [12]:
def supplierrefgen():
    """Build a mock supplier reference from a year, a month and a serial number.

    The three numbers are run together with no separator and no zero padding,
    e.g. year 2021, month 3 and serial 4567 give "202134567".

    Returns:
        str: the concatenated digits.
    """
    # The list is evaluated left to right, so the draw order is
    # year -> month -> serial.
    parts = [
        random.randint(2000, 2023),
        random.randint(1, 12),
        random.randint(1, 100000000),
    ]
    return "".join(map(str, parts))

In [13]:
def docnogen():
    """Return a mock document number: a random integer in 1..1000000 as a string."""
    return str(random.randint(1, 1000000))

In [14]:
def grossgen():
    """Return a mock gross amount such as "KES 5321" or "KES -5321".

    The sign is chosen at random (negative or none) and the magnitude is a
    whole number between 100 and 100000.
    """
    # Sign first, magnitude second: same draw order as before
    sign = random.choice(["-", ""])
    magnitude = random.randint(100, 100000)
    return f"KES {sign}{magnitude}"

In [15]:
def netgen():
    """Return a mock net amount, always negative, e.g. "KES -5321".

    The magnitude is a whole number between 100 and 100000.
    """
    return f"KES -{random.randint(100, 100000)}"

#### Creating mock data

In [32]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker
fake = Faker()

# Size of the mock data set
N_INVOICES = 1000        # number of mock remittance advices to generate
ROWS_PER_INVOICE = 50    # line items per remittance advice

# Generated documents: one [id, invoice_text] pair per mock advice
mock_invoices = []

# Ground truth: one [id, docno, supplier_ref, gross, net] entry per line item,
# where id is the 1-based number of the advice the line belongs to
labels = []

# Separator placed between line items. It lives in a variable because f-string
# expressions could not contain a backslash before Python 3.12.
newline = "\n\t"

for invoice_id in range(1, N_INVOICES + 1):
    # Build the line items and record the matching labels
    rows = []
    for _ in range(ROWS_PER_INVOICE):
        docno = docnogen()
        supplier_ref = supplierrefgen()
        gross = grossgen()
        net = netgen()
        rows.append(f"{docno} {supplier_ref} {gross} {net}")
        labels.append([invoice_id, docno, supplier_ref, gross, net])

    # The header and footer text is fixed; only the line items vary between advices
    invoice_text = f"""
    Remittance Advice - Chandarana Dec 21page1,"CHANDARANA SUPERMARKET LTD
    30386 Paid By: Accounts
    KENYA

    REMITTANCE ADVICE 1149720 Original
    Ht [Date [Paid Doc. | Doc. No. Supplier Ref No Gross Amount Net Amount
    {newline.join(rows)} 
    KES 11,891,067.58 KES 9,030.65] KES 11,882,036.93
    Transfer Date: Bank Trans. Total: KES 11,882,036.93
    18/01/2022
    Total Amount: KES 11,882,036.93
    Name Of Collector / ID Date Collected Signature of Vendor |
    """

    # Append the generated invoice text to the list
    mock_invoices.append([invoice_id, invoice_text])

# Print the first mock invoice for reference (will use the list index for id)
# print(mock_invoices[0])
print("Finished")

# Create a DataFrame from the mock invoices list
chandaranamockdf = pd.DataFrame(mock_invoices, columns=("id", "sentence"))

# Line-item labels, keyed by the id of the advice they belong to
chandaranalabel = pd.DataFrame(labels,columns=("id","document_no","supplier_ref","gross","net"))


Finished


In [33]:
# Write the line-item labels and the mock documents as CSV, without the
# DataFrame index column
chandaranalabel.to_csv(path_or_buf="./Prod files/chandaranalabels.csv", index=False)
chandaranamockdf.to_csv(path_or_buf='./Prod files/chandaranamock.csv', index=False)

### Carrefour

#### Carrefour Format:
Company Name : Name

Inv Nos : Numbers/Letters+Digit/Letters (e.g. 22500434/KN2/GRA) or Numbers only

Date: dd-mm-yy

Remarks : Numbers/DEFAULT AGREEMENT

Amt: Number +/-

Company_name Total : Numbers
            

In [113]:
from faker import Faker
import random
faker = Faker()

def companygen():
    """Return a made-up company name produced by the module-level ``faker`` instance."""
    return faker.company()


In [114]:
def invgen():
    """Return a mock invoice number in one of two shapes, picked at random.

    * ``NNNNNNNN/LLD/LLL`` - eight digits, then two capital letters and one
      digit, then three capital letters
    * ``NNNNNNNN`` - eight digits only

    Both candidates are always generated before one of them is chosen.

    Returns:
        str: the selected invoice number.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def capitals(count):
        # `count` random uppercase letters joined into one string
        return ''.join(random.choice(alphabet) for _ in range(count))

    # Structured candidate; draws happen in the order prefix, letters, digit, suffix
    prefix = random.randint(10000000, 99999999)
    middle = capitals(2)
    digit = random.randint(0, 9)  # a single digit, 0-9
    suffix = capitals(3)
    structured = f"{prefix}/{middle}{digit}/{suffix}"

    # Plain candidate: just an eight-digit number
    plain = str(random.randint(10000000, 99999999))

    return random.choice([structured, plain])



In [115]:
def dategen():
    """Return a random mock invoice date formatted ``dd-mm-yy``.

    Day and month are zero-padded and the year is reduced to two digits, so the
    result has the same shape as the dates on the real remittance documents
    (e.g. "14-02-22").

    Returns:
        str: a date string such as "05-11-31".
    """
    day = random.randint(1, 28)      # capped at 28 so the day exists in every month
    month = random.randint(1, 12)
    year = random.randint(2000, 2040)
    return f"{day:02d}-{month:02d}-{year % 100:02d}"


In [116]:
def remarksgen():
    """Return a mock remarks field for an invoice line.

    The result is one of:
      * "DEFAULT AGREEMENT (<0-100>%) 00<1-3> <100-199>"
      * "DEFAULT AGREEMENT (EXEMPTED)"
      * "" (no remark)

    A remark text is picked first, then a second coin flip decides whether the
    remark or the empty string is returned.
    """
    percentage = random.randint(0, 100)
    group = random.randint(1, 3)
    code = random.randint(100, 199)
    with_rate = f"DEFAULT AGREEMENT ({percentage}%) 00{group} {code}"

    agreement = random.choice([with_rate, "DEFAULT AGREEMENT (EXEMPTED)"])
    return random.choice([agreement, ""])


 

In [117]:
def amountgen():
    """Return a mock line amount: a random whole number in 100..10000 as a string."""
    return str(random.randint(100, 10000))

In [118]:
def totalgen():
    """Return a mock per-company total: a random whole number in 10000..999999 as a string."""
    return str(random.randint(10000, 999999))

In [None]:
# Payment Attachment Page - 1
# Payment No. : 130595
# AICNo: 2298452 UNGALIMITED Payment Dt.: 28-02-22
# ee AMoUNtPaid =
# Company Invoice Number Invoice Date Remarks
# (KES)
# HM KE NAI Two Rivers
# 22500434/KN2/GRA 14-02-22 22602305 1,661.41-
# 22500489/KN2/GRA, 19-02-22 22602675 1,205.48-
# 22500507/KN2/GRA, 21-02-22 22602776 447.08-
# 22CKRIV1032 24-02-22 DEFAULT AGREEMENT (0%) 003 101,797.18-
# 910073512 24-01-22 294,596.72 M
# 910073889 05-02-22 141,768.08
# 910073949 08-02-22 327,498.64
# HMKENAITwoRivers Total : 658,752.29
# HM KE NAI Nairobi Hub
# 22500678/KNI/GRA 19-02-22 22602779 2,677.26-
# 22500714/KNI/GRA, 22-02-22 22602976 1,471.37-
# 22500763/KNI/GRA, 25-02-22 22603174 1,645.90-
# 22CKHUB1032 24-02-22 DEFAULT AGREEMENT (0%) 003 113,329.14-
# 9100074057 11-02-22 149,280.00
# 910073715 01-02-22 163,098.24
# 910073718 01-02-22 56,466.69
# 910073828 04-02-22 147,383.28
# 910073833 04-02-22 173,382.20
# 910073953 08-02-22 82,926.01
# 910074056 11-02-22 169,472.83
# HMKENAINairobi Hub Total : 822,885.58
# HM KE NAI SouthField
# 22500223/KSOF/GRA 15-02-22 22600805 2,777.47-
# 22CKSOF1032 24-02-22. DEFAULT AGREEMENT (EXEMPTED) 31,452.16-
# 910073722 01-02-22 82,915.42
# HMKENAISouthField Total : 48,685.79
# HM KE NAITRM
# 22500303/KN3/GRA 14-02-22 22601520 1,972.07-
# 22500324/KN3/GRA, 16-02-22 22601677 3,194.04-
# 22CKTRM1032 24-02-22 DEFAULT AGREEMENT (16%) 001 74,073.61-
# 910073785 03-02-22 247,323.47
# 910073909 07-02-22 72,877.64
# 910074008 10-02-22 269,712.52
# HM KE NAITRM Total : 510,673.91
# HM KE NAI MEGA
# 22500527/KN8/GRA 19-02-22 22602325 7,142.66-
# 22500546/KN8/GRA, 22-02-22 22602469 1,021.53-
# 22CKMEG1032 24-02-22 DEFAULT AGREEMENT (16%) 001 85,554.36-
# 910067457 04-02-22 114,916.26
# 910073716 01-02-22 177,541.51
# 910073827 04-02-22 351,775.77
# 910074046 11-02-22 422,437.51
# HM KE NAI MEGA Total : 972,952.50
# HM KE NAI Galleria
# 22500756/KN6/GRA 18-02-22 22602300 2,899.23-
# 22500806/KN6/GRA, 22-02-22 22602474 1,157.75-
# 22500872/KN6/GRA, 26-02-22 22602692 1,386.26-
# 22CKGLR1032 24-02-22 DEFAULT AGREEMENT (0%) 003 79,047.19-
# 910073831 04-02-22 69,585.00
# 910073908 07-02-22 143,675.96
# 2250046 1/KN7/GRA, 15-02-22 22601801 785.56-
# 22500518/KN7/GRA, 18-02-22 22601978 297.66-
# 22500622/KN7/GRA, 25-02-22 22602338 1,369.62-
# 22CXVLM1032 24-02-22 DEFAULT AGREEMENT (0%) 003 43,039.77-
# 910073830 05-02-22 110,710.48
# 910073951 08-02-22 99,921.60
# SM KE NAI Village Market Total : 165,139.47
# SM KE Mom Nyali Complex
# 22CXNCM1032 24-02-22. DEFAULT AGREEMENT (0%) 003 62,186.13-
# 910073800 04-02-22 341,123.46
# SM KE Mom Nyali Complex Total : 278,937.33
# SM KE Mom Diani
# 22CXDIN1032 24-02-22. DEFAULT AGREEMENT (0%) 003 52,042.21-
# 910073720 02-02-22 9,947.00
# SM KE Mom Diani Total : 42,095.21-
# UNGA LIMITED Total : $,898,752.53

In [124]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker
fake = Faker()

# Size of the mock data set
N_INVOICES = 1000             # number of mock payment attachments to generate
COMPANIES_PER_INVOICE = 50    # company sections per attachment
INVOICES_PER_COMPANY = 10     # invoice lines inside each company section

# Generated documents: one [id, invoice_text] pair per mock attachment
mock_invoices = []

# Ground truth, keyed by the 1-based id of the attachment:
#   invlabels   - one [id, company, invoice_no, date, remark, amount] per invoice line
#   totallabels - one [id, company, total] per company section
invlabels = []
totallabels = []

# Separator placed between lines. It lives in a variable because f-string
# expressions could not contain a backslash before Python 3.12.
newline = "\n\t"

for invoice_id in range(1, N_INVOICES + 1):
    rows = []
    for _ in range(COMPANIES_PER_INVOICE):
        companyname = companygen()
        total = totalgen()

        # Each section is: company header, its invoice lines, then its total line
        rows.append(f"{companyname}")
        for _ in range(INVOICES_PER_COMPANY):
            invnos = invgen()
            date = dategen()
            remark = remarksgen()
            amt = amountgen()
            rows.append(f"{invnos} {date} {remark} {amt}")
            invlabels.append([invoice_id, companyname, invnos, date, remark, amt])
        rows.append(f"{companyname} Total : {total}")
        totallabels.append([invoice_id, companyname, total])

    # The header and footer text is fixed; only the company sections vary
    invoice_text = f"""
    Payment Attachment Page - 1
    Payment No. : 130595
    AICNo: 2298452 UNGALIMITED Payment Dt.: 28-02-22
    ee AMoUNtPaid =
    Company Invoice Number Invoice Date Remarks
    (KES)
    {newline.join(rows)} 
    UNGA LIMITED Total : $,898,752.53
    """

    # Append the generated invoice text to the list
    mock_invoices.append([invoice_id, invoice_text])

# Print the first mock invoice for reference (will use the list index for id)
# print(mock_invoices[0])
print("Finished")

# Create a DataFrame from the mock invoices list
carrefourmockdf = pd.DataFrame(mock_invoices, columns=("id", "sentence"))

# Invoice-line labels. These are built from this cell's own `invlabels`; the
# Chandarana `labels` list must not be used here, because it would store the
# wrong rows under the wrong column names.
carrefourlabel = pd.DataFrame(
    invlabels,
    columns=("id", "company_name", "invoice_no", "date", "remark", "amount"),
)

# Per-company total labels
carrefourtotallabel = pd.DataFrame(totallabels, columns=("id", "company_name", "total"))


Finished


In [125]:
# Write the Carrefour labels and the mock documents as CSV, without the
# DataFrame index column
carrefourlabel.to_csv(path_or_buf="./Prod files/carrefourlabels.csv", index=False)
carrefourmockdf.to_csv(path_or_buf='./Prod files/carrefourmock.csv', index=False)