#### Dependencies

In [1]:
# !pip install pandas
# !pip install numpy
# !pip install opencv-python
# !pip install pytesseract
# !dnf install tesseract-ocr
# !dnf install poppler-utils
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
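# !pip install PyMuPDF  # provides the `fitz` module; the PyPI package named "fitz" is a different project
# !pip install PyPDF2 pdf2image
# !pip install Faker  # used by the mock-data cells further down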


# Creating a model that takes in the images and processes them


## Image Processing

### Collect all PDFs and convert them to image format


In [2]:
import os
import glob
import PyPDF2
from pdf2image import convert_from_path

# Folder that holds the source PDFs
directory_path = './Prod files/'

# Glob pattern that selects which files in that folder get converted
pdf_extension = '*.pdf'

# Destination folder for the rendered page images; created up front,
# and left alone if it already exists
output_directory = './extracted_images/'
os.makedirs(output_directory, exist_ok=True)

def extract_images_from_pdf(pdf_path, output_dir):
    """Render every page of one PDF to a PNG file inside ``output_dir``.

    Pages are rasterised with ``convert_from_path`` and saved as
    ``<pdf name without extension>page<N>.png``, where ``N`` starts at 1.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the PNG files.

    Returns:
        None. Any exception raised while converting or saving is caught and
        reported with ``print`` so that one bad PDF does not stop the batch.
    """
    try:
        # File name without directory or extension, used as the image prefix
        stem = os.path.splitext(os.path.basename(pdf_path))[0]

        # convert_from_path reads the PDF from its path, so the file does not
        # need to be opened or parsed separately first.
        pages = convert_from_path(pdf_path)

        for page_num, page in enumerate(pages, start=1):
            # Save the page as PNG
            image_filename = os.path.join(output_dir, f'{stem}page{page_num}.png')
            page.save(image_filename, 'PNG')

    except Exception as e:
        print(f"Error extracting images from {pdf_path}: {str(e)}")

# Collect every file in the input folder that matches the PDF pattern
pdf_files = glob.glob(os.path.join(directory_path, pdf_extension))

# Convert the PDFs one by one; the helper reports its own failures,
# so a single bad file does not interrupt the loop
for pdf_path in pdf_files:
    extract_images_from_pdf(pdf_path, output_directory)

print("Image extraction completed.")


Image extraction completed.


## Extract the text from the images

In [3]:
import pytesseract
import cv2
import os
import glob

# Define the directory path where your invoice images are located
directory_path = "./extracted_images/"

# Extensions to be captured
image_extensions = ['*.jpg', '*.png']

# List to store the captured data as [image name, OCR text] pairs
image_data = []

# Number of images processed so far (drives the progress message)
i = 0

# Tesseract configuration options (you can customize these)
custom_config = r'--oem 3 --psm 6'

for extension in image_extensions:
    pattern = os.path.join(directory_path, extension)
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # `image_data` the same from one run to the next.
    image_files = sorted(glob.glob(pattern))
    for img_path in image_files:
        # File name without directory or extension identifies the page
        name = os.path.splitext(os.path.basename(img_path))[0]

        # Load image. cv2.imread signals failure by returning None instead of
        # raising, so check before handing the result to cvtColor.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Skipping unreadable image: {img_path}")
            continue

        # Convert image to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Apply threshold to convert to binary image
        # (cv2.threshold returns (threshold value, image); keep the image)
        threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

        # Additional pre-processing steps can be added here (e.g., resizing, denoising)

        # Pass the pre-processed image through pytesseract with custom config
        text = pytesseract.image_to_string(threshold_img, config=custom_config, lang='eng')

        # Store the extracted text alongside the image name
        image_data.append([name, text])
        i += 1
        print('Image', i)


Image 1
Image 2
Image 3
Image 4
Image 5
Image 6
Image 7
Image 8
Image 9
Image 10


## Save the extracted text in a DataFrame

In [4]:
import pandas as pd
# Collect the [image name, OCR text] pairs into a two-column table
text = pd.DataFrame(image_data, columns=("image_name", "text"))
# index=False keeps the positional row index out of the CSV, so the file holds
# only the two named columns (the same option the other CSV exports in this
# notebook pass).
text.to_csv("./Prod files/extracted_text.csv", index=False)
text.to_json("./Prod files/extracted_text.json")

## Begin the Training process

### This also entails analysing your data and understanding it

### The formats of the extracted photos look as follows

#### Chandarana Format:
Doc No : Numbers

Supplier Ref No : Numbers

Gross Amount : KES +/- Numbers

Net Amount : KES +/- Numbers


#### Carrefour Format:
Company Name : Name

Inv Nos : Nos/Let/let

Date: dd-mm-yy

Remarks : Numbers/DEFAULT AGREEMENT

Amt: Number +/-

Company_name Total : Numbers
            

#### Jumra Format:
Paid doc : A/P

Doc No : Credit memo/Invoice

Date: dd/mm/yy

Invoice amount : +/- Numbers

Withheld: +/- Numbers

Amt : +/- Numbers

outstanding: +/- Numbers


Document amount due: KES Numbers

Total amount due: KES Numbers


Account Name: Bank name

Transfer date: dd/mm/yy

reference: rtgs

Amount: Numbers


Bank transfer total: KES Numbers


## Creating mock entries for the data

### Dependencies

In [7]:
import random
from faker import Faker
# Module-level Faker instance; the generator functions visible in this section do not reference it
faker = Faker()

### Chandarana

#### usable functions

In [12]:
def supplierrefgen():
    """Build a random supplier reference as a single string of digits.

    The reference is a year (2000-2023), a month (1-12, not zero-padded) and
    an invoice number (1-100000000) concatenated without separators.
    """
    # The three draws happen in this order: year, month, invoice number
    year = random.randint(2000, 2023)
    month = random.randint(1, 12)
    invoice_number = random.randint(1, 100000000)
    return f"{year}{month}{invoice_number}"

In [13]:
def docnogen():
    """Return a random document number between 1 and 1000000 as a string."""
    number = random.randint(1, 1000000)
    return f"{number}"

In [14]:
def grossgen():
    """Return a random gross amount formatted as ``KES <sign><amount>``.

    The sign is drawn first (either ``-`` or nothing), followed by a whole
    amount between 100 and 100000.
    """
    sign = random.choice(["-", ""])
    amount = random.randint(100, 100000)
    return f"KES {sign}{amount}"

In [15]:
def netgen():
    """Return a random net amount formatted as ``KES -<amount>``.

    The amount is a whole number between 100 and 100000 and is always
    prefixed with a minus sign.
    """
    # NOTE(review): the format notes in this notebook describe the net amount as "+/-",
    # but this generator never produces a positive value — confirm that is intended.
    amount = random.randint(100, 100000)
    return "KES -" + str(amount)

### Creating mock data

In [32]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker
fake = Faker()

# Size of the synthetic dataset
NUM_MOCK_INVOICES = 1000
ROWS_PER_INVOICE = 50

# Seed the global random generator so that re-running this cell regenerates
# exactly the same mock invoices and labels.
MOCK_DATA_SEED = 42
random.seed(MOCK_DATA_SEED)

# Separator placed between the generated rows inside the invoice text
newline = "\n\t"

# Initialize empty lists to store mock data
mock_invoices = []

# Labels list: one [invoice id, docno, supplier_ref, gross, net] entry per generated row
labels = []

# Generate and store the mock invoices
for i in range(NUM_MOCK_INVOICES):
    invoice_id = i + 1

    # Generate the invoice rows and record the matching labels
    rows = []
    for _ in range(ROWS_PER_INVOICE):
        docno = docnogen()
        supplier_ref = supplierrefgen()
        gross = grossgen()
        net = netgen()
        rows.append(f"{docno} {supplier_ref} {gross} {net}")
        labels.append([invoice_id, docno, supplier_ref, gross, net])

    # Everything between the triple quotes is literal invoice text, including
    # the lines that begin with '#'; only the rows placeholder is substituted.
    invoice_text = f"""
    Remittance Advice - Chandarana Dec 21page1,"CHANDARANA SUPERMARKET LTD
    30386 Paid By: Accounts
    KENYA

    REMITTANCE ADVICE 1149720 Original
    Ht [Date [Paid Doc. | Doc. No. Supplier Ref No Gross Amount Net Amount
    {newline.join(rows)} 
    #     KES 11,891,067.58 KES 9,030.65] KES 11,882,036.93
    #     Transfer Date: Bank Trans. Total: KES 11,882,036.93
    #     18/01/2022
    #     Total Amount: KES 11,882,036.93
    #     Name Of Collector / ID Date Collected Signature of Vendor |
    """

    # Append the generated invoice text to the list
    mock_invoices.append([invoice_id, invoice_text])

# Print the first mock invoice for reference (will use the list index for id)
# print(mock_invoices[0])
print("Finished")

# Create a DataFrame from the mock invoices list
chandaranamockdf = pd.DataFrame(mock_invoices, columns=("id", "sentence"))

# Labels (docno, supplier_ref, gross, net) go in a separate DataFrame, keyed by the invoice id
chandaranalabel = pd.DataFrame(labels, columns=("id", "document_no", "supplier_ref", "gross", "net"))


Finished


In [33]:
# Write the labels first, then the mock invoices; neither file gets an index column
for frame, destination in (
    (chandaranalabel, "./Prod files/chandaranalabels.csv"),
    (chandaranamockdf, './Prod files/chandaranamock.csv'),
):
    frame.to_csv(destination, index=False)