# PDF PROCESSING MODEL.

This model is generated to extract data from invoice pdfs. This data should entail the recipient name, address etc.

# Collecting the Data

## Installing dependencies and importing them

In [2]:
# !pip install transformers
# !pip install pandas
# !pip install numpy
# !pip install opencv-python
# !pip install pytesseract
# !dnf install tesseract-ocr
# !dnf install poppler-utils
# !pip install scikit-learn
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

## Pick the CSV that contains the labels and save it to a df


In [3]:
import pandas as pd
labeldf = pd.read_csv("./files/data.csv")
labeldf.dtypes

id                   int64
recipient           object
recipientaddress    object
invoicenos          object
invoicedate         object
duedate             object
Balance             object
dtype: object

## Pick the images first and extract data from the images

In [4]:
import pytesseract
import cv2
import os
import glob


# Define the directory path where your invoice images are located
directory_path = "./files/"  

# Define a list of image file extensions (add more if needed)
image_extensions = ['*.jpg', '*.jpeg', '*.png']

# Initialize lists to store image data and labels
image_data = []

i = 0

for extension in image_extensions:
    pattern = os.path.join(directory_path, extension)
    image_files = glob.glob(pattern)
    for img_path in image_files:
      full_name = os.path.basename(img_path)
      file_name = os.path.splitext(full_name)
      # Load image
      img = cv2.imread(img_path)

      # Convert image to grayscale
      gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

      # Apply threshold to convert to binary image
      threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

      # Pass the image through pytesseract
      text = pytesseract.image_to_string(threshold_img)

      name = int(file_name[0])
      # Print the extracted text
      image_data.append([name,text])
      i = i + 1
      print('image', i)
      

image 0
image 1


## Save to a dataframe

In [5]:
import pandas as pd
textdf = pd.DataFrame(image_data,columns=['imageid','text'])  # Store your image data here
textdf.head(20)

Unnamed: 0,imageid,text
0,2,Invoice\n\nStanford Plumbing & Heating\nbeotts...
1,1,Stanford Plumbing & Heating\n128 Madison drive...


## Pick up the CSV and merge it with the text dataframe

In [6]:
labelled_text = textdf.merge(labeldf, left_on='imageid', right_on='id', how='outer')
labelled_text = labelled_text.drop(columns=['id'])

labelled_text.to_csv("./files/extracted_text.csv", index=False)
# Drop unfilled data
labelled_text.dtypes

imageid              int64
text                object
recipient           object
recipientaddress    object
invoicenos          object
invoicedate         object
duedate             object
Balance             object
dtype: object

# Model implementation

## importing the csv data for the tokenized photos

In [7]:
# Data of different pages that constitute my training data
imported = pd.read_csv("./files/extracted_text.csv")


In [8]:
# imported.drop('tokens', axis=1, inplace=True)
# imported.drop('embeddings', axis=1, inplace=True)
imported.head()

Unnamed: 0,imageid,text,recipient,recipientaddress,invoicenos,invoicedate,duedate,Balance
0,2,Invoice\n\nStanford Plumbing & Heating\nbeotts...,Allen Smith,"123 Madison drive Seattle, WA",INVO2081,"Jun 14,2018","Jun 19,2018",2688.0
1,1,Stanford Plumbing & Heating\n128 Madison drive...,Allen Smith,"87 Private st, Seattle, WA",INVO2081,11/11/2018,12/10/2018,2844.8


## Dropping NaN rows

In [9]:
imported.dropna(inplace=True)

print(imported.isna().sum())

imageid             0
text                0
recipient           0
recipientaddress    0
invoicenos          0
invoicedate         0
duedate             0
Balance             0
dtype: int64


## initializing annotation

In [10]:
import spacy
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# The function that generates the annotations

def annotate_ner(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Initialize a list to store annotations
    annotations = []

    # Iterate through entities in the processed text
    for ent in doc.ents:
        annotations.append({
            "start": ent.start_char,
            "end": ent.end_char,
            "label": ent.label_,
            "text": ent.text
        })

    return annotations


## Forming annotations

In [11]:
import pandas as pd
i = 0
annot_list = []
while i < len(imported):
  text = imported['text'][i]

  #call the function
  annotations = annotate_ner(text)

  # save the annotations
  annot_list.append([i+1,annotations])
  print(f"image {i+1}") #display the number of annotated images
  i = i+1

annotations = pd.DataFrame(annot_list,columns=['id','annotations'])
annotations.to_csv("./files/annotations.csv" ,index = False)
annotations.to_json("./files/annotations.json" ,index = False)



image 1
image 2


In [12]:
annotations.head()

Unnamed: 0,id,annotations
0,1,"[{'start': 9, 'end': 36, 'label': 'ORG', 'text..."
1,2,"[{'start': 0, 'end': 27, 'label': 'ORG', 'text..."


In [13]:
import json

json_file_path = './files/annotations.json'

# Open and read the JSON file
with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)

# Initialize a list to store ORG labels and text
org_labels_text = []

# Iterate through the annotations
for annotations_list in data['annotations'].values():
    for annotation in annotations_list:
        if annotation['label'] == 'ORG':
            org_labels_text.append(annotation['text'])

# Print the collected ORG labels and text
for org_text in org_labels_text:
    print(f"ORG Label Text: {org_text}")

ORG Label Text: Stanford Plumbing & Heating
ORG Label Text: ety plow ana
ORG Label Text: Stanford Plumbing & Heating
ORG Label Text: Nest


In [14]:
# add the annotations to the df containing the text and the labels
imported = imported.merge(annotations, left_on='imageid', right_on='id', how='outer')
imported.drop('id', axis=1, inplace=True)
imported.dtypes

imageid              int64
text                object
recipient           object
recipientaddress    object
invoicenos          object
invoicedate         object
duedate             object
Balance             object
annotations         object
dtype: object

## spaCy tokenization

In [15]:
# Using the spaCy model to tokenize the text
spacytokens_list = []
i = 0

while i < len(imported):
  # Text to be tokenized
  text = imported['text'][i]

  # tokens
  doc = nlp(text)

  # Access tokens in the processed text
  tokens = [token.text for token in doc]

  # Print the annotations
  spacytokens_list.append([i+1,tokens])
  print(f"image {i+1}")
  i = i+1

spacytokens = pd.DataFrame(spacytokens_list,columns=['id','spacytokens'])
spacytokens.to_json("./files/tokens.json" ,index = False)

image 1
image 2


In [16]:
# Adding the tokens to the df and saving the data to a csv
imported = imported.merge(spacytokens, left_on='imageid', right_on='id', how='outer')
imported.drop('id', axis=1, inplace=True)
imported.to_csv('./files/tokenized_text.csv', index=False)

## Entity creations

In [17]:
# Separating the entities and the labels from the data that passed throught the model

entity_list = []
i=0
while i < len(imported):
  tokenized = imported['spacytokens'][i]
  string = " ".join(tokenized)
  # Process the tokenized text with spaCy
  doc = nlp(string)

  # Access named entities in the processed text
  for ent in doc.ents:
    entity_list.append([i+1,ent.text,ent.label_])

  print(f'image {i+1}')
  i=i+1

spacyentities = pd.DataFrame(entity_list,columns=['id','entity_text','entity_label'])
spacyentities.to_csv("./files/entities.csv")


image 1
image 2


## Grouping the labels and entities from the results according to the id

In [18]:

# Group by 'ID' and aggregate 'Label' and 'Title' as a list of dictionaries
result = spacyentities.groupby('id').apply(lambda x: x[['entity_label', 'entity_text']].to_dict(orient='records')).reset_index(name='Data')

# Initialize an empty list to store the output dictionaries
output_list = []

# Create dictionaries for each 'ID' containing arrays of labels and titles
for index, row in result.iterrows():
    id_dict = {'id': row['id'], 'Data': row['Data']}
    output_list.append(id_dict)


In [19]:
print(len(output_list))
# Now, output_list contains dictionaries with dimensions 1x1
for item in output_list:
    print(item)

2
{'id': 1, 'Data': [{'entity_label': 'ORG', 'entity_text': 'Stanford Plumbing & Heating'}, {'entity_label': 'CARDINAL', 'entity_text': '128'}, {'entity_label': 'PERSON', 'entity_text': 'Madison'}, {'entity_label': 'GPE', 'entity_text': 'WA'}, {'entity_label': 'DATE', 'entity_text': '72290'}, {'entity_label': 'PERSON', 'entity_text': 'Allen Sith'}, {'entity_label': 'CARDINAL', 'entity_text': '87'}, {'entity_label': 'GPE', 'entity_text': 'Seattle'}, {'entity_label': 'DATE', 'entity_text': '9002 - 1898'}, {'entity_label': 'ORG', 'entity_text': 'nea newtehen'}, {'entity_label': 'ORG', 'entity_text': 'Toto'}, {'entity_label': 'CARDINAL', 'entity_text': '20'}, {'entity_label': 'LAW', 'entity_text': 'PayPal feb Getanforplumbing'}, {'entity_label': 'PERSON', 'entity_text': 'Balance Due'}, {'entity_label': 'MONEY', 'entity_text': '2,808.90'}]}
{'id': 2, 'Data': [{'entity_label': 'ORG', 'entity_text': 'Stanford Plumbing & Heating \n beottsanforepLamarg cm'}, {'entity_label': 'CARDINAL', 'entity

# ChatGPT including

## Include chatgpt in deduction

In [20]:
# key = "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"
# import openai

# # Set your API key
# openai.api_key = key

# # Test the API connection by making a simple request with an engine specified
# try:
#     response = openai.Completion.create(
#         prompt="Test connection",
#         engine="text-davinci-002"  # Specify the engine or model you want to use
#     )
#     if response.choices:
#         print("API connection test successful.")
#         print("Response:", response.choices[0].text)
#     else:
#         print("API request was successful, but the response is empty.")
# except Exception as e:
#     print("API connection test failed. Error:", str(e))



In [21]:
# import os, openai, shutil, json
# from llama_index import Document, VectorStoreIndex, LLMPredictor, Prompt
# from langchain.chat_models import ChatOpenAI
# from dotenv import load_dotenv
# from typing import Any, List

# load_dotenv()

# os.environ['USE_TORCH'] = '1'
# os.environ['OPENAI_API_KEY'] = str(os.getenv('OPENAI_API_KEY'))
# openai.api_key = str(os.getenv('OPENAI_API_KEY'))
# openai.api_endpoint = "https://api.openai.com/v1"

# TEMPLATE = (
#     "You are a helpful assistant that extracts specific information from unstructured data. That data is provided below\n"
#     "---------------------\n"
#     "{context_str}"
#     "\n---------------------\n"
#     "Given this information, please answer the question with precision: {query_str}\n"
# )
# QA_TEMPLATE = Prompt(TEMPLATE)

In [22]:
# def extract_data(text:str, questions:List[str]) -> str:
#     document = Document(text=text)

#     llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7, model_name='gpt-4'))
#     index = VectorStoreIndex.from_documents([document], llm_predictor=llm_predictor)

#     query_engine = index.as_query_engine(text_qa_template=QA_TEMPLATE)

#     result_string = ""
#     for q in questions:
#         result_string += '---------------\n'
#         # result_string += 'QUESTION:\n'
#         # result_string += q + '\n'
#         # result_string += 'ANSWER:\n'
#         result_string += query_engine.query(q).response + '\n'

#     return result_string

In [23]:
# def lambda_handler(text):
#     questions_string = [
#         'what is the name of the recipient company',
#         'what is the address of the recipient company',
#         'what is the invoice number',
#         'what is the invoice date',
#         'what is the due date of the invoice',
#         'what is the due balance to be paid',
#     ]
#     questions = questions_string if isinstance(questions_string, list) else questions_string.split(',')

#     result = extract_data(text, questions)

#     return(result)

In [24]:
# lambda_handler(newimported['text'][0])