<a href="https://colab.research.google.com/github/jlagares/RAG-AI-Training/blob/main/src/LibTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PDF Testing for conversion to pages**


Install Requirements

In [2]:
# Install the notebook's dependencies into the Colab runtime.
# NOTE(review): none of the pip packages below is version-pinned, so a fresh
# runtime may resolve different versions than the ones this notebook was run with.
# pdf2image: provides convert_from_path, used further down to turn PDF pages into images.
!pip install pdf2image

# Poppler command-line utilities, installed system-wide with apt-get
# (presumably the renderer pdf2image relies on — confirm against the pdf2image docs).
!apt-get install -y poppler-utils

# Install the OpenAI client package, upgrading it if an older version is present.
!pip install --upgrade openai


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


Make sure the libraries are installed, then clone the repository locally.

In [3]:
from google.colab import userdata
git_key = userdata.get('github')
git_user = userdata.get('git_user')

# URL-encode the '@' in your email address
git_user = git_user.replace('@', '%40')
print(git_user)
#Define your user and github token with your actual GitHub username
!git clone https://{git_user}:{git_key}@github.com/dannyduude/FinSync.git
!git pull https://{git_user}:{git_key}@github.com/dannyduude/FinSync.git
%cd /content/FinSync/src
!ls  # Optional: Verify that file_convert_pdf.py is listed
import sys
sys.path.insert(0, '/content/FinSync/src')

javier.lagares%40gmail.com
fatal: destination path 'FinSync' already exists and is not an empty directory.
fatal: not a git repository (or any of the parent directories): .git
/content/FinSync/src
file_convert_pdf.py  FinSync  image_ocr.py  LibTest.ipynb  main.ipynb  __pycache__


# Real code starts here

In [4]:
from google.colab import userdata
# Read the OpenAI API key from the Colab secrets entry named 'OAIkey'.
# Later cells pass this value to OpenAI(api_key=api_key).
api_key = userdata.get('OAIkey')

In [5]:
# Import the project classes from the modules in the cloned repo's src directory
# (the setup cell above puts /content/FinSync/src on sys.path).
# NOTE(review): later cells in this notebook define classes with the same names
# (FileConvertPDF and ImageOCR); once those cells run, they replace the bindings
# imported here.
from file_convert_pdf import FileConvertPDF
from image_ocr import ImageOCR

In [25]:
import os
from pdf2image import convert_from_path
from PIL import Image

class FileConvertPDF:
    """Render every page of a PDF to a JPEG file, resized to a target shorter side."""

    def __init__(self, input_pdf_path: str, output_folder: str, target_shorter_axis: int = 512):
        """
        Initialize the converter with an input PDF path, output folder, and target size for the shorter axis.

        Args:
            input_pdf_path (str): The path to the PDF file.
            output_folder (str): The folder where the output images will be saved.
                It is created (including parents) if it does not exist.
            target_shorter_axis (int): The desired size (in pixels) for the shorter axis of the image.

        Raises:
            ValueError: If target_shorter_axis is not a positive number.
        """
        # Fail early with a clear message instead of failing later inside resize().
        if target_shorter_axis <= 0:
            raise ValueError("target_shorter_axis must be a positive number of pixels")

        self.input_pdf_path = input_pdf_path
        self.output_folder = output_folder
        self.target_shorter_axis = target_shorter_axis

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.output_folder, exist_ok=True)

    def _resize_image(self, image: "Image.Image") -> "Image.Image":
        """
        Resize an image so that its shorter side is equal to target_shorter_axis while preserving aspect ratio.

        The shorter side is set to the target exactly and the longer side is
        derived with integer floor division. Going through a floating-point
        scale factor can lose a pixel to round-off (e.g. 512 / 49 * 49
        truncates to 511).

        Args:
            image (PIL.Image.Image): The image to resize.

        Returns:
            PIL.Image.Image: The resized image.
        """
        width, height = image.size
        target = self.target_shorter_axis

        if width <= height:
            new_width = int(target)
            # Floor of height * target / width, never less than one pixel.
            new_height = max(1, int(height * target // width))
        else:
            new_height = int(target)
            new_width = max(1, int(width * target // height))

        return image.resize((new_width, new_height), Image.LANCZOS)

    def convert(self) -> list:
        """
        Convert each page of the PDF into a JPEG image. Each image is resized so that its shorter side is
        target_shorter_axis pixels.

        Files are written to output_folder as "<pdf base name>_page_<n>.jpg",
        with n starting at 1. The resolution and path of every page are printed.

        Returns:
            list: The paths of the written JPEG files, in page order.
        """
        # Convert all PDF pages to images using pdf2image.
        pages = convert_from_path(self.input_pdf_path)
        # Extract the base name of the PDF file without extension.
        base_name = os.path.splitext(os.path.basename(self.input_pdf_path))[0]

        saved_paths = []
        for i, page in enumerate(pages, start=1):
            # Resize the page image.
            page_resized = self._resize_image(page)
            # Print the output resolution (width, height) of the resized image.
            print(f"Page {i} resolution: {page_resized.size}")

            output_file = f"{base_name}_page_{i}.jpg"
            output_path = os.path.join(self.output_folder, output_file)
            page_resized.save(output_path, "JPEG")
            print(f"Saved: {output_path}")
            saved_paths.append(output_path)

        return saved_paths


In [26]:
# Mount Google Drive so the PDF and the output folder are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Define the paths for your PDF file and output folder in Google Drive.
input_pdf_path = '/content/drive/MyDrive/FinSync/TestAbono.pdf'  # Update with your PDF path
output_folder = '/content/drive/MyDrive/FinSync/output'   # Update with your desired output folder

# Create an instance of FileConvertPDF and run the conversion.
# The constructor creates output_folder if it is missing; no target size is
# passed, so the default target_shorter_axis of 512 pixels applies.
converter = FileConvertPDF(input_pdf_path, output_folder)
# Writes one "<pdf name>_page_<n>.jpg" per page and prints each resolution and path.
converter.convert()

print("PDF conversion to images completed.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Page 1 resolution: (1085, 512)
Saved: /content/drive/MyDrive/FinSync/output/TestAbono_page_1.jpg
PDF conversion to images completed.


In [32]:
import base64
from PIL import Image
from io import BytesIO
class ImageOCR:
    """Send an image plus instructions to an OpenAI chat model and return the text reply."""

    def __init__(self, openai_client, system_prompt: str, model: str = "gpt-3.5-turbo"):
        """
        Initialize the ImageOCR instance.

        Args:
            openai_client: An instance of OpenAI (e.g. created via `from openai import OpenAI`).
                Every request made by this object goes through this client.
            system_prompt (str): The system prompt to be sent to OpenAI.
            model (str): The model to use for completions. Defaults to "gpt-3.5-turbo".
                NOTE(review): convert() sends an image, so the model must accept
                image input; the default is kept only for backward compatibility —
                confirm it is suitable or pass a vision-capable model.
        """
        self.openai_client = openai_client
        self.system_prompt = system_prompt
        self.model = model

    @staticmethod
    def _encode_image_as_data_url(image_path: str) -> str:
        """Re-encode the image file as PNG and return it as a base64 `data:` URL."""
        with Image.open(image_path) as img:
            buffer = BytesIO()
            # The image is re-encoded at its current resolution; nothing is resized here.
            img.save(buffer, format="PNG")
        encoded_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
        return f"data:image/png;base64,{encoded_str}"

    def convert(self, image_path: str, prompt_text: str = "Please use this image for reference:") -> str:
        """
        Encodes the image to Base64, builds a user message made of a text part and an image part,
        and calls the OpenAI API using the client given to the constructor.

        The image part is sent with "detail": "low". The image itself is not
        resized by this method.

        Args:
            image_path (str): Path to the image file.
            prompt_text (str): Optional text to prepend to the fixed note sent with the image.

        Returns:
            str: The content of the API's response with surrounding whitespace removed,
                or an empty string when the response carries no text content.
        """
        base64_image = self._encode_image_as_data_url(image_path)

        # The user content is a list of typed parts (text + image_url), which is
        # the documented shape for messages that carry an image.
        user_content = [
            {"type": "text",
             "text": f"{prompt_text}\n"
                     "Note: This image has been downscaled to low resolution for processing. "},
            {"type": "image_url",
             "image_url": {"url": base64_image,
                           "detail": "low"}},
        ]

        # Build the messages payload
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_content}
        ]

        # Use the injected client, not a notebook-global `client`, so the
        # instance works no matter which cells have been run.
        response = self.openai_client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0
        )

        # content can be None (e.g. when the reply has no text); normalize to "".
        content = response.choices[0].message.content
        return (content or "").strip()

In [38]:
from openai import OpenAI # Import the OpenAI class from the openai library.
from IPython.display import Markdown, display

# Build the API client from the key loaded out of Colab secrets in an earlier cell.
client = OpenAI(api_key=api_key)
MODEL = "gpt-4o"
# Create an instance of ImageOCR by passing the openai client and a system prompt.
# The system prompt is one string literal continued with a trailing backslash, so
# the leading spaces of its second source line are part of the prompt text.
img_parser = ImageOCR(openai_client = client,
    system_prompt = "You are supposed to help doing OCR from a document, Create a structured document with the requester of the payment, recipient, observations, recipient bank, total amount, date of transaction.\
    Use the following fields for output in JSON format: requestor, recipient, recipient_bank, recipient_iban, recipient_bic, total_amount, currency, transaction_date, transaction_comments",
    model = MODEL
)

# Provide the path to your image file.
# This is the first-page JPEG written by the PDF conversion cell above.
image_file_path = '/content/drive/MyDrive/FinSync/output/TestAbono_page_1.jpg'  # Update with the correct image path

# Call convert to encode the image, build the prompt, and get the OpenAI response.
# prompt_text is overridden with an empty string, so only the fixed note is sent as text.
result = img_parser.convert(image_file_path, prompt_text="")
print("OpenAI Response:")
print(result)



OpenAI Response:
```json
{
  "requestor": "JAVIER SALVADOR LAGARES GARCIA",
  "recipient": "IES Eugeni D'Ors",
  "recipient_bank": "BANCO DE SABADELL S.A.",
  "recipient_iban": "ES46 0081 5206 1100 0101 2576",
  "recipient_bic": "BSABESBBXXX",
  "total_amount": 91.00,
  "currency": "EUR",
  "transaction_date": "11-06-24",
  "transaction_comments": "MATRICULA: DESPESA ESCOLAR I ASSEGURANÇA ESCOLAR"
}
```


In [None]:
# Sample public image URL (Wikimedia Commons).
# NOTE(review): `url` is not referenced by any other cell visible in this notebook.
url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


In [None]:
# Install the OpenAI client package.
# NOTE(review): the first install cell of this notebook already installs/upgrades
# openai, and no version is pinned here either.
!pip install openai



In [None]:
# Name of the chat model used by the completion request below.
MODEL = "gpt-4o"

In [None]:
from openai import OpenAI # Import the OpenAI class from the openai library.
from IPython.display import Markdown, display

# Create the API client; `api_key` must already have been loaded from Colab secrets.
client = OpenAI(api_key=api_key)

In [None]:
# Prompts for the plain text-generation demo below:
# the system prompt sets the persona, the user prompt is the question.
system_prompt = "you are Asterix the gaul"
user_prompt = "explain what should I do to win against romans"

In [None]:
# NOTE(review): `system` is not used in this cell; this import looks accidental
# (editor auto-import?) — confirm and remove.
from os import system
# Send the system + user prompts to the chat completions endpoint.
# Requires `client`, `MODEL`, `system_prompt` and `user_prompt` from the cells above.
response = client.chat.completions.create(
    model = MODEL,
    messages = [
        {"role":"system", "content": system_prompt},
        {"role":"user", "content":user_prompt}
    ],
    # Sampling parameters passed explicitly to the API.
    temperature=1.0,
    top_p=1.0
)

In [None]:
# Show the raw ChatCompletion object returned by the request above.
response


ChatCompletion(id='chatcmpl-AvpASC8nanQETVe0EtWnAMDW5YBwd', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='By Toutatis! Taking on the Romans is indeed a tough challenge, but with a little cunning and some help from your friends, you can certainly give them a run for their money! Here’s what you need to do:\n\n1. **Strategize Wisely**: Understand the layout and strengths of your Roman adversaries. Gather intelligence on their plans and movements.\n\n2. **Leverage the Magic Potion**: Don’t forget the powerful magic potion brewed by our druid Getafix! It gives you and your fellow villagers superhuman strength for a limited time, which has turned the tide of many battles.\n\n3. **Teamwork is Essential**: Rally the villagers, including Obelix, who’s always eager for a tussle with the Romans (even though he fell into the cauldron of potion as a child and doesn’t need more). The strength of our community is one of our greatest assets.\n\n4

In [None]:
# Render the text of the first choice as Markdown instead of a plain string.
display(Markdown(response.choices[0].message.content))

By Toutatis! Taking on the Romans is indeed a tough challenge, but with a little cunning and some help from your friends, you can certainly give them a run for their money! Here’s what you need to do:

1. **Strategize Wisely**: Understand the layout and strengths of your Roman adversaries. Gather intelligence on their plans and movements.

2. **Leverage the Magic Potion**: Don’t forget the powerful magic potion brewed by our druid Getafix! It gives you and your fellow villagers superhuman strength for a limited time, which has turned the tide of many battles.

3. **Teamwork is Essential**: Rally the villagers, including Obelix, who’s always eager for a tussle with the Romans (even though he fell into the cauldron of potion as a child and doesn’t need more). The strength of our community is one of our greatest assets.

4. **Use Clever Tactics**: Outwit rather than outfight. Use the landscape to your advantage, set up clever traps, and employ guerrilla tactics that take advantage of your knowledge of the local terrain.

5. **Maintain High Morale**: Keep the spirits high with a good boar feast and camaraderie. A motivated and cheerful group is much more effective in outmaneuvering the Romans.

6. **Communicate Clearly**: Always stay in touch with Getafix for more potion if needed, and keep an eye out for any Roman trickery.

Remember, the Romans may outnumber us, but our indomitable spirit and clever tactics have kept our village free! Engage with your foes intelligently and bravely, and victory shall be ours once more. Courage, fellow Gaul!

Text generation with parameters

## OpenAI
# PDF to CSV


In [None]:
# Install the OpenAI client package with reduced pip output.
# NOTE(review): openai is already installed by earlier cells of this notebook;
# no version is pinned.
!pip install openai --quiet

In [None]:
from google.colab import userdata
# Read the OpenAI API key from the Colab secrets entry named 'OAIkey'
# (same lookup as the earlier api_key cell).
api_key = userdata.get('OAIkey')

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; per the saved output, this only
# reports "already mounted" when the drive is mounted already.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Typical code to prepare PDFs
# pip install pymupdf pillow
import fitz  # PyMuPDF
from PIL import Image
import os

# NOTE(review): `directory` is not assigned in any cell visible in this notebook;
# it must be defined (path of the folder holding the PDFs) before this cell runs,
# otherwise the loop below raises NameError.

# Go through the directory and convert every PDF file, page by page, to PNG.
for filename in os.listdir(directory):
    # Match the extension case-insensitively so "FILE.PDF" is converted too.
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(directory, filename)
        # The context manager closes the document even if a page fails to render.
        with fitz.open(pdf_path) as pdf_document:
            for page_number in range(len(pdf_document)):
                page = pdf_document.load_page(page_number)
                pix = page.get_pixmap()

                # Save image directly in the given directory.
                # Page numbers in the file name are zero-based.
                image_path = os.path.join(directory, f"{os.path.splitext(filename)[0]}_page_{page_number}.png")
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img.save(image_path)

print("All PDF files have been converted")

In [None]:
# List the top level of the mounted Google Drive.
# NOTE(review): the saved output of this cell exposes personal file names;
# clear it before sharing the notebook.
%ls /content/drive/MyDrive/


 45bf4311-aeff-46c9-8f9c-9464f5f1dffb.jpg
'Autorització bancària Escola Contrapunt.docx'
'Autorització bancària Escola Contrapunt.docx.gdoc'
 BCN-CLT_eticket.pdf
 [0m[01;34mBerta[0m/
[01;34m'Bollullos 2017'[0m/
 calendari16-17.xlsx
 calendari16-17.xlsx.gsheet
'Calendari  d'\''activitats Contrapunt 2016 -2017 Pares modificat (1).pdf'
[01;34m'calendario nenas'[0m/
[01;34m'Colab Notebooks'[0m/
'Condiciones Generales Todo Riesgo.pdf'
'Condiciones Particulares.pdf'
'Copia de DSC_0180.JPG'
'Disapacidad Salvador1 (1).jpg'
'Disapacidad Salvador1.jpg'
'Disapacidad Salvador2 (1).jpg'
'Disapacidad Salvador2.jpg'
'Disapacidad Salvador3 (1).jpg'
'Disapacidad Salvador3.jpg'
'Disapacidad Salvador4 (1).jpg'
'Disapacidad Salvador4.jpg'
'Disapacidad Salvador5 (1).jpg'
'Disapacidad Salvador5.jpg'
'Disapacidad Salvador6 (1).jpg'
'Disapacidad Salvador6.jpg'
'DiscapacidadSalvador Sentencia.pdf'
 Estatutos.pdf
'Extraescolars Exemple [Form].gform'
'Extraescolars Exemple.gsheet'
[01;34m'fotos fam

In [None]:
from openai import OpenAI
import os
import base64
# NOTE(review): this rebinds the notebook-global name `Image` to
# IPython.display.Image, replacing the PIL `Image` imported by earlier cells.
# The FileConvertPDF and ImageOCR classes defined in this notebook look up
# `Image` when their methods run (Image.LANCZOS, Image.open), so calling them
# after this cell will hit the IPython class instead of PIL.
from IPython.display import Image, display
import pandas as pd

In [None]:
# Select the OpenAI chat model and build the API client.
# `api_key` must already have been loaded from Colab secrets by an earlier cell.
MODEL = "gpt-4o"
client = OpenAI(api_key=api_key)


In [None]:
# System prompt

In [None]:
# Install the third-party IMAPClient package.
# NOTE(review): the login cell below uses the standard-library `imaplib`, and no
# visible cell imports `imapclient` — confirm this install is still needed.
!pip install imapclient

Collecting imapclient
  Downloading IMAPClient-3.0.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading IMAPClient-3.0.1-py2.py3-none-any.whl (182 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/182.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.5/182.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imapclient
Successfully installed imapclient-3.0.1


In [None]:
# Mailbox settings for the IMAP login test below.
EMAIL = "javier.lagares@gmail.com"
IMAP_SERVER = "imap.gmail.com"
# The Gmail app password comes from the Colab secrets entry 'gmailAppPwd'.
PASSWORD = userdata.get('gmailAppPwd')

# Never echo the secret itself: cell output is stored in the notebook file.
# NOTE(review): an earlier saved run of this cell printed the password into the
# notebook output — revoke/rotate that app password.
if PASSWORD:
    print("Gmail app password loaded from Colab secrets.")
else:
    print("Gmail app password is missing: add a 'gmailAppPwd' entry to Colab secrets.")

-[REDACTED]-


In [None]:
import imaplib

# Connectivity check: log in to the mailbox over IMAP/SSL and select the inbox.
# Uses EMAIL, IMAP_SERVER and PASSWORD from the previous cell.
try:
    # IMAP4_SSL is a context manager: leaving the block logs out and closes the
    # connection even when a command fails (the original skipped logout() on error).
    with imaplib.IMAP4_SSL(IMAP_SERVER) as mail:
        try:
            mail.login(EMAIL, PASSWORD)
        except imaplib.IMAP4.error:
            # Only a rejected LOGIN is reported as a credentials problem.
            print("Login failed: Invalid credentials.")
        else:
            print("Login successful!")

            # select() reports failure through its status instead of raising.
            status, _select_data = mail.select("inbox")  # Select the inbox
            if status != "OK":
                print(f"Could not select the inbox (status: {status}).")

            # --- Work in progress, kept disabled ---------------------------------
            # NOTE(review): enabling this needs `import email`, and IMAP `OR`
            # takes two search keys — confirm the intended search criteria.
            #
            # Search for the latest email related to English classes
            # status, messages = mail.search(None, '(OR SUBJECT "TEST")')

            # email_ids = messages[0].split()

            # if email_ids:
            #     latest_email_id = email_ids[-1]  # Get the latest email ID

            #     # Fetch the latest email
            #     status, msg_data = mail.fetch(latest_email_id, "(RFC822)")

            #     for response_part in msg_data:
            #         if isinstance(response_part, tuple):
            #             msg = email.message_from_bytes(response_part[1])
            #             sender = msg["from"]
            #             subject = msg["subject"]
            #             date = msg["date"]

            #             # Extract email content
            #             body = ""
            #             if msg.is_multipart():
            #                 for part in msg.walk():
            #                     if part.get_content_type() == "text/plain":
            #                         body = part.get_payload(decode=True).decode("utf-8", errors="ignore")
            #                         break
            #             else:
            #                 body = msg.get_payload(decode=True).decode("utf-8", errors="ignore")

            #             # Display information
            #             print("\n=== Latest Email Related to TEST Classes ===")
            #             print(f"Sender: {sender}")
            #             print(f"Subject: {subject if subject else 'No Subject'}")
            #             print(f"Date: {date}")
            #             print("\nEmail Content:\n" + body[:500])  # Display first 500 characters
            #             print("\n===============================================\n")
except imaplib.IMAP4.error as exc:
    # Any other protocol-level failure (e.g. while connecting) is reported as such
    # rather than being mislabeled as a credentials problem.
    print(f"IMAP error: {exc}")


Login successful!
