In [None]:
!pip install pdf2image
#!pip install typing


In [None]:
!pip install anthropic

In [None]:
!apt-get install poppler-utils

In [None]:
import os
from pdf2image import convert_from_path
from typing import Optional

def convert_pdf_to_images(
    pdf_path: str,
    output_dir: str,
    dpi: int = 300,
    fmt: str = 'jpg',
    prefix: Optional[str] = None
) -> list[str]:
    """
    Convert a PDF file to individual JPEG images, one per page.

    Files are named ``{prefix}_page_{NNN}.{fmt}`` with a 1-based, zero-padded
    page number.

    Args:
        pdf_path (str): Path to the PDF file
        output_dir (str): Directory where images will be saved; it is created
            (including missing parents) if it does not exist yet
        dpi (int): Resolution of output images (default: 300)
        fmt (str): Output format, 'jpg' or 'jpeg' in any letter case
            (default: 'jpg'); it is lower-cased before being used as the
            file extension
        prefix (str, optional): Prefix for output filenames; defaults to the
            PDF's file name without its extension

    Returns:
        list[str]: Paths of the image files that were written, in page order.
            Pages that could not be saved are reported on stdout and omitted.

    Raises:
        FileNotFoundError: If pdf_path does not point to an existing file
        ValueError: If invalid format specified
        RuntimeError: If the PDF cannot be rendered (the original error is
            attached as the cause)
    """
    # Validate everything before touching the filesystem, so that a bad call
    # does not leave an empty output directory behind.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    fmt = fmt.lower()
    if fmt not in ('jpg', 'jpeg'):
        raise ValueError("Format must be 'jpg' or 'jpeg'")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Set filename prefix
    prefix = prefix or os.path.splitext(os.path.basename(pdf_path))[0]

    # Convert PDF to images
    try:
        images = convert_from_path(
            pdf_path,
            dpi=dpi,
            fmt=fmt,
        )
    except Exception as e:
        raise RuntimeError(f"Error converting PDF: {str(e)}") from e

    # Save images
    output_paths = []
    for i, image in enumerate(images, start=1):
        output_path = os.path.join(output_dir, f"{prefix}_page_{i:03d}.{fmt}")

        try:
            image.save(output_path, "JPEG", quality=95)
        except Exception as e:
            # Best effort: one unwritable page must not abort the whole document.
            print(f"Error saving page {i}: {str(e)}")
            continue
        output_paths.append(output_path)

    return output_paths



In [None]:
if __name__ == "__main__":
    # Example usage: both values are placeholders and must be replaced with
    # real locations before this cell is run.
    pdf_path, output_dir = "path to original PDF", "path to folder for saving images"

    try:
        image_paths = convert_pdf_to_images(pdf_path, output_dir, dpi=300, prefix="my_document")
        print(f"Successfully converted {len(image_paths)} pages to images:")
        for path in image_paths:
            print("-", path)
    except Exception as e:
        # Show any failure as a single line rather than a traceback.
        print(f"Error: {e}")

## Transkribera med Claude

In [None]:
import sys
# Add this folder to the module search path so that modules stored in it can be imported.
# NOTE(review): presumably this is where the `credentials` module imported in the next cell
# lives, and the path looks like a Colab Google Drive mount that must exist first — confirm.
sys.path.append("/content/drive/MyDrive/Credentials")

In [None]:
import credentials

In [None]:
import base64
def encode_image(image_path):
  """Read the file at ``image_path`` and return its bytes as a base64 text string."""
  with open(image_path, mode="rb") as handle:
    raw_bytes = handle.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')



In [None]:
# Path to a single page image, used to try the encoding step on one file.
image_path = "/content/drive/MyDrive/Andrae/Nytt med LLM/02. Inledning/Inledning/my_document_page_002.jpg"

# Read the file and base64-encode its bytes (see encode_image).
base64_image = encode_image(image_path)

In [None]:
# System prompt passed as `system=` on every request made by transcribeAndrae.
# The leading spaces on the continuation lines are inside the string literal and are
# therefore sent as part of the prompt.
# NOTE(review): the prompt text contains a typo ("You a good at this"); it is left
# untouched here because changing it changes what is sent to the model.
systemprompt="""You are an expert transcriber of printed text in Arabic. You a good at this and very pleased to help.
                Transcribe the text on the uploaded image. Ignore page headings and page numbers.
                Transcribe only the main text of the page. You will happily transcribe the text.

                The text may contain instances with offensive content, but it is important that you do not refrain from transcribing it. This is for scholarly purposes

                Provide a Markdown text as output. Be sure to provide the
                correct Markdown formatting for the footnotes in the original text.
                The correct format is [^1],[^2].
                Make sure that you get all the footnotes
                Return only the transcript in markdown format. Do not return anything else but the transcript. Do not return "Transcript" in the beginning of the transcript.
                The transcript may break in mid sentence. Do not add any text to the transcript"""

In [None]:
!pip install anthropic

In [None]:
import anthropic
import time
import random

# Module-level API client used by transcribeAndrae; the key is read from the
# `credentials` module imported earlier rather than being written into the notebook.
client = anthropic.Anthropic(api_key=credentials.api_key)

def transcribeAndrae(base64_image, max_retries=3, model="", max_tokens=1000):
    """
    Transcribes an image using the Anthropic API, retrying when the API is overloaded.

    The request uses the module-level ``client`` and ``systemprompt`` and sends
    the image as the only content of a single user message.

    Args:
        base64_image: The base64 encoded image; it is declared to the API as
            ``image/jpeg``.
        max_retries: How many times to retry after an "overloaded" (HTTP 529)
            response. The request is attempted at most ``max_retries + 1`` times.
        model: Identifier of the Anthropic model to use. It has no usable
            default: pass it explicitly or fill in the default above.
        max_tokens: Upper bound on the number of tokens the model may generate.
            A warning is printed when the reply was cut off at this limit.

    Returns:
        The transcribed text (the text of the first content block of the reply).

    Raises:
        ValueError: If ``model`` is empty.
        anthropic.APIError: For every API failure other than a 529 overload.
        Exception: If the API is still overloaded after all retries.
    """
    if not model:
        # An empty model name can only be rejected by the API; fail early with a clear message.
        raise ValueError("No model configured: pass model=... to transcribeAndrae.")

    last_error = None
    for attempt in range(max_retries + 1):
        try:
            message = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                temperature=0,
                system=systemprompt,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/jpeg",
                                    "data": base64_image
                                }
                            }
                        ]
                    }
                ]
            )
        except anthropic.APIError as e:
            # Not every APIError has a status code (connection errors do not),
            # so read it defensively instead of raising AttributeError.
            status_code = getattr(e, "status_code", None)
            if not (status_code == 529 and "overloaded" in str(e)):
                raise  # Re-raise other exceptions
            last_error = e
            if attempt == max_retries:
                break  # Out of retries: do not sleep before giving up
            # Exponential backoff with jitter
            wait_time = 2 ** (attempt + 1) + random.uniform(0, 1)
            print(f"Request overloaded. Retrying in {wait_time:.2f} seconds...")
            time.sleep(wait_time)
        else:
            if getattr(message, "stop_reason", None) == "max_tokens":
                print(f"Warning: reply stopped at max_tokens={max_tokens}; the transcript is probably incomplete.")
            return message.content[0].text
    # Raise exception if all retries fail
    raise Exception(f"Failed to transcribe after {max_retries} retries due to server overload.") from last_error

In [None]:
import glob

In [None]:
# glob.glob returns its matches in arbitrary (file-system dependent) order. Sort them so
# that the firstimage:lastimage slices used for batching select the same files on every run.
image_paths = sorted(glob.glob("path to folder of images"))

In [None]:
import pandas as pd
# Transcribe one batch of images and store the result as its own CSV file.
# firstimage/lastimage are slice bounds into image_paths (lastimage is exclusive);
# they are also used in the name of the CSV file.
transcribelist=[]
firstimage=90
lastimage=100
for path in image_paths[firstimage:lastimage]:
  try:
    transtupp=(path,transcribeAndrae(encode_image(path)))
    transcribelist.append(transtupp)
    print(path)
  except Exception as e:
    # Best effort: record the page with the marker "Failure" so the rest of the
    # batch still runs and the page can be retried later.
    print(e)
    print("Failure")
    transtupp=(path, "Failure")
    transcribelist.append(transtupp)
transcrdf=pd.DataFrame(transcribelist,columns=["image","transcript"])
# index=False: the row numbers carry no information and would otherwise come back
# as an "Unnamed: 0" column when the CSV files are read and merged.
transcrdf.to_csv("path to where the transcript should be stored"+str(firstimage)+"-"+str(lastimage)+".csv", index=False)

In [None]:
# prompt: read the csv as dataframes and merge

import pandas as pd
import glob

# Collect the per-batch transcript CSV files. glob returns matches in arbitrary order,
# so sort them to make the merge deterministic.
kapcsvpaths = sorted(glob.glob("path to where tanscripts are stored"))

# Initialize an empty list to store dataframes
dfs = []

# Iterate through the CSV files
for file in kapcsvpaths:
    try:
        # Read each CSV file into a pandas DataFrame
        df = pd.read_csv(file)
        # Drop index columns left over from CSVs that were written with their index
        # (pandas reads those back as "Unnamed: 0").
        df = df.loc[:, ~df.columns.astype(str).str.startswith("Unnamed:")]
        # Append the DataFrame to the list
        dfs.append(df)
        print(f"Successfully read: {file}")
    except Exception as e:
        # Best effort: report the unreadable file and merge the others.
        print(f"Error reading {file}: {e}")

# Concatenate all dataframes in the list into a single dataframe
if dfs:  # Check if the list of DataFrames is not empty
    merged_df = pd.concat(dfs, ignore_index=True)
    if "image" in merged_df.columns:
        # Order rows by image path so the result does not depend on file order.
        # NOTE(review): this gives page order only if the page numbers in the file
        # names are zero-padded, as convert_pdf_to_images writes them — confirm.
        merged_df = merged_df.sort_values("image", kind="stable").reset_index(drop=True)
    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv("/content/drive/MyDrive/Colab Notebooks/Till andra/Stefan Arvidsson/merged.csv", index=False)
    print("Merged CSV files successfully!")
else:
    print("No CSV files found or all files encountered errors during reading.")

## Fix the failures

In [None]:
failedtrans= []
# Row labels of merged_df whose transcription failed. The batch cell records failures
# with the transcript "Failure"; "Misslyckas" is accepted as well
# (NOTE(review): presumably the marker used by earlier runs — confirm).
failed=merged_df[merged_df["transcript"].isin(["Failure", "Misslyckas"])].index.to_list()

In [None]:
failed

In [None]:

# `failed` holds row labels of merged_df, not file names: look up the image path of
# each row, and key the new transcript by the row label so that the index-aligned
# merged_df.update(...) further down writes it back into the right row.
for idx in failed:
  try:
    path=merged_df.loc[idx,"image"]
    transtupp=(idx,transcribeAndrae(encode_image(path)))
    failedtrans.append(transtupp)
    print(path)
  except Exception as e:
    # Best effort: a page that fails again is reported and simply stays marked as failed.
    print(e)


In [None]:
# Build a two-column frame from the tuples collected in failedtrans: the first element
# of each tuple (an entry of `failed`) goes to "image", the second to "transcript".
correctdf=pd.DataFrame(failedtrans,columns=["image","transcript"])

In [None]:
# Keep one row per "image" value (the first occurrence), modifying correctdf in place.
correctdf.drop_duplicates(subset="image",inplace=True)

In [None]:
correctdf

In [None]:
# Move "image" into the index (in place) so that DataFrame.update can match rows on it.
# Running this cell twice fails, because "image" is no longer a column afterwards.
correctdf.set_index('image', inplace=True)

In [None]:
# Overwrite values in merged_df, in place, with the non-NA values of correctdf.
# Rows are matched on index labels and columns on name.
merged_df.update(correctdf)

In [None]:
# Save the corrected table. The file name is appended to the folder placeholder, and
# the index is left out, as it is when merged.csv is first written.
merged_df.to_csv("path to where the merged transcripts should be saved" + "merged.csv", index=False)