In [None]:
import os
import re

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
from google.colab import files, drive
from PIL import Image

# Mount Google Drive to access dataset.
# The drive is mounted at /content/drive; the image folder used later in this
# script lives under that mount point, so this must run first.
drive.mount('/content/drive', force_remount=True)

# Set up Tesseract OCR: tell pytesseract which Tesseract executable to invoke.
# NOTE(review): assumes the binary is installed at this path in the runtime.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Function to preprocess image
def preprocess_image(image_path):
    """Load the image at *image_path* and binarize it for OCR.

    The pipeline is: read with OpenCV, convert BGR to grayscale, smooth
    with a 5x5 Gaussian blur, then apply a Gaussian adaptive threshold
    (binary output, max value 255, block size 11, constant 2).

    Returns the thresholded image, or ``None`` (after printing an error
    message) when OpenCV cannot load the file.
    """
    bgr = cv2.imread(image_path)
    if bgr is not None:
        smoothed = cv2.GaussianBlur(
            cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), (5, 5), 0
        )
        return cv2.adaptiveThreshold(
            smoothed,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2,
        )

    # cv2.imread signals failure by returning None rather than raising.
    print(f"Error: Unable to load {image_path}. Skipping...")
    return None

# Function to extract text using OCR
def extract_text(image_path):
    """Return the text Tesseract recognizes in the image at *image_path*.

    The image is first passed through ``preprocess_image``; when that
    yields ``None`` (image could not be loaded) an empty string is returned
    and OCR is not attempted.
    """
    binarized = preprocess_image(image_path)
    # "--psm 6" selects Tesseract's page segmentation mode 6.
    return (
        ""
        if binarized is None
        else pytesseract.image_to_string(binarized, config='--psm 6')
    )

# Helpers for structuring extracted text.
# A sex marker must be a standalone token: plain substring tests for "m"/"f"
# would fire on almost every line (e.g. "500 mg", "Amoxicillin", "Smith").
_SEX_TOKEN_RE = re.compile(r'\b(?:male|female|m|f)\b', re.IGNORECASE)
# First run of digits that follows the literal "Age" on a line.
_AGE_VALUE_RE = re.compile(r'Age\D*(\d+)')


def structure_text(file_number, extracted_text):
    """Sort OCR output lines into prescription fields using keyword heuristics.

    Each stripped line is tested against the rules below in order; the first
    rule that matches claims the line. For every field except
    ``'Patient Name'`` a later matching line overwrites an earlier one.

    1. contains ``'Dr.'``                          -> ``'Doctor Name'`` (whole line)
    2. contains ``'Age'``                          -> ``'Age'`` (first digit run
       after "Age"; if there is none, all digits on the line joined)
    3. has a standalone male/female/m/f token      -> ``'Sex'`` (whole line)
    4. contains ``'Rx'`` or ``'Prescription'``     -> ``'Prescription'``
    5. contains ``'mg'`` or ``'ml'``               -> ``'Dosage'``
    6. contains ``'Take'`` or ``'daily'``          -> ``'Instructions'``
    7. has more than two words, name not yet set   -> ``'Patient Name'``

    Note that rule 3 also claims a line whose only sex-like token is a lone
    initial such as the "M" in "John M Doe".

    Args:
        file_number: Value stored unchanged under ``'File Number'``.
        extracted_text: Raw OCR text; ``None`` or ``''`` yields empty fields.

    Returns:
        A dict with keys ``'File Number'``, ``'Patient Name'``,
        ``'Doctor Name'``, ``'Age'``, ``'Sex'``, ``'Prescription'``,
        ``'Dosage'`` and ``'Instructions'``; unmatched fields are ``''``.
    """
    structured_data = {
        'File Number': file_number,
        'Patient Name': '',
        'Doctor Name': '',
        'Age': '',
        'Sex': '',
        'Prescription': '',
        'Dosage': '',
        'Instructions': '',
    }

    for raw_line in (extracted_text or '').split('\n'):
        line = raw_line.strip()
        if 'Dr.' in line:
            structured_data['Doctor Name'] = line
        elif 'Age' in line:
            # Prefer the number right after "Age" so other figures on the
            # same line (weight, date, ...) are not glued onto it.
            age_match = _AGE_VALUE_RE.search(line)
            if age_match:
                structured_data['Age'] = age_match.group(1)
            else:
                structured_data['Age'] = ''.join(filter(str.isdigit, line))
        elif _SEX_TOKEN_RE.search(line):
            structured_data['Sex'] = line
        elif 'Rx' in line or 'Prescription' in line:
            structured_data['Prescription'] = line
        elif 'mg' in line or 'ml' in line:
            structured_data['Dosage'] = line
        elif 'Take' in line or 'daily' in line:
            structured_data['Instructions'] = line
        elif len(line.split()) > 2 and not structured_data['Patient Name']:
            structured_data['Patient Name'] = line

    return structured_data

# Load images from Google Drive folder.
image_folder = '/content/drive/MyDrive/Datasets/datasets/prescription/'
# The extension test is case-insensitive so files such as "scan.JPG" are not
# skipped, and the names are sorted because os.listdir returns entries in
# arbitrary order -- without sorting, "the first 10 files" could differ
# between runs.
image_files = sorted(
    f for f in os.listdir(image_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Read only first 10 files
data = []
num_files_to_process = min(10, len(image_files))

for idx, image_file in enumerate(image_files[:num_files_to_process], start=1):
    image_path = os.path.join(image_folder, image_file)
    extracted_text = extract_text(image_path)
    structured_data = structure_text(idx, extracted_text)
    data.append(structured_data)

    # Display image and extracted text. The file handle is released by the
    # context manager, and an image that cannot be opened or decoded only
    # skips the preview instead of aborting the whole batch (extract_text
    # has already handled that file by returning an empty string).
    try:
        with Image.open(image_path) as img:
            plt.imshow(img)
    except OSError as exc:
        print(f"Warning: Unable to display {image_file}: {exc}")
    else:
        plt.axis('off')
        plt.show()
    print(f'Extracted Text from {image_file}:\n', extracted_text, '\n' + '-'*50)

# Convert data to DataFrame and export.
# The CSV is written next to the images, i.e. inside the mounted Drive folder.
prescriptions_df = pd.DataFrame(data)
csv_path = os.path.join(image_folder, 'extracted_prescriptions.csv')
prescriptions_df.to_csv(csv_path, index=False)

# Ensure the file is saved properly, then offer it as a browser download.
if os.path.exists(csv_path):
    print(f"CSV file successfully saved at: {csv_path}")
    files.download(csv_path)
else:
    print("Error: CSV file was not saved correctly.")
