In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and echo the full path of every file found.
kaggle_input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install pytesseract opencv-python pandas sqlite3 torch torchvision scikit-learn

In [None]:
import cv2
import pytesseract
from pytesseract import Output
import os

# Set Tesseract OCR Path: prefer whatever `tesseract` resolves to on PATH and fall
# back to the conventional Linux location when it is not found there.
import shutil

pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or r'/usr/bin/tesseract'

def extract_text_from_image(image_path):
    """Run Tesseract OCR on one image file and return the recognised text.

    The image is loaded with OpenCV, converted to grayscale and binarised with a
    fixed threshold of 128 before being handed to Tesseract with the English model.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string returned by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV could not load ``image_path`` (missing, unreadable or
            not a decodable image).
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising; without this
    # check the problem only surfaces as an opaque error inside cvtColor.
    if image is None:
        raise OSError(f"Could not read image file: {image_path!r}")

    # Convert to grayscale, then binarise: pixels above 128 become white (255).
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)

    return pytesseract.image_to_string(thresh, lang='eng')

# Process all images in the folder
def process_images(folder_path, extensions=('.png', '.jpg', '.jpeg')):
    """OCR every image file directly inside ``folder_path``.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        extensions: File-name suffixes treated as images. Matching is
            case-insensitive, so ``photo.JPG`` is picked up as well.

    Returns:
        A list of ``{'file_name': ..., 'text': ...}`` dicts, one per image,
        ordered by file name.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and everything derived from it) reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(suffixes):
            continue
        file_path = os.path.join(folder_path, file_name)
        text = extract_text_from_image(file_path)
        data.append({'file_name': file_name, 'text': text})
    return data

# Run OCR over the images found in data/images (path is relative to the working directory).
image_data = process_images('data/images')

In [None]:
import pandas as pd

def clean_and_structure_data(raw_data):
    """Turn raw OCR records into a DataFrame of positional patient fields.

    Each record's text is split into lines; surrounding whitespace is stripped and
    blank lines are dropped so that padding in the OCR output does not shift the
    fields. The remaining lines are read positionally (customize for your format):

    * line 1 -> ``patient_name``
    * line 2 -> ``age`` (only when it contains ``'Age:'``; the value is the text
      following that marker, up to the next colon)
    * line 3 -> ``diagnosis``
    * lines 4+ -> ``prescription`` (joined with single spaces)

    Args:
        raw_data: Iterable of dicts that each have a ``'text'`` string.

    Returns:
        A ``pandas.DataFrame`` with the columns ``patient_name``, ``age``,
        ``diagnosis`` and ``prescription`` (always present, even for empty input).
        Fields that could not be found are ``None``.
    """
    columns = ['patient_name', 'age', 'diagnosis', 'prescription']
    structured_data = []
    for record in raw_data:
        stripped = (line.strip() for line in record['text'].split('\n'))
        lines = [line for line in stripped if line]

        age = None
        if len(lines) > 1 and 'Age:' in lines[1]:
            # Take what follows the 'Age:' marker rather than the first colon, so a
            # line such as 'Sex: F Age: 30' still yields '30'.
            age = lines[1].split('Age:', 1)[1].split(':', 1)[0].strip()

        structured_data.append({
            'patient_name': lines[0] if lines else None,
            'age': age,
            'diagnosis': lines[2] if len(lines) > 2 else None,
            'prescription': ' '.join(lines[3:]) if len(lines) > 3 else None,
        })
    # Explicit columns keep the schema stable when raw_data is empty.
    return pd.DataFrame(structured_data, columns=columns)

# Parse the OCR text of each processed image into one row of patient fields.
structured_data = clean_and_structure_data(image_data)

In [None]:
import sqlite3

def save_to_database(dataframe, db_name, table_name):
    """Write ``dataframe`` to a table of a SQLite database file.

    Args:
        dataframe: Object exposing ``to_sql`` (a ``pandas.DataFrame``).
        db_name: Path of the SQLite database; created if it does not exist.
        table_name: Destination table. An existing table of that name is
            replaced (``if_exists='replace'``); the index is not written.
    """
    conn = sqlite3.connect(db_name)
    try:
        dataframe.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Release the connection even when the write fails.
        conn.close()

# Persist the structured rows to the 'prescriptions' table of hospital_data.db, replacing any existing table.
save_to_database(structured_data, 'hospital_data.db', 'prescriptions')

In [None]:
def load_data_from_database(db_name, table_name):
    """Read an entire table of a SQLite database file into a DataFrame.

    Args:
        db_name: Path of the SQLite database file.
        table_name: Name of the table to read. It is treated as a single
            identifier and quoted, so names containing spaces or punctuation
            can be read back.

    Returns:
        A ``pandas.DataFrame`` holding every row and column of the table.
    """
    # Identifiers cannot be bound as SQL parameters, so quote the name instead:
    # wrap it in double quotes and double any embedded double quote.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    conn = sqlite3.connect(db_name)
    try:
        dataframe = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return dataframe

# Reload the structured records that were persisted to SQLite.
ml_data = load_data_from_database('hospital_data.db', 'prescriptions')

# Example ML Pipeline (basic)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Feature extraction from text: bag-of-words counts over the diagnosis line
# (missing diagnoses are treated as empty documents).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(ml_data['diagnosis'].fillna(''))
# The age column holds free text parsed from OCR output, so it may contain values
# that are not plain integers (e.g. '45 years', '45.0'). Coerce those to NaN and
# then to 0 rather than letting astype(int) raise on the first bad value.
y = pd.to_numeric(ml_data['age'], errors='coerce').fillna(0).astype(int)  # Replace age with your target variable

# Train-test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the held-out 20%
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

In [None]:
Step 4: Optional Enhancements

1. Handwritten Text Improvement:

Fine-tune a CRNN, or use a CRNN pre-trained on a handwriting corpus such as the IAM dataset.



2. Data Visualization:

Use Matplotlib or Seaborn to visualize trends in data.



3. Validation:

Validate OCR output manually for higher accuracy before ML processing.



4. Deployment:

Build a GUI with Tkinter and package the project as a desktop application using PyInstaller.

