# Handwritten Prescription Medicine Recognition

This notebook implements a TensorFlow-based pipeline for recognizing medicine names from handwritten prescriptions.


In [None]:
# Cell: Imports and Environment Setup
import os
import json
import pathlib
import string
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from PIL import Image

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)


2025-10-20 12:44:53.464510: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Cell: GPU Configuration
# Detect GPUs and ask TensorFlow to allocate GPU memory on demand.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected. Training will fall back to CPU.')
else:
    try:
        # Enable memory growth on every visible GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(f"Physical GPUs: {len(gpus)}, Logical GPUs: {len(logical_gpus)}")
    except RuntimeError as e:
        # Report the configuration failure instead of aborting the notebook.
        print(e)


In [None]:
# Cell: Configuration Parameters
# --- Dataset layout -------------------------------------------------------
DATA_ROOT = pathlib.Path('dataset')  # Root directory containing Training/Validation/Testing
TRAIN_DIR = DATA_ROOT / 'Training'
VAL_DIR = DATA_ROOT / 'Validation'
TEST_DIR = DATA_ROOT / 'Testing'

# Per-split folders holding the word images.
TRAIN_IMAGES_DIR = TRAIN_DIR / 'training_words'
VAL_IMAGES_DIR = VAL_DIR / 'validation_words'
TEST_IMAGES_DIR = TEST_DIR / 'testing_words'

# Per-split CSV annotation files.
TRAIN_LABELS_FILE = TRAIN_DIR / 'training_labels.csv'
VAL_LABELS_FILE = VAL_DIR / 'validation_labels.csv'
TEST_LABELS_FILE = TEST_DIR / 'testing_labels.csv'

# --- CSV schema -----------------------------------------------------------
IMAGE_COLUMN = 'IMAGE'  # column containing the image filename
LABEL_COLUMN = 'MEDICINE_NAME'  # supervised target column; adjust if you want GENERIC_NAME instead

# --- Outputs --------------------------------------------------------------
# Created eagerly so later cells can write checkpoints/logs beneath it.
OUTPUT_DIR = pathlib.Path('artifacts')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Pipeline hyperparameters ---------------------------------------------
IMG_HEIGHT = 128  # target image height used by preprocess_image and the model input
IMG_WIDTH = 512  # target image width used by preprocess_image and the model input
BATCH_SIZE = 16
AUTOTUNE = tf.data.AUTOTUNE
MAX_LABEL_LENGTH = 64  # labels are truncated / zero-padded to this many ids by encode_label_py

print(f'Data root: {DATA_ROOT.resolve()}')
print(f'Training labels: {TRAIN_LABELS_FILE}')
print(f'Validation labels: {VAL_LABELS_FILE}')
print(f'Testing labels: {TEST_LABELS_FILE}')


In [None]:
# Cell: Placeholder Asset Creation
# Make sure a blank white image exists; create_dataset uses it when real data is missing.
placeholder_path = pathlib.Path('placeholder.png')
if placeholder_path.exists():
    print(f'Placeholder image already exists at {placeholder_path.resolve()}')
else:
    # 'L' = 8-bit grayscale; 255 fills the canvas with white.
    placeholder_image = Image.new('L', (IMG_WIDTH, IMG_HEIGHT), color=255)
    placeholder_image.save(placeholder_path)
    print(f'Created placeholder image at {placeholder_path.resolve()}')


In [None]:
# Cell: Character Vocabulary
# Build vocabulary from medical lexicon or dataset labels
# Characters that are always part of the vocabulary, whatever the labels contain.
DEFAULT_CHARSET = ''.join((string.ascii_lowercase, string.ascii_uppercase, string.digits, ' -./()'))


def build_vocabulary(labels: List[str], extra_tokens: str = '') -> Tuple[Dict[str, int], Dict[int, str]]:
    """Build the character <-> id lookup tables.

    The alphabet is the union of every character found in ``labels``, the
    characters of ``DEFAULT_CHARSET`` and the characters of ``extra_tokens``.
    Characters are numbered from 1 in sorted order; id 0 is reserved for the
    special ``'<BLANK>'`` entry.

    Args:
        labels: Label strings whose characters are added to the alphabet.
        extra_tokens: Additional single characters to include.

    Returns:
        A ``(char_to_num, num_to_char)`` pair of mutually inverse dictionaries.
    """
    alphabet = set(DEFAULT_CHARSET)
    alphabet.update(''.join(labels))
    alphabet.update(extra_tokens)

    char_to_num = {symbol: position for position, symbol in enumerate(sorted(alphabet), start=1)}
    char_to_num['<BLANK>'] = 0

    num_to_char = dict(zip(char_to_num.values(), char_to_num.keys()))
    return char_to_num, num_to_char

def load_labels(annotation_file: pathlib.Path) -> List[str]:
    """Read the label column of an annotation CSV as a list of strings.

    Args:
        annotation_file: Path to the CSV file holding the labels.

    Returns:
        Every value of the ``LABEL_COLUMN`` column cast to ``str``, or the
        two built-in placeholder names when the file does not exist.

    Raises:
        ValueError: If the CSV exists but has no ``LABEL_COLUMN`` column.
    """
    if annotation_file.exists():
        frame = pd.read_csv(annotation_file)
        if LABEL_COLUMN in frame.columns:
            return frame[LABEL_COLUMN].astype(str).tolist()
        raise ValueError(f"Expected column '{LABEL_COLUMN}' in {annotation_file}, found {list(frame.columns)}")

    # No annotations on disk: keep the notebook runnable with stand-in labels.
    print(f"{annotation_file} not found; returning placeholder labels.")
    return ['Paracetamol', 'Ibuprofen']

# The vocabulary is derived from the training labels only. `raw_labels` is
# also reused by create_dataset as the label list for placeholder samples.
raw_labels = load_labels(TRAIN_LABELS_FILE)
CHAR_TO_NUM, NUM_TO_CHAR = build_vocabulary(raw_labels)
# Counts every character plus the '<BLANK>' entry stored at id 0.
VOCAB_SIZE = len(CHAR_TO_NUM)
print(f'Vocabulary size: {VOCAB_SIZE}')


In [None]:
# Cell: Data Loading Utilities
def read_image(path: tf.Tensor) -> tf.Tensor:
    """Load an image file as a single-channel float32 tensor.

    Args:
        path: Scalar string (tensor) naming the image file.

    Returns:
        The decoded image with one channel, converted to ``tf.float32``.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.io.decode_png(raw_bytes, channels=1)
    return tf.image.convert_image_dtype(pixels, tf.float32)

@tf.function
def preprocess_image(image: tf.Tensor) -> tf.Tensor:
    """Fit an image onto the fixed IMG_HEIGHT x IMG_WIDTH canvas and normalise it.

    The image is resized without distorting its aspect ratio, anchored at the
    top-left corner of the canvas and zero-padded to the right/bottom, then
    brightened by a fixed offset and contrast-stretched.

    Args:
        image: Image tensor as produced by ``read_image``.

    Returns:
        The processed image of size ``IMG_HEIGHT`` x ``IMG_WIDTH``.
    """
    target_size = [IMG_HEIGHT, IMG_WIDTH]
    resized = tf.image.resize(image, target_size, preserve_aspect_ratio=True)
    # NOTE(review): padding is filled with zeros (black) — confirm this suits
    # the dataset's background colour.
    canvas = tf.image.pad_to_bounding_box(resized, 0, 0, IMG_HEIGHT, IMG_WIDTH)
    brightened = tf.image.adjust_brightness(canvas, 0.05)
    return tf.image.adjust_contrast(brightened, 1.5)

@tf.function
def augment_image(image: tf.Tensor) -> tf.Tensor:
    """Apply random photometric augmentation to a training image.

    Only brightness and contrast are jittered. The previous random
    ``tf.image.rot90`` step was removed for two reasons:

    * for an odd number of quarter turns it swaps height and width, so a
      128x512 image becomes 512x128 — elements of different shapes cannot be
      batched together and no longer match the model's input shape;
    * quarter/half turns change the reading direction of the text, which is
      not a realistic variation for word recognition.

    Args:
        image: Preprocessed image tensor.

    Returns:
        The augmented image, same shape as the input.
    """
    image = tf.image.random_brightness(image, 0.15)
    image = tf.image.random_contrast(image, 0.75, 1.25)
    return image

def encode_label_py(label: str) -> np.ndarray:
    """Turn a label string into a fixed-length vector of character ids.

    Characters missing from ``CHAR_TO_NUM`` are encoded as the space
    character. The result is truncated to ``MAX_LABEL_LENGTH`` entries and
    right-padded with zeros up to that length.

    Args:
        label: The text to encode.

    Returns:
        An ``int32`` array of shape ``(MAX_LABEL_LENGTH,)``.
    """
    fallback_id = CHAR_TO_NUM[' ']
    encoded = np.zeros(MAX_LABEL_LENGTH, dtype=np.int32)
    for position, ch in enumerate(label[:MAX_LABEL_LENGTH]):
        encoded[position] = CHAR_TO_NUM.get(ch, fallback_id)
    return encoded

@tf.function
def prepare_example(path: tf.Tensor, label: tf.Tensor, training: bool = False):
    """Turn one (image path, label text) pair into model-ready tensors.

    Args:
        path: Scalar string tensor with the image file path.
        label: Scalar string tensor with the label text.
        training: When true, the random augmentations are applied.

    Returns:
        An ``(image, label_encoded)`` tuple: the preprocessed image and the
        int32 id vector of length ``MAX_LABEL_LENGTH``.
    """
    image = read_image(path)
    image = preprocess_image(image)
    # NOTE(review): `training` is expected to be a plain Python bool (as passed
    # by create_dataset), so this branch is decided when the function is traced.
    if training:
        image = augment_image(image)
    # The encoder is plain Python/NumPy, so it is wrapped in numpy_function;
    # the label arrives there as bytes and is decoded to str first.
    label_encoded = tf.numpy_function(func=lambda l: encode_label_py(l.decode('utf-8')), inp=[label], Tout=tf.int32)
    # Declare the static shape explicitly so downstream batching sees a fixed length.
    label_encoded.set_shape([MAX_LABEL_LENGTH])
    return image, label_encoded


In [None]:
# Cell: Dataset Pipeline
def create_dataset(annotation_file: pathlib.Path, images_dir: pathlib.Path, training: bool = False) -> tf.data.Dataset:
    """Build the batched ``tf.data`` pipeline for one dataset split.

    When either the annotation CSV or the images directory is missing, the
    pipeline falls back to placeholder samples: ``placeholder.png`` paired
    with every entry of the module-level ``raw_labels``.

    For training, the (path, label) pairs are shuffled *before* the images
    are decoded. Shuffling strings is cheap, so the buffer can cover the whole
    split (a full shuffle on every epoch) without holding decoded images in
    memory.

    Args:
        annotation_file: CSV with the ``IMAGE_COLUMN`` and ``LABEL_COLUMN`` columns.
        images_dir: Directory containing the files named in ``IMAGE_COLUMN``.
        training: Enables shuffling and random augmentation.

    Returns:
        A dataset of ``(images, encoded_labels)`` batches of size ``BATCH_SIZE``.

    Raises:
        ValueError: If the CSV lacks ``IMAGE_COLUMN`` or ``LABEL_COLUMN``.
    """
    if not annotation_file.exists() or not images_dir.exists():
        print(f"Either {annotation_file} or {images_dir} is missing. Using placeholder samples.")
        labels = list(raw_labels)
        paths = [str(pathlib.Path('placeholder.png').resolve())] * len(labels)
    else:
        df = pd.read_csv(annotation_file)
        if IMAGE_COLUMN not in df.columns:
            raise ValueError(f"Expected column '{IMAGE_COLUMN}' in {annotation_file}, found {list(df.columns)}")
        if LABEL_COLUMN not in df.columns:
            raise ValueError(f"Expected column '{LABEL_COLUMN}' in {annotation_file}, found {list(df.columns)}")
        paths = [str((images_dir / str(name)).resolve()) for name in df[IMAGE_COLUMN]]
        labels = df[LABEL_COLUMN].astype(str).tolist()

    # Force string dtype so an empty split still yields string tensors
    # (an empty Python list would otherwise be inferred as float32).
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.constant(paths, dtype=tf.string), tf.constant(labels, dtype=tf.string))
    )

    if training and paths:
        # Shuffle the lightweight path/label pairs, not decoded images.
        dataset = dataset.shuffle(len(paths))
    dataset = dataset.map(lambda p, l: prepare_example(p, l, training), num_parallel_calls=AUTOTUNE)
    return dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Build the three splits; only the training split passes training=True,
# which is what enables augmentation and shuffling.
train_ds = create_dataset(TRAIN_LABELS_FILE, TRAIN_IMAGES_DIR, training=True)
val_ds = create_dataset(VAL_LABELS_FILE, VAL_IMAGES_DIR, training=False)
test_ds = create_dataset(TEST_LABELS_FILE, TEST_IMAGES_DIR, training=False)


In [None]:
# Cell: Model Architecture
def build_crnn_model(img_width: int, img_height: int, vocab_size: int) -> keras.Model:
    """Build the CRNN (convolutional + bidirectional LSTM) recogniser.

    The convolutional stack pools the height by 16 and the width by 4, so
    each output time step corresponds to a 4-pixel-wide column of the input.

    Args:
        img_width: Width of the input images in pixels.
        img_height: Height of the input images in pixels.
        vocab_size: Size of the label vocabulary (ids ``0 .. vocab_size - 1``).

    Returns:
        A model mapping ``(batch, img_height, img_width, 1)`` images to
        per-time-step softmax scores over ``vocab_size + 1`` classes. The
        extra, last class is the CTC blank: ``keras.backend.ctc_batch_cost``
        reserves the final class index for the blank, so without the extra
        unit the highest-numbered character would collide with it.
    """
    input_img = layers.Input(shape=(img_height, img_width, 1), name='image_input')
    x = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(input_img)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.Conv2D(256, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    # From here on only the height is pooled, to keep horizontal resolution.
    x = layers.MaxPooling2D(pool_size=(2, 1))(x)
    x = layers.Conv2D(512, (3, 3), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D(pool_size=(2, 1))(x)
    x = layers.Conv2D(512, (2, 2), padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)

    # (batch, H, W, C) -> (batch, W, H, C): width becomes the sequence axis.
    x = layers.Permute((2, 1, 3), name='permute_width_first')(x)
    # Merge height and channels into one feature vector per time step. All
    # three dims are static here, so a plain Reshape replaces the former
    # Lambda layer, which wrapped a Python lambda and is awkward to reload
    # from saved checkpoints.
    time_steps = int(x.shape[1])
    features = int(x.shape[2]) * int(x.shape[3])
    x = layers.Reshape((time_steps, features), name='flatten_height_channels')(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)

    # +1 output unit: the last index is the CTC blank.
    output = layers.Dense(vocab_size + 1, activation='softmax', name='dense_output')(x)

    model = keras.Model(inputs=input_img, outputs=output, name='crnn_model')
    return model

# Instantiate the recogniser for the configured image size and vocabulary,
# then print its layer-by-layer summary.
crnn_model = build_crnn_model(IMG_WIDTH, IMG_HEIGHT, VOCAB_SIZE)
crnn_model.summary()


In [None]:
# Cell: CTC Loss and Training Step
class CTCLayer(layers.Layer):
    """Pass-through layer that registers the CTC loss on the model.

    The layer returns ``y_pred`` unchanged; its only effect is to compute the
    batch-mean CTC loss and attach it with ``add_loss`` so the surrounding
    model can be compiled without an explicit ``loss`` argument.
    """

    def __init__(self, name=None, **kwargs):
        # **kwargs forwards standard Layer options (e.g. dtype, trainable),
        # which Keras passes back when re-creating the layer from its config.
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for one batch and return the predictions.

        Args:
            y_true: Integer label ids of shape ``(batch, max_label_length)``,
                right-padded with 0.
            y_pred: Per-time-step class scores of shape
                ``(batch, time_steps, num_classes)``.

        Returns:
            ``y_pred``, unchanged.
        """
        batch_len = tf.cast(tf.shape(y_true)[0], dtype='int64')
        time_steps = tf.cast(tf.shape(y_pred)[1], dtype='int64')

        # Every sample uses all time steps of the prediction.
        input_length = time_steps * tf.ones(shape=(batch_len, 1), dtype='int64')
        # The true label length is the number of non-padding ids. Real
        # characters are numbered from 1 and padding is 0, so counting the
        # non-zero entries excludes the padding; passing the padded length
        # instead would make CTC treat the zeros as target symbols.
        label_length = tf.reduce_sum(
            tf.cast(tf.not_equal(y_true, 0), dtype='int64'), axis=1, keepdims=True
        )

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(tf.reduce_mean(loss))
        return y_pred

def build_ctc_model(base_model: keras.Model) -> keras.Model:
    """Wrap a recogniser in a training model that carries the CTC loss.

    Args:
        base_model: Model producing per-time-step class scores.

    Returns:
        A two-input model (the base model's input plus a ``'label'`` input of
        length ``MAX_LABEL_LENGTH``) whose output is the base model's
        prediction routed through a ``CTCLayer``. It reuses the layers of
        ``base_model`` rather than copying them.
    """
    label_input = layers.Input(name='label', shape=(MAX_LABEL_LENGTH,), dtype='int32')
    predictions_with_loss = CTCLayer(name='ctc_loss')(label_input, base_model.output)
    return keras.Model(inputs=[base_model.input, label_input], outputs=predictions_with_loss)

# Training wrapper around crnn_model. No `loss=` is passed to compile():
# the loss is registered inside CTCLayer via add_loss().
ctc_model = build_ctc_model(crnn_model)
ctc_model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4))


In [None]:
# Cell: Training Loop
EPOCHS = 50  # also used as the epoch count of the ctc_model.fit call
# Optimizer for the manual train_step below; it is a separate instance from
# the one handed to ctc_model.compile().
optimizer = keras.optimizers.Adam(learning_rate=1e-4)

@tf.function
def train_step(images, labels):
    """Run one manual optimisation step on ``crnn_model``.

    Args:
        images: Batch of preprocessed images.
        labels: Batch of encoded label id vectors.

    Returns:
        The CTC loss value used for the gradient update.
    """
    with tf.GradientTape() as tape:
        logits = crnn_model(images, training=True)
        # A throwaway CTCLayer is used only to compute the loss; its output
        # (the unchanged predictions) is discarded.
        ctc_layer = CTCLayer()
        _ = ctc_layer(labels, logits)
        # Sum whatever losses the layer registered; 0.0 if it registered none.
        loss = tf.add_n(ctc_layer.losses) if ctc_layer.losses else 0.0
    gradients = tape.gradient(loss, crnn_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, crnn_model.trainable_variables))
    return loss

print('Custom training step defined. Prefer using ctc_model.fit with tf.data pipelines for full training.')
# Example usage with Keras fit:
# history = ctc_model.fit(
#     train_ds.map(lambda img, lbl: ({'image_input': img, 'label': lbl}, lbl)),
#     validation_data=val_ds.map(lambda img, lbl: ({'image_input': img, 'label': lbl}, lbl)),
#     epochs=EPOCHS,
# )


In [None]:
# Cell: Decoding Utilities
@tf.function
def greedy_decode(pred):
    """Pick the highest-scoring class id at every position.

    Args:
        pred: Score tensor whose last axis enumerates the classes.

    Returns:
        An int32 tensor of class ids with the last axis reduced away.
    """
    best_ids = tf.argmax(pred, axis=-1, output_type=tf.int32)
    return best_ids


def decode_batch_predictions(pred):
    """Convert per-time-step class ids into text using greedy CTC rules.

    For every sequence, consecutive repeats of the same id are first merged
    into one (CTC emits a symbol over several adjacent frames), then ids that
    carry no character are dropped: id 0 (the padding / ``'<BLANK>'`` entry)
    and any id absent from ``NUM_TO_CHAR`` (such as a dedicated blank class
    appended after the vocabulary).

    Args:
        pred: Integer ids of shape ``(batch, time_steps)``, e.g. the output of
            ``greedy_decode``. Eager tensors and NumPy arrays are accepted.

    Returns:
        One decoded string per sequence in the batch.
    """
    results = []
    for sequence in np.asarray(pred):
        sequence = sequence.reshape(-1)
        if sequence.size:
            # Keep a frame only when it differs from the frame before it.
            starts_new_symbol = np.ones(sequence.shape, dtype=bool)
            starts_new_symbol[1:] = sequence[1:] != sequence[:-1]
            sequence = sequence[starts_new_symbol]
        chars = [NUM_TO_CHAR.get(int(idx), '') for idx in sequence if idx != 0]
        results.append(''.join(chars))
    return results


def recognize_medicines(model: keras.Model, dataset: tf.data.Dataset) -> List[List[str]]:
    """Run the recogniser over a dataset and decode every batch.

    Args:
        model: Model producing per-time-step class scores.
        dataset: Dataset yielding ``(images, labels)`` batches; the labels are
            ignored.

    Returns:
        One list of decoded strings per batch, in dataset order.
    """
    return [
        decode_batch_predictions(greedy_decode(model.predict(batch_images)))
        for batch_images, _ in dataset
    ]

print('Decoding utilities ready.')


In [None]:
# Cell: Inference Example
def run_inference_example(model: keras.Model, sample_paths: List[str]):
    """Recognise the text in individual image files and print the results.

    Images are loaded with ``read_image`` so inference decodes files exactly
    the way the training pipeline does. Paths that do not point to a regular
    file (missing, or a directory) are reported and skipped.

    Args:
        model: Model producing per-time-step class scores.
        sample_paths: Paths of the image files to recognise.
    """
    for path in sample_paths:
        path_obj = pathlib.Path(path)
        # is_file() also rejects directories, which read_file cannot open.
        if not path_obj.is_file():
            print(f'Sample {path} not found. Skipping.')
            continue
        image = preprocess_image(read_image(str(path_obj)))
        # The model expects a leading batch axis.
        image = tf.expand_dims(image, axis=0)
        preds = model.predict(image)
        decoded = decode_batch_predictions(greedy_decode(preds))
        print(f'{path}: {decoded[0]}')

print("Call `run_inference_example(crnn_model, ['path/to/image.png'])` after training.")


## Next Steps

1. Replace placeholder dataset paths with actual annotated prescription data.
2. Ensure labels are properly encoded and aligned with the vocabulary.
3. Train the model using `ctc_model.fit`.
4. Integrate a medicine lexicon for post-processing corrections.


## Dataset Paths

The training pipeline expects the dataset to be organized with the following directories relative to the project root:

- Training images: `dataset/Training/training_words`
- Validation images: `dataset/Validation/validation_words`
- Testing images: `dataset/Testing/testing_words`

CSV annotation files with the medicine labels are expected at:

- Training labels: `dataset/Training/training_labels.csv`
- Validation labels: `dataset/Validation/validation_labels.csv`
- Testing labels: `dataset/Testing/testing_labels.csv`


## Train Model

Train the CRNN directly from this notebook once the dataset folders are populated at:

- `dataset/Training/training_words` with labels in `dataset/Training/training_labels.csv`
- `dataset/Validation/validation_words` with labels in `dataset/Validation/validation_labels.csv`
- `dataset/Testing/testing_words` with labels in `dataset/Testing/testing_labels.csv`

The code below prepares remapped datasets, configures checkpoints and TensorBoard logging under `artifacts/`, and runs `ctc_model.fit`. Adjust the hyperparameters as needed for your hardware and data volume.


### Troubleshooting Training

If `ctc_model.fit` fails or exits early, walk through these checks before rerunning training:

1. **Verify dataset paths** – confirm the `dataset/Training`, `dataset/Validation`, and `dataset/Testing` folders exist and contain the expected `*_words` subdirectories and CSV files. The helper cell below will print counts if everything is wired correctly.
2. **Inspect CSV columns** – the annotation files must include both the `IMAGE` filename column and the `MEDICINE_NAME` label column. Any missing or misspelled header will raise a ValueError when building the datasets.
3. **Spot-check a batch** – `create_dataset` falls back to the placeholder samples (and prints a "Using placeholder samples" message) whenever a labels CSV or an images directory is missing. Review the sanity-check output and confirm you see real image paths, not `placeholder.png`.
4. **Watch system resources** – large images or high batch sizes may exhaust GPU/CPU RAM. Reduce `BATCH_SIZE`, close other GPU jobs, or run on a machine with more memory if you observe OOM errors.
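5. **Confirm TensorFlow availability** – ensure the environment has TensorFlow installed, with GPU support if desired (e.g., `pip install tensorflow==2.12.*`; on Linux with TensorFlow 2.14 or newer, `pip install "tensorflow[and-cuda]"` also installs the CUDA libraries). The separate `tensorflow-gpu` package is deprecated and should not be used. Restart the kernel after installation.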
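6. **Resume from checkpoints** – if training stops mid-way, reload the best checkpoint by calling `ctc_model.load_weights('artifacts/checkpoints/<file>.keras')` before restarting to avoid losing progress. The checkpoints are written by `ctc_model.fit`, so they match `ctc_model`; `crnn_model` shares its layers with `ctc_model` and therefore picks up the restored weights as well.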

Once these checks pass, rerun the callback configuration and the training cell.


In [None]:

# Cell: Dataset Sanity Checks
# File extensions counted as images by the sanity check (compared in lower case).
SUPPORTED_IMAGE_EXTS = ('.png', '.jpg', '.jpeg', '.bmp')


def inspect_split(images_dir, labels_file, split_name):
    """Print a short health report for one dataset split.

    Reports whether the images directory exists, how many files with a
    supported extension it contains (with up to three sample names), and
    whether the labels CSV exists, which columns it has, and a three-row
    preview of the image/label columns.

    Args:
        images_dir: ``pathlib.Path`` of the directory holding the images.
        labels_file: ``pathlib.Path`` of the annotation CSV.
        split_name: Heading printed at the top of the report.
    """
    print(f'[{split_name}]')

    # --- images directory ---
    if images_dir.exists():
        image_files = [
            entry for entry in images_dir.iterdir()
            if entry.suffix.lower() in SUPPORTED_IMAGE_EXTS
        ]
        sample_files = sorted(entry.name for entry in image_files[:3])
        print(f'  ✓ Images directory found ({len(image_files)} files with supported extensions)')
        if sample_files:
            print('  Sample image files:', ', '.join(sample_files))
        else:
            print('  ⚠️ No files detected with extensions', SUPPORTED_IMAGE_EXTS)
    else:
        print(f'  ✗ Images directory missing: {images_dir}')

    # --- labels CSV ---
    if not labels_file.exists():
        print(f'  ✗ Labels CSV missing: {labels_file}')
        return

    df = pd.read_csv(labels_file)
    print(f'  ✓ Labels CSV found with {len(df)} rows and columns: {list(df.columns)}')
    expected_cols = (IMAGE_COLUMN, LABEL_COLUMN)
    missing_cols = [col for col in expected_cols if col not in df.columns]
    if missing_cols:
        print(f'  ✗ Missing expected columns: {missing_cols}')
        return

    preview = df[[IMAGE_COLUMN, LABEL_COLUMN]].head(3)
    print('  Preview:')
    for _, row in preview.iterrows():
        print(f"    - {row[IMAGE_COLUMN]} -> {row[LABEL_COLUMN]}")

# Print the health report for each of the three configured splits.
inspect_split(TRAIN_IMAGES_DIR, TRAIN_LABELS_FILE, 'Training split')
inspect_split(VAL_IMAGES_DIR, VAL_LABELS_FILE, 'Validation split')
inspect_split(TEST_IMAGES_DIR, TEST_LABELS_FILE, 'Test split')



In [None]:
# Output folders for model checkpoints and TensorBoard event files.
checkpoint_dir = OUTPUT_DIR / 'checkpoints'
tensorboard_log_dir = OUTPUT_DIR / 'logs'
for artifact_dir in (checkpoint_dir, tensorboard_log_dir):
    artifact_dir.mkdir(parents=True, exist_ok=True)


def _to_ctc_inputs(img, lbl):
    """Repack an (image, label) batch as (named model inputs, label) for ctc_model."""
    return {'image_input': img, 'label': lbl}, lbl


# ctc_model takes the image and the label as named inputs, so every split is remapped.
train_inputs = train_ds.map(_to_ctc_inputs)
val_inputs = val_ds.map(_to_ctc_inputs)
test_inputs = test_ds.map(_to_ctc_inputs)

print(f'Checkpoints directory: {checkpoint_dir.resolve()}')
print(f'TensorBoard logs directory: {tensorboard_log_dir.resolve()}')


In [None]:
# Training callbacks; all of them monitor the validation loss.
callbacks = [
    # Writes a checkpoint named after the epoch number, but only on epochs
    # where val_loss improves. The callbacks are attached to ctc_model.fit
    # below, so the saved files correspond to ctc_model.
    keras.callbacks.ModelCheckpoint(
        filepath=str(checkpoint_dir / 'crnn_{epoch:02d}.keras'),
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
    # Stops after 10 epochs without improvement and rolls back to the best weights.
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    ),
    # Halves the learning rate after 5 stagnant epochs, never going below 1e-6.
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-6,
    ),
    keras.callbacks.TensorBoard(log_dir=str(tensorboard_log_dir)),
]

# Train the CTC wrapper model on the remapped datasets; `history` holds the
# object returned by fit().
history = ctc_model.fit(
    train_inputs,
    validation_data=val_inputs,
    epochs=EPOCHS,
    callbacks=callbacks,
)
