In [1]:
pip install opencv-python numpy pandas tensorflow scikit-learn openpyxl

Note: you may need to restart the kernel to use updated packages.


In [11]:
# ------------------------------
# CNN-based OMR Scoring Pipeline
# ------------------------------

# REQUIREMENTS: pip install opencv-python numpy pandas tensorflow scikit-learn openpyxl

import os, cv2, numpy as np, pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# ---------- CONFIG ----------
dataset_root = "."            # folder containing SetA/SetB
excel_file   = "Key (Set A and B).xlsx"
IMG_SIZE = 48
CROP_DIR = "bubble_crops"
NUM_QUESTIONS = 100
classes = ["A","B","C","D","BLANK"]

os.makedirs(CROP_DIR, exist_ok=True)
for cls in classes:
    os.makedirs(os.path.join(CROP_DIR, cls), exist_ok=True)

# ---------- HELPER FUNCTIONS ----------

def warp_sheet(img, dst_size=(1200,1700)):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5,5), 0)
    edged = cv2.Canny(blur, 50, 150)
    cnts, _ = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:15]
    for c in cnts:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 0.02*peri, True)
        if len(approx)==4:
            pts = approx.reshape(4,2).astype("float32")
            dst = np.array([[0,0],[dst_size[0]-1,0],[dst_size[0]-1,dst_size[1]-1],[0,dst_size[1]-1]],dtype="float32")
            M = cv2.getPerspectiveTransform(pts,dst)
            return cv2.warpPerspective(img,M,(dst_size[0],dst_size[1]))
    return cv2.resize(img, dst_size)

def crop_bubble(gray, cx, cy, radius=20, pad=4):
    r = radius + pad
    x1, y1 = max(cx-r,0), max(cy-r,0)
    x2, y2 = min(cx+r, gray.shape[1]-1), min(cy+r, gray.shape[0]-1)
    crop = gray[y1:y2, x1:x2]
    if crop.size==0: return None
    crop = cv2.resize(crop, (IMG_SIZE, IMG_SIZE))
    return crop

def detect_centers(gray):
    detector = cv2.SimpleBlobDetector_create()
    keypoints = detector.detect(gray)
    centers = [(int(k.pt[0]), int(k.pt[1])) for k in keypoints]
    return centers

# ---------- LOAD ANSWER KEYS ----------
dfA = pd.read_excel(excel_file, sheet_name="Set - A")
dfB = pd.read_excel(excel_file, sheet_name="Set - B")

def create_answer_dict(df):
    df.columns = df.columns.str.strip()
    answer_dict = {}
    topics = df.columns
    for topic in topics:
        for val in df[topic].dropna():
            q_no, ans = val.replace('.', '-').split('-', 1)
            answer_dict[int(q_no.strip())] = ans.strip().upper()
    return answer_dict

answer_A = create_answer_dict(dfA)
answer_B = create_answer_dict(dfB)

# ---------- PREPARE DATASET (Crop bubbles for CNN training) ----------
def prepare_cnn_dataset():
    for set_name, answer_dict in [("Set A", answer_A), ("Set B", answer_B)]:
        folder = os.path.join(dataset_root, set_name)
        if not os.path.isdir(folder): continue
        for fname in sorted(os.listdir(folder)):
            if not fname.lower().endswith((".png",".jpg",".jpeg")): continue
            path = os.path.join(folder, fname)
            img = cv2.imread(path)
            warped = warp_sheet(img)
            gray = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
            centers = detect_centers(gray)
            # For training, assume left->right top->bottom order
            centers = sorted(centers, key=lambda p: (p[1], p[0]))
            for q_idx, (cx,cy) in enumerate(centers[:NUM_QUESTIONS]):
                crop = crop_bubble(gray,cx,cy)
                if crop is None: continue
                # get label from answer dict
                label = answer_dict.get(q_idx+1,"BLANK")
                label = label.upper() if label in ["A","B","C","D"] else "BLANK"
                save_path = os.path.join(CROP_DIR,label,f"{set_name}_{fname}_{q_idx}.png")
                cv2.imwrite(save_path, crop)

# ---------- BUILD CNN ----------
def build_cnn():
    model = Sequential([
        Conv2D(32,(3,3),activation='relu',input_shape=(IMG_SIZE,IMG_SIZE,1)),
        MaxPooling2D((2,2)),
        Conv2D(64,(3,3),activation='relu'),
        MaxPooling2D((2,2)),
        Flatten(),
        Dense(128,activation='relu'),
        Dropout(0.5),
        Dense(len(classes),activation='softmax')
    ])
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    return model

# ---------- TRAIN CNN ----------
def train_cnn(model):
    datagen = ImageDataGenerator(rescale=1./255,validation_split=0.2)
    train_gen = datagen.flow_from_directory(CROP_DIR,target_size=(IMG_SIZE,IMG_SIZE),
                                            color_mode='grayscale',class_mode='categorical',
                                            subset='training',batch_size=32,shuffle=True)
    val_gen = datagen.flow_from_directory(CROP_DIR,target_size=(IMG_SIZE,IMG_SIZE),
                                          color_mode='grayscale',class_mode='categorical',
                                          subset='validation',batch_size=32)
    model.fit(train_gen, validation_data=val_gen, epochs=50, verbose=1)
    return model

# ---------- PREDICT SHEET ----------
def predict_sheet(warped_img, model):
    gray = cv2.cvtColor(warped_img, cv2.COLOR_BGR2GRAY)
    centers = detect_centers(gray)
    centers = sorted(centers, key=lambda p: (p[1], p[0]))[:NUM_QUESTIONS]
    responses = []
    for cx,cy in centers:
        crop = crop_bubble(gray,cx,cy)
        if crop is None:
            responses.append("NA")
            continue
        x = crop.reshape(1,IMG_SIZE,IMG_SIZE,1)/255.0
        pred = np.argmax(model.predict(x, verbose=0))
        responses.append(classes[pred])
    return responses

# ---------- SCORE SHEET ----------
def score_sheet(responses, answer_dict):
    score = 0
    for qno, ans in answer_dict.items():
        resp = responses[qno-1] if qno-1 < len(responses) else "NA"
        if resp==ans: score+=1
    return score

# ---------- MAIN EXECUTION ----------
if __name__=="__main__":
    print("Step 1: Preparing CNN dataset (cropping bubbles)...")
    prepare_cnn_dataset()
    
    print("Step 2: Building CNN...")
    model = build_cnn()
    
    print("Step 3: Training CNN...")
    model = train_cnn(model)
    
    print("Step 4: Predicting & scoring all sheets...")
    results = []
    for set_name, answer_dict in [("Set A", answer_A), ("Set B", answer_B)]:
        folder = os.path.join(dataset_root, set_name)
        if not os.path.isdir(folder): continue
        for fname in sorted(os.listdir(folder)):
            if not fname.lower().endswith((".png",".jpg",".jpeg")): continue
            path = os.path.join(folder, fname)
            img = cv2.imread(path)
            warped = warp_sheet(img)
            responses = predict_sheet(warped, model)
            score = score_sheet(responses, answer_dict)
            results.append({"Student": fname, "Set": set_name,
                            "Responses":"|".join(responses),"Score":score})
            print(f"{fname} ({set_name}) -> score: {score}")
    
    pd.DataFrame(results).to_csv("omr_cnn_results.csv", index=False)
    print("Saved CNN-based results to omr_cnn_results.csv")


Step 1: Preparing CNN dataset (cropping bubbles)...
Step 2: Building CNN...
Step 3: Training CNN...
Found 304 images belonging to 5 classes.
Found 73 images belonging to 5 classes.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 133ms/step - accuracy: 0.2899 - loss: 1.5014 - val_accuracy: 0.3699 - val_loss: 1.3558
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step - accuracy: 0.3416 - loss: 1.3679 - val_accuracy: 0.3699 - val_loss: 1.3385
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 79ms/step - accuracy: 0.3736 - loss: 1.4112 - val_accuracy: 0.3699 - val_loss: 1.3331
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step - accuracy: 0.3634 - loss: 1.4430 - val_accuracy: 0.3699 - val_loss: 1.3182
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - accuracy: 0.3632 - loss: 1.3753 - val_accuracy: 0.3699 - val_loss: 1.3479
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step - accuracy: 0.3437 - loss: 1.3658 - val_accuracy: 0.3699 - val_loss: 1.3192
Epoch 7/50
[1m10/10[0m [32m━━━

In [13]:
# ------------------------------
# Robust CNN-based OMR Scoring
# ------------------------------

# REQUIREMENTS: pip install opencv-python numpy pandas tensorflow scikit-learn openpyxl matplotlib

import os, cv2, numpy as np, pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# ---------- CONFIG ----------
dataset_root = "."            
excel_file   = "Key (Set A and B).xlsx"
IMG_SIZE = 48
CROP_DIR = "bubble_crops"
NUM_QUESTIONS = 100
classes = ["A","B","C","D","BLANK"]
EPOCHS = 100  # increased epochs
BATCH_SIZE = 32

os.makedirs(CROP_DIR, exist_ok=True)
for cls in classes:
    os.makedirs(os.path.join(CROP_DIR, cls), exist_ok=True)

# ---------- HELPER FUNCTIONS ----------
def warp_sheet(img, dst_size=(1200,1700)):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5,5), 0)
    edged = cv2.Canny(blur, 50, 150)
    cnts, _ = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:15]
    for c in cnts:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 0.02*peri, True)
        if len(approx)==4:
            pts = approx.reshape(4,2).astype("float32")
            dst = np.array([[0,0],[dst_size[0]-1,0],[dst_size[0]-1,dst_size[1]-1],[0,dst_size[1]-1]],dtype="float32")
            M = cv2.getPerspectiveTransform(pts,dst)
            return cv2.warpPerspective(img,M,(dst_size[0],dst_size[1]))
    return cv2.resize(img, dst_size)

def crop_bubble(gray, cx, cy, radius=20, pad=4):
    r = radius + pad
    x1, y1 = max(cx-r,0), max(cy-r,0)
    x2, y2 = min(cx+r, gray.shape[1]-1), min(cy+r, gray.shape[0]-1)
    crop = gray[y1:y2, x1:x2]
    if crop.size==0: return None
    crop = cv2.resize(crop, (IMG_SIZE, IMG_SIZE))
    return crop

def detect_centers(gray, visualize=False):
    # Use blob detection with tuned parameters
    params = cv2.SimpleBlobDetector_Params()
    params.filterByArea = True
    params.minArea = 50
    params.maxArea = 5000
    params.filterByCircularity = True
    params.minCircularity = 0.3
    detector = cv2.SimpleBlobDetector_create(params)
    keypoints = detector.detect(gray)
    centers = [(int(k.pt[0]), int(k.pt[1])) for k in keypoints]
    if visualize:
        vis = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
        for (x,y) in centers:
            cv2.circle(vis,(x,y),10,(0,0,255),2)
        plt.imshow(vis[:,:,::-1])
        plt.show()
    return centers

# ---------- LOAD ANSWER KEYS ----------
dfA = pd.read_excel(excel_file, sheet_name="Set - A")
dfB = pd.read_excel(excel_file, sheet_name="Set - B")

def create_answer_dict(df):
    df.columns = df.columns.str.strip()
    answer_dict = {}
    topics = df.columns
    for topic in topics:
        for val in df[topic].dropna():
            q_no, ans = val.replace('.', '-').split('-', 1)
            answer_dict[int(q_no.strip())] = ans.strip().upper()
    return answer_dict

answer_A = create_answer_dict(dfA)
answer_B = create_answer_dict(dfB)

# ---------- PREPARE DATASET (with augmentation) ----------
def prepare_cnn_dataset():
    for set_name, answer_dict in [("Set A", answer_A), ("Set B", answer_B)]:
        folder = os.path.join(dataset_root, set_name)
        if not os.path.isdir(folder): continue
        for fname in sorted(os.listdir(folder)):
            if not fname.lower().endswith((".png",".jpg",".jpeg")): continue
            path = os.path.join(folder, fname)
            img = cv2.imread(path)
            warped = warp_sheet(img)
            gray = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
            centers = detect_centers(gray)
            centers = sorted(centers, key=lambda p: (p[1], p[0]))
            for q_idx, (cx,cy) in enumerate(centers[:NUM_QUESTIONS]):
                crop = crop_bubble(gray,cx,cy)
                if crop is None: continue
                label = answer_dict.get(q_idx+1,"BLANK")
                label = label.upper() if label in ["A","B","C","D"] else "BLANK"
                save_path = os.path.join(CROP_DIR,label,f"{set_name}_{fname}_{q_idx}.png")
                cv2.imwrite(save_path, crop)

# ---------- BUILD CNN ----------
def build_cnn():
    model = Sequential([
        Conv2D(32,(3,3),activation='relu',input_shape=(IMG_SIZE,IMG_SIZE,1)),
        MaxPooling2D((2,2)),
        Conv2D(64,(3,3),activation='relu'),
        MaxPooling2D((2,2)),
        Flatten(),
        Dense(128,activation='relu'),
        Dropout(0.5),
        Dense(len(classes),activation='softmax')
    ])
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    return model

# ---------- TRAIN CNN ----------
def train_cnn(model):
    datagen = ImageDataGenerator(rescale=1./255,
                                 validation_split=0.2,
                                 rotation_range=5,
                                 width_shift_range=0.05,
                                 height_shift_range=0.05,
                                 brightness_range=[0.8,1.2])
    train_gen = datagen.flow_from_directory(CROP_DIR,target_size=(IMG_SIZE,IMG_SIZE),
                                            color_mode='grayscale',class_mode='categorical',
                                            subset='training',batch_size=BATCH_SIZE,shuffle=True)
    val_gen = datagen.flow_from_directory(CROP_DIR,target_size=(IMG_SIZE,IMG_SIZE),
                                          color_mode='grayscale',class_mode='categorical',
                                          subset='validation',batch_size=BATCH_SIZE)
    early = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(train_gen, validation_data=val_gen, epochs=EPOCHS, callbacks=[early])
    return model

# ---------- PREDICT SHEET ----------
def predict_sheet(warped_img, model, visualize=False):
    gray = cv2.cvtColor(warped_img, cv2.COLOR_BGR2GRAY)
    centers = detect_centers(gray, visualize=visualize)
    centers = sorted(centers, key=lambda p: (p[1], p[0]))[:NUM_QUESTIONS]
    responses = []
    for cx,cy in centers:
        crop = crop_bubble(gray,cx,cy)
        if crop is None:
            responses.append("NA")
            continue
        x = crop.reshape(1,IMG_SIZE,IMG_SIZE,1)/255.0
        pred = np.argmax(model.predict(x, verbose=0))
        responses.append(classes[pred])
    return responses

# ---------- SCORE SHEET ----------
def score_sheet(responses, answer_dict):
    score = 0
    for qno, ans in answer_dict.items():
        resp = responses[qno-1] if qno-1 < len(responses) else "NA"
        if resp==ans: score+=1
    return score

# ---------- MAIN EXECUTION ----------
if __name__=="__main__":
    print("Step 1: Preparing CNN dataset...")
    prepare_cnn_dataset()
    
    print("Step 2: Building CNN...")
    model = build_cnn()
    
    print("Step 3: Training CNN...")
    model = train_cnn(model)
    
    print("Step 4: Predicting & scoring all sheets...")
    results = []
    for set_name, answer_dict in [("Set A", answer_A), ("Set B", answer_B)]:
        folder = os.path.join(dataset_root, set_name)
        if not os.path.isdir(folder): continue
        for fname in sorted(os.listdir(folder)):
            if not fname.lower().endswith((".png",".jpg",".jpeg")): continue
            path = os.path.join(folder, fname)
            img = cv2.imread(path)
            warped = warp_sheet(img)
            responses = predict_sheet(warped, model, visualize=False)
            score = score_sheet(responses, answer_dict)
            results.append({"Student": fname, "Set": set_name,
                            "Responses":"|".join(responses),"Score":score})
            print(f"{fname} ({set_name}) -> score: {score}")
    
    pd.DataFrame(results).to_csv("omr_cnn_results.csv", index=False)
    print("Saved CNN-based results to omr_cnn_results.csv")


Step 1: Preparing CNN dataset...
Step 2: Building CNN...
Step 3: Training CNN...
Found 304 images belonging to 5 classes.
Found 73 images belonging to 5 classes.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 182ms/step - accuracy: 0.2895 - loss: 1.5234 - val_accuracy: 0.2192 - val_loss: 1.3738
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step - accuracy: 0.3244 - loss: 1.4012 - val_accuracy: 0.3699 - val_loss: 1.3305
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step - accuracy: 0.3685 - loss: 1.3733 - val_accuracy: 0.3699 - val_loss: 1.3497
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 126ms/step - accuracy: 0.3448 - loss: 1.3532 - val_accuracy: 0.3699 - val_loss: 1.3519
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 119ms/step - accuracy: 0.3480 - loss: 1.4143 - val_accuracy: 0.3699 - val_loss: 1.3473
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 115ms/step - accuracy: 0.3619 - loss: 1.4028 - val_accuracy: 0.3699 - val_loss: 1.3608
Epoch 7/100
[1m10/10

In [15]:
# After training
model.save("omr_cnn_model.h5")
print("Saved Keras model as omr_cnn_model.h5")




Saved Keras model as omr_cnn_model.h5


In [17]:
import tensorflow as tf

# Load the saved Keras model
model = tf.keras.models.load_model("omr_cnn_model.h5")

# Convert to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # optional: optimize size & latency
tflite_model = converter.convert()

# Save TFLite model
with open("omr_cnn_model.tflite", "wb") as f:
    f.write(tflite_model)

print("Saved TensorFlow Lite model as omr_cnn_model.tflite")




INFO:tensorflow:Assets written to: C:\Users\HP\AppData\Local\Temp\tmpz0a5k_fz\assets


INFO:tensorflow:Assets written to: C:\Users\HP\AppData\Local\Temp\tmpz0a5k_fz\assets


Saved artifact at 'C:\Users\HP\AppData\Local\Temp\tmpz0a5k_fz'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 48, 48, 1), dtype=tf.float32, name='input_layer_2')
Output Type:
  TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)
Captures:
  1891457705616: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457703312: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457703504: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457705808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457704848: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457706384: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457706000: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1891457707728: TensorSpec(shape=(), dtype=tf.resource, name=None)
Saved TensorFlow Lite model as omr_cnn_model.tflite
