## Class to process page 10 of the survey

In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd
plt.rcParams['figure.figsize'] = (10, 10)
from utils import *
from define import *
# import define

In [2]:
class Page11Processor:
    """Read the ticked checkboxes of one scanned survey page (questions 57-63).

    The page layout is taken from an annotation file loaded with ``xml2csv``.
    Three annotated boxes are used as answer columns and 23 as answer rows.
    ``process`` detects the checkboxes in a page image, assigns each one to a
    column and a row, and reports the answer text of every ticked box.

    Attributes:
        all_columns: Annotation entries at positions 7-9, used as columns.
        all_columns_np: Their ``[xmin, ymin, xmax, ymax]`` boxes as an array.
        all_rows: Annotation entries at positions 10-32, used as rows.
        all_rows_np: Their ``[xmin, ymin, xmax, ymax]`` boxes as an array.
        question_of_rows: For every row, the part of its ``class`` label in
            front of the first underscore. ``process`` converts it with
            ``int`` to obtain the question number the row belongs to.
    """

    def __init__(self, xml_path):
        """Load the page layout.

        Args:
            xml_path: Path of the annotation file handed to ``xml2csv``. The
                table it returns must provide the columns ``class``,
                ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        """
        df = xml2csv(xml_path)
        box_fields = ['xmin', 'ymin', 'xmax', 'ymax']

        # NOTE(review): the positions 7-9 and 10-32 are tied to the order of
        # the entries in the annotation file - confirm when the file changes.
        self.all_columns = df.iloc[[7, 8, 9]]
        self.all_columns_np = np.array(self.all_columns[box_fields])

        self.all_rows = df.iloc[range(10, 33)]
        self.question_of_rows = [label.split('_')[0] for label in self.all_rows['class']]
        self.all_rows_np = np.array(self.all_rows[box_fields])

    def process(self, img):
        """Extract the ticked answers from a page image.

        Detections that are not horizontally inside one of the columns, or
        that overlap none of the rows, are ignored instead of aborting the
        run or being counted for the first row.

        Args:
            img: Page image in BGR channel order (it is converted with
                ``cv2.COLOR_BGR2GRAY``).

        Returns:
            A one-row ``pandas.DataFrame`` with one column per question and
            answer column, named ``"<question>.<column>"`` for questions
            57-63 and columns 1-3. Each cell holds the texts of the ticked
            answers joined by ``", "``, or an empty string if none is ticked.
        """
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        extractor = CheckboxExtractor()
        # reshape(-1, 4) keeps the array two-dimensional even when nothing is
        # detected, so the hstack below cannot fail on an empty result.
        boxes = np.asarray(extractor.detect_checkbox(gray)).reshape(-1, 4)
        tick_flags = [extractor.is_ticked(gray[y1:y2, x1:x2]) for x1, y1, x2, y2 in boxes]
        checkboxes = np.hstack([boxes, np.array(tick_flags).reshape(-1, 1)])  # [x1,y1,x2,y2,is_ticked]

        question_ids = range(57, 64)

        # Rows are numbered consecutively over all questions, while
        # ANSWER_TEXT_11 is indexed per question; remember at which row each
        # question starts to translate between the two.
        first_row_of = {}
        next_row = 0
        for question_id in question_ids:
            first_row_of[question_id] = next_row
            next_row += len(ANSWER_TEXT_11[question_id])

        # (question, column) -> answer texts, in detection order.
        answers = {}
        for i, (x1, _y1, x2, _y2, is_tick) in enumerate(checkboxes):
            if is_tick != 1:
                continue  # only ticked boxes contribute to the result

            containing = np.flatnonzero(
                (self.all_columns_np[:, 0] < x1) & (self.all_columns_np[:, 2] > x2)
            )
            if containing.size == 0:
                continue  # stray detection outside every column

            iou = np_vec_no_jit_iou(checkboxes[i:i + 1, :4], self.all_rows_np)
            if np.max(iou) <= 0:
                # argmax would return 0 here and wrongly credit the first row.
                continue
            row_id = int(np.argmax(iou))

            question_id = int(self.question_of_rows[row_id])
            if question_id not in first_row_of:
                continue

            text = ANSWER_TEXT_11[question_id][row_id - first_row_of[question_id]]
            answers.setdefault((question_id, int(containing[0])), []).append(text)

        return pd.DataFrame({
            f'{question_id}.{column_id + 1}': [', '.join(answers.get((question_id, column_id), []))]
            for question_id in question_ids
            for column_id in range(3)
        })

    


In [3]:
# Create the processor from the annotation file that describes the page layout.
processor = Page11Processor('page10.xml')

xml to csv page10.xml


In [4]:
img = cv2.imread('page10.jpg')
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message rather than inside process().
if img is None:
    raise FileNotFoundError("Could not read image 'page10.jpg'")
ret = processor.process(img)

In [5]:
# Show the result table as the cell output.
ret

Unnamed: 0,57.1,57.2,57.3,58.1,58.2,58.3,59.1,59.2,59.3,60.1,...,60.3,61.1,61.2,61.3,62.1,62.2,62.3,63.1,63.2,63.3
0,One,"Two, More than two",,Never,A few times,A few times,Never,Once,Never,Never,...,Never,Never,Never,Never,No,No,No,Out-of-school,Out-of-school,


In [6]:
# Save the result table as CSV. to_csv is called without index=False, so the
# row index is written as an unnamed first column.
ret.to_csv('./result11.csv')