In [1]:
import cv2
import pandas as pd
import numpy as np
import os
import math
import json

from pdf2image import convert_from_path
from PIL import Image
from matplotlib import pyplot as plt

In [2]:
#### 최신버전
def convert_pdf_folder_to_images(pdf_folder, output_folder, chk = False):
    """Render every PDF in ``pdf_folder`` to JPEG and split each page in two.

    Three files are written to ``output_folder`` per rendered page:
    ``<pdf name>_page_<n>.jpg`` (the whole page, ``n`` counted from 1) and the
    left/right halves as ``<pdf name>_<number>.jpg``. Half numbering: the
    first page's left half gets ``2 * page count`` and its right half gets 1;
    a later page at 0-based index ``k`` gets ``2k`` (left) and ``2k + 1``
    (right).

    Args:
        pdf_folder: Directory whose direct ``.pdf`` entries are converted.
        output_folder: Directory receiving the JPEG files; created if missing.
        chk: When truthy, the crop boxes start 20 px in from the top/left and
            the width/height are reduced by 20 px before the split column is
            computed. When falsy the full page is split at half its width.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # A single margin value covers both modes: 20 px when chk is set, else 0.
    margin = 20 if chk else 0

    for pdf_name in os.listdir(pdf_folder):
        if not pdf_name.endswith('.pdf'):
            continue

        pages = convert_from_path(os.path.join(pdf_folder, pdf_name))
        stem = os.path.splitext(pdf_name)[0]
        total_pages = len(pages)

        for page_idx, page_img in enumerate(pages):
            # Closed form of the running left/right counters.
            right_num = 2 * page_idx + 1
            left_num = 2 * total_pages if page_idx == 0 else 2 * page_idx

            # The unsplit page is saved first, then the two halves.
            page_img.save(os.path.join(output_folder, f"{stem}_page_{page_idx + 1}.jpg"), 'JPEG')

            full_width, full_height = page_img.size
            right_edge = full_width - margin
            bottom_edge = full_height - margin
            split_x = right_edge // 2

            left_half = page_img.crop((margin, margin, split_x, bottom_edge))
            right_half = page_img.crop((split_x, margin, right_edge, bottom_edge))

            left_half.save(os.path.join(output_folder, f"{stem}_{left_num}.jpg"), 'JPEG')
            right_half.save(os.path.join(output_folder, f"{stem}_{right_num}.jpg"), 'JPEG')
                
# pdf_folder = './scan/datas/pdf'  # folder that contains the PDF files
# output_folder = './scan/datas/pdf/divided/t2'  # folder where the output image files are saved
pdf_folder = './scan/datas/pdf/'  # folder that contains the PDF files
output_folder = './scan/datas/pdf/divided/tem'  # folder where the output image files are saved

convert_pdf_folder_to_images(pdf_folder, output_folder, chk = False) # chk : True -> marked (filled-in) test sheet

In [3]:
#### 최신버전
def is_square(cnt, min_area, max_area):
    """Return True when a contour looks like an axis-aligned square marker.

    The contour is simplified with ``cv2.approxPolyDP`` (tolerance: 2% of its
    perimeter) and accepted only if all of the following hold:

    * the simplified polygon has exactly four vertices;
    * every turn between consecutive edges is between 80 and 100 degrees;
    * the bounding box width and height differ by at most 10% of the smaller;
    * the contour area lies within ``[min_area, max_area]``.

    Args:
        cnt: Contour as returned by ``cv2.findContours``.
        min_area: Smallest accepted contour area.
        max_area: Largest accepted contour area.

    Returns:
        bool: True if the contour passes every check, otherwise False.
    """
    epsilon = 0.02 * cv2.arcLength(cnt, True)
    approx = cv2.approxPolyDP(cnt, epsilon, True)

    # A square must have exactly four vertices.
    if len(approx) != 4:
        return False

    # Work in float so the dot products and norms cannot overflow or truncate.
    corners = np.asarray(approx, dtype=np.float64).reshape(-1, 2)

    # Check the turn angle at each vertex.
    for i in range(4):
        edge_in = corners[(i + 1) % 4] - corners[i]
        edge_out = corners[(i + 2) % 4] - corners[(i + 1) % 4]

        norm_product = np.hypot(edge_in[0], edge_in[1]) * np.hypot(edge_out[0], edge_out[1])
        if norm_product == 0:
            # Two vertices coincide: the polygon is degenerate. Without this
            # guard the division yields NaN, and NaN slips through the range
            # test below because every comparison with NaN is False.
            return False

        # Clamp so rounding error can never push the cosine outside the
        # domain of arccos (which would also produce NaN).
        cos_angle = np.clip(np.dot(edge_in, edge_out) / norm_product, -1.0, 1.0)
        angle = np.degrees(np.arccos(cos_angle))

        if angle < 80 or angle > 100:  # not close enough to a right angle
            return False

    # Width and height of the bounding box must be nearly equal.
    x, y, w, h = cv2.boundingRect(approx)
    if abs(w - h) > min(w, h) * 0.1:
        return False

    area = cv2.contourArea(cnt)
    if area < min_area or area > max_area:
        return False

    return True

def draw_contours(image_folder, output_folder, min_area=100, max_area=1000):
    """Crop each split page image to the region spanned by its square markers.

    Every ``.jpg`` in ``image_folder`` whose name does not contain ``_page``
    is blurred, edge-detected (Canny 50/100) and its external contours are
    filtered with ``is_square``. The image is then cropped from the top-left
    corner of the LAST accepted square to the bottom-right corner of the FIRST
    accepted square and written to ``output_folder`` under the same file name.

    NOTE(review): treating the first contour as the bottom-right marker and
    the last as the top-left marker relies on the order in which
    ``cv2.findContours`` returns contours - confirm for the scans in use.

    Images that cannot be read, that yield fewer than two squares, or whose
    crop would be empty are reported and skipped, so one bad scan no longer
    aborts the whole batch with an exception.

    Args:
        image_folder: Directory containing the split page images.
        output_folder: Directory receiving the cropped images; created if
            missing.
        min_area: Smallest contour area accepted as a marker.
        max_area: Largest contour area accepted as a marker.

    Returns:
        None
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(image_folder):
        if (filename.endswith('.jpg')) and ("_page" not in filename):
            image_path = os.path.join(image_folder, filename)
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread reports an unreadable file by returning None.
                print(f"draw_contours: could not read {image_path}; skipped")
                continue

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)

            # Edge detection (Canny)
            edges = cv2.Canny(blurred, 50, 100)

            # Contour detection
            contours, _ = cv2.findContours(edges.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Keep only contours that pass the square / size filter.
            rectangles = [cnt for cnt in contours if is_square(cnt, min_area, max_area)]
            if len(rectangles) < 2:
                # Both a first and a last marker are needed to define the crop.
                print(f"draw_contours: found {len(rectangles)} square marker(s) in {filename}; skipped")
                continue

            # First accepted square -> bottom-right corner of the crop.
            x, y, w, h = cv2.boundingRect(rectangles[0])
            right, bottom = x + w, y + h
            # Last accepted square -> top-left corner of the crop.
            left, top, _, _ = cv2.boundingRect(rectangles[-1])

            cropped = img[top:bottom, left:right]
            if cropped.size == 0:
                # Markers in unexpected order would give an empty slice, which
                # cv2.imwrite cannot encode.
                print(f"draw_contours: empty crop region for {filename}; skipped")
                continue

            # Save the result under the original file name.
            result_image_path = os.path.join(output_folder, f"{filename}")
            cv2.imwrite(result_image_path, cropped)

    return

# Usage example
image_output_folder = './scan/datas/pdf/divided/tem'
contour_output_folder = './scan/datas/pdf/square/tem'  # folder where the images cropped to the square markers are saved

# Detect the square contours in each image and crop to them
draw_contours(image_output_folder, contour_output_folder)

In [4]:
#### Latest version 2
# Lookup table of sector rectangles: rects_point_dict[form name][page number]
# is a list of rectangles. Each rectangle is [left, top, right, bottom] (the
# order in which is_in_sector unpacks it), and a rectangle's 1-based position
# in its list is the sector number recorded for circles that fall inside it.
rects_point_dict = dict()

rects_point_dict['CLS_검사지(중고등)'] = {}
rects_point_dict['CLS_검사지(중고등)'][1] = [[276, 430, 634, 476], [276, 545, 635, 593], [281, 779, 337, 909], [352, 759, 484, 1109], [496, 761, 632, 1104], [663, 371, 875, 1104], [877, 367, 1089, 1107], [1093, 365, 1311, 1100], [1315, 371, 1544, 1103], [585, 1321, 772, 1611], [1353, 1323, 1531, 1656], [763, 1743, 1337, 2173]]
rects_point_dict['CLS_검사지(중고등)'][2] = [[795, 76, 1279, 1029], [617, 1245, 800, 1649], [1383, 1248, 1559, 1663], [737, 1811, 808, 2176], [1503, 1800, 1555, 2188]]
rects_point_dict['CLS_검사지(중고등)'][3] = [[700, 139, 783, 493], [1467, 123, 1535, 416], [579, 639, 771, 2184], [1348, 636, 1535, 2184]]
rects_point_dict['CLS_검사지(중고등)'][4] = [[617, 128, 803, 2107], [1375, 125, 1557, 2101]]

rects_point_dict['CLS_검사지(초등)'] = {}
rects_point_dict['CLS_검사지(초등)'][1] = [[276, 545, 636, 593], [284, 780, 332, 1015], [349, 755, 488, 1105], [496, 757, 628, 1107], [661, 369, 869, 1105], [880, 373, 1089, 1097], [1100, 368, 1301, 1101], [1312, 367, 1528, 1112], [516, 1311, 775, 1659], [1288, 1313, 1531, 1613], [812, 1793, 1307, 2133]]
rects_point_dict['CLS_검사지(초등)'][2] = [[793, 77, 1296, 1035], [608, 1253, 807, 1712], [1376, 1252, 1559, 1712], [733, 1881, 805, 2180], [1499, 1881, 1559, 2179]]
rects_point_dict['CLS_검사지(초등)'][3] = [[700, 124, 775, 951], [1467, 123, 1533, 812], [573, 1097, 777, 2185], [1341, 1097, 1537, 2176]]
rects_point_dict['CLS_검사지(초등)'][4] = [[605, 129, 803, 2100], [1364, 124, 1559, 2035]]

rects_point_dict['FIT'] = {}
rects_point_dict['FIT'][1] = [[284, 527, 656, 580], [297, 780, 353, 909], [364, 744, 505, 1096], [515, 751, 648, 1104], [679, 359, 885, 1097], [893, 363, 1107, 1101], [1111, 357, 1325, 1101], [1328, 357, 1548, 1101], [599, 1263, 788, 2196], [1363, 1260, 1549, 2201]]
rects_point_dict['FIT'][2] = [[601, 131, 785, 2185], [1368, 131, 1553, 2193]]
rects_point_dict['FIT'][3] = [[599, 124, 780, 912], [1364, 123, 1548, 860], [239, 1067, 399, 2189], [625, 1063, 777, 2181], [1008, 1075, 1164, 2183], [1391, 1071, 1545, 2195]]
rects_point_dict['FIT'][4] = [[245, 135, 403, 1197], [628, 135, 779, 1189], [1011, 137, 1167, 1189], [1393, 125, 1543, 1013], [1191, 1397, 1331, 1609], [1355, 1395, 1481, 1611], [604, 1755, 788, 2104], [1364, 1756, 1548, 2099]]

rects_point_dict['POWER(중고등)'] = {}
rects_point_dict['POWER(중고등)'][1] = [[267, 387, 632, 443], [275, 504, 628, 556], [277, 736, 328, 888], [347, 716, 484, 1073], [495, 716, 629, 1069], [656, 328, 872, 1065], [883, 329, 1088, 1069], [1093, 328, 1299, 1065], [1311, 331, 1523, 1071], [612, 1144, 769, 2180], [1372, 1144, 1521, 2177]]
rects_point_dict['POWER(중고등)'][2] = [[643, 56, 800, 2183], [1408, 45, 1553, 2188]]
rects_point_dict['POWER(중고등)'][3] = [[613, 56, 767, 2188], [1373, 55, 1530, 2179]]
rects_point_dict['POWER(중고등)'][4] = [[637, 53, 796, 1155], [1407, 45, 1551, 1091], [645, 1228, 799, 2073], [1408, 1219, 1553, 2009]]

rects_point_dict['POWER(초등)'] = {}
rects_point_dict['POWER(초등)'][1] = [[273, 663, 633, 717], [277, 897, 337, 1168], [347, 877, 489, 1229], [493, 877, 625, 1225], [656, 491, 864, 1233], [875, 488, 1092, 1228], [1097, 484, 1303, 1232], [1315, 485, 1524, 1239], [585, 1344, 764, 2163], [1356, 1347, 1528, 2156]]
rects_point_dict['POWER(초등)'][2] = [[615, 217, 795, 2093], [1384, 219, 1553, 2091]]
rects_point_dict['POWER(초등)'][3] = [[585, 220, 771, 2089], [1353, 213, 1540, 2096]]
rects_point_dict['POWER(초등)'][4] = [[617, 125, 796, 1109], [1381, 120, 1553, 1053], [652, 1224, 765, 2039], [1412, 1217, 1524, 1975]]

In [14]:
#### 최신버전2
def is_in_sector(cnt, project_name, page, circle_point_dict):
    """Record a circle under the first sector rectangle that fully contains it.

    The rectangles come from the module-level ``rects_point_dict`` entry for
    ``project_name`` and ``page``; sectors are numbered from 1 in list order.
    A circle counts as contained when its bounding box (centre +/- radius)
    lies within the rectangle, edges included.

    Args:
        cnt: ``(x, y, radius)`` of the circle.
        project_name: Key into ``rects_point_dict``.
        page: Page number; converted with ``int`` before the lookup.
        circle_point_dict: Nested dict ``{project: {page: {sector: [...]}}}``
            that is updated in place with ``[x, y, radius]`` (as ints) on a hit.

    Returns:
        bool: True if the circle was recorded, False if no rectangle holds it.
    """
    cx, cy, rad = cnt
    page = int(page)

    # Bounding box of the circle.
    box_left, box_top = cx - rad, cy - rad
    box_right, box_bottom = cx + rad, cy + rad

    for sector, (left, top, right, bottom) in enumerate(rects_point_dict[project_name][page], start=1):
        inside = (box_left >= left and
                  box_top >= top and
                  box_right <= right and
                  box_bottom <= bottom)
        if not inside:
            continue

        # Create the nested levels on demand, then store under the first match only.
        sectors = circle_point_dict.setdefault(project_name, {}).setdefault(page, {})
        sectors.setdefault(sector, []).append([int(cx), int(cy), int(rad)])
        return True

    return False
    

def draw_contours(image_folder, output_folder):
    """Detect circles in each cropped page image and group them by sector.

    Every ``.jpg`` in ``image_folder`` whose name does not contain ``_page``
    is processed. The file name is read as ``<project name>_<page>.jpg`` (the
    text after the last underscore is the page). Circles found by
    ``cv2.HoughCircles`` are passed to ``is_in_sector``; those it accepts are
    outlined on the image with a fixed radius of 10 and the annotated image is
    written to ``output_folder`` under the same file name.

    Note: this function has the same name as the marker-cropping function
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        image_folder: Directory containing the cropped page images.
        output_folder: Directory receiving the annotated images; created if
            missing.

    Returns:
        dict: ``{project_name: {page: {sector: [[x, y, r], ...]}}}`` as filled
        in by ``is_in_sector``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Radius used when outlining accepted circles (the detected radius is what
    # is_in_sector records; only the drawing uses this fixed value).
    draw_radius = 10

    circle_point_dict = dict()
    for filename in os.listdir(image_folder):
        if (filename.endswith('.jpg')) and ("_page" not in filename):
            project_name_page = filename.replace('.jpg', '')
            # Everything before the last underscore is the project name.
            project_name, _, page = project_name_page.rpartition('_')

            image_path = os.path.join(image_folder, filename)
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports an unreadable file by returning None.
                print(f"draw_contours: could not read {image_path}; skipped")
                continue
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Gaussian blur to reduce noise
            gray = cv2.GaussianBlur(gray, (9, 9), 0)

            # Circle detection with the Hough transform
            all_circles = cv2.HoughCircles(gray, cv2.HOUGH_GRADIENT, dp=1.0, minDist=20,
                                    param1=50, param2=20, minRadius=9, maxRadius=12)

            # Reset per image. Previously `circles` was only assigned when
            # HoughCircles found something, so an image without hits either
            # raised a NameError (first image) or was annotated with the
            # previous image's circles.
            circles = []
            if all_circles is not None:
                all_circles = np.round(all_circles[0, :]).astype("int")
                circles = [(int(x), int(y), draw_radius) for x, y, r in all_circles
                           if is_in_sector((x, y, r), project_name, page, circle_point_dict)]

            for (x, y, r) in circles:
                cv2.circle(image, (x, y), r, (255, 0, 0), 2)

            # Save the annotated image under the original file name.
            result_image_path = os.path.join(output_folder, f"{filename}")
            cv2.imwrite(result_image_path, image)
    return circle_point_dict

image_output_folder = './scan/datas/pdf/square/tem'
contour_output_folder = './scan/datas/pdf/circles/tem'  # folder where the circle-detection images are saved

# Detect the circles, then rebuild the nested result so that the keys at every
# level (project -> page -> sector) are in ascending order.
circle_point_dict = draw_contours(image_output_folder, contour_output_folder)
circle_point_dict = {
    project: {
        page: dict(sorted(sectors.items()))
        for page, sectors in sorted(pages.items())
    }
    for project, pages in sorted(circle_point_dict.items())
}

In [15]:
# Build, per project, a table of detected circles annotated with a running
# question number and the position of each circle within its question.
project_dict = dict()

# Consecutive circles whose y values differ by more than this many pixels are
# treated as belonging to different questions.
same_row_y_tolerance = 20

for project_name, pages_dict in circle_point_dict.items():
    # Flatten {page: {sector: [[x, y, r], ...]}} into one row per circle.
    sector_frames = []
    for page, sector_dict in pages_dict.items():
        for sector, circle_list in sector_dict.items():
            df = pd.DataFrame(circle_list, columns=['x', 'y', 'r'])
            df['page'] = page
            df['sector'] = sector
            sector_frames.append(df)
    if not sector_frames:
        # Nothing detected for this project; pd.concat would reject an empty list.
        continue
    project_df = pd.concat(sector_frames, axis=0).reset_index(drop=True)

    # Order rows by (page, sector) and top-to-bottom within each sector.
    sorted_df = pd.concat(
        [small_df.sort_values(by='y') for _, small_df in project_df.groupby(['page', 'sector'])],
        axis=0,
    ).reset_index(drop=True)

    # Start a new question wherever the jump in y from the previous row exceeds
    # the tolerance. Every row, including the last one, is judged by the same
    # rule: the previous implementation always copied the previous row's number
    # onto the final row and raised IndexError for a single-row table.
    y_values = sorted_df['y'].to_numpy()
    starts_new_question = np.abs(np.diff(y_values)) > same_row_y_tolerance
    sorted_df['question_num'] = np.concatenate(([1], 1 + np.cumsum(starts_new_question)))

    # Within each question, number the choices from left to right.
    choice_frames = []
    for _, small_df in sorted_df.groupby('question_num'):
        small_df = small_df.sort_values('x').reset_index(drop=True)
        small_df['choices_count'] = list(range(1, len(small_df) + 1))
        choice_frames.append(small_df)

    xy_sorted_df = pd.concat(choice_frames, axis=0).reset_index(drop=True)
    json_df = xy_sorted_df.to_json(orient='columns')
    project_dict[project_name] = json.loads(json_df)

In [16]:
# Persist the three structures as JSON. The files are opened as UTF-8
# explicitly: with ensure_ascii=False the non-ASCII project names are written
# verbatim, and the platform default encoding is not guaranteed to be able to
# represent them.
json_folder = './scan/datas/json'
os.makedirs(json_folder, exist_ok=True)  # the target directory may not exist yet

for json_name, json_payload in (
    ('rects_point_dict', rects_point_dict),
    ('circle_point_dict', circle_point_dict),
    ('project_dict', project_dict),
):
    with open(f'{json_folder}/{json_name}.json', 'w', encoding='utf-8') as f:
        json.dump(json_payload, f, ensure_ascii=False, indent=4)

In [18]:
rects_point_dict

{'CLS_검사지(중고등)': {1: [[276, 430, 634, 476],
   [276, 545, 635, 593],
   [281, 779, 337, 909],
   [352, 759, 484, 1109],
   [496, 761, 632, 1104],
   [663, 371, 875, 1104],
   [877, 367, 1089, 1107],
   [1093, 365, 1311, 1100],
   [1315, 371, 1544, 1103],
   [585, 1321, 772, 1611],
   [1353, 1323, 1531, 1656],
   [763, 1743, 1337, 2173]],
  2: [[795, 76, 1279, 1029],
   [617, 1245, 800, 1649],
   [1383, 1248, 1559, 1663],
   [737, 1811, 808, 2176],
   [1503, 1800, 1555, 2188]],
  3: [[700, 139, 783, 493],
   [1467, 123, 1535, 416],
   [579, 639, 771, 2184],
   [1348, 636, 1535, 2184]],
  4: [[617, 128, 803, 2107], [1375, 125, 1557, 2101]]},
 'CLS_검사지(초등)': {1: [[276, 545, 636, 593],
   [284, 780, 332, 1015],
   [349, 755, 488, 1105],
   [496, 757, 628, 1107],
   [661, 369, 869, 1105],
   [880, 373, 1089, 1097],
   [1100, 368, 1301, 1101],
   [1312, 367, 1528, 1112],
   [516, 1311, 775, 1659],
   [1288, 1313, 1531, 1613],
   [812, 1793, 1307, 2133]],
  2: [[793, 77, 1296, 1035],
   [608,