In [None]:
import os
import zipfile
# Unpack the test-set images from the input archive.
zip_path = "./input/test_img_data.zip"
extract_dir = "./test_images"

# Make sure the destination directory exists before the archive is opened.
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

print(f"已解压图片到 {extract_dir}")

### Stage 1：OCR

In [None]:
# NOTE: restart the kernel after these install commands finish, otherwise the
# newly installed packages are not picked up.
# Install paddlepaddle and paddleocr, which are used to recognize text in images.
!# CPU only
!python3 -m pip install paddlepaddle==2.5.2 -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install --user -i https://pypi.tuna.tsinghua.edu.cn/simple paddleocr
# Replace the currently installed numpy with the pinned 1.26.4 release.
!pip uninstall -y numpy
!pip install --user -i https://pypi.tuna.tsinghua.edu.cn/simple numpy==1.26.4
# Replace the unpinned paddleocr installed above with the pinned 2.9.0 release.
!pip uninstall -y paddleocr
!pip install --user paddleocr==2.9.0 -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
import os
from paddleocr import PaddleOCR
import json

# Build the OCR engine with the angle classifier enabled and lang='ch'.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Local image to run OCR on  <- replace with your own image path
image_path = "./input/train_img_data/2597.jpg"

if os.path.exists(image_path):
    # Run OCR on the image.
    result = ocr.ocr(image_path, cls=True)

    # Persist only the raw, unprocessed result as a JSON file.
    output_path = "raw_ocr_result.json"
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, ensure_ascii=False, indent=2)

    print(f"原始 OCR 结果已保存至 {output_path}")
else:
    # Nothing to recognize: report the missing image instead of failing.
    print(f"图片不存在: {image_path}")

### Stage 2：Grammatical Error Correction (GEC)

In [None]:
# NOTE: restart the kernel after these install commands finish, otherwise the
# changes do not take effect.
# To avoid dependency conflicts, uninstall the OCR-related dependencies.
#!pip uninstall -y paddlex
#!pip install --user paddlex==2.1.0 -i https://pypi.tuna.tsinghua.edu.cn/simple

#!pip uninstall -y paddleocr
#!pip install --user paddleocr==2.9.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
# Install pycorrector and transformers (required by the correction model).
#!pip install --user -U -i https://pypi.tuna.tsinghua.edu.cn/simple pycorrector
#!pip install --user transformers==4.28.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
#!pip install --user kenlm -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
import json
from pycorrector import Corrector

# Imported here because this cell is meant to run after a kernel restart, so
# an `os` import from an earlier cell cannot be relied on.
import os

# Load the pycorrector model using the local language-model file.
model = Corrector(language_model_path='./models/people2014corpus_chars.klm')

# Load the OCR-derived samples; each item must provide a "source_text" field.
with open("test.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Inference: correct every sample and keep the original fields alongside the
# prediction.
results = []
for i, item in enumerate(data):
    src = item["source_text"]
    corrected_res = model.correct(src)
    corrected_text = corrected_res['target']

    # Copy the item so the loaded input data is not mutated.
    new_item = dict(item)
    new_item["predict_text"] = corrected_text

    results.append(new_item)

    # Show a preview (first 100 characters) for every 10th sample.
    if i % 10 == 0:
        print(f"\n第 {i+1} 条样本纠错结果：")
        print("原文：", src[:100])
        print("纠错：", corrected_text[:100])


# Save the final predictions. The output directory is created first so that a
# fresh checkout without ./output does not fail with FileNotFoundError.
os.makedirs("./output", exist_ok=True)
with open("./output/predict.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("纠错完成，结果已保存为 ./output/predict.json")

In [1]:
# Compress the prediction file before submitting it.
import os
import subprocess

path = os.getcwd()
newpath = path + "/output/"

# Run `zip` with the output directory as the child's working directory rather
# than os.chdir()-ing the kernel itself: the notebook's working directory is
# never changed, so a failure cannot leave later cells running in ./output.
# check=True surfaces a failed or missing `zip` instead of silently producing
# no archive.
subprocess.run(["zip", "prediction.zip", "predict.json"], cwd=newpath, check=True)

  adding: predict.json (deflated 86%)


In [None]:
# Reinstall paddlex pinned to 2.1.0 (from the Tsinghua PyPI mirror).
!pip uninstall -y paddlex
!pip install --user paddlex==2.1.0 -i https://pypi.tuna.tsinghua.edu.cn/simple

# Reinstall paddleocr pinned to 2.9.0 (from the Tsinghua PyPI mirror).
!pip uninstall -y paddleocr
!pip install --user paddleocr==2.9.0 -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
import json

def process_recognition_result(res_file, output_file, homework_id=2597, image_name="2597.jpg"):
    """Convert one OCR result file into a single-sample JSON list.

    The text lines in ``rec_texts`` are concatenated into one string, and each
    line's box in ``rec_boxes`` is split horizontally into equal-width
    per-character boxes.

    Args:
        res_file: Path to a JSON file containing ``rec_texts`` (list of
            strings) and ``rec_boxes`` (list of ``[x1, y1, x2, y2]`` boxes,
            paired with ``rec_texts`` by position).
        output_file: Path of the JSON file to write. It receives a list with
            exactly one sample dict.
        homework_id: Value stored under ``fk_homework_id``. Defaults to 2597.
        image_name: Value stored under ``path``. Defaults to ``"2597.jpg"``.

    Raises:
        KeyError: If ``rec_texts`` or ``rec_boxes`` is missing from the input.
    """
    # Read the OCR recognition result.
    with open(res_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rec_texts = data['rec_texts']
    source_text = ''.join(rec_texts)  # merge all text lines
    # Placeholder: identical to the source text; replace it with the corrected
    # text if correction is needed.
    target_text = source_text

    # Bounding boxes of all characters, in reading order.
    char_bounding_boxes = []

    for text_line, box in zip(rec_texts, data['rec_boxes']):
        num_chars = len(text_line)
        if num_chars == 0:
            # An empty line contributes no characters (and would divide by zero).
            continue

        x1, y1, x2, y2 = box
        line_width = x2 - x1
        # Cast the vertical bounds as well so every coordinate is an int.
        start_y, end_y = int(y1), int(y2)

        # Multiply before dividing: `i * (line_width / num_chars)` can land just
        # below a whole number through float rounding (e.g. 49 * (1 / 49)), and
        # int() would then truncate the box edge one pixel short.
        for i in range(num_chars):
            char_bounding_boxes.append({
                "start_x": int(x1 + line_width * i / num_chars),
                "end_x": int(x1 + line_width * (i + 1) / num_chars),
                "start_y": start_y,
                "end_y": end_y,
            })

    # Build the output JSON object.
    result = {
        "fk_homework_id": homework_id,
        "path": image_name,
        "source_text": source_text,
        "target_text": target_text,
        "bounding_box_list": char_bounding_boxes
    }

    # Write the output file (a list holding the single sample).
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump([result], f, ensure_ascii=False, indent=2)

# Run the conversion: read the OCR result for image 2597 and write test.json.
process_recognition_result('./output/2597_res.json', 'test.json')

In [1]:
# Compress the prediction file before submitting it.
import os
import subprocess

path = os.getcwd()
newpath = path + "/output/"

# Run `zip` with the output directory as the child's working directory rather
# than os.chdir()-ing the kernel itself: the notebook's working directory is
# never changed, so a failure cannot leave later cells running in ./output.
# check=True surfaces a failed or missing `zip` instead of silently producing
# no archive.
subprocess.run(["zip", "prediction.zip", "predict.json"], cwd=newpath, check=True)

  adding: predict.json (deflated 85%)
