In [9]:
import re
import os
import io
from typing import *
from dataclasses import dataclass
from PIL import Image
from zipfile import ZipFile
import xml.etree.ElementTree as ET

fpath = '***'

In [13]:
@dataclass(frozen=True)
class ImageData():
    sheet_code_name: str
    id_: str
    row: int
    col: int
    file_name: str


class ImageCollector():
    def __init__(self, file_path):
        self.__file_path:str = file_path
        self.__sheet_file_names = self.__get_sheet_file_names()

    def save_images(self) -> None:
        os.makedirs('./images', exist_ok=True)
        with ZipFile(fpath, 'r') as zip_data:
            for file in zip_data.namelist():
                if 'xl/media/image' in file:
                    file_data = zip_data.read(file)
                    # 画像のバイナリデータを読み込む
                    img_bin = io.BytesIO(file_data)
                    # バイナリデータをpillowから開く
                    img = Image.open(img_bin)
                    img.save((os.path.join('./images',os.path.basename(file))),quality = 95)
        return
    
    def collect_image_data(self) -> List[ImageData]:
        images: List[ImageData] = []
 
        for sheet_file_name in self.__sheet_file_names:
            drawing_id = self.__get_drawing_id_from_sheet_xml(sheet_file_name)
            if drawing_id is None:
                print(f'{sheet_file_name} has no drawings')
                continue
            drawing_file_name = self.__get_drawing_file_name_from_sheet_xml_rels(sheet_file_name, drawing_id)
            image_infos = self.__get_image_infos_from_drawing_xml(drawing_file_name)
            if len(image_infos) == 0:
                print(f'{sheet_file_name} has no images')
                continue
            print(f'{sheet_file_name} has {len(image_infos)} images')
            image_paths = self.__get_image_paths_from_drawing_xml_rel(drawing_file_name)
            for image_info in image_infos:
                image_data = ImageData(
                    sheet_code_name=self.__get_code_name_in_sheet_xml(sheet_file_name),
                    id_=image_info['id'],
                    row=image_info['row'],
                    col=image_info['col'],
                    file_name=os.path.basename(image_paths[image_info['id']]))
                images += [image_data]

        return images

    def __get_xml(self, file_path_in_zip: str) -> ET.XML:
        # zipファイルから，指定したxmlファイルを取得
        with ZipFile(self.__file_path, 'r') as zip_data:
            file_data = zip_data.read(file_path_in_zip)
        return ET.XML(file_data)

    def __get_sheet_file_names(self) -> List[str]:
        # ZipFileを展開して，'xl/worksheets'以下のファイル名を取得
        with ZipFile(fpath, 'r') as zip_data:
            file_names = zip_data.namelist()
        l = [os.path.basename(file_name) for file_name in file_names 
                if file_name.startswith('xl/worksheets') and 
                    file_name.endswith('.xml')]
        return sorted(l, key=lambda s: int(re.search(r'\d+', s).group()))

    def __get_code_name_in_sheet_xml(self, sheet_file_name: str) -> str:
        # sheet.xmlから，codeNameを取得
        sheet_xml = self.__get_xml(f'xl/worksheets/{sheet_file_name}')
        return sheet_xml.find("{*}sheetPr").attrib["codeName"]

    def __get_drawing_id_from_sheet_xml(self, sheet_file_name: str) -> str:
        # sheet.xmlから，drawing.xmlのidを取得
        sheet_xml = self.__get_xml(f'xl/worksheets/{sheet_file_name}')
        try:
            return list(sheet_xml.find('{*}drawing').attrib.values())[0]
        except:
            return None

    def __get_drawing_file_name_from_sheet_xml_rels(self, sheet_file_name: str, drawing_id:str) -> str:
        # drawing idを使って，drawing_xml_pathを取得
        sheet_xml_relation = self.__get_xml(f'xl/worksheets/_rels/{sheet_file_name}.rels')
        for child in sheet_xml_relation:
            if child.attrib['Id'] == drawing_id:
                draw_xml_path = child.attrib['Target']
                break
        return os.path.basename(draw_xml_path)

    def __get_image_infos_from_drawing_xml(self, drawing_file_name: str) -> List:
        # drawing.xmlから，画像のidと位置情報を取得
        drawing_xml = self.__get_xml(f'xl/drawings/{drawing_file_name}')
        image_info_list = []
        for child in drawing_xml.findall('{*}twoCellAnchor'):
            image_dict = {}
            if child.find('{*}pic') == None:
                continue
            id_dict = child.find('{*}pic').find('{*}blipFill').find('{*}blip').attrib
            image_dict['id'] = list(id_dict.values())[0]
            image_dict['row'] = child.find('{*}from').find('{*}row').text
            image_dict['col'] = child.find('{*}from').find('{*}col').text
            image_info_list.append(image_dict)
        return image_info_list

    def __get_image_paths_from_drawing_xml_rel(self, drawing_file_name:str) -> Dict:
        # 画像のidから，画像のpathを取得
        drawing_xml_relation = self.__get_xml(f'xl/drawings/_rels/{drawing_file_name}.rels')
        image_paths = {}
        for child2 in drawing_xml_relation:
            image_paths[child2.attrib['Id']] = child2.attrib['Target'] # path
        return image_paths

In [15]:
collector = ImageCollector(fpath)
image_data_list = collector.collect_image_data()
for image_data in image_data_list:
    print(image_data.__dict__)

sheet1.xml has no drawings
sheet2.xml has no drawings
sheet3.xml has no drawings
sheet4.xml has no drawings
sheet5.xml has 3 images
sheet6.xml has 26 images
sheet7.xml has 6 images
sheet8.xml has 10 images
sheet9.xml has 54 images
sheet10.xml has no drawings
sheet11.xml has no drawings
sheet12.xml has no drawings
sheet13.xml has no drawings
sheet14.xml has no drawings
sheet15.xml has no drawings
sheet16.xml has no images
sheet17.xml has no drawings
sheet18.xml has no drawings
sheet19.xml has no drawings
sheet20.xml has no drawings
sheet21.xml has no drawings
sheet22.xml has no drawings
{'sheet_code_name': 'Sheet5', 'id_': 'rId1', 'row': '7', 'col': '1', 'file_name': 'image1.jpeg'}
{'sheet_code_name': 'Sheet5', 'id_': 'rId2', 'row': '39', 'col': '1', 'file_name': 'image2.jpeg'}
{'sheet_code_name': 'Sheet5', 'id_': 'rId3', 'row': '71', 'col': '1', 'file_name': 'image3.jpeg'}
{'sheet_code_name': 'Sheet6', 'id_': 'rId1', 'row': '9', 'col': '3', 'file_name': 'image4.jpeg'}
{'sheet_code_name

In [16]:
collector.save_images()