# In [None]:
import pandas as pd
import os
import re
from pathlib import Path
from typing import List, Dict, Optional

# ==============================================================================
# 1. 配置层 (Configuration Layer)
# ==============================================================================
class Config:
    """Single home for every tunable value used by the pipeline."""

    # --- Locations and file selection ---
    INPUT_DIR = Path('path/to/your/excel_files_folder')
    OUTPUT_FILE = Path('path/to/your/output/最终报表_SOTA.xlsx')
    # File names accepted by the pipeline: six digits followed by ".xlsx".
    FILENAME_PATTERN = r'^\d{6}\.xlsx$'

    # --- Layout of the source worksheet ---
    TARGET_SHEET_NAME = '国网湖北省电力有限公司'
    SKIP_ROWS = 3  # rows skipped above the header block
    HEADER_ROWS_COUNT = 4  # rows that make up the multi-level header

    # --- Source value columns (names as they appear after flattening) ---
    # Check these against the real workbook; whitespace and punctuation
    # must match exactly.
    # Sheets '完成新装增容_*' read the '合    计' columns.
    COL_TOTAL_CAPACITY = '本月完成新装、增容_合    计_10（6）kV及以上_容量'
    COL_TOTAL_COUNT = '本月完成新装、增容_合    计_10（6）kV及以上_户数'
    # Sheets '完成新装_*' read the '其中:新装' columns.
    COL_NEW_CAPACITY = '本月完成新装、增容_其中:新装_10（6）kV及以上_容量'
    COL_NEW_COUNT = '本月完成新装、增容_其中:新装_10（6）kV及以上_户数'
    # Sheets '完成增容_*' read the '其中:' columns.
    COL_INCREASE_CAPACITY = '本月完成新装、增容_其中:_10（6）kV及以上_容量'
    COL_INCREASE_COUNT = '本月完成新装、增容_其中:_10（6）kV及以上_户数'

    # --- Identifier columns (names after flattening) ---
    BASE_COL_SEQ = '序号_nan_nan_nan'
    BASE_COL_INDUSTRY = '行业_nan_nan_nan'

    # --- Output definition ---
    @staticmethod
    def get_sheets_to_create() -> Dict[str, str]:
        """Return the output sheets mapped to their source value column.

        Returns:
            A dict with six entries, in output order, whose keys are the
            worksheet names of the final report and whose values are the
            flattened source column names defined on this class.
        """
        sheet_sources = (
            ('完成新装增容_容量', Config.COL_TOTAL_CAPACITY),
            ('完成新装_容量', Config.COL_NEW_CAPACITY),
            ('完成增容_容量', Config.COL_INCREASE_CAPACITY),
            ('完成新装增容_户数', Config.COL_TOTAL_COUNT),
            ('完成新装_户数', Config.COL_NEW_COUNT),
            ('完成增容_户数', Config.COL_INCREASE_COUNT),
        )
        return dict(sheet_sources)

# ==============================================================================
# 2. 提取与解析层 (Extraction & Parsing Layer)
# ==============================================================================
def parse_complex_headers(df: pd.DataFrame) -> List[str]:
    """Flatten the (possibly multi-level) column labels of ``df`` into strings.

    Each column's levels are joined with ``_`` after being converted to
    ``str`` and stripped. A level whose text contains ``'Unnamed:'`` is
    treated as a blank header cell and replaced by the most recent
    non-blank label seen at the same level in an earlier column; if no
    such label exists yet, the placeholder is ``None`` (rendered as
    ``'None'``).

    The input DataFrame is not modified; the caller assigns the result.

    Args:
        df: DataFrame whose ``columns`` should be flattened.

    Returns:
        One flattened name per column, in column order. Names are not
        guaranteed to be unique.
    """
    # Size the forward-fill buffer from the actual header depth so that a
    # header with more or fewer levels than expected cannot raise IndexError.
    last_valid: List[object] = [None] * df.columns.nlevels
    new_columns = []
    for label in df.columns.to_flat_index():
        # A single-level index yields scalars; wrap them so a string label
        # is not iterated character by character.
        levels = label if isinstance(label, tuple) else (label,)
        filled = []
        for i, level in enumerate(levels):
            if 'Unnamed:' not in str(level):
                last_valid[i] = level
            filled.append(last_valid[i])
        new_columns.append('_'.join(str(item).strip() for item in filled))
    return new_columns

def process_single_file(file_path: Path) -> Optional[pd.DataFrame]:
    """Read one Excel workbook and return its cleaned rows.

    The sheet named ``Config.TARGET_SHEET_NAME`` is read with a multi-row
    header, the header is flattened with ``parse_complex_headers``, rows
    whose sequence cell is ``'栏目'`` or non-numeric are dropped, and the
    file stem is stored in a ``'年月'`` column.

    Args:
        file_path: Path of the workbook to read; its stem becomes the
            value of the ``'年月'`` column.

    Returns:
        A DataFrame holding ``'序号'``, ``'行业'``, ``'年月'`` and the six
        value columns named in ``Config``, or ``None`` when the file could
        not be processed for a reason other than a missing column.

    Raises:
        KeyError: If an expected column is absent. This is re-raised on
            purpose so that a header/config mismatch stops the whole run.
    """
    year_month = file_path.stem
    print(f"  -> 正在处理文件: {file_path.name}")
    try:
        df = pd.read_excel(
            file_path,
            sheet_name=Config.TARGET_SHEET_NAME,
            skiprows=Config.SKIP_ROWS,
            header=list(range(Config.HEADER_ROWS_COUNT))
        )

        # Replace the multi-level header with flat string names.
        df.columns = parse_complex_headers(df)

        # Drop the '栏目' row. String methods on a Series live under the
        # ``.str`` accessor; calling ``.strip()`` on the Series itself raises
        # AttributeError, which the broad handler below would swallow and
        # thereby skip every file.
        seq_as_text = df[Config.BASE_COL_SEQ].astype(str).str.strip()
        df = df[seq_as_text != '栏目'].copy()
        df.dropna(how='all', inplace=True)

        # Keep only rows whose sequence cell parses as a number.
        df['序号'] = pd.to_numeric(df[Config.BASE_COL_SEQ], errors='coerce')
        df.dropna(subset=['序号'], inplace=True)
        df['序号'] = df['序号'].astype(int)
        df.rename(columns={Config.BASE_COL_INDUSTRY: '行业'}, inplace=True)
        df['年月'] = year_month

        # Keep only the columns the report needs.
        required_cols = [
            '序号', '行业', '年月',
            Config.COL_TOTAL_CAPACITY, Config.COL_TOTAL_COUNT,
            Config.COL_NEW_CAPACITY, Config.COL_NEW_COUNT,
            Config.COL_INCREASE_CAPACITY, Config.COL_INCREASE_COUNT
        ]
        return df[required_cols]

    except KeyError as e:
        print(f"    [严重错误] 在文件 {file_path.name} 中找不到关键列: {e}。")
        print("    请检查Config类中定义的'核心列名'是否与Excel文件中的表头完全一致（包括空格和标点）!")
        raise  # a column-name mismatch is fundamental, so abort the whole run
    except Exception as e:
        # Deliberate best effort: report the failure and skip just this file.
        print(f"    [警告] 处理文件 {file_path.name} 时发生未知错误，已跳过: {e}")
        return None

# ==============================================================================
# 3. 主流程 (Transformation & Loading)
# ==============================================================================
def main():
    """Drive the pipeline: gather workbooks, merge them, write the report.

    Every ``*.xlsx`` file in ``Config.INPUT_DIR`` whose name matches
    ``Config.FILENAME_PATTERN`` is passed to ``process_single_file``. The
    resulting frames are concatenated, and one pivoted worksheet per entry
    of ``Config.get_sheets_to_create()`` is written to
    ``Config.OUTPUT_FILE``. The function returns early, after printing a
    message, when the input folder is missing or no file yields data.
    """
    print("开始执行数据处理流程...")

    # --- Extract ---
    source_dir = Config.INPUT_DIR
    if not source_dir.exists():
        print(f"[致命错误] 输入文件夹不存在: {source_dir}")
        return

    monthly_files = [
        candidate
        for candidate in sorted(source_dir.glob('*.xlsx'))
        if re.match(Config.FILENAME_PATTERN, candidate.name)
    ]

    frames = []
    for workbook in monthly_files:
        cleaned = process_single_file(workbook)
        if cleaned is None:
            # The file was reported and skipped by process_single_file.
            continue
        frames.append(cleaned)

    if not frames:
        print("未找到任何可处理的数据，程序退出。")
        return

    # --- Transform ---
    print("\n所有文件数据读取完毕，正在聚合成总表...")
    combined = pd.concat(frames, ignore_index=True)

    # --- Load ---
    print("正在生成最终的多工作表Excel报表...")
    target = Config.OUTPUT_FILE
    # Create the output folder (and any parents) if it is not there yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    with pd.ExcelWriter(target, engine='openpyxl') as writer:
        for sheet_name, value_col in Config.get_sheets_to_create().items():
            print(f"  -> 正在生成工作表: {sheet_name}")

            # One row per ('序号', '行业') pair and one column per '年月'
            # value; pivot_table's default aggregation is used.
            wide = combined.pivot_table(
                index=['序号', '行业'],
                columns='年月',
                values=value_col,
            )
            # Clear the columns-axis name before the index is flattened
            # back into ordinary columns.
            wide.columns.name = None
            report = wide.reset_index().rename(columns={'行业': '分类'})

            report.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"\n全部任务成功完成！结果已保存到: {target}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()