In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import os
from tqdm.notebook import tqdm

# --- 設定項目 ---
ANNOTATION_LIST_CSV = '../data/ground_truth/annotation_target_list.csv'
MASTER_LIST_CSV = '../data/processed/citing_papers_with_paths.csv'
OUTPUT_DIR = '../data/processed'
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'features_for_evaluation.csv')

# XMLの名前空間
namespaces = {'ce': 'http://www.elsevier.com/xml/common/dtd', 'sb': 'http://www.elsevier.com/xml/common/struct-bib/dtd', 'ja': 'http://www.elsevier.com/xml/ja/dtd'}

# --- 1. XML解析用の関数群を定義 ---

def get_citation_map_et(root_element):
    """参考文献リストから {ref_id: '文献情報'} の辞書を作成する"""
    citation_map = {}
    references = root_element.findall('.//ce:bibliography/ce:bibliography-sec/ce:bib-reference', namespaces)
    for ref in references:
        ref_id = ref.get('id')
        source_text_element = ref.find('.//ce:source-text', namespaces)
        citation_text = source_text_element.text if source_text_element is not None else ''.join(ref.itertext())
        if ref_id:
            citation_map[ref_id] = citation_text.strip() if citation_text else 'N/A'
    return citation_map

def find_target_ref_id(citation_map, target_title):
    """参考文献マップとデータ論文タイトルから、対応するRef IDを見つける"""
    for ref_id, full_citation in citation_map.items():
        if target_title.lower() in full_citation.lower():
            return ref_id
    return None

def parse_sections_recursive(element):
    """XML要素から、全セクションのタイトルと、各セクション内の引用IDリストを抽出する"""
    sections_data = []
    for section in element.findall('./ce:section', namespaces):
        title_tag = section.find('./ce:section-title', namespaces)
        sec_title = title_tag.text.strip() if title_tag is not None and title_tag.text else 'No Title'
        
        citations_in_section = []
        # 直下のparaタグ内の引用のみを対象とする
        paragraphs = section.findall('./ce:para', namespaces)
        for p in paragraphs:
            cross_refs = p.findall('.//ce:cross-ref', namespaces)
            for xref in cross_refs:
                if xref.get('refid'):
                    ref_ids = xref.get('refid').split()
                    for ref_id in ref_ids:
                        citations_in_section.append(ref_id)
                        
        sections_data.append({'title': sec_title, 'citations': citations_in_section})
        # 再帰呼び出しでサブセクションも探索
        sections_data.extend(parse_sections_recursive(section))
    return sections_data

def analyze_single_xml(xml_path, target_data_paper_title):
    """1つのXMLファイルを解析し、特徴量と判定結果を抽出するメイン関数"""
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        
        citation_map = get_citation_map_et(root)
        target_ref_id = find_target_ref_id(citation_map, target_data_paper_title)
        if not target_ref_id:
            return 0, [], 0, 0

        top_level_sections = root.find('.//ja:body/ce:sections', namespaces)
        if not top_level_sections:
            return 0, [], 0, 0
        
        all_sections_data = parse_sections_recursive(top_level_sections)
        
        mention_count = 0
        mentioned_sections = []
        keywords_to_check = ['data', 'method', 'experiment']
        contains_keyword = False
        
        for section in all_sections_data:
            count_in_section = section['citations'].count(target_ref_id)
            if count_in_section > 0:
                mention_count += count_in_section
                section_title = section['title']
                mentioned_sections.append(section_title)
                if any(keyword in section_title.lower() for keyword in keywords_to_check):
                    contains_keyword = True
        
        # 手法1とprediction_rule2を計算
        prediction_method1 = 1 if mention_count >= 2 else 0
        prediction_method2 = 1 if contains_keyword else 0
        
        return mention_count, list(set(mentioned_sections)), prediction_method1, prediction_method2

    except Exception:
        return -1, ['parsing_error'], -1, -1

# --- 2. メイン処理 ---
try:
    df_targets = pd.read_csv(ANNOTATION_LIST_CSV)
    df_master = pd.read_csv(MASTER_LIST_CSV)
    
    # 2つのDataFrameをマージして、アノテーション対象の論文情報（パスを含む）を取得
    # マージのキーをタプルにして、複数の列でマージする
    merge_keys = ['citing_paper_eid', 'citing_paper_doi', 'citing_paper_title', 'cited_data_paper_title']
    df_to_process = pd.merge(df_targets, df_master.drop_duplicates(subset=merge_keys), on=merge_keys, how='left')
    
    results_list = []
    print(f"アノテーション対象 {len(df_to_process)} 件のXMLを解析します...")
    
    for index, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="特徴量抽出中"):
        xml_path = row['fulltext_xml_path']
        target_title = row['cited_data_paper_title']
        
        if pd.notna(xml_path) and os.path.exists(xml_path):
            count, sections, pred1, pred2 = analyze_single_xml(xml_path, target_title)
        else:
            count, sections, pred1, pred2 = -1, ['file_not_found'], -1, -1

        # 抽出した特徴量と判定結果を元の情報に追加
        result_row = row.to_dict()
        result_row['mention_count'] = count
        result_row['mentioned_sections'] = sections
        result_row['prediction_rule1'] = pred1
        result_row['prediction_rule2'] = pred2
        results_list.append(result_row)

    # --- 3. 結果の保存と表示 ---
    df_final = pd.DataFrame(results_list)
    
    # 指定されたカラムのみを保存
    columns_to_save = [
        'citing_paper_eid', 
        'citing_paper_doi', 
        'citing_paper_title', 
        'cited_data_paper_title',
        'mention_count',
        'mentioned_sections',
        'prediction_rule1',
        'prediction_rule2'
    ]
    df_to_save = df_final[columns_to_save]
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df_to_save.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')

    print(f"\n処理完了。特徴量抽出結果を '{OUTPUT_FILE}' に保存しました。")
    
    print("\n--- 保存されたデータの出力例（先頭5件）---")
    print(df_to_save.head())

except Exception as e:
    print(f"\nメイン処理中にエラーが発生しました: {e}")

アノテーション対象 200 件のXMLを解析します...


特徴量抽出中:   0%|          | 0/200 [00:00<?, ?it/s]

  if not top_level_sections:



処理完了。特徴量抽出結果を '../data/processed\features_for_evaluation.csv' に保存しました。

--- 保存されたデータの出力例（先頭5件）---
     citing_paper_eid                 citing_paper_doi  \
0  2-s2.0-85211640167  10.1016/j.marpolbul.2024.117442   
1  2-s2.0-85210281314   10.1016/j.jaridenv.2024.105282   
2  2-s2.0-85171680307   10.1016/j.revpalbo.2023.104989   
3  2-s2.0-85142708771     10.1016/j.foreco.2022.120653   
4  2-s2.0-85194770743    10.1016/j.jnucmat.2024.155194   

                                  citing_paper_title  \
0  Multi-indicator assessment of heavy metal poll...   
1  Potential effects of climate change on cacti d...   
2  Approaches to pollen taxonomic harmonisation i...   
3  Allometric equations to estimate the dry mass ...   
4  Microstructural evolution in doped high entrop...   

                              cited_data_paper_title  mention_count  \
0  Pollution load index for heavy metals in Mian-...              1   
1  The World Checklist of Vascular Plants, a cont...              2   
2 