In [31]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

# --- 1. Settings & file paths ---
# The three CSVs below are joined on 'citing_paper_doi' in section 2.
GROUND_TRUTH_CSV = '../data/ground_truth/annotation_target_list.csv'
FEATURES_CSV = '../data/processed/features_for_evaluation.csv'
LLM_PREDICTIONS_CSV = '../data/processed/prediction_llm.csv'
RESULTS_DIR = '../results'
TABLES_DIR = os.path.join(RESULTS_DIR, 'tables')

os.makedirs(TABLES_DIR, exist_ok=True)

# Start from empty frames so a failed load can never leave the frames of an
# earlier notebook run in `df_eval_base` / `df_metrics`.
df_eval_base = pd.DataFrame()
df_metrics = pd.DataFrame()


def and_gate_prediction(rule_pred, llm_pred):
    """Combine two prediction Series with a logical AND.

    A row is 1 when both inputs equal 1, and 0 when at least one input is a
    definite non-1 answer.  A row that cannot be decided because an input is
    NaN or -1 (the two values the metric loop in section 4 excludes) becomes
    NaN instead of being silently scored as a negative prediction, so the
    metric loop excludes it for the hybrid as well.

    Args:
        rule_pred: Series holding the rule-based predictions.
        llm_pred: Series holding the LLM predictions, aligned with `rule_pred`.

    Returns:
        An int Series of 0/1 when every row is decidable (identical to the
        plain `((a == 1) & (b == 1)).astype(int)`), otherwise a float Series
        with NaN on the undecidable rows.
    """
    rule_missing = rule_pred.isna() | (rule_pred == -1)
    llm_missing = llm_pred.isna() | (llm_pred == -1)
    both_positive = (rule_pred == 1) & (llm_pred == 1)
    # One definite "not 1" answer settles an AND no matter what the other is.
    definite_negative = (~rule_missing & (rule_pred != 1)) | (~llm_missing & (llm_pred != 1))
    undecidable = ~(both_positive | definite_negative)

    combined = both_positive.astype(int)
    if undecidable.any():
        combined = combined.astype(float).mask(undecidable)
    return combined


# --- 2. Load and merge the data ---
try:
    # Only the reads can raise FileNotFoundError, so only they are guarded.
    df_gt = pd.read_csv(GROUND_TRUTH_CSV)
    df_features = pd.read_csv(FEATURES_CSV)
    df_llm = pd.read_csv(LLM_PREDICTIONS_CSV)
except FileNotFoundError as e:
    print(f"エラー: ファイルが見つかりません。パスを確認してください。 {e}")
    df_eval_base = pd.DataFrame()
else:
    # Master frame: ground truth plus the two rule-based predictions.
    # The inner join keeps only papers present in both files.
    df_eval_base = pd.merge(
        df_gt[['citing_paper_doi', 'is_data_used_gt']],
        df_features[['citing_paper_doi', 'prediction_rule1', 'prediction_rule2']],
        on='citing_paper_doi', how='inner'
    )

    # Every LLM result column that may be evaluated; columns missing from the
    # predictions file are skipped rather than raising.
    llm_columns_to_merge = [
        'citing_paper_doi',
        'prediction_rule3_abstract',
        'prediction_rule3_fulltext',
        'prediction_rule3_fulltext_few_shot',
        'prediction_rule3_gemini-2_5-flash',
        'prediction_rule3_gemini-2_5-flash_zeroshot',
    ]
    available_llm_columns = [c for c in llm_columns_to_merge if c in df_llm.columns]
    # The left join keeps papers without an LLM prediction (they become NaN).
    df_eval_base = pd.merge(
        df_eval_base,
        df_llm[available_llm_columns],
        on='citing_paper_doi', how='left'
    )

    # Keep only rows whose ground-truth label is 0 or 1.
    df_eval_base = df_eval_base[df_eval_base['is_data_used_gt'].isin([0, 1])].copy()
    df_eval_base['is_data_used_gt'] = df_eval_base['is_data_used_gt'].astype(int)

    print("全データを正常に読み込み・結合しました。")

# --- 3. Generate the hybrid (Rule 2 AND LLM) predictions ---
# Maps each hybrid column to the LLM column it is combined with.
hybrid_sources = {
    'prediction_hybrid_AND_zeroshot': 'prediction_rule3_fulltext',
    'prediction_hybrid_AND_fewshot': 'prediction_rule3_fulltext_few_shot',
    'prediction_hybrid_AND_gemini2_5_fewshot': 'prediction_rule3_gemini-2_5-flash',
    'prediction_hybrid_AND_gemini2_5_zeroshot': 'prediction_rule3_gemini-2_5-flash_zeroshot',
}
if not df_eval_base.empty:
    for hybrid_col, llm_col in hybrid_sources.items():
        if llm_col in df_eval_base.columns:
            df_eval_base[hybrid_col] = and_gate_prediction(
                df_eval_base['prediction_rule2'], df_eval_base[llm_col]
            )

# --- 4. Compute the evaluation metrics ---
if not df_eval_base.empty:

    # Display name -> prediction column, in report order.
    rule_columns = {
        "Rule 1 (Mention Count)": "prediction_rule1",
        "Rule 2 (Section Keyword)": "prediction_rule2",
        "Rule 3 (LLM 1.5 Flash, Zero-shot)": "prediction_rule3_fulltext",
        "Rule 3 (LLM 1.5 Flash, Few-shot)": "prediction_rule3_fulltext_few_shot",
        "Rule 3 (LLM 2.5 Flash, Few-shot)": "prediction_rule3_gemini-2_5-flash",
        "Rule 3 (LLM 2.5 Flash, Zero-shot)": "prediction_rule3_gemini-2_5-flash_zeroshot",
        "Hybrid (Rule 2 AND 1.5 Zero-shot)": "prediction_hybrid_AND_zeroshot",
        "Hybrid (Rule 2 AND 1.5 Few-shot)": "prediction_hybrid_AND_fewshot",
        "Hybrid (Rule 2 AND 2.5 Few-shot)": "prediction_hybrid_AND_gemini2_5_fewshot",
        "Hybrid (Rule 2 AND 2.5 Zero-shot)": "prediction_hybrid_AND_gemini2_5_zeroshot",
    }

    results = []

    for name, col in rule_columns.items():
        if col not in df_eval_base.columns:
            continue

        # Score only rows with a usable prediction: drop NaN first, then -1.
        scored = df_eval_base.loc[df_eval_base[col].notna(), ['is_data_used_gt', col]]
        scored = scored[scored[col] != -1]
        if scored.empty:
            continue

        y_true = scored['is_data_used_gt']
        y_pred = scored[col].astype(int)

        # labels=[0, 1] forces a 2x2 matrix even if one class is absent.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        results.append({
            'Rule': name,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, zero_division=0),
            'Recall': recall_score(y_true, y_pred, zero_division=0),
            'F1-Score': f1_score(y_true, y_pred, zero_division=0),
            'Eval_Count': len(y_true)
        })

    df_metrics = pd.DataFrame(results)

    df_metrics.to_csv(os.path.join(TABLES_DIR, 'evaluation_metrics_all_models.csv'), index=False)

    print("\n" + "="*80)
    print("【最終評価結果レポート（全モデル比較）】")
    print("="*80)
    print(df_metrics.round(3))

全データを正常に読み込み・結合しました。

【最終評価結果レポート（全モデル比較）】
                                Rule  TP   TN  FP  FN  Accuracy  Precision  \
0             Rule 1 (Mention Count)  10  107  18  65     0.585      0.357   
1           Rule 2 (Section Keyword)  32  117   8  43     0.745      0.800   
2  Rule 3 (LLM 1.5 Flash, Zero-shot)  43  115  10  32     0.790      0.811   
3   Rule 3 (LLM 1.5 Flash, Few-shot)  54  102  23  21     0.780      0.701   
4   Rule 3 (LLM 2.5 Flash, Few-shot)  70  120   5   5     0.950      0.933   
5  Rule 3 (LLM 2.5 Flash, Zero-shot)  70  102  23   5     0.860      0.753   
6  Hybrid (Rule 2 AND 1.5 Zero-shot)  24  122   3  51     0.730      0.889   
7   Hybrid (Rule 2 AND 1.5 Few-shot)  29  124   1  46     0.765      0.967   
8   Hybrid (Rule 2 AND 2.5 Few-shot)  31  125   0  44     0.780      1.000   
9  Hybrid (Rule 2 AND 2.5 Zero-shot)  31  124   1  44     0.775      0.969   

   Recall  F1-Score  Eval_Count  
0   0.133     0.194         200  
1   0.427     0.557         20

In [32]:
# Last expression of the cell: the notebook renders the metrics table.
df_metrics

Unnamed: 0,Rule,TP,TN,FP,FN,Accuracy,Precision,Recall,F1-Score,Eval_Count
0,Rule 1 (Mention Count),10,107,18,65,0.585,0.357143,0.133333,0.194175,200
1,Rule 2 (Section Keyword),32,117,8,43,0.745,0.8,0.426667,0.556522,200
2,"Rule 3 (LLM 1.5 Flash, Zero-shot)",43,115,10,32,0.79,0.811321,0.573333,0.671875,200
3,"Rule 3 (LLM 1.5 Flash, Few-shot)",54,102,23,21,0.78,0.701299,0.72,0.710526,200
4,"Rule 3 (LLM 2.5 Flash, Few-shot)",70,120,5,5,0.95,0.933333,0.933333,0.933333,200
5,"Rule 3 (LLM 2.5 Flash, Zero-shot)",70,102,23,5,0.86,0.752688,0.933333,0.833333,200
6,Hybrid (Rule 2 AND 1.5 Zero-shot),24,122,3,51,0.73,0.888889,0.32,0.470588,200
7,Hybrid (Rule 2 AND 1.5 Few-shot),29,124,1,46,0.765,0.966667,0.386667,0.552381,200
8,Hybrid (Rule 2 AND 2.5 Few-shot),31,125,0,44,0.78,1.0,0.413333,0.584906,200
9,Hybrid (Rule 2 AND 2.5 Zero-shot),31,124,1,44,0.775,0.96875,0.413333,0.579439,200


In [33]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os
import numpy as np

# --- 1. Settings & file paths ---
# The three CSVs below are joined on 'citing_paper_doi' in section 2.
GROUND_TRUTH_CSV = '../data/ground_truth/annotation_target_list_updated.csv'
FEATURES_CSV = '../data/processed/features_for_evaluation.csv'
LLM_PREDICTIONS_CSV = '../data/processed/prediction_llm.csv'
RESULTS_DIR = '../results'
TABLES_DIR = os.path.join(RESULTS_DIR, 'tables')

os.makedirs(TABLES_DIR, exist_ok=True)

# Start from empty frames so a failed load can never leave the frames of an
# earlier notebook run in `df_eval_base` / `df_metrics`.
df_eval_base = pd.DataFrame()
df_metrics = pd.DataFrame()


def and_gate_prediction(rule_pred, llm_pred):
    """Combine two prediction Series with a logical AND.

    A row is 1 when both inputs equal 1, and 0 when at least one input is a
    definite non-1 answer.  A row that cannot be decided because an input is
    NaN or -1 (the two values the metric loop in section 4 excludes) becomes
    NaN instead of being silently scored as a negative prediction, so the
    metric loop excludes it for the hybrid as well.

    Args:
        rule_pred: Series holding the rule-based predictions.
        llm_pred: Series holding the LLM predictions, aligned with `rule_pred`.

    Returns:
        An int Series of 0/1 when every row is decidable (identical to the
        plain `((a == 1) & (b == 1)).astype(int)`), otherwise a float Series
        with NaN on the undecidable rows.
    """
    rule_missing = rule_pred.isna() | (rule_pred == -1)
    llm_missing = llm_pred.isna() | (llm_pred == -1)
    both_positive = (rule_pred == 1) & (llm_pred == 1)
    # One definite "not 1" answer settles an AND no matter what the other is.
    definite_negative = (~rule_missing & (rule_pred != 1)) | (~llm_missing & (llm_pred != 1))
    undecidable = ~(both_positive | definite_negative)

    combined = both_positive.astype(int)
    if undecidable.any():
        combined = combined.astype(float).mask(undecidable)
    return combined


# --- 2. Load and merge the data ---
try:
    # Only the reads can raise FileNotFoundError, so only they are guarded.
    df_gt = pd.read_csv(GROUND_TRUTH_CSV)
    df_features = pd.read_csv(FEATURES_CSV)
    df_llm = pd.read_csv(LLM_PREDICTIONS_CSV)
except FileNotFoundError as e:
    print(f"エラー: ファイルが見つかりません。パスを確認してください。 {e}")
    df_eval_base = pd.DataFrame()
else:
    # Master frame: ground truth plus the two rule-based predictions.
    # The inner join keeps only papers present in both files.
    df_eval_base = pd.merge(
        df_gt[['citing_paper_doi', 'is_data_used_gt']],
        df_features[['citing_paper_doi', 'prediction_rule1', 'prediction_rule2']],
        on='citing_paper_doi', how='inner'
    )

    # LLM result columns to bring in; columns missing from the predictions
    # file are skipped rather than raising.
    llm_columns_to_merge = [
        'citing_paper_doi', 'prediction_rule3_abstract',
        'prediction_rule3_fulltext', 'prediction_rule3_fulltext_few_shot',
        'prediction_rule3_gemini-2_5-flash', 'prediction_rule3_gemini-2_5-flash_zeroshot',
    ]
    available_llm_columns = [c for c in llm_columns_to_merge if c in df_llm.columns]
    # The left join keeps papers without an LLM prediction (they become NaN).
    df_eval_base = pd.merge(
        df_eval_base,
        df_llm[available_llm_columns],
        on='citing_paper_doi', how='left'
    )

    # Keep only rows whose ground-truth label is 0 or 1.
    df_eval_base = df_eval_base[df_eval_base['is_data_used_gt'].isin([0, 1])].copy()
    df_eval_base['is_data_used_gt'] = df_eval_base['is_data_used_gt'].astype(int)

    print("全データを正常に読み込み・結合しました。")

# --- 3. Generate the hybrid-rule predictions ---
# AND-gate hybrids: maps each hybrid column to the LLM column it is combined
# with Rule 2.
hybrid_sources = {
    'prediction_hybrid_AND_zeroshot': 'prediction_rule3_fulltext',
    'prediction_hybrid_AND_fewshot': 'prediction_rule3_fulltext_few_shot',
    'prediction_hybrid_AND_gemini2_5': 'prediction_rule3_gemini-2_5-flash',
    'prediction_hybrid_AND_gemini2_5_zeroshot': 'prediction_rule3_gemini-2_5-flash_zeroshot',
}
if not df_eval_base.empty:
    for hybrid_col, llm_col in hybrid_sources.items():
        if llm_col in df_eval_base.columns:
            df_eval_base[hybrid_col] = and_gate_prediction(
                df_eval_base['prediction_rule2'], df_eval_base[llm_col]
            )

    # Hierarchical hybrid model:
    #   1. Rule 2 and the full-text LLM both say 1 -> 1
    #   2. the full-text LLM says 0                -> 0
    #   3. anything else -> fall back to the 'gemini-2_5-flash' prediction
    #      (NaN / -1 fallbacks are excluded later by the metric loop).
    needed = {'prediction_rule3_gemini-2_5-flash', 'prediction_rule3_fulltext'}
    if needed.issubset(df_eval_base.columns):
        fallback_prediction = df_eval_base['prediction_rule3_gemini-2_5-flash']
        agree_positive = (df_eval_base['prediction_rule2'] == 1) & (df_eval_base['prediction_rule3_fulltext'] == 1)
        fulltext_negative = (df_eval_base['prediction_rule3_fulltext'] == 0)
        # np.select takes the first matching condition, so order matters.
        df_eval_base['prediction_hierarchical_hybrid'] = np.select(
            [agree_positive, fulltext_negative],
            [1, 0],
            default=fallback_prediction
        )

# --- 4. Compute the evaluation metrics ---
if not df_eval_base.empty:

    # Display name -> prediction column, in report order.
    rule_columns = {
        "Rule 1 (Mention Count)": "prediction_rule1",
        "Rule 2 (Section Keyword)": "prediction_rule2",
        "Rule 3 (LLM 1.5, Zero-shot)": "prediction_rule3_fulltext",
        "Rule 3 (LLM 1.5, Few-shot)": "prediction_rule3_fulltext_few_shot",
        "Rule 3 (LLM 2.5, Few-shot)": "prediction_rule3_gemini-2_5-flash",
        "Rule 3 (LLM 2.5, Zero-shot)": "prediction_rule3_gemini-2_5-flash_zeroshot",
        "Hybrid (Rule 2 AND 2.5 Zero-shot)": "prediction_hybrid_AND_gemini2_5_zeroshot",
        "Hierarchical Hybrid Model": "prediction_hierarchical_hybrid",
    }

    results = []

    for name, col in rule_columns.items():
        if col not in df_eval_base.columns:
            continue

        # Score only rows with a usable prediction: drop NaN first, then -1.
        scored = df_eval_base.loc[df_eval_base[col].notna(), ['is_data_used_gt', col]]
        scored = scored[scored[col] != -1]
        if scored.empty:
            continue

        y_true = scored['is_data_used_gt']
        y_pred = scored[col].astype(int)

        # labels=[0, 1] forces a 2x2 matrix even if one class is absent.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        results.append({
            'Rule': name,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, zero_division=0),
            'Recall': recall_score(y_true, y_pred, zero_division=0),
            'F1-Score': f1_score(y_true, y_pred, zero_division=0),
            'Eval_Count': len(y_true)
        })

    df_metrics = pd.DataFrame(results)

    df_metrics.to_csv(os.path.join(TABLES_DIR, 'evaluation_metrics_hierarchical.csv'), index=False)

    print("\n" + "="*80)
    print("【最終評価結果レポート（階層的ハイブリッドモデル追加）】")
    print("="*80)
    print(df_metrics.round(3))

全データを正常に読み込み・結合しました。

【最終評価結果レポート（階層的ハイブリッドモデル追加）】
                                Rule  TP   TN  FP  FN  Accuracy  Precision  \
0             Rule 1 (Mention Count)  13  113  15  59     0.630      0.464   
1           Rule 2 (Section Keyword)  31  118  10  41     0.745      0.756   
2        Rule 3 (LLM 1.5, Zero-shot)  41  116  12  31     0.785      0.774   
3         Rule 3 (LLM 1.5, Few-shot)  45   96  32  27     0.705      0.584   
4         Rule 3 (LLM 2.5, Few-shot)  56  109  19  16     0.825      0.747   
5        Rule 3 (LLM 2.5, Zero-shot)  62   97  31  10     0.795      0.667   
6  Hybrid (Rule 2 AND 2.5 Zero-shot)  28  124   4  44     0.760      0.875   
7          Hierarchical Hybrid Model  41  123   5  31     0.820      0.891   

   Recall  F1-Score  Eval_Count  
0   0.181     0.260         200  
1   0.431     0.549         200  
2   0.569     0.656         200  
3   0.625     0.604         200  
4   0.778     0.762         200  
5   0.861     0.752         200  
6   0.38

In [34]:
# Last expression of the cell: the notebook renders the metrics table.
df_metrics

Unnamed: 0,Rule,TP,TN,FP,FN,Accuracy,Precision,Recall,F1-Score,Eval_Count
0,Rule 1 (Mention Count),13,113,15,59,0.63,0.464286,0.180556,0.26,200
1,Rule 2 (Section Keyword),31,118,10,41,0.745,0.756098,0.430556,0.548673,200
2,"Rule 3 (LLM 1.5, Zero-shot)",41,116,12,31,0.785,0.773585,0.569444,0.656,200
3,"Rule 3 (LLM 1.5, Few-shot)",45,96,32,27,0.705,0.584416,0.625,0.604027,200
4,"Rule 3 (LLM 2.5, Few-shot)",56,109,19,16,0.825,0.746667,0.777778,0.761905,200
5,"Rule 3 (LLM 2.5, Zero-shot)",62,97,31,10,0.795,0.666667,0.861111,0.751515,200
6,Hybrid (Rule 2 AND 2.5 Zero-shot),28,124,4,44,0.76,0.875,0.388889,0.538462,200
7,Hierarchical Hybrid Model,41,123,5,31,0.82,0.891304,0.569444,0.694915,200


In [42]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

# --- 1. Settings & file paths ---
# The three CSVs below are joined on 'citing_paper_doi' in section 2.
GROUND_TRUTH_CSV = '../data/ground_truth/annotation_target_list.csv'
FEATURES_CSV = '../data/processed/features_for_evaluation.csv'
LLM_PREDICTIONS_CSV = '../data/processed/prediction_llm.csv'
RESULTS_DIR = '../results'
TABLES_DIR = os.path.join(RESULTS_DIR, 'tables')

os.makedirs(TABLES_DIR, exist_ok=True)

# Show floats with three decimals in the rendered table (notebook-wide option).
pd.set_option('display.precision', 3)

# Start from empty frames: the final `df_metrics` expression runs even when
# loading fails, and must not raise NameError or show an earlier cell's table.
df_eval_base = pd.DataFrame()
df_metrics = pd.DataFrame()

# --- 2. Load and merge the data ---
try:
    # Only the reads can raise FileNotFoundError, so only they are guarded.
    df_gt = pd.read_csv(GROUND_TRUTH_CSV)
    df_features = pd.read_csv(FEATURES_CSV)
    df_llm = pd.read_csv(LLM_PREDICTIONS_CSV)
except FileNotFoundError as e:
    print(f"エラー: ファイルが見つかりません。パスを確認してください。 {e}")
    df_eval_base = pd.DataFrame()
else:
    # Master frame: ground truth plus the two rule-based predictions.
    # The inner join keeps only papers present in both files.
    df_eval_base = pd.merge(
        df_gt[['citing_paper_doi', 'is_data_used_gt']],
        df_features[['citing_paper_doi', 'prediction_rule1', 'prediction_rule2']],
        on='citing_paper_doi', how='inner'
    )

    # LLM result columns to bring in; columns missing from the predictions
    # file are skipped rather than raising.
    llm_columns_to_merge = [
        'citing_paper_doi',
        'prediction_rule3_gemini-2_5-flash',
        'prediction_rule3_gemini-2_5-flash_zeroshot',
    ]
    available_llm_columns = [c for c in llm_columns_to_merge if c in df_llm.columns]
    # The left join keeps papers without an LLM prediction (they become NaN).
    df_eval_base = pd.merge(
        df_eval_base,
        df_llm[available_llm_columns],
        on='citing_paper_doi', how='left'
    )

    # Keep only rows whose ground-truth label is 0 or 1.
    df_eval_base = df_eval_base[df_eval_base['is_data_used_gt'].isin([0, 1])].copy()
    df_eval_base['is_data_used_gt'] = df_eval_base['is_data_used_gt'].astype(int)

    print("全データを正常に読み込み・結合しました。")

# --- 3. Compute the evaluation metrics ---
if not df_eval_base.empty:

    # Evaluation is limited to these four methods (display name -> column).
    rule_columns = {
        "引用回数による判定": "prediction_rule1",
        "引用セクションによる判定": "prediction_rule2",
        "Gemini 2.5 Flash, Zero-shot": "prediction_rule3_gemini-2_5-flash_zeroshot",
        "Gemini 2.5 Flash, Few-shot": "prediction_rule3_gemini-2_5-flash",
    }

    results = []

    for name, col in rule_columns.items():
        if col not in df_eval_base.columns:
            continue

        # Score only rows with a usable prediction: drop NaN first, then -1.
        scored = df_eval_base.loc[df_eval_base[col].notna(), ['is_data_used_gt', col]]
        scored = scored[scored[col] != -1]
        if scored.empty:
            continue

        y_true = scored['is_data_used_gt']
        y_pred = scored[col].astype(int)

        # labels=[0, 1] forces a 2x2 matrix even if one class is absent.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        results.append({
            'Rule': name,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, zero_division=0),
            'Recall': recall_score(y_true, y_pred, zero_division=0),
            'F1-Score': f1_score(y_true, y_pred, zero_division=0),
            'Eval_Count': len(y_true)
        })

    df_metrics = pd.DataFrame(results)

    df_metrics.to_csv(os.path.join(TABLES_DIR, 'evaluation_metrics_summary.csv'), index=False)

# Last expression of the cell: the notebook renders the metrics table
# (empty when the input files could not be loaded).
df_metrics

全データを正常に読み込み・結合しました。


Unnamed: 0,Rule,TP,TN,FP,FN,Accuracy,Precision,Recall,F1-Score,Eval_Count
0,引用回数による判定,10,107,18,65,0.585,0.357,0.133,0.194,200
1,引用セクションによる判定,32,117,8,43,0.745,0.8,0.427,0.557,200
2,"Gemini 2.5 Flash, Zero-shot",70,102,23,5,0.86,0.753,0.933,0.833,200
3,"Gemini 2.5 Flash, Few-shot",70,120,5,5,0.95,0.933,0.933,0.933,200
