In [2]:
import pandas as pd
import sqlite3
import os
import itertools
from tqdm.auto import tqdm
from IPython.display import display

# --- Step 1: Settings ---
# SQLite database opened by create_training_dataset_with_dois(); it is queried
# for the `positive_candidates`, `papers` and `full_texts` tables.
DB_PATH = "data/processed/s2orc_filtered.db"
# CSV read for its `cited_datapaper_doi` column (the training data papers).
TRAINING_DATAPAPERS_FILE = "data/datapapers/sampled/training_data_papers_50.csv"
# Change note: the output file name was changed.
OUTPUT_FILE = "data/processed/training_dataset_abstract_with_dois.csv"
# Number of negative pairs requested per generated positive pair.
NEGATIVE_SAMPLE_RATIO = 4

# Column order shared by the positive and negative pair frames.
_PAIR_COLUMNS = ['doi_a', 'doi_b', 'label', 'data_paper_doi']

# Maximum number of bound parameters sent in one statement. Kept below 999,
# the default SQLITE_MAX_VARIABLE_NUMBER of SQLite builds older than 3.32.
_SQL_CHUNK_SIZE = 900


def _read_sql_in_chunks(conn, query_template, values, chunk_size=_SQL_CHUNK_SIZE):
    """Run an ``IN (...)`` query in batches and concatenate the results.

    Args:
        conn: Open sqlite3 connection.
        query_template: SQL text containing a single ``{placeholders}`` field
            where the comma-separated ``?`` markers are inserted.
        values: Values to bind; they should be unique so that no row is
            returned by more than one batch.
        chunk_size: Maximum number of values bound per statement.

    Returns:
        A DataFrame with the rows of all batches (empty if nothing matched).
    """
    values = list(values)
    frames = []
    for start in range(0, len(values), chunk_size):
        chunk = values[start:start + chunk_size]
        placeholders = ','.join('?' * len(chunk))
        frames.append(
            pd.read_sql_query(
                query_template.format(placeholders=placeholders), conn, params=chunk
            )
        )

    # Concatenate only non-empty batches so empty ones cannot affect dtypes.
    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        return pd.concat(non_empty, ignore_index=True)
    # Nothing matched: keep the column layout of the first batch if there is one.
    return frames[0] if frames else pd.DataFrame()


def _build_positive_pairs(df_candidates):
    """Build label-1 DOI pairs from papers that used the same data paper.

    For every data paper, each pair of human-confirmed citing papers is
    emitted, followed by each (human-confirmed, LLM-only) combination.
    Data papers without any human-confirmed citing paper are skipped.
    """
    rows = []
    grouped = df_candidates.groupby('cited_datapaper_doi')

    for data_paper_doi, group in tqdm(grouped, desc="Generating Positive Pairs"):
        status = group['human_annotation_status']
        # "green" in the original notes: confirmed by a human annotator.
        human_used_dois = group.loc[status == 1, 'citing_doi'].tolist()
        # "white" in the original notes: human status 0, i.e. selected by the
        # query only because llm_annotation_status = 1.
        llm_used_dois = group.loc[status == 0, 'citing_doi'].tolist()

        if not human_used_dois:
            continue

        # (green, green) pairs first, then (green, white) pairs.
        pairs = itertools.chain(
            itertools.combinations(human_used_dois, 2),
            itertools.product(human_used_dois, llm_used_dois),
        )
        rows.extend(
            {
                'doi_a': doi_a,
                'doi_b': doi_b,
                'label': 1,
                'data_paper_doi': data_paper_doi,
            }
            for doi_a, doi_b in pairs
        )

    return pd.DataFrame(rows, columns=_PAIR_COLUMNS)


def create_training_dataset_with_dois():
    """Create the abstract-pair training dataset and write it to OUTPUT_FILE.

    Steps:
      1. Read the training data-paper DOIs from TRAINING_DATAPAPERS_FILE.
      2. Load the matching rows of ``positive_candidates`` from DB_PATH.
      3. Build positive pairs (label 1) of papers citing the same data paper.
      4. Build negative pairs (label 0) by pairing sampled positive anchors
         with randomly drawn papers that are not part of any positive pair.
      5. Look up the abstract of every DOI involved.
      6. Drop pairs with a missing abstract, save the CSV and display a preview.

    Returns:
        None. Progress is printed. Any exception is caught and printed rather
        than raised, so a failure does not interrupt the calling notebook.
    """
    # Local import so the block does not depend on a file-level import change.
    from contextlib import closing

    print("--- Creating Training Dataset (Abstract Version with DOIs) ---")

    try:
        # sqlite3's own context manager only commits / rolls back the
        # transaction; closing() makes sure the connection is released.
        with closing(sqlite3.connect(DB_PATH)) as conn:

            # --- 1. Get the list of training data-paper DOIs ---
            df_train_papers = pd.read_csv(TRAINING_DATAPAPERS_FILE)
            train_dois = (
                df_train_papers['cited_datapaper_doi'].dropna().unique().tolist()
            )
            if not train_dois:
                print("No training data-paper DOIs found; dataset not written.")
                return

            # --- 2. Get the papers from which positive pairs are built ---
            query = """
                SELECT
                    cited_datapaper_doi,
                    citing_doi,
                    human_annotation_status
                FROM
                    positive_candidates
                WHERE
                    cited_datapaper_doi IN ({placeholders})
                    AND (human_annotation_status = 1 OR (llm_annotation_status = 1 AND human_annotation_status = 0))
            """
            df_candidates = _read_sql_in_chunks(conn, query, train_dois)

            # --- 3. Create positive pairs (combinations of DOIs) ---
            df_positive = _build_positive_pairs(df_candidates)
            print(f"Generated {len(df_positive):,} positive pairs.")
            if df_positive.empty:
                # Without anchors no negative can be sampled either.
                print("No positive pairs could be generated; dataset not written.")
                return

            # --- 4. Create negative pairs (easy negatives) ---
            num_negatives_to_sample = len(df_positive) * NEGATIVE_SAMPLE_RATIO
            positive_dois = set(df_positive['doi_a']) | set(df_positive['doi_b'])
            anchor_dois_for_negative = df_positive['doi_a'].sample(
                n=num_negatives_to_sample, replace=True
            ).tolist()

            # Twice as many rows as needed are requested so that enough remain
            # after removing papers that take part in a positive pair.
            query_random = """
                SELECT doi FROM papers 
                WHERE doi NOT IN (SELECT doi FROM full_texts) 
                  AND abstract IS NOT NULL AND abstract != ''
                ORDER BY RANDOM() 
                LIMIT ?
            """
            df_random_papers = pd.read_sql_query(
                query_random, conn, params=(num_negatives_to_sample * 2,)
            )
            is_positive = df_random_papers['doi'].isin(positive_dois)
            random_dois = df_random_papers.loc[~is_positive, 'doi'].tolist()

            if len(random_dois) < num_negatives_to_sample:
                print(
                    f"Warning: only {len(random_dois):,} of "
                    f"{num_negatives_to_sample:,} requested negatives are available."
                )

            # zip() stops at the shorter list, so at most
            # num_negatives_to_sample pairs are produced.
            negative_pairs = [
                {
                    'doi_a': anchor_doi,
                    'doi_b': random_doi,
                    'label': 0,
                    'data_paper_doi': None,
                }
                for anchor_doi, random_doi in zip(anchor_dois_for_negative, random_dois)
            ]
            df_negative = pd.DataFrame(negative_pairs, columns=_PAIR_COLUMNS)
            print(f"Generated {len(df_negative):,} negative pairs.")

            # --- 5. Combine all pairs and fetch the abstracts ---
            df_final_pairs = pd.concat([df_positive, df_negative]).reset_index(drop=True)
            all_dois = set(df_final_pairs['doi_a']) | set(df_final_pairs['doi_b'])

            print(f"Fetching abstracts for {len(all_dois):,} unique papers...")
            query_texts = "SELECT doi, abstract FROM papers WHERE doi IN ({placeholders})"
            df_texts = _read_sql_in_chunks(conn, query_texts, all_dois)
            text_map = dict(zip(df_texts['doi'], df_texts['abstract']))

            # --- 6. Build and save the final dataset ---
            # Change note: the DOI columns were added to the output.
            df_final_dataset = pd.DataFrame({
                'doi_a': df_final_pairs['doi_a'],
                'doi_b': df_final_pairs['doi_b'],
                'abstract_a': df_final_pairs['doi_a'].map(text_map),
                'abstract_b': df_final_pairs['doi_b'].map(text_map),
                'label': df_final_pairs['label'],
                'data_paper_doi': df_final_pairs['data_paper_doi'],
            })

            # Pairs whose abstract could not be found are unusable for training.
            df_final_dataset = df_final_dataset.dropna(
                subset=['abstract_a', 'abstract_b']
            ).reset_index(drop=True)

            print(f"Saving final dataset with {len(df_final_dataset):,} pairs to {OUTPUT_FILE}...")
            df_final_dataset.to_csv(OUTPUT_FILE, index=False)

            print("\n--- Process Complete ---")
            display(df_final_dataset.head())

    except Exception as e:
        # Deliberately best-effort: report the failure instead of raising.
        print(f"üí• An error occurred: {e}")

# --- Run ---
create_training_dataset_with_dois()

--- Creating Training Dataset (Abstract Version with DOIs) ---


Generating Positive Pairs:   0%|          | 0/136 [00:00<?, ?it/s]

Generated 7,063 positive pairs.
Generated 28,252 negative pairs.
Fetching abstracts for 29,523 unique papers...
Saving final dataset with 35,315 pairs to data/processed/training_dataset_abstract_with_dois.csv...

--- Process Complete ---


Unnamed: 0,doi_a,doi_b,abstract_a,abstract_b,label,data_paper_doi
0,10.3390/ENVIRONSCIPROC2023026031,10.1007/S00484-023-02531-2,We present climatology and trends of the UTCI ...,The modern unambiguous climate change reveals ...,1,10.1002/GDJ3.102
1,10.3390/ENVIRONSCIPROC2023026031,10.1007/S00704-022-04129-X,We present climatology and trends of the UTCI ...,Outdoor thermal comfort (OTC) surveys require ...,1,10.1002/GDJ3.102
2,10.1007/S00484-023-02531-2,10.1007/S00704-022-04129-X,The modern unambiguous climate change reveals ...,Outdoor thermal comfort (OTC) surveys require ...,1,10.1002/GDJ3.102
3,10.3390/ENVIRONSCIPROC2023026031,10.1038/S41598-023-44286-1,We present climatology and trends of the UTCI ...,"In the months of March-June, India experiences...",1,10.1002/GDJ3.102
4,10.3390/ENVIRONSCIPROC2023026031,10.1029/2023GL104850,We present climatology and trends of the UTCI ...,Extreme compound heatwaves (ECHWs) have the po...,1,10.1002/GDJ3.102
