In [1]:
import sys
import os
import pandas as pd
from dotenv import load_dotenv

# --- Step 1: モジュールをインポートするためのパス設定 ---
sys.path.append(os.path.abspath('..'))
from src.full_text_extractor import FullTextExtractor

# --- Step 2: 設定 ---
S2ORC_DIR = "../data/raw/s2orc/"
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 3: 全文テキストテーブルの構築を実行 ---
if __name__ == '__main__':
    print("--- Starting Full Text Table Construction ---")
    
    extractor = FullTextExtractor(db_path=DB_PATH)
    extractor.build_table(s2orc_dir=S2ORC_DIR)
    
    print("\n--- Verification ---")
    import sqlite3
    with sqlite3.connect(DB_PATH) as conn:
        count = pd.read_sql_query("SELECT COUNT(*) FROM full_texts", conn).iloc[0,0]
        print(f"Total full texts saved: {count:,}")
        display(pd.read_sql_query("SELECT * FROM full_texts LIMIT 5", conn))

  from .autonotebook import tqdm as notebook_tqdm


--- Starting Full Text Table Construction ---
Fetching target DOIs from `positive_candidates` table...
Found 11,249 unique candidate DOIs to extract.


Extracting Full Texts: 100%|██████████| 297/297 [08:17<00:00,  1.68s/it]

✅ Full text table construction complete.

--- Verification ---
Total full texts saved: 11,249





Unnamed: 0,doi,full_text
0,10.3390/CIMB44090267,\n\n2022\n\n\nDepartment of Pharmacognosy\nFac...
1,10.1016/J.DIB.2021.107641,\nData on nearly zero energy buildings (NZEBs)...
2,10.3390/V15101977,\nEvidence of a Protein-Coding Gene Antisense ...
3,10.3390/IJMS21093113,\nBinder-Free α-MnO 2 Nanowires on Carbon Clot...
4,10.1371/JOURNAL.PONE.0271458,\nMachine learning-based estimation of riverin...


In [3]:
import pandas as pd
import sqlite3
import os
import re
from tqdm.auto import tqdm

# --- Step 1: 設定と前処理関数の定義 ---
DB_PATH = "../data/processed/s2orc_filtered.db"

def preprocess_text_for_llm(text: str) -> str:
    """LLMへの入力用に、論文の全文テキストを前処理・クレンジングする"""
    if not isinstance(text, str): return ""
    text_before_refs = re.split(r'\n\s*(?:references|bibliography)\s*\n', text, maxsplit=1, flags=re.IGNORECASE)[0]
    text_no_urls = re.sub(r'https?://\S+|www\.\S+', '', text_before_refs)
    text_no_emails = re.sub(r'\S+@\S+', '', text_no_urls)
    text_no_newlines = text_no_emails.replace('\n', ' ').replace('\r', ' ')
    cleaned_text = re.sub(r'\s+', ' ', text_no_newlines).strip()
    return cleaned_text

# --- Step 2: データベースの更新 ---
def update_db_with_cleaned_text():
    print("--- Preprocessing full_text in the database ---")

    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.cursor()
        
        # 1. 新しいカラムを追加 (まだ存在しない場合)
        try:
            cursor.execute("ALTER TABLE full_texts ADD COLUMN cleaned_text TEXT")
            conn.commit()
            print("`cleaned_text` column added to `full_texts` table.")
        except sqlite3.OperationalError as e:
            # カラムが既に存在する場合のエラーは無視
            if "duplicate column name" in str(e):
                print("`cleaned_text` column already exists.")
            else:
                raise e

        # 2. 全てのfull_textを読み込んで前処理し、更新
        print("Fetching and preprocessing texts...")
        df = pd.read_sql_query("SELECT doi, full_text FROM full_texts", conn)
        
        # tqdmをpandasで使うための設定
        tqdm.pandas(desc="Cleaning text")
        df['cleaned_text'] = df['full_text'].progress_apply(preprocess_text_for_llm)
        
        # 3. 更新内容をDBに書き戻す
        print("Updating database with cleaned text...")
        update_data = [
            (row['cleaned_text'], row['doi']) for index, row in df.iterrows()
        ]
        
        update_query = "UPDATE full_texts SET cleaned_text = ? WHERE doi = ?"
        cursor.executemany(update_query, update_data)
        conn.commit()
        
        print("\n✅ Database update complete.")
        
        # 4. 結果のサンプルを表示
        print("\n--- Before and After Preprocessing (Sample) ---")
        df_sample = pd.read_sql_query("SELECT full_text, cleaned_text FROM full_texts LIMIT 1", conn).iloc[0]
        
        print("\n[Original Text (first 500 chars)]")
        print(df_sample['full_text'][:500])
        print("\n[Cleaned Text (first 500 chars)]")
        print(df_sample['cleaned_text'][:500])


# --- 実行 ---
update_db_with_cleaned_text()

  from .autonotebook import tqdm as notebook_tqdm


--- Preprocessing full_text in the database ---
`cleaned_text` column already exists.
Fetching and preprocessing texts...


Cleaning text: 100%|██████████| 11249/11249 [00:46<00:00, 244.07it/s]


Updating database with cleaned text...

✅ Database update complete.

--- Before and After Preprocessing (Sample) ---

[Original Text (first 500 chars)]


2022


Department of Pharmacognosy
Faculty of Pharmacy
Medical University of Sofia



Dunav Str
1000SofiaBulgaria

Curr. Issues Mol. Biol
44202210.3390/cimb44090267Received: 27 June 2022 Accepted: 21 August 2022Anticancer Secondary Metabolites: From Ethnopharmacology and Identification in Native Complexes to Biotechnological Studies in Species of Genus Astragalus L. and Gloriosa L. 3884-3904. https://doi.org/ 10.3390/cimb44090267 Academic Editor: Julius Liobikas
Citation: Ionkova, I.; Shkondro

[Cleaned Text (first 500 chars)]
2022 Department of Pharmacognosy Faculty of Pharmacy Medical University of Sofia Dunav Str 1000SofiaBulgaria Curr. Issues Mol. Biol 44202210.3390/cimb44090267Received: 27 June 2022 Accepted: 21 August 2022Anticancer Secondary Metabolites: From Ethnopharmacology and Identification in Native Complexes to Biotechno

正例ペアが作れるかどうかを確認


In [4]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: 設定 ---
DB_PATH = "../data/processed/s2orc_filtered.db"

def flag_creatable_pairs():
    """
    positive_candidatesテーブルに、ペア作成が可能かどうかのフラグを追加し、更新する。
    """
    print("--- Flagging pair-creatable candidates ---")

    if not os.path.exists(DB_PATH):
        print(f"❌ Error: Database file not found at {DB_PATH}")
        return

    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            
            # --- 1. is_pair_creatable 列を追加 (存在しない場合のみ) ---
            try:
                cursor.execute("ALTER TABLE positive_candidates ADD COLUMN is_pair_creatable INTEGER DEFAULT 0")
                print("`is_pair_creatable` column added and initialized to 0.")
            except sqlite3.OperationalError as e:
                if "duplicate column name" in str(e):
                    print("`is_pair_creatable` column already exists. Resetting to 0.")
                    cursor.execute("UPDATE positive_candidates SET is_pair_creatable = 0")
                else:
                    raise e
            
            # --- 2. ペア作成が可能なデータ論文のDOIリストを取得 ---
            query_eligible = """
                SELECT cited_datapaper_doi
                FROM positive_candidates
                GROUP BY cited_datapaper_doi
                HAVING COUNT(citing_doi) >= 2;
            """
            df_eligible = pd.read_sql_query(query_eligible, conn)
            eligible_dois = df_eligible['cited_datapaper_doi'].tolist()

            if not eligible_dois:
                print("No data papers found that are eligible for pair creation.")
                return
            
            print(f"Found {len(eligible_dois)} data papers eligible for pair creation.")

            # --- 3. 該当する行のフラグを1に更新 ---
            print("Updating flags for eligible candidates...")
            
            # プレースホルダーを作成 (?,?,?...)
            placeholders = ','.join('?' for _ in eligible_dois)
            update_query = f"UPDATE positive_candidates SET is_pair_creatable = 1 WHERE cited_datapaper_doi IN ({placeholders})"
            
            cursor.execute(update_query, eligible_dois)
            conn.commit()
            
            print(f"✅ Updated {cursor.rowcount} rows.")

            # --- 4. 結果の検証 ---
            print("\n--- Verification of Flags ---")
            df_verification = pd.read_sql_query("SELECT is_pair_creatable, COUNT(*) as count FROM positive_candidates GROUP BY is_pair_creatable", conn)
            display(df_verification)

    except Exception as e:
        print(f"💥 An error occurred: {e}")

# --- 実行 ---
flag_creatable_pairs()

--- Flagging pair-creatable candidates ---
`is_pair_creatable` column already exists. Resetting to 0.
Found 2305 data papers eligible for pair creation.
Updating flags for eligible candidates...
✅ Updated 9691 rows.

--- Verification of Flags ---


Unnamed: 0,is_pair_creatable,count
0,0,2738
1,1,9691


In [1]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: 設定 ---
DB_PATH = "../data/processed/s2orc_filtered.db"

def migrate_annotation_table():
    """
    positive_candidatesテーブルの構造を更新する。
    - annotation_status -> llm_annotation_status にリネーム
    - human_annotation_status 列を追加
    - annotation_source 列を削除
    """
    print("--- Migrating `positive_candidates` table schema ---")

    if not os.path.exists(DB_PATH):
        print(f"❌ Error: Database file not found at {DB_PATH}")
        return

    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()

            # --- 1. 現在のテーブル構造を確認 ---
            print("\n--- Schema Before Migration ---")
            schema_before = pd.read_sql_query("PRAGMA table_info(positive_candidates);", conn)
            display(schema_before)

            # --- 2. 既存テーブルをバックアップとしてリネーム ---
            try:
                cursor.execute("ALTER TABLE positive_candidates RENAME TO _positive_candidates_old")
                print("\n   1. Backed up existing table to '_positive_candidates_old'.")
            except sqlite3.OperationalError as e:
                if "no such table" in str(e):
                    print("   - No existing table found to migrate.")
                    return
                else:
                    raise e

            # --- 3. 新しい構造でテーブルを再作成 ---
            cursor.execute('''
                CREATE TABLE positive_candidates (
                    citing_doi TEXT,
                    cited_datapaper_doi TEXT,
                    cited_datapaper_title TEXT,
                    llm_annotation_status INTEGER, -- リネーム
                    human_annotation_status INTEGER DEFAULT 0, -- 新規追加
                    is_pair_creatable INTEGER,
                    PRIMARY KEY (citing_doi, cited_datapaper_doi)
                )
            ''')
            print("   2. Created new `positive_candidates` table with the correct schema.")

            # --- 4. バックアップからデータを移行 ---
            # 旧テーブルの annotation_status を新テーブルの llm_annotation_status にコピー
            insert_query = """
                INSERT INTO positive_candidates (
                    citing_doi, cited_datapaper_doi, cited_datapaper_title, 
                    llm_annotation_status, is_pair_creatable
                )
                SELECT 
                    citing_doi, cited_datapaper_doi, cited_datapaper_title,
                    annotation_status, is_pair_creatable
                FROM _positive_candidates_old;
            """
            cursor.execute(insert_query)
            conn.commit()
            print("   3. Migrated data from backup to the new table.")

            # --- 5. バックアップテーブルを削除 ---
            cursor.execute("DROP TABLE _positive_candidates_old")
            print("   4. Removed backup table.")

            # --- 6. 移行後のテーブル構造を確認 ---
            print("\n--- Schema After Migration ---")
            schema_after = pd.read_sql_query("PRAGMA table_info(positive_candidates);", conn)
            display(schema_after)

    except Exception as e:
        print(f"💥 An error occurred during migration: {e}")

# --- 実行 ---
migrate_annotation_table()

--- Migrating `positive_candidates` table schema ---

--- Schema Before Migration ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,citing_doi,TEXT,0,,1
1,1,cited_datapaper_doi,TEXT,0,,2
2,2,cited_datapaper_title,TEXT,0,,0
3,3,annotation_status,INTEGER,0,0.0,0
4,4,annotation_source,TEXT,0,,0
5,5,is_pair_creatable,INTEGER,0,0.0,0



   1. Backed up existing table to '_positive_candidates_old'.
   2. Created new `positive_candidates` table with the correct schema.
   3. Migrated data from backup to the new table.
   4. Removed backup table.

--- Schema After Migration ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,citing_doi,TEXT,0,,1
1,1,cited_datapaper_doi,TEXT,0,,2
2,2,cited_datapaper_title,TEXT,0,,0
3,3,llm_annotation_status,INTEGER,0,,0
4,4,human_annotation_status,INTEGER,0,0.0,0
5,5,is_pair_creatable,INTEGER,0,,0
