In [1]:
import sys
import os
import pandas as pd
from dotenv import load_dotenv

# --- Step 1: „É¢„Ç∏„É•„Éº„É´„Çí„Ç§„É≥„Éù„Éº„Éà„Åô„Çã„Åü„ÇÅ„ÅÆ„Éë„ÇπË®≠ÂÆö ---
sys.path.append(os.path.abspath('..'))
from src.full_text_extractor import FullTextExtractor

# --- Step 2: Ë®≠ÂÆö ---
S2ORC_DIR = "../data/raw/s2orc/"
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 3: ÂÖ®Êñá„ÉÜ„Ç≠„Çπ„Éà„ÉÜ„Éº„Éñ„É´„ÅÆÊßãÁØâ„ÇíÂÆüË°å ---
if __name__ == '__main__':
    print("--- Starting Full Text Table Construction ---")
    
    extractor = FullTextExtractor(db_path=DB_PATH)
    extractor.build_table(s2orc_dir=S2ORC_DIR)
    
    print("\n--- Verification ---")
    import sqlite3
    with sqlite3.connect(DB_PATH) as conn:
        count = pd.read_sql_query("SELECT COUNT(*) FROM full_texts", conn).iloc[0,0]
        print(f"Total full texts saved: {count:,}")
        display(pd.read_sql_query("SELECT * FROM full_texts LIMIT 5", conn))

  from .autonotebook import tqdm as notebook_tqdm


--- Starting Full Text Table Construction ---
Fetching target DOIs from `positive_candidates` table...
Found 11,249 unique candidate DOIs to extract.


Extracting Full Texts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 297/297 [08:17<00:00,  1.68s/it]

‚úÖ Full text table construction complete.

--- Verification ---
Total full texts saved: 11,249





Unnamed: 0,doi,full_text
0,10.3390/CIMB44090267,\n\n2022\n\n\nDepartment of Pharmacognosy\nFac...
1,10.1016/J.DIB.2021.107641,\nData on nearly zero energy buildings (NZEBs)...
2,10.3390/V15101977,\nEvidence of a Protein-Coding Gene Antisense ...
3,10.3390/IJMS21093113,\nBinder-Free Œ±-MnO 2 Nanowires on Carbon Clot...
4,10.1371/JOURNAL.PONE.0271458,\nMachine learning-based estimation of riverin...


In [3]:
import pandas as pd
import sqlite3
import os
import re
from tqdm.auto import tqdm

# --- Step 1: Ë®≠ÂÆö„Å®ÂâçÂá¶ÁêÜÈñ¢Êï∞„ÅÆÂÆöÁæ© ---
DB_PATH = "../data/processed/s2orc_filtered.db"

def preprocess_text_for_llm(text: str) -> str:
    """LLM„Å∏„ÅÆÂÖ•ÂäõÁî®„Å´„ÄÅË´ñÊñá„ÅÆÂÖ®Êñá„ÉÜ„Ç≠„Çπ„Éà„ÇíÂâçÂá¶ÁêÜ„Éª„ÇØ„É¨„É≥„Ç∏„É≥„Ç∞„Åô„Çã"""
    if not isinstance(text, str): return ""
    text_before_refs = re.split(r'\n\s*(?:references|bibliography)\s*\n', text, maxsplit=1, flags=re.IGNORECASE)[0]
    text_no_urls = re.sub(r'https?://\S+|www\.\S+', '', text_before_refs)
    text_no_emails = re.sub(r'\S+@\S+', '', text_no_urls)
    text_no_newlines = text_no_emails.replace('\n', ' ').replace('\r', ' ')
    cleaned_text = re.sub(r'\s+', ' ', text_no_newlines).strip()
    return cleaned_text

# --- Step 2: „Éá„Éº„Çø„Éô„Éº„Çπ„ÅÆÊõ¥Êñ∞ ---
def update_db_with_cleaned_text():
    print("--- Preprocessing full_text in the database ---")

    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.cursor()
        
        # 1. Êñ∞„Åó„ÅÑ„Ç´„É©„É†„ÇíËøΩÂä† („Åæ„Å†Â≠òÂú®„Åó„Å™„ÅÑÂ†¥Âêà)
        try:
            cursor.execute("ALTER TABLE full_texts ADD COLUMN cleaned_text TEXT")
            conn.commit()
            print("`cleaned_text` column added to `full_texts` table.")
        except sqlite3.OperationalError as e:
            # „Ç´„É©„É†„ÅåÊó¢„Å´Â≠òÂú®„Åô„ÇãÂ†¥Âêà„ÅÆ„Ç®„É©„Éº„ÅØÁÑ°Ë¶ñ
            if "duplicate column name" in str(e):
                print("`cleaned_text` column already exists.")
            else:
                raise e

        # 2. ÂÖ®„Å¶„ÅÆfull_text„ÇíË™≠„ÅøËæº„Çì„ÅßÂâçÂá¶ÁêÜ„Åó„ÄÅÊõ¥Êñ∞
        print("Fetching and preprocessing texts...")
        df = pd.read_sql_query("SELECT doi, full_text FROM full_texts", conn)
        
        # tqdm„Çípandas„Åß‰Ωø„ÅÜ„Åü„ÇÅ„ÅÆË®≠ÂÆö
        tqdm.pandas(desc="Cleaning text")
        df['cleaned_text'] = df['full_text'].progress_apply(preprocess_text_for_llm)
        
        # 3. Êõ¥Êñ∞ÂÜÖÂÆπ„ÇíDB„Å´Êõ∏„ÅçÊàª„Åô
        print("Updating database with cleaned text...")
        update_data = [
            (row['cleaned_text'], row['doi']) for index, row in df.iterrows()
        ]
        
        update_query = "UPDATE full_texts SET cleaned_text = ? WHERE doi = ?"
        cursor.executemany(update_query, update_data)
        conn.commit()
        
        print("\n‚úÖ Database update complete.")
        
        # 4. ÁµêÊûú„ÅÆ„Çµ„É≥„Éó„É´„ÇíË°®Á§∫
        print("\n--- Before and After Preprocessing (Sample) ---")
        df_sample = pd.read_sql_query("SELECT full_text, cleaned_text FROM full_texts LIMIT 1", conn).iloc[0]
        
        print("\n[Original Text (first 500 chars)]")
        print(df_sample['full_text'][:500])
        print("\n[Cleaned Text (first 500 chars)]")
        print(df_sample['cleaned_text'][:500])


# --- ÂÆüË°å ---
update_db_with_cleaned_text()

  from .autonotebook import tqdm as notebook_tqdm


--- Preprocessing full_text in the database ---
`cleaned_text` column already exists.
Fetching and preprocessing texts...


Cleaning text: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11249/11249 [00:46<00:00, 244.07it/s]


Updating database with cleaned text...

‚úÖ Database update complete.

--- Before and After Preprocessing (Sample) ---

[Original Text (first 500 chars)]


2022


Department of Pharmacognosy
Faculty of Pharmacy
Medical University of Sofia



Dunav Str
1000SofiaBulgaria

Curr. Issues Mol. Biol
44202210.3390/cimb44090267Received: 27 June 2022 Accepted: 21 August 2022Anticancer Secondary Metabolites: From Ethnopharmacology and Identification in Native Complexes to Biotechnological Studies in Species of Genus Astragalus L. and Gloriosa L. 3884-3904. https://doi.org/ 10.3390/cimb44090267 Academic Editor: Julius Liobikas
Citation: Ionkova, I.; Shkondro

[Cleaned Text (first 500 chars)]
2022 Department of Pharmacognosy Faculty of Pharmacy Medical University of Sofia Dunav Str 1000SofiaBulgaria Curr. Issues Mol. Biol 44202210.3390/cimb44090267Received: 27 June 2022 Accepted: 21 August 2022Anticancer Secondary Metabolites: From Ethnopharmacology and Identification in Native Complexes to Biotech

Ê≠£‰æã„Éö„Ç¢„Åå‰Ωú„Çå„Çã„Åã„Å©„ÅÜ„Åã„ÇíÁ¢∫Ë™ç


In [4]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Ë®≠ÂÆö ---
DB_PATH = "../data/processed/s2orc_filtered.db"

def flag_creatable_pairs():
    """
    positive_candidates„ÉÜ„Éº„Éñ„É´„Å´„ÄÅ„Éö„Ç¢‰ΩúÊàê„ÅåÂèØËÉΩ„Åã„Å©„ÅÜ„Åã„ÅÆ„Éï„É©„Ç∞„ÇíËøΩÂä†„Åó„ÄÅÊõ¥Êñ∞„Åô„Çã„ÄÇ
    """
    print("--- Flagging pair-creatable candidates ---")

    if not os.path.exists(DB_PATH):
        print(f"‚ùå Error: Database file not found at {DB_PATH}")
        return

    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            
            # --- 1. is_pair_creatable Âàó„ÇíËøΩÂä† (Â≠òÂú®„Åó„Å™„ÅÑÂ†¥Âêà„ÅÆ„Åø) ---
            try:
                cursor.execute("ALTER TABLE positive_candidates ADD COLUMN is_pair_creatable INTEGER DEFAULT 0")
                print("`is_pair_creatable` column added and initialized to 0.")
            except sqlite3.OperationalError as e:
                if "duplicate column name" in str(e):
                    print("`is_pair_creatable` column already exists. Resetting to 0.")
                    cursor.execute("UPDATE positive_candidates SET is_pair_creatable = 0")
                else:
                    raise e
            
            # --- 2. „Éö„Ç¢‰ΩúÊàê„ÅåÂèØËÉΩ„Å™„Éá„Éº„ÇøË´ñÊñá„ÅÆDOI„É™„Çπ„Éà„ÇíÂèñÂæó ---
            query_eligible = """
                SELECT cited_datapaper_doi
                FROM positive_candidates
                GROUP BY cited_datapaper_doi
                HAVING COUNT(citing_doi) >= 2;
            """
            df_eligible = pd.read_sql_query(query_eligible, conn)
            eligible_dois = df_eligible['cited_datapaper_doi'].tolist()

            if not eligible_dois:
                print("No data papers found that are eligible for pair creation.")
                return
            
            print(f"Found {len(eligible_dois)} data papers eligible for pair creation.")

            # --- 3. Ë©≤ÂΩì„Åô„ÇãË°å„ÅÆ„Éï„É©„Ç∞„Çí1„Å´Êõ¥Êñ∞ ---
            print("Updating flags for eligible candidates...")
            
            # „Éó„É¨„Éº„Çπ„Éõ„É´„ÉÄ„Éº„Çí‰ΩúÊàê (?,?,?...)
            placeholders = ','.join('?' for _ in eligible_dois)
            update_query = f"UPDATE positive_candidates SET is_pair_creatable = 1 WHERE cited_datapaper_doi IN ({placeholders})"
            
            cursor.execute(update_query, eligible_dois)
            conn.commit()
            
            print(f"‚úÖ Updated {cursor.rowcount} rows.")

            # --- 4. ÁµêÊûú„ÅÆÊ§úË®º ---
            print("\n--- Verification of Flags ---")
            df_verification = pd.read_sql_query("SELECT is_pair_creatable, COUNT(*) as count FROM positive_candidates GROUP BY is_pair_creatable", conn)
            display(df_verification)

    except Exception as e:
        print(f"üí• An error occurred: {e}")

# --- ÂÆüË°å ---
flag_creatable_pairs()

--- Flagging pair-creatable candidates ---
`is_pair_creatable` column already exists. Resetting to 0.
Found 2305 data papers eligible for pair creation.
Updating flags for eligible candidates...
‚úÖ Updated 9691 rows.

--- Verification of Flags ---


Unnamed: 0,is_pair_creatable,count
0,0,2738
1,1,9691


In [1]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Ë®≠ÂÆö ---
DB_PATH = "../data/processed/s2orc_filtered.db"

def migrate_annotation_table():
    """
    positive_candidates„ÉÜ„Éº„Éñ„É´„ÅÆÊßãÈÄ†„ÇíÊõ¥Êñ∞„Åô„Çã„ÄÇ
    - annotation_status -> llm_annotation_status „Å´„É™„Éç„Éº„É†
    - human_annotation_status Âàó„ÇíËøΩÂä†
    - annotation_source Âàó„ÇíÂâäÈô§
    """
    print("--- Migrating `positive_candidates` table schema ---")

    if not os.path.exists(DB_PATH):
        print(f"‚ùå Error: Database file not found at {DB_PATH}")
        return

    try:
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()

            # --- 1. ÁèæÂú®„ÅÆ„ÉÜ„Éº„Éñ„É´ÊßãÈÄ†„ÇíÁ¢∫Ë™ç ---
            print("\n--- Schema Before Migration ---")
            schema_before = pd.read_sql_query("PRAGMA table_info(positive_candidates);", conn)
            display(schema_before)

            # --- 2. Êó¢Â≠ò„ÉÜ„Éº„Éñ„É´„Çí„Éê„ÉÉ„ÇØ„Ç¢„ÉÉ„Éó„Å®„Åó„Å¶„É™„Éç„Éº„É† ---
            try:
                cursor.execute("ALTER TABLE positive_candidates RENAME TO _positive_candidates_old")
                print("\n   1. Backed up existing table to '_positive_candidates_old'.")
            except sqlite3.OperationalError as e:
                if "no such table" in str(e):
                    print("   - No existing table found to migrate.")
                    return
                else:
                    raise e

            # --- 3. Êñ∞„Åó„ÅÑÊßãÈÄ†„Åß„ÉÜ„Éº„Éñ„É´„ÇíÂÜç‰ΩúÊàê ---
            cursor.execute('''
                CREATE TABLE positive_candidates (
                    citing_doi TEXT,
                    cited_datapaper_doi TEXT,
                    cited_datapaper_title TEXT,
                    llm_annotation_status INTEGER, -- „É™„Éç„Éº„É†
                    human_annotation_status INTEGER DEFAULT 0, -- Êñ∞Ë¶èËøΩÂä†
                    is_pair_creatable INTEGER,
                    PRIMARY KEY (citing_doi, cited_datapaper_doi)
                )
            ''')
            print("   2. Created new `positive_candidates` table with the correct schema.")

            # --- 4. „Éê„ÉÉ„ÇØ„Ç¢„ÉÉ„Éó„Åã„Çâ„Éá„Éº„Çø„ÇíÁßªË°å ---
            # Êóß„ÉÜ„Éº„Éñ„É´„ÅÆ annotation_status „ÇíÊñ∞„ÉÜ„Éº„Éñ„É´„ÅÆ llm_annotation_status „Å´„Ç≥„Éî„Éº
            insert_query = """
                INSERT INTO positive_candidates (
                    citing_doi, cited_datapaper_doi, cited_datapaper_title, 
                    llm_annotation_status, is_pair_creatable
                )
                SELECT 
                    citing_doi, cited_datapaper_doi, cited_datapaper_title,
                    annotation_status, is_pair_creatable
                FROM _positive_candidates_old;
            """
            cursor.execute(insert_query)
            conn.commit()
            print("   3. Migrated data from backup to the new table.")

            # --- 5. „Éê„ÉÉ„ÇØ„Ç¢„ÉÉ„Éó„ÉÜ„Éº„Éñ„É´„ÇíÂâäÈô§ ---
            cursor.execute("DROP TABLE _positive_candidates_old")
            print("   4. Removed backup table.")

            # --- 6. ÁßªË°åÂæå„ÅÆ„ÉÜ„Éº„Éñ„É´ÊßãÈÄ†„ÇíÁ¢∫Ë™ç ---
            print("\n--- Schema After Migration ---")
            schema_after = pd.read_sql_query("PRAGMA table_info(positive_candidates);", conn)
            display(schema_after)

    except Exception as e:
        print(f"üí• An error occurred during migration: {e}")

# --- ÂÆüË°å ---
migrate_annotation_table()

--- Migrating `positive_candidates` table schema ---

--- Schema Before Migration ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,citing_doi,TEXT,0,,1
1,1,cited_datapaper_doi,TEXT,0,,2
2,2,cited_datapaper_title,TEXT,0,,0
3,3,annotation_status,INTEGER,0,0.0,0
4,4,annotation_source,TEXT,0,,0
5,5,is_pair_creatable,INTEGER,0,0.0,0



   1. Backed up existing table to '_positive_candidates_old'.
   2. Created new `positive_candidates` table with the correct schema.
   3. Migrated data from backup to the new table.
   4. Removed backup table.

--- Schema After Migration ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,citing_doi,TEXT,0,,1
1,1,cited_datapaper_doi,TEXT,0,,2
2,2,cited_datapaper_title,TEXT,0,,0
3,3,llm_annotation_status,INTEGER,0,,0
4,4,human_annotation_status,INTEGER,0,0.0,0
5,5,is_pair_creatable,INTEGER,0,,0
