In [1]:
import pandas as pd
import sqlite3
import os
from IPython.display import display, HTML

# --- Settings ---
# Path to the SQLite database file inspected by database_health_check().
# It is a relative path, so it is resolved against the current working directory.
DB_PATH = "../data/processed/s2orc_filtered.db"

def _quote_identifier(identifier):
    """Return *identifier* as a double-quoted SQLite identifier."""
    # Table/column names cannot be bound as SQL parameters, so they are quoted
    # instead; an embedded double quote is escaped by doubling it.
    return '"' + str(identifier).replace('"', '""') + '"'


def _has_text_affinity(declared_type):
    """Return True if a declared column type maps to SQLite's TEXT affinity."""
    normalized = str(declared_type or "").upper()
    # SQLite's affinity rules test for "INT" before the text markers, so a
    # declared type containing "INT" is never treated as text.
    if "INT" in normalized:
        return False
    return any(marker in normalized for marker in ("CHAR", "CLOB", "TEXT"))


def database_health_check(db_path=None):
    """
    Diagnose the overall health of a SQLite database and print a report.

    The report lists every table with its schema and then, for each table,
    shows the total row count, the number and rate of NULL / empty-string
    values in each text column, and a sample of the first five rows.
    Empty tables only get their row count.

    Args:
        db_path: Path to the SQLite database file. When omitted (None), the
            module-level ``DB_PATH`` is used.

    Returns:
        None. Results are emitted with ``print`` and ``display``. If the
        database file does not exist, an error message is printed and the
        function returns early.
    """
    if db_path is None:
        db_path = DB_PATH

    print("--- Starting Database Health Check ---")
    # sqlite3.connect() would silently create a new, empty database for a
    # missing path, so check for the file first.
    if not os.path.exists(db_path):
        print(f"❌ Error: Database file not found at {db_path}")
        return

    # A sqlite3 connection used as a context manager only commits/rolls back;
    # it does not close the connection, so close it explicitly.
    conn = sqlite3.connect(db_path)
    try:
        # --- 1. List the tables and show their schemas ---
        tables = pd.read_sql_query(
            "SELECT name FROM sqlite_master WHERE type='table';", conn
        )
        print("✅ Found the following tables:")
        display(tables)

        # Keep each schema so the analysis pass does not have to query it again.
        schemas = {}
        for table_name in tables['name']:
            print(f"\n--- Schema for `{table_name}` table ---")
            schema = pd.read_sql_query(
                f"PRAGMA table_info({_quote_identifier(table_name)});", conn
            )
            schemas[table_name] = schema
            display(schema)

        # --- 2. Detailed analysis of each table ---
        for table_name, schema in schemas.items():
            quoted_table = _quote_identifier(table_name)
            print("\n" + "="*50)
            print(f"--- Detailed Analysis for `{table_name}` Table ---")

            # Total number of rows
            total_count_df = pd.read_sql_query(
                f"SELECT COUNT(*) FROM {quoted_table}", conn
            )
            total_count = int(total_count_df.iloc[0, 0])
            print(f"Total rows: {total_count:,}")

            # Nothing further to report for an empty table; skipping also
            # avoids dividing by zero in the missing-rate calculation.
            if total_count == 0:
                continue

            # NULL / empty-string check, restricted to text columns
            print("\n[Null/Empty Value Check]")
            null_checks = []
            for col_name, col_type in zip(schema['name'], schema['type']):
                if not _has_text_affinity(col_type):
                    continue
                quoted_col = _quote_identifier(col_name)
                query = (
                    f"SELECT COUNT(*) FROM {quoted_table} "
                    f"WHERE {quoted_col} IS NULL OR {quoted_col} = ''"
                )
                null_count = int(pd.read_sql_query(query, conn).iloc[0, 0])
                null_percent = (null_count / total_count) * 100
                null_checks.append({
                    "Column": col_name,
                    "Missing Count": f"{null_count:,}",
                    "Missing Rate": f"{null_percent:.2f}%"
                })
            if null_checks:
                display(pd.DataFrame(null_checks))

            # Show a sample of the data
            print("\n[Data Sample (first 5 rows)]")
            display(pd.read_sql_query(f"SELECT * FROM {quoted_table} LIMIT 5", conn))
    finally:
        conn.close()

# --- Run ---
# Execute the health check as soon as this cell/script runs.
database_health_check()

--- Starting Database Health Check ---
✅ Found the following tables:


Unnamed: 0,name
0,papers
1,citations
2,positive_candidates
3,full_texts



--- Schema for `papers` table ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,corpus_id,INTEGER,0,,1
1,1,doi,TEXT,0,,0
2,2,title,TEXT,0,,0
3,3,abstract,TEXT,0,,0



--- Schema for `citations` table ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,citing_doi,TEXT,0,,0
1,1,cited_doi,TEXT,0,,0



--- Schema for `positive_candidates` table ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,citing_doi,TEXT,0,,1
1,1,cited_datapaper_doi,TEXT,0,,2
2,2,cited_datapaper_title,TEXT,0,,0
3,3,annotation_status,INTEGER,0,0.0,0
4,4,annotation_source,TEXT,0,,0
5,5,is_pair_creatable,INTEGER,0,0.0,0



--- Schema for `full_texts` table ---


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,doi,TEXT,0,,1
1,1,full_text,TEXT,0,,0
2,2,cleaned_text,TEXT,0,,0



--- Detailed Analysis for `papers` Table ---
Total rows: 11,619,136

[Null/Empty Value Check]


Unnamed: 0,Column,Missing Count,Missing Rate
0,doi,0,0.00%
1,title,446635,3.84%
2,abstract,0,0.00%



[Data Sample (first 5 rows)]


Unnamed: 0,corpus_id,doi,title,abstract
0,14,10.4103/0250-474X.59543,Resolving Issues of Content Uniformity and Low...,The aim of present study were to arrest the pr...
1,16,10.3390/FI4020563,The U-City Paradigm: Opportunities and Risks f...,Volunteered Geographic Information (VGI) tools...
2,25,10.1038/CDDISCOVERY.2016.53,Generating kidney tissue from pluripotent stem...,With the isolation of human pluripotent stem c...
3,32,10.1155/2016/6183679,Influence of PEEK Coating on Hip Implant Stres...,Stress shielding is a well-known failure facto...
4,88,10.1216/RMJM/1181071856,Q-Reflexive Banach Spaces,Let E be a Banach space. There are several nat...



--- Detailed Analysis for `citations` Table ---
Total rows: 91,393,156

[Null/Empty Value Check]


Unnamed: 0,Column,Missing Count,Missing Rate
0,citing_doi,0,0.00%
1,cited_doi,0,0.00%



[Data Sample (first 5 rows)]


Unnamed: 0,citing_doi,cited_doi
0,10.3390/DIAGNOSTICS13091662,10.1007/S00590-017-1945-5
1,10.3390/DIAGNOSTICS13091662,10.3390/IJERPH191912484
2,10.3390/DIAGNOSTICS13091662,10.1148/RG.2019190046
3,10.3390/DIAGNOSTICS13091662,10.1007/S00256-021-03806-8
4,10.3390/DIAGNOSTICS13091662,10.3390/S20164481



--- Detailed Analysis for `positive_candidates` Table ---
Total rows: 12,429

[Null/Empty Value Check]


Unnamed: 0,Column,Missing Count,Missing Rate
0,citing_doi,0,0.00%
1,cited_datapaper_doi,0,0.00%
2,cited_datapaper_title,0,0.00%
3,annotation_source,2167,17.44%



[Data Sample (first 5 rows)]


Unnamed: 0,citing_doi,cited_datapaper_doi,cited_datapaper_title,annotation_status,annotation_source,is_pair_creatable
0,10.3390/CIMB44090267,10.1016/J.DIB.2017.08.003,Dataset on antitumor properties of silver nano...,1,machine_parallel,0
1,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2019.104905,Environmental and economic data on energy effi...,1,machine_parallel,1
2,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2018.02.038,Data on cost-optimal Nearly Zero Energy Buildi...,-1,machine_parallel,1
3,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2017.08.043,Data on European non-residential buildings,-1,machine_parallel,1
4,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2015.09.041,High performance solutions and data for nZEBs ...,-1,machine_parallel,1



--- Detailed Analysis for `full_texts` Table ---
Total rows: 11,249

[Null/Empty Value Check]


Unnamed: 0,Column,Missing Count,Missing Rate
0,doi,0,0.00%
1,full_text,0,0.00%
2,cleaned_text,0,0.00%



[Data Sample (first 5 rows)]


Unnamed: 0,doi,full_text,cleaned_text
0,10.3390/CIMB44090267,\n\n2022\n\n\nDepartment of Pharmacognosy\nFac...,2022 Department of Pharmacognosy Faculty of Ph...
1,10.1016/J.DIB.2021.107641,\nData on nearly zero energy buildings (NZEBs)...,Data on nearly zero energy buildings (NZEBs) p...
2,10.3390/V15101977,\nEvidence of a Protein-Coding Gene Antisense ...,Evidence of a Protein-Coding Gene Antisense to...
3,10.3390/IJMS21093113,\nBinder-Free α-MnO 2 Nanowires on Carbon Clot...,Binder-Free α-MnO 2 Nanowires on Carbon Cloth ...
4,10.1371/JOURNAL.PONE.0271458,\nMachine learning-based estimation of riverin...,Machine learning-based estimation of riverine ...
