In [1]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Configuration ---

# File path of the SQLite database
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 2: Database analysis ---

def analyze_positive_candidates():
    """
    Analyze the contents of the `positive_candidates` table and display statistics.

    Reads the SQLite database at the module-level ``DB_PATH`` and reports:

    * overall counts: total rows, distinct ``citing_doi`` values and distinct
      ``cited_datapaper_doi`` values;
    * ``describe()`` statistics of the number of candidate rows per data paper;
    * the ten data papers with the most candidate rows, with the title looked
      up in the ``papers`` table (the title is empty when ``papers`` has no
      row for that DOI).

    Returns:
        None. Results are printed or rendered with ``display``. When the
        database file or the table is missing a message is printed and the
        function returns early. Exceptions raised while querying are caught
        and reported as a printed message.
    """
    print("--- Analyzing `positive_candidates` Table ---")

    if not os.path.exists(DB_PATH):
        print(f"‚ùå Error: Database file not found at {DB_PATH}")
        return

    conn = None
    try:
        # `with sqlite3.connect(...)` only wraps a transaction; it does not
        # close the connection, so the handle is closed explicitly in `finally`.
        conn = sqlite3.connect(DB_PATH)

        # Make sure the table exists before querying it
        table_check = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='positive_candidates'"
        )
        if table_check.fetchone() is None:
            print("‚ö†Ô∏è `positive_candidates` table not found.")
            print("   Please run the `06_identify_and_store_candidates.ipynb` script first.")
            return

        # --- 2.1: Overall statistics ---
        print("\n--- Overall Statistics ---")

        # A single pass over the table yields all three headline numbers.
        overall_query = """
            SELECT
                COUNT(*) AS total_links,
                COUNT(DISTINCT citing_doi) AS unique_citing,
                COUNT(DISTINCT cited_datapaper_doi) AS unique_cited
            FROM positive_candidates
        """
        overall = pd.read_sql_query(overall_query, conn).iloc[0]
        total_links = overall['total_links']
        unique_citing = overall['unique_citing']
        unique_cited = overall['unique_cited']

        print(f"Total positive candidate links found: {total_links:,}")
        print(f"Unique citing papers (Positive Candidates): {unique_citing:,}")
        print(f"Unique data papers with at least one citer in S2ORC: {unique_cited:,}")

        # --- 2.2: Distribution of citation counts (candidate counts) per data paper ---
        print("\n" + "="*50)
        print("--- Distribution of Citing Papers per Data Paper ---")

        query = """
            SELECT 
                cited_datapaper_doi, 
                COUNT(citing_doi) AS candidate_count
            FROM positive_candidates
            GROUP BY cited_datapaper_doi
        """
        df_distribution = pd.read_sql_query(query, conn)

        print("\n[Statistics of candidate counts per data paper]")
        display(df_distribution['candidate_count'].describe())

        # --- 2.3: Top 10 data papers with the most positive candidates ---
        print("\n--- Top 10 Data Papers with Most Candidates in S2ORC ---")

        # The ranking is computed first and limited to ten rows; titles are then
        # attached with a LEFT JOIN so that a data paper without a row in
        # `papers` stays in the ranking (with an empty title) instead of being
        # dropped. The DOI is a tie-breaker so equal counts sort reproducibly.
        query_top10 = """
            SELECT
                T1.cited_datapaper_doi,
                T2.title,
                T1.candidate_count
            FROM (
                SELECT 
                    cited_datapaper_doi, 
                    COUNT(citing_doi) AS candidate_count
                FROM positive_candidates
                GROUP BY cited_datapaper_doi
                ORDER BY candidate_count DESC, cited_datapaper_doi
                LIMIT 10
            ) AS T1
            LEFT JOIN papers AS T2 ON T1.cited_datapaper_doi = T2.doi
            ORDER BY T1.candidate_count DESC, T1.cited_datapaper_doi;
        """
        df_top10 = pd.read_sql_query(query_top10, conn)
        display(df_top10)

    except Exception as e:
        print(f"üí• An error occurred while analyzing the database: {e}")
    finally:
        if conn is not None:
            conn.close()

# --- Run ---
analyze_positive_candidates()

--- Analyzing `positive_candidates` Table ---

--- Overall Statistics ---
Total positive candidate links found: 12,429
Unique citing papers (Positive Candidates): 11,249
Unique data papers with at least one citer in S2ORC: 5,043

--- Distribution of Citing Papers per Data Paper ---

[Statistics of candidate counts per data paper]


count    5043.000000
mean        2.464604
std         4.882605
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       197.000000
Name: candidate_count, dtype: float64


--- Top 10 Data Papers with Most Candidates in S2ORC ---


Unnamed: 0,cited_datapaper_doi,title,candidate_count
0,10.1038/S41597-020-0453-3,Version 4 of the CRU TS monthly high-resolutio...,197
1,10.5194/ESSD-13-4349-2021,ERA5-Land: A state-of-the-art global reanalysi...,92
2,10.1038/S41597-020-0534-3,The FLUXNET2015 dataset and the ONEFlux proces...,81
3,10.1007/S11558-019-09344-2,The KOF Globalisation Index -revisited,69
4,10.1016/J.DIB.2020.105340,Application of the ARIMA model on the COVID- 2...,69
5,10.1038/S41597-020-0462-2,High resolution temporal profiles in the Emiss...,55
6,10.1038/S41597-020-0369-Y,The first high-resolution meteorological forci...,54
7,10.1016/J.DIB.2019.104863,Dataset of breast ultrasound images,50
8,10.1038/S41597-020-00688-8,a cross-country database of COVID-19 testing,49
9,10.1038/S41597-020-0448-0,Epidemiological data from the COVID-19 outbrea...,48


In [2]:
import pandas as pd
import sqlite3
import json
import os
from tqdm.auto import tqdm
from IPython.display import display

# --- Step 1: Configuration ---

# File path of the data-paper list collected from Scopus
SCOPUS_DATAPAPERS_FILE = "../data/datapapers/raw/all_datapapers_raw.json"
# File path of the S2ORC database that was built
S2ORC_DB_PATH = "../data/processed/s2orc_filtered.db"


# --- Step 2: Analyze the database ---

def analyze_s2orc_datapaper_stats():
    """
    Analyze and display statistics about data papers inside the S2ORC database.

    Steps:

    1. Load the Scopus JSON at ``SCOPUS_DATAPAPERS_FILE`` and collect the
       upper-cased ``prism:doi`` values.
    2. Intersect them with the ``doi`` column of the ``papers`` table in the
       database at ``S2ORC_DB_PATH``.
    3. Restrict ``positive_candidates`` to rows whose ``cited_datapaper_doi``
       is in that intersection, then report the number of distinct citing
       DOIs, the per-data-paper row-count distribution, how many data papers
       have a count of at least 2, and the top 10 with titles.

    Returns:
        None. Results are printed or rendered with ``display``. A message is
        printed and the function returns early when either input file is
        missing or when the intersection is empty.
    """
    print("--- Analyzing Data Paper Statistics within S2ORC Database ---")

    # --- 2.1: Load the list of data-paper DOIs from Scopus ---
    if not os.path.exists(SCOPUS_DATAPAPERS_FILE):
        print(f"‚ùå Error: Scopus file not found at {SCOPUS_DATAPAPERS_FILE}")
        return
    with open(SCOPUS_DATAPAPERS_FILE, 'r', encoding='utf-8') as f:
        scopus_papers = json.load(f)
    # Entries without a DOI are skipped; DOIs are upper-cased before comparison.
    # NOTE(review): the DB-side DOIs are compared as stored — presumably they
    # are already upper-case; confirm against the notebook that builds the DB.
    scopus_datapaper_dois = {p['prism:doi'].upper() for p in scopus_papers if p.get('prism:doi')}

    # --- 2.2: Connect to the S2ORC database ---
    if not os.path.exists(S2ORC_DB_PATH):
        print(f"‚ùå Error: S2ORC DB not found at {S2ORC_DB_PATH}")
        return

    # `with sqlite3.connect(...)` only wraps a transaction; it does not close
    # the connection, so the handle is closed explicitly in `finally`.
    conn = sqlite3.connect(S2ORC_DB_PATH)
    try:
        # --- 2.3: Identify the data papers (DOIs) that are present in S2ORC ---
        print("\nFinding common data papers...")
        # Fetch every DOI of the `papers` table once
        s2orc_dois_in_papers = set(pd.read_sql_query("SELECT doi FROM papers", conn)['doi'])

        # Set of data-paper DOIs that exist in S2ORC
        datapapers_in_s2orc = scopus_datapaper_dois.intersection(s2orc_dois_in_papers)

        if not datapapers_in_s2orc:
            print("No common data papers found between Scopus list and S2ORC DB.")
            return

        print(f"‚úÖ S2ORCÂÜÖ„ÅßË¶ã„Å§„Åã„Å£„Åü„Éá„Éº„ÇøË´ñÊñáÊï∞: {len(datapapers_in_s2orc):,}")

        # --- 2.4: Count the papers that cite those data papers ---
        # Only the two columns used below are loaded from `positive_candidates`.
        df_candidates = pd.read_sql_query(
            "SELECT citing_doi, cited_datapaper_doi FROM positive_candidates", conn
        )

        # Keep only links whose cited data paper exists in S2ORC
        df_candidates_filtered = df_candidates[df_candidates['cited_datapaper_doi'].isin(datapapers_in_s2orc)]

        print(f"‚úÖ S2ORCÂÜÖ„ÅßË¶ã„Å§„Åã„Å£„Åü„Éá„Éº„ÇøË´ñÊñá„ÇíÂºïÁî®„Åó„Å¶„ÅÑ„ÇãË´ñÊñáÊï∞Ôºà„É¶„Éã„Éº„ÇØÔºâ: {df_candidates_filtered['citing_doi'].nunique():,}")

        # --- 2.5: Citation count of each data paper within S2ORC ---
        print("\n--- Calculating citation counts for each data paper within S2ORC ---")

        # value_counts() sorts by count descending, so head(10) below is the top 10.
        s2orc_citation_counts = df_candidates_filtered['cited_datapaper_doi'].value_counts().reset_index()
        s2orc_citation_counts.columns = ['datapaper_doi', 's2orc_citedby_count']

        print("\n[Distribution of citation counts within S2ORC]")
        display(s2orc_citation_counts['s2orc_citedby_count'].describe())

        # --- 2.6: Number of data papers cited at least twice within S2ORC ---
        count_cited_by_2_or_more = len(s2orc_citation_counts[s2orc_citation_counts['s2orc_citedby_count'] >= 2])

        print(f"\n‚úÖ S2ORCÂÜÖ„Åß„ÅÆË¢´ÂºïÁî®Êï∞„Åå2‰ª•‰∏ä„Åß„ÅÇ„Çã„Éá„Éº„ÇøË´ñÊñáÊï∞: {count_cited_by_2_or_more:,}")

        # Also show the top 10 for reference
        print("\n--- Top 10 most cited data papers within S2ORC ---")
        top10_counts = s2orc_citation_counts.head(10)
        top10_dois = top10_counts['datapaper_doi'].tolist()

        # Look up titles only for the DOIs being displayed instead of loading
        # the whole `papers` table a second time. SQLite accepts an empty
        # `IN ()` list, so an empty top-10 simply yields no title rows.
        placeholders = ','.join('?' for _ in top10_dois)
        df_papers_info = pd.read_sql_query(
            f"SELECT doi, title FROM papers WHERE doi IN ({placeholders})",
            conn,
            params=top10_dois,
        )
        df_top10 = pd.merge(top10_counts, df_papers_info, left_on='datapaper_doi', right_on='doi', how='left')
        display(df_top10[['datapaper_doi', 'title', 's2orc_citedby_count']])
    finally:
        conn.close()


# --- Run ---
analyze_s2orc_datapaper_stats()

  from .autonotebook import tqdm as notebook_tqdm


--- Analyzing Data Paper Statistics within S2ORC Database ---

Finding common data papers...
‚úÖ S2ORCÂÜÖ„ÅßË¶ã„Å§„Åã„Å£„Åü„Éá„Éº„ÇøË´ñÊñáÊï∞: 7,692
‚úÖ S2ORCÂÜÖ„ÅßË¶ã„Å§„Åã„Å£„Åü„Éá„Éº„ÇøË´ñÊñá„ÇíÂºïÁî®„Åó„Å¶„ÅÑ„ÇãË´ñÊñáÊï∞Ôºà„É¶„Éã„Éº„ÇØÔºâ: 8,192

--- Calculating citation counts for each data paper within S2ORC ---

[Distribution of citation counts within S2ORC]


count    3303.000000
mean        2.697548
std         5.652057
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max       197.000000
Name: s2orc_citedby_count, dtype: float64


‚úÖ S2ORCÂÜÖ„Åß„ÅÆË¢´ÂºïÁî®Êï∞„Åå2‰ª•‰∏ä„Åß„ÅÇ„Çã„Éá„Éº„ÇøË´ñÊñáÊï∞: 1,601

--- Top 10 most cited data papers within S2ORC ---


Unnamed: 0,datapaper_doi,title,s2orc_citedby_count
0,10.1038/S41597-020-0453-3,Version 4 of the CRU TS monthly high-resolutio...,197
1,10.5194/ESSD-13-4349-2021,ERA5-Land: A state-of-the-art global reanalysi...,92
2,10.1038/S41597-020-0534-3,The FLUXNET2015 dataset and the ONEFlux proces...,81
3,10.1007/S11558-019-09344-2,The KOF Globalisation Index -revisited,69
4,10.1016/J.DIB.2020.105340,Application of the ARIMA model on the COVID- 2...,69
5,10.1038/S41597-020-0462-2,High resolution temporal profiles in the Emiss...,55
6,10.1038/S41597-020-0369-Y,The first high-resolution meteorological forci...,54
7,10.1016/J.DIB.2019.104863,Dataset of breast ultrasound images,50
8,10.1038/S41597-020-00688-8,a cross-country database of COVID-19 testing,49
9,10.1038/S41597-020-0448-0,Epidemiological data from the COVID-19 outbrea...,48


In [1]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Configuration ---
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 2: Database analysis ---
def analyze_multiple_citations():
    """
    Investigate and report how many papers cite more than one data paper.

    Reads ``positive_candidates`` from the SQLite database at the module-level
    ``DB_PATH``, counts the distinct ``cited_datapaper_doi`` values per
    ``citing_doi`` and keeps the citing papers whose count exceeds 1. It then
    shows how many papers fall into each count and the ten papers with the
    highest counts, with the title looked up in the ``papers`` table (the
    title is empty when ``papers`` has no row for that DOI).

    Returns:
        None. Results are printed or rendered with ``display``. When the
        database file or the table is missing, or no paper qualifies, a
        message is printed and the function returns early. Exceptions raised
        while querying are caught and reported as a printed message.
    """
    print("--- Analyzing Papers That Cite Multiple Data Papers ---")

    if not os.path.exists(DB_PATH):
        print(f"‚ùå Error: Database file not found at {DB_PATH}")
        return

    conn = None
    try:
        # `with sqlite3.connect(...)` only wraps a transaction; it does not
        # close the connection, so the handle is closed explicitly in `finally`.
        conn = sqlite3.connect(DB_PATH)

        # Make sure the `positive_candidates` table exists
        table_check = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='positive_candidates'"
        )
        if table_check.fetchone() is None:
            print("‚ö†Ô∏è `positive_candidates` table not found.")
            return

        # --- 2.1: Count how many data papers each citing paper references ---
        # DISTINCT keeps a repeated (citing, cited) row from being mistaken for
        # a second data paper; the DOI tie-breaker makes the ordering (and thus
        # the top-10 selection below) reproducible.
        query = """
            SELECT
                citing_doi,
                COUNT(DISTINCT cited_datapaper_doi) AS num_cited_datapapers
            FROM
                positive_candidates
            GROUP BY
                citing_doi
            HAVING
                COUNT(DISTINCT cited_datapaper_doi) > 1
            ORDER BY
                num_cited_datapapers DESC,
                citing_doi;
        """
        df_multiple_citers = pd.read_sql_query(query, conn)

        # --- 2.2: Report the results ---
        if df_multiple_citers.empty:
            print("\n‚úÖ No papers found that cite more than one data paper from our list.")
            return

        print(f"\nFound {len(df_multiple_citers):,} papers that cite more than one data paper.")

        print("\n--- Distribution of cited data paper counts ---")
        # For each number of cited data papers, how many citing papers have that number
        display(df_multiple_citers['num_cited_datapapers'].value_counts().sort_index().to_frame("Number of Papers"))

        print("\n--- Top 10 Papers Citing the Most Data Papers ---")
        # Look up titles in the `papers` table for the DOIs being displayed
        df_top10 = df_multiple_citers.head(10)
        top10_dois = df_top10['citing_doi'].tolist()
        placeholders = ','.join('?' for _ in top10_dois)

        df_papers_info = pd.read_sql_query(f"SELECT doi, title FROM papers WHERE doi IN ({placeholders})", conn, params=top10_dois)

        # Left merge: a citing paper without a row in `papers` stays in the
        # ranking with an empty title instead of silently disappearing.
        df_top10_details = pd.merge(df_top10, df_papers_info, left_on='citing_doi', right_on='doi', how='left')
        display(df_top10_details[['citing_doi', 'title', 'num_cited_datapapers']])

    except Exception as e:
        print(f"üí• An error occurred: {e}")
    finally:
        if conn is not None:
            conn.close()

# --- Run ---
analyze_multiple_citations()

--- Analyzing Papers That Cite Multiple Data Papers ---

Found 874 papers that cite more than one data paper.

--- Distribution of cited data paper counts ---


Unnamed: 0_level_0,Number of Papers
num_cited_datapapers,Unnamed: 1_level_1
2,677
3,122
4,50
5,18
6,6
8,1



--- Top 10 Papers Citing the Most Data Papers ---


Unnamed: 0,citing_doi,title,num_cited_datapapers
0,10.33263/LIANBS113.36893699,"Design, Synthesis, Spectroscopic Characterizat...",8
1,10.1016/J.DIB.2020.106712,Data compilation regarding the effects of grai...,6
2,10.1016/J.DIB.2024.110216,Data in Brief,6
3,10.1016/J.ENERGY.2021.122479,How will future climate impact the design and ...,6
4,10.1021/ACS.JCIM.3C01281,Snowball 2.0: Generic Material Data Parser for...,6
5,10.3889/OAMJMS.2019.109,Survival Analysis of Cancer Patients in North ...,6
6,10.5194/ESSD-15-1-2023,The World Atlas of Last Interglacial Shoreline...,6
7,10.1007/S13762-023-04872-2,Modified biomass adsorbents for removal of org...,5
8,10.1016/J.COMPENVURBSYS.2022.101809,Global Building Morphology Indicators,5
9,10.1016/J.COMPENVURBSYS.2024.102104,Building footprint data for countries in Afric...,5
