In [15]:
import pandas as pd
import os
import gzip
import json
from tqdm.auto import tqdm
from IPython.display import display

# --- Step 1: 設定 ---
S2ORC_DIR = "../data/raw/s2orc/"

# --- Step 2: フィルタリング条件を定義 ---
MIN_ABSTRACT_LENGTH = 50
# ▼▼▼ 修正点: 本文の最小長を定義 ▼▼▼
MIN_BODY_TEXT_LENGTH = 1000 
# ▼▼▼ 修正点: 参考文献数の閾値を緩和 ▼▼▼
MIN_BIB_ENTRIES = 1

# (ヘルパー関数は変更なし)
def extract_annotated_text(full_text, annotation_str):
    if not full_text or not annotation_str: return ""
    try:
        spans = json.loads(annotation_str)
        if spans and isinstance(spans[0], dict) and 'start' in spans[0] and 'end' in spans[0]:
            start, end = int(spans[0]['start']), int(spans[0]['end'])
            return full_text[start:end]
    except: return ""
    return ""

# --- Step 3: データの読み込み ---
try:
    all_files = [f for f in os.listdir(S2ORC_DIR) if f.endswith('.gz')]
    if not all_files: raise FileNotFoundError("S2ORC directory is empty.")
    target_filepath = os.path.join(S2ORC_DIR, all_files[1])
    print(f"🔬 Analyzing shard: {os.path.basename(target_filepath)}")
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    target_filepath = None

records = []
if target_filepath:
    with gzip.open(target_filepath, 'rt', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading shard"):
            try:
                record = json.loads(line)
                if not isinstance(record, dict): continue
                
                content = record.get('content') or {}
                
                # ▼▼▼ 修正点: 本文(content.text)も抽出する ▼▼▼
                records.append({
                    'doi': record.get('externalids', {}).get('doi'),
                    'abstract': extract_annotated_text(content.get('text', ''), content.get('annotations', {}).get('abstract')),
                    'body_text': content.get('text', ''),
                    'bib_entries_count': len(record.get('bib_entries', {}))
                })
            except Exception:
                continue

df = pd.DataFrame(records)
print(f"\n✅ Loaded {len(df)} total papers from shard.")

# --- Step 4: 各フィルタリング条件を個別に評価 ---
if not df.empty:
    total_count = len(df)
    results = []

    # 1. DOI
    doi_passed = df[df['doi'].notna() & (df['doi'] != '')]
    results.append({"Filter": "Has DOI", "Passed Count": len(doi_passed)})

    # 2. アブストラクト
    abstract_passed = df[df['abstract'].str.len() >= MIN_ABSTRACT_LENGTH]
    results.append({"Filter": f"Abstract >= {MIN_ABSTRACT_LENGTH} chars", "Passed Count": len(abstract_passed)})

    # 3. 本文
    # ▼▼▼ 修正点: body_textの長さでチェック ▼▼▼
    body_text_passed = df[df['body_text'].str.len() >= MIN_BODY_TEXT_LENGTH]
    results.append({"Filter": f"Body Text >= {MIN_BODY_TEXT_LENGTH} chars", "Passed Count": len(body_text_passed)})

    # 4. 参考文献数
    bib_passed = df[df['bib_entries_count'] >= MIN_BIB_ENTRIES]
    results.append({"Filter": f"Bibliography >= {MIN_BIB_ENTRIES} entries", "Passed Count": len(bib_passed)})
    
    # --- Step 5: 結果をまとめて表示 ---
    df_results = pd.DataFrame(results)
    df_results['Pass Rate (%)'] = (df_results['Passed Count'] / total_count) * 100
    
    print("\n--- Independent Filter Analysis Results (Corrected) ---")
    print(f"Total papers analyzed: {total_count:,}")
    
    df_results['Pass Rate (%)'] = df_results['Pass Rate (%)'].map('{:.2f}%'.format)
    display(df_results)

🔬 Analyzing shard: 20250425_113502_00036_sv8h7_05186f24-8988-4922-9e27-1fa738badf18.gz


Reading shard: 54162it [00:46, 1172.47it/s]



✅ Loaded 54133 total papers from shard.

--- Independent Filter Analysis Results (Corrected) ---
Total papers analyzed: 54,133


Unnamed: 0,Filter,Passed Count,Pass Rate (%)
0,Has DOI,51324,94.81%
1,Abstract >= 50 chars,45684,84.39%
2,Body Text >= 1000 chars,53262,98.39%
3,Bibliography >= 1 entries,0,0.00%


In [4]:
import os
import gzip
import json
from IPython.display import display, JSON

# --- Step 1: 設定 ---
S2ORC_DIR = "../data/raw/s2orc/"
RECORDS_TO_INSPECT = 2 # 確認したいレコード数

# --- Step 2: 分析対象のファイルを選択 ---
try:
    all_files = [f for f in os.listdir(S2ORC_DIR) if f.endswith('.gz')]
    if not all_files: raise FileNotFoundError("S2ORC directory is empty.")
    target_filepath = os.path.join(S2ORC_DIR, all_files[0])
    print(f"🔬 Inspecting shard: {os.path.basename(target_filepath)}")
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    target_filepath = None

# --- Step 3: 先頭レコードの annotations 構造を調査 ---
if target_filepath:
    print(f"\n--- Inspecting 'annotations' in first {RECORDS_TO_INSPECT} records ---")
    
    try:
        with gzip.open(target_filepath, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= RECORDS_TO_INSPECT:
                    break
                
                print(f"\n--- Record {i+1} ---")
                
                try:
                    record = json.loads(line)
                    if not isinstance(record, dict):
                        print("Record is not a valid dictionary.")
                        continue
                    
                    # ▼▼▼ 修正点: 'content' -> 'annotations' を取得 ▼▼▼
                    annotations = record.get('content', {}).get('annotations')
                    
                    if annotations is not None:
                        # bibentry が存在するかどうかも併せて確認
                        bibentry_info = annotations.get('bibentry')
                        if bibentry_info:
                             print("Found 'annotations' (contains 'bibentry'):")
                        else:
                             print("Found 'annotations' (but 'bibentry' is missing or null):")
                        
                        # annotationsオブジェクト全体を綺麗に表示
                        display(JSON(annotations))
                    else:
                        print("'annotations' key does not exist in this record.")

                except (json.JSONDecodeError, KeyError) as e:
                    print(f"Could not process record: {e}")
                    
    except Exception as e:
        print(f"💥 An error occurred: {e}")

🔬 Inspecting shard: 20250425_113502_00036_sv8h7_033bf465-0cc7-401a-991c-01857fe91606.gz

--- Inspecting 'annotations' in first 2 records ---

--- Record 1 ---
Found 'annotations' (contains 'bibentry'):


<IPython.core.display.JSON object>


--- Record 2 ---
Found 'annotations' (contains 'bibentry'):


<IPython.core.display.JSON object>

In [5]:
import os
import gzip
import json

# --- 設定 ---
S2ORC_DIR = "../data/raw/s2orc/"

# --- ファイルを選択 ---
try:
    all_files = [f for f in os.listdir(S2ORC_DIR) if f.endswith('.gz')]
    if not all_files:
        raise FileNotFoundError("S2ORC directory is empty or does not exist.")
    
    target_filepath = os.path.join(S2ORC_DIR, all_files[0])
    print(f"🔬 Reading the first line from: {os.path.basename(target_filepath)}")
    
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    target_filepath = None

# --- 最初の1行を読み込んで表示 ---
if target_filepath:
    try:
        with gzip.open(target_filepath, 'rt', encoding='utf-8') as f:
            # 最初の1行だけを読み込む
            first_line = f.readline()
            
            if first_line:
                # 文字列をJSONオブジェクト（Pythonの辞書）に変換
                record = json.loads(first_line)
                
                print("\n--- Raw JSON of the First Record ---")
                # 整形して表示
                print(json.dumps(record, indent=2, ensure_ascii=False))
            else:
                print("File is empty.")

    except Exception as e:
        print(f"💥 An error occurred: {e}")

🔬 Reading the first line from: 20250425_113502_00036_sv8h7_033bf465-0cc7-401a-991c-01857fe91606.gz

--- Raw JSON of the First Record ---
{
  "corpusid": 234516512,
  "externalids": {
    "arxiv": null,
    "mag": "3134341373",
    "acl": null,
    "pubmed": null,
    "pubmedcentral": null,
    "dblp": null,
    "doi": "10.21608/aasj.2020.155061"
  },
  "content": {
    "source": {
      "pdfurls": [
        "https://aasj.journals.ekb.eg/article_155061_ee3648ea887b7cf2d87b2e1216f612d0.pdf"
      ],
      "pdfsha": "c2334dc4888cbfae9fb1b3d363b74588e9708ca9",
      "oainfo": {
        "license": null,
        "openaccessurl": "https://aasj.journals.ekb.eg/article_155061_ee3648ea887b7cf2d87b2e1216f612d0.pdf",
        "status": "GOLD"
      }
    },
    "text": "\nThe role of cryptocurrencies in financial transactions considering modern global conditions\n2020\n\nEman M Fouad eman.mostafa@azhar.edu.eg \nDepartment of Economics\nFaculty of Commerce (Girls)\nAl-Azhar University (Assiut Branch