In [4]:
import pandas as pd
import os
import gzip
import json
from IPython.display import display

# --- Step 1: Configuration ---
S2ORC_DIR = "../data/raw/s2orc/"  # directory scanned for .gz files
LINES_TO_READ = 5  # number of leading lines to inspect in the chosen file

# --- Step 2: Choose an S2ORC file ---
try:
    # os.listdir() returns names in arbitrary order; sorting makes the chosen
    # file reproducible across runs and machines.
    all_files = sorted(f for f in os.listdir(S2ORC_DIR) if f.endswith('.gz'))
    if not all_files:
        raise FileNotFoundError("S2ORC directory is empty or does not exist.")

    target_file = all_files[0]
    target_filepath = os.path.join(S2ORC_DIR, target_file)
    print(f"✅ Target file for exploration: {target_file}")

except (FileNotFoundError, NotADirectoryError) as e:
    # A missing directory, a path that is not a directory, or a directory
    # without any .gz file all leave target_filepath as None, which the later
    # steps use as the "nothing to read" signal.
    print(f"❌ Error: {e}")
    target_filepath = None

# --- Step 3: ファイルの中身を数行だけ確認 ---

# ▼▼▼ 修正点: `int()`による型変換を追加 ▼▼▼
def extract_annotated_text(full_text, annotation_str):
    """Return the part of ``full_text`` covered by the first annotation span.

    Args:
        full_text: The complete text that the span offsets index into.
        annotation_str: A JSON-encoded sequence of span objects, each with
            ``start`` and ``end`` offsets (integers or numeric strings).

    Returns:
        ``full_text[start:end]`` for the first span. An empty string is
        returned when either argument is empty, the annotation cannot be
        decoded, the first span is missing or ill-formed, or an offset is
        negative.
    """
    if not full_text or not annotation_str:
        return ""
    try:
        spans = json.loads(annotation_str)
        if not spans:
            return ""
        first_span = spans[0]
        # Offsets may arrive as strings, so coerce them to integers.
        start = int(first_span['start'])
        end = int(first_span['end'])
        # Negative offsets would wrap around under Python slicing and return
        # unrelated text, so treat them as an invalid span.
        if start < 0 or end < 0:
            return ""
        return full_text[start:end]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError, ValueError):
        # Any malformed annotation is treated as "no text available".
        return ""


# Created unconditionally so that code after this step can test the list even
# when no target file was found.
extracted_data = []

if target_filepath:
    print(f"\n--- Reading first {LINES_TO_READ} lines from the file ---")

    try:
        with gzip.open(target_filepath, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= LINES_TO_READ:
                    break

                print(f"\n--- Record {i+1} ---")

                # Only the decoding of the line itself is guarded here, so the
                # warning below is never printed for an unrelated failure.
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    record = None
                if not isinstance(record, dict):
                    print("⚠️ Could not parse line as JSON.")
                    continue

                corpus_id = record.get('corpusid')

                # `or` fallbacks cover keys that are present but null, not
                # only keys that are absent; a null value would otherwise
                # raise and abort the whole loop.
                doi = (record.get('externalids') or {}).get('doi')

                content = record.get('content') or {}
                full_text = content.get('text') or ''
                annotations = content.get('annotations') or {}

                title = extract_annotated_text(full_text, annotations.get('title'))
                abstract = extract_annotated_text(full_text, annotations.get('abstract'))

                # A missing, null or malformed bibentry annotation counts as
                # zero entries instead of discarding the record.
                try:
                    bib_count = len(json.loads(annotations.get('bibentry') or '[]'))
                except (json.JSONDecodeError, TypeError):
                    bib_count = 0

                print(f"DOI: {doi}")
                print(f"Title: {title}")

                extracted_data.append({
                    "corpus_id": corpus_id,
                    "doi": doi,
                    "title": title,
                    "abstract_length": len(abstract),
                    "citation_count_in_bib": bib_count
                })

    except Exception as e:
        # Top-level boundary for this exploratory step: report the failure
        # (unreadable or corrupt file, decoding error, ...) and keep whatever
        # records were collected before it.
        print(f"💥 An error occurred while reading the file: {e}")

# --- Step 4: Show a summary of the extracted information ---
# Nothing is rendered when no record was collected.
if extracted_data:
    banner = "=" * 50
    print("\n\n" + banner)
    print("📋 Summary of Extracted Data (Corrected)")
    print(banner)

    # One row per record, one column per extracted field.
    df_summary = pd.DataFrame(extracted_data)
    display(df_summary)

✅ Target file for exploration: 20250425_113502_00036_sv8h7_033bf465-0cc7-401a-991c-01857fe91606.gz

--- Reading first 5 lines from the file ---

--- Record 1 ---
DOI: 10.21608/aasj.2020.155061
Title: The role of cryptocurrencies in financial transactions considering modern global conditions

--- Record 2 ---
DOI: 10.1016/j.nuclphysb.2017.06.016
Title: Solution of QCD⊗QED coupled DGLAP equations at NLO

--- Record 3 ---
DOI: 10.1107/s2056989015002169
Title: Crystal structure of [1-(2,6-diisopropyl- phenyl)-2,4-bis(dimethylamino)-5-tri- methylsilyl-1,3,5-triazapentadienyl- j 2 N 1 ,N 5 ](triphenylphosphane-jP)- copper(I) 2. Experimental 2.1. Crystal data [Cu(C 21 H 38 N 5 Si)(C 18

--- Record 4 ---
DOI: 10.1038/s41597-020-0542-3
Title: a rasterized building footprint dataset for the United States

--- Record 5 ---
DOI: 10.1136/bmjopen-2014-006716
Title: A qualitative analysis of messages to promote smoking cessation among pregnant women


📋 Summary of Extracted Data (Corrected)


Unnamed: 0,corpus_id,doi,title,abstract_length,citation_count_in_bib
0,234516512,10.21608/aasj.2020.155061,The role of cryptocurrencies in financial tran...,1224,9
1,118831236,10.1016/j.nuclphysb.2017.06.016,Solution of QCD⊗QED coupled DGLAP equations at...,914,20
2,21016044,10.1107/s2056989015002169,"Crystal structure of [1-(2,6-diisopropyl- phen...",502,11
3,220151174,10.1038/s41597-020-0542-3,a rasterized building footprint dataset for th...,1080,28
4,20195012,10.1136/bmjopen-2014-006716,A qualitative analysis of messages to promote ...,361,35
