In [9]:
import pandas as pd
import os

# Folder holding the per-field CSVs, plus the field / citation-class
# combinations that select which files are sampled.
data_path = "../data/csv/all"
fields = ["BioChemistry_Molecular_Biology", "Chemistry", "Engineering", "Materials_Science", "Physics"]
citations = ["high", "low"]

# Column layout of the combined sample and the number of rows taken per file.
output_columns = ["Field", "Citation", "ID", "Title", "Abstract"]
sample_size = 10


def sample_valid_rows(df, field, citation, n=sample_size):
    """Return the first ``n`` rows of ``df`` whose ``Abstract`` is not missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``Title`` and ``Abstract`` columns.
    field, citation : str
        Labels written to the ``Field`` and ``Citation`` columns of every row.
    n : int
        Maximum number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``Field, Citation, ID, Title, Abstract`` with a fresh 0..k-1
        index. ``ID`` is the row's index label in ``df`` plus one, so it only
        makes sense for a numeric index (e.g. the default one from read_csv).
    """
    selected = df.dropna(subset=["Abstract"]).head(n)
    sample = selected[["Title", "Abstract"]].copy()
    # Insert at position 0 in reverse order so the final layout is
    # Field, Citation, ID, Title, Abstract.
    sample.insert(0, "ID", selected.index.to_numpy() + 1)
    sample.insert(0, "Citation", citation)
    sample.insert(0, "Field", field)
    return sample.reset_index(drop=True)


# Collect one small frame per file and concatenate once at the end; growing
# final_df with pd.concat for every single row copies the whole frame each time.
sampled_frames = []
missing_files = []
for field in fields:
    for citation in citations:
        file_name = f"{field}_{citation}1000.csv"
        file_path = os.path.join(data_path, file_name)

        if not os.path.exists(file_path):
            # Keep going (best effort), but remember the gap so it can be reported.
            missing_files.append(file_path)
            continue

        df = pd.read_csv(file_path, encoding="utf-8")
        sampled_frames.append(sample_valid_rows(df, field, citation))

if sampled_frames:
    final_df = pd.concat(sampled_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns.
    final_df = pd.DataFrame(columns=output_columns)

# Surface skipped inputs so an incomplete sample does not go unnoticed.
for file_path in missing_files:
    print(f"警告: {file_path} が見つからないためスキップしました。")


In [12]:
# Preview the first five rows of the combined sample as a quick sanity check.
final_df.head(n=5)

Unnamed: 0,Field,Citation,ID,Title,Abstract
0,BioChemistry_Molecular_Biology,high,1,Analysis of relative gene expression data usin...,The two most commonly used methods to analyze ...
1,BioChemistry_Molecular_Biology,high,2,NIH Image to ImageJ: 25 years of image analysis,For the past 25 years NIH Image and ImageJ sof...
2,BioChemistry_Molecular_Biology,high,3,Fiji: an open-source platform for biological-i...,Fiji is a distribution of the popular open-sou...
3,BioChemistry_Molecular_Biology,high,4,Trimmomatic: a flexible trimmer for Illumina s...,Motivation: Although many next-generation sequ...
4,BioChemistry_Molecular_Biology,high,5,MEGA6: Molecular Evolutionary Genetics Analysi...,We announce the release of an advanced version...


In [13]:
# Destination file for the sampled rows.
output_file = "../data/test/sampling_check.csv"

# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write the DataFrame as a UTF-8 CSV without the index column.
final_df.to_csv(output_file, index=False, encoding="utf-8")

print(f"データが {output_file} に保存されました。")


データが ../data/test/sampling_check.csv に保存されました。
