In [1]:
import requests
import pandas as pd
import time
from tqdm import tqdm
df = pd.read_csv("datasets/pubmed_id.csv")

# Define the function to get data from PubMed Central API
def get_pmcoa_data(article_id):
    BASE_URL = f'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{article_id}/unicode'
    print(BASE_URL)
    try:
        response = requests.get(BASE_URL)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Handle potential errors (e.g., network issues, invalid article_id)
        # print(f"Failed to get data for article ID {article_id}: {e}")
        return None

import re

def remove_sep_at_start(s):
    return re.sub(r'^<SEP>', '', s)


# Initialize an empty list to store the combined texts
articles_data = []
starting_index = 24001
batch_size = 1000
# 39657

# Loop through each article ID
for article_id in df.pubmed_id[starting_index : starting_index+batch_size]:
    data = get_pmcoa_data(article_id)

    if data is None:
        print(f"Failed to retrieve data for article ID {article_id}")
        # Append empty strings for each section
        articles_data.append({
            "pubmed_id": article_id, 
            "INTRO": " ",
            'METHODS': " ",
            "RESULTS": " ",
            "DISCUSS": " "
        })
        continue  # Skip to next iteration

    # Initialize a dictionary to store sections for current article
    article_sections = {"pubmed_id": article_id, "INTRO": "", 'METHODS': "", "RESULTS": "", "DISCUSS": ""}

    # Define target section types
    target_section_types = ["INTRO", 'METHODS', "RESULTS", "DISCUSS"]

    # Extract passages
    passages = data[0]["documents"][0]["passages"]

    # Process each passage
    for passage in passages:
        section_type = passage["infons"]["section_type"]
        if section_type in target_section_types:
            article_sections[section_type] += "<SEP>" + passage["text"]
            article_sections[section_type] = remove_sep_at_start(article_sections[section_type])

    # Append the sections dictionary to the list
    articles_data.append(article_sections)

    time.sleep(0.05)

# Create DataFrame with separate columns for each section
df_sections = pd.DataFrame(articles_data)
# df_sections["pubmed_id"] = df.pubmed_id[starting_index : starting_index+batch_size]
result_df = df_sections

#%%
# Create a DataFrame with pubmed_id and merged text
# result_df = pd.DataFrame({
#     "pubmed_id":   df_sections.pubmed_id[0 : batch_size ],
#     "INTRO":       df_sections.INTRO[0 : batch_size ],
#     "METHODS":   df_sections.METHODS[0 : batch_size ],
#     "RESULTS":   df_sections.RESULTS[0 : batch_size ],
#     "DISCUSS":   df_sections.DISCUSS[0 : batch_size ]
# })
#result_df["words"] = result_df.text.apply(lambda x: len(x.split(" ")))

# Output the resulting DataFrame
print(result_df)

#[0:656]
result_df.to_csv(f"{starting_index}-{starting_index+batch_size}pmc_fulltext.csv", index=False)
result_df



https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/22652274/unicode
Failed to retrieve data for article ID 22652274
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/12879272/unicode
Failed to retrieve data for article ID 12879272
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/31974831/unicode
Failed to retrieve data for article ID 31974831
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/25101770/unicode
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/26396136/unicode
Failed to retrieve data for article ID 26396136
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/20514304/unicode
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/33203728/unicode
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/32155154/unicode
https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/23873097/unicod

Unnamed: 0,pubmed_id,INTRO,METHODS,RESULTS,DISCUSS
0,22652274,,,,
1,12879272,,,,
2,31974831,,,,
3,25101770,1. Introduction<SEP>Arsenic is a widespread en...,2. Materials and Methods<SEP>2.1. General Desc...,3. Results<SEP>The mean age of children with A...,"4. Discussion<SEP>In this study, we have inves..."
4,26396136,,,,
...,...,...,...,...,...
995,23691226,"Introduction<SEP>The prevention of stroke, dee...",Materials and Methods<SEP>Patients<SEP>A retro...,Results<SEP>Selected outlier patients<SEP>The ...,Discussion<SEP>Several mathematical models exi...
996,22329724,Warfarin is a widely used anticoagulant with a...,Patients & methods<SEP>■ Study population<SEP>...,Results<SEP>Natural language processing and ma...,Discussion<SEP>This study validates pharmacoge...
997,23285254,Introduction<SEP>Oral anticoagulants of antivi...,Materials and Methods<SEP>The HGDP-CEPH Panel<...,Results<SEP>VKORC1 Haplotype Study<SEP>A haplo...,Discussion<SEP>Numerous genes involved in abso...
998,16890578,,,,


In [2]:
import os
import pandas as pd

# Function to load and concatenate datasets
def load_concat_datasets(folder_path):
    # List all files in the folder
    file_names = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    # Load each file into a DataFrame and store in a list
    dfs = []
    for file in file_names:
        print(file)
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        dfs.append(df)

    # Concatenate all DataFrames
    concatenated_df = pd.concat(dfs, ignore_index=True)

    return concatenated_df

# Example usage
folder_path = "/root/projects/nano-graphrag/biomedical/datasets/chunks"
folder_path = "datasets/chunks"
df = load_concat_datasets(folder_path)
#df.to_csv("fulltext_dataset.csv", index=False)
df
# %%

25001-28001pmc_fulltext.csv
9001-12001pmc_fulltext.csv
15001-18001pmc_fulltext.csv
12001-15001pmc_fulltext.csv
34001-39671pmc_fulltext.csv
18001-21001pmc_fulltext.csv
21001-24001pmc_fulltext.csv
0-3000pmc_fulltext.csv
6001-9001pmc_fulltext.csv
31001-34001pmc_fulltext.csv
24001-25001pmc_fulltext.csv
3001-6001pmc_fulltext.csv
28001-31001pmc_fulltext.csv


Unnamed: 0,pubmed_id,INTRO,METHODS,RESULTS,DISCUSS
0,22118051,,,,
1,27992547,Introduction<SEP>Warfarin is the most widely u...,Methods<SEP>Search Strategy<SEP>A systematic c...,Results<SEP>Study Selection<SEP>A total of 17 ...,Discussion<SEP>Summary of Evidence<SEP>The ter...
2,33730015,Introduction<SEP>The Coronavirus disease of 20...,Methods<SEP>Structural similarities and comput...,Results<SEP>Computational verification of SARS...,Discussion<SEP>COVID-19 illness is characteriz...
3,27897005,1. Introduction<SEP>Warfarin is a commonly use...,2. Methods<SEP>2.1. Study Population<SEP>Using...,"3. Results<SEP>A total of 3,498 patients (3188...",4. Discussion<SEP>The goal of this study was t...
4,20716240,,,,
...,...,...,...,...,...
39651,21252497,,,,
39652,29901818,,,,
39653,25912935,QUIZ IN HEMATOLOGY<SEP>A 14-year-old female pa...,,,
39654,24196372,,,,


In [3]:
ori = pd.read_csv("datasets/pubmed_id.csv")
ori.pubmed_id.nunique()

39657

In [None]:
len(df.pubmed_id), df.pubmed_id.nunique()

(39656, 39656)

In [9]:
# Crea un dizionario che mappa pubmed_id al loro ordine in ori
order_dict = {v: i for i, v in enumerate(ori['pubmed_id'])}

# Aggiungi una colonna 'order' a df in base al mapping
df['order'] = df['pubmed_id'].map(order_dict)

# Ordina df secondo 'order' e rimuovi la colonna ausiliaria
df = df.sort_values('order').drop(columns=['order']).reset_index(drop=True)
df

Unnamed: 0,pubmed_id,INTRO,METHODS,RESULTS,DISCUSS
0,32393786,Introduction<SEP>Understanding the relationshi...,Methods<SEP>This article is accompanied by a S...,Results<SEP>Phenotype definition<SEP>All cohor...,Discussion<SEP>The genetic correlations we fin...
1,33128006,Introduction<SEP>Walking is a simple and conve...,Methods<SEP>Study population<SEP>The UK Bioban...,Results<SEP>GWAS of self-reported walking pace...,Discussion<SEP>We present a GWAS of self-repor...
2,22084931,Introduction<SEP>Genome-wide association studi...,Materials and Methods<SEP>Subjects<SEP>A total...,Results<SEP>Descriptive statistics of the 12 a...,Discussion<SEP>We investigated the association...
3,20442772,Introduction<SEP>The worldwide prevalence of o...,Methods<SEP>Ethics Statement<SEP>This study wa...,Results<SEP>Correlation structure and cluster ...,Discussion<SEP>We investigated genetic associa...
4,24879436,"Introduction<SEP>Obesity, a state in which exc...",Materials and Methods<SEP>Study Populations<SE...,Results<SEP>Baseline characteristics of the su...,Discussion<SEP>In this study we investigated t...
...,...,...,...,...,...
39651,33953720,Introduction<SEP>B lymphocytes are central to ...,Materials and Methods<SEP>Mice and Immunizatio...,Results<SEP>B-Cell SHARPIN Promotes T-Dependen...,Discussion<SEP>This study has described the B ...
39652,26297639,Background<SEP>HIV-1 has a compact genome that...,"Methods<SEP>Reagents<SEP>Doxycycline, Saquinav...",Results<SEP>HIV-1 PR binds RIP kinase family m...,Discussion<SEP>We have found that RIPK1 and RI...
39653,30143556,INTRODUCTION<SEP>Dysregulation of the inflamma...,"MATERIALS AND METHODS<SEP>Cell lines, plasmids...",RESULTS<SEP>NSA inhibits pyroptotic cell death...,"DISCUSSION<SEP>Here, the data demonstrate that..."
39654,11181701,Introduction<SEP>Engagement of the TCR/CD3 com...,"Materials and Methods<SEP>Cell Culture, Stimul...",Results<SEP>cAMP Inhibition of ζ Chain Phospho...,Discussion<SEP>Csk is present in all human cel...


In [15]:
# df.to_csv("fulltext_dataset.csv", index=False)
df = pd.read_csv("fulltext_dataset.zip")
df

Unnamed: 0,pubmed_id,INTRO,METHODS,RESULTS,DISCUSS
0,32393786,Introduction<SEP>Understanding the relationshi...,Methods<SEP>This article is accompanied by a S...,Results<SEP>Phenotype definition<SEP>All cohor...,Discussion<SEP>The genetic correlations we fin...
1,33128006,Introduction<SEP>Walking is a simple and conve...,Methods<SEP>Study population<SEP>The UK Bioban...,Results<SEP>GWAS of self-reported walking pace...,Discussion<SEP>We present a GWAS of self-repor...
2,22084931,Introduction<SEP>Genome-wide association studi...,Materials and Methods<SEP>Subjects<SEP>A total...,Results<SEP>Descriptive statistics of the 12 a...,Discussion<SEP>We investigated the association...
3,20442772,Introduction<SEP>The worldwide prevalence of o...,Methods<SEP>Ethics Statement<SEP>This study wa...,Results<SEP>Correlation structure and cluster ...,Discussion<SEP>We investigated genetic associa...
4,24879436,"Introduction<SEP>Obesity, a state in which exc...",Materials and Methods<SEP>Study Populations<SE...,Results<SEP>Baseline characteristics of the su...,Discussion<SEP>In this study we investigated t...
...,...,...,...,...,...
39651,33953720,Introduction<SEP>B lymphocytes are central to ...,Materials and Methods<SEP>Mice and Immunizatio...,Results<SEP>B-Cell SHARPIN Promotes T-Dependen...,Discussion<SEP>This study has described the B ...
39652,26297639,Background<SEP>HIV-1 has a compact genome that...,"Methods<SEP>Reagents<SEP>Doxycycline, Saquinav...",Results<SEP>HIV-1 PR binds RIP kinase family m...,Discussion<SEP>We have found that RIPK1 and RI...
39653,30143556,INTRODUCTION<SEP>Dysregulation of the inflamma...,"MATERIALS AND METHODS<SEP>Cell lines, plasmids...",RESULTS<SEP>NSA inhibits pyroptotic cell death...,"DISCUSSION<SEP>Here, the data demonstrate that..."
39654,11181701,Introduction<SEP>Engagement of the TCR/CD3 com...,"Materials and Methods<SEP>Cell Culture, Stimul...",Results<SEP>cAMP Inhibition of ζ Chain Phospho...,Discussion<SEP>Csk is present in all human cel...


In [14]:
# FILTERING

import pandas as pd

def filter_dataset(df):
    # Calcola la somma della lunghezza dei valori in ogni riga
    df['text_length'] = df.apply(lambda row: sum(len(str(value)) for value in row), axis=1)

    # Filtra le righe dove la somma e' minore di 20
    filtered_df = df[df['text_length'] >= 20]#.drop(columns=['row_length'])

    return filtered_df

# Esempio di utilizzo
# df = pd.read_csv("your_dataset.csv")  # Carica il tuo dataset
filtered_df = filter_dataset(df)
# display(filtered_df[["RESULTS", "DISCUSS"]])#.to_csv("filtered_dataset_resdisc.csv", index=False)
filtered_df

Unnamed: 0,pubmed_id,INTRO,METHODS,RESULTS,DISCUSS,text_length
0,32393786,Introduction<SEP>Understanding the relationshi...,Methods<SEP>This article is accompanied by a S...,Results<SEP>Phenotype definition<SEP>All cohor...,Discussion<SEP>The genetic correlations we fin...,34761
1,33128006,Introduction<SEP>Walking is a simple and conve...,Methods<SEP>Study population<SEP>The UK Bioban...,Results<SEP>GWAS of self-reported walking pace...,Discussion<SEP>We present a GWAS of self-repor...,35577
2,22084931,Introduction<SEP>Genome-wide association studi...,Materials and Methods<SEP>Subjects<SEP>A total...,Results<SEP>Descriptive statistics of the 12 a...,Discussion<SEP>We investigated the association...,13492
3,20442772,Introduction<SEP>The worldwide prevalence of o...,Methods<SEP>Ethics Statement<SEP>This study wa...,Results<SEP>Correlation structure and cluster ...,Discussion<SEP>We investigated genetic associa...,20761
4,24879436,"Introduction<SEP>Obesity, a state in which exc...",Materials and Methods<SEP>Study Populations<SE...,Results<SEP>Baseline characteristics of the su...,Discussion<SEP>In this study we investigated t...,29481
...,...,...,...,...,...,...
39651,33953720,Introduction<SEP>B lymphocytes are central to ...,Materials and Methods<SEP>Mice and Immunizatio...,Results<SEP>B-Cell SHARPIN Promotes T-Dependen...,Discussion<SEP>This study has described the B ...,58640
39652,26297639,Background<SEP>HIV-1 has a compact genome that...,"Methods<SEP>Reagents<SEP>Doxycycline, Saquinav...",Results<SEP>HIV-1 PR binds RIP kinase family m...,Discussion<SEP>We have found that RIPK1 and RI...,45579
39653,30143556,INTRODUCTION<SEP>Dysregulation of the inflamma...,"MATERIALS AND METHODS<SEP>Cell lines, plasmids...",RESULTS<SEP>NSA inhibits pyroptotic cell death...,"DISCUSSION<SEP>Here, the data demonstrate that...",32907
39654,11181701,Introduction<SEP>Engagement of the TCR/CD3 com...,"Materials and Methods<SEP>Cell Culture, Stimul...",Results<SEP>cAMP Inhibition of ζ Chain Phospho...,Discussion<SEP>Csk is present in all human cel...,24834


In [6]:
len(set(df.pubmed_id))

39657

In [None]:
import pandas as pd
# df = pd.read_csv("datasets/fulltext_dataset.csv")
df = pd.read_csv("datasets/chunks/0-3000pmc_fulltext.csv")
df = df[~((df['INTRO'].isna() & df['METHODS'].isna() & df['RESULTS'].isna() & df['DISCUSS'].isna()))]
df = df[~((df['RESULTS'].isna() & df['DISCUSS'].isna()))].reset_index(drop=True)
df = df[~(df['RESULTS'].isna())].reset_index(drop=True)
df.fillna("", inplace=True)

df.text = df["RESULTS"] + "\n\n\n\n" + df["DISCUSS"]
df.text = df["RESULTS"]
df.text = df.text.str.replace("<SEP>","\n\n")
# df.to_csv("datasets/halftext_dataset5000.csv", index=False)