In [2]:
from Bio import Entrez
from datetime import datetime
import json
import xmltodict
import pandas as pd

In [3]:
def clean_dict(dictionary, sep=None, _prefix=None):
    """Flatten a nested dictionary into a single-level dictionary.

    Values that are ``dict`` instances are expanded recursively. Every other
    value (strings, lists, ``None``, ...) is copied unchanged, so dicts that
    sit inside a list are not flattened.

    Args:
        dictionary: The (possibly nested) dict to flatten.
        sep: Optional separator for building path-qualified keys. With the
            default ``None`` each leaf keeps its bare key, which means leaves
            that share a name in different branches overwrite one another and
            only the last one visited survives. When a string is given, each
            leaf key is the chain of its parent keys joined by ``sep``, so
            such leaves stay distinct.
        _prefix: Internal use only; key path accumulated while recursing.

    Returns:
        A new flat dict. The input dictionary is not modified.
    """
    flat = {}
    for key in dictionary:
        value = dictionary[key]
        # Only qualify the key when asked to, and only below the top level.
        if sep is None or _prefix is None:
            name = key
        else:
            name = f"{_prefix}{sep}{key}"
        if isinstance(value, dict):
            # An empty nested dict contributes nothing, exactly as before.
            flat.update(clean_dict(value, sep=sep, _prefix=name))
        else:
            flat[name] = value
    return flat

def dict_to_df(dictionary, df: pd.DataFrame):
    """Append one flattened record to a DataFrame and return the result.

    The nested ``dictionary`` is flattened with ``clean_dict`` and turned into
    a single-row frame, which is concatenated below ``df``. The index of the
    returned frame is renumbered; ``df`` itself is left untouched.

    Args:
        dictionary: Nested dict describing a single record.
        df: Frame holding the rows collected so far.

    Returns:
        A new DataFrame made of the rows of ``df`` followed by the new row.
    """
    flat_record = clean_dict(dictionary)
    new_row = pd.DataFrame([flat_record], index=[0])
    return pd.concat([df, new_row], ignore_index=True)

In [15]:
# Set the contact e-mail address (required in order to use the API)
Entrez.email = "josemanuelgonzalezfornell@gmail.com"

# Get the current date and work out the date one year earlier
fecha_actual = datetime.now()
try:
    fecha_hace_un_anio = fecha_actual.replace(year=fecha_actual.year - 1)
except ValueError:
    # 29 February does not exist in the previous year; fall back to the 28th
    fecha_hace_un_anio = fecha_actual.replace(year=fecha_actual.year - 1, day=28)

# Convert both dates to the format expected by the PubMed search
fecha_actual_str = fecha_actual.strftime("%Y/%m/%d")
fecha_hace_un_anio_str = fecha_hace_un_anio.strftime("%Y/%m/%d")

max_results=200
retmax=20
all_results = []

# Run the PubMed search page by page, collecting the matching IDs
search_term = f'"{fecha_hace_un_anio_str}"[Date - Publication] : "{fecha_actual_str}"[Date - Publication]'
for retstart in range(0, max_results, retmax):
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=retmax, retstart=retstart)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails
        handle.close()
    if not record["IdList"]:
        # No more hits: further pages would only return empty lists
        break
    all_results.extend(record["IdList"])

# Download the information of each paper.
# Flattened records are gathered in a list and turned into a DataFrame once,
# instead of concatenating a one-row frame per paper.
records = []
skipped_ids = []  # IDs whose response carried no PubmedArticle node
for id_paper in all_results:
    handle = Entrez.efetch(db="pubmed", id=id_paper)
    try:
        record = handle.read()
    finally:
        handle.close()

    # Convert the XML payload into nested dicts with xmltodict
    # (`json_data` keeps the last parsed paper; later cells inspect it)
    json_data = xmltodict.parse(record)

    # Some entries (e.g. books, delivered as PubmedBookArticle) have no
    # PubmedArticle node; remember their IDs instead of failing on a KeyError.
    article = (json_data.get("PubmedArticleSet") or {}).get("PubmedArticle")
    if article is None:
        skipped_ids.append(id_paper)
        continue

    records.append(clean_dict(article["MedlineCitation"]))

df = pd.DataFrame(records)

df





['37865912',
 '37865911',
 '37865910',
 '37865909',
 '37865908',
 '37865907',
 '37865906',
 '37865905',
 '37865904',
 '37865903',
 '37865902',
 '37865901',
 '37865900',
 '37865899',
 '37865898',
 '37865897',
 '37865896',
 '37865895',
 '37865894',
 '37865893',
 '37865892',
 '37865891',
 '37865890',
 '37865889',
 '37865888',
 '37865887',
 '37865886',
 '37865885',
 '37865884',
 '37865883',
 '37865882',
 '37865881',
 '37865880',
 '37865879',
 '37865878',
 '37865877',
 '37865876',
 '37865875',
 '37865874',
 '37865872',
 '37865871',
 '37865870',
 '37865869',
 '37865868',
 '37865867',
 '37865866',
 '37865865',
 '37865864',
 '37865863',
 '37865862',
 '37865861',
 '37865860',
 '37865859',
 '37865858',
 '37865856',
 '37865855',
 '37865854',
 '37865852',
 '37865851',
 '37865850',
 '37865849',
 '37865848',
 '37865847',
 '37865846',
 '37865845',
 '37865844',
 '37865843',
 '37865842',
 '37865841',
 '37865840',
 '37865839',
 '37865838',
 '37865837',
 '37865836',
 '37865835',
 '37865834',
 '37865833',

In [17]:
# Summarize the columns, non-null counts and dtypes of the frame built in the download cell
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 59 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   @Status               200 non-null    object
 1   @Owner                200 non-null    object
 2   @IndexingMethod       200 non-null    object
 3   @Version              200 non-null    object
 4   #text                 200 non-null    object
 5   Year                  200 non-null    object
 6   Month                 200 non-null    object
 7   Day                   200 non-null    object
 8   @PubModel             200 non-null    object
 9   @IssnType             200 non-null    object
 10  @CitedMedium          200 non-null    object
 11  Title                 200 non-null    object
 12  ISOAbbreviation       200 non-null    object
 13  ArticleTitle          195 non-null    object
 14  ELocationID           37 non-null     object
 15  AbstractText          162 non-null    ob

In [20]:
# Flatten the whole PubmedArticle node of `json_data`, which is left over from
# the download cell and holds the last paper parsed by its loop.
a = clean_dict(json_data["PubmedArticleSet"]["PubmedArticle"])

In [21]:
# Inspect a single field of the flattened article
a["@Status"]

'Publisher'

In [22]:
# Render the flattened article as a one-row DataFrame
pd.DataFrame([a], index=[0])

Unnamed: 0,@Status,@Owner,@IndexingMethod,@Version,#text,Year,Month,Day,@PubModel,@IssnType,@CitedMedium,Volume,Issue,Title,ISOAbbreviation,ArticleTitle,StartPage,MedlinePgn,@EIdType,@ValidYN,AbstractText,CopyrightInformation,@CompleteYN,Author,Language,Grant,@UI,@DateType,Country,MedlineTA,NlmUniqueID,ISSNLinking,CitationSubset,PubMedPubDate,PublicationStatus,ArticleId,Reference
0,Publisher,NLM,Automated,1,Journal Article,2023,10,21,Electronic,Electronic,Internet,13,1,Scientific reports,Sci Rep,Testosterone promotes dominance behaviors in t...,18029,18029,doi,Y,Although testosterone is generally considered ...,© 2023. Springer Nature Limited.,Y,"[{'@ValidYN': 'Y', 'LastName': 'Inoue', 'ForeN...",eng,"[{'GrantID': 'JP15K13115', 'Agency': 'Japan So...",D016428,Electronic,England,Sci Rep,101563288,2045-2322,IM,"[{'@PubStatus': 'received', 'Year': '2023', 'M...",epublish,"[{'@IdType': 'pubmed', '#text': '37865708'}, {...","[{'Citation': 'Mazur, A. & Booth, A. Testoster..."


In [23]:
# NOTE(review): this rebinds `df` to an empty frame, discarding the frame built
# in the download cell; cells that use `df` afterwards see the empty one.
df = pd.DataFrame()

# Try dict_to_df on a single article; the result is only displayed, not assigned
dict_to_df(json_data["PubmedArticleSet"]["PubmedArticle"], df)

Unnamed: 0,@Status,@Owner,@IndexingMethod,@Version,#text,Year,Month,Day,@PubModel,@IssnType,@CitedMedium,Volume,Issue,Title,ISOAbbreviation,ArticleTitle,StartPage,MedlinePgn,@EIdType,@ValidYN,AbstractText,CopyrightInformation,@CompleteYN,Author,Language,Grant,@UI,@DateType,Country,MedlineTA,NlmUniqueID,ISSNLinking,CitationSubset,PubMedPubDate,PublicationStatus,ArticleId,Reference
0,Publisher,NLM,Automated,1,Journal Article,2023,10,21,Electronic,Electronic,Internet,13,1,Scientific reports,Sci Rep,Testosterone promotes dominance behaviors in t...,18029,18029,doi,Y,Although testosterone is generally considered ...,© 2023. Springer Nature Limited.,Y,"[{'@ValidYN': 'Y', 'LastName': 'Inoue', 'ForeN...",eng,"[{'GrantID': 'JP15K13115', 'Agency': 'Japan So...",D016428,Electronic,England,Sci Rep,101563288,2045-2322,IM,"[{'@PubStatus': 'received', 'Year': '2023', 'M...",epublish,"[{'@IdType': 'pubmed', '#text': '37865708'}, {...","[{'Citation': 'Mazur, A. & Booth, A. Testoster..."


In [26]:
# Display the full flattened dict for the last parsed paper.
# NOTE(review): clean_dict keeps bare leaf keys, so leaves that share a name in
# different branches collapse into one entry (the last one visited wins).
clean_dict(json_data["PubmedArticleSet"]["PubmedArticle"])

{'@Status': 'Publisher',
 '@Owner': 'NLM',
 '@IndexingMethod': 'Automated',
 '@Version': '1',
 '#text': 'Journal Article',
 'Year': '2023',
 'Month': '10',
 'Day': '21',
 '@PubModel': 'Electronic',
 '@IssnType': 'Electronic',
 '@CitedMedium': 'Internet',
 'Volume': '13',
 'Issue': '1',
 'Title': 'Scientific reports',
 'ISOAbbreviation': 'Sci Rep',
 'ArticleTitle': "Testosterone promotes dominance behaviors in the Ultimatum Game after players' status increases.",
 'StartPage': '18029',
 'MedlinePgn': '18029',
 '@EIdType': 'doi',
 '@ValidYN': 'Y',
 'AbstractText': "Although testosterone is generally considered to promote dominance behaviors, in humans it fosters behaviors appropriate to achieving and maintaining social status, contingent upon the situation. Recent cross-sectional studies, such as Inoue et al. (Sci Rep 7:5335, 2017), have shown that dominance behaviors induced by testosterone are modulated by high status. Yet, it remains ambiguous whether a rise in social status within re