In [1]:
from Bio import Entrez
from datetime import datetime
import xmltodict
import pandas as pd
import numpy as np
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sqlalchemy import create_engine, Column, Integer, String, DateTime, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Data Extraction from PubMed

In [27]:
def data_extractor(json_data):
    data = json_data["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"]

    date = pd.to_datetime(data["DateRevised"]["Year"] +
                        data["DateRevised"]["Month"] + data["DateRevised"]["Day"])
    
    
    if isinstance(data["Article"]["ArticleTitle"], dict):
        title = data["Article"]["ArticleTitle"]["#text"]
    else:
        title = data["Article"]["ArticleTitle"]

    authors = []
    author_data = data["Article"]["AuthorList"]["Author"]
    if type(author_data) == list:
        for data_dict in author_data:
            name = data_dict["ForeName"] + " " + data_dict["LastName"]
            authors.append(name)
    elif type(author_data) == dict:
        name = author_data["ForeName"] + " " + author_data["LastName"]
        authors.append(name)

    keywords = []
    keywords_path = data["KeywordList"]["Keyword"]
    if isinstance(keywords_path, list):
        for keyword in data["KeywordList"]["Keyword"]:
            words = keyword["#text"]
            keywords.append(words)
    elif isinstance(keywords_path, dict):
            keywords = keywords_path["#text"]
        

    abstract = data["Article"]["Abstract"]["AbstractText"]
    final_abstract = False
    if isinstance(abstract, list):
        for text in abstract:
            if final_abstract == False:
                final_abstract = text["#text"]
            else:
                final_abstract += f"\n {text['#text']}"
    elif isinstance(abstract, dict):
        final_abstract = abstract["#text"]
    else:
        final_abstract = abstract

    locator_format = False
    locator_number = False
    locators = data["Article"]["ELocationID"]
    if isinstance(locators, list):
        for locator in locators:
            if locator["@EIdType"] == "doi": #encontrar forma de que si no existe doi poner el primero
                locator_format = locator["@EIdType"]
                locator_number = locator["#text"]
    else:
        locator_format = locators["@EIdType"]
        locator_number = locators["#text"]

    df = pd.DataFrame([[data["PMID"]["#text"], title, date, data["Article"]["Journal"]["Title"], data["Article"]["Journal"]["ISOAbbreviation"],
                    authors, final_abstract, keywords, locator_format, locator_number]],
                    columns=["PMID", "Title", "Date", "Journal", "Journal_abreviation", "All_authors", "Abstract", "Keywords", "Locator_format", "Locator_number"])
    df = df.set_index("PMID")
    df.index = df.index.astype(int)
    return df

In [28]:
# Configura tu dirección de correo electrónico (es necesario para usar la API)
Entrez.email = "josemanuelgonzalezfornell@gmail.com"

# Obtén la fecha actual y calcula la fecha de hace un año
fecha_actual = datetime.now()
fecha_hace_un_anio = fecha_actual.replace(year=fecha_actual.year - 1)

# Convierte las fechas al formato necesario para la búsqueda en PubMed
fecha_actual_str = fecha_actual.strftime("%Y/%m/%d")
fecha_hace_un_anio_str = fecha_hace_un_anio.strftime("%Y/%m/%d")

max_results=1000
retmax=1000
all_results = []

# Realiza la búsqueda en PubMed
for retstart in range(0, max_results, retmax):
    handle = Entrez.esearch(db="pubmed", term=f'"{fecha_hace_un_anio_str}"[Date - Publication] : "{fecha_actual_str}"[Date - Publication]', retmax=retmax, retstart=retstart)
    record = Entrez.read(handle)
    handle.close()
    all_results.extend(record["IdList"])


handle = Entrez.efetch(db="pubmed", id=all_results, retmode="xml", rettype="abstract")
record = handle.read()
handle.close()

df_final = pd.DataFrame()

counter = 1

# Descarga la información de cada paper
for id_paper in all_results:
    handle = Entrez.efetch(db="pubmed", id=id_paper)
    record = handle.read()
    handle.close()

    # Convierte XML a JSON usando xmltodict
    json_data = xmltodict.parse(record)
    print(f"start paper {counter}/{max_results}")
    try:
        df = data_extractor(json_data)
        df_final = pd.concat([df_final, df])
        print(f"finish paper {counter}/{max_results}")
        counter += 1
    except KeyError:
        print(f"paper {counter}/{max_results} ignored")
        counter += 1
        continue        





start paper 1/1000
finish paper 1/1000
start paper 2/1000
finish paper 2/1000
start paper 3/1000
finish paper 3/1000
start paper 4/1000
finish paper 4/1000
start paper 5/1000
finish paper 5/1000
start paper 6/1000
finish paper 6/1000
start paper 7/1000
finish paper 7/1000
start paper 8/1000
finish paper 8/1000
start paper 9/1000
finish paper 9/1000
start paper 10/1000
finish paper 10/1000
start paper 11/1000
finish paper 11/1000
start paper 12/1000
paper 12/1000 ignored
start paper 13/1000
paper 13/1000 ignored
start paper 14/1000
paper 14/1000 ignored
start paper 15/1000
paper 15/1000 ignored
start paper 16/1000
paper 16/1000 ignored
start paper 17/1000
paper 17/1000 ignored
start paper 18/1000
paper 18/1000 ignored
start paper 19/1000
finish paper 19/1000
start paper 20/1000
finish paper 20/1000
start paper 21/1000
finish paper 21/1000
start paper 22/1000
finish paper 22/1000
start paper 23/1000
finish paper 23/1000
start paper 24/1000
finish paper 24/1000
start paper 25/1000
finish 

In [29]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 654 entries, 38153785 to 38152772
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Title                654 non-null    object        
 1   Date                 654 non-null    datetime64[ns]
 2   Journal              654 non-null    object        
 3   Journal_abreviation  654 non-null    object        
 4   All_authors          654 non-null    object        
 5   Abstract             654 non-null    object        
 6   Keywords             654 non-null    object        
 7   Locator_format       654 non-null    object        
 8   Locator_number       654 non-null    object        
dtypes: datetime64[ns](1), object(8)
memory usage: 51.1+ KB


# BBDD create and load

In [30]:
df_to_ddbb = df_final.copy()
df_to_ddbb["All_authors"] = df_to_ddbb["All_authors"].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')
df_to_ddbb["Keywords"] = df_to_ddbb["Keywords"].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

engine = create_engine(f'sqlite:///../data/processed/Pubmed_DDBB.db')

Base = declarative_base()

# Define la tabla Main (puedes omitir esto si ya existe)
class Main(Base):
    __tablename__ = 'Main'
    PMID = Column(Integer, primary_key=True)
    Title = Column(String)
    Date = Column(DateTime)
    Journal = Column(String)
    Journal_abreviation = Column(String)
    All_authors = Column(String)
    Abstract = Column(String)
    Keywords = Column(String)
    Locator_format = Column(String)
    Locator_number = Column(String)

# Crea la tabla en la base de datos
Base.metadata.create_all(engine)


df_to_ddbb.to_sql('Main', con=engine, index=True, if_exists='replace')

  Base = declarative_base()


654

# Clean data  
## Stopwords removal

In [14]:
engine = create_engine(f'sqlite:///../data/processed/Pubmed_DDBB.db')
session = Session(engine)

query = text("SELECT PMID, Abstract, Journal FROM Main Limit 100")

result = session.execute(query).fetchall()
df_abstract = pd.DataFrame(result).set_index("PMID")

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_abstract['Journal'] = le.fit_transform(df_abstract['Journal'])
# df_abstract['Journal_decode'] = df_abstract['Journal']
# df_abstract['Journal'], label_mapping = pd.factorize(df_abstract['Journal'])
# df_abstract['Journal'] = (df_abstract['Journal'] + 0) % 18

# df_abstract["Journal"] = df_abstract["Journal"].astype(int)


In [16]:
df_abstract = df_abstract[df_abstract.groupby(
    'Journal')['Journal'].transform('count') > 1]
# df_abstract = df_abstract.drop(df_abstract[df_abstract['Journal_decode']=="Annals of surgical oncology"].index)

X_train, X_test, y_train, y_test = train_test_split(
    df_abstract["Abstract"], df_abstract["Journal"], test_size=0.40, random_state=42, stratify=df_abstract["Journal"])

In [6]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    X_train.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding="longest", 
    truncation=True, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    X_test.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding="longest", 
    truncation=True, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_test.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [7]:
model = BertForSequenceClassification.from_pretrained("bert-large-uncased",
                                                      num_labels=df_abstract["Journal"].nunique(),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 5

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [9]:

from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [10]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = le.inverse_transform(df_abstract.loc[:, "Journal"])

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [11]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        print(inputs['labels'].shape)
        print(inputs['input_ids'].shape)
        print(inputs['attention_mask'].shape)
        print(inputs['labels'])
        print(inputs['input_ids'])
        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/12 [00:00<?, ?it/s]

torch.Size([5])
torch.Size([5, 512])
torch.Size([5, 512])
tensor([17,  2,  6, 12, 16])
tensor([[  101,  2540,  4945,  ...,     0,     0,     0],
        [  101,  9587, 25181,  ...,     0,     0,     0],
        [  101,  8822,  3082,  ...,     0,     0,     0],
        [  101,  1996,  3276,  ...,     0,     0,     0],
        [  101,  1996,  7863,  ...,     0,     0,     0]])
torch.Size([5])
torch.Size([5, 512])
torch.Size([5, 512])
tensor([ 7,  8,  9,  3, 15])
tensor([[  101, 21766,  7629,  ...,     0,     0,     0],
        [  101,  6740,  3853,  ...,     0,     0,     0],
        [  101,  5022,  2007,  ...,     0,     0,     0],
        [  101, 12348, 16464,  ...,  2988, 24442,   102],
        [  101,  2146,  2512,  ...,     0,     0,     0]])
torch.Size([5])
torch.Size([5, 512])
torch.Size([5, 512])
tensor([14,  5,  7,  5, 13])
tensor([[  101,  6970, 16874,  ...,     0,     0,     0],
        [  101,  5423,  3593,  ...,  1006,  2030,   102],
        [  101, 14671, 20272,  ...,     0

RuntimeError: Parent directory data_volume does not exist.

In [None]:
df_abstract[df_abstract["Journal"]==38]

Unnamed: 0_level_0,Abstract,Journal,Journal_decode
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
#TODO Eliminar links
#TODO BERT

In [None]:
# signos = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")

# def signs_clean(text):
#     return signos.sub('', text.lower())

# df_abstract.loc[:, "Abstract"] = df_abstract["Abstract"].apply(lambda x: signs_clean(x))
# df_abstract.head()

In [None]:
# english_stopwords = stopwords.words('english')

# def remove_stopwords(df):
#     return " ".join([word for word in df.split() if word not in english_stopwords])

# df_abstract.loc[:, "Abstract"] = df_abstract["Abstract"].apply(lambda x: remove_stopwords(x))
# df_abstract.head()

## Stemming

In [None]:
# def stemmer(x):
#     stemmer = SnowballStemmer('english')
#     return " ".join([stemmer.stem(word) for word in x.split()])

# df_abstract.loc[:, "Abstract"] = df_abstract["Abstract"].apply(lambda x: stemmer(x))
# df_abstract.head()