read pubmed data 

In [6]:
import pickle
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd


In [9]:
df = pickle.load(open('pubmed.pkl', 'rb'))
df.head()

Unnamed: 0,country,title,authors,abstract
0,Papua New Guinea,Not found,Not found,Not found
1,Papua New Guinea,Not found,Not found,Not found
2,Singapore,Not found,Not found,Not found
3,Papua New Guinea,Not found,Not found,Not found
4,Singapore,Not found,Not found,Not found


In [13]:


# Dataset for training the model

dataset = [
    # Class 1: Indicates a country has a heatwave system
    {"text": "Australia, recognizing the increasing threat of heatwaves, has recently launched its National Heatwave Warning System. This system aims to provide timely alerts to residents and help them prepare for extreme temperatures.", "label": 1},
    {"text": "In response to the devastating heatwaves of the past decade, the government of Spain has implemented a comprehensive Heatwave Plan. This initiative focuses on early detection and public awareness campaigns.", "label": 1},
    {"text": "Canada's new Heat Alert and Response System (HARS) has been instrumental in reducing heat-related illnesses. The system provides guidelines for communities to prepare for and respond to extreme heat events.", "label": 1},
    {"text": "The UK's Met Office, in collaboration with the National Health Service, has introduced a heatwave warning service. This service issues alerts when there's a high chance of an upcoming heatwave.", "label": 1},

    # Class 0: Does not indicate a country has a heatwave system or talks about unrelated topics
    {"text": "Heatwaves have been a recurring phenomenon in many parts of Africa. Communities have developed traditional methods to cope with extreme temperatures, such as building houses with specific materials.", "label": 0},
    {"text": "The toy industry has seen a surge in sales during the summer months. Many attribute this trend to children staying indoors to escape the heat.", "label": 0},
    {"text": "While many countries grapple with the challenges of heatwaves, there has been no official statement from Greenland regarding the implementation of a warning system.", "label": 0},
    {"text": "The film industry often releases summer blockbusters during heatwaves, capitalizing on people seeking air-conditioned theaters.", "label": 0},
]

# Tokenization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenized_dataset = tokenizer([item['text'] for item in dataset], padding=True, truncation=True, return_tensors="pt", max_length=512)
labels = torch.tensor([item['label'] for item in dataset])

# DataLoader
train_dataset = TensorDataset(tokenized_dataset.input_ids, tokenized_dataset.attention_mask, labels)
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Model Initialization
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training Loop
num_epochs = 3
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
df['abstract'].fillna("Not found", inplace=True)

In [23]:
def classify_abstract(abstract_text):
    if abstract_text == "Not found" or not abstract_text:
        return "No prediction"
    
    # Ensure abstract_text is a string
    abstract_text = str(abstract_text)
    
    # Tokenize the input abstract
    tokenized_text = tokenizer(abstract_text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
    # Predict using the trained model
    with torch.no_grad():
        prediction = model(tokenized_text.input_ids, tokenized_text.attention_mask)
    predicted_label = torch.argmax(prediction.logits, dim=1).item()
    
    return predicted_label


In [24]:
# Example usage:
abstract = "In recent years, many countries have recognized the dangers of heatwaves. Some states, understanding the severity of the situation, have introduced systems to warn residents about impending heatwaves. These systems aim to provide timely alerts to help citizens take necessary precautions."
result = classify_abstract(abstract)
print("Predicted Label:", result)


Predicted Label: 1


In [25]:
df['predicted_label'] = df['abstract'].apply(classify_abstract)

In [30]:
df.loc[df['predicted_label']== 0].iloc[1]['abstract']

'Heatwaves in Europe and the USA have been shown to cause excess mortality among older persons. The summer of 2018 was unusually hot in south-eastern Norway. The purpose of this study was to investigate whether more older persons died that summer compared with the average for the previous ten summers.'

In [2]:

def train_and_predict(abstract_text):
    # Dataset with paragraphs (your training data)
    dataset = [
    # Class 1: Indicates a country has a heatwave system
    {"text": "Australia, recognizing the increasing threat of heatwaves, has recently launched its National Heatwave Warning System. This system aims to provide timely alerts to residents and help them prepare for extreme temperatures.", "label": 1},
    {"text": "In response to the devastating heatwaves of the past decade, the government of Spain has implemented a comprehensive Heatwave Plan. This initiative focuses on early detection and public awareness campaigns.", "label": 1},
    {"text": "Canada's new Heat Alert and Response System (HARS) has been instrumental in reducing heat-related illnesses. The system provides guidelines for communities to prepare for and respond to extreme heat events.", "label": 1},
    {"text": "The UK's Met Office, in collaboration with the National Health Service, has introduced a heatwave warning service. This service issues alerts when there's a high chance of an upcoming heatwave.", "label": 1},

    # Class 0: Does not indicate a country has a heatwave system or talks about unrelated topics
    {"text": "Heatwaves have been a recurring phenomenon in many parts of Africa. Communities have developed traditional methods to cope with extreme temperatures, such as building houses with specific materials.", "label": 0},
    {"text": "The toy industry has seen a surge in sales during the summer months. Many attribute this trend to children staying indoors to escape the heat.", "label": 0},
    {"text": "While many countries grapple with the challenges of heatwaves, there has been no official statement from Greenland regarding the implementation of a warning system.", "label": 0},
    {"text": "The film industry often releases summer blockbusters during heatwaves, capitalizing on people seeking air-conditioned theaters.", "label": 0},
    ]

    # Tokenization
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenized_dataset = tokenizer([item['text'] for item in dataset], padding=True, truncation=True, return_tensors="pt", max_length=512)
    labels = torch.tensor([item['label'] for item in dataset])

    # DataLoader
    dataset = TensorDataset(tokenized_dataset.input_ids, tokenized_dataset.attention_mask, labels)
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

    # Model Initialization
    model = RobertaForSequenceClassification.from_pretrained('roberta-base')
    optimizer = AdamW(model.parameters(), lr=1e-5)

    # Training Loop
    num_epochs = 3
    for epoch in range(num_epochs):
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    # Prediction
    tokenized_text = tokenizer(abstract_text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        prediction = model(tokenized_text.input_ids, tokenized_text.attention_mask)
    predicted_label = torch.argmax(prediction.logits, dim=1).item()

    return predicted_label


In [3]:
# Example usage:
abstract = "In recent years, many countries have recognized the dangers of heatwaves. Some states, understanding the severity of the situation, have introduced systems to warn residents about impending heatwaves. These systems aim to provide timely alerts to help citizens take necessary precautions."
result = train_and_predict(abstract)
print("Predicted Label:", result)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Label: 1


In [4]:
# Example usage:
abstract = "i love this world"
result = train_and_predict(abstract)
print("Predicted Label:", result)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Label: 0


In [7]:
def classify_abstract(abstract):
    if abstract == "Not found":
        return -1  # or any default value you want
    return train_and_predict(abstract)
