In [1]:
import numpy as np 
import re, os
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import DebertaTokenizer, DebertaModel, AdamW

In [2]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        df: DataFrame holding a ``text`` column plus the columns named in
            ``target_list``.
        tokenizer: Object exposing ``encode_plus`` (called with the keyword
            arguments shown in ``__getitem__``).
        max_len: Length every encoded sequence is padded/truncated to.
        target_list: Column names whose values form the target vector.
    """

    def __init__(self, df, tokenizer, max_len, target_list):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = list(df['text'])
        self.targets = self.df[target_list].values

    def __len__(self):
        """Number of rows (one sample per row)."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded sample at ``index`` as a dict of 1-D tensors plus the cleaned text."""
        # Collapse every run of whitespace (tabs, newlines, repeats) into one space.
        normalized = " ".join(str(self.title[index]).split())

        encoded = self.tokenizer.encode_plus(
            normalized,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer is asked for 'pt' tensors; flatten each one to 1-D so that
        # the DataLoader's batching adds the batch dimension.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['targets'] = torch.FloatTensor(self.targets[index])
        sample['title'] = normalized
        return sample

In [3]:
# Token length every example is padded/truncated to (passed to CustomDataset as max_len).
MAX_LEN = 256
# Batch size used by the inference DataLoader.
TEST_BATCH_SIZE = 32

# NOTE(review): `truncation` and `do_lower_case` are given at load time here; confirm they
# have any effect for DebertaTokenizer — truncation is requested again per call in CustomDataset.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base', truncation=True, do_lower_case=True)

In [4]:
test_df = pd.read_csv('test_unlabeled.csv')

# String concatenation propagates NaN: a row with a missing Title or Abstract would
# otherwise get text == NaN, which CustomDataset turns into the literal string "nan"
# (and a present title would be discarded). Treat missing fields as empty strings.
test_df['text'] = test_df['Title'].fillna('') + ' ' + test_df['Abstract'].fillna('')

# Keep the identifiers aside before the frame is narrowed to the model inputs.
pmids = test_df['PMID']

test_df = test_df[['text', 'Label']]

# shuffle=False keeps predictions in the same row order as the CSV.
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN, ['Label'])
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size = TEST_BATCH_SIZE, shuffle=False)

In [6]:
class DebertaClass(torch.nn.Module):
    """Pretrained DeBERTa encoder followed by dropout and a single-output linear head."""

    def __init__(self):
        super().__init__()
        self.bert_model = DebertaModel.from_pretrained('microsoft/deberta-base')
        self.dropout = torch.nn.Dropout(0.3)
        # 768 must equal the width of the encoder's last hidden state.
        self.linear = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return one raw (pre-sigmoid) score per input sequence.

        The representation at sequence position 0 of the encoder's last hidden
        state is passed through dropout and the linear head.
        """
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        first_token = encoded.last_hidden_state[:, 0]
        return self.linear(self.dropout(first_token))



model = DebertaClass()

# Map the checkpoint onto the CPU: the model object above lives on the CPU and is never
# moved, so loading the tensors onto "cuda" only adds a hard GPU requirement (torch.load
# fails on a machine without CUDA) for no benefit.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load("./augdebertfiles/finetuned_deBERTa_epoch_2.model", map_location="cpu")
model.load_state_dict(state_dict)

<All keys matched successfully>

In [106]:
# Fallback device, used by get_predictions only when the model exposes no parameters.
device = "cpu"


def get_predictions(model, data_loader, threshold=0.46, target_device=None):
    """Run the model over ``data_loader`` and collect thresholded predictions.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask,
            token_type_ids)`` and returning raw scores; it is switched to eval mode.
        data_loader: Iterable of batches, each a dict with the keys ``"title"``,
            ``"input_ids"``, ``"attention_mask"``, ``"token_type_ids"`` and
            ``"targets"``.
        threshold: Probability at or above which a sample is labelled 1.0.
        target_device: Device the input tensors are moved to. Defaults to the
            device of the model's first parameter (so inputs always match the
            model), or the module-level ``device`` if the model has no parameters.

    Returns:
        ``(titles, predictions, prediction_probs, target_values)`` where ``titles``
        is a list and the other three are CPU tensors concatenated over all batches
        in loader order. ``predictions`` holds 0.0/1.0 floats, ``prediction_probs``
        the sigmoid of the model output. For an empty loader the list is empty and
        the tensors have zero elements.
    """
    model = model.eval()

    if target_device is None:
        first_param = next(model.parameters(), None)
        target_device = first_param.device if first_param is not None else device

    titles = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["input_ids"].to(target_device, dtype=torch.long)
            mask = data["attention_mask"].to(target_device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(target_device, dtype=torch.long)

            logits = model(ids, mask, token_type_ids)

            # Results are gathered on the CPU so they can be concatenated and
            # handed to numpy/pandas regardless of where the model runs.
            prob_batches.append(torch.sigmoid(logits).detach().cpu())
            target_batches.append(data["targets"].to(dtype=torch.float).detach().cpu())
            titles.extend(data["title"])

    if not prob_batches:
        # Nothing to concatenate: return empty results instead of failing inside torch.
        return titles, torch.empty(0), torch.empty(0), torch.empty(0)

    prediction_probs = torch.cat(prob_batches)
    target_values = torch.cat(target_batches)
    # Thresholding is element-wise, so doing it once on the full tensor matches
    # doing it per batch.
    predictions = (prediction_probs >= threshold).float()

    return titles, predictions, prediction_probs, target_values



# Run inference over the whole test loader; the four results stay available as notebook globals.
titles, predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)

In [127]:
# Re-derive the hard labels from the probabilities. The comparison is inclusive (>=)
# so that it agrees with the thresholding done inside get_predictions.
PREDICTION_THRESHOLD = 0.46
predictions = (prediction_probs >= PREDICTION_THRESHOLD).int()

# Count how many samples fall into each predicted class.
unique, counts = np.unique(predictions.numpy(), return_counts=True, axis=0)

print(unique, counts)

[[0]
 [1]] [935 162]


In [128]:
tester_df = pd.read_csv("test_unlabeled.csv")

# `predictions` is 2-D (one column); flatten it to a plain 1-D vector of Python-style
# ints (1 where the prediction equals 1, else 0) before it goes into the frame, rather
# than assigning the tensor itself and repairing the column afterwards.
labels = (np.asarray(predictions).reshape(-1) == 1).astype(int)

# Predictions are matched to PMIDs purely by row position, so the lengths must agree.
if len(labels) != len(tester_df):
    raise ValueError(
        f"got {len(labels)} predictions for {len(tester_df)} rows in test_unlabeled.csv"
    )

tester_df = tester_df[['PMID']].assign(label=labels)

# Make sure the output folder exists so the write does not fail on a fresh checkout.
os.makedirs("augdebertpreds", exist_ok=True)
tester_df.to_csv("augdebertpreds/test_predictions-2-46.csv", index=False)

In [129]:
# Read the saved predictions back from disk to check what was actually written.
arr = pd.read_csv("augdebertpreds/test_predictions-2-46.csv")

# Number of rows per predicted label.
arr['label'].value_counts()

label
0    935
1    162
Name: count, dtype: int64