In [2]:
import numpy as np # linear algebra
import re, os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import DebertaTokenizer, DebertaModel, AdamW

In [3]:
# Names of the text-input column and the label column.
# NOTE(review): neither constant is referenced in the cells visible here —
# confirm whether they are still needed.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

In [4]:
# Load the test set and build the single free-text field the model consumes.
tester_df = pd.read_csv("test_unlabeled.csv")

# Rename the label column to 'Category' (returns a new frame instead of
# mutating in place, so re-running the cell is safe).
tester_df = tester_df.rename(columns={'Label': 'Category'})

# Concatenate title and abstract. A missing Title or Abstract is NaN, and
# NaN + str is NaN, which would later be stringified to the literal text
# "nan" by the dataset. Fill missing parts with '' so the part that is
# present is still used.
# NOTE(review): confirm the training notebook handled missing fields the same way.
tester_df['text'] = tester_df['Title'].fillna('') + ' ' + tester_df['Abstract'].fillna('')

tester_df = tester_df[['text', 'Category']]

tester_df.head()

Unnamed: 0,text,Category
0,Detection of porcine circovirus type 3 DNA in ...,0
1,Imputation of non-genotyped F1 dams to improve...,0
2,Proposed multidimensional pain outcome methodo...,0
3,Nanostructured lipid carriers loaded with an a...,0
4,Genome-wide expression of the residual lung re...,0


In [5]:
# Load the tokenizer for the 'microsoft/deberta-base' checkpoint.
# NOTE(review): truncation is requested again per call in
# CustomDataset.__getitem__; whether `truncation=True` / `do_lower_case=True`
# have any effect at load time for this tokenizer is not visible here —
# confirm against the transformers docs and keep in sync with training.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base', truncation=True, do_lower_case=True)

In [6]:
# Inference configuration.
MAX_LEN = 256  # passed to CustomDataset as max_len (tokenizer max_length)
TEST_BATCH_SIZE = 32  # batch size of the test DataLoader
# NOTE(review): EPOCHS and LEARNING_RATE are not referenced in the cells
# visible here — presumably carried over from the training notebook; confirm.
EPOCHS = 5
LEARNING_RATE = 1e-05

In [7]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text row per item.

    Args:
        df: DataFrame with a 'text' column plus the target column(s).
        tokenizer: object exposing ``encode_plus``.
        max_len: value forwarded to the tokenizer as ``max_length``.
        target_list: list of column names read from ``df`` as targets.
    """

    def __init__(self, df, tokenizer, max_len, target_list):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = list(df['text'])
        self.targets = self.df[target_list].values

    def __len__(self):
        """Return the number of text rows."""
        return len(self.title)

    def __getitem__(self, index):
        """Tokenize row ``index``.

        Returns:
            dict with the flattened ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors from the tokenizer, the row's
            ``targets`` as a float tensor, and the whitespace-normalised
            text under ``title``.
        """
        # Collapse every whitespace run (spaces, tabs, newlines) to one space.
        text = " ".join(str(self.title[index]).split())

        encoding_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Second positional argument (the text pair) is deliberately None.
        encoded = self.tokenizer.encode_plus(text, None, **encoding_options)

        # Flatten each returned tensor to 1-D for this single item.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['targets'] = torch.FloatTensor(self.targets[index])
        item['title'] = text
        return item

In [8]:
# Wrap the test frame for tokenized access; 'Category' is the only target column.
test_dataset = CustomDataset(
    df=tester_df,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    target_list=['Category'],
)

In [9]:
# Batch the test set without shuffling, so outputs stay in dataset row order.
test_data_loader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
)

In [10]:
class DebertaClass(torch.nn.Module):
    """DeBERTa encoder followed by dropout and a single-output linear layer.

    The attribute names (``bert_model``, ``dropout``, ``linear``) determine
    the state-dict keys, so they must stay as they are for saved checkpoints
    to load.
    """

    def __init__(self):
        super().__init__()
        self.bert_model = DebertaModel.from_pretrained(
            'microsoft/deberta-base',
            return_dict=True,
        )
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the raw linear output for each sequence (no activation applied here).

        Args:
            input_ids: token ids, passed positionally to the encoder.
            attn_mask: forwarded to the encoder as ``attention_mask``.
            token_type_ids: forwarded to the encoder unchanged.
        """
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )
        # Use the hidden state at sequence position 0 as the sequence summary.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.linear(self.dropout(first_token_state))

In [11]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = DebertaClass()
# Map the checkpoint onto the device selected above. A hard-coded
# map_location="cuda" makes torch.load fail on machines without CUDA even
# though `device` has already fallen back to 'cpu'.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model.load_state_dict(
    torch.load("./augdebertfiles/finetuned_deBERTa_epoch_2.model", map_location=device)
)
model = model.to(device)

  return self.fget.__get__(instance, owner)()


In [12]:
def get_predictions(model, data_loader, threshold=0.37, device=None):
    """Run ``model`` over ``data_loader`` and collect thresholded predictions.

    Args:
        model: module called as ``model(input_ids, attention_mask,
            token_type_ids)``; its output is passed through a sigmoid. The
            model is switched to eval mode.
        data_loader: iterable of batch dicts with the keys ``title``,
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and
            ``targets``.
        threshold: a sample is predicted positive when its sigmoid
            probability is ``>= threshold``. Defaults to 0.37, the value that
            used to be hard-coded in the loop.
        device: device the input tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if the model has none), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        ``(titles, predictions, prediction_probs, target_values)``:
        ``titles`` is a list with one entry per sample; the other three are
        CPU tensors with one row per sample, in loader order
        (``predictions`` holds 0.0/1.0 floats). For an empty loader the
        tensors are empty.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    titles = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)

            probs = torch.sigmoid(model(ids, mask, token_type_ids)).cpu()

            titles.extend(data["title"])
            prob_batches.append(probs)
            pred_batches.append((probs >= threshold).float())
            # Targets are only collected, never fed to the model, so they do
            # not need a round trip through the compute device.
            target_batches.append(data["targets"].to(dtype=torch.float).cpu())

    # torch.cat / torch.stack raise on an empty list; return empty tensors instead.
    if not prob_batches:
        return titles, torch.empty(0), torch.empty(0), torch.empty(0)

    # Concatenating per-batch tensors along dim 0 yields the same result as
    # stacking the individual rows.
    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    target_values = torch.cat(target_batches)

    return titles, predictions, prediction_probs, target_values



# Run inference over the whole test loader and unpack the collected titles,
# thresholded predictions, sigmoid probabilities and targets.
titles, predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)

In [15]:
# Re-threshold the probabilities with a strict `>` at 0.46. This replaces the
# `predictions` value returned by get_predictions above.
predictions = torch.gt(prediction_probs, 0.46)

In [16]:
# Re-read the raw file: the PMID column needed for the output was dropped when
# the frame was reduced to ['text', 'Category'] earlier.
tester_df = pd.read_csv("test_unlabeled.csv")

# Attach the predictions as integer 0/1 labels. The prediction tensor is
# flattened to an explicit 1-D int64 array rather than assigning a 2-D torch
# tensor to the column and fixing it up row by row; with a 1-D array pandas
# also raises if the number of predictions does not match the number of rows.
# (The tensor is expected to hold one value per row, given the single-output head.)
tester_df['label'] = torch.as_tensor(predictions).reshape(-1).to(torch.int64).cpu().numpy()

tester_df = tester_df[['PMID', 'label']]

# Writing the predictions to disk is intentionally left disabled:
# tester_df.to_csv("test_predictions.csv", index=False)

In [17]:
# Count how many rows were assigned each predicted label.
tester_df['label'].value_counts()

label
0    1016
1      81
Name: count, dtype: int64