In [1]:
!pip install pytorch-lightning==1.1.8
!pip install transformers
!pip install wandb



In [3]:
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import wandb

We will leverage [Orion, a knowledge discovery tool for scientific publications](https://github.com/orion-search/orion), to get a subset of publications from [biorXiv](https://www.biorxiv.org/). Then, we will keep those related to COVID-19 as well as a random sample of non-COVID-19 publications using [MAG's Fields of Study](https://arxiv.org/abs/1805.12216). 

In [5]:
# Read abstracts from Microsoft Academic Graph
df = pd.read_csv("drive/MyDrive/Colab Notebooks/mag_papers.csv")

In [6]:
df.head()

Unnamed: 0,abstract,paper_id,is_Covid19
0,Nontuberculous mycobacterial infection (NTM) s...,2995758261,0
1,The epidermis is a stratified epithelium in wh...,3000980440,0
2,Animal models recapitulating human COVID-19 di...,3036267409,0
3,Summary Neural induction in vertebrates genera...,2776058480,0
4,The SARS-CoV-2 outbreak was recently declared ...,3035060659,1


In [7]:
print(f"Number of Covid-19 publications: {df[df.is_Covid19==1].shape[0]}")
print(f"Number of non-Covid-19 publications: {df[df.is_Covid19!=1].shape[0]}")

Number of Covid-19 publications: 739
Number of non-Covid-19 publications: 2000


In [8]:
# Set a seed with Pytorch Lightning
pl.seed_everything(42)

# Split the dataframe to training and evaluation sets
df_train, df_val = train_test_split(df, test_size=.05)

# Split the dataframe to training and test sets
df_train, df_test = train_test_split(df_train, test_size=.05)

Global seed set to 42


In [42]:
class CovidDataset(torch.utils.data.Dataset):
  def __init__(self, data, tokenizer, max_token_len=128):
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    # Grab a single row from the dataframe
    data_row = self.data.iloc[index]

    # Grab text and label
    abstract_text = data_row.abstract
    labels = torch.tensor(data_row.is_Covid19)

    encoding = tokenizer.encode_plus(
        abstract_text,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    return dict(
        abstract_text=abstract_text,
        input_ids=encoding["input_ids"].flatten(),
        attention_mask=encoding["attention_mask"].flatten(),
        labels=labels
    )

In [43]:
class CovidDataModule(pl.LightningDataModule):
  def __init__(self, df_train, df_val, df_test, tokenizer, batch_size=8, max_token_len=512):
    super().__init__()
    self.df_train = df_train
    self.df_val = df_val
    self.df_test = df_test
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self):
    self.train_dataset = CovidDataset(self.df_train, self.tokenizer, self.max_token_len)
    self.val_dataset = CovidDataset(self.df_val, self.tokenizer, self.max_token_len)
    self.test_dataset = CovidDataset(self.df_val, self.tokenizer, self.max_token_len)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=self.batch_size)

  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [44]:
# Hyperparameters
N_EPOCHS = 4
BATCH_SIZE = 16

# Pick a transformer from Huggingface
TRANSFORMER_MODEL_NAME = "bert-base-uncased"

# Instantiate the tokenizer
tokenizer = BertTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Create a Pytorch Lightning DataModule
data_module = CovidDataModule(df_train, df_val, df_test, tokenizer, batch_size=BATCH_SIZE, max_token_len=128)
data_module.setup()

In [56]:
class CovidClassifier(pl.LightningModule):
  def __init__(self, n_classes, lr=2e-5, steps_per_epoch=None, n_epochs=None):
    super().__init__()

    self.bert_model = BertModel.from_pretrained(TRANSFORMER_MODEL_NAME, return_dict=True)
    self.n_classes = n_classes
    self.classifier = nn.Linear(self.bert_model.config.hidden_size,  self.n_classes)
    self.steps_per_epoch = steps_per_epoch
    self.n_epochs = n_epochs
    self.lr = lr

    self.criterion = nn.CrossEntropyLoss()
    # self.criterion = nn.BCELoss()

    self.save_hyperparameters()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert_model(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)

    # output = torch.sigmoid(output)

    loss = 0
    if labels is not None:
      loss = self.criterion(output.view(-1, self.n_classes), labels.view(-1))
      # loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True)
    return {"loss":loss, "predictions":outputs, "labels":labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True)
    return loss
  
  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True)
    return loss

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=self.lr)
    warmup_steps = self.steps_per_epoch // 3
    total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer, warmup_steps, total_steps
    )
    return [optimizer], [scheduler]

In [46]:
# wandb.login()

In [48]:
model = CovidClassifier(n_classes=2, steps_per_epoch=len(df_train) // BATCH_SIZE, n_epochs=N_EPOCHS)

wandb_logger = WandbLogger(
    project="testing",
    save_code=False,
    tags=["covid-classifier"],
    reinit=True,
    )

wandb_logger.watch(model, log='all')

In [51]:
wandb.run.name

'vague-shape-3'

In [52]:
trainer = pl.Trainer(max_epochs=N_EPOCHS, gpus=1, progress_bar_refresh_rate=30, logger=wandb_logger)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [53]:
trainer.fit(model, data_module)


  | Name       | Type             | Params
------------------------------------------------
0 | bert_model | BertModel        | 109 M 
1 | classifier | Linear           | 1.5 K 
2 | criterion  | CrossEntropyLoss | 0     
------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [54]:
trainer.test()

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.241303488612175}
--------------------------------------------------------------------------------


[{'test_loss': 0.241303488612175}]

In [55]:
wandb.finish()

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_runtime,458
_timestamp,1613932734
_step,0


0,1
_runtime,▁
_timestamp,▁
_step,▁


In [68]:
trainer.save_checkpoint("covid_classifier.ckpt")
new_model = CovidClassifier.load_from_checkpoint(n_classes=2, checkpoint_path="covid_classifier.ckpt")
new_model.freeze()



In [69]:
x = trainer.test()

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.20464491844177246}
--------------------------------------------------------------------------------


In [70]:
x

[{'test_loss': 0.20464491844177246}]

In [77]:
(len(df_train) // BATCH_SIZE) * 1 * BATCH_SIZE

2464

In [76]:
df_train.shape

(2471, 3)

# code scraps

In [2]:
import torch
from torch import nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [3]:
# Read abstracts from Microsoft Academic Graph
df = pd.read_csv("drive/MyDrive/Colab Notebooks/mag_papers.csv")

In [4]:
df.head()

Unnamed: 0,abstract,paper_id,is_Covid19
0,Nontuberculous mycobacterial infection (NTM) s...,2995758261,0
1,The epidermis is a stratified epithelium in wh...,3000980440,0
2,Animal models recapitulating human COVID-19 di...,3036267409,0
3,Summary Neural induction in vertebrates genera...,2776058480,0
4,The SARS-CoV-2 outbreak was recently declared ...,3035060659,1


In [5]:
# Split the dataframe to training and evaluation sets
pl.seed_everything(42)
df_train, df_val = train_test_split(df, test_size=.05)

Global seed set to 42


In [6]:
print(f"% of Covid-19 publications: {df[df.is_Covid19==1].shape[0]}")
print(f"% of non-Covid-19 publications: {df[df.is_Covid19!=1].shape[0]}")

% of Covid-19 publications: 739
% of non-Covid-19 publications: 2000


In [7]:
df_train.shape

(2602, 3)

In [8]:
# Shuffle the training data
df_train = df_train.sample(frac=1.)

In [None]:
sample_row = df.iloc[15]
sample_abstract = sample_row.abstract
sample_label = sample_row.is_Covid19

print(sample_abstract)
print()
print(sample_label)

Objectives: Pakistan has a high infectious disease burden with about 265,000 reported cases of COVID-19. We investigated the genomic diversity of SARS-CoV-2 strains and present the first data on viruses circulating in the country.
Methods: We performed whole-genome sequencing and data analysis of SARS-CoV-2 eleven strains isolated in March and May. 
Results: Strains from travelers clustered with those from China, Saudi Arabia, India, USA and Australia. Five of eight SARS-CoV-2 strains were GH clade with Spike glycoprotein D614G, Ns3 gene Q57H, and RNA dependent RNA polymerase (RdRp) P4715L mutations. Two were S (ORF8 L84S and N S202N) and three were L clade and one was an I clade strain. One GH and one L strain each displayed Orf1ab L3606F indicating further evolutionary transitions. 
Conclusions: This data reveals SARS-CoV-2 strains of L, G, S and I have been circulating in Pakistan from March, at the start of the pandemic. It indicates viral diversity regarding infection in this popu

In [None]:
# Instantiate the tokenizer
TRANSFORMER_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
encoding = tokenizer.encode_plus(
    sample_abstract, 
    add_special_tokens=True,
    max_length=512, 
    return_token_type_ids=True, 
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
    )

In [None]:
encoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
len(encoding["token_type_ids"].squeeze(0))

512

In [None]:
encoding["input_ids"].shape

torch.Size([1, 512])

In [None]:
encoding["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 0

In [None]:
type(encoding["input_ids"])

torch.Tensor

In [9]:
#Instantiate the tokenizer
TRANSFORMER_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

class CovidDataset(torch.utils.data.Dataset):
  def __init__(self, data, tokenizer, max_token_len=128):
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    # Grab a single row from the dataframe
    data_row = self.data.iloc[index]

    # Grab text and label
    abstract_text = data_row.abstract
    labels = torch.tensor(data_row.is_Covid19)

    # print(torch.FloatTensor(labels))

    encoding = tokenizer.encode_plus(
        abstract_text,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    return dict(
        abstract_text=abstract_text,
        input_ids=encoding["input_ids"].flatten(),
        attention_mask=encoding["attention_mask"].flatten(),
        labels=labels
    )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [11]:
train_dataset = CovidDataset(df_train, tokenizer, max_token_len=512)

In [12]:
sample_items = train_dataset[0]

In [13]:
sample_items.keys()

dict_keys(['abstract_text', 'input_ids', 'attention_mask', 'labels'])

In [14]:
sample_items['labels']

tensor(1)

In [15]:
sample_items['abstract_text']

'Despite considerable research progress on SARS-CoV-2, the direct zoonotic origin (intermediate host) of the virus remains ambiguous. The most definitive approach to identify the intermediate host would be the detection of SARS-CoV-2-like coronaviruses in wild animals. However, due to the high number of animal species, it is not feasible to screen all the species in the laboratory. Given that the recognition of the binding ACE2 proteins is the first step for the coronaviruses to invade host cells, we proposed a computational pipeline to identify potential intermediate hosts of SARS-CoV-2 by modeling the binding affinity between the Spike receptor-binding domain (RBD) and host ACE2. Using this pipeline, we systematically examined 285 ACE2 variants from mammals, birds, fish, reptiles, and amphibians, and found that the binding energies calculated on the modeled Spike-RBD/ACE2 complex structures correlate closely with the effectiveness of animal infections as determined by multiple experi

In [None]:
# load model
bert_model = BertModel.from_pretrained(TRANSFORMER_MODEL_NAME, return_dict=True)

In [None]:
sample_items["input_ids"].unsqueeze(dim=0).shape

torch.Size([1, 512])

In [None]:
prediction = bert_model(sample_items["input_ids"].unsqueeze(dim=0), sample_items["attention_mask"].unsqueeze(dim=0))

In [None]:
prediction.last_hidden_state.shape

torch.Size([1, 512, 768])

In [26]:
class CovidDataModule(pl.LightningDataModule):
  def __init__(self, df_train, df_val, tokenizer, batch_size=8, max_token_len=512):
    super().__init__()
    self.df_train = df_train
    self.df_val = df_val
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self):
    self.train_dataset = CovidDataset(self.df_train, self.tokenizer, self.max_token_len)
    self.val_dataset = CovidDataset(self.df_val, self.tokenizer, self.max_token_len)
    self.test_dataset = CovidDataset(self.df_val, self.tokenizer, self.max_token_len)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

  def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=self.batch_size)

  def test_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=self.batch_size)


In [27]:
N_EPOCHS = 4
BATCH_SIZE = 12

data_module = CovidDataModule(df_train, df_val, tokenizer, batch_size=BATCH_SIZE, max_token_len=128)
data_module.setup()

512

## Modelling

In [29]:
target = torch.ones([10, 64], dtype=torch.float32)  # 64 classes, batch size = 10
output = torch.full([10, 64], 1.5)  # A prediction (logit)
pos_weight = torch.ones([64])  # All weights are equal to 1
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
criterion(output, target)  # -log(sigmoid(1.5))

tensor(0.2014)

In [28]:
nn.BCEWithLogitsLoss

torch.nn.modules.loss.BCEWithLogitsLoss

In [31]:
class CovidClassifier(pl.LightningModule):
  def __init__(self, n_classes, steps_per_epoch=None, n_epochs=None):
    super().__init__()

    self.bert_model = BertModel.from_pretrained(TRANSFORMER_MODEL_NAME, return_dict=True)
    self.n_classes = n_classes
    self.classifier = nn.Linear(self.bert_model.config.hidden_size,  self.n_classes)
    self.steps_per_epoch = steps_per_epoch
    self.n_epochs = n_epochs

    self.criterion = nn.CrossEntropyLoss()
    # self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert_model(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)

    # output = torch.sigmoid(output)

    loss = 0
    if labels is not None:
      loss = self.criterion(output.view(-1, self.n_classes), labels.view(-1))
      # loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=False)
    return {"loss":loss, "predictions":outputs, "labels":labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=False)
    return loss
  
  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=False)
    return loss

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=2e-5)
    warmup_steps = self.steps_per_epoch // 3
    total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer, warmup_steps, total_steps
    )
    return [optimizer], [scheduler]

In [32]:
model = CovidClassifier(n_classes=2, steps_per_epoch=len(df_train) // BATCH_SIZE, n_epochs=N_EPOCHS)

In [33]:
# # predictions with the untrained model
# _, predictions = model(sample_items["input_ids"].unsqueeze(dim=0), sample_items["attention_mask"].unsqueeze(dim=0))
# predictions

In [34]:
trainer = pl.Trainer(max_epochs=N_EPOCHS, gpus=1, progress_bar_refresh_rate=30)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [35]:
trainer.fit(model, data_module)


  | Name       | Type             | Params
------------------------------------------------
0 | bert_model | BertModel        | 109 M 
1 | classifier | Linear           | 1.5 K 
2 | criterion  | CrossEntropyLoss | 0     
------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [36]:
trainer.test()

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.2209876924753189}
--------------------------------------------------------------------------------


[{'test_loss': 0.2209876924753189}]

In [None]:
# # predictions with the untrained model
# _, predictions = model(sample_items["input_ids"].unsqueeze(dim=0), sample_items["attention_mask"].unsqueeze(dim=0))
# print(predictions)
# softmax = nn.Softmax(-1)
# print(softmax(predictions))

tensor([[ 0.7241, -0.0394]], grad_fn=<AddmmBackward>)
tensor([[0.6821, 0.3179]], grad_fn=<SoftmaxBackward>)


In [37]:
trainer.save_checkpoint("covid_classifier.ckpt")
new_model = CovidClassifier.load_from_checkpoint(n_classes=2, checkpoint_path="covid_classifier.ckpt")
new_model.freeze()



In [38]:
sample_abstract = "COVID-19 is a deadly coronavirus that broke out last year."

In [39]:
encoding = tokenizer.encode_plus(
    sample_abstract, 
    add_special_tokens=True,
    max_length=128, 
    return_token_type_ids=True, 
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt"
    )

In [40]:
_, predictions = new_model(encoding["input_ids"], encoding["attention_mask"])
print(predictions)
softmax = nn.Softmax(-1)
print(softmax(predictions))

tensor([[-0.4042,  0.6425]])
tensor([[0.2599, 0.7401]])


In [29]:
sample_items["labels"]

tensor(1)

In [35]:
df_train.iloc[8][0]

'The malarial parasite Plasmodium, infects red blood cells by remodeling them and transporting its own proteins to their cell surface. These proteins trigger adhesion of infected cells to uninfected cells (rosetting), and to the vascular endothelium, obstructing blood flow and contributing to pathogenesis. RIFINs (P. falciparum-encoded repetitive interspersed families of polypeptides) and STEVORs (subtelomeric variable open reading frame), are two classes of proteins that are involved in rosetting. Here we study the membrane insertion and topology of three RIFIN and two STEVOR proteins, employing a well-established assay that uses N-linked glycosylation of sites within the protein as a measure to assess the topology a protein adopts when inserted into the ER membrane. Our results indicate that all the proteins tested assume an overall topology of Ncyt-Ccyt, with predicted transmembrane helices TM1 and TM3 integrated into the ER membrane. We also show that the segments predicted as TM2 