In [None]:
!pip install opendatasets
import opendatasets as od
from transformers import AdamW, BertModel, BertTokenizer
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

torch.manual_seed(42)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
od.download('https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: kushrevankar
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018


In [None]:
EMBED_SIZE = 300
BERT_MODEL = 'prajjwal1/bert-mini'
LEARNING_RATE = 0.001
EPOCHS = 10

# Load dataset
kaggle_train = pd.read_csv('/content/kuc-hackathon-winter-2018/drugsComTrain_raw.csv')
kaggle_test = pd.read_csv('/content/kuc-hackathon-winter-2018/drugsComTest_raw.csv')

# Combine train & test sets
main_x = kaggle_train['review'].tolist() + kaggle_test['review'].tolist()
main_y = kaggle_train['condition'].tolist() + kaggle_test['condition'].tolist()



# Use full dataset (remove slicing)
vocab = list(set(main_y))  # Get all unique labels
i_to_x = {i: label for i, label in enumerate(vocab)}
x_to_i = {label: i for i, label in enumerate(vocab)}

# Split dataset without stratification
x_train, x_test, y_train, y_test = train_test_split(main_x, main_y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

def encode_texts(texts):
    return tokenizer(texts, truncation=True, max_length=512, padding=True, return_tensors='pt')

# Encode & move to device
x_train = encode_texts(x_train)
y_train = torch.tensor([x_to_i[y] for y in y_train], dtype=torch.long).to(device)
train_dataset = TensorDataset(x_train['input_ids'], x_train['attention_mask'], x_train['token_type_ids'], y_train)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

x_test = encode_texts(x_test)
y_test = torch.tensor([x_to_i[y] for y in y_test], dtype=torch.long).to(device)
test_dataset = TensorDataset(x_test['input_ids'], x_test['attention_mask'], x_test['token_type_ids'], y_test)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

x_val = encode_texts(x_val)
y_val = torch.tensor([x_to_i[y] for y in y_val], dtype=torch.long).to(device)
val_dataset = TensorDataset(x_val['input_ids'], x_val['attention_mask'], x_val['token_type_ids'], y_val)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-55-ddcfca6bcedf>", line 30, in <cell line: 0>
    x_train = encode_texts(x_train)
              ^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-55-ddcfca6bcedf>", line 27, in encode_texts
    return tokenizer(texts, truncation=True, max_length=512, padding=True, return_tensors='pt')
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py", line 2877, in __call__
    encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py", line 2965, in _call_one
    return self.batch_en

In [None]:
print(f"Train size: {len(x_train)}, Test size: {len(x_test)}, Val size: {len(x_val)}")
print(f"Total dataset size: {len(main_x)}")

Train size: 3, Test size: 3, Val size: 3
Total dataset size: 500


In [None]:
from sklearn.metrics import accuracy_score

class DrugClassifier(nn.Module):
  def __init__(self):
    super(DrugClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL)
    self.lin1 = nn.Linear(self.bert.config.hidden_size, 512)
    self.lin2 = nn.Linear(512, len(vocab))
    # self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
    output1 = self.lin1(pooled_output)
    output2 = self.lin2(output1)
    # return self.softmax(output)
    return output2

model = DrugClassifier().to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
best_model = {'accuracy':-1, 'epoch':-1, 'model':{}, 'optimizer':{}}
for epoch in range(EPOCHS):
  print('Epoch: ', epoch+1)
  losses = []
  accuracies = []
  f1_scores = []
  for input_ids, attention_mask, token_type_ids, labels in tqdm(train_dataloader):
    model.train()
    out = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
    loss = loss_func(out, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    pred = torch.max(out, dim=1, keepdim=True)[1]
    pred = pred.view(pred.shape[0]).to(torch.float32).to(device)
    acc = accuracy_score(pred.tolist(), labels.tolist())
    accuracies.append(acc)
    f1 = f1_score(pred.tolist(), labels.tolist(), average='weighted')
    f1_scores.append(f1)
    losses.append(loss)

  print('Train Loss: ', sum(losses)/len(losses))
  print('Train Accuracy: ', sum(accuracies)/len(accuracies))
  print('Train F1 score: ', sum(f1_scores)/len(f1_scores))

  val_accuracies = []
  val_losses = []
  val_f1 = []
  with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, labels in tqdm(val_dataloader):
      model.eval()
      pred = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = loss_func(pred, labels)
      pred = torch.max(pred, dim=1, keepdim=True)[1]
      pred = pred.view(pred.shape[0]).to(torch.float32)
      acc = accuracy_score(pred.tolist(), labels.tolist())
      val_accuracies.append(acc)
      f1 = f1_score(pred.tolist(), labels.tolist(), average='weighted')
      val_f1.append(f1)
      val_losses.append(loss)
    print('Dev Loss: ', sum(val_losses)/len(val_losses))
    print('Dev Accuracy: ', sum(val_accuracies)/len(val_accuracies))
    print('Dev F1 score: ', sum(val_f1)/len(val_f1))

  if best_model['accuracy'] < sum(val_accuracies)/len(val_accuracies):
    best_model['accuracy'] = sum(val_accuracies)/len(val_accuracies)
    best_model['epoch'] = epoch+1
    best_model['model'] = model.state_dict()
    best_model['optimizer'] = optimizer.state_dict()

torch.save({
    'accuracy':best_model['accuracy'],
    'epoch':best_model['epoch'],
    'model':best_model['model'],
    'optimizer':best_model['optimizer']
}, './best_model61')

Epoch:  1


100%|██████████| 7/7 [00:00<00:00,  7.66it/s]


Train Loss:  tensor(4.4098, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.13392857142857142
Train F1 score:  0.23028545074606935


100%|██████████| 1/1 [00:00<00:00, 14.76it/s]


Dev Loss:  tensor(4.1830, device='cuda:0')
Dev Accuracy:  0.06
Dev F1 score:  0.11320754716981134
Epoch:  2


100%|██████████| 7/7 [00:00<00:00,  8.63it/s]


Train Loss:  tensor(4.0783, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.171875
Train F1 score:  0.2821382752912228


100%|██████████| 1/1 [00:00<00:00, 14.51it/s]


Dev Loss:  tensor(4.3682, device='cuda:0')
Dev Accuracy:  0.24
Dev F1 score:  0.3870967741935484
Epoch:  3


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]


Train Loss:  tensor(4.0368, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.18526785714285715
Train F1 score:  0.30921034890959703


100%|██████████| 1/1 [00:00<00:00, 13.75it/s]


Dev Loss:  tensor(4.3251, device='cuda:0')
Dev Accuracy:  0.24
Dev F1 score:  0.3870967741935484
Epoch:  4


100%|██████████| 7/7 [00:00<00:00,  7.42it/s]


Train Loss:  tensor(3.9249, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.19196428571428573
Train F1 score:  0.31398116353705685


100%|██████████| 1/1 [00:00<00:00, 12.47it/s]


Dev Loss:  tensor(4.0403, device='cuda:0')
Dev Accuracy:  0.24
Dev F1 score:  0.36244897959183675
Epoch:  5


100%|██████████| 7/7 [00:00<00:00,  7.07it/s]


Train Loss:  tensor(3.7543, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.16517857142857142
Train F1 score:  0.233674430781898


100%|██████████| 1/1 [00:00<00:00, 13.61it/s]


Dev Loss:  tensor(4.2141, device='cuda:0')
Dev Accuracy:  0.18
Dev F1 score:  0.1721739130434783
Epoch:  6


100%|██████████| 7/7 [00:00<00:00,  8.28it/s]


Train Loss:  tensor(3.6729, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.18973214285714285
Train F1 score:  0.2676156401227539


100%|██████████| 1/1 [00:00<00:00, 14.89it/s]


Dev Loss:  tensor(4.4192, device='cuda:0')
Dev Accuracy:  0.18
Dev F1 score:  0.18514285714285714
Epoch:  7


100%|██████████| 7/7 [00:00<00:00,  7.73it/s]


Train Loss:  tensor(3.4310, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.22991071428571427
Train F1 score:  0.29598602547943936


100%|██████████| 1/1 [00:00<00:00, 13.99it/s]


Dev Loss:  tensor(4.6388, device='cuda:0')
Dev Accuracy:  0.22
Dev F1 score:  0.22863636363636364
Epoch:  8


100%|██████████| 7/7 [00:00<00:00,  7.35it/s]


Train Loss:  tensor(3.3665, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.25669642857142855
Train F1 score:  0.3203134159378726


100%|██████████| 1/1 [00:00<00:00, 13.26it/s]


Dev Loss:  tensor(5.0251, device='cuda:0')
Dev Accuracy:  0.18
Dev F1 score:  0.16363636363636364
Epoch:  9


100%|██████████| 7/7 [00:00<00:00,  7.65it/s]


Train Loss:  tensor(3.2056, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.25
Train F1 score:  0.30926827192041156


100%|██████████| 1/1 [00:00<00:00, 10.94it/s]


Dev Loss:  tensor(5.0738, device='cuda:0')
Dev Accuracy:  0.22
Dev F1 score:  0.24389805097451275
Epoch:  10


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]


Train Loss:  tensor(3.2038, device='cuda:0', grad_fn=<DivBackward0>)
Train Accuracy:  0.24553571428571427
Train F1 score:  0.30340886729326233


100%|██████████| 1/1 [00:00<00:00, 15.41it/s]


Dev Loss:  tensor(5.4248, device='cuda:0')
Dev Accuracy:  0.2
Dev F1 score:  0.20767676767676768


In [None]:
model_path = './best_model61'
checkpoint = torch.load(model_path)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

test_accuracies = []
test_f1_scores = []
test_losses = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, labels in tqdm(val_dataloader):
      model.eval()
      pred = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = loss_func(pred, labels)
      pred = torch.max(pred, dim=1, keepdim=True)[1]
      pred = pred.view(pred.shape[0]).to(torch.float32)
      acc = accuracy_score(pred.tolist(), labels.tolist())
      test_accuracies.append(acc)
      f1 = f1_score(pred.tolist(), labels.tolist(), average='weighted')
      test_f1_scores.append(f1)
      test_losses.append(loss)
      print('Test Loss: ', sum(test_losses)/len(test_losses))
      print('Test F1 score: ', sum(test_f1_scores)/len(test_f1_scores))
      print('Test Accuracy: ', sum(test_accuracies)/len(test_accuracies))

100%|██████████| 1/1 [00:00<00:00,  7.48it/s]

Test Loss:  tensor(5.4248, device='cuda:0')
Test F1 score:  0.20767676767676768
Test Accuracy:  0.2



