In [None]:
import os

import numpy as np
import pandas as pd
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import *
# The two imports below stay after the wildcard import so that it cannot shadow them.
from tqdm.notebook import trange
import time

### Use the GPU if available

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()  # number of CUDA devices visible to PyTorch

# torch.cuda.get_device_name raises when no CUDA device is present, so the GPU
# name is only queried on the CUDA path; the CPU fallback is reported instead.
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; running on CPU")

### Load datasets

In [None]:
# Training split: read the CSV, print its (rows, columns) shape, then preview
# the first rows as the cell output.
df_train = pd.read_csv("train.csv")
print(df_train.shape)
df_train.head()

In [None]:
# Validation split, read from dev.csv; the trailing expression displays its
# (rows, columns) shape as the cell output.
df_valid = pd.read_csv("dev.csv")
df_valid.shape

In [None]:
# Test split, read from test.csv; the trailing expression displays its
# (rows, columns) shape as the cell output.
df_test = pd.read_csv("test.csv")
df_test.shape

In [None]:
# Every column of the training frame from position 3 onward is treated as a label.
# NOTE(review): assumes the first three columns are non-label fields — confirm
# against the CSV schema.
cols = df_train.columns
label_cols = list(cols)[3:]
num_labels = len(label_cols)

# Report which columns were picked up as labels, and how many there are.
print('Label columns: ', label_cols)
print(num_labels)

In [None]:
# Shuffle the rows of every split and renumber the index from 0.
# A fixed seed makes the row order repeatable across kernel restarts; without
# one, each run produced a different order.
SHUFFLE_SEED = 42
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_valid = df_valid.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

### Build one-hot labels

In [None]:
# Add a 'one_hot_labels' column to each split: one NumPy row vector per example,
# holding that example's values from the label columns.
for split_df in (df_train, df_valid, df_test):
    split_df['one_hot_labels'] = list(split_df[label_cols].values)

# Preview the training frame with the new column.
df_train.head()

In [None]:
# Pull the label vectors and the abstract text out of each split as Python lists.

# Training split
train_labels = list(df_train['one_hot_labels'].values)
train_text = list(df_train['abstract'].values)

# Validation split
valid_labels = list(df_valid['one_hot_labels'].values)
valid_text = list(df_valid['abstract'].values)

# Test split
test_labels = list(df_test['one_hot_labels'].values)
test_text = list(df_test['abstract'].values)

### Tokenize

In [None]:
# Every abstract is truncated or padded to exactly this many tokens.
max_length = 512

# 'bert-base-cased' is a case-sensitive checkpoint, so the input must not be
# lower-cased before WordPiece tokenization: do_lower_case=True discarded the
# casing information the cased vocabulary relies on.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# Encode the training abstracts to fixed-length id sequences.
encodings_train = tokenizer.batch_encode_plus(
    train_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings_train.keys())

In [None]:
# Encode the validation abstracts with the same fixed-length settings
# (truncate or pad to max_length), then list the fields the tokenizer returned.
valid_encode_kwargs = dict(max_length=max_length, padding='max_length', truncation=True)
encodings_valid = tokenizer.batch_encode_plus(valid_text, **valid_encode_kwargs)
print('tokenizer outputs: ', encodings_valid.keys())

In [None]:
# Encode the test abstracts with the same fixed-length settings
# (truncate or pad to max_length), then list the fields the tokenizer returned.
encodings_test = tokenizer.batch_encode_plus(
    test_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings_test.keys())

In [None]:
# Unpack the two tokenizer outputs used for the training split:
# the token ids and the attention masks.
train_input_ids, train_attention_masks = (
    encodings_train['input_ids'],
    encodings_train['attention_mask'],
)

In [None]:
# Unpack token ids and attention masks for the validation split ...
valid_input_ids, valid_attention_masks = (
    encodings_valid['input_ids'],
    encodings_valid['attention_mask'],
)

# ... and for the test split.
test_input_ids, test_attention_masks = (
    encodings_test['input_ids'],
    encodings_test['attention_mask'],
)

### Create and save dataloaders

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs_tensor = torch.tensor(train_input_ids)
train_labels_tensor = torch.tensor(train_labels)
train_masks_tensor = torch.tensor(train_attention_masks)

validation_inputs_tensor = torch.tensor(valid_input_ids)
validation_labels_tensor = torch.tensor(valid_labels)
validation_masks_tensor = torch.tensor(valid_attention_masks)

test_inputs_tensor = torch.tensor(test_input_ids)
test_labels_tensor = torch.tensor(test_labels)
test_masks_tensor = torch.tensor(test_attention_masks)

In [None]:
# Mini-batch size shared by all three loaders.
# NOTE(review): 8 was presumably chosen so that 512-token sequences fit in GPU
# memory — confirm.
batch_size = 8

# Each split's (input ids, attention masks, labels) tensors are wrapped in a
# TensorDataset and served in mini-batches by a DataLoader.

# Training batches are drawn in a new random order on every pass.
train_data = TensorDataset(train_inputs_tensor, train_masks_tensor, train_labels_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation splits are iterated in their stored order, so every pass over them
# yields the same batches and results can be lined up with the dataset rows.
validation_data = TensorDataset(validation_inputs_tensor, validation_masks_tensor, validation_labels_tensor)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs_tensor, test_masks_tensor, test_labels_tensor)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Persist the three DataLoader objects to disk.
# torch.save does not create missing directories, so make sure the target
# folder exists first (a no-op when it is already there).
os.makedirs('BERT/dataloaders', exist_ok=True)

# NOTE(review): the '-8-512' suffix presumably encodes batch size and max
# sequence length — keep it in sync if either value changes.
torch.save(train_dataloader,'BERT/dataloaders/train_data_loader-8-512')
torch.save(validation_dataloader,'BERT/dataloaders/validation_data_loader-8-512')
torch.save(test_dataloader,'BERT/dataloaders/test_data_loader-8-512')