In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import *
from pathlib import Path
import numpy as np

#### Choose device

In [None]:
# Prefer a CUDA GPU when one is available; otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when there is no CUDA device, so only query it
# on GPU machines; on CPU-only machines report the fallback device instead.
print(torch.cuda.get_device_name(0) if cuda_available else "cpu")

#### CSV to DF

In [None]:
# Input configuration: the CSV files are tab-separated and live in this folder.
delimiter = "\t"
csv_path = Path("path_to_CSVs")

In [None]:
# Load the training split, then show its (rows, columns) shape and first rows.
train_csv = csv_path / 'train.csv'
df_train = pd.read_csv(train_csv, delimiter=delimiter)
print(df_train.shape)
df_train.head()

In [None]:
# Load the validation split with the same shared `delimiter` setting as the
# training split, so changing the delimiter in one place affects every split.
df_valid = pd.read_csv(csv_path/'dev.csv', delimiter=delimiter)
print(df_valid.shape)
df_valid.head()

In [None]:
# Load the test split with the same shared `delimiter` setting as the
# training split, so changing the delimiter in one place affects every split.
df_test = pd.read_csv(csv_path/'test.csv', delimiter=delimiter)
print(df_test.shape)
df_test.head()

#### Get list of labels from DF columns

In [None]:
# Every column from the fourth one onward is used as a label column.
cols = df_train.columns
label_cols = cols[3:].tolist()
num_labels = len(label_cols)
print('Label columns: ', label_cols)
print(num_labels)

In [None]:
# Shuffle the rows of every split and rebuild a clean 0..n-1 index.
# A fixed seed keeps the resulting row order identical across kernel restarts,
# so the notebook reproduces the same datasets on every Restart & Run All.
shuffle_seed = 42
df_train = df_train.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
df_valid = df_valid.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

In [None]:
# Collapse the label columns of each split into one array-valued column:
# every row gets the NumPy vector of its label-column values.
for split_frame in (df_train, df_valid, df_test):
    split_frame['one_hot_labels'] = list(split_frame[label_cols].values)
df_train.head()

#### Convert DF values to lists

In [None]:
# Label vectors of each split, one entry per row.
train_labels = list(df_train["one_hot_labels"].values)
valid_labels = list(df_valid["one_hot_labels"].values)
test_labels = list(df_test["one_hot_labels"].values)

# Texts of each split, taken from the `abstract` column.
train_text = list(df_train["abstract"].values)
valid_text = list(df_valid["abstract"].values)
test_text = list(df_test["abstract"].values)

#### Tokenize texts and get input_ids + attention masks

In [None]:
# Tokenizer settings: the pretrained checkpoint to load the tokenizer from
# and the max sequence length passed to the encoding calls.
model_name = "flaubert/flaubert_base_cased"
max_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Encode the training texts, padding/truncating every sequence to max_length.
encodings_train = tokenizer.batch_encode_plus(
    train_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
)
print('tokenizer outputs: ', encodings_train.keys())

In [None]:
# Encode the validation texts, padding/truncating every sequence to max_length.
encodings_valid = tokenizer.batch_encode_plus(
    valid_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
)
print('tokenizer outputs: ', encodings_valid.keys())

In [None]:
# Encode the test texts, padding/truncating every sequence to max_length.
encodings_test = tokenizer.batch_encode_plus(
    test_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
)
print('tokenizer outputs: ', encodings_test.keys())

In [None]:
# Pull the encoded token ids and the attention masks out of the training encodings.
train_input_ids, train_attention_masks = (
    encodings_train[field] for field in ('input_ids', 'attention_mask')
)

In [None]:
# Same extraction for the validation and test encodings: token ids first,
# attention masks second.
valid_input_ids, valid_attention_masks = (
    encodings_valid[field] for field in ('input_ids', 'attention_mask')
)
test_input_ids, test_attention_masks = (
    encodings_test[field] for field in ('input_ids', 'attention_mask')
)

### Convert to tensors and Make Dataloaders

In [None]:
def _to_tensor(values):
    """Convert a Python list to a tensor through an intermediate NumPy array."""
    return torch.tensor(np.array(values))


# Training split: token ids, attention masks, labels.
train_inputs_tensor = _to_tensor(train_input_ids)
train_masks_tensor = _to_tensor(train_attention_masks)
train_labels_tensor = _to_tensor(train_labels)

# Validation split.
validation_inputs_tensor = _to_tensor(valid_input_ids)
validation_masks_tensor = _to_tensor(valid_attention_masks)
validation_labels_tensor = _to_tensor(valid_labels)

# Test split.
test_inputs_tensor = _to_tensor(test_input_ids)
test_masks_tensor = _to_tensor(test_attention_masks)
test_labels_tensor = _to_tensor(test_labels)

In [None]:
# Select a batch size for training. A power of 2 is recommended.
batch_size = 8

# Each dataset yields (input_ids, attention_mask, labels) triples.
# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs_tensor, train_masks_tensor, train_labels_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation splits are iterated sequentially: the batch order is then
# deterministic, so predictions collected from the loader line up with the
# rows of the underlying dataset on every run.
validation_data = TensorDataset(validation_inputs_tensor, validation_masks_tensor, validation_labels_tensor)
validation_sampler = torch.utils.data.SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs_tensor, test_masks_tensor, test_labels_tensor)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Folder that the serialized dataloaders are written to.
save_path = Path("FlauBERT/dataloaders/")

In [None]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
save_path.mkdir(parents=True, exist_ok=True)

# Serialize each DataLoader object; the file name records the batch size and
# max sequence length the loader was built with.
torch.save(train_dataloader,save_path/f'train_data_loader-{batch_size}-{max_length}')
torch.save(validation_dataloader,save_path/f'validation_data_loader-{batch_size}-{max_length}')
torch.save(test_dataloader,save_path/f'test_data_loader-{batch_size}-{max_length}')