In [1]:
import pandas as pd

In [2]:
df_train = pd.read_parquet('./doc_mat_train.parquet')
df_test = pd.read_parquet('./doc_mat_test.parquet')

In [3]:
from tqdm import tqdm
import torch
from transformers import BertTokenizer
device = torch.device('cuda')


def data_prepare(df, text_col, label_col):
    sentences = df[text_col].values.tolist()
    if label_col is not None:
        labels_list = df[label_col].values.tolist()
        labels = torch.tensor(labels_list)
    else:
        labels = None
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    input_ids_list = []
    attention_masks_list = []
    for sentence in tqdm(sentences):
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids_list.append(encoded_dict['input_ids'])
        attention_masks_list.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids_list, dim=0)
    attention_masks = torch.cat(attention_masks_list, dim=0)
    
    return input_ids, attention_masks, labels

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# df_train = df_train.head(100)
# df_test = df_test.head(50)


input_ids, attention_masks, labels = data_prepare(df_train, 'data', 'label1')
input_ids_test, attention_masks_test, _ = data_prepare(df_test, 'data', None)

  0%|                                                                                        | 0/13349 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|███████████████████████████████████████████████████████████████████████████| 13349/13349 [01:36<00:00, 138.11it/s]
  0%|                                                                                         | 0/6575 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the 

In [5]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)
dataset_test = TensorDataset(input_ids_test, attention_masks_test)

train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_size, val_size

(9344, 4005)

In [6]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 8
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

test_dataloader = DataLoader(
    dataset_test,
    sampler=SequentialSampler(dataset_test),
    batch_size=batch_size,
)

In [7]:
from transformers import BertForSequenceClassification, AdamW, BertConfig


model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(set(labels.tolist())),
    output_attentions=False,
    output_hidden_states=False
)

model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
from transformers import get_linear_schedule_with_warmup


optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [9]:
import time
import datetime


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [10]:
import random
import numpy as np

seed = 2023

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [11]:
training_stats = []
total_t0 = time.time()
for epoch_i in range(epochs):
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(f'Batch {elapsed}')
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        model.zero_grad()
        res = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        
        total_train_loss += res[0].item()
        res[0].backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print(f'average training loss: {avg_train_loss}')
    print(f'training epoch took: {training_time}')
    
    
    print()
    print('run validation')
    t0 = time.time()
    model.eval()
    total_val_loss = 0
    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():
            res = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels
            )
        total_val_loss += res[0].item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    val_time = format_time(time.time() - t0)
    print(f'validation loss {avg_val_loss}')
    print(f'val epoch took: {val_time}')

Batch 0:00:20
Batch 0:00:40
Batch 0:00:59
Batch 0:01:19
Batch 0:01:38
Batch 0:01:58
Batch 0:02:18
Batch 0:02:38
Batch 0:02:58
Batch 0:03:18
Batch 0:03:38
Batch 0:03:57
Batch 0:04:17
Batch 0:04:37
Batch 0:04:57
Batch 0:05:16
Batch 0:05:36
Batch 0:05:56
Batch 0:06:15
Batch 0:06:35
Batch 0:06:55
Batch 0:07:15
Batch 0:07:34
Batch 0:07:54
Batch 0:08:14
Batch 0:08:33
Batch 0:08:53
Batch 0:09:13
Batch 0:09:32
average training loss: 0.9282372263738605
training epoch took: 0:09:36

run validation
validation loss 0.4571434051214816
val epoch took: 0:01:18
Batch 0:00:20
Batch 0:00:40
Batch 0:00:59
Batch 0:01:19
Batch 0:01:39
Batch 0:01:58
Batch 0:02:18
Batch 0:02:38
Batch 0:02:58
Batch 0:03:17
Batch 0:03:37
Batch 0:03:57
Batch 0:04:16
Batch 0:04:36
Batch 0:04:56
Batch 0:05:16
Batch 0:05:35
Batch 0:05:55
Batch 0:06:15
Batch 0:06:35
Batch 0:06:54
Batch 0:07:14
Batch 0:07:34
Batch 0:07:54
Batch 0:08:13
Batch 0:08:33
Batch 0:08:53
Batch 0:09:13
Batch 0:09:32
average training loss: 0.3150745884161315


In [12]:
print('run test')
t0 = time.time()
model.eval()
res_list = []
for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        res = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )
    res_list.extend(res['logits'].cpu().numpy())

run test


In [13]:
from scipy.special import softmax

res = np.array([softmax(x) for x in res_list])

In [14]:
np.save('./doc_mat.npy', res)