In [1]:
from transformers import DistilBertModel, DistilBertTokenizerFast, DistilBertConfig, Trainer, TrainingArguments, get_linear_schedule_with_warmup
from datasets import load_dataset, DatasetDict, Dataset

import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader


from tqdm.notebook import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [3]:
class ClassificationHead(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim):
    super(ClassificationHead, self).__init__()
    self.linear1 = nn.Linear(input_dim, hidden_dim)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(hidden_dim, output_dim)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.relu(self.linear1(x))
    x = self.sigmoid(self.linear2(x))
    return x

In [4]:
class MutliClassDistilBert(nn.Module):
  def __init__(self):
    super(MutliClassDistilBert, self).__init__()

    self.base = DistilBertModel.from_pretrained('distilbert-base-cased', output_hidden_states=True)

    self.head_I_E = ClassificationHead(768, 320, 1)
    self.head_N_S = ClassificationHead(768, 320, 1)
    self.head_T_F = ClassificationHead(768, 320, 1)
    self.head_J_P = ClassificationHead(768, 320, 1)

  def forward(self, input_ids, attention_mask):
    outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
    inputs = outputs.last_hidden_state
    I_E = self.head_I_E(inputs)
    N_S = self.head_N_S(inputs)
    T_F = self.head_T_F(inputs)
    J_P = self.head_J_P(inputs)

    return I_E, N_S, T_F, J_P

In [5]:
model = MutliClassDistilBert()

In [13]:
authors = pd.read_csv("../data/pandora/pandora_profiles/author_profiles.csv")
comments = pd.read_csv("../data/pandora/pandora_comments/all_comments_since_2015.csv", nrows=20000)

In [14]:
authors['I/E'] = authors['mbti'].str[0].apply(lambda x: 1 if x == 'e' else 0)
authors['N/S'] = authors['mbti'].str[1].apply(lambda x: 1 if x == 's' else 0)
authors['T/F'] = authors['mbti'].str[2].apply(lambda x: 1 if x == 'f' else 0)
authors['J/P'] = authors['mbti'].str[3].apply(lambda x: 1 if x == 'p' else 0)

In [15]:
authors = authors[['author', 'introverted', 'intuitive', 'thinking', 'perceiving']]
comments = comments[['author', 'body']]

In [16]:
pandora = pd.merge(authors, comments, on='author')
pandora.drop('author', axis=1, inplace=True)

In [17]:
pandora = Dataset.from_pandas(pandora)

In [18]:
pandora.shuffle(seed=42)
train_test_val = pandora.train_test_split(test_size=0.2)
test_val = train_test_val['test'].train_test_split(test_size=0.5)
pandora = DatasetDict({
  'train': train_test_val['train'],
  'val': test_val['train'],
  'test': test_val['test']
})
pandora

DatasetDict({
    train: Dataset({
        features: ['introverted', 'intuitive', 'thinking', 'perceiving', 'body'],
        num_rows: 16000
    })
    val: Dataset({
        features: ['introverted', 'intuitive', 'thinking', 'perceiving', 'body'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['introverted', 'intuitive', 'thinking', 'perceiving', 'body'],
        num_rows: 2000
    })
})

In [19]:
def combine_labels(example):
  example['labels'] = [example['introverted'], example['intuitive'], example['thinking'], example['perceiving']]

pandora = pandora.map(combine_labels)

pandora = pandora.map(
  lambda example: tokenizer(example['body'], padding=True, truncation=True),
  batched=True,
  batch_size=16
)

columns_to_remove = ['introverted', 'intuitive', 'thinking', 'perceiving', 'body']
pandora = pandora.remove_columns(columns_to_remove)
pandora.set_format('torch')

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map: 100%|██████████| 16000/16000 [00:00<00:00, 31729.28 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 28933.92 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 29192.69 examples/s]
Map: 100%|██████████| 16000/16000 [00:01<00:00, 8309.83 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 6881.91 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 8887.40 examples/s]


In [20]:
train_dataloader = DataLoader(pandora['train'], batch_size=16)
val_dataloader = DataLoader(pandora['val'], batch_size=16)
test_dataloader = DataLoader(pandora['test'], batch_size=16)
pandora = 0

eval_dataloader = val_dataloader

In [21]:
num_epochs = 3
num_training_steps = 3 * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
loss_func = CrossEntropyLoss()
lr_scheduler= get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

best_val_loss = float("inf")

for epoch in range(num_epochs):
  model.train()
  for batch in train_dataloader:
    optimizer.zero_grad()
    outputs = model(**batch)
    lr_scheduler.step()
    optimizer.step()

  model.eval()
  val_loss = 0
  for batch_i, batch in enumerate(eval_dataloader):
    with torch.no_grad():
      output = model(**batch)
    val_loss += output.loss

  avg_val_loss = val_loss / len(eval_dataloader)
  print(f'Average Validation Loss: {avg_val_loss}')
  if avg_val_loss < best_val_loss:
    print(f"Saving Checkpoint..")
    best_val_loss = avg_val_loss
    torch.save({
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'val_loss': best_val_loss
      },
      f'checkpoints/epoch_{epoch}.pt  '
      )



AttributeError: 'tuple' object has no attribute 'loss'