In [1]:
# Colab-only: mount the user's Google Drive at /content/drive so that files stored
# there can be read through ordinary filesystem paths by the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# NOTE(review): the version is unpinned, so a fresh run may install a release whose API
# differs from the one this notebook was written against. In particular, `AdamW` is
# imported from `transformers` in this cell and may no longer be exported by recent
# releases — TODO confirm and pin a known-good version.
!pip install transformers
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score

In [3]:
# Directory on the mounted Google Drive that holds the LIAR TSV splits.
DATA_DIR = "/content/drive/MyDrive/liar_dataset"

# Column names assigned to the header-less TSV files, in file order.
LIAR_COLUMNS = [
    'json_ID', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
    'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context',
]

# Integer id assigned to each of the six label strings.
LABEL_TO_ID = {
    'pants-fire': 0,
    'false': 1,
    'barely-true': 2,
    'half-true': 3,
    'mostly-true': 4,
    'true': 5,
}


def load_liar_split(filename):
    """Read one dataset split and encode its labels as integers.

    Args:
        filename: Name of a tab-separated, header-less file inside ``DATA_DIR``.

    Returns:
        A DataFrame whose columns are ``LIAR_COLUMNS`` and whose ``label``
        column holds the integer ids from ``LABEL_TO_ID``.

    Raises:
        ValueError: If the file contains a label that is not a key of
            ``LABEL_TO_ID``. ``Series.map`` would otherwise turn such labels
            into NaN, which only surfaces much later as a confusing failure
            when the labels are converted to integer tensors.
    """
    frame = pd.read_csv(f"{DATA_DIR}/{filename}", sep="\t", header=None)
    frame.columns = LIAR_COLUMNS

    encoded = frame['label'].map(LABEL_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label value(s) {unknown}")

    frame['label'] = encoded.astype('int64')
    return frame


df_train = load_liar_split("train.tsv")
df_test = load_liar_split("test.tsv")
df_validate = load_liar_split("valid.tsv")

In [4]:
# Module-level tokenizer loaded from the 'bert-base-uncased' pretrained checkpoint;
# preprocess_text() reads this global rather than taking a tokenizer argument.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(input_text):
    """Tokenize every text with the module-level ``tokenizer``.

    Each text is encoded on its own, padded/truncated to 128 tokens, and the
    per-text NumPy outputs are stacked into one array per field.

    Args:
        input_text: Iterable of texts to encode.

    Returns:
        dict with keys ``'input_ids'``, ``'attention_mask'`` and
        ``'token_type_ids'``; each value is ``np.array`` applied to the list of
        per-text encodings for that field.
        NOTE(review): with ``return_tensors='np'`` each per-text encoding
        presumably carries a leading axis of size 1, so the stacked arrays
        would be (n_texts, 1, 128) — confirm against the consumers.
    """
    field_names = ('input_ids', 'attention_mask', 'token_type_ids')
    collected = {field: [] for field in field_names}

    for sentence in input_text:
        encoding = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='np',
        )
        for field in field_names:
            collected[field].append(encoding[field])

    return {field: np.array(chunks) for field, chunks in collected.items()}

In [5]:
# For each split: tokenize the `statement` column and grab the matching label array.
X_train, y_train = preprocess_text(df_train['statement'].values), df_train['label'].values
X_test, y_test = preprocess_text(df_test['statement'].values), df_test['label'].values
X_validate, y_validate = preprocess_text(df_validate['statement'].values), df_validate['label'].values

In [6]:
def _encoding_to_long_tensors(encoding, labels):
    """Turn one split's encoded fields and labels into int64 tensors.

    Returns the tuple (input_ids, attention_mask, token_type_ids, labels).
    """
    def as_long(values):
        return torch.tensor(values, dtype=torch.long)

    return (
        as_long(encoding['input_ids']),
        as_long(encoding['attention_mask']),
        as_long(encoding['token_type_ids']),
        as_long(labels),
    )


train_inputs, train_masks, train_token_types, train_labels = _encoding_to_long_tensors(X_train, y_train)

test_inputs, test_masks, test_token_types, test_labels = _encoding_to_long_tensors(X_test, y_test)
# Separate copy of the test labels, used as the reference when scoring predictions.
test_y = torch.tensor(test_labels.tolist())

validate_inputs, validate_masks, validate_token_types, validate_labels = _encoding_to_long_tensors(X_validate, y_validate)
# Separate copy of the validation labels, used as the reference when scoring predictions.
val_y = torch.tensor(validate_labels.tolist())

In [None]:
batch_size = 32
seed = 42

# Seed NumPy and PyTorch before anything stochastic happens in this cell (random
# sampling of training batches, initialisation of the new classification head,
# dropout during training) so that repeated runs are comparable.
# Seeding does not by itself guarantee bit-identical results on GPU.
np.random.seed(seed)
torch.manual_seed(seed)

# Training batches are drawn in random order; validation and test keep dataset order
# so that predictions line up with the label tensors used for scoring.
train_data = TensorDataset(train_inputs, train_masks, train_token_types, train_labels)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

val_data = TensorDataset(validate_inputs, validate_masks, validate_token_types, validate_labels)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_token_types, test_labels)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Six output classes, one per truthfulness label.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

model.to(device)

epochs = 3
learning_rate = 2e-5

# Set optimizer and scheduler.
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs;
# with zero warmup steps the learning rate decays linearly from `learning_rate`.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [8]:
def train(model, dataloader, optimizer, scheduler, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            token_type_ids=..., labels=...)`` whose output exposes ``.loss``.
        dataloader: Iterable of batches indexed as
            ``(input_ids, attention_mask, token_type_ids, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by the number of batches
        processed, or ``float('nan')`` if the dataloader yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader, desc='Training'):
        # squeeze(1) drops axis 1 when it has size 1 and is a no-op otherwise.
        input_ids = batch[0].to(device).squeeze(1)
        attention_mask = batch[1].to(device).squeeze(1)
        token_type_ids = batch[2].to(device).squeeze(1)
        labels = batch[3].to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()

        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        num_batches += 1

    # Count batches as they are seen instead of calling len(dataloader): this avoids a
    # ZeroDivisionError on an empty loader and also works for loaders without __len__.
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches

In [9]:
def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without updating it.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            token_type_ids=..., labels=...)`` whose output exposes ``.loss``
            and ``.logits``.
        dataloader: Iterable of batches indexed as
            ``(input_ids, attention_mask, token_type_ids, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, all_preds, accuracy)``:
        ``avg_loss`` is the sum of per-batch losses divided by the number of
        batches; ``all_preds`` is a flat list with the argmax class index of
        every sample, in dataloader order; ``accuracy`` is the fraction of
        samples whose prediction equals its label. ``avg_loss`` and
        ``accuracy`` are ``float('nan')`` when the dataloader is empty.
    """
    model.eval()

    all_preds = []
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0

    # No gradients are computed here, so there is nothing to zero; the model's existing
    # .grad buffers are deliberately left untouched.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluation'):
            # squeeze(1) drops axis 1 when it has size 1 and is a no-op otherwise.
            input_ids = batch[0].to(device).squeeze(1)
            attention_mask = batch[1].to(device).squeeze(1)
            token_type_ids = batch[2].to(device).squeeze(1)
            labels = batch[3].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            # Predicted class = index of the largest logit for each sample.
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            total_correct += torch.sum(preds == labels).item()
            total_samples += labels.shape[0]
            num_batches += 1

    # Guard both averages so an empty dataloader yields NaN instead of ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / total_samples if total_samples else float('nan')

    return avg_loss, all_preds, accuracy

In [None]:
# Per-epoch loss history for the training and validation sets.
train_losses, val_losses = [], []

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}:')

    # One optimisation pass over the training data, then score the validation split.
    train_loss = train(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )
    # Despite its name, `val_f1` receives the list of predicted class ids from evaluate().
    val_loss, val_f1, acc = evaluate(model=model, dataloader=val_dataloader, device=device)
    val_f1_score = f1_score(val_y, val_f1, average='macro')

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Accuracy: {acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Macro F1 Score: {val_f1_score:.4f}')

# Final scoring on the test split once all epochs are done.
# As above, `test_f1` receives the predicted class ids, not an F1 value.
test_loss, test_f1, test_acc = evaluate(model=model, dataloader=test_dataloader, device=device)
test_f1_score = f1_score(test_y, test_f1, average='macro')
print()
print(f'Test Accuracy: {test_acc:.4f}')
print(f'Test Loss: {test_loss:.4f}, Test Macro F1 Score: {test_f1_score:.4f}')

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation losses.
# Epochs are numbered from 1 so the x-axis agrees with the "Epoch i/N" lines printed
# during training (plotting the bare lists would label the first epoch as 0).
train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)

fig, ax = plt.subplots()
ax.plot(train_epochs, train_losses, label='Train')
ax.plot(val_epochs, val_losses, label='Validation')
# One tick per epoch: fractional epoch numbers on the axis would be meaningless.
ax.set_xticks(list(range(1, max(len(train_losses), len(val_losses)) + 1)))
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and validation loss per epoch')
ax.legend()
plt.show()