In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [1]:
import pandas as pd
import functools
import sys

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
import transformers

In [2]:

# Fixed seed used to make PyTorch's random number generation repeatable.
seed = 0

# Seeds torch's global RNG. The call returns the Generator object, which is
# what the notebook echoes as this cell's output.
torch.manual_seed(seed)

<torch._C.Generator at 0x15e9574ba30>

In [3]:
datasets.list_datasets(with_community_datasets=True, with_details=False)

['acronym_identification',
 'ade_corpus_v2',
 'adversarial_qa',
 'aeslc',
 'afrikaans_ner_corpus',
 'ag_news',
 'ai2_arc',
 'air_dialogue',
 'ajgt_twitter_ar',
 'allegro_reviews',
 'allocine',
 'alt',
 'amazon_polarity',
 'amazon_reviews_multi',
 'amazon_us_reviews',
 'ambig_qa',
 'americas_nli',
 'ami',
 'amttl',
 'anli',
 'app_reviews',
 'aqua_rat',
 'aquamuse',
 'ar_cov19',
 'ar_res_reviews',
 'ar_sarcasm',
 'arabic_billion_words',
 'arabic_pos_dialect',
 'arabic_speech_corpus',
 'arcd',
 'arsentd_lev',
 'art',
 'arxiv_dataset',
 'ascent_kb',
 'aslg_pc12',
 'asnq',
 'asset',
 'assin',
 'assin2',
 'atomic',
 'autshumato',
 'facebook/babi_qa',
 'banking77',
 'bbaw_egyptian',
 'bbc_hindi_nli',
 'bc2gm_corpus',
 'beans',
 'best2009',
 'bianet',
 'bible_para',
 'big_patent',
 'billsum',
 'bing_coronavirus_query_set',
 'biomrc',
 'biosses',
 'blbooks',
 'blbooksgenre',
 'blended_skill_talk',
 'blimp',
 'blog_authorship_corpus',
 'bn_hate_speech',
 'bnl_newspapers',
 'bookcorpus',
 'bookco

In [91]:
# Load the 'train' and 'test' splits of the `sentiment140` dataset; passing a
# list to `split` returns one dataset object per requested split, in order.
train_data, test_data = datasets.load_dataset('sentiment140', split=['train', 'test'])

Found cached dataset sentiment140 (C:/Users/k0nv1ct/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997)


  0%|          | 0/2 [00:00<?, ?it/s]

In [94]:
# Modify the values in a column
def modify_function(example):
    """Collapse the `sentiment` field of one example to a binary label.

    A label equal to 0 stays 0; every other value becomes 1. The example is
    updated in place and the same object is returned, which is the shape
    `Dataset.map` expects from its callback.
    """
    if example['sentiment'] == 0:
        binary_label = 0
    else:
        binary_label = 1
    example['sentiment'] = binary_label
    return example

# Apply the label binarisation to every example of both splits.
# Note: both names are rebound to the mapped datasets, so the originals are
# no longer reachable from this point on.
train_data = train_data.map(modify_function)
test_data = test_data.map(modify_function)

  0%|          | 0/1600000 [00:00<?, ?ex/s]

  0%|          | 0/498 [00:00<?, ?ex/s]

In [96]:

# Name of the pretrained checkpoint; reused later to load the matching model
# so tokenizer and model share the same vocabulary.
transformer_name = 'bert-base-uncased'

# Tokenizer that belongs to the checkpoint above.
tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_name)

In [97]:
tokenizer.tokenize('hello world!')

['hello', 'world', '!']

In [98]:
tokenizer.encode('hello world!')

[101, 7592, 2088, 999, 102]

In [99]:
tokenizer.convert_ids_to_tokens(tokenizer.encode('hello world'))

['[CLS]', 'hello', 'world', '[SEP]']

In [100]:
tokenizer('hello world!')

{'input_ids': [101, 7592, 2088, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [101]:
def tokenize_and_numericalize_data(example, tokenizer):
    """Convert the `text` field of one example into token ids.

    Args:
        example: mapping with a 'text' entry.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True)``
            that returns a mapping containing 'input_ids'.

    Returns:
        A dict with a single key 'ids' holding the tokenizer's 'input_ids'.
    """
    encoding = tokenizer(example['text'], truncation=True)
    token_ids = encoding['input_ids']
    return dict(ids=token_ids)

In [102]:
# Tokenise every example of both splits; `fn_kwargs` forwards the tokenizer to
# the mapping function, which adds an 'ids' column to each example.
train_data = train_data.map(tokenize_and_numericalize_data, fn_kwargs={'tokenizer': tokenizer})
test_data = test_data.map(tokenize_and_numericalize_data, fn_kwargs={'tokenizer': tokenizer})

  0%|          | 0/1600000 [00:00<?, ?ex/s]

  0%|          | 0/498 [00:00<?, ?ex/s]

In [103]:
train_data['ids'][1]

[101,
 2003,
 6314,
 2008,
 2002,
 2064,
 1005,
 1056,
 10651,
 2010,
 9130,
 2011,
 3793,
 2075,
 2009,
 1012,
 1012,
 1012,
 1998,
 2453,
 5390,
 2004,
 1037,
 2765,
 2082,
 2651,
 2036,
 1012,
 27984,
 999,
 102]

In [105]:
train_data[0]

{'text': "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
 'date': 'Mon Apr 06 22:19:45 PDT 2009',
 'user': '_TheSpecialOne_',
 'sentiment': 0,
 'query': 'NO_QUERY',
 'ids': [101,
  1030,
  6942,
  13064,
  8299,
  1024,
  1013,
  1013,
  1056,
  9148,
  25856,
  2594,
  1012,
  4012,
  1013,
  1016,
  2100,
  2487,
  2480,
  2140,
  1011,
  22091,
  2860,
  2860,
  1010,
  2008,
  1005,
  1055,
  1037,
  26352,
  5017,
  1012,
  2017,
  2323,
  2050,
  2288,
  2585,
  12385,
  1997,
  2353,
  2154,
  2000,
  2079,
  2009,
  1012,
  1025,
  1040,
  102]}

In [106]:
tokenizer.vocab['!']

999

In [107]:
tokenizer.pad_token

'[PAD]'

In [108]:
tokenizer.pad_token_id

0

In [109]:
tokenizer.vocab[tokenizer.pad_token]

0

In [110]:
pad_index = tokenizer.pad_token_id

In [111]:
# Fraction of the original training split that is held out for validation.
test_size = 0.25

# Pass the notebook-wide `seed` so the train/validation partition is the same
# on every run; without it the split is drawn differently each time and the
# reported validation numbers are not reproducible.
train_valid_data = train_data.train_test_split(test_size=test_size, seed=seed)
# Note: `train_data` is rebound to the reduced training portion, so re-running
# this cell on its own would split the already-reduced data again.
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

In [112]:
# Expose only the 'ids' and 'sentiment' columns, returned as torch tensors,
# so examples can be fed straight into the collate function / DataLoader.
train_data = train_data.with_format(type='torch', columns=['ids', 'sentiment'])
valid_data = valid_data.with_format(type='torch', columns=['ids', 'sentiment'])
test_data = test_data.with_format(type='torch', columns=['ids', 'sentiment'])

In [113]:
transformer = transformers.AutoModel.from_pretrained(transformer_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [114]:
transformer.config.hidden_size

768

In [115]:

class Transformer(nn.Module):
    """Pretrained transformer encoder with a linear classification head.

    The hidden state at sequence position 0 is passed through `tanh` and a
    linear layer to produce one score per output class.

    Args:
        transformer: encoder module exposing `config.hidden_size` and
            returning an object with `last_hidden_state` when called.
        output_dim: number of output classes.
        freeze: when true, gradients are disabled for every encoder
            parameter so that only the linear head is trained.
    """

    def __init__(self, transformer, output_dim, freeze):
        super().__init__()
        self.transformer = transformer
        hidden_dim = transformer.config.hidden_size
        self.fc = nn.Linear(hidden_dim, output_dim)

        if freeze:
            for param in self.transformer.parameters():
                param.requires_grad = False

    def forward(self, ids, attention_mask=None):
        """Return class scores for a batch of token-id sequences.

        Args:
            ids: token ids, [batch size, seq len].
            attention_mask: optional mask, [batch size, seq len], non-zero at
                real tokens and zero at padding. When omitted it is derived
                from the encoder config's `pad_token_id` (if one is set), so
                padded positions are not attended to.

        Returns:
            Tensor of shape [batch size, output dim].
        """
        # ids = [batch size, seq len]
        if attention_mask is None:
            # Batches are padded to a common length by the collate function;
            # without a mask the encoder would treat padding as real input.
            pad_token_id = getattr(self.transformer.config, 'pad_token_id', None)
            if pad_token_id is not None:
                attention_mask = (ids != pad_token_id).long()
        # Attention maps are not requested: nothing downstream consumes them.
        output = self.transformer(ids, attention_mask=attention_mask)
        hidden = output.last_hidden_state
        # hidden = [batch size, seq len, hidden dim]
        cls_hidden = hidden[:, 0, :]
        prediction = self.fc(torch.tanh(cls_hidden))
        # prediction = [batch size, output dim]
        return prediction

In [116]:
train_data['sentiment'].unique()

tensor([0, 1])

In [117]:
# One output unit per distinct label in the training data (the previous cell
# shows the labels are 0 and 1).
output_dim = len(train_data['sentiment'].unique())
# False -> the encoder weights are fine-tuned together with the linear head.
freeze = False

model = Transformer(transformer, output_dim, freeze)

In [118]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Parameters with `requires_grad` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,483,778 trainable parameters


In [119]:
# Learning rate for the Adam optimiser below.
lr = 1e-5

# Adam over all model parameters (encoder and classification head).
optimizer = optim.Adam(model.parameters(), lr=lr)

In [120]:
criterion = nn.CrossEntropyLoss()

In [121]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [122]:
device

device(type='cpu')

In [123]:
# Move the model and the loss module to the selected device so they live on
# the same device as the batches moved there inside train/evaluate.
model = model.to(device)
criterion = criterion.to(device)

In [124]:
def collate(batch, pad_index):
    """Merge a list of examples into one padded batch.

    Args:
        batch: sequence of mappings, each with an 'ids' tensor (1-D, variable
            length) and a 'sentiment' tensor.
        pad_index: value used to right-pad shorter 'ids' sequences.

    Returns:
        Dict with 'ids' (all sequences padded to the longest one, batch
        dimension first) and 'sentiment' (labels stacked into one tensor).
    """
    sequences = []
    for example in batch:
        sequences.append(example['ids'])
    padded_ids = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_index)
    labels = torch.stack([example['sentiment'] for example in batch])
    return {'ids': padded_ids, 'sentiment': labels}

In [125]:
# Number of examples per batch for all three loaders.
batch_size = 8

# Bind the padding index so the DataLoader can call `collate(batch)` with a
# single argument. Note: this rebinds the name `collate` to the partial.
collate = functools.partial(collate, pad_index=pad_index)

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = torch.utils.data.DataLoader(train_data, 
                                               batch_size=batch_size, 
                                               collate_fn=collate, 
                                               shuffle=True)

valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, collate_fn=collate)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, collate_fn=collate)

In [126]:

def train(dataloader, model, criterion, optimizer, device):
    """Run one training pass over `dataloader` and return per-batch metrics.

    NOTE: a second definition of `train` appears in the following cell and
    rebinds this name when both cells are run.

    Args:
        dataloader: iterable of batches with 'ids' and 'sentiment' tensors.
        model: module mapping token ids to class scores.
        criterion: loss callable taking (prediction, label).
        optimizer: optimiser stepping the model's parameters.
        device: device the batch tensors are moved to.

    Returns:
        (epoch_losses, epoch_accs): two lists with one float per batch.
    """

    model.train()
    epoch_losses = []
    epoch_accs = []

    for batch in tqdm.tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)
        label = batch['sentiment'].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        # get_accuracy returns a plain Python float, which has no `.item()`;
        # float() works for a float as well as for a 0-dim tensor.
        epoch_accs.append(float(accuracy))

    return epoch_losses, epoch_accs

In [127]:
import tqdm
import sys

def train(dataloader, model, criterion, optimizer, device):
    """Run one training pass over `dataloader`, updating the model per batch.

    Args:
        dataloader: iterable of batches with 'ids' and 'sentiment' tensors.
        model: module mapping token ids to class scores.
        criterion: loss callable taking (predictions, labels).
        optimizer: optimiser stepping the model's parameters.
        device: device the batch tensors are moved to.

    Returns:
        (losses, accuracies): two lists with one entry per batch.
    """
    model.train()
    batch_losses, batch_accs = [], []

    progress = tqdm.tqdm(dataloader, desc='Training...', file=sys.stdout)
    for batch in progress:
        token_ids = batch['ids'].to(device)
        targets = batch['sentiment'].to(device)

        # Forward pass and loss for this batch.
        logits = model(token_ids)
        batch_loss = criterion(logits, targets)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Record the metrics of this batch.
        batch_losses.append(batch_loss.item())
        batch_accs.append(get_accuracy(logits, targets))

    return batch_losses, batch_accs


In [128]:
def evaluate(dataloader, model, criterion, device):
    """Evaluate the model on `dataloader` without updating any weights.

    Args:
        dataloader: iterable of batches with 'ids' and 'sentiment' tensors.
        model: module mapping token ids to class scores.
        criterion: loss callable taking (prediction, label).
        device: device the batch tensors are moved to.

    Returns:
        (epoch_losses, epoch_accs): two lists with one float per batch.
    """

    model.eval()
    epoch_losses = []
    epoch_accs = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            label = batch['sentiment'].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            # get_accuracy returns a plain Python float, which has no
            # `.item()`; float() works for a float and for a 0-dim tensor.
            epoch_accs.append(float(accuracy))

    return epoch_losses, epoch_accs

In [129]:
def get_accuracy(prediction, label):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        prediction: 2-D tensor of class scores, one row per example.
        label: tensor of target class indices, one per example.

    Returns:
        Accuracy over the batch as a Python float.
    """
    n_examples, _ = prediction.shape
    hits = prediction.argmax(dim=-1).eq(label)
    return hits.sum().item() / n_examples

In [130]:
# Number of full passes over the training data.
n_epochs = 5
# Lowest mean validation loss seen so far; used to decide when to checkpoint.
best_valid_loss = float('inf')

# Per-batch metrics accumulated across all epochs (used for the plots below).
train_losses = []
train_accs = []
valid_losses = []
valid_accs = []

for epoch in range(n_epochs):

    # Each call returns two lists with one value per batch.
    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

    train_losses.extend(train_loss)
    train_accs.extend(train_acc)
    valid_losses.extend(valid_loss)
    valid_accs.extend(valid_acc)
    
    # Unweighted mean over batches for this epoch's summary.
    epoch_train_loss = np.mean(train_loss)
    epoch_train_acc = np.mean(train_acc)
    epoch_valid_loss = np.mean(valid_loss)
    epoch_valid_acc = np.mean(valid_acc)
    
    # Keep only the weights of the epoch with the best validation loss.
    if epoch_valid_loss < best_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'transformer.pt')
    
    print(f'Epoch: {epoch+1}/{n_epochs}')
    print(f'Train Loss: {epoch_train_loss:.3f}, Train Accuracy: {epoch_train_acc:.3f}')
    print(f'Validation Loss: {epoch_valid_loss:.3f}, Validation Accuracy: {epoch_valid_acc:.3f}')
    print()

# Plot the per-batch training and validation losses.
# The x axis is the batch index within each list; the two lists come from
# different dataloaders, so the curves do not have the same length.
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(valid_losses, label='Validation Loss')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

# Plot the per-batch training and validation accuracies (same x-axis caveat
# as for the loss figure).
plt.figure(figsize=(10, 5))
plt.plot(train_accs, label='Train Accuracy')
plt.plot(valid_accs, label='Validation Accuracy')
plt.xlabel('Iterations')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')
plt.show()


Training...:   0%|          | 16/150000 [00:34<90:54:33,  2.18s/it]


KeyboardInterrupt: 

In [None]:
# Assigning a Python variable does not change the process environment, so the
# value is also written to os.environ, where the CUDA runtime can see it.
# NOTE(review): this presumably has to run before the first CUDA call to take
# effect, i.e. near the top of the notebook rather than here — confirm.
import os

CUDA_LAUNCH_BLOCKING = 1
os.environ['CUDA_LAUNCH_BLOCKING'] = str(CUDA_LAUNCH_BLOCKING)