In [1]:
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

from transformers import BertModel, BertTokenizer

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import Dataset

In [2]:
class text_dataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.data_list = list(zip(df['text'], df['target']))
        
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, index):
        tokenized_tweet = self.tokenizer.tokenize(self.data_list[index][0])
        
        if len(tokenized_tweet) > self.max_seq_length:
            tokenized_tweet = tokenized_tweet[:self.max_seq_length]
            
        ids_tweet  = self.tokenizer.convert_tokens_to_ids(tokenized_tweet)
        padding = [0] * (self.max_seq_length - len(ids_tweet))
        ids_tweet += padding
        
        assert len(ids_tweet) == self.max_seq_length
        
        ids_tweet = torch.tensor(ids_tweet)
        target = self.data_list[index][1]
        
        return ids_tweet, target

In [7]:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 768)
        self.relu = nn.ReLU()

    def forward(self, input_id):
        _, pooled_output = self.bert(input_ids=input_id, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [4]:
test_size = 0.2
holdout_size = 0.1

df = pd.read_csv('./data/train.csv')

df_train, df_val, df_test = np.split(
    df.sample(frac=1, random_state=42),
    [int((1-holdout_size-test_size)*len(df)), int((1-holdout_size)*len(df))]
)

In [17]:
def train(model, train_data, val_data, learning_rate, epochs, tokenizer, max_seq_length=128):
    print('Setting Up...')
    
    train_ds, val_ds = text_dataset(train_data, tokenizer, max_seq_length), text_dataset(val_data, tokenizer, max_seq_length)
    train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=32)
        
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    
    print('Starting Training...')
    
    since = time.time()
    for epoch in range(epochs):
        print(f'Epoch {epoch}/{epochs - 1}')
        print('-' * 10)

        for phase, dl in {'train':train_dataloader, 'val':val_dataloader}.items():
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            tweet_corrects = 0
            
            for inputs, tweet in dl:
                inputs = inputs.to(device) 
                tweet = tweet.to(device)
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    outputs = nn.functional.softmax(outputs,dim=1)
                    loss = criterion(outputs, torch.max(tweet.float())) # torch.max(tweet.float(), 1)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                tweet_corrects += torch.sum(torch.max(outputs, 1)[1] == torch.max(tweet, 1)[1])
                
            epoch_loss = running_loss / dataset_sizes[phase]
            tweet_acc = tweet_corrects.double() / dataset_sizes[phase]

            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} tweet_acc: {:.4f}'.format(phase, tweet_acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(float(best_loss)))
    
    model.load_state_dict(best_model_wts)
    
    return model

In [18]:
epochs = 5
model = BertClassifier()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
lr = 1e-6

train(model, df_train, df_val, lr, epochs, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Setting Up...
Starting Training...
Epoch 0/4
----------


ValueError: Using a target size (torch.Size([])) that is different to the input size (torch.Size([32, 768])) is deprecated. Please ensure they have the same size.