In [1]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import transformers

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import metrics

In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU
# so the notebook still executes on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Defining some key variables that will be used later on in the training
MAX_LEN = 100            # token budget per example passed to the tokenizer
TRAIN_BATCH_SIZE = 8     # batch size of the training DataLoader
VALID_BATCH_SIZE = 4     # batch size of the evaluation DataLoader
EPOCHS = 50              # number of passes over the training loader
LEARNING_RATE = 1e-06    # learning rate handed to the optimizer
# Tokenizer matching the 'bert-base-uncased' checkpoint that the model cell loads.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)


In [3]:
# Read both CSV files from the working directory in one pass.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one row of a DataFrame per item.

    Args:
        dataframe: frame exposing a ``text`` column and a ``target`` column.
        tokenizer: object providing ``encode_plus`` (e.g. a Hugging Face
            tokenizer); it must return ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: number of tokens each example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return it as tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (``torch.long``)
            and ``targets`` (``torch.float`` scalar).
        """
        # Positional lookup: the DataLoader hands us 0..len-1, which only
        # coincides with the frame's labels when its index has been reset.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            # Ask for truncation explicitly so every example has exactly
            # max_len tokens and batches can be stacked.
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [5]:
# Work on a copy holding only the two columns the dataset class reads.
new_df = df_train[['text', 'target']].copy()

# Fraction of rows sampled for training; the remaining rows are held out.
train_size = 0.8
_sampled_rows = new_df.sample(frac=train_size, random_state=200)
# Hold-out split = every row that was not sampled, re-indexed from zero.
test_dataset = new_df.drop(index=_sampled_rows.index).reset_index(drop=True)
train_dataset = _sampled_rows.reset_index(drop=True)

In [6]:
# Wrap each split in the tokenizing dataset, sharing the tokenizer and length budget.
training_set = CustomDataset(dataframe=train_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = CustomDataset(dataframe=test_dataset, tokenizer=tokenizer, max_len=MAX_LEN)

In [7]:
# DataLoader keyword arguments for the training split: reshuffle each epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader keyword arguments for the held-out split. Evaluation does not
# benefit from shuffling; a fixed order keeps the collected predictions and
# targets in the same row order on every validation pass.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

In [8]:
# Batch iterators over the two datasets, configured by the parameter dicts.
training_loader = DataLoader(dataset=training_set, **train_params)
testing_loader = DataLoader(dataset=testing_set, **test_params)

In [9]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a single-logit linear head."""

    def __init__(self):
        super().__init__()
        # Attribute names l1/l2/l3 are part of the saved state_dict keys.
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per example (no sigmoid applied)."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here.
        _first_output, second_output = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(second_output))


In [10]:
# Instantiate the classifier (this loads the pretrained encoder weights).
model = BERTClass()
# Move the parameters to the configured device; as the cell's last expression,
# the returned module is echoed, which prints the architecture below.
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [11]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits.

    Args:
        outputs: model logits (sigmoid is applied inside the loss).
        targets: float tensor of the same shape as ``outputs``.

    Returns:
        Scalar loss tensor (default mean reduction).
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [12]:
# Adam over every model parameter (encoder and head) at the configured rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [13]:
def validation():
    """Run the global ``model`` over ``testing_loader`` without gradients.

    The model is switched to eval mode for the pass and then put back into
    whatever mode (train/eval) it was in on entry, so calling this in the
    middle of a training epoch does not leave dropout disabled.

    Returns:
        (fin_outputs, fin_targets): sigmoid probabilities, nested exactly as the
        model emits them per example, and the matching float targets, both as
        plain Python lists accumulated over all batches.
    """
    was_training = model.training
    model.eval()
    fin_targets = []
    fin_outputs = []
    try:
        with torch.no_grad():
            for data in testing_loader:
                ids = data['ids'].to(device, dtype=torch.long)
                mask = data['mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)
                fin_targets.extend(targets.cpu().tolist())
                # The model returns logits; convert to probabilities here.
                fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)
    return fin_outputs, fin_targets

In [14]:
def train(epoch, best_f1_score, log_every=10, eval_every=100):
    """Train the global ``model`` for one pass over ``training_loader``.

    Every ``eval_every`` steps the model is scored with ``validation()``; when
    the F1 score beats ``best_f1_score`` the weights are written to 'model.pt'.

    Args:
        epoch: epoch number, used only in the progress messages.
        best_f1_score: best validation F1 seen so far.
        log_every: print the batch loss every this many steps.
        eval_every: run validation every this many steps.

    Returns:
        The (possibly updated) best validation F1 score.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # Add a trailing dimension so targets line up with the (batch, 1) logits.
        targets = data['targets'].to(device, dtype=torch.float).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

        if step % eval_every == 0:
            val_outputs, val_targets = validation()
            # validation() may leave the model in eval mode; switch back so the
            # remaining steps of this epoch train with dropout enabled.
            model.train()
            # Threshold the probabilities; flatten so predictions are 1-D.
            val_preds = np.array(val_outputs).ravel() >= 0.5
            accuracy = metrics.accuracy_score(val_targets, val_preds)
            f1_score = metrics.f1_score(val_targets, val_preds)
            print(f"Accuracy Score = {accuracy}")
            print(f"F1 Score = {f1_score}")

            if f1_score > best_f1_score:
                best_f1_score = f1_score
                torch.save(model.state_dict(), 'model.pt')
                print("Model saved")
    return best_f1_score
        

In [15]:
# Best validation F1 seen so far; train() returns an updated value whenever a
# validation pass beats it.
best_f1_score = 0

# Run EPOCHS passes over the training data, carrying the best F1 across epochs.
for epoch in range(EPOCHS):
    best_f1_score = train(epoch, best_f1_score)

Epoch: 0, Loss:  0.6851012706756592
Accuracy Score = 0.43204202232435984
F1 Score = 0.6033929390187988
Model saved
Epoch: 0, Loss:  0.7208095788955688
Epoch: 0, Loss:  0.6340247392654419
Epoch: 0, Loss:  0.6317006349563599
Epoch: 0, Loss:  0.7365193963050842
Epoch: 0, Loss:  0.7259445786476135
Epoch: 0, Loss:  0.7301448583602905
Epoch: 0, Loss:  0.6558612585067749
Epoch: 0, Loss:  0.7106248140335083
Epoch: 0, Loss:  0.6737862825393677
Epoch: 0, Loss:  0.6864073276519775
Accuracy Score = 0.6776099803020355
F1 Score = 0.6564030790762772
Model saved
Epoch: 0, Loss:  0.6575319766998291
Epoch: 0, Loss:  0.6323495507240295
Epoch: 0, Loss:  0.6837042570114136
Epoch: 0, Loss:  0.5986202955245972
Epoch: 0, Loss:  0.6646816730499268
Epoch: 0, Loss:  0.5973669290542603
Epoch: 0, Loss:  0.6488198041915894
Epoch: 0, Loss:  0.6522070169448853
Epoch: 0, Loss:  0.569665253162384
Epoch: 0, Loss:  0.6168286800384521
Accuracy Score = 0.7426132632961261
F1 Score = 0.6781609195402298
Model saved
Epoch: 0, 

KeyboardInterrupt: 