In [None]:
! pip install transformers

In [None]:
# Move into the project folder so the relative paths used below (./cola_public, ./models) resolve.
# NOTE(review): the path is relative, so re-running this cell from inside the folder will fail;
# presumably it also expects Google Drive to be mounted already - confirm.
cd drive/MyDrive/LXPER

In [None]:
import pandas as pd
import numpy as np

# Read the tab-separated train and dev files and keep at most the first 2000 rows of each.
# The files are read without a header row, so columns are addressed by position later on.
data = pd.read_csv(
    "./cola_public/raw/in_domain_train.tsv", sep="\t", header=None
).iloc[:2000]
test_data = pd.read_csv(
    "./cola_public/raw/in_domain_dev.tsv", sep="\t", header=None
).iloc[:2000]

In [None]:
from transformers import BartForSequenceClassification, BartTokenizer
import torch
# Set to False to keep everything on the CPU even when a GPU is present.
use_cuda = True

# Load the pretrained BART checkpoint with a sequence-classification head.
model = BartForSequenceClassification.from_pretrained('facebook/bart-large')

# Move to the GPU only when it was requested AND one is actually available.
gpu_requested_and_present = use_cuda and torch.cuda.is_available()
if gpu_requested_and_present:
    model = model.cuda()

# Put the model in training mode; as the last expression this also displays the module.
model.train()

In [None]:
from transformers import AdamW
import torch.nn as nn
import torch.optim as optim
# AdamW over every model parameter, with a learning rate of 1e-5.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

# Cross-entropy over the classifier logits; applied explicitly in the training loop.
criterion = nn.CrossEntropyLoss()

In [None]:
# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Column 3 is used as the input text and column 1 as the label.
text_batch = list(data.iloc[:, 3])
labels = list(data.iloc[:, 1])

# Tokenize all training sentences in one call; padding=True pads them to a common
# length so they come back as PyTorch tensors.
encoding = tokenizer(
    text_batch,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']

# Sanity check: both tensors should report the same shape.
for batch_tensor in (input_ids, attention_mask):
    print(batch_tensor.shape)

In [None]:
# Attach the labels to the tokenizer output so inputs, masks and labels can be
# indexed (and moved to a device) together.
encoding['labels'] = torch.tensor(labels)

In [None]:
# Resize the model's token-embedding matrix to the tokenizer's vocabulary size.
# NOTE(review): the optimizer was built before this call; if the size actually changes
# here, confirm the optimizer still tracks the resized embedding parameters.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Pick the GPU only when it was requested and is actually present, mirroring the
# guard used when the model was first loaded; otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA instead of crashing on `.to('cuda:0')`.
device = 'cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu'

# Model and tokenized training batch must live on the same device.
model = model.to(device)
encoding = encoding.to(device)

In [None]:
# Fine-tune the classifier: 3 passes over the training rows, one example per optimizer step.

# Re-assert training mode so dropout is active even if the eval cell was run earlier
# and this cell is being re-executed.
model.train()

num_examples = data.shape[0]
for epoch in range(3):
    for i in range(num_examples):
        # Take example i and restore the batch dimension -> shape (1, seq_len).
        # (Named `sample_ids` rather than `input` to avoid shadowing the builtin.)
        sample_ids = encoding['input_ids'][i].view(1, -1)
        mask = encoding['attention_mask'][i].view(1, -1)
        label = encoding['labels'][i].view(-1)

        optimizer.zero_grad()

        output = model(sample_ids, mask, labels=label)
        # The loss that is optimized is the explicit criterion applied to the logits.
        loss = criterion(output.logits, label)

        # Print the scalar loss (not the tensor repr) and the true percentage of the
        # epoch completed.
        print(loss.item())
        print('EPOCH:', epoch, '%:', round(100 * i / num_examples, 2))

        loss.backward()
        optimizer.step()

In [None]:
# Switch the model to evaluation mode before running predictions.
model.eval()

In [None]:
# Pull the dev-set inputs (column 3) and labels (column 1) out as plain lists,
# using the same column positions as for the training data.
test_text, test_label = (list(test_data.iloc[:, col]) for col in (3, 1))

In [None]:
# Tokenize every dev sentence in one call (padded to a common length) and move the
# resulting tensors to the same device as the model.
test_encoding = tokenizer(
    test_text,
    padding=True,
    truncation=True,
    return_tensors='pt',
).to(device)

# Convenience handles on the two tensors the model consumes.
test_input_ids, test_attention_mask = (
    test_encoding['input_ids'],
    test_encoding['attention_mask'],
)

In [None]:
# Predict a class index for every dev sentence, one example at a time.

# Make sure dropout is off even if this cell is run without the eval cell before it.
model.eval()

pred = []
# Inference needs no gradients; disabling autograd avoids building a graph per forward pass.
with torch.no_grad():
    for i in range(test_data.shape[0]):
        # Take example i and restore the batch dimension -> shape (1, seq_len).
        # (Named `sample_ids` rather than `input` to avoid shadowing the builtin.)
        sample_ids = test_encoding['input_ids'][i].view(1, -1)
        mask = test_encoding['attention_mask'][i].view(1, -1)

        logits = model(sample_ids, mask).logits
        # The index of the largest logit is the predicted class.
        pred.append(int(logits.argmax()))

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of dev sentences whose predicted class equals the label.
accuracy_score(test_label, pred)

In [None]:
# Persist only the model weights (state_dict) rather than the whole model object.
# The target directory is created first because torch.save fails when the parent
# directory does not exist.
import os

os.makedirs('./models', exist_ok=True)
torch.save(model.state_dict(), './models/bart_statedict')

In [None]:
# Smoke-test the fine-tuned classifier on one hand-written sentence.
# Note: this overwrites the dev-set `test_encoding` / `test_input_ids` /
# `test_attention_mask` variables with the single-sentence versions.
test_encoding = tokenizer(['I are a boy'], return_tensors='pt', padding=True, truncation=True).to(device)
test_input_ids = test_encoding['input_ids']
test_attention_mask = test_encoding['attention_mask']

# Shape (1, seq_len). Named `sample_ids` rather than `input` to avoid shadowing the builtin.
sample_ids = test_input_ids.view(1, -1)
mask = test_attention_mask.view(1, -1)

# Dropout off and no autograd graph: this is inference only.
model.eval()
with torch.no_grad():
    output = model(sample_ids, mask)

# Display the predicted class index as the cell result.
int(output.logits.argmax())
