In [1]:
# !pip install datasets
# !pip install -U scikit-learn

In [2]:
# !pip install transformers[torch]

In [3]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import f1_score
import transformers
import random

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from tqdm import tqdm

In [5]:
random.seed(13)
torch.manual_seed(13)

<torch._C.Generator at 0x733eea181ef0>

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [7]:
torch.cuda.is_available()

True

In [8]:
train_df = pd.read_csv('train.csv')

In [9]:
train_df

Unnamed: 0,line_id,summary,question,answer,is_hallucination
0,0,Херманус Питер (Дик) Логгере (нидерл. Hermanus...,В каком городе проходил чемпионат мира по хокк...,В Хилверсюме.,1
1,1,Ходуткинские горячие источники (Худутские горя...,Как называется район в который входят источники?,Елизовским районом,0
2,2,Чёрная вдова (лат. Latrodectus mactans) — вид ...,Для кого опасны пауки-бокоходы?,Для рыб.,1
3,3,"Рысь — река в России, протекает по территориям...",Какова длина реки Рысь?,5 км.,1
4,4,"И́се (яп. 伊勢市), ранее Удзиямада — город в Япон...",Что такое Исе?,"Исе — это небольшой город в Японии, который не...",1
...,...,...,...,...,...
1045,1045,Восемь незарегистрированных правителей (яп. 欠史...,Что связывают с императорским домом?,ни одна легенда не связывает их с Японией,0
1046,1046,«Гастингс» (англ. Hastings) — название военной...,Какой род войск проводил военную операцию под ...,Танковые войска.,1
1047,1047,Bacillus cereus (лат.) — вид грамположительных...,У кого вызывает токсикоз?,У растений.,1
1048,1048,Стеклова́та — волокнистый минеральный теплоизо...,Какой способностью обладает стекловата?,Стекловата обладает способностью проводить эле...,1


In [10]:
train_df['text'] = train_df['summary'] + " " + train_df['question'] + " " + train_df['answer']

In [11]:
#bert_name = 'bert-base-uncased'
#bert_name = 'imvladikon/charbert-bert-wiki'
bert_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_name)
#model = BertForSequenceClassification.from_pretrained(bert_name, num_labels=2).to(device)



In [12]:
def tokenize_function(examples):
    return tokenizer(examples, #["text"], 
            padding='max_length', 
            max_length = 512, 
            truncation=True) #,
            #return_tensors="pt"
    #return tokenizer(examples["text"], padding="max_length", truncation=True)


train_df['input_ids'] = train_df['text'].apply(tokenize_function)
train_df['inputC'] = train_df['summary'].apply(tokenize_function)
train_df['inputQ'] = train_df['question'].apply(tokenize_function)
train_df['inputA'] = train_df['answer'].apply(tokenize_function)

In [13]:
def split_dataframe(dataframe, proc):
    ids_shu = list(dataframe['line_id'])
    random.shuffle(ids_shu)
    test  = ids_shu[:proc]
    train_df = dataframe[~dataframe['line_id'].isin(test)]
    test_df = dataframe[dataframe['line_id'].isin(test)]
    return train_df, test_df

posdf = train_df[train_df['is_hallucination'] == 1]
negdf = train_df[train_df['is_hallucination'] == 0]

train_pos, test_pos = split_dataframe(posdf, 100)
train_neg, test_neg = split_dataframe(negdf, 100)
#train_df[train_df['is_hallucination'] == 1]

In [14]:
len(train_pos),len(test_pos),len(train_neg),len(test_neg)

(432, 100, 418, 100)

In [15]:
if False:
    train_dataset = Dataset.from_pandas( pd.concat([train_pos, train_neg]).sample(frac=1) [['text', 'is_hallucination']] ) 
    val_dataset   = Dataset.from_pandas( pd.concat([test_pos, test_neg]).sample(frac=1) [['text', 'is_hallucination']] ) 
    train_dataset = train_dataset.rename_column("is_hallucination", "labels")
    val_dataset   = val_dataset.rename_column("is_hallucination", "labels")
    
    if False:
        train_dataset = Dataset.from_pandas(train_df[['text', 'is_hallucination']])
        train_dataset = train_dataset.rename_column("is_hallucination", "labels")
        
        train_test_split = train_dataset.train_test_split(test_size=0.2)
        train_dataset = train_test_split['train']
        val_dataset = train_test_split['test']

In [16]:
class ClassHall(torch.nn.Module):
    def __init__(self, hidden_size, output_size, activationfunc):
        super().__init__()
        self.BERTC = transformers.BertModel.from_pretrained(bert_name).to(device)
        self.BERTQ = transformers.BertModel.from_pretrained(bert_name).to(device)
        self.BERTA = transformers.BertModel.from_pretrained(bert_name).to(device)
        #self.linear1 = torch.nn.Linear(768, hidden_size)
        #self.act1 = activationfunc()
        #self.linear2 = torch.nn.Linear(hidden_size, output_size)

        self.linear2 = torch.nn.Linear(768*3, output_size)
        
        self.act2 = torch.nn.Softmax(dim = 1) # activationfunc()
        self.tofit = [False, False, False]
        
    def forward(self, inputC, inputQ, inputA,
                      attentionC, attentionQ, attentionA):
        #print(x)
        #batches, x2, target, view = x
        #h = self.BERTencoder( x, return_dict=False)[1]# self.BERTencoder(x)

        hC = self.BERTC (input_ids=inputC,
                         attention_mask=attentionC,
                         return_dict=False)[1]
        if not self.tofit[0]:
            hC = hC.detach()
        hQ = self.BERTQ (input_ids=inputQ,
                         attention_mask=attentionQ,
                         return_dict=False)[1]
        if not self.tofit[0]:
            hQ = hQ.detach()
        hA = self.BERTA (input_ids=inputA,
                         attention_mask=attentionA,
                         return_dict=False)[1]
        if not self.tofit[0]:
            hA = hA.detach()
        h = torch.cat([hC, hQ, hA], dim = 1)
        #h = self.linear1(h)
        #h = self.act1(h)
        
        h = self.linear2(h)
        h = self.act2(h)
        return h

In [17]:
model = ClassHall(256, 2, torch.nn.Tanh)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: 

In [18]:
getlr = 2e-5
weight_decay=0.01
criterion   = torch.nn.CrossEntropyLoss()
model = model.to(device)
optimizer   = torch.optim.AdamW(model.parameters(), lr= getlr) 

In [19]:
# len(train_pos),len(test_pos),len(train_neg),len(test_neg)

neg_value = [-1.0, 0.0][1] #

In [20]:
max_len = max(len(train_pos), len(train_neg))
batch_size = 1

#it_pos = list(train_pos['input_ids'])
#it_neg = list(train_neg['input_ids'])
it_posC = list(train_pos['inputC'])
it_posQ = list(train_pos['inputQ'])
it_posA = list(train_pos['inputA'])
it_negC = list(train_neg['inputC'])
it_negQ = list(train_neg['inputQ'])
it_negA = list(train_neg['inputA'])
model.train()
epochs = 3
for epoch in range(epochs):
    model.tofit = [False, False, False]
    model.tofit[epoch] = True
    for i in tqdm(range(max_len// batch_size)):
        groupetensor = []
        groupetarget = []
        groupatt = []
        for it in range(batch_size):
            assert batch_size == 1
            iterid = i*batch_size + it
            for party_list in [[it_posC, it_negC], 
                               [it_posQ, it_negQ], 
                               [it_posA, it_negA] ]:
                one_ids = torch.tensor([it_list[ iterid % len(it_list) ] ['input_ids']      for it_list  in party_list]).to(device)
                one_att = torch.tensor([it_list[ iterid % len(it_list) ] ['attention_mask'] for it_list  in party_list]).to(device)
                groupetensor += [one_ids]
                groupatt    += [one_att]
            #groupetensor+=[ pos_ids, neg_ids ]
            groupetarget+=[ [neg_value,1.0],[1.0,neg_value]   ]
            #groupatt    +=[ pos_att, neg_att ]
        #auggroups = [made_aug(onegroup)  for onegroup in  groupetensor]
        #groupetensor = groupetensor + auggroups
        #line_tensor = torch.cat([torch.cat(onegroup).reshape([1,*torch.cat(onegroup).shape ]) for onegroup in groupetensor]).to(device)
        #fit_target  = torch.cat([torch.cat(onegroup).reshape([1,*torch.cat(onegroup).shape ]) for onegroup in groupetarget]).to(device)
        #line_tensor = torch.tensor(groupetensor).to(device) # torch.cat([onegroup for onegroup in groupetensor]).to(device)
        #attention = torch.tensor(groupatt).to(device)
        fit_target =  torch.tensor(groupetarget).to(device) #torch.cat([onegroup.reshape([1,*onegroup.shape ]) for onegroup in groupetarget]).float().to(device)
        output = model( *groupetensor ,*groupatt  ) # line_tensor, attention ) #[:, -1, :]
        loss = criterion(output, fit_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    

  0%|▏                                                                                  | 1/432 [00:00<02:16,  3.16it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacty of 11.76 GiB of which 22.31 MiB is free. Including non-PyTorch memory, this process has 11.73 GiB memory in use. Of the allocated memory 11.03 GiB is allocated by PyTorch, and 413.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [21]:
# len(train_pos),len(test_pos),len(train_neg),len(test_neg)
it_pos = list(test_pos['input_ids'])
it_neg = list(test_neg['input_ids'])

model.eval()

tp = 0
fn = 0
for pos_ids in it_pos:
    line_tensor = torch.tensor(pos_ids['input_ids'])
    line_tensor = line_tensor.reshape(1, *line_tensor.shape ).to(device)
    
    attention = torch.tensor(pos_ids['attention_mask'])
    attention = attention.reshape(1, *attention.shape ).to(device)
    output = model(line_tensor, attention)
    right_now = (output.argmax(dim = 1).detach().cpu() == torch.tensor([[neg_value,1.0]]).argmax(dim=1)).item()
    tp += right_now
    fn += 1-right_now

tn = 0
fp = 0
for neg_ids in it_neg:
    line_tensor = torch.tensor(neg_ids['input_ids'])
    line_tensor = line_tensor.reshape(1, *line_tensor.shape ).to(device)
    attention = torch.tensor(neg_ids['attention_mask'])
    attention = attention.reshape(1, *attention.shape ).to(device)
    output = model(line_tensor,attention)
    right_now = (output.argmax(dim = 1).detach().cpu() == torch.tensor([[1.0,neg_value]]).argmax(dim=1)).item()
    tn += right_now
    fp += 1-right_now

rec = tp/(tp+fn)
pre = tp/(tp+fp)
f1 = 2*(pre*rec)/(pre+rec)
f1

0.6666666666666666

In [23]:
tp,fn,tn,fp

(100, 0, 0, 100)

In [23]:
#torch.save(model.state_dict(), '0.85_f1_rubert.pt')

In [28]:
#torch.save(model.state_dict(), '0.76f1.pt')

In [33]:
#model.load_state_dict(torch.load('0.98f1.pt'))
#model.load_state_dict(torch.load('0.76f1.pt'))

<All keys matched successfully>