Установка зависимостей: !pip install transformers jsonlines

In [1]:
import json
import jsonlines
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import *
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

from tqdm.notebook import tqdm

In [2]:
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
tokenizer.add_special_tokens({'pad_token': '<pad>'})

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


0

In [3]:
def text_splitter(text, amount=100000):
    tokens = text.split(' ')
    new_text = ' '.join(tokens[-amount:])
    return new_text

def get_X_y_for_gpt(data_json_file):
    X, y = [], []
    with open(data_json_file, 'r') as json_file:
        json_list = list(json_file)
        for json_str in json_list:
            item = json.loads(json_str)
            
            text = item['passage']['text'].replace('@header', '')
            
            correct_answers = []
            questions = item['qas']
            query = questions[0]['query']
            for q in questions:
                ans = q['answers']
                for a in ans:
                    X.append(text_splitter(text+' Query: '+query.replace('@placeholder', a['text'])))
                    y.append(1)
                    correct_answers.append(a['text'])
                    
            entities = item['passage']['entities']
            str_entities = []
            for entity in entities:
                start = entity['start']
                end = entity['end']
                str_entities.append(text[start:end])
                if text[start:end] not in correct_answers:
                    X.append(text_splitter(text+' Query: '+query.replace('@placeholder', text[start:end])))
                    y.append(0)
    return X, y

def get_X_for_gpt(data_json_file):
    X = []
    d = {}
    with open(data_json_file, 'r') as json_file:
        json_list = list(json_file)
        indexes = []
        for json_str in json_list:
            item = json.loads(json_str)
            
            text = item['passage']['text']
            
            correct_answers = []
            questions = item['qas']
            query = questions[0]['query']
                    
            entities = item['passage']['entities']
            str_entities = []
            indexes.append(len(entities))
            for entity in entities:
                start = entity['start']
                end = entity['end']
                if item['idx'] not in d.keys():
                    d[item['idx']] = []
                d[item['idx']].append([start, end, text[start:end]])
                str_entities.append(text[start:end])
                if text[start:end] not in correct_answers:
                    X.append(text_splitter(text+' Query: '+query.replace('@placeholder', text[start:end])))
    return X, indexes, d

In [4]:
X_train, y_train = get_X_y_for_gpt('RuCoS/train.jsonl')
X_test, y_test = get_X_y_for_gpt('RuCoS/val.jsonl')

In [5]:
X_train = X_train[100000:160000]
y_train = y_train[100000:160000]

X_test = X_test[20000:40000]
y_test = y_test[20000:40000]

In [6]:
maxl = 1024
batch_size = 8

In [7]:
X_train  = [tokenizer.encode(q, max_length=maxl, padding='max_length', truncation=True, pad_to_max_length=True) for q in tqdm(X_train)]
X_train = [i if i else [0] * maxl for i in X_train]
X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
train_data = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(
    train_data,
    sampler=RandomSampler(train_data),
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60000.0), HTML(value='')))




In [8]:
X_test  = [tokenizer.encode(q, max_length=maxl, padding='max_length', truncation=True, pad_to_max_length=True) for q in tqdm(X_test)]
X_test = [i if i else [0] * maxl for i in X_test]
X_test = torch.tensor(X_test)
y_test = torch.tensor(y_test)
validation_data = TensorDataset(X_test, y_test)
validation_dataloader = DataLoader(
    validation_data,
    sampler=SequentialSampler(validation_data),
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [9]:
model = AutoModelForSequenceClassification.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2', num_labels=2)
model.cuda()
model.config.pad_token_id = 0

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=551290714.0), HTML(value='')))




Some weights of the model checkpoint at sberbank-ai/rugpt3small_based_on_gpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/rugpt3small_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-6)

In [19]:
epochs = 5  # сначала обучали 3 эпохи, потом обучали еще 2

for _ in range(epochs):
    model.train()
    train_loss = 0
    
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_labels = batch
      
        optimizer.zero_grad()
      
        loss = model(b_input_ids.long(), token_type_ids=None, labels=b_labels)
        loss[0].backward()
        
        optimizer.step()
        
        train_loss += loss[0].item()
      
    print("Train Loss: {0:.5f}".format(train_loss / len(train_dataloader)))
    
    model.eval()

    valid_preds, valid_labels = [], []

    for batch in tqdm(validation_dataloader): 
        batch = tuple(t.cuda() for t in batch)
        b_input_ids, b_labels = batch

        with torch.no_grad():
            logits = model(b_input_ids.long(), token_type_ids=None)

        logits = logits[0].detach().cpu()
        label_ids = b_labels.to('cpu').numpy()

        batch_preds = torch.softmax(logits, axis=1).numpy()
        batch_labels = label_ids
        valid_preds.extend(batch_preds)
        valid_labels.extend(batch_labels)

    valid_preds = np.array(valid_preds)
    valid_preds = [round(i) for i in valid_preds[:, 1]]
    print("ROC AUC: " + str(roc_auc_score(valid_labels, valid_preds)))
    print("F1-score: " + str(f1_score(valid_labels, valid_preds)))
    print(classification_report(valid_labels, valid_preds))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7500.0), HTML(value='')))


Train Loss: 0.21077


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500.0), HTML(value='')))


ROC AUC: 0.7193316661716578
F1-score: 0.47595000631233425
              precision    recall  f1-score   support

           0       0.92      0.82      0.87     16929
           1       0.39      0.61      0.48      3071

    accuracy                           0.79     20000
   macro avg       0.66      0.72      0.67     20000
weighted avg       0.84      0.79      0.81     20000



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7500.0), HTML(value='')))


Train Loss: 0.18200


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2500.0), HTML(value='')))


ROC AUC: 0.7006012853613784
F1-score: 0.4790130886114036
              precision    recall  f1-score   support

           0       0.91      0.88      0.90     16929
           1       0.45      0.52      0.48      3071

    accuracy                           0.83     20000
   macro avg       0.68      0.70      0.69     20000
weighted avg       0.84      0.83      0.83     20000



In [20]:
X_final, indexes, d = get_X_for_gpt('RuCoS/test.jsonl')

In [21]:
X_final  = [tokenizer.encode(q, max_length=maxl, padding='max_length', truncation=True, pad_to_max_length=True) for q in tqdm(X_final)]
X_final = [i if i else [0] * maxl for i in X_final]
X_final = torch.tensor(X_final)
test_data = TensorDataset(X_final)
test_dataloader = DataLoader(
    test_data,
    sampler=SequentialSampler(test_data),
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=96996.0), HTML(value='')))




In [22]:
test_preds = []

for batch in tqdm(test_dataloader):   
    batch = batch[0]
    b_input_ids = batch.cuda()
    
    with torch.no_grad():
        logits = model(b_input_ids.long(), token_type_ids=None)

    logits = logits[0].detach().cpu()
    
    batch_preds = torch.softmax(logits, axis=1).numpy()
    test_preds.extend(batch_preds)
    
test_preds = np.array(test_preds)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12125.0), HTML(value='')))




In [23]:
def write_answers(predictions):
    s = []
    i = 0
    j = 0
    while i < len(predictions):
        m = max(predictions[i: i + indexes[j]])
        ind = predictions[i: i + indexes[j]].index(m)
        text_id = j
        i += indexes[j]
        j += 1
        start, end, text = d[text_id][ind]
        
        s.append({'idx': text_id, 'end': end, 'start': start, 'text': text})
    
    with jsonlines.open('answer.jsonl', 'w') as writer:
        writer.write_all(s)

In [24]:
write_answers([i for i in test_preds[:, 1]])