Установка зависимостей: !pip install jsonlines simpletransformers

In [1]:
import json
import jsonlines
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [3]:
def text_splitter(text, amount=100000):
    tokens = text.split(' ')
    new_text = ' '.join(tokens[-amount:])
    return new_text

def get_X_y_for_bert(data_json_file):
    X, y = [], []
    with open(data_json_file, 'r') as json_file:
        json_list = list(json_file)
        for json_str in json_list:
            item = json.loads(json_str)
            
            text = item['passage']['text'].replace('@header', '')
            
            correct_answers = []
            questions = item['qas']
            query = questions[0]['query']
            for q in questions:
                ans = q['answers']
                for a in ans:
                    X.append(text_splitter(text+' Query: '+query.replace('@placeholder', a['text'])))
                    y.append(1)
                    correct_answers.append(a['text'])
                    
            entities = item['passage']['entities']
            str_entities = []
            for entity in entities:
                start = entity['start']
                end = entity['end']
                str_entities.append(text[start:end])
                if text[start:end] not in correct_answers:
                    X.append(text_splitter(text+' Query: '+query.replace('@placeholder', text[start:end])))
                    y.append(0)
    return X, y

def get_X_for_bert(data_json_file):
    X = []
    d = {}
    with open(data_json_file, 'r') as json_file:
        json_list = list(json_file)
        indexes = []
        for json_str in json_list:
            item = json.loads(json_str)
            
            text = item['passage']['text']
            
            correct_answers = []
            questions = item['qas']
            query = questions[0]['query']
                    
            entities = item['passage']['entities']
            str_entities = []
            indexes.append(len(entities))
            for entity in entities:
                start = entity['start']
                end = entity['end']
                if item['idx'] not in d.keys():
                    d[item['idx']] = []
                d[item['idx']].append([start, end, text[start:end]])
                str_entities.append(text[start:end])
                if text[start:end] not in correct_answers:
                    X.append(text_splitter(text+' Query: '+query.replace('@placeholder', text[start:end])))
    return X, indexes, d

In [4]:
X_train, y_train = get_X_y_for_bert('RuCoS/train.jsonl')
X_test, y_test = get_X_y_for_bert('RuCoS/val.jsonl')

In [5]:
train_df = pd.DataFrame({
    'text': X_train,
    'label':y_train
})


eval_df = pd.DataFrame({
    'text': X_test,
    'label': y_test
})
train_df.head()

Unnamed: 0,text,label
0,"Наблюдатели полагают, что подоплекой теракта в...",1
1,"Наблюдатели полагают, что подоплекой теракта в...",0
2,"Наблюдатели полагают, что подоплекой теракта в...",0
3,"Наблюдатели полагают, что подоплекой теракта в...",0
4,"Наблюдатели полагают, что подоплекой теракта в...",0


In [6]:
train_df = train_df.iloc[100000:400000]
eval_df = eval_df.iloc[20000:80000]

In [9]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [10]:
model_args = {
    'num_train_epochs': 2,
    'train_batch_size': 32,
    'eval_batch_size': 32,
    'max_seq_length': 512,
    'learning_rate': 1e-5,
    'sliding_window': True,
    'reprocess_input_data': False,
    'overwrite_output_dir': True,
    'save_model_every_epoch': False,
    'save_steps': -1,
    }


In [11]:
import sklearn
model = ClassificationModel('bert', 'DeepPavlov/rubert-base-cased', use_cuda=True, args=model_args)
model.train_model(train_df, f1=sklearn.metrics.f1_score)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=642.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=711456796.0), HTML(value='')))




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1649718.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300000.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 2'), FloatProgress(value=0.0, max=9375.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 1 of 2'), FloatProgress(value=0.0, max=9375.0), HTML(value='')))





(18750, 0.20836572883434593)

In [23]:
predictions, raw_outputs = model.predict(X_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=110688.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3459.0), HTML(value='')))




In [24]:
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))
print(classification_report(y_test, predictions))
confusion_matrix(y_test, predictions)

Precision:   0.83
Recall:   0.81
F1-measure:   0.82
Accuracy:   0.88
              precision    recall  f1-score   support

           0       0.91      0.94      0.92     85770
           1       0.76      0.68      0.72     24918

    accuracy                           0.88    110688
   macro avg       0.83      0.81      0.82    110688
weighted avg       0.88      0.88      0.88    110688



array([[80321,  5449],
       [ 7879, 17039]])

In [13]:
X_final, indexes, d = get_X_for_bert('RuCoS/test.jsonl')
predictions, raw_outputs = model.predict(X_final)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=96996.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3032.0), HTML(value='')))




In [16]:
def write_answers(predictions):
    s = []
    i = 0
    j = 0
    while i < len(predictions):
        m = max(predictions[i: i + indexes[j]])
        ind = predictions[i: i + indexes[j]].index(m)
        text_id = j
        i += indexes[j]
        j += 1
        start, end, text = d[text_id][ind]
        
        s.append({'idx': text_id, 'end': end, 'start': start, 'text': text})
    
    with jsonlines.open('answer.jsonl', 'w') as writer:
        writer.write_all(s)

In [17]:
write_answers([i for i in predictions])