Ссылка на файл с обученной моделью: https://drive.google.com/file/d/1-98eC_25fcy8HZUwTQ70Oz_0oa5UxsgC/view?usp=sharing

In [None]:
# Mount Google Drive inside the Colab VM; later cells read the dataset from
# and save models to paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 12.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.2 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 41.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

In [None]:
import glob
import os
import codecs
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm.notebook import tqdm

In [None]:
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    '''
    Read every file in <folder_name> whose name matches <file_pattern>.

    Returns a dictionary mapping article id -> full text of the article.
    The id is the part of the file name before the first '.', with its
    first 7 characters removed (the "article" prefix of names such as
    "article123.txt"). Files are visited in sorted file-name order.
    '''
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        article_id = os.path.basename(filename).split('.')[0][7:]
        # codecs.open does no newline translation, so character offsets in
        # the returned text line up with the raw file contents.
        with codecs.open(filename, 'r', encoding='utf8') as f:
            articles[article_id] = f.read()
    return articles

def read_predictions_from_file(filename):
    '''
    Reader for the gold file and the template output file.

    Every non-blank row holds four tab-separated fields:
    article id, label (or ? in the case of a template file),
    begin of a fragment, end of a fragment.

    Returns two parallel lists:
      articles_id -- the article id of each row
      gold_labels -- one (label, span_start, span_end) tuple per row,
                     with both offsets converted to int
    '''
    articles_id, gold_labels = [], []
    with open(filename, 'r', encoding='utf8') as f:
        for row in f:
            row = row.rstrip()
            if not row:
                # A blank line (e.g. an extra newline at the end of the
                # file) carries no span and would break the unpacking below.
                continue
            article_id, gold_label, span_start, span_end = row.split('\t')
            articles_id.append(article_id)
            gold_labels.append((gold_label, int(span_start), int(span_end)))
    return articles_id, gold_labels

In [None]:
def _emit_sentence(sentence, sent_labels, tokens, labels):
    '''Append a finished sentence and its 0/1 label; return True if it was emitted.'''
    if len(sentence) <= 1:
        # One-word "sentences" are not emitted; the caller keeps the buffer so
        # the word is merged into whatever follows.
        return False
    tokens.append(sentence)
    labels.append(1 if any(sent_labels) else 0)
    return True


def label(text, gt_labels):
    '''
    Split <text> into sentences of words and mark each sentence as 1 if any
    of its words falls inside an annotated span, else 0.

    text      : str, full article text.
    gt_labels : iterable of (label, span_start, span_end) tuples with
                character offsets into <text>. Positions span_start..span_end
                (inclusive) count as "inside". The spans may be given in any
                order and may overlap; the argument is not modified.

    Returns (tokens, labels):
      tokens -- list of sentences, each a list of word strings
      labels -- list of 0/1 ints, one per sentence

    Tokenisation rules:
      * words are maximal runs of characters that are not ASCII punctuation,
        space, newline or tab;
      * one-character words (e.g. the "s" left over from "Trump's") are dropped;
      * a word is "inside" when its last character lies in a span;
      * a sentence ends at '!', '.', '?' or a newline that directly follows a
        word, provided neither of the next two characters is lower-case;
      * whatever is still buffered when the text ends is emitted as the
        final sentence.
    '''
    delimiters = frozenset("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \n\t")
    text_len = len(text)

    # Per-character membership map: robust to overlapping, adjacent or
    # unsorted spans and leaves the caller's list untouched.
    covered = bytearray(text_len)
    for span in gt_labels:
        first = max(span[1], 0)
        stop = min(span[2] + 1, text_len)
        if first < stop:
            covered[first:stop] = b'\x01' * (stop - first)

    tokens, labels = [], []
    sentence, sent_labels = [], []
    word_start = None
    for i, ch in enumerate(text):
        if ch not in delimiters:
            if word_start is None:
                word_start = i
            continue
        if word_start is None:
            # Run of consecutive delimiters: nothing to close.
            continue
        word = text[word_start:i]
        word_start = None  # always reset, so short fragments never leak into the next word
        if len(word) <= 1:
            continue
        sentence.append(word)
        sent_labels.append(covered[i - 1])
        # if the sentence has ended
        if (ch in "!.?\n" and i < text_len - 2
                and not (text[i + 1].islower() or text[i + 2].islower())):
            if _emit_sentence(sentence, sent_labels, tokens, labels):
                sentence, sent_labels = [], []

    # Flush a trailing word (text not ending in a delimiter) and the last sentence.
    if word_start is not None and text_len - word_start > 1:
        sentence.append(text[word_start:])
        sent_labels.append(covered[text_len - 1])
    _emit_sentence(sentence, sent_labels, tokens, labels)
    return tokens, labels
    

def create_dataset(path_to_articles, path_to_labels, technique='Loaded_Language'):
    '''
    Build a sentence-level dataset from a folder of articles and a folder of
    span annotations.

    path_to_articles : folder with the article text files.
    path_to_labels   : folder with one 'article<id>.task-flc-tc.labels' file
                       per article.
    technique        : only spans whose label equals this name are treated as
                       positive (defaults to 'Loaded_Language').

    Returns (texts, labels, article_names):
      texts         -- list of sentences from all articles, each a list of words
      labels        -- list of 0/1 ints, one per sentence
      article_names -- article ids in the order they were processed
    '''
    texts, labels = [], []
    articles = read_articles_from_file_list(path_to_articles)
    article_names = list(articles.keys())
    for name in article_names:
        label_file = os.path.join(path_to_labels, 'article' + name + '.task-flc-tc.labels')
        _, gold_labels = read_predictions_from_file(label_file)
        # Keep only the requested technique, ordered by span start.
        gt_labels = sorted(
            (span for span in gold_labels if span[0] == technique),
            key=lambda span: span[1],
        )
        tokens, lbls = label(articles[name], gt_labels)
        texts.extend(tokens)
        labels.extend(lbls)

    return texts, labels, article_names

In [None]:
class ManipulationDataset(Dataset):
    '''
    Sentence-level dataset built by create_dataset().

    Each item is a tuple (input_ids, attention_mask, label, one_hot_label):
      input_ids, attention_mask -- 1-D tensors of length max_seq_len produced
                                   by the BERT tokenizer for the sentence
      label                     -- scalar tensor holding the 0/1 sentence label
      one_hot_label             -- tensor [1, 0] for label 0, [0, 1] otherwise
    '''

    def __init__(self, articles_dir, labels_dir, max_seq_len=50):
        self.articles_dir = articles_dir
        self.labels_dir = labels_dir
        self.texts, self.labels, self.article_names = create_dataset(self.articles_dir, self.labels_dir)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # Let the tokenizer pad AND truncate to max_seq_len. Slicing the ids by
        # hand cut the closing [SEP] off long sentences, and pad_to_max_length
        # is deprecated in favour of padding='max_length'.
        tokenized = self.tokenizer.encode_plus(
            ' '.join(self.texts[i]),
            add_special_tokens=True,
            max_length=self.max_seq_len,
            padding='max_length',
            truncation=True,
        )
        inputs = torch.tensor(tokenized['input_ids'])
        mask = torch.tensor(tokenized['attention_mask'])
        lbls = [1, 0] if self.labels[i] == 0 else [0, 1]
        return inputs, mask, torch.tensor(self.labels[i]), torch.tensor(lbls)

In [None]:
# Dataset folders live on the mounted Google Drive; the path is relative to
# the notebook's working directory.
root = 'drive/MyDrive/manipulation_dataset/'
train_data = ManipulationDataset(
    articles_dir=f'{root}train-articles',
    labels_dir=f'{root}train-labels-task-flc-tc',
)
test_data = ManipulationDataset(
    articles_dir=f'{root}dev-articles',
    labels_dir=f'{root}dev-labels-task-flc-tc',
)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
class BERTClassifier(torch.nn.Module):
    '''
    Thin wrapper around a two-label BertForSequenceClassification initialised
    from the 'bert-base-multilingual-cased' checkpoint.
    '''

    def __init__(self):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

    def forward(self, inputs, mask, labels=None):
        '''
        inputs : token ids passed to the wrapped model as its first argument.
        mask   : attention mask for <inputs>.
        labels : optional class labels; forwarded unchanged to the wrapped
                 model. May be omitted (e.g. at inference time).

        Returns whatever the wrapped model returns; callers in this notebook
        read its 'logits' entry.
        '''
        # Pass the mask by keyword so it cannot silently land in another
        # positional slot of the wrapped model's forward().
        return self.bert(inputs, attention_mask=mask, labels=labels)

In [None]:
model = BERTClassifier()
# Reshuffle the training sentences every epoch; without shuffle=True the
# batches are served in the same fixed article order each time.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
# Evaluation order does not matter, so the dev loader stays sequential.
test_loader = DataLoader(test_data, batch_size=64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
# The loss is applied to the two-column one-hot targets produced by the
# dataset; pos_weight scales the positive term of column 0 by 0.5 and of
# column 1 (the "label 1" column) by 5.
loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([0.5, 5]))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [None]:
# Use the GPU when the runtime has one, otherwise fall back to the CPU
# instead of failing inside .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [None]:
def calculate_metrics(outputs, labels, threshold=0.35):
    '''
    Turn two-class logits into 0/1 predictions and score them.

    outputs   : tensor of logits; softmax is taken over dim 1 and column 1 is
                used as the probability of the positive class.
    labels    : tensor of 0/1 ground-truth labels (flattened before scoring).
    threshold : a sample is predicted positive when its class-1 probability
                is strictly greater than this value.

    Returns (accuracy, precision, recall, f1). Precision, recall and F1 are
    reported as 0 when their denominator is zero.
    '''
    positive_prob = torch.softmax(outputs, dim=1)[:, 1].detach().cpu()
    classes = (positive_prob > threshold).long()
    labels = labels.cpu().ravel()
    acc = accuracy_score(labels, classes)
    # zero_division=0 keeps the value sklearn would return anyway but without
    # a warning for every batch that has no positive prediction / label.
    pr = precision_score(labels, classes, zero_division=0)
    rec = recall_score(labels, classes, zero_division=0)
    f1 = f1_score(labels, classes, zero_division=0)
    return acc, pr, rec, f1

In [None]:
def train(model, train_loader, optimizer, loss_func, epoch, checkpoint_prefix='model2'):
    '''
    Run one training epoch and save a checkpoint of the model.

    model             : module called as model(inputs, mask, labels) whose
                        result exposes a 'logits' entry.
    train_loader      : iterable of (inputs, mask, labels, labels_for_loss)
                        batches.
    optimizer         : optimizer stepping the model's parameters.
    loss_func         : callable(logits, float targets) -> scalar loss.
    epoch             : epoch index, used only in the checkpoint file name.
    checkpoint_prefix : the checkpoint is written to
                        '<checkpoint_prefix><epoch>.pt' in the working directory.

    Prints and returns the mean batch loss of the epoch.
    '''
    # Follow the model's own device rather than a notebook-level global.
    device = next(model.parameters()).device
    model.train()
    losses = []
    for inputs, mask, labels, labels_for_loss in tqdm(train_loader):
        inputs = inputs.to(device)
        mask = mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        out = model(inputs, mask, labels)['logits']
        # NOTE(review): the loss is evaluated on the CPU, presumably because
        # loss_func's pos_weight tensor was created there -- confirm.
        loss = loss_func(out.cpu(), labels_for_loss.cpu().float())
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    # The whole model object is saved (not only its state_dict).
    torch.save(model, checkpoint_prefix + str(epoch) + '.pt')
    mean_loss = np.mean(losses)
    print(f'Loss = {mean_loss}')
    return mean_loss

def validate(model, val_loader, loss_func):
    '''
    Evaluate the model on a validation loader.

    model      : module called as model(inputs, mask, labels) whose result
                 exposes a 'logits' entry.
    val_loader : iterable of (inputs, mask, labels, labels_for_loss) batches.
    loss_func  : callable(logits, float targets) -> scalar loss.

    Returns (mean batch loss, accuracy, precision, recall, f1). The four
    metrics are computed once over all predictions of the loader: averaging
    per-batch precision/recall/F1 does not give the dataset-level values.
    All five values are NaN when the loader yields no batches.
    '''
    # Follow the model's own device rather than a notebook-level global.
    device = next(model.parameters()).device
    model.eval()
    losses, all_logits, all_labels = [], [], []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for inputs, mask, labels, labels_for_loss in tqdm(val_loader):
            out = model(inputs.to(device), mask.to(device), labels.to(device))['logits'].cpu()
            losses.append(loss_func(out, labels_for_loss.cpu().float()).item())
            all_logits.append(out)
            all_labels.append(labels.cpu())
    if not losses:
        nan = float('nan')
        return nan, nan, nan, nan, nan
    acc, pr, rec, f1 = calculate_metrics(torch.cat(all_logits), torch.cat(all_labels))
    return np.mean(losses), acc, pr, rec, f1

In [None]:
# Number of passes over the training set.
epoch_num = 5

for epoch in range(epoch_num):
    # One optimisation pass; train() also writes a checkpoint for this epoch.
    train_loss = train(model, train_loader, optimizer, loss_func, epoch)
    print(f'Epoch {epoch}:\ttrain loss = {train_loss}')

    # Score the current weights on the dev split.
    val_metrics = validate(model, test_loader, loss_func)
    test_loss, test_acc, test_prec, test_rec, test_f1 = val_metrics
    print(f'\tvalidation loss = {test_loss}, accuracy={test_acc}, precision={test_prec}, recall={test_rec}, f1={test_f1}')

  0%|          | 0/249 [00:00<?, ?it/s]



Loss = 0.44184924009813364
Epoch 0:	train loss = 0.44184924009813364


  0%|          | 0/46 [00:00<?, ?it/s]

	validation loss = 0.4212140074890593, accuracy=0.8207044314381271, precision=0.3124082042417283, recall=0.6886639326856718, f1=0.4081678591293466


  0%|          | 0/249 [00:00<?, ?it/s]

Loss = 0.4228650233053778
Epoch 1:	train loss = 0.4228650233053778


  0%|          | 0/46 [00:00<?, ?it/s]

	validation loss = 0.418715308541837, accuracy=0.821383779264214, precision=0.32024291118836146, recall=0.7197616393268567, f1=0.4192224005599634


  0%|          | 0/249 [00:00<?, ?it/s]

Loss = 0.4044388478778931
Epoch 2:	train loss = 0.4044388478778931


  0%|          | 0/46 [00:00<?, ?it/s]

	validation loss = 0.41680124272470886, accuracy=0.8289524108138239, precision=0.3339578192525118, recall=0.7108244412592238, f1=0.42841411504642185


  0%|          | 0/249 [00:00<?, ?it/s]

Loss = 0.3903023344146679
Epoch 3:	train loss = 0.3903023344146679


  0%|          | 0/46 [00:00<?, ?it/s]

	validation loss = 0.4215393286684285, accuracy=0.8324710841694537, precision=0.33754425793002274, recall=0.7182778574082921, f1=0.43467946712564176


  0%|          | 0/249 [00:00<?, ?it/s]

Loss = 0.3734335567218712
Epoch 4:	train loss = 0.3734335567218712


  0%|          | 0/46 [00:00<?, ?it/s]

	validation loss = 0.4260818796313327, accuracy=0.8211399108138239, precision=0.3213764360312516, recall=0.7273617083399692, f1=0.42154323454462805


In [None]:
# Persist the whole model object as it stands after the last epoch (not just
# its state_dict) to Google Drive.
torch.save(model, "drive/MyDrive/model_044_rec68.pt")

In [None]:
import shutil

In [None]:
# Move a per-epoch checkpoint to Google Drive: 'model23.pt' is the name
# train() builds ('model2' + str(epoch) + '.pt') for epoch index 3.
shutil.move("model23.pt", "drive/MyDrive/model_04.pt")

'drive/MyDrive/model_04.pt'