# **Выявление побочных действий лекарств в микроблогах**

Выполнила Ирина Долгалева

# 5. Построение продвинутых моделей: BERT (fine-tuned head and body)

Доустановим библиотеки:

In [0]:
!pip install pytorch_transformers
!pip install transformers
!pip install emoji
!pip install pytorch-pretrained-bert

Определим машину GPU:

In [0]:
import tensorflow as tf
import torch

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Found GPU at: /device:GPU:0


'Tesla P100-PCIE-16GB'

Заимпортируем бибилотеки:

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange, notebook
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm_notebook
import emoji

import torch.nn.functional as F
from torch.utils import data
import torch.nn as nn
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertConfig, BertModel
import datetime
import os
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import random 

% matplotlib inline

Using TensorFlow backend.


Считаем и подготовим данные:

In [0]:
df_ru_tr = pd.read_csv('df_ru_tr.csv', encoding='utf-8-sig')
df_ru_val = pd.read_csv('df_ru_val.csv', encoding='utf-8-sig')
df_ru_tr['language'] = 'Русский'
df_ru_val['language'] = 'Русский'

df_en_tr = pd.read_csv('df_en_tr.csv', encoding='utf-8-sig')
df_en_val = pd.read_csv('df_en_val.csv', encoding='utf-8-sig')
df_en_tr['language'] = 'Английский'
df_en_val['language'] = 'Английский'

df_ru_tr['tweet_light_pr'] = df_ru_tr['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_ru_tr['tweet_light_pr'] = df_ru_tr['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))
df_ru_val['tweet_light_pr'] = df_ru_val['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_ru_val['tweet_light_pr'] = df_ru_val['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))

df_en_tr['tweet_light_pr'] = df_en_tr['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_en_tr['tweet_light_pr'] = df_en_tr['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))
df_en_val['tweet_light_pr'] = df_en_val['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_en_val['tweet_light_pr'] = df_en_val['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))

df_tr = df_ru_tr.append(df_en_tr)
df_tr = df_tr.sample(frac=1, random_state=123).reset_index(drop=True)

df_val = df_ru_val.append(df_en_val)
df_val = df_val.sample(frac=1, random_state=123).reset_index(drop=True)

Для удобства создадим свой класс для датасета:

In [0]:
class Tweet_Dataset(data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]    
    
    def __len__(self):
        return len(self.X)

Теперь создадим дата лоудеры:

In [0]:
BATCH_SIZE = 64

train_set = Tweet_Dataset(df_tr['tweet_light_pr'], df_tr['class'])
train_sampler = torch.utils.data.sampler.RandomSampler(train_set)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, sampler=train_sampler)

test_set = Tweet_Dataset(df_val['tweet_light_pr'], df_val['class'])
test_sampler = torch.utils.data.sampler.RandomSampler(test_set)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, sampler=test_sampler)

## Определим нейросеть, у которой обучается голова и часть тела

In [0]:
class Dense_BERT_TR(nn.Module):
    def __init__(self, out_dim):
        super(Dense_BERT_TR, self).__init__()
        
        self.bert_layer = BertModel.from_pretrained('bert-base-multilingual-cased')

        for param in self.bert_layer.parameters():
            param.requires_grad = False
            
        for param in self.bert_layer.encoder.layer[9:].parameters():
            param.requires_grad = True

        self.dropout = torch.nn.Dropout(p=0.5)
        self.lin_layer1 = nn.Linear(in_features=59, out_features=59)
        self.relu = nn.ReLU()
        self.lin_layer2 = nn.Linear(in_features=59, out_features=out_dim)

    def forward(self, input_ids, attention_mask):
        output = self.bert_layer(input_ids, attention_mask)[0]
        output = torch.mean(output, 2)
        output = self.lin_layer1(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.lin_layer1(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.lin_layer2(output)
        return output

## Подготовим данные к обучению модели

In [0]:
def prepare_batch_to_bert(batch, max_seq_length=50):

    def _get_tokens_padding_attention(data):
        tokens = tokenizer.tokenize(data)[:max_seq_length-2]
        tokens = ['[CLS]'] + tokens + ['[SEP]'] 
        ids = tokenizer.convert_tokens_to_ids(tokens) 
        ids_padded = pad_sequences([ids], maxlen=max_seq_length, truncating="post", padding="post")
        ids_padded = ids_padded[0]
        attention_mask = np.where(ids_padded != 0, 1, 0)
        return ids_padded, attention_mask

    bert_features = dict(ids=[],
                         attention_mask=[],
                         label=[])
    
    for index, (data, label) in enumerate(zip(*batch)):
        ids, attention_mask = _get_tokens_padding_attention(data)
        
        assert len(ids) == max_seq_length
        assert len(attention_mask) == max_seq_length

        bert_features['ids'].append(ids)
        bert_features['attention_mask'].append(attention_mask) 
        bert_features['label'].append(label) 

    return bert_features

In [0]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def binary_f1(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    f1 = f1_score(y.cpu().detach().numpy(), rounded_preds.cpu().detach().numpy(), zero_division=0)    
    return f1

In [0]:
import torch.nn.functional as F

def train_func(model, iterator, optimizer, criterion=None):
    epoch_loss = 0
    epoch_f1 = 0
    epoch_acc = 0
    
    model.train()
    for idx, batch in tqdm(enumerate(iterator)):
        batch = prepare_batch_to_bert(batch, max_seq_length=59)

        input_ids_tensor = torch.tensor(batch['ids'], dtype=torch.long).to(device)
        mask_tensor = torch.tensor(batch['attention_mask'], dtype=torch.long).to(device)
        labels_tensor = torch.tensor(batch['label'], dtype=torch.float).to(device)

        optimizer.zero_grad()        
        probas = model(input_ids_tensor, attention_mask=mask_tensor)
        probas = torch.flatten(probas)

        loss = criterion(probas, labels_tensor)
        acc = binary_accuracy(probas, labels_tensor)
        f1 = binary_f1(probas, labels_tensor)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc
        epoch_f1 += f1

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

In [0]:
def eval_func(model, iterator, optimizer, criterion=None):
    epoch_loss = 0
    epoch_f1 = 0
    epoch_acc = 0
    
    model.eval()
    for idx, batch in tqdm(enumerate(iterator)):
        batch = prepare_batch_to_bert(batch, max_seq_length=59)

        input_ids_tensor = torch.tensor(batch['ids'], dtype=torch.long).to(device)
        mask_tensor = torch.tensor(batch['attention_mask'], dtype=torch.long).to(device)
        labels_tensor = torch.tensor(batch['label'], dtype=torch.float).to(device)

        probas = model(input_ids_tensor, attention_mask=mask_tensor)
        probas = torch.flatten(probas)

        loss = criterion(probas, labels_tensor)
        acc = binary_accuracy(probas, labels_tensor)
        f1 = binary_f1(probas, labels_tensor)

        epoch_loss += loss.item()
        epoch_acc += acc
        epoch_f1 += f1

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

In [0]:
def predict_func(model, iterator, optimizer, criterion=None):
    model.eval()
    y, y_prob, y_pred = [], [], []
    for idx, batch in tqdm(enumerate(iterator)):
        batch = prepare_batch_to_bert(batch, max_seq_length=59)

        input_ids_tensor = torch.tensor(batch['ids'], dtype=torch.long).to(device)
        mask_tensor = torch.tensor(batch['attention_mask'], dtype=torch.long).to(device)
        labels_tensor = torch.tensor(batch['label'], dtype=torch.float).to(device)
        probas = model(input_ids_tensor, attention_mask=mask_tensor)
        probas = torch.flatten(probas)
        probas = torch.sigmoid(probas)

        y.append(labels_tensor.item())
        y_prob.append(probas.item())
        y_pred.append(torch.round(probas).item())

    return y, y_prob, y_pred


def get_f1(y_tr, pred_tr, y_te, pred_te):
    mask_ru_tr = df_tr['language'] == 'Русский'
    mask_en_tr = df_tr['language'] == 'Английский'
    mask_ru_te = df_val['language'] == 'Русский'
    mask_en_te = df_val['language'] == 'Английский'

    print('USUAL')
    print('F1_all train =', f1_score(y_tr, pred_tr), ', F1_all test =', f1_score(y_te, pred_te))
    print('F1_ru train =', f1_score(y_tr[mask_ru_tr], pred_tr[mask_ru_tr]), ', F1_ru test =', f1_score(y_te[mask_ru_te], pred_te[mask_ru_te]))
    print('F1_en train =', f1_score(y_tr[mask_en_tr], pred_tr[mask_en_tr]), ', F1_en test =', f1_score(y_te[mask_en_te], pred_te[mask_en_te]))

    print('\nMACRO')
    print('F1_all train =', f1_score(y_tr, pred_tr, average='macro'), ', F1_all test =', f1_score(y_te, pred_te, average='macro'))
    print('F1_ru train =', f1_score(y_tr[mask_ru_tr], pred_tr[mask_ru_tr], average='macro'), ', F1_ru test =', f1_score(y_te[mask_ru_te], pred_te[mask_ru_te], average='macro'))
    print('F1_en train =', f1_score(y_tr[mask_en_tr], pred_tr[mask_en_tr], average='macro'), ', F1_en test =', f1_score(y_te[mask_en_te], pred_te[mask_en_te], average='macro'))


def get_predict_with_opt_threthold(y_tr, pred_tr, y_te, pred_te):
    fpr, tpr, thresholds = roc_curve(y_tr, pred_tr)

    f1_scores = []
    for th in thresholds:
        prediction = np.zeros_like(pred_tr)
        prediction[pred_tr >= th] = 1
        f1_scores.append(f1_score(y_tr, prediction))
    
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]

    prediction_tr = np.zeros_like(pred_tr)
    prediction_tr[pred_tr >= optimal_threshold] = 1
    prediction_te = np.zeros_like(pred_te)
    prediction_te[pred_te >= optimal_threshold] = 1
  
    print('Optimal threshold is', optimal_threshold)
    print('F1_train =', np.max(f1_scores))
    print('F1_test =', f1_score(y_te, prediction_te))
    print('Gini train =', 2 * roc_auc_score(y_tr, prediction_tr) - 1)
    
    return prediction_tr, prediction_te

def get_scores(y, pred, prediction):
    print('Precision =', precision_score(y, prediction))
    print('Recall =', recall_score(y, prediction))
    print('F1_score =', f1_score(y, prediction))
    print('Gini =', 2 * roc_auc_score(y, pred) - 1)


def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False


def varname(var, dir=locals()):
    return [key for key, val in dir.items() if id(val) == id(var)][0]


def save_model(model):
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    name = varname(model) + '_' + str(datetime.datetime.now())[:19] + '.bin'
    torch.save(model_to_save.state_dict(), name)
    print('Model ' + name + ' is saved!')


def load_model(model_class, model_file):
    model = model_class(out_dim=1)
    model.load_state_dict(torch.load(model_file))
    model.to(device)
    return model

seed_all(1701)

### Обучение модели

In [0]:
model_dense = Dense_BERT_TR(out_dim=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([{'params':model_dense.bert_layer.parameters(), 'lr':1e-5},#уменьшим lr для cлоев BERT
                        {'params':model_dense.dropout.parameters()},
                        {'params':model_dense.lin_layer1.parameters()},
                        {'params':model_dense.lin_layer2.parameters()}
                       ], lr=0.00005, weight_decay=1e-4)
model_dense = model_dense.cuda()
criterion = criterion.to(device)

In [0]:
for epoch in range(5):
    train_loss, train_acc, train_f1 = train_func(model_dense, train_loader, optimizer, criterion=criterion)
    valid_loss, valid_acc, valid_f1 = eval_func(model_dense, test_loader, optimizer, criterion=criterion)
    
    print(f'\nEpoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train acc: {train_acc*100:.2f}, Train f1: {train_f1*100:.2f}%\n    Val. Loss: {valid_loss:.3f}, Val. acc: {valid_acc*100:.2f} Val. f1: {valid_f1*100:.2f}%\n')

1665it [01:47, 15.48it/s]
416it [00:17, 23.85it/s]
2it [00:00, 15.59it/s]


Epoch: 01, Train Loss: 0.443, Train acc: 87.42, Train f1: 1.58%
    Val. Loss: 0.305, Val. acc: 90.88 Val. f1: 0.00%



1665it [01:47, 15.44it/s]
416it [00:17, 23.70it/s]
2it [00:00, 15.62it/s]


Epoch: 02, Train Loss: 0.272, Train acc: 90.85, Train f1: 0.00%
    Val. Loss: 0.240, Val. acc: 90.88 Val. f1: 0.00%



1665it [01:47, 15.46it/s]
416it [00:17, 23.67it/s]
2it [00:00, 15.78it/s]


Epoch: 03, Train Loss: 0.232, Train acc: 90.86, Train f1: 0.11%
    Val. Loss: 0.232, Val. acc: 90.88 Val. f1: 0.00%



1665it [01:48, 15.38it/s]
416it [00:17, 23.68it/s]
2it [00:00, 15.44it/s]


Epoch: 04, Train Loss: 0.219, Train acc: 90.94, Train f1: 1.48%
    Val. Loss: 0.215, Val. acc: 90.88 Val. f1: 0.00%



1665it [01:47, 15.43it/s]
416it [00:17, 23.64it/s]


Epoch: 05, Train Loss: 0.205, Train acc: 91.28, Train f1: 6.86%
    Val. Loss: 0.221, Val. acc: 92.13 Val. f1: 28.87%






### Оценка точности модели:

In [0]:
pred_train_dataloader = torch.utils.data.DataLoader(train_set, sampler=train_sampler, batch_size=1)
pred_test_dataloader = torch.utils.data.DataLoader(test_set, sampler=test_sampler, batch_size=1)

y_tr, y_prob_tr, y_pred_tr = predict_func(model_dense, pred_train_dataloader, optimizer, criterion=criterion)
y_te, y_prob_te, y_pred_te = predict_func(model_dense, pred_test_dataloader, optimizer, criterion=criterion)

prediction_tr, prediction_te = get_predict_with_opt_threthold(y_tr, y_prob_tr, y_te, y_prob_te)
get_scores(y_te, y_pred_te, prediction_te)

get_f1(pd.DataFrame(y_tr), pd.DataFrame(y_pred_tr), pd.DataFrame(y_te), pd.DataFrame(y_pred_te))

26634it [05:15, 84.38it/s]
6656it [01:18, 84.49it/s]


Optimal threshold is 0.4474070370197296
F1_train = 0.6134874759152216
F1_test = 0.5020632737276479
Gini train = 0.6055098801065266
Precision = 0.4309327036599764
Recall = 0.6013179571663921
F1_score = 0.5020632737276479
Gini = 0.3234864204820438
USUAL
F1_all train = 0.5002689618074233 , F1_all test = 0.4437367303609342
F1_ru train = 0.5166475315729048 , F1_ru test = 0.5019305019305019
F1_en train = 0.49525816649104326 , F1_en test = 0.4216691068814056

MACRO
F1_all train = 0.7313857422558812 , F1_all test = 0.7006880903219384
F1_ru train = 0.7397102721088504 , F1_ru test = 0.7278054664051072
F1_en train = 0.7288403486413011 , F1_en test = 0.6902294412862948


Теперь сохраним лучшую модель:

In [0]:
save_model(model_dense)

Model model_dense_2020-05-24 15:40:49.bin is saved!
