# **Выявление побочных действий лекарств в микроблогах**

Выполнила Ирина Долгалева


# 3. Построение базовых моделей: логрег на эмбеддингах

In [0]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


Доустановим библиотеки:

In [0]:
!pip install pytorch-pretrained-bert pytorch-nlp
!pip install pytorch_transformers
!pip install emoji

Импортнем бибилотеки:

In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange, notebook
import pandas as pd
import io
import numpy as np

import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, BertAdam, BertForSequenceClassification

import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


Определим машину GPU:

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

Определим токенизатор текстов и саму модель BERT, из которой будем брать эмбеддинги:

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased", num_labels=2)

bert_model.cuda()

100%|██████████| 995526/995526 [00:00<00:00, 2013366.39B/s]
100%|██████████| 625/625 [00:00<00:00, 264925.72B/s]
100%|██████████| 714314041/714314041 [00:24<00:00, 29270554.73B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

Определим следующие функции:


1.   Функция для токенизации текстов и паддинга получившихся последовательностей

2.   Функция для применения предобученной модели BERT

In [0]:
MAX_LEN = 350

def get_tokens_padding_attention(data, tokenizer):
    index_list = []
    for item in notebook.tqdm(data):
        tokens = tokenizer.tokenize(item)[:MAX_LEN-2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']     
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)    
        index_list.append(tokens_ids)

    padded = pad_sequences(index_list, maxlen=MAX_LEN, truncating="post", padding="post")
    attention_mask = np.where(padded != 0, 1, 0)
        
    return index_list, padded, attention_mask


def apply_bert(data, model, attention_mask):
    emb_list = []
    for tokens, mask in notebook.tqdm(zip(data, attention_mask)):
        with torch.no_grad():
            word_embeds = model(tokens[None,:], attention_mask=mask[None,:])[0]
            sent_emb = word_embeds.mean(dim=1)
        emb_list.append(sent_emb)
    return torch.cat(emb_list)

Считаем данные:

In [0]:
import re
from tqdm import tqdm_notebook
import emoji

df_ru_tr = pd.read_csv('df_ru_tr.csv', encoding='utf-8-sig')
df_ru_val = pd.read_csv('df_ru_val.csv', encoding='utf-8-sig')
df_ru_tr['language'] = 'Русский'
df_ru_val['language'] = 'Русский'

df_en_tr = pd.read_csv('df_en_tr.csv', encoding='utf-8-sig')
df_en_val = pd.read_csv('df_en_val.csv', encoding='utf-8-sig')
df_en_tr['language'] = 'Английский'
df_en_val['language'] = 'Английский'

df_ru_tr['tweet_light_pr'] = df_ru_tr['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_ru_tr['tweet_light_pr'] = df_ru_tr['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))
df_ru_val['tweet_light_pr'] = df_ru_val['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_ru_val['tweet_light_pr'] = df_ru_val['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))
df_en_tr['tweet_light_pr'] = df_en_tr['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_en_tr['tweet_light_pr'] = df_en_tr['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))
df_en_val['tweet_light_pr'] = df_en_val['tweet'].apply(lambda x: re.sub("(?:\@|https?\://)\S+", "", x))
df_en_val['tweet_light_pr'] = df_en_val['tweet_light_pr'].apply(lambda x: ''.join(ch for ch in x if ch not in emoji.UNICODE_EMOJI))

df_tr = df_ru_tr.append(df_en_tr)
df_val = df_ru_val.append(df_en_val)

X_train, X_test = df_tr[['tweet_pr']], df_val[['tweet_pr']]
y_train, y_test = df_tr['class'], df_val['class']

mask_ru_tr = df_tr['language'] == 'Русский'
mask_en_tr = df_tr['language'] == 'Английский'
mask_ru_te = df_val['language'] == 'Русский'
mask_en_te = df_val['language'] == 'Английский'

df_tr = df_ru_tr.append(df_en_tr)
df_val = df_ru_val.append(df_en_val)

Применим к данным функции по предобработке данных и формированию численных пердставлений текстов с помощью эмбеддингов:

In [0]:
tokens, padded, attention_mask = get_tokens_padding_attention(df_tr['tweet_light_pr'], tokenizer)

train_input_ids = torch.tensor(padded).to(torch.int64).to(device)
train_attention_mask = torch.tensor(attention_mask).to(torch.int64).to(device)

train_features = apply_bert(train_input_ids, bert_model, train_attention_mask)
train_features = train_features.cpu().numpy()

train_labels = df_tr['class']

HBox(children=(FloatProgress(value=0.0, max=26634.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [0]:
tokens, padded, attention_mask = get_tokens_padding_attention(df_val['tweet_light_pr'], tokenizer)

test_input_ids = torch.tensor(padded).to(torch.int64).to(device)
test_attention_mask = torch.tensor(attention_mask).to(torch.int64).to(device)

test_features = apply_bert(test_input_ids, bert_model, test_attention_mask)
test_features = test_features.cpu().numpy()

test_labels = df_val['class']

HBox(children=(FloatProgress(value=0.0, max=6656.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Функции для определения оптимального порога отсечения модели по вероятности и расчета точности модели (precision, recall и т.д.):

In [0]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve

def get_predict_with_opt_threthold(y_tr, pred_tr, y_te, pred_te):
    fpr, tpr, thresholds = roc_curve(y_tr, pred_tr)

    f1_scores = []
    for th in thresholds:
        prediction = np.zeros_like(pred_tr)
        prediction[pred_tr >= th] = 1
        f1_scores.append(f1_score(y_tr, prediction))
    
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]

    prediction_tr = np.zeros_like(pred_tr)
    prediction_tr[pred_tr >= optimal_threshold] = 1
    prediction_te = np.zeros_like(pred_te)
    prediction_te[pred_te >= optimal_threshold] = 1
  
    print('Optimal threshold is', optimal_threshold)
    print('F1_train =', np.max(f1_scores))
    print('F1_test =', f1_score(y_te, prediction_te))
    print('Gini train =', 2 * roc_auc_score(y_tr, prediction_tr) - 1)
    
    return prediction_tr, prediction_te

def get_scores(y, pred, prediction):
    print('Precision =', precision_score(y, prediction))
    print('Recall =', recall_score(y, prediction))
    print('F1_score =', f1_score(y, prediction))
    print('Gini =', 2 * roc_auc_score(y, pred) - 1)

def get_f1(y_tr, pred_tr, y_te, pred_te):
    print('F1_all train =', f1_score(y_tr, pred_tr), ', F1_all test =', f1_score(y_te, pred_te))
    print('F1_ru train =', f1_score(y_tr[mask_ru_tr], pred_tr[mask_ru_tr]), ', F1_ru test =', f1_score(y_te[mask_ru_te], pred_te[mask_ru_te]))
    print('F1_en train =', f1_score(y_tr[mask_en_tr], pred_tr[mask_en_tr]), ', F1_en test =', f1_score(y_te[mask_en_te], pred_te[mask_en_te]))

    print('\nMACRO')
    print('F1_all train =', f1_score(y_tr, pred_tr, average='macro'), ', F1_all test =', f1_score(y_te, pred_te, average='macro'))
    print('F1_ru train =', f1_score(y_tr[mask_ru_tr], pred_tr[mask_ru_tr], average='macro'), ', F1_ru test =', f1_score(y_te[mask_ru_te], pred_te[mask_ru_te], average='macro'))
    print('F1_en train =', f1_score(y_tr[mask_en_tr], pred_tr[mask_en_tr], average='macro'), ', F1_en test =', f1_score(y_te[mask_en_te], pred_te[mask_en_te], average='macro'))

Построим логистическую регрессию на BERT-эмбеддингах:

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

train_features = train_last_hidden_states[0].mean(dim=1).numpy()
test_features = test_last_hidden_states[0].mean(dim=1).numpy()

lr = LogisticRegression(solver='lbfgs', random_state=123, max_iter=1000)
# parameters = {
#               'fit_intercept': [False, True],
#               'C': [0.0001, 0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25]
#              }
# clf = GridSearchCV(lr, parameters, cv=10, scoring='f1')
clf = LogisticRegression(penalty='l1', solver='saga', fit_intercept=False, random_state=123, max_iter=1000,\
                        C=0.1)

clf.fit(train_features, train_labels)

pred = clf.predict_proba(test_features)[:,1]
2 * roc_auc_score(test_labels, pred) - 1

0.674859869004993

Определение оптимального порога отсечения и точность модели:

In [0]:
pred_lr_tr = clf.predict_proba(train_features)[:,1]
pred_lr_te = clf.predict_proba(test_features)[:,1]
prediction_lr_tr, prediction_lr_te = get_predict_with_opt_threthold(train_labels, pred_lr_tr, test_labels, pred_lr_te)

get_scores(test_labels, pred_lr_te, prediction_lr_te)

Optimal threshold is 0.18098810786779743
F1_train = 0.4560343456829385
F1_test = 0.4103982300884956
Gini train = 0.48870301132243554
Precision = 0.3089092422980849
Recall = 0.6112026359143328
F1_score = 0.4103982300884956
Gini = 0.674859869004993


In [0]:
get_f1(train_labels, prediction_lr_tr, test_labels, prediction_lr_te)

F1_all train = 0.4560343456829385 , F1_all test = 0.4103982300884956
F1_ru train = 0.4180451127819549 , F1_ru test = 0.3940149625935162
F1_en train = 0.46622302883645894 , F1_en test = 0.41506751954513144

MACRO
F1_all train = 0.6916072875735836 , F1_all test = 0.6588674043349293
F1_ru train = 0.6733543536260005 , F1_ru test = 0.6510369932150328
F1_en train = 0.6964788924248169 , F1_en test = 0.6610943059863112


In [0]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(test_labels, prediction_lr_te))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91      6049
           1       0.31      0.61      0.41       607

    accuracy                           0.84      6656
   macro avg       0.63      0.74      0.66      6656
weighted avg       0.90      0.84      0.86      6656

