# **Выявление побочных действий лекарств в микроблогах**

Выполнила Ирина Долгалева


# 2. Предобработка данных и построение базовых моделей

Установим то, по дефолту не установлено в колабе:

In [0]:
!pip install pymorphy2
!pip install spacy
!pip install emoji
!pip install pattern3
!python3 -m spacy download en
!python3 -m spacy download fr
!pip install pattern3

Заимпортируем все бибилиотеки для исследования данных:

In [0]:
import pandas as pd
import numpy as np

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from string import punctuation
import spacy
import emoji
from collections import Counter
from tqdm import tqdm_notebook
import re
import nltk
nltk.download('stopwords')

import matplotlib.pyplot as plt
% matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Предобработка текстов

In [0]:
exclude = list(punctuation) + list('0123456789—«»₽•')

def del_urls(text):
    text = re.sub("(?:\@|https?\://)\S+", "", text)
    text = text.replace('\u200d️', '')
    buf = ''.join(ch for ch in text if ch not in exclude)
    return buf

def preprocess_russian(text):
    buf = del_urls(text)
    buf = ''.join(ch for ch in buf if ch not in emoji.UNICODE_EMOJI)
    tokens = WhitespaceTokenizer().tokenize(buf.lower())
    morph = MorphAnalyzer()
    lemmas = []
    for t in tokens:
        
        if not t in stopwords.words('russian') and len(t) > 2:
            
            try:
                lemma = morph.parse(t)[0].normal_form
            except:
                lemma = t
            lemmas.append(lemma)
    return lemmas

def preprocess_english(text, language):
    buf = del_urls(text)
    buf = ''.join(ch for ch in buf if ch not in emoji.UNICODE_EMOJI)
    tokens = WhitespaceTokenizer().tokenize(buf.lower())
    lemmas = []

    if language == 'english':
        morph = WordNetLemmatizer()

    lemmas = []
    for t in tokens:
        if not t in stopwords.words(language) and len(t) > 2:
            lemma = morph.lemmatize(t)
            lemmas.append(lemma)
            print(t, lemma)
    return lemmas

Считаем исходные данные:

In [0]:
# path = ''

# df_en_tr = pd.read_csv(path + 'task2_en_training.tsv', sep="\t")
# df_en_val = pd.read_csv(path + 'task2_en_validation.tsv', sep="\t")
# print(df_en_tr.shape)
# print(df_en_val.shape)

# df_fr_tr = pd.read_csv(path + 'task2_fr_training.tsv', sep="\t")
# df_fr_val = pd.read_csv(path + 'task2_fr_validation.tsv', sep="\t")
# print(df_fr_tr.shape)
# print(df_fr_val.shape)

# df_ru_tr = pd.read_csv(path + 'task2_ru_training.tsv', sep="\t")
# df_ru_val = pd.read_csv(path + 'task2_ru_validation.tsv', sep="\t")
# print(df_ru_tr.shape)
# print(df_ru_val.shape)

# df_en_tr.head()

Предобработаем тексты и сохраним результат:

In [0]:
# df_ru_tr['tweet_pr'] = df_ru_tr['tweet'].apply(lambda x: ' '.join(preprocess_russian(x)).strip())
# df_ru_val['tweet_pr'] = df_ru_val['tweet'].apply(lambda x: ' '.join(preprocess_russian(x)).strip())
# df_ru_tr.to_csv('df_ru_tr.csv', encoding='utf-8-sig')
# df_ru_val.to_csv('df_ru_val.csv', encoding='utf-8-sig')
# print('Russian texts are preproccessed!')

# df_en_val['tweet_pr'] = df_en_val['tweet'].apply(lambda x: ' '.join(preprocess_english(x, 'english')).strip())
# df_en_val.to_csv('df_en_val.csv', encoding='utf-8-sig')
# df_en_tr['tweet_pr'] = df_en_tr['tweet'].apply(lambda x: ' '.join(preprocess_english(x, 'english')).strip())
# df_en_tr.to_csv('df_en_tr.csv', encoding='utf-8-sig')
# print('English texts are preproccessed!')

In [0]:
df_ru_tr = pd.read_csv('df_ru_tr.csv', encoding='utf-8-sig')
df_ru_val = pd.read_csv('df_ru_val.csv', encoding='utf-8-sig')
df_ru_tr['language'] = 'Русский'
df_ru_val['language'] = 'Русский'

df_en_tr = pd.read_csv('df_en_tr.csv', encoding='utf-8-sig')
df_en_val = pd.read_csv('df_en_val.csv', encoding='utf-8-sig')
df_en_tr['language'] = 'Английский'
df_en_val['language'] = 'Английский'

df_tr = df_ru_tr.append(df_en_tr)
df_val = df_ru_val.append(df_en_val)
df_tr = df_tr.sample(frac=1, random_state=123).reset_index(drop=True)
df_val = df_val.sample(frac=1, random_state=1234).reset_index(drop=True)

X_train, X_test = df_tr[['tweet_pr']], df_val[['tweet_pr']]
y_train, y_test = df_tr['class'], df_val['class']

mask_ru_tr = df_tr['language'] == 'Русский'
mask_en_tr = df_tr['language'] == 'Английский'
mask_ru_te = df_val['language'] == 'Русский'
mask_en_te = df_val['language'] == 'Английский'

Представим тексты в виде матрицы "термин-документ" и применим tf-idf: 

In [0]:
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1, 1), max_features=2000)

X_train_v = vectorizer.fit_transform(X_train['tweet_pr'])
X_train_v = X_train_v.toarray()

X_test_v = vectorizer.transform(X_test['tweet_pr'])
X_test_v = X_test_v.toarray()

len(vectorizer.vocabulary_)

2000

## Построение базовых моделей

Определение оптимального порога отсечения по вероятности:

In [0]:
def get_predict_with_opt_threthold(y_tr, pred_tr, y_te, pred_te):
    fpr, tpr, thresholds = roc_curve(y_tr, pred_tr)

    f1_scores = []
    for th in thresholds:
        prediction = np.zeros_like(pred_tr)
        prediction[pred_tr >= th] = 1
        f1_scores.append(f1_score(y_tr, prediction))
    
    optimal_idx = np.argmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]

    prediction_tr = np.zeros_like(pred_tr)
    prediction_tr[pred_tr >= optimal_threshold] = 1
    prediction_te = np.zeros_like(pred_te)
    prediction_te[pred_te >= optimal_threshold] = 1
  
    print('Optimal threshold is', optimal_threshold)
    print('F1_train =', np.max(f1_scores))
    print('F1_test =', f1_score(y_te, prediction_te))
    print('Gini train =', 2 * roc_auc_score(y_tr, prediction_tr) - 1)
    
    return prediction_tr, prediction_te

def get_scores(y, pred, prediction):
    print('Precision =', precision_score(y, prediction))
    print('Recall =', recall_score(y, prediction))
    print('F1_score =', f1_score(y, prediction))
    print('Gini =', 2 * roc_auc_score(y, pred) - 1)

def get_f1(y_tr, pred_tr, y_te, pred_te):
    print('F1_all train =', f1_score(y_tr, pred_tr), ', F1_all test =', f1_score(y_te, pred_te))
    print('F1_ru train =', f1_score(y_tr[mask_ru_tr], pred_tr[mask_ru_tr]), ', F1_ru test =', f1_score(y_te[mask_ru_te], pred_te[mask_ru_te]))
    print('F1_en train =', f1_score(y_tr[mask_en_tr], pred_tr[mask_en_tr]), ', F1_en test =', f1_score(y_te[mask_en_te], pred_te[mask_en_te]))

    print('\nMACRO')
    print('F1_all train =', f1_score(y_tr, pred_tr, average='macro'), ', F1_all test =', f1_score(y_te, pred_te, average='macro'))
    print('F1_ru train =', f1_score(y_tr[mask_ru_tr], pred_tr[mask_ru_tr], average='macro'), ', F1_ru test =', f1_score(y_te[mask_ru_te], pred_te[mask_ru_te], average='macro'))
    print('F1_en train =', f1_score(y_tr[mask_en_tr], pred_tr[mask_en_tr], average='macro'), ', F1_en test =', f1_score(y_te[mask_en_te], pred_te[mask_en_te], average='macro'))

Определение оптимального порога отсечения на кросс-валидации:

In [0]:
def get_cv_cutoff(y, y_pred, cv=10):
    y_splited = np.array_split(y, cv)
    y_pred_splited = np.array_split(y_pred, cv)
    
    for i in range(cv):
        y_tr = np.array([])
        y_pred_tr = np.array([])
        opt_th = []
        for j in range(cv):
            if j != i:
                y_tr = np.concatenate((y_tr, y_splited[j]))
                y_pred_tr = np.concatenate((y_pred_tr, y_pred_splited[j]))

        y_te = y_splited[i]
        y_pred_te = y_pred_splited[i]

        fpr, tpr, thresholds = roc_curve(y_tr, y_pred_tr)

        f1_scores = []
        pr = []
        for th in thresholds:
            prediction_tr = np.zeros_like(y_pred_tr)
            prediction_tr[y_pred_tr >= th] = 1

            prediction_te = np.zeros_like(y_pred_te)
            prediction_te[y_pred_te >= th] = 1

            f1_scores.append((th, f1_score(y_tr, prediction_tr), f1_score(y_te, prediction_te)))
            pr.append((th, precision_score(y_tr, prediction_tr), recall_score(y_tr, prediction_tr), f1_score(y_tr, prediction_tr)))

        optimal_idx = np.argmax([x[1] for x in f1_scores])
        optimal_threshold = thresholds[optimal_idx]

        tr_f1 = [x[1] for x in f1_scores if x[0] == optimal_threshold]
        te_f1 = [x[2] for x in f1_scores if x[0] == optimal_threshold]
        print(optimal_threshold, tr_f1, te_f1)

    return pr

### Naive Bayes

In [0]:
nb = GaussianNB()
nb.fit(X_train_v, y_train)

pred_nb_tr = nb.predict_proba(X_train_v)[:,1]
pred_nb_te = nb.predict_proba(X_test_v)[:,1]
prediction_nb_tr, prediction_nb_te = get_predict_with_opt_threthold(y_train, pred_nb_tr, y_test, pred_nb_te)

get_scores(y_test, pred_nb_te, prediction_nb_te)

Optimal threshold is 1.0
F1_train = 0.2302553655185483
F1_test = 0.19897144883844656
Gini train = 0.32636577464163663
Precision = 0.11148648648648649
Recall = 0.9242174629324547
F1_score = 0.19897144883844656
Gini = 0.18510990556800944


In [0]:
get_f1(y_train, prediction_nb_tr, y_test, prediction_nb_te)

F1_all train = 0.2302553655185483 , F1_all test = 0.19897144883844656
F1_ru train = 0.21398347773524076 , F1_ru test = 0.18666666666666665
F1_en train = 0.23525768087215068 , F1_en test = 0.20256645279560037

MACRO
F1_all train = 0.3625391438456832 , F1_all test = 0.30514192147383035
F1_ru train = 0.33672708596475215 , F1_ru test = 0.3002298850574712
F1_en train = 0.37015449790881433 , F1_en test = 0.3065677792433287


Log Reg

In [0]:
# lr = LogisticRegression(solver='saga',random_state=123, max_iter=1000)
# parameters = {
#               'penalty': ['l1', 'l2'],
#               'C': [0.4, 0.6, 0.8, 0.9]
#              }
# clf = GridSearchCV(lr, parameters, cv=10, scoring='f1')

clf = LogisticRegression(penalty='l2', C=0.8, solver='lbfgs', random_state=1273, max_iter=1000)
clf.fit(X_train_v, y_train)

pred_lr_tr = clf.predict_proba(X_train_v)[:,1]
pred_lr_te = clf.predict_proba(X_test_v)[:,1]
prediction_lr_tr, prediction_lr_te = get_predict_with_opt_threthold(y_train, pred_lr_tr, y_test, pred_lr_te)

get_scores(y_test, pred_lr_te, prediction_lr_te)

Optimal threshold is 0.2142165759435533
F1_train = 0.5544515103338632
F1_test = 0.44099797707349964
Gini train = 0.5230278975138198
Precision = 0.3732876712328767
Recall = 0.5387149917627677
F1_score = 0.44099797707349964
Gini = 0.6925190570255053


In [0]:
get_f1(y_train, prediction_lr_tr, y_test, prediction_lr_te)

F1_all train = 0.5544515103338632 , F1_all test = 0.44099797707349964
F1_ru train = 0.4449339207048458 , F1_ru test = 0.35684647302904565
F1_en train = 0.5785645004849661 , F1_en test = 0.4573268921095008

MACRO
F1_all train = 0.7539858513606459 , F1_all test = 0.6854579876068319
F1_ru train = 0.7001106793020326 , F1_ru test = 0.6507742889583331
F1_en train = 0.7657728897836582 , F1_en test = 0.6913268628506732


In [0]:
pr = get_cv_cutoff(y_train, pred_lr_tr, cv=3)

  _warn_prf(average, modifier, msg_start, len(result))


0.2142165759435533 [0.5618990384615384] [0.5399061032863849]
0.21240434888169568 [0.5606326889279437] [0.5410628019323672]
0.19547897584363993 [0.541620421753607] [0.5709552733296522]


Random Forest

In [0]:
rf = RandomForestClassifier(random_state=123, n_estimators=100, max_depth=4, \
                            max_samples=0.6, min_samples_leaf=20, min_samples_split=30, \
                            max_features=30)
rf.fit(X_train_v, y_train)

pred_rf_tr = rf.predict_proba(X_train_v)[:,1]
pred_rf_te = rf.predict_proba(X_test_v)[:,1]
prediction_rf_tr, prediction_rf_te = get_predict_with_opt_threthold(y_train, pred_rf_tr, y_test, pred_rf_te)

get_scores(y_test, pred_rf_te, prediction_rf_te)

Optimal threshold is 0.09896743065585221
F1_train = 0.40654123439423245
F1_test = 0.3641755634638197
Gini train = 0.38797103697451685
Precision = 0.2845227062094532
Recall = 0.5057660626029654
F1_score = 0.3641755634638197
Gini = 0.5277607937156821


In [0]:
get_f1(y_train, prediction_rf_tr, y_test, prediction_rf_te)

F1_all train = 0.40654123439423245 , F1_all test = 0.3641755634638197
F1_ru train = 0.2725 , F1_ru test = 0.2403846153846154
F1_en train = 0.4284837323511356 , F1_en test = 0.3815967523680649

MACRO
F1_all train = 0.6678047799931903 , F1_all test = 0.6359842207479084
F1_ru train = 0.6106788224956063 , F1_ru test = 0.5923361722903331
F1_en train = 0.6756655837524304 , F1_en test = 0.6388074774354545


XGBoost

In [0]:
# xgb_model = XGBClassifier(random_state=123, n_estimators=10)

# parameters = {
#               'max_depth': [2, 3]
#              }
# xgb = GridSearchCV(xgb_model, parameters, cv=10, scoring='f1')

xgb = XGBClassifier(random_state=123, n_estimators=100, subsample=0.5, max_depth=2, \
                    colsample_bytree=0.6, colsample_bylevel=0.6, colsample_bynode=0.6)
xgb.fit(X_train_v, y_train)

pred_xgb_tr = xgb.predict_proba(X_train_v)[:,1]
pred_xgb_te = xgb.predict_proba(X_test_v)[:,1]
prediction_xgb_tr, prediction_xgb_te = get_predict_with_opt_threthold(y_train, pred_xgb_tr, y_test, pred_xgb_te)

get_scores(y_test, pred_xgb_te, prediction_xgb_te)

Optimal threshold is 0.12440963
F1_train = 0.41157024793388425
F1_test = 0.3647128478389579
Gini train = 0.41318309089584004
Precision = 0.2846580406654344
Recall = 0.5074135090609555
F1_score = 0.3647128478389579
Gini = 0.5224477856974197


In [0]:
get_f1(y_train, prediction_xgb_tr, y_test, prediction_xgb_te)

F1_all train = 0.41157024793388425 , F1_all test = 0.3647128478389579
F1_ru train = 0.3132803632236095 , F1_ru test = 0.2727272727272727
F1_en train = 0.42832269297736514 , F1_en test = 0.38009675190048375

MACRO
F1_all train = 0.6680876357209343 , F1_all test = 0.6361979450413925
F1_ru train = 0.6298679008789966 , F1_ru test = 0.6049574978911167
F1_en train = 0.6730271278300339 , F1_en test = 0.6392038005052809


Списко значимых переменных из модели XGBoost:

In [0]:
xgb_imp = xgb.feature_importances_

imp_vars = {}
mask_imp_vars = []

for i, imp in enumerate(xgb_imp):
    if imp != 0:# and len(imp_vars) <= 20:
        imp_vars[vectorizer.get_feature_names()[i]] = imp
        mask_imp_vars.append(True)
    else:
        mask_imp_vars.append(False)

lst = imp_vars.items()
for i in sorted(lst, key = lambda x : x[1])[::-1]:
    print(i)

('feel', 0.02064121)
('withdrawal', 0.020026306)
('make', 0.01943361)
('hour', 0.018048786)
('weight', 0.017561737)
('gain', 0.017154697)
('cramp', 0.016972065)
('dream', 0.016892387)
('humira', 0.016664283)
('attack', 0.016593108)
('изз', 0.01574244)
('sleep', 0.015708549)
('cause', 0.015692843)
('сон', 0.015151574)
('headache', 0.014843466)
('leg', 0.014770963)
('спать', 0.014465898)
('sleepy', 0.014211495)
('кветиапина', 0.014091654)
('asleep', 0.013956622)
('allergic', 0.013857023)
('часы', 0.013833268)
('find', 0.0138045335)
('приём', 0.013802541)
('stop', 0.013779549)
('quetiapine', 0.013777723)
('sick', 0.0136053255)
('shake', 0.013544537)
('побочка', 0.013419584)
('wake', 0.013340957)
('тошнить', 0.01293862)
('day', 0.0129080545)
('addict', 0.012804417)
('stomach', 0.012656623)
('мочь', 0.01264424)
('reaction', 0.012473229)
('ache', 0.012350848)
('tire', 0.012045979)
('mouth', 0.011960387)
('пить', 0.011918345)
('remicade', 0.011820809)
('effect', 0.011654647)
('день', 0.011421