In this part of preprocessing, we tokenize our sentences and get rid of any data that are not necessary for training.

In [1]:
import nltk
import pickle
from copy import deepcopy
import pandas as pd
import numpy as np
import re

In [2]:
def open_pickle(data):
    '''Load and return the object stored in a pickle file.

    Args:
        data: path of the pickle file to read.

    Returns:
        The unpickled object.
    '''
    # NOTE(security): pickle.load can run arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling raises.
    with open(data, 'rb') as pickle_in:
        return pickle.load(pickle_in)

def write_to_pickle(data, file_name):
    '''Serialize an object to a pickle file.

    Args:
        data: the object to pickle.
        file_name: path of the file to create or overwrite.
    '''
    # The context manager flushes and closes the file even if pickle.dump raises.
    with open(file_name, 'wb') as pickle_out:
        pickle.dump(data, pickle_out)

In [3]:
# Load the train and test frames written by the previous preprocessing step.
train_data, test_data = (
    open_pickle(pickle_path)
    for pickle_path in ('./train_data_preprocessed.pickle',
                        './test_data_preprocessed.pickle')
)

Steps to perform

1. Change NaNs in 'ddi_type' column to none
2. Remove negative instances
3. Tokenize sentences
4. Remove unnecessary columns


In [4]:
# Number of training rows whose removal_flag is 1
len(train_data.loc[train_data['removal_flag'] == 1, 'ddi_type'])

10178

In [5]:
# Per-type counts over the whole training set (value_counts skips NaN)
train_data.loc[:, 'ddi_type'].value_counts()

effect       1690
mechanism    1325
advise        826
int           188
Name: ddi_type, dtype: int64

In [6]:
# Per-type counts among rows flagged for removal
train_data.loc[train_data['removal_flag'] == 1, 'ddi_type'].value_counts()

advise       250
effect       114
mechanism     89
int            7
Name: ddi_type, dtype: int64

In [7]:
# Per-type counts among rows not flagged for removal
train_data.loc[train_data['removal_flag'] == 0, 'ddi_type'].value_counts()

effect       1576
mechanism    1236
advise        576
int           181
Name: ddi_type, dtype: int64

In [8]:
# Per-type counts among rows whose 'negative' flag is 1
train_data.loc[train_data['negative'] == 1, 'ddi_type'].value_counts()

advise       250
effect       108
mechanism     85
int            6
Name: ddi_type, dtype: int64

In [9]:
# Per-type counts among rows whose 'in_series' flag is 1
train_data.loc[train_data['in_series'] == 1, 'ddi_type'].value_counts()

effect       3
mechanism    1
advise       1
int          1
Name: ddi_type, dtype: int64

A fairly large proportion of the positive drug-drug interaction examples are removed due to the presence of negation words. To reduce this, we can restrict removal to more specific negation phrases that are more likely to be present in negative drug-drug interaction statements, as found in our previous exploratory data analysis. These include:
* 'no effect'
* 'not affect'
* 'no significant'
* 'no clinically'
* 'not significantly'
* 'not alter'
* 'no affected'
* 'not have'
* 'not altered'
* 'not influence'
* 'not result'
* 'no pharmacokinetic'
* 'no evidence'
* 'not a'
* 'no formal'
* 'not appear'
* 'not apparent'
* 'not significant'
* 'not been'
* 'not known'

In [10]:
# Compiled once instead of on every row. The alternatives are matched as plain
# substrings of the lower-cased text (no word boundaries), so a short
# alternative such as "not a" also covers longer ones like "not affect".
NEGATIVE_PHRASE_PATTERN = re.compile("no effect|not affect|no significant|no clinically|not significantly|not alter|no affected|not have|not altered|not influence|not result|no pharmacokinetic|no evidence|not a|no formal|not appear|not apparent|not significant|not been|not known")


def identify_negatives(row):
    '''returns 1 if there is a presence of a negative phrase, 0 otherwise

    Args:
        row: mapping or DataFrame row with a 'text' entry.

    Returns:
        1 when the lower-cased text contains one of the negation phrases,
        0 otherwise (including when 'text' is missing or not a string).
    '''
    text = row['text']
    # None / NaN text cannot contain a phrase; avoid AttributeError on .lower()
    if not isinstance(text, str):
        return 0
    return 1 if NEGATIVE_PHRASE_PATTERN.search(text.lower()) else 0


In [11]:
# Flag each sentence row by row; axis = 'columns' is the named form of axis = 1
train_data['negative_phrase'] = train_data.apply(identify_negatives, axis = 'columns')
test_data['negative_phrase'] = test_data.apply(identify_negatives, axis = 'columns')

In [12]:
# Per-type counts among rows that contain one of the negation phrases
train_data.loc[train_data['negative_phrase'] == 1, 'ddi_type'].value_counts()

mechanism    44
advise       43
effect       31
int           2
Name: ddi_type, dtype: int64

In [13]:
# Total number of training rows flagged by the negation-phrase pattern
(train_data['negative_phrase'] == 1).sum()

2830

By limiting the negative instance removal to certain negation phrases, we filter out fewer of the positive DDIs while keeping the number of negative DDIs filtered out relatively high.

In [14]:
def _combined_removal_flag(row):
    '''returns the largest of the four removal-criterion values in the row'''
    return max(row[flag] for flag in ('same_drug', 'negative_phrase', 'in_series', 'special_cases'))


train_data['removal_flag_2'] = train_data.apply(_combined_removal_flag, axis = 1)
test_data['removal_flag_2'] = test_data.apply(_combined_removal_flag, axis = 1)

In [15]:
# Per-type counts of the positive examples that the new flag would remove
train_data.loc[train_data['removal_flag_2'] == 1, 'ddi_type'].value_counts()

mechanism    49
advise       45
effect       37
int           3
Name: ddi_type, dtype: int64

Removing flagged negative instances inadvertently tags some drug-drug interactions as false negatives and we will lose these examples in the training set; however, considering that there are about 8,600 instances that are removed, the proportion of false negatives is small and we are willing to make this trade-off to have a stronger model at training.

In [16]:
def preprocess(data, train = True):
    '''Fill missing labels, tokenize sentences and keep only the output columns.

    The input frame is not modified; all work happens on a copy.

    Args:
        data: DataFrame with at least the columns 'text', 'anonymized_text',
            'drug1', 'drug2', 'ddi', 'ddi_type' and 'removal_flag_2'.
        train: when True, rows with removal_flag_2 != 0 are dropped and a
            single frame is returned; when False, nothing is dropped and the
            rows are split by removal_flag_2.

    Returns:
        train = True: DataFrame of the rows with removal_flag_2 == 0.
        train = False: tuple (rows with removal_flag_2 == 0,
            rows with removal_flag_2 == 1).
        Every returned frame holds only the output columns.
    '''
    output_columns = ['text','tokenized_sentences', 'drug1', 'drug2', 'ddi', 'ddi_type']

    d = data.copy()
    d['ddi_type'] = d['ddi_type'].fillna('none')
    if train:
        # .copy() so the column assignment below targets an independent frame
        # rather than a slice (avoids SettingWithCopyWarning)
        d = d[d['removal_flag_2'] == 0].copy()

    # Map over the single column instead of apply(axis = 1): a row-wise apply
    # on an empty frame returns a DataFrame, which cannot be assigned to one column.
    d['tokenized_sentences'] = d['anonymized_text'].map(
        lambda text: " ".join(nltk.word_tokenize(text)))

    if not train:
        d_pos = d[d['removal_flag_2'] == 0]
        d_neg = d[d['removal_flag_2'] == 1]
        return (d_pos[output_columns], d_neg[output_columns])
    return d[output_columns]
                

In [17]:
# Training set: flagged rows are dropped (train defaults to True)
train_data_final = preprocess(train_data)
# Test set: flagged rows are returned separately instead of being dropped
test_data_final, test_data_negatives = preprocess(test_data, train = False)

In [18]:
# Number of training rows left after the first pass
train_data_final.shape[0]

19204

In [19]:
# Number of test rows kept after the first pass
test_data_final.shape[0]

3894

In [20]:
# Number of test rows flagged as negatives in the first pass
test_data_negatives.shape[0]

1832

Since tokenized sentences are easier to parse with regular expressions, we pass through the tokenized sentences to remove any other drug pair mentions in the same series

In [21]:
def series_2(row):
    '''returns 1 if drug1 and drug2 are separated only by a [;,]-delimited list, 0 otherwise

    The row's 'tokenized_sentences' text is searched for "drug1", a ';' or ',',
    one or more list items each closed by ';' or ',', and then "drug2". The
    first pattern allows one token per item, the second one to four tokens.
    '''
    sentence = row['tokenized_sentences']
    series_patterns = (
        "drug1 [;,] ([a-zA-Z0-9*]* [;,])+ drug2",
        "drug1 [;,] (([a-zA-Z0-9*]* ){1,4}[;,])+ drug2",
    )
    return int(any(re.search(expr, sentence) for expr in series_patterns))

def preprocess_second_pass(data, train = True):
    '''Split off rows where the drug pair is only mentioned within a series.

    Each row is scored with series_2. The input frame is left untouched: the
    flags are kept in a separate Series instead of being written into `data`
    as a 'series_flag' column.

    Args:
        data: DataFrame holding at least the output columns below.
        train: when True return only the unflagged rows; when False return
            both partitions.

    Returns:
        train = True: DataFrame of rows whose series flag is 0.
        train = False: tuple (rows with flag 0, rows with flag 1).
        Every returned frame holds only the output columns.
    '''
    output_columns = ['text','tokenized_sentences', 'drug1', 'drug2', 'ddi', 'ddi_type']
    # Built from a list so an empty frame yields an empty flag Series
    # (DataFrame.apply(axis = 1) returns a DataFrame for empty input).
    series_flag = pd.Series([series_2(row) for _, row in data.iterrows()],
                            index = data.index, dtype = 'int64')
    if train:
        return data[series_flag == 0][output_columns]
    negs = data[series_flag == 1][output_columns]
    pos = data[series_flag == 0][output_columns]
    return (pos, negs)
        

In [22]:
train_data_final_2 = preprocess_second_pass(train_data_final, train = True)
test_data_final_2, test_data_negatives_2 = preprocess_second_pass(test_data_final, train = False)
# Stack the negatives from both passes. DataFrame.append was removed in
# pandas 2.0; pd.concat yields the same rows with the original index labels.
test_negatives = pd.concat([test_data_negatives, test_data_negatives_2])

In [23]:
# Renumber the rows 0..n-1 in each final frame, discarding the old index labels
train, test, test_negatives = (
    frame.reset_index(drop = True)
    for frame in (train_data_final_2, test_data_final_2, test_negatives)
)

In [24]:
# Label distribution of the filtered training set
counts_after = train.loc[:, 'ddi_type'].value_counts()

In [25]:
# Label distribution of the unfiltered training set. value_counts skips NaN,
# so the unlabelled rows are counted separately and added under 'none'.
counts_before = train_data['ddi_type'].value_counts()
n_unlabelled = len(train_data) - counts_before.sum()
# Series.append was removed in pandas 2.0. Concatenating a labelled Series also
# keeps the labels that value_counts produced, instead of overwriting the index
# with a hard-coded list that assumes a particular ordering of the counts.
counts_before = pd.concat([counts_before, pd.Series({'none': n_unlabelled})])

In [26]:
# Display the label counts before filtering
counts_before

effect        1690
mechanism     1325
advise         826
int            188
none         23846
dtype: int64

In [27]:
# Display the label counts after filtering
counts_after

none         14155
effect        1653
mechanism     1276
advise         781
int            185
Name: ddi_type, dtype: int64

In [207]:
# Examples removed per label (the two Series are aligned on their index labels)
counts_before.sub(counts_after)

advise         45
effect         37
int             3
mechanism      49
none         9691
dtype: int64

In [208]:
# Fraction of examples retained per label
counts_after.div(counts_before)

advise       0.945521
effect       0.978107
int          0.984043
mechanism    0.963019
none         0.593601
dtype: float64

After negative instance filtering, we remove 9,691 examples of 'none' while keeping the number of positive examples lost to a minimum. We managed to preserve about 97% of the DDI examples (at least 94% of each type) while filtering out about 40% of the 'none' examples.

In [209]:
# Persist the three final frames for the training step
for frame, file_name in (
    (train, 'train_complete_processed.pickle'),
    (test, 'test_complete_processed.pickle'),
    (test_negatives, 'test_negatives_processed.pickle'),
):
    write_to_pickle(frame, file_name)