### Data Preprocessing

This notebook defines functions that implement the following preprocessing steps. See the function docstrings for more information.

1. Anonymize drug mentions in text.
2. Identify ddi-pairs of the same drug and flag them for removal.
3. Identify the phrases that contain 'not', "n't", or 'no' and flag them for removal.
4. Identify drug pair mentions in the same coordinate structure and flag for removal.
5. Identify instances where one drug is a special case of another and flag for removal. 

In [9]:
from copy import deepcopy
import re
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [10]:
# Column names applied to the csv files, which are read without a header row (header=None).
colnames = [
    'sentence_id', 'text', 'pair_id',
    'drug1_id', 'drug1', 'drug1_type',
    'drug2_id', 'drug2', 'drug2_type',
    'ddi', 'ddi_type',
]
train_data, test_data = (
    pd.read_csv(csv_path, header=None, names=colnames)
    for csv_path in ('./ddi_train.csv', './ddi_test.csv')
)

In [11]:
# Lower-case both drug-name columns of both data sets.
for frame in (train_data, test_data):
    for column in ('drug1', 'drug2'):
        frame[column] = frame[column].apply(lambda name: name.lower())

In [12]:
# small independent slice of the training data for trying the functions out
mini_set = train_data.loc[1535:1545].copy(deep=True)

In [13]:
def _drug_mention_pattern(name, following):
    '''Compiles a regex matching the literal drug ``name`` wherever it is not immediately followed
    by a character matching ``following`` (a regex character class such as '[a-zA-Z0-9]').

    The name goes through re.escape so that metacharacters in drug names (parentheses, '+', '.', ...)
    are matched literally. The trailing condition is a lookahead: the character after the mention is
    not consumed, and a mention at the very end of the sentence still matches.
    '''
    return re.compile(f'{re.escape(name)}(?!{following})')


def anonymize_drugs(data):
    '''replaces the drug mentions in the sentences with drug1, drug2, or drug0. The drug pair of interest is
    replaced with drug1 and drug2 while other drug mentions in the sentence that are not part of the pair
    are replaced with drug0

    Example:

    laboratory tests response to plenaxis should be monitored by measuring serum total testosterone
    concentrations just prior to administration on day 29 and every 8 weeks thereafter.

    If the pair of interest is plenaxis and testosterone, this sentence becomes:

    laboratory tests response to drug1 should be monitored by measuring serum total drug2
    concentrations just prior to administration on day 29 and every 8 weeks thereafter.

    Parameters:
        data: DataFrame with the columns 'text', 'drug1' and 'drug2'.

    Returns:
        None. The result is written to a new 'anonymized_text' column on ``data`` (modified in place).

    Notes:
        * Only the first mention of drug1 / drug2 is replaced by its placeholder; every further mention of
          any drug name found in the 'drug1' / 'drug2' columns of ``data`` becomes drug0.
        * A mention only counts when it is not directly followed by a letter or digit (a word character
          for the drug0 pass), so 'aspirin' is not replaced inside 'aspirins'.
        * Matching is case sensitive; text and drug names are used exactly as they appear in ``data``.
    '''
    sentences = data['text']
    drug1_list = data['drug1']
    drug2_list = data['drug2']

    # One pre-compiled pattern per distinct drug name, built once instead of once per sentence.
    # The literal name 'drug' is skipped, presumably because it is a generic word rather than a
    # specific drug mention; empty names are skipped because they would match everywhere.
    all_drugs = np.unique(np.concatenate([drug1_list, drug2_list]))
    other_drug_patterns = [
        (drug, _drug_mention_pattern(drug, r'\w'))
        for drug in all_drugs
        if drug and drug != 'drug'
    ]

    anonymized_text = []
    for sentence, drug1, drug2 in zip(sentences, drug1_list, drug2_list):
        # pair of interest: first mention only, drug1 before drug2
        for name, placeholder in ((drug1, 'drug1'), (drug2, 'drug2')):
            if name and name in sentence:
                pattern = _drug_mention_pattern(name, '[a-zA-Z0-9]')
                sentence = pattern.sub(placeholder, sentence, count=1)

        # all remaining mentions of any known drug
        for drug, pattern in other_drug_patterns:
            if drug in sentence:  # cheap substring test before running the regex
                sentence = pattern.sub('drug0', sentence)

        anonymized_text.append(sentence)

    data['anonymized_text'] = anonymized_text
            

In [14]:
def identify_same_drug(row):
    '''Flags pairs whose two drug names are identical once surrounding whitespace and case are ignored.

    Reads row['drug1'] and row['drug2']; returns 1 when they match, 0 otherwise.
    '''
    first, second = (row[key].strip().lower() for key in ('drug1', 'drug2'))
    return int(first == second)

In [15]:
# Negation markers as whole tokens: 'no', 'not' and 'cannot' must not touch a letter or digit on
# either side (so 'amino ' or 'knot ' do not count), and "n't" must end its word. Lookarounds are
# used instead of consuming a neighbouring character so that a marker at the very start or end of
# the text is found as well.
_NEGATION_REGEX = re.compile(
    r"(?<![a-zA-Z0-9])(?:no|not|cannot)(?![a-zA-Z0-9])"
    r"|n't(?![a-zA-Z0-9])"
)


def identify_negative_phrases(row):
    '''Returns 1 if there is the presence of a negation word or phrase such as 'no', "n't", or 'not'.

    Reads row['text'] and matches case-insensitively (the text is lower-cased first). 'cannot' is
    listed explicitly so that it is still recognised now that 'not' has to stand on its own.
    Returns 0 when no negation marker is found.
    '''
    if _NEGATION_REGEX.search(row['text'].lower()):
        return 1
    return 0

In [16]:
# Compiled once at definition time. Raw strings keep '\s' a regex escape rather than an
# (invalid) Python string escape.
_SERIES_PATTERNS = [
    # drug1, <item>, <item>, (or|and) drug2  -- each intermediate item is a single alphanumeric token
    re.compile(r"drug1[,;](|\s)([a-zA-Z0-9]*,(|\s))+(|or\s|and\s)drug2"),
    # drug1, drug2  /  drug1; drug2
    re.compile(r"drug1[,;](|\s)drug2"),
    # the same two shapes with the pair in the opposite order
    re.compile(r"drug2[,;](|\s)([a-zA-Z0-9]*,(|\s))+(|or\s|and\s)drug1"),
    re.compile(r"drug2[,;](|\s)drug1"),
]


def identify_series(row):
    '''Returns 1 if 'drug1' and 'drug2' appear in the same coordinate phrase, 0 otherwise

    Reads row['anonymized_text'] and looks for the two placeholders separated by ',' or ';',
    either directly ('drug1, drug2') or with comma-terminated single-token items in between and an
    optional 'or ' / 'and ' before the last one ('drug1, drug0, and drug2'), in either order.
    '''
    text = row['anonymized_text']
    for pattern in _SERIES_PATTERNS:
        if pattern.search(text):
            return 1
    return 0

In [17]:
# Compiled once at definition time. Raw strings keep '\s', '\(' , '\)' and '\W' regex escapes
# rather than (invalid) Python string escapes.
_SPECIAL_CASE_PATTERNS = [
    # one placeholder immediately followed by the other in parentheses: 'drug1 (drug2)'
    re.compile(r'drug1(|\s)\(drug2\)(|\W)'),
    re.compile(r'drug2(|\s)\(drug1\)(|\W)'),
    # '<class> such as <member>'
    re.compile(r'drug1 such as drug2'),
    re.compile(r'drug2 such as drug1'),
]


def identify_special_cases(row):
    '''Returns 1 if drug1 is a special case of drug2 or vice versa. An example of this is when a drug is
    describing a class of drugs. For example, the phrase 'drug1 such as drug2' should return 1.

    Reads row['anonymized_text']. Two shapes are recognised, each in both orders: one placeholder
    followed by the other in parentheses ('drug1 (drug2)'), and 'drug1 such as drug2'.
    Returns 0 when neither shape occurs.
    '''
    text = row['anonymized_text']
    for pattern in _SPECIAL_CASE_PATTERNS:
        if pattern.search(text):
            return 1
    return 0

In [18]:
def preprocess(data):
    '''Runs every preprocessing step on a deep copy of ``data`` and returns that copy.

    The copy gains an 'anonymized_text' column, one 0/1 column per filter ('same_drug', 'negative',
    'in_series', 'special_cases') and a 'removal_flag' column holding the row-wise maximum of the
    four filter columns. The frame passed in is not modified.
    '''
    processed = deepcopy(data)
    anonymize_drugs(processed)  # writes the 'anonymized_text' column onto the copy

    flaggers = (
        ('same_drug', identify_same_drug),
        ('negative', identify_negative_phrases),
        ('in_series', identify_series),
        ('special_cases', identify_special_cases),
    )
    for column, flagger in flaggers:
        processed[column] = processed.apply(flagger, axis = 1)

    processed['removal_flag'] = processed.apply(
        lambda row: max(row[column] for column, _ in flaggers),
        axis = 1,
    )
    return processed

In [19]:
# Run the whole pipeline on the training set; preprocess works on a deep copy, so train_data is unchanged.
train_data_preprocessed = preprocess(train_data)

In [20]:
# Preview the first rows, including the anonymized text and the flag columns added by preprocess.
train_data_preprocessed.head()

Unnamed: 0,sentence_id,text,pair_id,drug1_id,drug1,drug1_type,drug2_id,drug2,drug2_type,ddi,ddi_type,anonymized_text,same_drug,negative,in_series,special_cases,removal_flag
0,DDI-DrugBank.d519.s3,laboratory tests response to plenaxis should b...,DDI-DrugBank.d519.s3.p0,DDI-DrugBank.d519.s3.e0,plenaxis,brand,DDI-DrugBank.d519.s3.e1,testosterone,drug,False,,laboratory tests response to drug1 should be m...,0,0,0,0,0
1,DDI-DrugBank.d297.s1,population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p0,DDI-DrugBank.d297.s1.e0,mtx,drug,DDI-DrugBank.d297.s1.e1,nsaids,group,False,,population pharmacokinetic analyses revealed t...,0,1,1,0,1
2,DDI-DrugBank.d297.s1,population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p1,DDI-DrugBank.d297.s1.e0,mtx,drug,DDI-DrugBank.d297.s1.e2,corticosteroids,group,False,,population pharmacokinetic analyses revealed t...,0,1,1,0,1
3,DDI-DrugBank.d297.s1,population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p2,DDI-DrugBank.d297.s1.e0,mtx,drug,DDI-DrugBank.d297.s1.e3,tnf blocking agents,group,False,,population pharmacokinetic analyses revealed t...,0,1,1,0,1
4,DDI-DrugBank.d297.s1,population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p3,DDI-DrugBank.d297.s1.e0,mtx,drug,DDI-DrugBank.d297.s1.e4,abatacept,drug,False,,population pharmacokinetic analyses revealed t...,0,1,0,0,1


In [21]:
# Same pipeline for the test set; preprocess works on a deep copy, so test_data is unchanged.
test_data_preprocessed = preprocess(test_data)

Serialize the preprocessed data to pickle files for later use.

In [22]:
def write_to_pickle(data, file_name):
    '''Serializes ``data`` with pickle into the file ``file_name``.

    The file is opened in binary write mode, so an existing file of that name is overwritten.
    The ``with`` block closes the file even when pickle.dump raises.
    Returns None.
    '''
    with open(file_name, 'wb') as pickle_out:
        pickle.dump(data, pickle_out)

In [23]:
# Persist the preprocessed training set; the relative file name resolves against the current working directory.
write_to_pickle(train_data_preprocessed, 'train_data_preprocessed.pickle')

In [24]:
# Persist the preprocessed test set; the relative file name resolves against the current working directory.
write_to_pickle(test_data_preprocessed, 'test_data_preprocessed.pickle')