# Processing of extra annotated records

1. remove duplicated contexts
2. replace symbols and acronyms
3. remove symbols (non-alphabets and non-digits)
4. save the processed datasets to an Excel file (for joining with the original training examples)

In [57]:
# import packages
import pandas as pd
import re

In [58]:
# ***************** # IMPORT segmented data
# Parse the workbook once and pull all five sheets in a single call instead of
# re-opening the same file for every sheet. When `sheet_name` is a list,
# read_excel returns a dict of DataFrames keyed by sheet name.
# The path is relative to the working directory; the leading ".\" was dropped
# because it only resolves on Windows.
RAW_WORKBOOK = 'train and test dataset.xlsx'
raw_sheets = pd.read_excel(
    RAW_WORKBOOK,
    sheet_name=['2009', '2010', 'Readpeer 2011', 'MIMIC III', 'MIMIC III unrestricted'],
)

df_2009 = raw_sheets['2009']  # training and validation
df_2010 = raw_sheets['2010']  # test 2010
df_2011 = raw_sheets['Readpeer 2011']  # test 2011
df_MIMIC = raw_sheets['MIMIC III']
df_MIMIC_ur = raw_sheets['MIMIC III unrestricted']

In [59]:
# Print the plain-text repr of the 'MIMIC III unrestricted' sheet as a quick
# look at what was loaded.
print(df_MIMIC_ur) # view dataset

                                                context  \
0     allergies:  intravenous iron which leads to an...   
1     percocet and keflex which lead to severe gastr...   
2     the patient also complained of some right leg ...   
3     Patient on appropriate cardiac medications exc...   
4     but vomited upon arrival to the[**hospital1 88...   
5     generalized convulsive seizure in [**2184**] i...   
6     the patient reported that the reason she did n...   
7     Patient's doxazosin was held given low BPs, sh...   
8     the patient's lasix drip had been stopped due ...   
9     discharged from detox program on day ofadmissi...   
10    nfectious disease recommendedto discontinue al...   
11    while the abdominal pain resolved, the patient...   
12    subsequently, the patient developed constipati...   
13    in the immediate postoperative period the pati...   
14    the patient's heparin drip was d/c'ed on [**21...   
15    overall weigth gain sicne starting prednisone ... 

In [60]:
# Show the 2009 row count, discard the rows labelled 9 (cases when unsure if
# drug-AE pair; 21 records inside the 2009 dataset), then show the count again.
print(len(df_2009))
is_labelled = df_2009['label'].ne(9)
df_2009 = df_2009.loc[is_labelled]
print(len(df_2009))

5355
5334


## 1. Remove duplicated contexts

In [61]:
# Row count of each dataset before duplicates are dropped, one per line in the
# order: 2009, 2010, 2011, MIMIC, MIMIC unrestricted.
for frame in (df_2009, df_2010, df_2011, df_MIMIC, df_MIMIC_ur):
    print(len(frame))

5334
321
1326
1342
3076


In [62]:
# De-duplicate every dataset on the 'context' column, keeping the first row seen
# for each context. Rows that share a context but differ in other columns are
# dropped as well, since only 'context' is compared.
df_2009, df_2010, df_2011, df_MIMIC, df_MIMIC_ur = (
    frame.drop_duplicates(subset='context', keep='first')
    for frame in (df_2009, df_2010, df_2011, df_MIMIC, df_MIMIC_ur)
)

In [63]:
# Row count of each dataset once duplicates have been dropped, one per line in
# the order: 2009, 2010, 2011, MIMIC, MIMIC unrestricted.
for frame in (df_2009, df_2010, df_2011, df_MIMIC, df_MIMIC_ur):
    print(len(frame))

5088
321
1326
1342
3076


## 2. Replace symbols and acronyms

In [64]:
# function to replace symbols and acronyms

# Symbol and slash-abbreviation rewrites, applied as plain substring
# replacements in exactly this order. Order matters: "->" has to be handled
# before ">", and "-ve" / "+ve" before the bare "+".
_SYMBOL_REPLACEMENTS = (
    ("->", " resulted in "),
    ("?", " suspect "),
    ("%", " percent "),
    (">", " more than "),
    ("<", " less than "),
    ("s/p", " treated with "),
    ("-ve", " negative "),
    ("+ve", " positive "),
    ("+", " positive "),
    ("f/u", " follow up "),
    ("c/x", " complicated with "),
    ("s/b", " seen by "),
    ("b/l", " bilateral "),
    ("p/w", " presented with "),
)

# Whole-word acronyms. Each key includes the single space on either side of the
# acronym, so e.g. "sob" inside "sober" is left untouched.
_WORD_REPLACEMENTS = (
    (" tcu ", " appointment with "),
    (" kiv ", " keep in view "),
    (" aor ", " at own risk "),
    (" abdo ", " abdomen "),
    (" sob ", " shortness of breath "),
    (" adl ", " activities of daily living "),
    (" cx ", " complicated with "),
    (" tro ", " to rule out "),
    (" ccod ", " cause of death "),
    (" dil ", " dangerously ill "),
    (" dnr ", " do not resuscitate "),
)


def replace_acro(text):
    """Expand symbols and acronyms in ``text`` into plain words.

    Symbols and slash abbreviations ("->", "%", "s/p", ...) are replaced
    wherever they occur. Whole-word acronyms ("sob", "dnr", ...) are replaced
    only when delimited by spaces or by the start/end of the text.

    Matching is case-sensitive: only the lowercase spellings listed in the
    tables are expanded. An acronym directly followed or preceded by
    punctuation (e.g. "sob,") is not expanded.

    Parameters
    ----------
    text : str
        The text to rewrite.

    Returns
    -------
    str
        The text with every listed symbol and acronym expanded.
    """
    for symbol, expansion in _SYMBOL_REPLACEMENTS:
        text = text.replace(symbol, expansion)

    # The acronym keys need a space on both sides, which the first and last
    # word of the text do not have. Pad with one space on each side so those
    # words can match too, then drop the padding again. Every expansion starts
    # and ends with a space, so the first and last characters of `padded` are
    # always spaces and slicing them off never removes real content.
    padded = " " + text + " "
    for acronym, expansion in _WORD_REPLACEMENTS:
        padded = padded.replace(acronym, expansion)
    return padded[1:-1]

In [65]:
# Build the 'segment_2' column of every dataset by running replace_acro over
# each value of its 'segment' column.
for frame in (df_2009, df_2010, df_2011, df_MIMIC, df_MIMIC_ur):
    frame['segment_2'] = frame['segment'].apply(replace_acro)

In [None]:
# Preview the first five rows of the 2009 dataset, including the new 'segment_2' column.
df_2009.head(n=5)

## 3. Remove symbols (non-alphabets and non-digits)

In [67]:
# function to remove symbols (non-alphabets and non-digits)

# Matches any single character that is not an ASCII letter or digit.
_NON_ALNUM = re.compile("[^A-Za-z0-9]")


def remove_symbols(text):
    """Blank out every character of ``text`` that is not an ASCII letter or digit.

    Each such character is replaced by one space (runs are not collapsed), and
    leading/trailing whitespace is then stripped from the result.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return _NON_ALNUM.sub(" ", text).strip()

In [68]:
# Clean the 'segment_2' column of every dataset in place by running
# remove_symbols over each of its values.
for frame in (df_2009, df_2010, df_2011, df_MIMIC, df_MIMIC_ur):
    frame['segment_2'] = frame['segment_2'].apply(remove_symbols)

In [69]:
# Preview the first five rows of the unrestricted MIMIC III dataset after cleaning.
df_MIMIC_ur.head(n=5)

Unnamed: 0,context,segment,ae,drug,label,segment_2
0,allergies: intravenous iron which leads to an...,allergies: intravenous iron which leads to an...,anaphylaxis,iron,1,allergies intravenous iron which leads to an...
1,percocet and keflex which lead to severe gastr...,percocet and keflex which lead to severe gastr...,severe gastrointestional upset,Percocet,1,percocet and keflex which lead to severe gastr...
2,the patient also complained of some right leg ...,the patient also complained of some right leg ...,weakness,epidural,1,the patient also complained of some right leg ...
3,Patient on appropriate cardiac medications exc...,Patient on appropriate cardiac medications exc...,low LDL,statins,1,Patient on appropriate cardiac medications exc...
4,but vomited upon arrival to the[**hospital1 88...,but vomited upon arrival to the[**hospital1 88...,vomited,Morphine,1,but vomited upon arrival to the hospital1 88...


## 4. Save to Excel file

In [70]:
# Write every processed dataset to its own sheet of a single workbook, without
# the DataFrame index.
# The `with` block saves and closes the workbook on exit, even if one of the
# writes raises. It replaces an explicit `writer.save()` call, which was
# removed in pandas 2.0.
processed_sheets = (
    ('df_2009', df_2009),
    ('df_2010', df_2010),
    ('df_2011', df_2011),
    ('df_MIMIC', df_MIMIC),
    ('df_MIMIC_ur', df_MIMIC_ur),
)

with pd.ExcelWriter('train and test processed.xlsx') as writer:
    for sheet, frame in processed_sheets:
        frame.to_excel(writer, sheet_name = sheet, index = False)