In [1]:
import os
print(os.getcwd())

C:\Personal_Data\Machine_Learning_Project\Medical_Information_Extraction\mrec\notebooks


In [2]:
cd ..

C:\Personal_Data\Machine_Learning_Project\Medical_Information_Extraction\mrec


In [3]:
from mrec.data.dataset import load_data
csv_fnames = {'train': 'dataset/raw/train.csv', 'validation': 'dataset/raw/validation.csv', 'test': 'dataset/raw/test.csv'}
dataset = load_data(csv_fnames)

[2020-12-18 17:33:49,524] [DEBUG] [mrec.data.dataset::load_data::48] Loaded dataset (train:dataset/raw/train.csv)
[2020-12-18 17:33:49,545] [DEBUG] [mrec.data.dataset::load_data::48] Loaded dataset (validation:dataset/raw/validation.csv)
[2020-12-18 17:33:49,569] [DEBUG] [mrec.data.dataset::load_data::48] Loaded dataset (test:dataset/raw/test.csv)


In [4]:
train, validation, test = dataset.train, dataset.validation, dataset.test

In [5]:
# Feature = sentence, target = relation
train = train[['sentence','relation']]
validation = validation[['sentence','relation']]

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13340 entries, 0 to 13339
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  13340 non-null  object
 1   relation  13340 non-null  object
dtypes: object(2)
memory usage: 208.6+ KB


# Cleaning dataset

### Filter target variable

In [7]:
# relation can either be treats or causes
relation_type = ['causes','treats']
train = train[train['relation'].isin(relation_type)]
validation = validation[validation['relation'].isin(relation_type)]

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12146 entries, 0 to 13339
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  12146 non-null  object
 1   relation  12146 non-null  object
dtypes: object(2)
memory usage: 284.7+ KB


### Check and Remove Duplicate

In [8]:
print('Number of dupicated in train set:', train.duplicated().sum())
print('Number of duplicated in validation set:', validation.duplicated().sum())

Number of dupicated in train set: 10588
Number of duplicated in validation set: 3287


In [9]:
train = train.drop_duplicates()
validation = validation.drop_duplicates()
print('Dropped duplicated values')

Dropped duplicated values


In [10]:
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1558 entries, 0 to 13333
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1558 non-null   object
 1   relation  1558 non-null   object
dtypes: object(2)
memory usage: 36.5+ KB
None


Unnamed: 0,sentence,relation
0,"For treatment of uncomplicated cervical, URETH...",treats
21,SALMETEROL prevented EXERCISE INDUCED ASTHMA i...,treats
28,The patients showed the characteristic facies ...,causes
35,Dyskinesias occur in the majority of patients ...,treats
42,ENDOTOXEMIA was evoked by bolus injection of E...,causes


### Remove punctuation

In [11]:
import string
punctuation = string.punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punctuation(row):
    return row['sentence'].translate(str.maketrans('', '', punctuation))

train['simplified'] = train.apply(remove_punctuation, axis=1)
validation['simplified'] = validation.apply(remove_punctuation, axis=1)

### Tokenization

In [13]:
def tokenize(row):
    return row['simplified'].split()

train['simplified'] = train.apply(tokenize, axis=1)
validation['simplified'] = validation.apply(tokenize, axis=1)

### Remove stopwords

In [14]:
'''
A new NLTK Downloader window will pop up.
Please make sure to go to All Packages tab and download these packages: state_union, stopwords, averaged_perceptron_tagger, wordnet and wordnet_ic
'''
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [15]:
stopwords = nltk.corpus.stopwords.words('english')

In [16]:
def remove_stopwords(row):
    sentence = [word for word in row['simplified'] if word not in stopwords]
    return sentence

train['simplified'] = train.apply(remove_stopwords, axis=1)
validation['simplified'] = validation.apply(remove_stopwords, axis=1)

In [17]:
train.head()

Unnamed: 0,sentence,relation,simplified
0,"For treatment of uncomplicated cervical, URETH...",treats,"[For, treatment, uncomplicated, cervical, URET..."
21,SALMETEROL prevented EXERCISE INDUCED ASTHMA i...,treats,"[SALMETEROL, prevented, EXERCISE, INDUCED, AST..."
28,The patients showed the characteristic facies ...,causes,"[The, patients, showed, characteristic, facies..."
35,Dyskinesias occur in the majority of patients ...,treats,"[Dyskinesias, occur, majority, patients, PARKI..."
42,ENDOTOXEMIA was evoked by bolus injection of E...,causes,"[ENDOTOXEMIA, evoked, bolus, injection, ESCHER..."


### Lemmatizing

In [19]:
from nltk.corpus import wordnet
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def lemmatizing(row):
    sentence = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in row['simplified']]
    return sentence
    
train['simplified'] = train.apply(lemmatizing, axis=1)
validation['simplified'] = validation.apply(lemmatizing, axis=1)

train.simplified

0        [For, treatment, uncomplicated, cervical, URET...
21       [SALMETEROL, prevent, EXERCISE, INDUCED, ASTHM...
28       [The, patient, show, characteristic, facies, C...
35       [Dyskinesias, occur, majority, patient, PARKIN...
42       [ENDOTOXEMIA, evoke, bolus, injection, ESCHERI...
                               ...                        
13305    [1, 2, 3, 4, 5, 6, 18, 22, 23, 26, BIPOLAR, DI...
13312    [672, For, empiric, treatment, epididymitis, e...
13319    [To, determine, whether, late, asthmatic, reac...
13326    [Comparison, ZEBRA, ANTIBODY, LEVELS, degree, ...
13333    [Kurth, MC, Adler, CH, St, Hilaire, M, et, al,...
Name: simplified, Length: 1558, dtype: object

## Vertorizing Data

### Apply CountVectorizer (N-Grams)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vect = CountVectorizer(ngram_range=(1,3))
X_counts = ngram_vect.fit_transform(train[train['relation'] == 'causes']['sentence'])

print(X_counts.shape)
print(ngram_vect.get_feature_names())

(888, 41214)
['000', '000 mothers', '000 mothers of', '000 persons', '000 persons by', '000 procedures', '000 procedures that', '000 to', '000 to million', '000031', '000031 urinary', '000031 urinary cellular', '000062', '000062 seizures', '000062 seizures or', '00024', '00024 pericarditis', '00024 pericarditis or', '0019', '0019 pleuritis', '0019 pleuritis or', '006', '006 especially', '006 especially those', '015', '028', '028 and', '028 and lymphopenia', '05', '10', '10 000', '10 000 mothers', '10 549', '10 549 were', '10 but', '10 but who', '10 have', '10 have sudden', '10 microliters', '10 microliters abnormal', '10 microliters to', '10 mm', '10 mm or', '10 of', '10 of persons', '10 of pregnancies', '10 patients', '10 patients time', '10 resection', '10 resection of', '10 to', '10 to 20', '100', '100 000', '100 000 persons', '100 10', '100 101', '100 101 102', '100 120', '100 120 121', '100 124', '100 124 126', '100 advise', '100 advise patients', '100 and', '100 and thrombocytope

## Build ML Classifiers 

In [31]:
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = text.split()
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [32]:
import pandas as pd

count_vect = CountVectorizer(ngram_range=(1,3), analyzer=clean_text)
X_counts = count_vect.fit_transform(train['sentence'])
X_count_feat = pd.DataFrame(X_counts.toarray())
X_count_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6233,6234,6235,6236,6237,6238,6239,6240,6241,6242
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

random_forest = RandomForestClassifier()
param = {'n_estimators': [10, 50, 100],
        'max_depth': [20, 40, 60, None]}

gs = GridSearchCV(random_forest, param, cv=10, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, train['relation'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,5.354843,0.152568,0.061003,0.002981,,50,"{'max_depth': None, 'n_estimators': 50}",0.775641,0.769231,0.762821,0.801282,0.782051,0.858974,0.820513,0.801282,0.845161,0.793548,0.80105,0.030359,1
11,8.59112,0.636209,0.04619,0.013067,,100,"{'max_depth': None, 'n_estimators': 100}",0.775641,0.762821,0.75641,0.75641,0.788462,0.865385,0.826923,0.782051,0.845161,0.780645,0.793991,0.036447,2
7,4.398973,0.0797,0.059001,0.00339,60.0,50,"{'max_depth': 60, 'n_estimators': 50}",0.775641,0.782051,0.724359,0.775641,0.788462,0.858974,0.807692,0.801282,0.806452,0.812903,0.793346,0.032661,3
8,8.664683,0.130361,0.071962,0.004607,60.0,100,"{'max_depth': 60, 'n_estimators': 100}",0.782051,0.769231,0.769231,0.782051,0.801282,0.858974,0.801282,0.775641,0.787097,0.793548,0.792039,0.024873,4
5,7.432376,0.092385,0.074864,0.007894,40.0,100,"{'max_depth': 40, 'n_estimators': 100}",0.75641,0.775641,0.74359,0.762821,0.775641,0.839744,0.807692,0.801282,0.806452,0.793548,0.786282,0.027373,5
