In [1]:
cd ..

C:\Personal_Data\Machine_Learning_Project\Medical_Information_Extraction\mrec


In [2]:
from mrec.data.dataset import load_data
csv_fnames = {'train': 'dataset/raw/train.csv', 'validation': 'dataset/raw/validation.csv', 'test': 'dataset/raw/test.csv'}
dataset = load_data(csv_fnames)

[2020-12-19 10:48:32,927] [DEBUG] [mrec.data.dataset::load_data::48] Loaded dataset (train:dataset/raw/train.csv)
[2020-12-19 10:48:32,948] [DEBUG] [mrec.data.dataset::load_data::48] Loaded dataset (validation:dataset/raw/validation.csv)
[2020-12-19 10:48:32,970] [DEBUG] [mrec.data.dataset::load_data::48] Loaded dataset (test:dataset/raw/test.csv)


In [3]:
train, validation, test = dataset.train, dataset.validation, dataset.test

In [4]:
# Feature = sentence, target = relation
train = train[['sentence','relation']]
validation = validation[['sentence','relation']]

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13340 entries, 0 to 13339
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  13340 non-null  object
 1   relation  13340 non-null  object
dtypes: object(2)
memory usage: 208.6+ KB


# Cleaning dataset

### Filter target variable

In [6]:
# relation can either be treats or causes
relation_type = ['causes','treats']
train = train[train['relation'].isin(relation_type)]
validation = validation[validation['relation'].isin(relation_type)]

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12146 entries, 0 to 13339
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  12146 non-null  object
 1   relation  12146 non-null  object
dtypes: object(2)
memory usage: 284.7+ KB


### Check and Remove Duplicate

In [7]:
print('Number of dupicated in train set:', train.duplicated().sum())
print('Number of duplicated in validation set:', validation.duplicated().sum())

Number of dupicated in train set: 10588
Number of duplicated in validation set: 3287


In [8]:
train = train.drop_duplicates()
validation = validation.drop_duplicates()
print('Dropped duplicated values')

Dropped duplicated values


In [9]:
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1558 entries, 0 to 13333
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1558 non-null   object
 1   relation  1558 non-null   object
dtypes: object(2)
memory usage: 36.5+ KB
None


Unnamed: 0,sentence,relation
0,"For treatment of uncomplicated cervical, URETH...",treats
21,SALMETEROL prevented EXERCISE INDUCED ASTHMA i...,treats
28,The patients showed the characteristic facies ...,causes
35,Dyskinesias occur in the majority of patients ...,treats
42,ENDOTOXEMIA was evoked by bolus injection of E...,causes


## Preprocessing

In [10]:
'''
A new NLTK Downloader window will pop up.
Please make sure to go to All Packages tab and download these packages: state_union, stopwords, averaged_perceptron_tagger, wordnet and wordnet_ic
'''
import string
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

# Function to get the correct position of word net
# Basically figure out if word is noun/verb/adj/adv to convert to its most basic form
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}
    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

# Function to remove punctuation, tokenize, remove stopwords and lemmatize
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = text.split()
    text = [word for word in tokens if word not in stopwords]
    text = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in text]
    return text

In [12]:
text = "I healthy and loved food. She likes basketball"
print(clean_text(text))

['healthy', 'love', 'food', 'like', 'basketball']


## Vectorizing Data

In [13]:
import pandas as pd

count_vect = CountVectorizer(ngram_range=(1,3), analyzer=clean_text)
X_counts = count_vect.fit_transform(train['sentence'])
X_count_train = pd.DataFrame(X_counts.toarray())
X_count_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6815,6816,6817,6818,6819,6820,6821,6822,6823,6824
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_counts = count_vect.transform(validation['sentence'])
X_count_validation = pd.DataFrame(X_counts.toarray())
X_count_validation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6815,6816,6817,6818,6819,6820,6821,6822,6823,6824
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Build ML Classifiers 

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

random_forest = RandomForestClassifier()
random_forest.fit(X_count_train, train['relation'])

forest_accuracy = cross_val_score(random_forest, X_count_validation, validation['relation'], scoring="accuracy", cv=10)
print('Accuracy score on validation set:', forest_accuracy.mean())

Accuracy score on validation set: 0.7424178895877009


## Saving the model

In [16]:
from joblib import dump, load
dump(random_forest, 'random_forest.joblib')

['random_forest.joblib']

## Load the model and try on test set

In [17]:
model = load('random_forest.joblib')

In [18]:
test = test[['sentence','relation']]
test = test[test['relation'].isin(relation_type)].drop_duplicates()
test.head()

Unnamed: 0,sentence,relation
21,Ninety eight hypothyroid children without GOIT...,causes
28,"Clonidine, oxymetazoline, tetrahydozoline, bri...",treats
35,"Stumpe KO, Haworth D, Hoglund C et al et al. C...",treats
42,With successful treatment of the patient's dep...,causes
49,OC RELATED BLOOD PRESSURE/HYPERTENSION is defi...,causes


In [19]:
data = test.iloc[0]
data

sentence    Ninety eight hypothyroid children without GOIT...
relation                                               causes
Name: 21, dtype: object

In [25]:
sentence = [test.iloc[0]['sentence']]
sentence

['Ninety eight hypothyroid children without GOITER were divided into 6 groups: (i) athyreosis: RAIU low, no thyroid tissue identifiable (n = 39); (ii) hypoplasia: RAIU low, gland small, in normal position (n = 7); (iii) ectopia: RAIU low, gland in ectopic position (n = 24); (iv) THYROIDitis: TMA positive (n = 2); (v) iodine deficiency: low urinary iodine (n = 1); and (vi) cause unknown: RAIU.']

In [26]:
y_counts = count_vect.transform(sentence)
#y_count_feat = pd.DataFrame(y_counts.toarray())
#y_count_feat.head()

In [27]:
model.predict(y_counts)

array(['causes'], dtype=object)