## 1. Import HPO gold standard from Bio-Lark
link: http://www.bio-lark.org/hpo_res.html 


In [1]:
import pandas as pd

### 1.1 Convert standoff to pandas Dataframe

In [15]:
import re
import ast
from os import listdir
from os.path import isfile, join

path_annot = 'corpus/stand-off/'
path_text = 'corpus/text/'

# os.listdir() returns entries in arbitrary, platform-dependent order. Sort both
# listings so that annot_files[i] and text_files[i] are paired deterministically
# and the generated docIDs are reproducible across runs and machines.
annot_files = sorted(f for f in listdir(path_annot) if isfile(join(path_annot, f)))
text_files = sorted(f for f in listdir(path_text) if isfile(join(path_text, f)))

# Files are paired by position, so a count mismatch would silently misalign
# every annotation with the wrong document text.
if len(annot_files) != len(text_files):
    raise ValueError(
        'Expected one text file per annotation file, found %d annotation '
        'file(s) and %d text file(s)' % (len(annot_files), len(text_files))
    )

TRAIN_DATA = []

# One stand-off line has the form "[start::end] <hpo> | <entity>".
ANNOT_PATTERN = re.compile(r'\[(\d+)::(\d+)\]\s(.*)\s\|\s(.*)')

# Collect plain rows first and build each DataFrame once at the end; growing a
# DataFrame row-by-row with .loc is quadratic in the number of rows.
anno_rows = []
text_rows = []

for i, (annot_name, text_name) in enumerate(zip(annot_files, text_files)):
    with open(join(path_annot, annot_name), 'r') as f:
        content = [x.strip() for x in f]

    with open(join(path_text, text_name), 'r') as f:
        text = f.read().strip()
    # Replace every run of non-ASCII characters by a single space.
    # NOTE(review): collapsing a run (and the strip() above) changes the text
    # length, so start/end offsets located after such a run may no longer line
    # up with the cleaned text -- confirm against the corpus.
    text = re.sub(r"[^\x00-\x7F\']+", ' ', text)
    text_rows.append([i, text])

    for line in content:
        if not line:
            # Blank lines (e.g. a trailing newline) carry no annotation.
            continue
        m = ANNOT_PATTERN.search(line)
        if m is None:
            raise ValueError(
                'Malformed annotation line in %s: %r' % (annot_name, line)
            )
        start, end, hpo, entity = m.groups()
        # entID is a running counter over all documents; offsets are stored as
        # ints so the in-memory frame matches what a CSV round trip yields.
        anno_rows.append([len(anno_rows), i, int(start), int(end), hpo, entity])

cnt_ent = len(anno_rows)
df_anno = pd.DataFrame(anno_rows, columns=['entID', 'docID', 'start', 'end', 'hpo', 'entity'])
df_text = pd.DataFrame(text_rows, columns=['docID', 'text'])
df_anno.head()
   

Unnamed: 0,entID,docID,start,end,hpo,entity
0,0,0,14,27,HP_0001156,brachydactyly
1,1,0,29,71,HP_0009881,absence of some middle or distal phalanges
2,2,0,29,71,HP_0010239,absence of some middle or distal phalanges
3,3,0,74,103,HP_0001798,aplastic or hypoplastic nails
4,4,0,86,103,HP_0001792,hypoplastic nails


### 1.2 Write to CSV

In [16]:
# Persist the annotation and text tables (without the pandas index column).
for frame, csv_path in (
    (df_anno, r'data/Bio-Lark/df_anno.csv'),
    (df_text, r'data/Bio-Lark/df_text.csv'),
):
    frame.to_csv(csv_path, index=False)

### 1.3 Import Bio-Lark csv files

In [3]:
# Reload the annotation and text tables from their CSV files.
df_anno, df_text = (
    pd.read_csv(csv_path)
    for csv_path in (r'data/Bio-Lark/df_anno.csv', r'data/Bio-Lark/df_text.csv')
)
df_anno.head()

Unnamed: 0,entID,docID,start,end,hpo,entity
0,0,0,14,27,HP_0001156,brachydactyly
1,1,0,29,71,HP_0009881,absence of some middle or distal phalanges
2,2,0,29,71,HP_0010239,absence of some middle or distal phalanges
3,3,0,74,103,HP_0001798,aplastic or hypoplastic nails
4,4,0,86,103,HP_0001792,hypoplastic nails


## 2. Generate Training, Test & Validation split

1. Training - to explore the data and to ensure the right preprocessing steps are applied (improve upon lemmatizing/negex)
2. Validation - choose the optimal extraction method from a candidate pool
3. Test - final unbiased evaluation

In [31]:
from sklearn.model_selection import train_test_split

def _collect_split(doc_ids, texts, annotations):
    """Select the rows of one data split and join annotations with their text.

    Parameters
    ----------
    doc_ids : iterable
        docID values that belong to the split.
    texts : pandas.DataFrame
        Frame with a 'docID' column (one row per document).
    annotations : pandas.DataFrame
        Frame with a 'docID' column (one row per annotated entity).

    Returns
    -------
    tuple of pandas.DataFrame
        (text rows of the split, annotation rows of the split, annotation rows
        merged with their document's text columns on 'docID').
    """
    wanted = list(doc_ids)
    split_txt = texts[texts['docID'].isin(wanted)]
    split_ann = annotations[annotations['docID'].isin(wanted)]
    return split_txt, split_ann, pd.merge(split_ann, split_txt, on='docID')


# Split on documents (not on single annotations) so that all entities of one
# document end up in the same set. 33% of the documents are held out as test
# set; the remainder is divided in half between training and validation.
X_train, X_test, y_train, y_test = train_test_split(df_text['text'], df_text['docID'], test_size=0.33, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

df_train_txt, df_train_ann, df_train = _collect_split(y_train, df_text, df_anno)
df_test_txt, df_test_ann, df_test = _collect_split(y_test, df_text, df_anno)
df_valid_txt, df_valid_ann, df_valid = _collect_split(y_valid, df_text, df_anno)

df_valid.head()

Unnamed: 0,entID,docID,start,end,hpo,entity,text
0,0,0,14,27,HP_0001156,brachydactyly,A syndrome of brachydactyly (absence of some m...
1,1,0,29,71,HP_0009881,absence of some middle or distal phalanges,A syndrome of brachydactyly (absence of some m...
2,2,0,29,71,HP_0010239,absence of some middle or distal phalanges,A syndrome of brachydactyly (absence of some m...
3,3,0,74,103,HP_0001798,aplastic or hypoplastic nails,A syndrome of brachydactyly (absence of some m...
4,4,0,86,103,HP_0001792,hypoplastic nails,A syndrome of brachydactyly (absence of some m...


### write sets to csv

In [32]:
# Store the three splits as CSV files (without the pandas index column).
for frame, csv_path in (
    (df_train, r'data/Bio-Lark/df_train.csv'),
    (df_valid, r'data/Bio-Lark/df_valid.csv'),
    (df_test, r'data/Bio-Lark/df_test.csv'),
):
    frame.to_csv(csv_path, index=False)

### load csv

In [33]:
# Read the stored splits back in: training, test and validation set.
df_train, df_test, df_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Bio-Lark/df_train.csv',
        'data/Bio-Lark/df_test.csv',
        'data/Bio-Lark/df_valid.csv',
    )
)
df_valid.head()

Unnamed: 0,entID,docID,start,end,hpo,entity,text
0,0,0,14,27,HP_0001156,brachydactyly,A syndrome of brachydactyly (absence of some m...
1,1,0,29,71,HP_0009881,absence of some middle or distal phalanges,A syndrome of brachydactyly (absence of some m...
2,2,0,29,71,HP_0010239,absence of some middle or distal phalanges,A syndrome of brachydactyly (absence of some m...
3,3,0,74,103,HP_0001798,aplastic or hypoplastic nails,A syndrome of brachydactyly (absence of some m...
4,4,0,86,103,HP_0001792,hypoplastic nails,A syndrome of brachydactyly (absence of some m...


## 3. Generate acronym list

In [31]:
import pickle
import regex

def makeAcronymDict(content):
    """
    Build an acronym -> expansions lookup from the lines of an acronym list.

    Every usable line is split at the first whitespace character that is not
    preceded by a comma. The part before that position holds one acronym, or
    several acronyms separated by ', '; the part after it holds the
    expansion(s), separated by '; '.

    Lines that are blank or contain no such separator are skipped.

    Parameters
    ----------
    content : iterable of str
        Lines of the acronym list.

    Returns
    -------
    dict
        Maps each acronym to a list of expansion strings. Acronyms listed
        together on one line get equal (but independent) lists.
    """
    # The standard-library engine handles this fixed-width lookbehind, so the
    # function does not depend on the third-party `regex` package.
    import re

    split_on = re.compile(r'(?<!,)(\s)')
    d = {}
    for line in content:
        line = line.strip()
        found = split_on.search(line)
        if found is None:
            # Blank line (e.g. from a trailing newline) or no separator:
            # nothing to map, and .start() on None would raise.
            continue
        ix = found.start()
        # Always store a list so that every value has the same type, whether
        # the line named one acronym or several.
        expansions = line[ix+1:].split('; ')
        if ',' in line[:ix]:
            for acronym in line[:ix].split(', '):
                d[acronym] = list(expansions)
        else:
            d[line[:ix]] = expansions
    return d

with open('preprocessing/acronym_list.txt') as f:
    content = f.read()

# splitlines() yields no empty trailing entry when the file ends with a newline.
d_acronyms = makeAcronymDict(content.splitlines())

## Save acronym list!
# The context manager closes the file even if pickling raises.
with open('preprocessing/acronym_list.pkl', "wb") as a_file:
    pickle.dump(d_acronyms, a_file)

### Load acronym list

In [30]:
import pickle
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# a pickle file that comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open("preprocessing/acronym_list.pkl", "rb") as a_file:
    d_acronyms = pickle.load(a_file)

## 4. Perform model selection
1. Import models
2. Evaluate models on validation set (F1, ROC, PR, Sens, PPV)
3. Pick best model

### 4.1 Import models

## 5. Unbiased Performance evaluation
- Goal = prevent optimistic bias