In [None]:
# ============================================================================================
# >>>>>>>>>>>>>>>>>>>>>>>>>>>> Phenotype Tagging <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# AUTHOR : Roberto Olayo and Larisa Morales 
# VERSION: 1.0
# CREATED: 18/09/2017 10:55 pm
# DESCRIPTION: This program first parses the output file containing the classified sentences to
# extract the rs IDs and phenotypes by means of their tags, and then performs the final evaluation
# by comparing the results to the gold standard
# USAGE :
# > PART 1
# newarch = open("3_Sentence_Splitted_Pars_Tagged.txt","w")
# coreOUT = r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/2_All_sentences_Tagged_RS_PHEN_COREoutput.txt.out'
# > PART 2 
# tagFile = r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/1_AB_All_abstract_Tagged_RS_PHEN.txt'
# AssFile = open(r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/3_RS-Phen_Assoc.txt','w')
# REQUIREMENTS : Flat text input files for the gold standard and the classified sentences, paths must be added directly to the code
# CATEGORY : Standalone
# INPUT FORMAT : Flat text
# OUTPUT FORMAT : The output resembles the input file including the text where the phenotype 
# names are located, including the added tags (<PHE>phenotype name</PHE>), it also prints to the stdout the evaluation metrics 
# LANGUAGE : Python
# PATH PROGRAM : /Users/larisams/Dropbox/Projects/BI/MineriaDatos/scripts
# =============================================================================================

In [3]:
import re
from collections import defaultdict as df

### Part 1: Parsing sentences  

input = file generated by coreNLP ssplit   
output = Split sentences with rs IDs and phenotypes tagged. Each abstract has a header of the form >PMID followed by all of its sentences

In [9]:
# Input: path of the CoreNLP output (.out) produced for the tagged sentences file.
coreOUT = r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/2_All_sentences_Tagged_RS_PHEN_COREoutput.txt.out'

# Output: created (or truncated) in the current working directory and left open;
# the parsing cell below writes to this handle and closes it.
newarch = open("3_Sentence_Splitted_Pars_Tagged.txt", mode="w")

In [10]:
# Rewrite the file at `coreOUT` as one line per sentence in `newarch`:
#   >PMID                      (written when a sentence line starts with an 8-digit id + tab)
#   Sentence #N<TAB>sentence text
# Only the single line that immediately follows a "Sentence #N" header is treated
# as sentence text; every other line is ignored.
#
# Patterns are raw strings so that "\s", "\d" and "\t" are interpreted by the
# regex engine rather than as (invalid) Python string escapes.
sentence_header_re = re.compile(r"^Sentence\s#\d+")
pmid_prefix_re = re.compile(r"^(\d{8})\t(.*)")

SentenceNumber = None
# b == 1 only while the line right after a header is still pending. It is
# initialised here so a file that does not begin with a header cannot raise
# NameError on the first `b` check.
b = 0

try:
    with open(coreOUT, "r") as abstracts:
        for line in abstracts:
            # A header must match the full "Sentence #N" pattern; a line that merely
            # starts with the word "Sentence" is handled like any other line
            # instead of crashing on a failed match.
            header = sentence_header_re.search(line) if line.startswith("Sentence") else None
            if header is not None:
                SentenceNumber = header.group(0)
                b = 1
                continue
            if b != 1:
                continue

            pmid_match = pmid_prefix_re.search(line)
            if pmid_match is not None:
                # Retrieving PMID: a sentence carrying the id opens a new abstract.
                PMID = pmid_match.group(1)
                newarch.write(">" + PMID + "\n")
                # Rest of that line (the sentence text after the PMID and the tab).
                line = pmid_match.group(2)

            # Writing file
            newarch.write(SentenceNumber + "\t" + line.rstrip() + "\n")
            b = 0
finally:
    # `newarch` was opened in the previous cell; close it even if parsing fails so
    # the partial output is flushed and the handle is not leaked.
    newarch.close()  # This file contains the split sentences with rsIDs and phenotypes tagged

### Part 2: RS-Phenotype association extraction 

input = All abstracts collection with phenotypes and rs tagged    
output = file in tabular format. The first column is the rs, the second column is the phenotype and the third column is the PMID

In [51]:
# Build the rs-phenotype association table from the tagged abstracts file.
# Each input line is expected to start with "<PMID><TAB>"; for every <RS>...</RS>
# mention on the line, one output row "rs<TAB>phenotype<TAB>PMID" is written per
# <PHE>...</PHE> mention on the same line.
tagFile = r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/1_AB_All_abstract_Tagged_RS_PHEN.txt'
C=0  # not used in this cell; kept because other cells may rely on it
R=0  # number of <RS> mentions processed (not the number of rows written)

# Capture only the digits: the previous pattern returned the PMID together with
# its trailing tab, which added an empty fourth column to every output row.
# "\d+" (instead of "\d*") also rejects lines that have no PMID at all.
pmid_re = re.compile(r'^(\d+)\t')

# Both files are managed by `with`, so the output is flushed and closed even if
# an error occurs while processing a line.
with open (tagFile,'r') as t, \
     open(r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/3_RS-Phen_Assoc.txt','w') as AssFile:
    for line in t:
        pmid_match = pmid_re.search(line)
        if pmid_match is None:
            # Blank or malformed line without a leading PMID: skip it instead of
            # failing with AttributeError on a None match.
            continue
        pmid = pmid_match.group(1)
        all_rs = re.findall(r'<RS>(rs\d+)</RS>',line)
        all_ph = re.findall(r'<PHE>(.*?)</PHE>',line)
        for rs in all_rs:
            AssFile.writelines(rs+'\t'+ph+'\t'+pmid+'\n' for ph in all_ph)
            R+=1
print (R)

1243


### Part 3 : Performance evaluation 

input = non-redundant gold standard and associations file    
output = values of the metrics precision, recall and F1 score

In [1]:
# File with the extracted associations to evaluate (read by the comparison cell).
output = r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/3_Nonredundant_RS-Phen_Assoc.txt'

# Reference (gold standard) file the extracted associations are compared against.
goldstd = r'/Users/larisams/Dropbox/Projects/BI/MineriaDatos/Files/3_gold_std_nonredundant.txt'

In [4]:
#Loading gold standard in gs_dic
# gs_dic maps the first tab-separated column of each gold-standard line (used as
# the rs key by the comparison cell) to the newline-delimited concatenation of
# every second-column value seen for that key. `ref` counts the lines loaded.
gs_dic = df(str)
ref = 0
with open(goldstd,'r') as gs:
    for line in gs:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            # Blank or single-column line (e.g. a trailing empty line): skip it
            # rather than failing with IndexError.
            continue
        # Always terminate the entry with '\n' so consecutive phenotypes stay
        # delimited even when the line has more than two columns (previously the
        # delimiter was only present when the phenotype was the last column).
        gs_dic[fields[0]] += fields[1] + '\n'
        ref += 1


In [5]:
#Comparing output to gold standard
# extr      : number of extracted (rs, phenotype) rows read from `output`
# extr_corr : rows whose phenotype occurs (case-insensitively) in the
#             gold-standard text stored for the same rs
extr,extr_corr = 0,0
with open(output,'r') as out:
    for line in out:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank or malformed row: nothing to evaluate (previously IndexError).
            continue
        rs,phen = fields[0],fields[1]
        # The phenotype is literal text, not a pattern: escape it so characters
        # such as "(", "+" or "." cannot raise re.error or match unintended text.
        # An empty phenotype would match anything, so it is never counted as
        # correct. `.get` avoids inserting empty entries into gs_dic for rs ids
        # that are absent from the gold standard.
        if phen and re.search(re.escape(phen),gs_dic.get(rs,''),re.IGNORECASE):
            extr_corr += 1
        extr +=1

In [6]:
# Calculating evaluation metrics
# precision = correct extractions / all extractions
# recall    = correct extractions / gold-standard entries
# F_1       = harmonic mean of precision and recall
# Each ratio falls back to 0.0 when its denominator is zero (empty output file,
# empty gold standard, or no correct extraction) instead of raising
# ZeroDivisionError.
precision = extr_corr/extr if extr else 0.0
recall = extr_corr/ref if ref else 0.0
F_1 = 2*((precision*recall)/(precision+recall)) if (precision+recall) else 0.0

In [7]:
# Report the raw counts first, then the derived metrics, one space-separated row each.
for report_row in ((extr, extr_corr, ref), (precision, recall, F_1)):
    print(*report_row)

497 305 72863
0.613682092555332 0.004185937993220153 0.008315158124318428
