In [1]:
#############################################################################################################
# Extracting chemical-disease associations from the biological literature
# R214: Main Practical
# Jan Ondras (jo356), Trinity College
#############################################################################################################
# Calculate co-mentions: using simple approx. string matching
# All counts are calculated on a per sentence basis 
# (if a concept appears multiple times in a sentence, it will increase its overall count only by one)
#############################################################################################################

import numpy as np
import matplotlib.pyplot as plt
import time
import glob

# Corpus-wide counts, keyed by MeSH code: number of sentences in which the
# chemical / disease was seen. A code missing from a dictionary means count zero.
chemCnts = dict()
diseCnts = dict()
# Corpus-wide co-occurrence counts: key = chemicalCode + '+' + diseaseCode
cooc = dict()

# Same layout as chemCnts / diseCnts, but holding only the concepts seen in the
# sentence currently being parsed
TMP_chemCnts = dict()
TMP_diseCnts = dict()

# Text of the concept currently being assembled from consecutive tokens
tmpStr = ""

########################################################################################
# HELPER FUNCTIONS 

# Characters dropped from a concept name before matching: space, punctuation,
# the backslash and the double quote. Built once so each call only does set lookups.
_RMV_CHARS = frozenset(" '-,/*+.!@#$%^&*()_={}[];\\:<>?" + '"')

# ABSTRACTION FUNCTION
def abstract_concept(c):
    """Return the abstracted form of the full concept name (c).

    The abstracted form is (c) with every character listed in _RMV_CHARS
    removed and the remainder converted to lower case. Characters that are
    not listed (digits, letters, and e.g. '|' or '~') are kept.

    Works on byte strings and unicode strings alike, so it does not depend on
    the Python 2-only ``str.translate(None, chars)`` signature.
    """
    return ''.join(ch for ch in c if ch not in _RMV_CHARS).lower()

# End of concept check & update: called whenever a concept may have just ended
def eoc():
    """Ground the concept buffered in tmpStr and count it for the current sentence.

    Does nothing when tmpStr is empty. Otherwise the buffered text is
    abstracted and looked up in ground_dict:
      - found: the matched code is counted in TMP_diseCnts or TMP_chemCnts
        (chosen by the matched label) and cnt_matched is incremented;
      - not found: cnt_NOT_matched is incremented.
    In both cases tmpStr is cleared afterwards.

    Raises ValueError if the matched label is neither 'Disease' nor 'Chemical'
    (tmpStr is left untouched in that case).
    """
    global tmpStr
    global cnt_matched
    global cnt_NOT_matched

    # Nothing buffered => no concept has just ended
    if tmpStr == '':
        return

    # Simple approximate string matching via the abstracted name
    key = abstract_concept(tmpStr)
    if key not in ground_dict:
        cnt_NOT_matched += 1
    else:
        entry = ground_dict[key]          # <full name, label, code>
        label, code = entry[1], entry[2]

        # Pick the per-sentence counter that belongs to this kind of concept
        if label == 'Disease':
            sentence_counts = TMP_diseCnts
        elif label == 'Chemical':
            sentence_counts = TMP_chemCnts
        else:
            raise ValueError('Unknown concept!')

        sentence_counts[code] = sentence_counts.get(code, 0) + 1
        cnt_matched += 1

    # Ready for the next concept
    tmpStr = ''
        
# End of sentence update: fold the per-sentence counts into the total counts, once a whole sentence is parsed
def eos():
    """Update chemCnts, diseCnts and cooc from TMP_chemCnts / TMP_diseCnts.

    Counting is per sentence: every chemical and every disease present in the
    sentence raises its total count by exactly one, however many times it was
    mentioned, and every (chemical, disease) pair present in the sentence
    raises its co-occurrence count by one.

    Disease totals are updated independently of the chemicals, so a disease
    is counted once even when the sentence holds no chemical or several.
    """
    global chemCnts
    global diseCnts
    global cooc

    # Update chemical counts
    for chem in TMP_chemCnts:
        chemCnts[chem] = chemCnts.get(chem, 0) + 1

    # Update disease counts (deliberately outside the chemical loop)
    for dise in TMP_diseCnts:
        diseCnts[dise] = diseCnts.get(dise, 0) + 1

    # Update co-occurrences - minimum of 2x in total will be considered by filtering later
    for chem in TMP_chemCnts:
        for dise in TMP_diseCnts:
            # key into dictionary of co-occurrences: chemicalCode+diseaseCode, always this ordering
            k = chem + '+' + dise
            cooc[k] = cooc.get(k, 0) + 1

# END OF HELPER FUNCTIONS 
########################################################################################

# Load the MeSH concepts; every row is <name, label (Chemical or Disease), code>
MESH = np.loadtxt('./../Dataset/CDR_MeSH.tsv', dtype=str, delimiter='\t', skiprows=0)

# Construct the grounding dictionary:
# abstracted concept name -> MESH row [full concept name, label, code].
# Rows whose names abstract to the same key overwrite each other (last row wins).
ground_dict = {}
for mesh_row in MESH:
    ground_dict[abstract_concept(mesh_row[0])] = mesh_row
    
########################################################################################
########################################################################################
                    
# Iterate over PubMed abstracts and associated tags (produced by named entity recogniser)

st = time.time()
N_sentences = 0      # to count number of sentences
LINE_NUM = 0
cnt_matched = 0      # count matched concepts
cnt_NOT_matched = 0  # count not-matched concepts
cnt_cwb = 0          # count concepts without beginning tag B-...

with open('/home/janciovec/Desktop/pubmed-abstracts.conll') as f_NE:     # named entities;  301,084,933 lines, using grep -c '' pubmed-abstracts_tags
    with open('/home/janciovec/Desktop/pubmed-abstracts_tags') as f_TA:  # associated tags; 301,084,933 lines
        
        # Iterate over all lines
        while True: 
            
            LINE_NUM += 1

            # Read line - named entity
            rl = f_NE.readline()
            rlwn = rl.split('\n')[0] # read line skipping the new line character

            # End of file
            if rl == '':
                if f_TA.readline() != '':
                    raise ValueError("File with tags does not match the file with named entities!")
                # End of concept check & update
                eoc()
                # End of sentence update
                eos()
                break

            # End of sentence
            elif rlwn == '':
                if f_TA.readline() != '\n':
                    raise ValueError("File with tags does not match the file with named entities!")
                # End of concept check & update
                eoc()
                # End of sentence update
                eos()
                # Reset temporary counters for next sentence
                TMP_chemCnts = {}
                TMP_diseCnts = {}
                N_sentences += 1

            # Beginning or within sentence
            else:
                # Read associated tag
                tag = f_TA.readline().split('\n')[0]
                # Named entity - first item at the line
                NE = rlwn.split('\t')[0]
                
                # Beginning of concept
                if tag[0] == 'B': 
                    # End of concept check & update - since there could be a concept ending right now
                    eoc()
                    # Start new concept
                    tmpStr = NE
                    
                # Continuation of the concept name
                elif tag[0] == 'I': 
        
                    if tmpStr == '': # Concept without beginning: start new one from this string
                        print LINE_NUM, "Concept without beginning!"
                        cnt_cwb += 1
                    
                    tmpStr = tmpStr + NE # concatenate words without a space between them
                    
                elif tag[0] == 'O':
                    # End of concept check & update
                    eoc()
                else:
                    raise ValueError("Unknown tag!")
                    
            if LINE_NUM % 30000000 == 0:
                print 100.*LINE_NUM/301084933, " % processed; time so far: ", time.time()-st, " s"

print "Time taken ...", time.time()-st, " s"

print "Total # concepts: ", cnt_NOT_matched + cnt_matched
print "Matched concepts: ", cnt_matched

print "Concepts without beginning tag (B-...): ", cnt_cwb

print "Total # sentences: ", N_sentences

# Save all 3 dictionaries of counts
np.savez('./../Dataset/PubMed_counts.npz', 
                         chemCnts=chemCnts.items(), diseCnts=diseCnts.items(), cooc=cooc.items(), 
                         cnt_matched=cnt_matched, cnt_NOT_matched=cnt_NOT_matched, cnt_cwb=cnt_cwb, 
                         timeTaken=time.time()-st, numLines=301084933, N_sentences=N_sentences)

##############################################################################################################
# Total time: 1568.43784904 seconds = 26.14 minutes (with printing)
# Total # concepts:   9,105,005
# Matched concepts:   5,940,715
# Concepts without beginning tag (B-...):  452
# Total # sentences: 10,573,978

# To show examples of concepts without a beginning tag (B-...):
# sed -n '1750573,1750577p' < pubmed-abstracts_tags
# sed -n '1750573,1750577p' < pubmed-abstracts.conll

1750575 Concept without beginning!
2788688 Concept without beginning!
3387645 Concept without beginning!
3701988 Concept without beginning!
4262071 Concept without beginning!
4430739 Concept without beginning!
5522210 Concept without beginning!
6175482 Concept without beginning!
6175655 Concept without beginning!
7790449 Concept without beginning!
7965579 Concept without beginning!
8317729 Concept without beginning!
9115904 Concept without beginning!
9323695 Concept without beginning!
10325323 Concept without beginning!
10604145 Concept without beginning!
10752818 Concept without beginning!
11628326 Concept without beginning!
12072635 Concept without beginning!
14946930 Concept without beginning!
15644633 Concept without beginning!
15918772 Concept without beginning!
16107006 Concept without beginning!
16225038 Concept without beginning!
16565696 Concept without beginning!
17053299 Concept without beginning!
17623190 Concept without beginning!
18159327 Concept without beginning!
205426

150499343 Concept without beginning!
150886972 Concept without beginning!
151142021 Concept without beginning!
151187087 Concept without beginning!
151505034 Concept without beginning!
151929345 Concept without beginning!
153034654 Concept without beginning!
153648851 Concept without beginning!
153705342 Concept without beginning!
154373340 Concept without beginning!
154762826 Concept without beginning!
155341105 Concept without beginning!
155341210 Concept without beginning!
155408863 Concept without beginning!
155417138 Concept without beginning!
155417173 Concept without beginning!
155982759 Concept without beginning!
156634865 Concept without beginning!
156930585 Concept without beginning!
157061255 Concept without beginning!
158561873 Concept without beginning!
159397853 Concept without beginning!
159761865 Concept without beginning!
159842127 Concept without beginning!
162701164 Concept without beginning!
163175541 Concept without beginning!
163423632 Concept without beginning!
1

291600334 Concept without beginning!
292310442 Concept without beginning!
293915053 Concept without beginning!
293943886 Concept without beginning!
294582644 Concept without beginning!
294869662 Concept without beginning!
295358007 Concept without beginning!
296147656 Concept without beginning!
296286463 Concept without beginning!
296328336 Concept without beginning!
297100570 Concept without beginning!
297392187 Concept without beginning!
298064154 Concept without beginning!
299268702 Concept without beginning!
299289353 Concept without beginning!
299399175 Concept without beginning!
99.6396588201  % processed; time so far:  1560.95438719  s
300159762 Concept without beginning!
300227491 Concept without beginning!
Time taken ... 1568.43784904  s
Total # concepts:  9105005
Matched concepts:  5940715
Concepts without beginning tag (B-...):  452
Total # sentences:  10573978
