In [None]:
#############################################################################################################
# Extracting chemical-disease associations from the biological literature
# R214: Main Practical
# Jan Ondras (jo356), Trinity College
#############################################################################################################
# Scripts to extract chemical-disease pairs (CDPs) of interest from PubMed:
#############################################################################################################
'''
# 	Find LINE NUMBERS of chemical-disease pair (CDP) relations in PubMed:
#     (A) given chemical and disease concept IDs
#     (B) given the sentence number
# 	Extract PubMed text and assigned tags around specified LINE NUMBERS 
# 	Create an array of strings: each string is one PubMed sentence whose NE items were each abstracted
#                         	 (by the abstraction function) and concatenated without spaces
# 	Then search for string STR in every sentence, using the above array of sentences
'''

In [48]:
#############################################################################################################
# Find LINE NUMBERS of chemical-disease pair (CDP) relations in PubMed:
#     (A) given chemical and disease concept IDs
#     (B) given the sentence number (check at the end of sentence)
#############################################################################################################

import numpy as np
import matplotlib.pyplot as plt
import time
import glob

##################################
# What pairs to find
find_pairs = [ # chemical, disease IDs
#     ['D006220', 'D009127'],               # haloperidol + rigidity
#     ['D000082', 'D056486'], # BELOW: also found in the intersection with BioCreative CDR CID relations
#     ['D000431', 'D003866'],
#     ['D000431', 'D056486'],
#     ['D002251', 'D056486'],
#     ['D004967', 'D001943'],
#     ['D007980', 'D004409'],
#     ['D013311', 'D003920'], 
    
#     ['D010622', 'D011020'], # Below: UNKNOWN RELATIONS
#     ['D001640', 'D002280'], 
    
#     ['D005947', 'D003920'], # Glucose + diabetes 
    ['D007980', 'D004409']    # levodopa + abnormal movements
]

#################################
# What sentence to find
sentenceToFind = 166709 # top 1, SCP, DC
sentenceToFind = 9797    # top2 , SCP, DC
sentenceToFind = 53842 # top 2&3 SCC
sentenceToFind = 6628014 # BCC confusion

# Dictionaries of total counts of occurrences of chemical/disease: 
# key=chemicalCode/diseaseCode, value=count 
# if chemical/disease is not in this dictionary then assume zero count
chemCnts = {} 
diseCnts = {}
# Dictionary of cooccurrences: key=chemicalCode+diseaseCode, value=count
cooc = {}     

TMP_chemCnts = {} # dictionaries as above, but temporary, for current sentence only
TMP_diseCnts = {}

tmpStr = '' # temporary string storing a current concept being parsed

########################################################################################
# HELPER FUNCTIONS 

# Abstract the given full concept name (c)
def abstract_concept(c):
    # Remove the following characters from (c)
    rmv_chars = " '-,/*+.!@#$%^&*()_={}[];\:<>?" 
    #rmv_chars = " " 
    c = c.translate(None, rmv_chars)
    c = c.translate(None, '"') # quotes are removed separately
    # Convert to lower case
    c = c.lower()
    return c

# End of concept check & update: update counts once a whole concept is parsed
def eoc():
    
    global tmpStr
    global TMP_diseCnts
    global TMP_chemCnts
    global cnt_matched
    global cnt_NOT_matched

    if tmpStr != '': # yes, temporary string is not empty
        # Simple approximate string matching
        ac = abstract_concept(tmpStr)
        # Is this abstracted concept in our grounding dictionary ?
        if ac in ground_dict:
            matched_MESH_concept = ground_dict[ac]
            mLabel = matched_MESH_concept[1]  # matched label = chemical / disease   
            mCode =  matched_MESH_concept[2]  # matched code of chemical / disease
            #print "YES: ", tmpStr, "<==>", matched_MESH_concept[0], "\t *** ", ac
            
            # UPDATE COUNTS for this concept
            if mLabel == 'Disease':
                if mCode not in TMP_diseCnts.keys():
                    TMP_diseCnts[mCode] = 1
                else:
                    TMP_diseCnts[mCode] += 1
            elif mLabel == 'Chemical':
                if mCode not in TMP_chemCnts.keys():
                    TMP_chemCnts[mCode] = 1
                else:
                    TMP_chemCnts[mCode] += 1
            else:
                raise ValueError('Unknown concept!')

            cnt_matched += 1
        else:
            #print "NO :", tmpStr
            cnt_NOT_matched += 1
        
        tmpStr = ''
        
# End of sentence update: update total counts based on temporary counts from this sentence, once a whole sentence is parsed
def eos():
    
    global chemCnts
    global diseCnts
    global cooc
    global LINE_NUM
    
    for chem in TMP_chemCnts.keys():
        # Update chemical counts
        if chem not in chemCnts.keys(): 
            chemCnts[chem] = 1
        else:
            chemCnts[chem] += 1

        for dise in TMP_diseCnts.keys():
            
            #############################
            # Check CID pairs
            for [checkC, checkD] in find_pairs:
                if checkC == chem and checkD == dise: # found a matching pair
                    print LINE_NUM, ": ", chem, dise
            
            
            
            # Update disease counts
            if dise not in diseCnts.keys():
                diseCnts[dise] = 1
            else:
                diseCnts[dise] += 1

            # Update co-occurences - minimum of 2x in total will be considered by filtering later
            k = chem + '+' + dise # construct key into dictionary of cooccurences: chemicalCode+diseaseCode, always this ordering
            if k not in cooc.keys():
                cooc[k] = 1
            else:
                cooc[k] += 1

# END OF HELPER FUNCTIONS 
########################################################################################

# Load MESH concepts: <name, type (Chemical or Disease), code>
MESH = np.loadtxt('./../Dataset/CDR_MeSH.tsv', delimiter='\t', skiprows=0, dtype=str) # <name, label, code>

# Construct the Gounding dictionary
# maps grounded/abstracted concept to list [full concept name, type (Chemical or Disease), code]
ground_dict = {}
for r in MESH:
    ac = abstract_concept(r[0])
    ground_dict[ac] = r
    
########################################################################################
########################################################################################
                    
# Iterate over PubMed abstracts and associated tags (produced by named entity recogniser)

st = time.time()
N_sentences = 0      # to count number of sentences
LINE_NUM = 0
cnt_matched = 0      # count matched concepts
cnt_NOT_matched = 0  # count not-matched concepts
cnt_cwb = 0          # count concepts without beginning tag B-...

with open('/home/janciovec/Desktop/pubmed-abstracts.conll') as f_NE:     # named entities;  301,084,933 lines, using grep -c '' pubmed-abstracts_tags
    with open('/home/janciovec/Desktop/pubmed-abstracts_tags') as f_TA:  # associated tags; 301,084,933 lines
        
        # Iterate over all lines
        while True: 
            
            LINE_NUM += 1
            
            if N_sentences == sentenceToFind:
                print "FOUND SENTENCE ENDS AT LINE:", LINE_NUM

            # Read line - named entity
            rl = f_NE.readline()
            rlwn = rl.split('\n')[0] # read line skipping the new line character

            # End of file
            if rl == '':
                if f_TA.readline() != '':
                    raise ValueError("File with tags does not match the file with named entities!")
                # End of concept check & update
                eoc()
                # End of sentence update
                eos()
                break

            # End of sentence
            elif rlwn == '':
                if f_TA.readline() != '\n':
                    raise ValueError("File with tags does not match the file with named entities!")
                # End of concept check & update
                eoc()
                # End of sentence update
                eos()
                # Reset temporary counters for next sentence
                TMP_chemCnts = {}
                TMP_diseCnts = {}
                N_sentences += 1

            # Beginning or within sentence
            else:
                # Read associated tag
                tag = f_TA.readline().split('\n')[0]
                # Named entity - first item at the line
                NE = rlwn.split('\t')[0]
                
                # Beginning of concept
                if tag[0] == 'B': 
                    # End of concept check & update - since there could be a concept ending right now
                    eoc()
                    # Start new concept
                    tmpStr = NE
                    
                # Continuation of the concept name
                elif tag[0] == 'I': 
        
                    if tmpStr == '': # Concept without beginning: start new one from this string
                        print LINE_NUM, "Concept without beginning!"
                        cnt_cwb += 1
                    
                    tmpStr = tmpStr + NE # concatenate words without a space between them
                    
                elif tag[0] == 'O':
                    # End of concept check & update
                    eoc()
                else:
                    raise ValueError("Unknown tag!")
                    
            if LINE_NUM % 30000000 == 0:
                print 100.*LINE_NUM/301084933, " % processed; time so far: ", time.time()-st, " s"

print "Time taken ...", time.time()-st, " s"

print "Total # concepts: ", cnt_NOT_matched + cnt_matched
print "Matched concepts: ", cnt_matched

print "Concepts without beginning tag (B-...): ", cnt_cwb

print "Total # sentences: ", N_sentences

# Save all 3 dictionaries of counts
# np.savez('./../Dataset/PubMed_counts.npz', 
#                          chemCnts=chemCnts.items(), diseCnts=diseCnts.items(), cooc=cooc.items(), 
#                          cnt_matched=cnt_matched, cnt_NOT_matched=cnt_NOT_matched, cnt_cwb=cnt_cwb, 
#                          timeTaken=time.time()-st, numLines=301084933, N_sentences=N_sentences)

##############################################################################################################
# Total time: 1568.43784904 seconds = 26.14 minutes (with printing)
# Total # concepts:   9,105,005
# Matched concepts:   5,940,715
# Concepts without beginning tag (B-...):  452
# Total # sentences: 10,573,978

# To give examples of concepts without beginning
# sed -n '1750573,1750577p' < pubmed-abstracts_tags
# sed -n '1750573,1750577p' < pubmed-abstracts.conll

703925 :  D007980 D004409
752860 :  D007980 D004409
796146 :  D007980 D004409
943741 :  D007980 D004409
943780 :  D007980 D004409
943900 :  D007980 D004409
943924 :  D007980 D004409
944059 :  D007980 D004409
944146 :  D007980 D004409
944249 :  D007980 D004409
944290 :  D007980 D004409
1648137 :  D007980 D004409
1750575 Concept without beginning!
2350810 :  D007980 D004409
2466041 :  D007980 D004409
2591155 :  D007980 D004409
2788688 Concept without beginning!
3044013 :  D007980 D004409
3044041 :  D007980 D004409
3366957 :  D007980 D004409
3367028 :  D007980 D004409
3367037 :  D007980 D004409
3387645 Concept without beginning!
3701988 Concept without beginning!
3871928 :  D007980 D004409
4000738 :  D007980 D004409
4000783 :  D007980 D004409
4000965 :  D007980 D004409
4001118 :  D007980 D004409
4001135 :  D007980 D004409
4001275 :  D007980 D004409
4069983 :  D007980 D004409
4070205 :  D007980 D004409
4070247 :  D007980 D004409
4246214 :  D007980 D004409
4262071 Concept without beginning!

17623190 Concept without beginning!
18159327 Concept without beginning!
18981471 :  D007980 D004409
18981506 :  D007980 D004409
18981602 :  D007980 D004409
18981709 :  D007980 D004409
18981795 :  D007980 D004409
18981970 :  D007980 D004409
18982284 :  D007980 D004409
18982326 :  D007980 D004409
18982410 :  D007980 D004409
19686022 :  D007980 D004409
19772173 :  D007980 D004409
19772203 :  D007980 D004409
19772286 :  D007980 D004409
19772390 :  D007980 D004409
19786264 :  D007980 D004409
19786748 :  D007980 D004409
20513875 :  D007980 D004409
20513936 :  D007980 D004409
20513973 :  D007980 D004409
20514116 :  D007980 D004409
20514138 :  D007980 D004409
20514198 :  D007980 D004409
20514228 :  D007980 D004409
20514307 :  D007980 D004409
20514429 :  D007980 D004409
20514492 :  D007980 D004409
20514551 :  D007980 D004409
20514568 :  D007980 D004409
20514608 :  D007980 D004409
20514892 :  D007980 D004409
20514960 :  D007980 D004409
20515028 :  D007980 D004409
20515060 :  D007980 D004409
2051

47263702 :  D007980 D004409
47263816 :  D007980 D004409
47284466 Concept without beginning!
47673219 :  D007980 D004409
47673258 :  D007980 D004409
47953517 :  D007980 D004409
48044071 :  D007980 D004409
48044271 :  D007980 D004409
48044310 :  D007980 D004409
48044353 :  D007980 D004409
48571378 :  D007980 D004409
48571425 :  D007980 D004409
48853632 :  D007980 D004409
48853664 :  D007980 D004409
48853694 :  D007980 D004409
48856086 :  D007980 D004409
49045690 :  D007980 D004409
49045724 :  D007980 D004409
49045894 :  D007980 D004409
49045980 :  D007980 D004409
49142170 :  D007980 D004409
49142379 :  D007980 D004409
49495989 :  D007980 D004409
49496042 :  D007980 D004409
49496122 :  D007980 D004409
49496163 :  D007980 D004409
49496334 :  D007980 D004409
49791374 Concept without beginning!
49858526 :  D007980 D004409
49858738 :  D007980 D004409
49858789 :  D007980 D004409
49858816 :  D007980 D004409
50252215 Concept without beginning!
50263142 :  D007980 D004409
50282759 :  D007980 D004

69373605 :  D007980 D004409
69373712 :  D007980 D004409
69373737 :  D007980 D004409
69373908 :  D007980 D004409
69431396 :  D007980 D004409
69431460 :  D007980 D004409
69431482 :  D007980 D004409
69431503 :  D007980 D004409
69431699 :  D007980 D004409
69564775 Concept without beginning!
69605930 Concept without beginning!
69618100 Concept without beginning!
69895339 :  D007980 D004409
69959490 Concept without beginning!
69960690 Concept without beginning!
69973164 Concept without beginning!
70516150 Concept without beginning!
70617202 Concept without beginning!
70733462 :  D007980 D004409
70946433 :  D007980 D004409
70946454 :  D007980 D004409
71163007 :  D007980 D004409
71163088 :  D007980 D004409
71446913 :  D007980 D004409
71446952 :  D007980 D004409
71558650 :  D007980 D004409
72141244 :  D007980 D004409
72141260 :  D007980 D004409
72141328 :  D007980 D004409
72141349 :  D007980 D004409
72141369 :  D007980 D004409
72141409 :  D007980 D004409
72141547 :  D007980 D004409
72294958 :  

95412518 :  D007980 D004409
95412561 :  D007980 D004409
95412795 :  D007980 D004409
95412851 :  D007980 D004409
95431973 :  D007980 D004409
95762364 :  D007980 D004409
95762399 :  D007980 D004409
95762520 :  D007980 D004409
95830591 :  D007980 D004409
95830625 :  D007980 D004409
95830661 :  D007980 D004409
95830683 :  D007980 D004409
96055676 :  D007980 D004409
96055890 :  D007980 D004409
96056050 :  D007980 D004409
96064315 :  D007980 D004409
96064633 :  D007980 D004409
96104379 Concept without beginning!
96398450 :  D007980 D004409
96398632 :  D007980 D004409
96398667 :  D007980 D004409
96398702 :  D007980 D004409
96398757 :  D007980 D004409
96526832 :  D007980 D004409
96526860 :  D007980 D004409
96527171 :  D007980 D004409
96527198 :  D007980 D004409
96560887 :  D007980 D004409
96677623 Concept without beginning!
96870487 :  D007980 D004409
96870511 :  D007980 D004409
97321168 Concept without beginning!
97326089 :  D007980 D004409
98112253 :  D007980 D004409
98391618 :  D007980 D004

116737000 :  D007980 D004409
116831884 :  D007980 D004409
116832232 :  D007980 D004409
116881388 :  D007980 D004409
116881547 :  D007980 D004409
116881671 :  D007980 D004409
116881719 :  D007980 D004409
116937208 :  D007980 D004409
117304159 :  D007980 D004409
117356244 Concept without beginning!
117421908 Concept without beginning!
117470354 :  D007980 D004409
117474795 :  D007980 D004409
117474870 :  D007980 D004409
117589593 :  D007980 D004409
117827450 Concept without beginning!
117875497 :  D007980 D004409
117975051 :  D007980 D004409
118078704 :  D007980 D004409
118362251 :  D007980 D004409
118814288 :  D007980 D004409
118814330 :  D007980 D004409
118883929 :  D007980 D004409
119130475 :  D007980 D004409
119130506 :  D007980 D004409
119130543 :  D007980 D004409
119130596 :  D007980 D004409
119130628 :  D007980 D004409
119130672 :  D007980 D004409
119130713 :  D007980 D004409
119241020 :  D007980 D004409
119241123 :  D007980 D004409
119241268 :  D007980 D004409
119583634 Concept w

133691264 :  D007980 D004409
133691304 :  D007980 D004409
133719719 :  D007980 D004409
134689880 :  D007980 D004409
134689959 :  D007980 D004409
134690052 :  D007980 D004409
134690090 :  D007980 D004409
134748863 :  D007980 D004409
134973384 Concept without beginning!
135037864 :  D007980 D004409
135037889 :  D007980 D004409
135037921 :  D007980 D004409
135131827 Concept without beginning!
135307562 :  D007980 D004409
135307584 :  D007980 D004409
135307677 :  D007980 D004409
135307737 :  D007980 D004409
135307785 :  D007980 D004409
135353116 :  D007980 D004409
135353165 :  D007980 D004409
135353337 :  D007980 D004409
135470756 :  D007980 D004409
135470783 :  D007980 D004409
135470850 :  D007980 D004409
135470898 :  D007980 D004409
135470989 :  D007980 D004409
135471168 :  D007980 D004409
135824582 :  D007980 D004409
136030303 :  D007980 D004409
136030329 :  D007980 D004409
136030472 :  D007980 D004409
136244467 :  D007980 D004409
136244661 :  D007980 D004409
136546870 Concept without b

150301361 :  D007980 D004409
150370618 Concept without beginning!
150447255 :  D007980 D004409
150447309 :  D007980 D004409
150447350 :  D007980 D004409
150447397 :  D007980 D004409
150447448 :  D007980 D004409
150447522 :  D007980 D004409
150499343 Concept without beginning!
150528056 :  D007980 D004409
150528078 :  D007980 D004409
150644356 :  D007980 D004409
150886972 Concept without beginning!
150940456 :  D007980 D004409
150940507 :  D007980 D004409
150940639 :  D007980 D004409
150940672 :  D007980 D004409
150940750 :  D007980 D004409
151142021 Concept without beginning!
151187087 Concept without beginning!
151326616 :  D007980 D004409
151326648 :  D007980 D004409
151326989 :  D007980 D004409
151505034 Concept without beginning!
151591101 :  D007980 D004409
151591136 :  D007980 D004409
151929345 Concept without beginning!
152430267 :  D007980 D004409
152430334 :  D007980 D004409
152481549 :  D007980 D004409
152481563 :  D007980 D004409
152481660 :  D007980 D004409
152481820 :  D00

169845238 Concept without beginning!
170644277 Concept without beginning!
170644318 Concept without beginning!
170644369 Concept without beginning!
170749673 :  D007980 D004409
170749705 :  D007980 D004409
170749956 :  D007980 D004409
170846016 :  D007980 D004409
170846045 :  D007980 D004409
171131163 :  D007980 D004409
171131375 :  D007980 D004409
171204293 Concept without beginning!
171312505 :  D007980 D004409
171312546 :  D007980 D004409
171377603 :  D007980 D004409
171377630 :  D007980 D004409
171377837 :  D007980 D004409
171377860 :  D007980 D004409
171754270 :  D007980 D004409
171754321 :  D007980 D004409
171754498 :  D007980 D004409
171890707 :  D007980 D004409
171890763 :  D007980 D004409
171890912 :  D007980 D004409
172360744 :  D007980 D004409
172360781 :  D007980 D004409
172931569 :  D007980 D004409
172931609 :  D007980 D004409
172931717 :  D007980 D004409
172931912 :  D007980 D004409
172937344 Concept without beginning!
172937376 Concept without beginning!
173169543 :  D00

186049662 :  D007980 D004409
186049718 :  D007980 D004409
186180114 Concept without beginning!
186215518 :  D007980 D004409
186408077 :  D007980 D004409
186408130 :  D007980 D004409
186446978 :  D007980 D004409
186700175 Concept without beginning!
186957477 :  D007980 D004409
186957531 :  D007980 D004409
187027923 Concept without beginning!
187285716 :  D007980 D004409
187288471 :  D007980 D004409
187288583 :  D007980 D004409
187288701 :  D007980 D004409
187422194 :  D007980 D004409
187423942 :  D007980 D004409
187424043 :  D007980 D004409
187424086 :  D007980 D004409
187426799 :  D007980 D004409
187426833 :  D007980 D004409
187426854 :  D007980 D004409
187426896 :  D007980 D004409
187426939 :  D007980 D004409
187506086 :  D007980 D004409
187506134 :  D007980 D004409
187506333 :  D007980 D004409
187506451 :  D007980 D004409
187514665 :  D007980 D004409
187514702 :  D007980 D004409
187514855 :  D007980 D004409
187514938 :  D007980 D004409
187517233 Concept without beginning!
187594382 C

201301070 :  D007980 D004409
201327593 :  D007980 D004409
201327674 :  D007980 D004409
201327824 :  D007980 D004409
201327864 :  D007980 D004409
201330181 :  D007980 D004409
201330217 :  D007980 D004409
201330419 :  D007980 D004409
201372073 :  D007980 D004409
201372139 :  D007980 D004409
201444609 :  D007980 D004409
201453577 :  D007980 D004409
201453640 :  D007980 D004409
201453665 :  D007980 D004409
201453717 :  D007980 D004409
201454001 :  D007980 D004409
201458600 :  D007980 D004409
201458627 :  D007980 D004409
201458724 :  D007980 D004409
201458932 :  D007980 D004409
201458958 :  D007980 D004409
201472915 :  D007980 D004409
201473258 :  D007980 D004409
201473289 :  D007980 D004409
201473331 :  D007980 D004409
201518147 :  D007980 D004409
201518177 :  D007980 D004409
201518450 :  D007980 D004409
201520129 :  D007980 D004409
201520237 :  D007980 D004409
201520400 :  D007980 D004409
201724523 Concept without beginning!
201743216 :  D007980 D004409
201743302 :  D007980 D004409
201797

215261414 :  D007980 D004409
215261469 :  D007980 D004409
215261515 :  D007980 D004409
215261578 :  D007980 D004409
215261651 :  D007980 D004409
215273626 :  D007980 D004409
215273688 :  D007980 D004409
215273730 :  D007980 D004409
215588031 Concept without beginning!
216061195 :  D007980 D004409
216061289 :  D007980 D004409
216177779 :  D007980 D004409
216177832 :  D007980 D004409
216464440 :  D007980 D004409
216464464 :  D007980 D004409
216464522 :  D007980 D004409
216492951 :  D007980 D004409
216492971 :  D007980 D004409
216493033 :  D007980 D004409
216493113 :  D007980 D004409
216757235 Concept without beginning!
216809322 Concept without beginning!
216834318 :  D007980 D004409
216834436 :  D007980 D004409
217143727 :  D007980 D004409
217143899 :  D007980 D004409
217143990 :  D007980 D004409
217144035 :  D007980 D004409
217182507 :  D007980 D004409
217182559 :  D007980 D004409
217182601 :  D007980 D004409
217182794 :  D007980 D004409
217304900 Concept without beginning!
217420064 :

KeyboardInterrupt: 

In [50]:
#########################################################################################################
# Extract PubMed text and assigned tags around specified LINE NUMBERS (specify window size before and after)
#########################################################################################################
# D001943
# D003920
# D003866
# D056486 2x
# D004409
# D009127

find_lines = [
    191423420,                   # BCC confusion
    1517791,            # top 2.3 by SCC
    278813,             # top 2, by SCC, DC
    4743043                # top 1, levodopa
     
#     8946,
#     89918,
#     116820,
#     160704,
#     133594, 
#     796146,
#     20515371, 
#     622143, # UNKNOWN 1
#     1716334, 
#     3109256, # UNKNOWN 2
#     31614081, 
#     164477584, 
#     165835766, 
    
    # Glucose, diabetes
#     2052553,
#     2052455,
#     2183972,
#     2350946,
#     2904875,
#     72273, 
#     167380, 
#     173424, 
#     269025,
#     471660,
#     938473,
#     1120124,
#     1127074,
#     1127392
    
]
WINDOW_before = 40 # print
WINDOW_after = 0


for FIND_LINE_NUM in find_lines:
    # FIND_LINE_NUM = 19772286
    RESULT_ITEMS = ''
    RESULT_TAGS = ''

    LINE_NUM = 0
    with open('/home/janciovec/Desktop/pubmed-abstracts.conll') as f_NE:     # named entities;  301,084,933 lines, using grep -c '' pubmed-abstracts_tags
        with open('/home/janciovec/Desktop/pubmed-abstracts_tags') as f_TA:  # associated tags; 301,084,933 lines

            # Iterate over all lines
            while True: 

                LINE_NUM += 1

                # Read line - named entity
                rl = f_NE.readline()
                rlwn = rl.split('\n')[0] # read line skipping the new line character

                # End of file
                if rl == '':
                    if f_TA.readline() != '':
                        raise ValueError("File with tags does not match the file with named entities!")
                    break



                if LINE_NUM < FIND_LINE_NUM - WINDOW_before:
                    f_TA.readline()
                    continue
                elif LINE_NUM > FIND_LINE_NUM + WINDOW_after:
                    break
                else:              # LINE OF INTEREST - PRINT

                    # End of sentence
                    if rlwn == '':
                        if f_TA.readline() != '\n':
                            raise ValueError("File with tags does not match the file with named entities!")

                        RESULT_ITEMS = RESULT_ITEMS + ' '

                    # Beginning or within sentence
                    else:
                        # Read associated tag
                        tag = f_TA.readline().split('\n')[0]
                        # Named entity - first item at the line
                        NE = rlwn.split('\t')[0]

                        RESULT_ITEMS = RESULT_ITEMS + ' ' + NE
                        RESULT_TAGS = RESULT_TAGS + ' ' + tag

    print RESULT_ITEMS
    print RESULT_TAGS
    print 

 chemopreventive and chemotherapeutic management of cancer .  PI3K - AKT signaling is a downstream effector of retinoid prevention of murine basal cell carcinogenesis .  Basal cell carcinoma ( BCC ) is the most common human cancer .  We
 O O O O O B-Disease O O O O O O O O O O O O O O O O O O B-Disease I-Disease I-Disease O B-Chemical O O O O O O B-Disease O O

 in cancer patients .  Previous studies have suggested the potentiation of the antitumor activity of certain cytotoxic drugs , such as cisplatin and doxorubicin , in human cancer cell lines by treatment with anti - EGFR antibodies .  We
 O B-Disease O O O O O O O O O O O O O O O O O O O B-Chemical O B-Chemical O O O B-Disease O O O O O O O O O O O

 lamivudine resistance : A study by in vitro full - length viral DNA transfection .  Recently , lamivudine used to treat patients with hepatitis B virus ( HBV ) infection was revealed to have potent antiviral activity .  However
 B-Chemical O O O O O O O O O O O O O O O O B-Chemical O

In [22]:
#########################################################################################################
# Create array of strings: each string is 1 sentence from PubMed where each NE item was abstracted 
#                         (by abstraction function) and concatenate without spaces
#########################################################################################################

import numpy as np
import matplotlib.pyplot as plt
import time
import glob
from collections import deque

tmpStr = '' # temporary string storing a current concept being parsed
RESULT = []

# Abstract the given full concept name (c)
def abstract_concept(c):
    # Remove the following characters from (c)
    rmv_chars = " '-,/*+.!@#$%^&*()_={}[];\:<>?" 
    #rmv_chars = " " 
    c = c.translate(None, rmv_chars)
    c = c.translate(None, '"') # quotes are removed separately
    # Convert to lower case
    c = c.lower()
    return c

# Iterate over PubMed abstracts and associated tags (produced by named entity recogniser)

st = time.time()
LINE_NUM = 0
with open('/home/janciovec/Desktop/pubmed-abstracts.conll') as f_NE:     # named entities;  301,084,933 lines, using grep -c '' pubmed-abstracts_tags
#     with open('/home/janciovec/Desktop/pubmed-abstracts_tags') as f_TA:  # associated tags; 301,084,933 lines
        
        # Iterate over all lines
        while True: 
            
            LINE_NUM += 1

            # Read line - named entity
            rl = f_NE.readline()
            rlwn = rl.split('\n')[0] # read line skipping the new line character

            # End of file
            if rl == '':
                break

            # End of sentence
            elif rlwn == '':
                RESULT.append( tmpStr )
                tmpStr = ''
            # Beginning or within sentence
            else:
                # Named entity item - first item at the line
                NE = rlwn.split('\t')[0]
                
                tmpStr = tmpStr + abstract_concept(NE)
    
            if LINE_NUM % 30000000 == 0:
                print 100.*LINE_NUM/301084933, " % processed; time so far: ", time.time()-st, " s"

print "Time taken ...", time.time()-st, " s"
# np.savez('./../Dataset/PubMed_concatBySentences.npz', R=RESULT)

9.96396588201  % processed; time so far:  59.7284851074  s
19.927931764  % processed; time so far:  120.373895168  s
29.891897646  % processed; time so far:  188.896291018  s
39.855863528  % processed; time so far:  253.689700127  s
49.8198294101  % processed; time so far:  318.72897315  s
59.7837952921  % processed; time so far:  385.987893105  s
69.7477611741  % processed; time so far:  455.371173143  s
79.7117270561  % processed; time so far:  524.035975218  s
89.6756929381  % processed; time so far:  592.425779104  s
99.6396588201  % processed; time so far:  657.544903994  s
Time taken ... 659.951008081  s


MemoryError: 

In [47]:
#########################################################################################################
# Search for string STR in every sentence (sentence generated without spaces, and each NE item was abstracted);
# uses the above array of sentences
#########################################################################################################

# 1. option) search for 1 string
STR = 'basalcellcarcinomabcc'
# STR = 'phencyclidinepcp'
# STR = 'pneumocystispneumoniapcp'

# 2. option) search for 2 strings
# STR = 'abnormalmovements' # top 1
# STR2 = 'levodopa'

# STR = 'lamivudine'  # top 2
# STR2 = 'hepatitisb'

# STR = 'doxorubicin'
# STR2 = 'cancer'

# R = np.load('./../Dataset/PubMed_concatBySentences.npz')['R']

cnt = 0
L = 0
for r in RESULT:
    L += 1
    if STR in r:
        cnt += 1
        if STR2 in r:         # TO REMOVE if searching 2 strings
            print L, r
        
print cnt

2006507 aberrantactivationofthehedgehogpathwayinkeratinocytesisahallmarkofbasalcellcarcinomabccthemostcommoncancerinlightskinnedindividuals
2847105 backgroundandobjectiveswhilebasalcellcarcinomabcciseffectivelytreatedbyseveralmethodsmanypatientswithnumerousorfrequentlyoccurringlesionsseekalternativesthatcantreatmultiplecancerswithimprovedcosmeticoutcome
3046860 methodsrealtimereversetranscriptionpolymerasechainreactionexperimentswereperformedtoanalysethemrnaexpressionlevelsofpsoriasintogetherwithinvolucrinasamarkerforepithelialdifferentiationandinterleukin8il8asamarkerforinflammationinskinbiopsysamplesfrompatientswithprecancerousskinlesionspsln6squamouscellcarcinomasccn11basalcellcarcinomabccn17andhealthycontrolsn10
3607843 mutationsinthehedgehogsignalingpathwayisresponsiblefortheformationofvariouscancersincludingsomeformsofbasalcellcarcinomabcc
4692620 basalcellcarcinomabccisknowntobethemostprevalenttypeofskincancerandmelanomaisthemostlethalform
4874556 basalcellcarcinomabccoftheskini

In [None]:
# EXAMPLES

# established or characterized six lines of human breast cancer maintained in long - term tissue culture for at least 1 year and have examined these lines for estrogen responsiveness . 
#  O O O O O O O B-Disease I-Disease O O O O O O O O O O O O O O O O O O B-Chemical O O

#  explants .  The streptozotocin - induced diabetic ( STZ - DB ) rat model is associated with fetal hyperglycemia , but with low to normal plasma insulin concentration . 
#  O O O B-Chemical O O B-Disease O B-Chemical I-Chemical I-Chemical O O O O O O O O O O O O O O O O O O

#  Among patients with less than 100 % confidence , confidence interacted with age , depressed mood scores , addiction scores , and alcohol intake to discriminate 5 additional subgroups . 
#  O O O O O O O O O O O O O O B-Disease I-Disease O O O O O O B-Chemical O O O O O O O

#  natural root and root callus extracts of Cichorium intybus were compared for their anti - hepatotoxic effects in Wistar strain of Albino rats against carbon tetrachloride induced hepatic damage . 
#  O O O O O O O O O O O O O O O O O O O O O O O O B-Chemical I-Chemical O B-Disease I-Disease O

#  pure mixture of linoleate - rich polyunsaturated phosphatidylcholines that protects against alcohol - induced liver injury , also affects 2E1 , either in the presence or absence of iron . 
#  O O O B-Chemical O O O O O O O B-Chemical O O B-Disease I-Disease O O O O O O O O O O O O B-Chemical O

#  is therefore recommended that flunitrazepam should be classified as a controlled substance in Sweden as it is elsewhere .  [ Understanding and treating levodopa - induced dyskinesias ] . 
#  O O O O B-Chemical O O O O O O O O O O O O O O O O O O B-Chemical O O B-Disease O O

#  syndrome with subcutaneous apomorphine monotherapy .  A 20 - year - old psychiatric patient receiving haloperidol treatment developed acute - onset fever , rigidity , and mental changes . 
#  I-Disease O O B-Chemical O O O O O O O O B-Disease O O B-Chemical O O O O O B-Disease O B-Disease O O O O O

#  LP - BM5 infection , we studied the effects of infection on discriminative stimulus properties of phencyclidine ( PCP ) , a Ca2 + channel blocker at NMDA receptors . 
#  B-Disease I-Disease I-Disease I-Disease O O O O O O B-Disease O O O O O B-Chemical O B-Disease O O O O O O O O B-Chemical O O

#  administration of the NMDA receptor antagonist phencyclidine ( PCP ; 4 mg / kg ) produced a profound reduction in prepulse inhibition of the acoustic startle response in rats . 
#  O O O B-Chemical O O B-Chemical O B-Disease O O O O O O O O O O O O O O O O O O O O O

#  , pep pills , lysergic acid diethylamide ( LSD ) , cocaine , designer drugs , phencyclidine ( PCP ) , Talwin and Ritalin , speed , and narcotics . 
#  O O O O B-Chemical I-Chemical I-Chemical O O O O B-Chemical O O O O B-Chemical O B-Disease O O B-Chemical O O O O O O O O

#  to the PTCH1 protein .  The PTCH1 gene is a human tumour suppressor gene frequently mutated in basal cell carcinoma ( BCC ) and several other tumour types . 
#  O O O O O O O O O O O B-Disease O O O O O B-Disease I-Disease I-Disease O B-Chemical O O O O B-Disease O O

#  have been reported for a variety of different cancers .  Differentially expressed miRNAs have not been systematically evaluated in basal cell carcinoma ( BCC ) of the skin . 
#  O O O O O O O O B-Disease O O O O O O O O O O B-Disease I-Disease I-Disease O B-Chemical O O O O O

#  the association between NSAID use and the risk of squamous cell carcinoma ( SCC ) , basal cell carcinoma ( BCC ) , and malignant melanoma ( MM ) . 
#  O O O O O O O O O B-Disease I-Disease I-Disease O O O O B-Disease I-Disease I-Disease O B-Chemical O O O O O O O O O
