In [7]:
from os import listdir
from os.path import isfile, join
import codecs
from file_types import ParseObject
import pickle as pkl
import time

def open_unicode_file(name):
    f =  codecs.open(name, encoding='utf-8', mode='r').read()
    return f

In [8]:
from extraction import extract_topics, extract_species, extract_regions, extract_antagonists,extract_agonists,extract_year,extract_methods,extract_receptor

def create_objs():
    count = 0
    mypath = "../data/subfiles"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    timers = [0]*9
    for fname in onlyfiles:
        count += 1
        if count % 10 == 0:
            print(str(count) + " " + str(timers))
            timers = [0]*9
        ld = open_unicode_file(mypath + "/" + fname)
        
        parse = ParseObject()
        t0 = time.time()
        parse.topics = extract_topics(ld)
        t1 = time.time()
        timers[0] += t1 - t0
        parse.species = extract_species(ld)
        t2 = time.time()
        timers[1] += t2 - t1
        parse.regions = extract_regions(ld)
        t3 = time.time()
        timers[2] += t3 - t2
        parse.antagonists = extract_antagonists(ld)
        t4 = time.time()
        timers[3] += t4 - t3
        parse.agonists = extract_agonists(ld)
        t5 = time.time()
        timers[4] += t5 - t4
        parse.year = extract_year(ld)
        t6 = time.time()
        timers[5] += t6 - t5
        parse.methods = extract_methods(ld)
        t7 = time.time()
        timers[6] += t7 - t6
        parse.receptors = extract_receptor(ld)
        t8 = time.time()
        timers[7] += t8 - t7
        pkl.dump(parse, open("../data/pkls/" + fname.replace(".txt", ".p"), "wb" ))
        t9 = time.time()
        timers[8] += t9 - t8


In [9]:

def print_csv():
    count = 0
    mypath = "../data/pkls"
#     csv = open("../results.csv", "w+")
#     csv.write("Year,Receptor,Species,Methods,Agonist,Antagonist,Brain_Regions,Topic_Spec\n")
    
    
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    for fname in onlyfiles:
        count+=1
        if count % 100 == 0:
            print(count)
        obj = pkl.load(open(mypath + "/" + fname, 'rb'))
        
        obj.species = remove_commas(obj.species)
        obj.agonists = remove_commas(obj.agonists)
        obj.antagonists = remove_commas(obj.antagonists)
        obj.regions = remove_commas(obj.regions)
        obj.topics = remove_commas(obj.topics)
        
        csv.write(obj.year + ",")
        csv.write(';'.join(obj.receptors) + ",")
        csv.write(';'.join(obj.species) + ",")
        csv.write(';'.join(obj.methods) + ",")
        csv.write(';'.join(obj.agonists) + ",")
        csv.write(';'.join(obj.antagonists) + ",")
        csv.write(';'.join(obj.regions) + ",")
        csv.write(';'.join(obj.topics) + ",")
        csv.write("\n")

In [10]:
def remove_commas(st):
    new_st = set()
    for element in st:
        new_str = ''
        for char in element:
            if char == ',':
                char =  ''
            new_str += char
        new_st.add(new_str)
    return new_st
    
        
        
        

In [23]:
import re
from file_types import Receptor,ReceptorFamily,ReceptorSubtype,ReceptorPrefix,Method
def get_family(suff):
    fam_map = {
        ReceptorFamily._1 : r"(?i)1",
        ReceptorFamily._2 : r"(?i)2",
        ReceptorFamily._3 : r"(?i)3",
        ReceptorFamily._4 : r"(?i)4",
        ReceptorFamily._5 : r"(?i)5",
        ReceptorFamily._6 : r"(?i)6",
        ReceptorFamily._7 : r"(?i)7"
    }
    val = ReceptorFamily.NONE
    for k,v in fam_map.iteritems():
        if(re.search(v,suff)):
            return k, re.search(v,suff).end()
    return ReceptorFamily.NONE,0
def get_sub(suff):
    sub_map = {
        ReceptorSubtype.a : r"(?i)a",
        ReceptorSubtype.b : r"(?i)b",
        ReceptorSubtype.c : r"(?i)c",
        ReceptorSubtype.d : r"(?i)d",
        ReceptorSubtype.e : r"(?i)e",
        ReceptorSubtype.f : r"(?i)f"
    }
    val = ReceptorSubtype.NONE
    for k,v in sub_map.iteritems():
        if(re.search(v,suff)):
            return k
    return ReceptorSubtype.NONE
def extract_receptor(doc):
    recepts = set()
    pre = re.compile(ReceptorPrefix._5HT.value)
    for x in pre.finditer(doc):
        suffix = doc[x.end():(x.end()+10)]
        suffixes = re.split(",|/",suffix.replace("(", "").replace(")", "").replace("-", ""))
        pre_family = ReceptorFamily.NONE
        for suff in suffixes:
            recept = Receptor()
            recept.pre = ReceptorPrefix._5HT
            
            suff = suff.strip()
            print suff
            if len(suff) > 0:
                suff_fam,end = get_family(suff[0])
                if suff[end] == 'h':
                    continue
                if suff_fam == ReceptorFamily.NONE:
                    suff_fam = pre_family
                pre_family = suff_fam
                recept.fam = suff_fam
                suff = suff[end:].strip()
                if len(suff) > 0:
                    sub = get_sub(suff[0])
                    recept.sub = sub
            if recept.fam != ReceptorFamily.NONE:
                recepts.add(str(recept))
    return recepts

In [24]:
def create_parse(ld):
    parse = ParseObject()
    parse.topics = remove_commas(extract_topics(ld))
    
    parse.species = remove_commas(extract_species(ld))

    parse.regions = remove_commas(extract_regions(ld))

    parse.antagonists = remove_commas(extract_antagonists(ld))

    parse.agonists = remove_commas(extract_agonists(ld))

    parse.year = remove_commas(extract_year(ld))

    parse.methods = remove_commas(extract_methods(ld))

    parse.receptors = remove_commas(extract_receptor(ld))
    print(parse.receptors)


In [25]:
ld = "Audenaert, K., et al. (2003). \"Decreased 5-HT2a receptor binding in patients with anorexia nervosa.\" Journal of Nuclear Medicine 44(2): 163-169. \nIndirect estimations of brain neurotransmitters in patients with anorexia nervosa (AN) and low weight have demonstrated a reduction in brain serotonin (5-HT) turnover in general and led to hypotheses about dysfunction in the 5-HT2a receptor system. It was our aim to investigate the central 5-HT2a receptor binding index using SPECT brain imaging. Methods: The 5-HT2a receptors of low-weight patients with AN were studied by means of the highly specific radiolodinated 5-HT2a receptor antagonist 4-amino-N-[1-[3-(4-fluorophenoxy)propyl]-4-methyl-4-piperidinyl]-5-iodo-2-methoxybenzamide or (123)\-5-\-R91150. Fifteen patients with clinical diagnoses of AN and 11 age-matched healthy volunteers received intravenous injections of 185 MBq (123)\-5-\-R91150 and were scanned with high-resolution brain SPECT. Results: Compared with healthy volunteers, patients with AN had a significantly reduced 6-HT2a binding index in the left frontal cortex, the left and right parietal cortex, and the left and right occipital cortex. A significant left-right asymmetry was noted in the frontal cortex (left < right). Conclusion: These results are in accordance with diminished metabolic and perfusion of frontal and parietal cortices reported in recent neuroimaging studies and imply localized disturbed serotonergic function. The data are discussed in the light of possible confounding factors related to the low-weight AN status. A regional cortical reduction in 5-HT2a binding index is not likely to be caused by a general reduction in serotonergic function due to the possible confounding factors. Suggestions for further research are given."
create_parse(ld)

(<_sre.SRE_Match object at 0x7f4e4d6c2a48>, '2a recepto')
(<_sre.SRE_Match object at 0x7f4e4d6c29c0>, ' (5-HT) tu')
(<_sre.SRE_Match object at 0x7f4e4d6c2a48>, ') turnover')
(<_sre.SRE_Match object at 0x7f4e4d6c29c0>, '2a recepto')
(<_sre.SRE_Match object at 0x7f4e4d6c2a48>, '2a recepto')
(<_sre.SRE_Match object at 0x7f4e4d6c29c0>, '2a recepto')
(<_sre.SRE_Match object at 0x7f4e4d6c2a48>, '2a recepto')
(<_sre.SRE_Match object at 0x7f4e4d6c29c0>, '2a binding')
set(['5ht5', '5ht2a'])
