In [1]:
from trialstreamer import dbutil, config
import psycopg2
import trialstreamer
import json
import tqdm
from trialstreamer import minimap

In [44]:
from psycopg2.extensions import QuotedString

In [47]:
dbutil.db.rollback()

In [4]:
cur = dbutil.db.cursor(cursor_factory=psycopg2.extras.RealDictCursor)


In [5]:
import pickle

with open('trialstreamer/data/drugs_from_class.pck', 'rb') as f:
    drugs_from_class = pickle.load(f)
    
with open('trialstreamer/data/class_from_drug.pck', 'rb') as f:
    class_from_drug = pickle.load(f)

In [6]:
def nested_mesh(q):
    out = []
    for i in q:
        if isinstance(i, str):
            if i in ['OR', 'AND']:
                out.append(i)
            else:
                terms = [t['mesh_term'] for t in minimap.minimap(i)]
                if len(terms) == 1:
                    out.append(terms[0])
                else:
                    out.append(["OR", *terms])
        else:
            out.append(nested_mesh(i))
    return out

def nested_subtrees(q):
    """
    adds an 'or'ed subtree where needed
    """
    out = []
    for i in q:
        if isinstance(i, str):
            if i in ['OR', 'AND']:
                out.append(i)
            else:
                terms = get_subtree(i)
                if len(terms) == 1:
                    out.append(terms[0])
                else:
                    out.append(["OR", *terms])
        else:
            out.append(nested_subtrees(i))
    return out

def nested_pa(q):
    """
    if any terms are pharmacological actions, then add the component drugs to the search
    """
    out = []
    for i in q:
        if isinstance(i, str):
            if i in ['OR', 'AND']:
                out.append(i)
            elif i in drugs_from_class:
                terms = [r['mesh_term'] for r in drugs_from_class[i]]
                if len(terms) == 1:
                    out.append(terms[0])
                else:
                    out.append(["OR", *terms])
            else:
                out.append(i)
        else:
            out.append(nested_pa(i))
    return out   
        
def unravel(q):
    return nested_pa(nested_subtrees(nested_mesh(q)))

def queryize(q, pio='population'):
    """
    deals with queries in format:
    p=["Migraine", ["OR", "Migraine with Aura", "Migraine Disorders"]]
    """
    if isinstance(q, str):
        return 'pp.{}_mesh @> \'[{{"mesh_term": {}}}]\''.format(pio, json.dumps(q))
    elif isinstance(q, list):
        out = []
        for i in q:
            if isinstance(i, str):
                if i in ['OR', 'AND']:
                    
                    if len(q) < 3: # otherwise an AND/OR without anything to operate over
                        return ""
                    out2 = []
                    for r in q[1:]:
                        qr = queryize(r, pio=pio)
                        if qr:
                            out2.append(qr)                    
                    out.extend(['(' + ' {} '.format(i).join(out2) + ')'])
                    break                    
                else:
                    out.append(queryize(i, pio=pio))   

            elif isinstance(i, list):
                out.append(queryize(i, pio=pio))
        return ' '.join(out)

In [7]:
print(QuotedString("Bowen's disease").getquoted().decode())

'Bowen''s disease'


In [8]:
make_pico_q2(p="squamous cell carcinoma", get_mesh=True, subtrees=False)

NameError: name 'make_pico_q2' is not defined

In [9]:
print(queryize(["OR", ["AND", "Calcium Channel Blockers", ["OR", "Stroke", "Stroke2"]], "Atrial Fib"]))

((pp.population_mesh @> '[{"mesh_term": "Calcium Channel Blockers"}]' AND (pp.population_mesh @> '[{"mesh_term": "Stroke"}]' OR pp.population_mesh @> '[{"mesh_term": "Stroke2"}]')) OR pp.population_mesh @> '[{"mesh_term": "Atrial Fib"}]')


In [10]:
def make_pico_q2(p=None, i=None, o=None, join_reg=False, join_pm=False, subtrees=False, get_mesh=False, pm_data=True):
    """Calcium Channel Blockers
    deals with queries in format:
    p=["Migraine", ["OR", "Migraine with Aura", "Migraine Disorders"]]
    """
    
    if isinstance(p, str):
        if p:
            p = [p]
        else:
            p = None
    if isinstance(i, str):
        if i:
            i = [i]
        else:
            i = None

    if isinstance(o, str):
        if o:
            o = [o]
        else:
            o = None

    if not (p or i or o):
        raise Exception("No terms in the search")
    
    pico_parts = []
    
    if p:
        pico_parts.append(queryize(unravel(p), pio='population'))
        
    if i:
        pico_parts.append(queryize(unravel(i), pio='interventions'))
    if o:
        pico_parts.append(queryize(unravel(o), pio='outcomes'))    
    
    parts = []
    parts.append('select pp.pmid')
    
    if get_mesh:
        parts.append(', pp.population_mesh, pp.interventions_mesh, pp.outcomes_mesh')
    
    if join_reg:
        parts.append(', rl.regid')
        
    if join_pm:
        parts.append(', pm.ti, pm.ab')
    
    if pm_data and join_pm:
        parts.append(', pm.pm_data')
            
    parts.append('from pubmed_pico as pp')
    
    if join_reg:
        parts.append(', registry_links as rl')
        
    if join_pm:
        parts.append(', pubmed as pm')
    
    parts.append('where')

    parts.append('(')
    parts.append(') and ('.join(pico_parts))
    parts.append(')')
    if join_reg:
        parts.append('and pp.pmid=rl.pmid')
    
    if join_pm:
        parts.append('and pp.pmid=pm.pmid')
    
    return ' '.join(parts)
    


In [11]:
def make_pico_q(p=None, i=None, o=None, join_reg=False, join_pm=False, subtrees=True, get_mesh=False, pm_data=True):
    
    if isinstance(p, str):
        p = [p]
    if isinstance(i, str):
        i = [i]
    if isinstance(o, str):
        o = [o]
    
    pico_parts = []
    
    if p:
        if subtrees:            
            pico_parts.extend([' or '.join(['pp.population_mesh @> \'[{{"mesh_term": "{}"}}]\''.format(r)
                                           for r in get_subtree(pp)]) for pp in p])
        else:
            pico_parts.extend(['pp.population_mesh @> \'[{{"mesh_term": "{}"}}]\''.format(r) for r in p])
    if i:
        if subtrees:
            pico_parts.extend([' or '.join(['pp.interventions_mesh @> \'[{{"mesh_term": "{}"}}]\''.format(r)
                                           for r in get_subtree(ip)]) for ip in i])

        else:
            pico_parts.extend(['pp.interventions_mesh @> \'[{{"mesh_term": "{}"}}]\''.format(r) for r in i])
    if o:
        if subtrees:
            pico_parts.extend([' or '.join(['pp.outcomes_mesh @> \'[{{"mesh_term": "{}"}}]\''.format(r)
                                           for r in get_subtree(op)]) for op in o])
        else:
            pico_parts.extend(['pp.outcomes_mesh @> \'[{{"mesh_term": "{}"}}]\''.format(r) for r in o])
    
    
    parts = []
    parts.append('select pp.pmid')
    
    if get_mesh:
        parts.append(', pp.population_mesh, pp.interventions_mesh, pp.outcomes_mesh')
        
    if pm_data and join_pm:
        parts.append(', pm.pm_data')
    
    if join_reg:
        parts.append(', rl.regid')
        
    if join_pm:
        parts.append(', pm.ti, pm.ab')
            
    parts.append('from pubmed_pico as pp')
    
    if join_reg:
        parts.append(', registry_links as rl')
        
    if join_pm:
        parts.append(', pubmed as pm')
    
    parts.append('where')

    parts.append('(')
    parts.append(') and ('.join(pico_parts))
    parts.append(')')
    if join_reg:
        parts.append('and pp.pmid=rl.pmid')
    
    if join_pm:
        parts.append('and pp.pmid=pm.pmid')
    
    return ' '.join(parts)
    


In [26]:

def pmids(p=None, i=None, o=None):
    sql = make_pico_q2(p=p, i=i, o=o)
    cur.execute(sql)
    records = cur.fetchall()
    return [r['pmid'] for r in records]
 

def linked_search(p=None, i=None, o=None, join_pm=False, join_reg=False, get_mesh=False, pm_data=False):    
    sql = make_pico_q2(p=p, i=i, o=o, join_pm=join_pm, join_reg=join_reg, get_mesh=get_mesh, pm_data=pm_data)
    cur.execute(sql)
    records = cur.fetchall()
    return list(records)
    
def pubmed(p=None, i=None, o=None, pm_data=False):
    return linked_search(p=p, i=i, o=o, join_pm=True, pm_data=pm_data)

def reg(p=None, i=None, o=None):
    return linked_search(p=p, i=i, o=o, join_reg=True)

    
def pubmed_ti(p=None, i=None, o=None):
    ab = pubmed(p, i, o)
    return [r['ti'] for r in ab]
    
    
def mesh(p=None, i=None, o=None):
    ab = linked_search(p, i, o, get_mesh=True)
    return ab


In [13]:
import pickle
with open('trialstreamer/data/minimap/subtrees.pck', 'rb') as f:
    subtrees = pickle.load(f)

def get_subtree(term):
    out = set([term])
    if term in subtrees:
        for subterms in subtrees[term]:
            out.update(get_subtree(subterms))
    return list(out)

In [14]:
print(make_pico_q(p='squamous cell carcinoma'))

select pp.pmid from pubmed_pico as pp where ( pp.population_mesh @> '[{"mesh_term": "squamous cell carcinoma"}]' )


In [15]:
json.dumps("Bowen's disease")

'"Bowen\'s disease"'

In [16]:
terms = mesh(p="Acne")

In [17]:
len(terms)

574

In [49]:
len(pubmed_ti(p=["stroke"], i=["calcium channel blockers"]))

86

In [588]:
minimap.minimap("Migraine with aura")

[{'mesh_term': 'Migraine with Aura',
  'mesh_ui': 'D020325',
  'cui': 'C0154723',
  'start_idx': 0,
  'end_idx': 3,
  'source_text': 'migraine with aura'}]

In [336]:
# top co-occuring p's
from collections import Counter


c = Counter()
for r in terms:
    for t in r['population_mesh']:
        c[t['mesh_term']]+= 1
        
c.most_common(30)

[('Acne', 572),
 ('Patient', 399),
 ('Acne Vulgaris', 191),
 ('Aging', 88),
 ('Cicatrix', 73),
 ('Female', 71),
 ('Women', 61),
 ('Therapeutics', 58),
 ('Atrophy', 49),
 ('Skin', 43),
 ('Male', 41),
 ('Adult', 34),
 ('Face', 32),
 ('Population Groups', 32),
 ('(GlyA)12', 30),
 ('Adolescent', 24),
 ('Asians', 18),
 ('(Asn-Ala-Asn-Pro)3', 18),
 ('(Gly)10', 18),
 ('Isotretinoin', 17),
 ('Men', 15),
 ('Researcher', 14),
 ('Dermatology', 14),
 ('Hirsutism', 13),
 ('Volunteers', 13),
 ('Mental Suffering', 13),
 ('Aged', 12),
 ('Koreans', 12),
 ('(Glc)4', 11),
 ('control', 10)]

In [337]:
# top outcomes


c = Counter()
for r in terms:
    for t in r['outcomes_mesh']:
        c[t['mesh_term']]+= 1

c.most_common(50)

[('Acne', 236),
 ('Safety', 141),
 ('adverse effects', 81),
 ('Skin', 62),
 ('Erythemas', 62),
 ('Cicatrix', 48),
 ('Index', 43),
 ('Sebum', 41),
 ('Scales', 40),
 ('Therapeutics', 40),
 ('Researcher', 38),
 ('Patient', 31),
 ('Quality of Life', 29),
 ('Pain', 29),
 ('Overall', 25),
 ('Satisfaction', 23),
 ('Hyperpigmentation', 21),
 ('Burn', 20),
 ('Treatment Efficacy', 20),
 ('Serum', 19),
 ('Patient Satisfaction', 19),
 ('symptoms', 17),
 ('Pruritus', 17),
 ('Plasma', 17),
 ('Testosterone', 16),
 ('Immunoglobulin A', 16),
 ('Concentration', 15),
 ('Production', 14),
 ('Questionnaires', 14),
 ('Incidence', 13),
 ('Atrophy', 13),
 ('Visual Analog Scale', 13),
 ('Lipids', 12),
 ('Time', 12),
 ('Dermatology', 12),
 ('Bodily Secretions', 11),
 ('frequency', 11),
 ('Sex Hormone-Binding Globulin', 11),
 ('Laboratory', 10),
 ('Edema', 10),
 ('Water', 10),
 ('Physician', 10),
 ('Inflammations', 10),
 ('Androgens', 10),
 ('Dermatitis, Seborrheic', 10),
 ('Transients', 9),
 ('Gagging', 9),
 ('

In [338]:
# top interventions
c = Counter()
for r in terms:
    for t in r['interventions_mesh']:
        c[t['mesh_term']]+= 1
        
c.most_common(20)

[('Gels', 131),
 ('Placebos', 102),
 ('Therapeutics', 84),
 ('Benzoyl Peroxide', 83),
 ('Clindamycin', 59),
 ('Adapalene', 57),
 ('Isotretinoin', 52),
 ('Laser', 49),
 ('Tretinoin', 43),
 ('clindamycin phosphate', 42),
 ('Ethinyl Estradiol', 36),
 ('Light', 35),
 ('N2,N2-dimethylguanosine', 35),
 ('Population Groups', 30),
 ('Photochemotherapy', 27),
 ('5-(PAHA)', 27),
 ('3,3-dimethyl-1-phenyltriazene', 21),
 ('(SNOPPP)2', 21),
 ('Contraceptives, Oral', 20),
 ('Peroxides', 20)]

In [28]:
def get_simple_cite(pm_data_row):
    if 'authors' in pm_data_row and pm_data_row['authors']:
        return "{} {}, {} ({})".format(pm_data_row['authors'][0]['LastName'], pm_data_row['authors'][0]['Initials'], pm_data_row['year'], pm_data_row['pmid'])
    else:
        return pm_data_row['pmid']

In [36]:
pubmed_ti(i='aspirin', o='colorectal cancer')

['Reproductive, lifestyle, and anthropometric risk factors for cancer in elderly women.',
 'Folate, vitamin B6, multivitamin supplements, and colorectal cancer risk in women.',
 'Cyclooxygenase-2 expression and recurrence of colorectal adenomas: effect of aspirin chemoprevention.',
 'Effect of daily aspirin on long-term risk of death due to cancer: analysis of individual patient data from randomised trials.',
 "Aspirin use and colorectal cancer: post-trial follow-up data from the Physicians' Health Study.",
 'Alternate-day, low-dose aspirin and cancer risk: long-term observational follow-up of a randomized trial.',
 'Nonsteroidal anti-inflammatory drug use and protection against colorectal cancer in women.',
 'Preventive effects of low-dose aspirin on colorectal adenoma growth in patients with familial adenomatous polyposis: double-blind, randomized clinical trial.',
 "Low-dose aspirin in the primary prevention of cancer: the Women's Health Study: a randomized controlled trial.",
 'Obe

In [33]:
results = pubmed(p='Stroke', i=["AND", "Calcium channel blockers", ["OR", "placebo", "control"]], pm_data=True)

papers = []

for r in results:
    papers.append(get_simple_cite(r['pm_data']))
    
papers.sort()

print('\n'.join(papers))

1731418
Afshari D, 2013 (22749947)
Ahmed N, 2000 (10835440)
Ahmed N, 2001 (11350571)
Ameriso SF, 1992 (26486430)
Azcona A, 1990 (2150642)
Bogousslavsky J, 1990 (2404768)
Fagan SC, 1988 (3354029)
Fischhof PK, 1993 (7794292)
Fogelholm R, 2000 (10773644)
Franke CL, 1996 (8825274)
Gaab MR, 1985 (4010872)
Gelmers HJ, 1987 (2433932)
Gelmers HJ, 1988 (3275894)
Gelmers HJ, 1990 (2260153)
Heiss WD, 1990 (2298828)
Holthoff V, 1990 (2260157)
Horn J, 2001 (11157183)
Kaste M, 1994 (8023348)
Kim JS, 2011 (21316855)
Kirwan BA, 2007 (17509947)
Lawlor B, 2014 (25300460)
Lawlor B, 2018 (30248105)
Limburg M, 1990 (2358004)
Lisk DR, 1993 (8352673)
Liu L, 1998 (9869017)
Liu LS, 1989 (2505975)
Martínez-Vila E, 1990 (2195714)
Muir KW, 1998 (9596235)
Nag D, 1998 (10874361)
Novosel D, 1994 (7898083)
Paci A, 1989 (2683557)
Pantoni L, 2000 (10831772)
Pantoni L, 2000 (10831773)
Perez I, 1998 (9391222)
Poole-Wilson PA, 2007 (17573989)
Saver JL, 2014 (24444116)
Saver JL, 2015 (25651247)
Schrier RW, 2002 (11849464)


In [613]:
len(papers)

43

In [608]:
pubmed(p='Stroke', pm_data=True)[0]

{'pmid': '7631339',
 'ti': 'Hyperbaric oxygen in the treatment of acute ischemic stroke. A double-blind pilot study.',
 'ab': "BACKGROUND AND PURPOSE\n\n\nThe effects of hyperbaric oxygen (HBO) therapy on humans are uncertain. Our study aims first to outline the practical aspects and the safety of HBO treatment and then to evaluate the effect of HBO on long-term disability.\nMETHODS\n\n\nPatients who experienced middle cerebral artery occlusion and were seen within 24 hours of onset were randomized to receive either active (HBO) or sham (air) treatment. The HBO patients were exposed daily to 40 minutes at 1.5 atmospheres absolute for a total of 10 dives. We used the Orgogozo scale to establish a pretreatment functional level. Changes in the Orgogozo scale score at 6 months and 1 year after therapy were used to assess the therapeutic efficacy of HBO. In addition, we used the Rankin scale and our own 10-point scale to assess long term-disability at 6 months and 1 year. Two sample t tests

In [211]:
from trialstreamer import minimap

In [440]:
minimap.minimap('Virtual Reality')

[{'mesh_term': 'Virtual Reality',
  'mesh_ui': 'D000076142',
  'cui': 'C0871582',
  'start_idx': 0,
  'end_idx': 2,
  'source_text': 'virtual reality'}]

In [459]:
drugs_from_class['Calcium Channel Blockers']

[{'mesh_ui': 'C038806', 'mesh_term': '1,4-dihydropyridine'},
 {'mesh_ui': 'C063159',
  'mesh_term': '1-(2-(3-(4-methoxyphenyl)propoxy)-4-methoxyphenylethyl)-1H-imidazole'},
 {'mesh_ui': 'C011786',
  'mesh_term': '2-(4-(dimethylamino)styryl)-1-methylpyridinium'},
 {'mesh_ui': 'C006014',
  'mesh_term': '8-(N,N-diethylamino)octyl-3,4,5-trimethoxybenzoate'},
 {'mesh_ui': 'C064970', 'mesh_term': 'AE0047'},
 {'mesh_ui': 'C119283', 'mesh_term': 'AH 1058'},
 {'mesh_ui': 'D017311', 'mesh_term': 'Amlodipine'},
 {'mesh_ui': 'D000068558',
  'mesh_term': 'Amlodipine Besylate, Olmesartan Medoxomil Drug Combination'},
 {'mesh_ui': 'C487936',
  'mesh_term': 'amlodipine, atorvastatin drug combination'},
 {'mesh_ui': 'D000676', 'mesh_term': 'Amrinone'},
 {'mesh_ui': 'C078814', 'mesh_term': 'anandamide'},
 {'mesh_ui': 'C045313', 'mesh_term': 'anipamil'},
 {'mesh_ui': 'C086123', 'mesh_term': 'azimilide'},
 {'mesh_ui': 'D001537', 'mesh_term': 'Bencyclane'},
 {'mesh_ui': 'C061004', 'mesh_term': 'benidipine'

In [76]:
pubmed_ti(p='Carcinoma, Squamous Cell')

ProgrammingError: syntax error at or near "s"
LINE 1: ...re ( pp.population_mesh @> '[{"mesh_term": "Bowen's Disease"...
                                                             ^


In [42]:
from psycopg2.extensions import adapt


In [45]:
print(adapt("Bowen's disease"))

'Bowen''s disease'


In [57]:
type(json.dumps("Bowen\'s disease"))

str