In [2]:
from flask import Flask, render_template, request, redirect
from bokeh.embed import components
import PubDatePlotting as pdp
#import StateGraph as sg

# output_notebook()

# Flask application object for this module.
app = Flask(__name__)

# Ad-hoc dict attached to the app object; index() stores the submitted cancer type in it.
app.vars = {}


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the cancer-type selection form (GET) or the plots for the chosen type (POST).

    GET renders ``select_cancer.html``. POST reads the ``cancertype`` form field,
    builds four plots (two from ``pdp.pubByDate`` and two from ``stateGraph``),
    embeds them with ``bokeh.embed.components`` and renders ``view_cancer.html``.

    Returns:
        The rendered template for the response.
    """
    if request.method == 'GET':
        return render_template('select_cancer.html')

    # Work from a request-local value. app.vars is one dict shared by every
    # request, so reading the selection back from it could pick up a value
    # written by a different, concurrent request.
    cancertype = request.form['cancertype']
    app.vars['cancertype'] = cancertype  # still recorded for any code that reads it

    p1 = pdp.pubByDate(cancertype, False)
    p2 = pdp.pubByDate(cancertype, True)
    # NOTE(review): stateGraph is not imported in this cell (the StateGraph import
    # above is commented out) - confirm it is defined before a POST is served.
    p3, p4 = stateGraph(cancertype)
    plots = {'p1': p1, 'p2': p2, 'p3': p3, 'p4': p4}
    script, div = components(plots)

    return render_template('view_cancer.html', script=script, div=div, ctype=cancertype)


# Start the Flask server on port 33507 when this code runs as the main module.
if __name__ == "__main__":
    app.run(port=33507)

 * Running on http://127.0.0.1:33507/ (Press CTRL+C to quit)
127.0.0.1 - - [11/May/2017 15:01:06] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [11/May/2017 15:01:22] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [11/May/2017 15:02:12] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [11/May/2017 15:02:25] "POST / HTTP/1.1" 200 -


In [5]:
# PubDatePlotting
# reading in data from pickles and plotting data
from bokeh.plotting import figure, show
from bokeh.embed import components
import pickle
from collections import Counter

# Directory and base file name of the pickled per-year count files read by loadCounts().
dataDir = "./static/"
countFile = "CancerPubsWordCounts"

# Years to load and plot, newest first: 2016 down to 1998.
years = list(range(2016, 1997, -1))


# for loading counts files from previous analysis, later convert to searching db for counts
def loadCounts(cf):
    """Load the pickled counts object for every year in ``years``.

    Each file is read from ``dataDir + cf + str(year) + ".pkl"``.

    Args:
        cf: base file name shared by the per-year pickle files.

    Returns:
        A list with one unpickled object per year, in the same order as ``years``.

    Raises:
        OSError: if a year's file cannot be opened.
    """
    # need to normalize to total pubs at some point, maybe normalize by total number of cancer pubs
    lCounts = []
    for year in years:
        # The with-block closes each file as soon as it has been read.
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(dataDir + cf + str(year) + ".pkl", "rb") as count_file:
            lCounts.append(pickle.load(count_file))
    return lCounts


def pubByDate(ctype, norm):
    """Plot publications per year for one cancer type as a red line.

    Args:
        ctype: cancer type; its lower-cased form is the key looked up in each
            year's counts, and the original string is used in the title.
        norm: if truthy, plot each year's count divided by the sum of all counts
            for that year; otherwise plot the raw count.

    Returns:
        A bokeh figure with ``years`` on the x axis.
    """
    lCounts = loadCounts(countFile)
    key = ctype.lower()

    # Title and axis label depend only on `norm`, so set them once up front
    # rather than on every loop iteration (which left them unbound with no data).
    if norm:
        ftit = ctype + " cancer pubs / total cancer pubs"
        ylab = "Publications (normalized)"
        ys = []
        for counts in lCounts:
            total = sum(counts.values())
            # A year with no counts at all plots as 0 instead of dividing by zero.
            ys.append(counts[key] / total if total else 0.0)
    else:
        ftit = ctype + " cancer publications"
        ylab = "Publications"
        ys = [counts[key] for counts in lCounts]

    p = figure(title=ftit, plot_width=400, plot_height=400, x_axis_label="Year", y_axis_label=ylab)

    # to add legend use: legend=ctype
    p.line(years, ys, color='red')
    return p


In [1]:
#########ASYNCIO version of StateGraph

# StateGraph -  used to pull data from pubmed through an api
from aiohttp import ClientSession
import asyncio
#import concurrent.futures
#import requests

import json
from bokeh.sampledata import us_states
from bokeh.plotting import figure, show, output_notebook
import pickle

# Funding pickle read by stateGraph(): directory and file name.
dataDir = "./static/"
moneyFile = "FundingPerState2016.pkl"

# Tag appended to each state name in the search term (affiliation = AD).
searchField = "[AD]"

# Copy of bokeh's state table without Hawaii and Alaska.
us_states = {code: info for code, info in us_states.data.items() if code not in ("HI", "AK")}

# Outline coordinates, in the same order as the states in us_states.
state_xs = [info["lons"] for info in us_states.values()]
state_ys = [info["lats"] for info in us_states.values()]

async def fetch(url, session):
    """GET ``url`` through ``session`` and return the response body as text."""
    async with session.get(url) as response:
        body = await response.text()
    return body

###add states correlating with responses############
async def run(states, ss):
    """Query PubMed esearch once per state and return the raw response bodies.

    Each request searches for the state name tagged with ``searchField`` AND ``ss``
    between 2012/01/01 and 2016/12/31.

    Args:
        states: iterable of state names; one request is issued per name.
        ss: search-string fragment, already URL-ready (e.g. ``"lung+cancer"``).

    Returns:
        The response bodies as strings, in the same order as ``states``.
    """
    # Local import: this cell does not import urllib.
    from urllib.parse import quote

    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term="
    window = "&mindate=2012/01/01&maxdate=2016/12/31&usehistory=y&retmode=json"

    # Fetch all responses within one Client session,
    # keep connection alive for all requests.
    async with ClientSession() as session:
        tasks = []
        for state in states:
            # The state must be part of the term; without it every request was
            # identical and all states received the same count. Only the state
            # part is escaped (names contain spaces, the tag contains brackets);
            # ss is passed through because it already uses '+' separators.
            tu = base + quote(state + searchField) + "+AND+" + ss + window
            tasks.append(asyncio.ensure_future(fetch(tu, session)))

        # gather() returns results in the order the tasks were passed in.
        responses = await asyncio.gather(*tasks)

    return responses

# Gets the PubMed record count for every state; ss is the search string.
def getStates(ss):
    """Run the async PubMed queries and store each state's record count.

    Writes the integer ``esearchresult.count`` of each response into
    ``us_states[code]["count"]``.
    """
    loop = asyncio.get_event_loop()
    state_names = [info["name"] for info in us_states.values()]
    pending = asyncio.ensure_future(run(state_names, ss))
    bodies = loop.run_until_complete(pending)

    # Responses come back in the order the names were submitted, i.e. the
    # iteration order of us_states.
    for code, body in zip(us_states, bodies):
        parsed = json.loads(body)
        us_states[code]["count"] = int(parsed["esearchresult"]['count'])
        

def stateGraph(ss):
    """Draw two maps of the states in ``us_states`` shaded by publication count.

    Calls ``getStates`` to refresh every state's ``count``; the first figure
    shades each state by its count relative to the largest count, the second by
    its count divided by the state's value in the funding pickle, again relative
    to the largest such ratio.

    Args:
        ss: cancer type; used in the figure titles and in the search string.

    Returns:
        (p, p2): the raw-count figure and the funding-normalized figure.

    Raises:
        KeyError: if a state has no entry in the funding file.
    """
    def _scale_to_peak(values):
        # Scale to 0..1 for use as fill alpha. When nothing was found (peak of 0)
        # every state stays transparent instead of raising ZeroDivisionError.
        peak = max(values)
        if not peak:
            return [0.0] * len(values)
        return [value / peak for value in values]

    figures = []
    for title in (ss + " Cancer Publications",
                  ss + " Cancer Publications (Normalized by funding)"):
        fig = figure(title=title, toolbar_location="left", plot_width=800, plot_height=510)
        # Hide axes and grid lines on the maps.
        fig.xaxis.visible = False
        fig.xgrid.visible = False
        fig.yaxis.visible = False
        fig.ygrid.visible = False
        figures.append(fig)
    p, p2 = figures

    ##FOR SEARCHING YOU WILL NEED TO ESCAPE SPACES AND SPECIAL CHARS
    getStates(ss+"+cancer")

    # unnormalized to money version
    raw_counts = [us_states[code]["count"] for code in us_states]
    state_counts = _scale_to_peak(raw_counts)

    # normalized to money
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(dataDir + moneyFile, "rb") as funding_file:
        fbs = pickle.load(funding_file)
    # Match state names case-insensitively: the funding keys may be capitalised
    # differently from the bokeh names ("District Of Columbia" vs "District of Columbia").
    funding_by_name = {name.lower(): amount for name, amount in fbs.items()}
    per_funding = [count / funding_by_name[us_states[code]["name"].lower()]
                   for code, count in zip(us_states, raw_counts)]
    state_counts_norm = _scale_to_peak(per_funding)

    p.patches(state_xs, state_ys, fill_color="#377BA8", fill_alpha=state_counts,
              line_color="#884444", line_width=1.5)

    p2.patches(state_xs, state_ys, fill_color="#377BA8", fill_alpha=state_counts_norm,
               line_color="#884444", line_width=1.5)

    return p, p2

#p, p2 = stateGraph("prostate")
#show(p)
#show(p2)
# stateGraph("lung")
# stateGraph("breast")



In [13]:
#######URL Search Parsing

from urllib.parse import quote

def urlFix(s):
    """Return ``s`` percent-encoded for use inside a URL.

    Uses the ``quote`` name imported by this cell; the previous
    ``urllib.parse.quote`` spelling needed a separate ``import urllib.parse``
    and raised NameError without it.
    """
    return quote(s)

#print(urlFix('breast AND cancer[Affiliation]'))

breast%20AND%20cancer%5BAffiliation%5D


In [10]:
# StateGraph -  used to pull data from pubmed through an api
import requests
import json
from bokeh.sampledata import us_states
from bokeh.plotting import figure, show, output_notebook
import pickle

# Where stateGraph() finds the funding pickle.
dataDir = "./static/"
moneyFile = "FundingPerState2016.pkl"

# affiliation = AD; appended to the state name in the search term
searchField = "[AD]"

# Shallow copy of bokeh's state table with Hawaii and Alaska left out.
us_states = {
    code: info
    for code, info in us_states.data.items()
    if code != "HI" and code != "AK"
}

# Per-state outline coordinates, ordered like us_states.
state_xs = [info["lons"] for info in us_states.values()]
state_ys = [info["lats"] for info in us_states.values()]


# Gets the PubMed record count for every state; ss is the search string.
def getStates(ss):
    """Store, for every state, the record count getState() returns for ``ss``.

    The result is written to the state's ``"count"`` entry in ``us_states``.
    """
    for info in us_states.values():
        info["count"] = getState(ss, info["name"])


def getState(ss, state):
    """Return the PubMed record count for one state and search string.

    The esearch term is ``state + searchField`` AND ``ss`` AND ``cancer``,
    between 2012/01/01 and 2016/12/31.

    Args:
        ss: search string (cancer type).
        state: state name placed in front of the ``searchField`` tag.

    Returns:
        The ``esearchresult.count`` value of the response as an int.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + state + searchField + "+AND+" + ss + "+AND+cancer&mindate=2012/01/01&maxdate=2016/12/31&usehistory=y&retmode=json"
    search_r = requests.post(search_url)
    # Report an HTTP failure as such instead of as a confusing JSON/KeyError below.
    search_r.raise_for_status()
    search_data = search_r.json()
    # Only the count is needed; the unused 'webenv' lookup was dropped.
    return int(search_data["esearchresult"]['count'])


def stateGraph(ss):
    """Draw two maps of the states in ``us_states`` shaded by publication count.

    Calls ``getStates`` to refresh every state's ``count``; the first figure
    shades each state by its count relative to the largest count, the second by
    its count divided by the state's value in the funding pickle, again relative
    to the largest such ratio.

    Args:
        ss: cancer type; used in the figure titles and passed to ``getStates``.

    Returns:
        (p, p2): the raw-count figure and the funding-normalized figure.

    Raises:
        KeyError: if a state has no entry in the funding file.
    """
    def _scale_to_peak(values):
        # Scale to 0..1 for use as fill alpha. When nothing was found (peak of 0)
        # every state stays transparent instead of raising ZeroDivisionError.
        peak = max(values)
        if not peak:
            return [0.0] * len(values)
        return [value / peak for value in values]

    figures = []
    for title in (ss + " Cancer Publications",
                  ss + " Cancer Publications (Normalized by NIH funding)"):
        fig = figure(title=title, toolbar_location="left", plot_width=800, plot_height=510)
        # Hide axes and grid lines on the maps.
        fig.xaxis.visible = False
        fig.xgrid.visible = False
        fig.yaxis.visible = False
        fig.ygrid.visible = False
        figures.append(fig)
    p, p2 = figures

    ##FOR SEARCHING YOU WILL NEED TO ESCAPE SPACES AND SPECIAL CHARS
    getStates(ss)

    # unnormalized to money version
    raw_counts = [us_states[code]["count"] for code in us_states]
    state_counts = _scale_to_peak(raw_counts)

    # normalized to money
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(dataDir + moneyFile, "rb") as funding_file:
        fbs = pickle.load(funding_file)
    # Match state names case-insensitively: the funding keys may be capitalised
    # differently from the bokeh names ("District Of Columbia" vs "District of Columbia").
    funding_by_name = {name.lower(): amount for name, amount in fbs.items()}
    per_funding = [count / funding_by_name[us_states[code]["name"].lower()]
                   for code, count in zip(us_states, raw_counts)]
    state_counts_norm = _scale_to_peak(per_funding)

    p.patches(state_xs, state_ys, fill_color="#377BA8", fill_alpha=state_counts,
              line_color="#884444", line_width=1.5)

    p2.patches(state_xs, state_ys, fill_color="#377BA8", fill_alpha=state_counts_norm,
               line_color="#884444", line_width=1.5)

    return p, p2

# stateGraph("prostate")
# stateGraph("lung")
# stateGraph("breast")

In [None]:
######### Queue/thread version: the queue itself worked, but it did not work with PubMed, which couldn't handle the parallel requests

# StateGraph -  used to pull data from pubmed through an api
import requests
import json
from bokeh.sampledata import us_states
from bokeh.plotting import figure, show, output_notebook
import pickle
#from urlparse import urlparse
from threading import Thread
#import httplib, sys
from queue import Queue

# Funding pickle used by stateGraph().
dataDir = "./static/"
moneyFile = "FundingPerState2016.pkl"

# affiliation = AD
searchField = "[AD]"

# Work on a copy of bokeh's state table that omits Hawaii and Alaska.
_excluded_states = ("HI", "AK")
us_states = {code: info for code, info in us_states.data.items() if code not in _excluded_states}

# Outline longitudes/latitudes in us_states order.
state_xs = [info["lons"] for info in us_states.values()]
state_ys = [info["lats"] for info in us_states.values()]


# Gets the PubMed record count for every state; ss is the search string.
def getStates(ss):
    """Fetch every state's PubMed record count using one worker thread per state.

    Each worker runs ``getState(state, q)``, which takes its URL from ``q`` and
    stores the result in ``us_states[state]["count"]``. The function returns only
    after every worker has finished.

    Args:
        ss: search string (cancer type) combined with each state in the query.
    """
    workers = []
    for state in us_states:
        # Search by state name, as the other versions of this function do;
        # `state` itself is only the two-letter key.
        surl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + us_states[state]["name"] + searchField + "+AND+" + ss + "+AND+cancer&mindate=2012/01/01&maxdate=2016/12/31&usehistory=y&retmode=json"
        # One private single-slot queue per worker: with a shared queue a worker
        # could take a URL meant for another state and store the count under
        # the wrong key.
        q = Queue(1)
        q.put(surl)
        t = Thread(target=getState, args=[state, q])
        t.daemon = True
        t.start()
        workers.append(t)

    # Wait for all workers; previously the caller read the counts while the
    # requests were still in flight.
    # NOTE(review): this still sends all requests at once - check the service's
    # request-rate limits.
    for t in workers:
        t.join()
        
        

#worker function
def getState(state,q):
    """Worker: take one URL from ``q``, query it and store the state's record count.

    Args:
        state: key in ``us_states`` whose ``"count"`` entry is written.
        q: queue holding the URL to request.
    """
    url = q.get()
    try:
        search_r = requests.post(url)
        search_data = search_r.json()
        # Only the count is needed; the unused 'webenv' lookup was dropped.
        us_states[state]["count"] = int(search_data["esearchresult"]['count'])
    finally:
        # Acknowledge the item even when the request fails, so a q.join()
        # on this queue cannot block forever.
        q.task_done()
#     status, url = getStatus(url)
#     doSomethingWithResult(status, url)
#     q.task_done()


def stateGraph(ss):
    """Draw two maps of the states in ``us_states`` shaded by publication count.

    Calls ``getStates`` to refresh every state's ``count``; the first figure
    shades each state by its count relative to the largest count, the second by
    its count divided by the state's value in the funding pickle, again relative
    to the largest such ratio.

    Args:
        ss: cancer type; used in the figure titles and passed to ``getStates``.

    Returns:
        (p, p2): the raw-count figure and the funding-normalized figure.

    Raises:
        KeyError: if a state has no entry in the funding file.
    """
    def _scale_to_peak(values):
        # Scale to 0..1 for use as fill alpha. When nothing was found (peak of 0)
        # every state stays transparent instead of raising ZeroDivisionError.
        peak = max(values)
        if not peak:
            return [0.0] * len(values)
        return [value / peak for value in values]

    figures = []
    for title in (ss + " Cancer Publications",
                  ss + " Cancer Publications (Normalized by funding)"):
        fig = figure(title=title, toolbar_location="left", plot_width=800, plot_height=510)
        # Hide axes and grid lines on the maps.
        fig.xaxis.visible = False
        fig.xgrid.visible = False
        fig.yaxis.visible = False
        fig.ygrid.visible = False
        figures.append(fig)
    p, p2 = figures

    ##FOR SEARCHING YOU WILL NEED TO ESCAPE SPACES AND SPECIAL CHARS
    getStates(ss)

    # unnormalized to money version
    raw_counts = [us_states[code]["count"] for code in us_states]
    state_counts = _scale_to_peak(raw_counts)

    # normalized to money
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(dataDir + moneyFile, "rb") as funding_file:
        fbs = pickle.load(funding_file)
    # Match state names case-insensitively: the funding keys may be capitalised
    # differently from the bokeh names ("District Of Columbia" vs "District of Columbia").
    funding_by_name = {name.lower(): amount for name, amount in fbs.items()}
    per_funding = [count / funding_by_name[us_states[code]["name"].lower()]
                   for code, count in zip(us_states, raw_counts)]
    state_counts_norm = _scale_to_peak(per_funding)

    p.patches(state_xs, state_ys, fill_color="#377BA8", fill_alpha=state_counts,
              line_color="#884444", line_width=1.5)

    p2.patches(state_xs, state_ys, fill_color="#377BA8", fill_alpha=state_counts_norm,
               line_color="#884444", line_width=1.5)

    return p, p2

# stateGraph("prostate")
# stateGraph("lung")
# stateGraph("breast")

In [None]:
### Example (written for Python 2) on which the queue/thread version above was based

from urlparse import urlparse
from threading import Thread
import httplib, sys
from Queue import Queue

# Number of worker threads started below; the work queue holds twice this many items.
concurrent = 200

def doWork():
    """Worker loop: take URLs from ``q`` forever, check each one and report the result."""
    while True:
        url = q.get()
        try:
            status, url = getStatus(url)
            doSomethingWithResult(status, url)
        finally:
            # Always acknowledge the item; if handling raised and task_done()
            # were skipped, q.join() in the main thread would never return.
            q.task_done()

def getStatus(ourl):
    """Send a HEAD request for ``ourl`` and report the HTTP status.

    Args:
        ourl: URL to check.

    Returns:
        (status, ourl): the response status code, or the string ``"error"`` when
        the URL cannot be parsed or the request fails.
    """
    # Python 3 homes of the modules this example refers to by their Python 2
    # names (urlparse, httplib); imported locally so the function is self-contained.
    from http.client import HTTPConnection
    from urllib.parse import urlparse

    conn = None
    try:
        url = urlparse(ourl)
        conn = HTTPConnection(url.netloc)
        conn.request("HEAD", url.path)
        res = conn.getresponse()
        return res.status, ourl
    except Exception:
        # Best effort: any failure is reported as "error". Unlike a bare except,
        # this lets KeyboardInterrupt and SystemExit through.
        return "error", ourl
    finally:
        if conn is not None:
            conn.close()

def doSomethingWithResult(status, url):
    """Print the status and URL of one checked address, separated by a space."""
    # print() call form: the Python 2 statement form is a SyntaxError in Python 3.
    print(status, url)

# Bounded work queue shared with the doWork() threads.
q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    # Daemon threads do not keep the process alive once the main thread exits.
    t.daemon = True
    t.start()
try:
    # The with-block closes the URL list even if queuing is interrupted.
    with open('urllist.txt') as url_file:
        for url in url_file:
            q.put(url.strip())
    # Block until every queued URL has been marked done by a worker.
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

In [112]:
# Compare the bokeh name for DC with the keys of the funding table.
print(us_states["DC"]["name"])
# The with-block closes the file after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(dataDir+moneyFile, "rb") as funding_file:
    fbs = pickle.load(funding_file)
print(fbs)

District of Columbia
{'Alabama': 294964217, 'Arizona': 163447535, 'Arkansas': 96652655, 'California': 3686026589, 'Colorado': 349974172, 'Connecticut': 510609681, 'Delaware': 45371848, 'District Of Columbia': 214175791, 'Florida': 531720813, 'Georgia': 520595434, 'Idaho': 14139675, 'Illinois': 818027921, 'Indiana': 225125822, 'Iowa': 170060863, 'Kansas': 91306154, 'Kentucky': 163613208, 'Louisiana': 141817492, 'Maine': 75619398, 'Maryland': 1465624060, 'Massachusetts': 2572549176, 'Michigan': 669562129, 'Minnesota': 520225717, 'Mississippi': 53538828, 'Missouri': 508984218, 'Montana': 37310308, 'Nebraska': 107024633, 'Nevada': 31316007, 'New Hampshire': 98857889, 'New Jersey': 240135510, 'New Mexico': 99743752, 'New York': 2205949608, 'North Carolina': 1154347750, 'North Dakota': 22471682, 'Ohio': 734159508, 'Oklahoma': 90675755, 'Oregon': 274614404, 'Pennsylvania': 1570151520, 'Rhode Island': 150833713, 'South Carolina': 179069761, 'South Dakota': 21563049, 'Tennessee': 512414823, 'Te

In [110]:
# List every state key followed by its name, one per line.
for code, info in us_states.items():
    print(code)
    print(info["name"])

NV
Nevada
AZ
Arizona
WI
Wisconsin
GA
Georgia
KS
Kansas
CT
Connecticut
IN
Indiana
ME
Maine
MA
Massachusetts
MT
Montana
MD
Maryland
AR
Arkansas
AL
Alabama
VA
Virginia
NE
Nebraska
KY
Kentucky
NY
New York
CO
Colorado
VT
Vermont
SD
South Dakota
MI
Michigan
MO
Missouri
NC
North Carolina
RI
Rhode Island
ID
Idaho
DE
Delaware
DC
District of Columbia
NH
New Hampshire
MN
Minnesota
ND
North Dakota
OK
Oklahoma
IA
Iowa
TN
Tennessee
FL
Florida
LA
Louisiana
NM
New Mexico
WY
Wyoming
PA
Pennsylvania
SC
South Carolina
UT
Utah
WV
West Virginia
WA
Washington
MS
Mississippi
OR
Oregon
IL
Illinois
NJ
New Jersey
CA
California
OH
Ohio
TX
Texas


In [7]:
##read in cancer data for list of cancers, monthly

import pandas as pd
from pandas import Series

#df = pd.read_csv("FundingPerState.csv",header=0)
#dic = Series(df.FUNDING.values,index=df.LOCATION).to_dict()

# Cancer types to query (one column each) and the years to cover (one row each).
cancersforout = ['Bladder','Breast','Colorectal','Liver','Lung','Pancreatic','Prostate','Skin','Stomach']
#cancersforout = ['Bladder','Breast']
years = range(1975,2015)
#years = [2016]

# Empty table that the collection loop fills in column by column.
cPd = pd.DataFrame(index=years, columns=cancersforout)

# searches[i] holds one query fragment per year for cancersforout[i].
searches = [
    [
        cancer.lower() + "+AND+cancer&mindate="+str(year)+"/01/01&maxdate="+str(year)+"/12/31"
        for year in years
    ]
    for cancer in cancersforout
]

async def fetch2(url, session):
    """GET ``url`` through ``session`` and return the response body as text."""
    async with session.get(url) as response:
        body = await response.text()
    return body
    
###add states correlating with responses############
async def run2(ss):
    """Request one PubMed esearch result per entry of ``ss`` and return the bodies.

    Args:
        ss: iterable of query fragments appended after ``term=``.

    Returns:
        The response bodies as strings, in the order of ``ss``.
    """
    # One shared session keeps the connection alive across all requests.
    async with ClientSession() as session:
        pending = [
            asyncio.ensure_future(
                fetch2("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term="+s+"&usehistory=y&retmode=json", session)
            )
            for s in ss
        ]
        # Bodies arrive as strings; callers decode them with json.loads.
        responses = await asyncio.gather(*pending)
    return responses

def getRecordSet(ss):
    """Run ``run2(ss)`` to completion on the event loop and return its result.

    Args:
        ss: iterable of query fragments, passed straight to ``run2``.

    Returns:
        The list of response bodies produced by ``run2``.
    """
    # The unused pubCounts list was removed.
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run2(ss))
    return loop.run_until_complete(future)
    
# Fill one column of cPd per cancer type with the yearly record counts.
for i, search in enumerate(searches):
    searchset = getRecordSet(search)
    # Each response body is JSON; keep only the integer record count.
    search_data = [int(json.loads(s)["esearchresult"]['count']) for s in searchset]
    cPd[cancersforout[i]] = search_data
    print(cancersforout[i])

print(cPd.head())
# The with-block closes the file even if pickling fails part-way.
with open("cancerPublicationsPanda.pkl", 'wb') as output:
    pickle.dump(cPd, output)

Bladder
Breast
Colorectal
Liver
Lung
Pancreatic
Prostate
Skin
Stomach
      Bladder  Breast  Colorectal  Liver  Lung  Pancreatic  Prostate  Skin  \
1975      571    1648          75   2157  1779         714       222  1536   
1976      565    1610          84   2092  1775         703       233  1562   
1977      614    1955          97   2054  1868         740       285  1694   
1978      639    1875         170   2122  1950         784       252  1749   
1979      692    1929         190   2333  2267         893       323  1920   

      Stomach  
1975      960  
1976      943  
1977      969  
1978      991  
1979     1005  


In [8]:
# Print the whole table: one row per year, one column per cancer type.
print(cPd)

      Bladder  Breast  Colorectal  Liver   Lung  Pancreatic  Prostate  Skin  \
1975      571    1648          75   2157   1779         714       222  1536   
1976      565    1610          84   2092   1775         703       233  1562   
1977      614    1955          97   2054   1868         740       285  1694   
1978      639    1875         170   2122   1950         784       252  1749   
1979      692    1929         190   2333   2267         893       323  1920   
1980      711    2155         211   2424   2358         875       380  1944   
1981      765    2136         282   2402   2379         887       420  1950   
1982      818    2239         308   2514   2756         925       389  2122   
1983      996    2395         375   2991   2956        1000       440  2299   
1984     1149    2790         406   3209   3323        1063       515  2467   
1985     1169    2871         510   3230   3510        1146       586  2509   
1986     1125    2883         636   3340   3479     

In [1]:
import asyncio
from aiohttp import ClientSession

async def fetch(url, session):
    """GET ``url`` through ``session`` and return what ``response.read()`` yields."""
    async with session.get(url) as response:
        payload = await response.read()
    return payload

async def run(r):
    """Fetch ``http://localhost:8080/0`` .. ``/r-1`` concurrently and print the bodies.

    Args:
        r: number of requests to issue.
    """
    url = "http://localhost:8080/{}"

    # All requests share one session so the connection is kept alive.
    async with ClientSession() as session:
        tasks = [asyncio.ensure_future(fetch(url.format(i), session)) for i in range(r)]
        responses = await asyncio.gather(*tasks)
        # Every response body is now available in `responses`.
        print(responses)

def print_responses(result):
    """Print ``result``; not called anywhere in this cell."""
    print(result)

# Schedule run(4) and drive it to completion on the default event loop.
future = asyncio.ensure_future(run(4))
loop = asyncio.get_event_loop()
loop.run_until_complete(future)

ClientConnectorError: [Errno 61] Cannot connect to host localhost:8080 ssl:False [Can not connect to localhost:8080 [Connect call failed ('127.0.0.1', 8080)]]