##### Front matter - get pathway pacids + full transcriptNames

In [None]:
import os
import sys
import clustergrammer
import json
import pandas
from clustergrammer_widget import *
import urllib
import requests
from urlparse import urljoin
from clustergrammer import Network
from collections import defaultdict

# Fetch the protein ids (pacids) annotated to one pathway in one proteome, then
# map each pacid to its full transcript name so later cells can label rows.
api_url_base = "https://njp-spin.jgi.doe.gov/api/db/"
pathway_url = api_url_base + "pathway/"
pathway_id = "PWY-6608"
params = {"proteome" : "297"}
r = requests.get(pathway_url + pathway_id, params=params)
# Fail here on an HTTP error instead of on a confusing JSON/KeyError below.
r.raise_for_status()
pathway_json = r.json()
# Union of the protein ids listed under every annotation of the pathway.
pacids = set()
for annot in pathway_json["annotations"]:
    pacids.update(annot["proteins"])
# Sorted so the request URL is identical from run to run (set order is not).
feature_url = urljoin(api_url_base, "sequence/transcript/" + ','.join(sorted(pacids)))
r = requests.get(feature_url)
r.raise_for_status()
trans_json = r.json()
# Keyed by each entry's "uniquename" (later cells look it up with pacids),
# valued by the entry's "name".
pacid_to_tname = {}
for entry in trans_json:
    pacid_to_tname[entry["uniquename"]] = entry["name"]

## Clustered expression example

In [36]:
# Fetch expression scores for the pathway genes and render them as a clustered
# heatmap: one row per transcript name, one column per condition.
ex_base = api_url_base + "expression/gene/"
ex_url = ex_base + urllib.quote(','.join(pacids))

score_dict = defaultdict(dict)
condition_set = set()
exh = urllib.urlopen(ex_url)
try:
    for line in exh:
        fields = line.strip().split()
        if not fields:
            continue  # tolerate blank lines in the response
        pacid, tname, condition, score = fields
        # api returns truncated tname, map for now
        score_dict[pacid_to_tname[pacid]][condition] = float(score)
        condition_set.add(condition)
finally:
    # Release the connection even when a line fails to parse.
    exh.close()
scs = sorted(condition_set)
df = pandas.DataFrame(0.0, columns = scs, index = sorted([str(pacid_to_tname[x]) for x in pacids]))
# Write only the scores that were actually returned; a (gene, condition) pair
# with no score keeps the 0.0 fill instead of raising KeyError. df.at is used
# because chained assignment (df[col][row] = v) may write to a temporary copy.
for tname, scores in score_dict.items():
    for condition, value in scores.items():
        df.at[str(tname), condition] = value
net = Network(clustergrammer_widget)
net.load_df(df)
net.cluster()
net.widget()

## Coexpression example

In [15]:
# Fetch pairwise coexpression scores for the pathway genes and render them as
# a clustered, symmetric gene-by-gene heatmap.
# score_dict / condition_set are not used in this cell; they are reset here as
# in the original because they are notebook-global names.
score_dict = defaultdict(dict)
condition_set = set()
# str() so the frame labels have the same type as the lookups below.
stnames = sorted([str(pacid_to_tname[x]) for x in pacids])
df = pandas.DataFrame(0.0, columns=stnames, index=stnames)
coex_base = api_url_base + "coexpression/gene/"
coex_url = coex_base + urllib.quote(','.join(pacids))
coexh = urllib.urlopen(coex_url)
try:
    for line in coexh:
        fields = line.strip().split()
        if not fields:
            continue  # tolerate blank lines in the response
        pacid1, pacid2, score = fields
        tname1 = str(pacid_to_tname[pacid1])
        tname2 = str(pacid_to_tname[pacid2])
        value = float(score)
        # Fill both halves of the matrix. df.at is used because chained
        # assignment (df[col][row] = v) may write to a temporary copy.
        df.at[tname2, tname1] = value
        df.at[tname1, tname2] = value
finally:
    # Release the connection even when a line fails to parse.
    coexh.close()
rows = df.index.tolist()
net = Network(clustergrammer_widget)
net.load_df(df)
net.cluster()
net.widget()

## Clustered expression w/ homology filter

In [29]:
# Clustered expression heatmap restricted to the seed gene plus those pathway
# genes whose homology hit against the seed has an e-value below max_evalue.
max_evalue = 1e-100
seed_gene = "32056480"
homology_base = api_url_base + "homologs/gene/identifier/"
filter_params = params # grab proteomeId above
homology_url = homology_base + seed_gene
r = requests.get(homology_url, params=filter_params)
# Fail here on an HTTP error instead of on a confusing JSON/KeyError below.
r.raise_for_status()
homolog_json = r.json()
# The seed is always kept; hit identifiers are prefixed with "PAC:" before
# being compared against the pathway's pacids.
included_pacids = set(["PAC:" + seed_gene])
for entry in homolog_json:
    prepend_pac = "PAC:" + entry["hitIdentifier"]
    if float(entry["evalue"]) < max_evalue and prepend_pac in pacids:
        included_pacids.add(prepend_pac)
ex_url = ex_base + urllib.quote(','.join(included_pacids))
score_dict = defaultdict(dict)
condition_set = set()
exh = urllib.urlopen(ex_url)
try:
    for line in exh:
        fields = line.strip().split()
        if not fields:
            continue  # tolerate blank lines in the response
        pacid, tname, condition, score = fields
        # api returns truncated tname, map for now
        score_dict[pacid_to_tname[pacid]][condition] = float(score)
        condition_set.add(condition)
finally:
    # Release the connection even when a line fails to parse.
    exh.close()
scs = sorted(condition_set)
# Rows are limited to the genes that passed the filter; indexing by every
# pathway gene would leave the filtered-out genes in the heatmap as all-zero
# rows. Ids without a known transcript name are left out of the index.
df = pandas.DataFrame(
    0.0,
    columns = scs,
    index = sorted([str(pacid_to_tname[x]) for x in included_pacids if x in pacid_to_tname]),
)
# Write only the scores that were actually returned; a (gene, condition) pair
# with no score keeps the 0.0 fill instead of raising KeyError. df.at is used
# because chained assignment (df[col][row] = v) may write to a temporary copy.
for tname, scores in score_dict.items():
    for condition, value in scores.items():
        df.at[str(tname), condition] = value
net = Network(clustergrammer_widget)
net.load_df(df)
net.cluster()
net.widget()