In [27]:
%%time

import networkx as nx
from eden.util import read

# Load in graphs from file
def load_graphs(file):
    graphs = []
    for line in read(file):
        if not(line.startswith('#')): # skip info lines
            parts = line.split()
            pdb_id = parts[0]
            target = parts[3]
            ligand_marker = parts[7].strip('()')
            if '-' not in ligand_marker: # TODO: figure out what to do with peptides and if there are multiple ligands
                if '&' not in ligand_marker:
                    file_name = 'gml/' + pdb_id + '.gml'
                    g = nx.read_gml(file_name)
                    graphs.append(g)
    return graphs

CPU times: user 25 µs, sys: 59 µs, total: 84 µs
Wall time: 99.9 µs


In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict, deque
class MarkActive(BaseEstimator, TransformerMixin):

    """
    Missing.
    """

    def __init__(self, max_depth=10, attribute='active', root_attribute=None, root_value=None):
        self.max_depth = max_depth
        self.attribute = attribute
        self.root_attribute = root_attribute
        self.root_value = root_value
        self.key_nesting='nesting'

    def fit(self):
        return self

    def transform(self, graphs):
        """
        Todo.
        """
        try:
            for graph in graphs:
                marked_graph = self.mark_active(graph)
                yield marked_graph
        except Exception as e:
            logger.debug('Failed iteration. Reason: %s' % e)
            logger.debug('Exception', exc_info=True)

    def mark_active(self, graph):
        # mark all nodes as False
        for u in graph.nodes():
            graph.node[u][self.attribute]=False
        # mark as True all nodes that are within distance 'max_depth' from a node that has root_attribute=root_value
        for u in graph.nodes(): 
            node_dict = graph.node[u]
            if self.root_attribute in node_dict and node_dict[self.root_attribute] == self.root_value:
                self.mark_single_vertex_breadth_first_visit(graph, root=u)
        return graph
            
    def mark_single_vertex_breadth_first_visit(self, graph, root=None):
        graph.node[root][self.attribute]=True
        visited = set()  # use a set as we can end up exploring few nodes
        # q is the queue containing the frontieer to be expanded in the BFV
        q = deque()
        q.append(root)
        # the map associates to each vertex id the distance from the root
        dist = {}
        dist[root] = 0
        visited.add(root)
        while len(q) > 0:
            # extract the current vertex
            u = q.popleft()
            d = dist[u] + 1
            if d <= self.max_depth:
                # iterate over the neighbors of the current vertex
                for v in graph.neighbors(u):
                    if v not in visited:
                        # skip nesting edge-nodes
                        if graph.edge[u][v].get(self.key_nesting, False) is False:
                            dist[v] = d
                            visited.add(v)
                            graph.node[v][self.attribute]=True
                            q.append(v)

In [29]:
import numpy as np

def make_target(graphs):
    y = [float(graph.graph['target']) for graph in graphs ]
    y = np.array(y)
    return y

In [30]:
%%time

graphs = load_graphs('INDEX_refined_data.2015_temp')

CPU times: user 8min 4s, sys: 39.3 s, total: 8min 43s
Wall time: 8min 23s


In [31]:
%%time

from GArDen.interfaces import convert, transform

from eden.graph import Vectorizer
from eden.util import vectorize

#graphs = load_graphs('INDEX_refined_data.2015_temp')

#parameters_priors = dict(max_depth=max_depth, attribute='active')
#graphs = transform(graphs, program=MarkActive(), parameters_priors=parameters_priors)

#graphs = list(graphs)

# first delete nodes with weird label?

complexity = 5
X = Vectorizer(complexity).transform(graphs)

CPU times: user 1.66 s, sys: 359 ms, total: 2.02 s
Wall time: 1min 21s


In [36]:
from GArDen.interfaces import convert, transform
#from GArDen.transform.node import AddNodeAttributeValue
#from GArDen.transform import DeleteNode
#from GArDen.transform.trim_to_largest_component import TrimToLargestComponent
from eden.graph import Vectorizer
from eden.util import vectorize
from sklearn.svm import SVR
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import KFold
import time

def run_experiment(data_fname='interaction_data', 
                   #target_fname = 'PDBbind_refined07-core07.txt', 
                   graphs=graphs, 
                   max_depth=1, 
                   complexity=5,
                   C=100,
                   gamma=0.1,
                   n_iter=5):    
    #graphs = convert(data_fname, program=ConvertProteinLigand())
    #parameters_priors = dict(attribute='active', value=False)
    #graphs = transform(graphs, program=AddNodeAttributeValue(), parameters_priors=parameters_priors)
    parameters_priors = dict(max_depth=max_depth, attribute='active')
    graphs = transform(graphs, program=MarkActive(), parameters_priors=parameters_priors)
    parameters_priors = dict(attribute_value_dict=[dict(active=False, type='protein')])
    #graphs = transform(graphs, program=DeleteNode(), parameters_priors=parameters_priors)
    #graphs = transform(graphs, program=TrimToLargestComponent())
    graphs = list(graphs)
    #X = vectorize(graphs, vectorizer=Vectorizer(complexity=complexity), fit_flag=False, n_jobs=-1)
    #X = Vectorizer(complexity).transform(graphs)
    y = make_target(graphs)
    
    corr_list=[]
    for i in range(n_iter):
        predictor = SVR(C=C, gamma=gamma)
        #predicted = cross_val_predict(predictor, X, y, cv=StratifiedKFold(y, n_folds=5, shuffle=True, random_state=int(time.time())), n_jobs=-1)
        predicted = cross_val_predict(predictor, X, y, cv=KFold(n_folds=5,shuffle=True,random_state=None), n_jobs=-1)
        corr = np.corrcoef([predicted,y])[0][1]
        corr_list.append(corr)
    return np.mean(corr_list), np.std(corr_list)

In [37]:
%%time
import time
import datetime
x1 = []
x2 = []
x3 = []

graphs = load_graphs('INDEX_refined_data.2015_temp')

for max_depth in range(2):
    start = time.time()
    corr_mean, corr_std = run_experiment(data_fname='interaction_data', 
                                         graphs=graphs,
                                         max_depth=max_depth,
                                         complexity=2,
                                         C=62.52,
                                         gamma=0.279, 
                                         n_iter=20)
    
    delta_time = datetime.timedelta(seconds=(time.time() - start))
    print 'Max depth: %d   Corr coeff: %.3f +- %.3f   in: %s' % (max_depth, corr_mean, corr_std, str(delta_time))
    x1.append(corr_mean)
    x3.append(corr_std)
    x2.append(max_depth)
x1=np.array(x1)
x2=np.array(x2)
x3=np.array(x3)

TypeError: __init__() got an unexpected keyword argument 'n_splits'