In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Core python modules
import sys, os
import dill
from functools import partial
import json

# Data processing
import pandas as pd
import numpy as np
import scipy

# Networks
import networkx as nx

# Modules
sys.path.insert(0, '../scripts')
from evaluation import Data
from samplers import *
from gsd import GSD
from benchmarks import *

In [3]:
# Plotting
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

# 1. Import parameters

In [4]:
# Location of the JSON parameter grid; experiment_dir is the directory that contains it.
parameter_path = "../parameter_testing/parameters.json"
experiment_dir = os.path.dirname(parameter_path)

# Read the grid as UTF-8 explicitly instead of relying on the platform's
# locale-dependent default text encoding.
with open(parameter_path, encoding="utf-8") as f:
    param_grid = json.load(f)

In [5]:
from pprint import pprint

# Pretty-print the loaded grid so its nested sections are readable.
pprint(param_grid)

{'data': {'n_samples': [20, 100],
          'n_sources': [10],
          'noise': [0, 0.5, 1.0],
          'rep': [1, 2]},
 'models': {'gsd': {'a': [0.1],
                    'graph': ['../data/interactomes/STRING_v10.5.experimental.costs_adj_g4.gpickle'],
                    'initializer': ['ica'],
                    'n_components': [10]},
            'ica1': {'cutoff': [0.01], 'n_components': [10]}},
 'sampler': {'graph': ['../data/interactomes/STRING_v10.5.experimental.costs_adj_g4.gpickle'],
             'size': [500]}}


# 2. Generate data

In [6]:
from sklearn.model_selection import ParameterGrid

In [7]:
# One record per sampler configuration: the sampler settings, the data grid,
# and the Data object built from them.
datasets = []

for sampler_param in ParameterGrid(param_grid['sampler']):
    # Pre-bind this grid point's keyword arguments onto the random-walk sampler.
    sampler = partial(sample_random_walk_fixed_size, **sampler_param)
    data = Data(sampler, param_grid['data'])

    datasets.append({
        **sampler_param,
        **param_grid['data'],
        'data_object': data,
    })

03:13:32 - Data: INFO - 12 parameter sets loaded.


In [8]:
# Display the generated records as the cell output.
datasets

[{'graph': '../data/interactomes/STRING_v10.5.experimental.costs_adj_g4.gpickle',
  'size': 500,
  'n_samples': [20, 100],
  'n_sources': [10],
  'noise': [0, 0.5, 1.0],
  'rep': [1, 2],
  'data_object': <evaluation.Data at 0x7fb27af08390>}]

In [9]:
# Serialize the dataset records into the experiment directory.
datasets_path = os.path.join(experiment_dir, "datasets.dill")

# Use a context manager so the file handle is flushed and closed even if
# dill.dump raises part-way through.
with open(datasets_path, 'wb') as f:
    dill.dump(datasets, f)

# 3. Test data on models

`datasets` is a list of records, each holding a data object that evaluates model performance over the `n_samples`, `n_sources`, `noise`, and `rep` grid. Each model is tested on every data object in `datasets`.

In [10]:
# Reload the serialized dataset records from datasets_path; the context
# manager closes the file handle once reading is done.
# NOTE: dill.load (like pickle) can execute arbitrary code from the file --
# only load files that were produced by a trusted run.
with open(datasets_path, 'rb') as f:
    datasets = dill.load(f)
datasets

[{'graph': '../data/interactomes/STRING_v10.5.experimental.costs_adj_g4.gpickle',
  'size': 500,
  'n_samples': [20, 100, 500],
  'n_sources': [10],
  'noise': [0, 0.2, 0.4, 0.6, 0.8, 1.0],
  'rep': [1, 2, 3],
  'data_object': <evaluation.Data at 0x7fc1c7ac0cc0>}]

In [10]:
def get_model(name, **params):
    """Return a model factory for ``name`` with ``params`` pre-bound.

    Parameters
    ----------
    name : str
        Model family key; one of ``'ica1'`` or ``'gsd'``.
    **params
        Keyword arguments bound onto the selected model callable via
        ``functools.partial``.

    Returns
    -------
    functools.partial
        ``partial(ICA1, **params)`` for ``'ica1'`` or
        ``partial(GSD, **params)`` for ``'gsd'``.

    Raises
    ------
    ValueError
        If ``name`` is not a recognised model family. Failing here gives a
        clear message instead of returning ``None`` and letting the error
        surface later, away from its cause.
    """
    if name == 'ica1':
        return partial(ICA1, **params)
    if name == 'gsd':
        return partial(GSD, **params)
    raise ValueError(
        "Unknown model name {!r}; expected 'ica1' or 'gsd'.".format(name)
    )

In [11]:
# Collect one record per (dataset, model family, parameter combination).
all_results = []

for dataset in datasets:
    # Each entry of param_grid['models'] maps a model family name to its parameter grid.
    for name, model_param_grid in param_grid['models'].items():
        # Every combination in the grid is evaluated as a separate model.
        for model_params in ParameterGrid(model_param_grid):
            # Start the record from the parameters plus the family name,
            # and print it before the evaluation starts.
            model_dic = dict(model_params, method=name)
            print(model_dic)

            model = get_model(name, **model_params)
            data = dataset['data_object']
            results, scores = data.evaluate(model)

            # Attach both values returned by evaluate() to the record.
            model_dic.update(results=results, scores=scores)
            all_results.append(model_dic)

{'cutoff': 0.01, 'n_components': 10, 'method': 'ica1'}




{'a': 0.1, 'graph': '../data/interactomes/STRING_v10.5.experimental.costs_adj_g4.gpickle', 'initializer': 'ica', 'n_components': 10, 'method': 'gsd'}


MemoryError: 