In [None]:
    def _conditional_mutual_information(self, bdb, genid, modelno, X, W, Z, Y,
            numsamples=None):
        # WARNING: SUPER EXPERIMENTAL.
        # Computes the conditional mutual information I(X:W|Z,Y=y), defined
        # defined as the expectation E_z~Z{X:W|Z=z,Y=y}.
        # X, W, and Z must each be a list [(rowid, colno), ..].
        # Y is an evidence list [(rowid,colno,val), ..].
        if numsamples is None:
            numsamples = self.n_samples
        # All sets must be disjoint.
        all_cols = X + W + Z + [(r,c) for r,c,_ in Y]
        if len(all_cols) != len(set(all_cols)):
            raise ValueError('Duplicate cells received in '
                'conditional_mutual_information.\n'
                'X: {}\nW: {}\nZ: {}\nY: {}'.format(X, W, Z, Y))
        # Simulate from joint.
        XWZ_samples = self.simulate(bdb, genid, modelno, X+W+Z,
            Y, numpredictions=numsamples)
        # Simple Monte Carlo
        mi = logpz = logpxwz = logpxz = logpwz = 0
        for s in XWZ_samples:
            Qx = [(r,c,v) for ((r,c),v) in zip(X, s[:len(X)])]
            Qw = [(r,c,v) for ((r,c),v) in zip(W, s[len(X):len(X)+len(W)])]
            Qz = [(r,c,v) for ((r,c),v) in zip(Z, s[len(X)+len(W):])]
            if Z:
                logpz = self._joint_logpdf(bdb, genid, modelno, Qz, Y)
            else:
                logpz = 0
            logpxwz = self._joint_logpdf(bdb, genid, modelno, Qx+Qw+Qz, Y)
            logpxz = self._joint_logpdf(bdb, genid, modelno, Qx+Qz, Y)
            logpwz = self._joint_logpdf(bdb, genid, modelno, Qw+Qz, Y)
            mi += logpz + logpxwz - logpxz - logpwz
        # TODO: linfoot?
        # TODO: If negative, report to user that reliable answer cannot be
        # returned with current `numsamples`.
        # Averaging is in direct space is correct.
        return mi/numsamples


[(1, 2, 3), (4, 5, 6)]

In [None]:
# X + W + Z -> concatenates lists
# set(X) -> unique(X) in MATLAB lingo
# zip([1,2,3],[1,2,3])
# zip(*zip([1,2,3],[1,2,3]))

In [1]:
    def simulate(self, bdb, genid, modelno, targets, constraints,
            numpredictions=1):
        # Delegate to crosscat if colnos+constraints all lcols.
        colnos = [c for _,c in targets]
        all_cols = [c for _,c,_ in constraints] + colnos
        if all(f not in all_cols for f in self.fcols(bdb, genid)):
            Y_cc = [(r, self.cc_colno(bdb, genid, c), v)
                for r, c, v in constraints]
            Q_cc = [(r, self.cc_colno(bdb, genid, c)) for r,c in targets]
            return self.cc(bdb, genid).simulate_joint(bdb,
                self.cc_id(bdb, genid), Q_cc, Y_cc, modelno,
                num_predictions=numpredictions)
        # Solve inference problem by sampling-importance resampling.
        result = []
        for r,_ in targets:
            assert r == targets[0][0], "Cannot simulate more than one row, "\
                "%s and %s requested" % (targets[0][0], r)
        for _ in xrange(numpredictions):
            samples, weights = self._weighted_sample(bdb, genid, modelno,
                targets[0][0], constraints)
            p = np.exp(np.asarray(weights) - np.max(weights))
            p /= np.sum(p)
            draw = np.nonzero(bdb.np_prng.multinomial(1,p))[0][0]
            s = [samples[draw].get(col) for col in colnos]
            result.append(s)
        return result

{1, 2}

In [7]:
# all(iterable) -> Return True if bool(x) is True for all values x in the iterable.
# for x in (f not in [0,3,4] for f in [1,2,3]):
#     print x


False

[(1, 2, 3), (1, 2, 3)]

In [2]:
import os
import pytest

import bayeslite
from bayeslite.sqlite3_util import sqlite3_quote_name as quote

import bdbcontrib
from bdbcontrib.metamodels.composer import Composer
from bdbcontrib.predictors import random_forest
from bdbcontrib.predictors import keplers_law
from bdbcontrib.predictors import multiple_regression

In [3]:
PATH_TESTS = os.path.dirname('/home/casarsa/Git/bdbcontrib/tests/test_composer.py')
PATH_ROOT = os.path.dirname(PATH_TESTS)
PATH_EXAMPLES = os.path.join(PATH_ROOT, 'examples')
PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites')
PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv')

In [4]:
bdb = bayeslite.bayesdb_open()
bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV, header=True, create=True)
bdbcontrib.nullify(bdb, 'satellites', 'NaN')
# Composer.
composer = Composer(n_samples=5)
composer.register_foreign_predictor(
    multiple_regression.MultipleRegression)
composer.register_foreign_predictor(keplers_law.KeplersLaw)
composer.register_foreign_predictor(random_forest.RandomForest)
bayeslite.bayesdb_register_metamodel(bdb, composer)

bdb.execute('''
    CREATE GENERATOR t1 FOR satellites USING composer(
        default (
            Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
            Users CATEGORICAL, Purpose CATEGORICAL,
            Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
            Apogee_km NUMERICAL, Eccentricity NUMERICAL,
            Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL,
        ),
        random_forest (
            Type_of_Orbit CATEGORICAL
                GIVEN Apogee_km, Perigee_km,
                    Eccentricity, Period_minutes, Launch_Mass_kg,
                    Power_watts, Anticipated_Lifetime, Class_of_orbit
        ),
        keplers_law (
            Period_minutes NUMERICAL
                GIVEN Perigee_km, Apogee_km
        ),
        multiple_regression (
            Anticipated_Lifetime NUMERICAL
                GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                Contractor
        ),
        DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
        DEPENDENT(Contractor, Country_of_Contractor),
        INDEPENDENT(Country_of_Operator, Date_of_Launch)
    );''')


    # Use complex generator for interesting test cases.
# generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1_cc')
# composer = core.bayesdb_generator_metamodel(bdb, generator_id)
# composer.conditional_mutual_information(bdb, generator_id, modelno, X, W, Z, Y, numsamples=None)

<bayeslite.bql.BayesDBCursor at 0x7f66aa3069d0>

In [21]:
bdb = bayeslite.bayesdb_open()
bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
    header=True, create=True)
composer = Composer(n_samples=5)
bayeslite.bayesdb_register_metamodel(bdb, composer)

os.getenv('BAYESDB_WIZARD_MODE')

bdb.execute('''
    CREATE GENERATOR t1 FOR satellites USING composer(
        default (
            Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
            Users CATEGORICAL, Purpose CATEGORICAL,
            Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
            Apogee_km NUMERICAL, Eccentricity NUMERICAL
        ),
        crosscat (
            Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL
        )
    );''')

ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 6))



BQLParseError: Parse errors:
  syntax error near t1
  If you would like to analyze your own data with BayesDB, please contact
  bayesdb@mit.edu to participate in our research project.


In [19]:
os.system('export BAYESDB_WIZARD_MODE=1')

0

In [7]:
bdb.execute('''
    CREATE GENERATOR t3 FOR satellites USING composer(
        default (
            Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
            Users CATEGORICAL, Purpose CATEGORICAL,
            Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
            Apogee_km NUMERICAL, Eccentricity NUMERICAL
        ),
        random_forest (
            Apogee_km NUMERICAL GIVEN Operator_Owner
        )
    );''')

ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 6))



BQLParseError: Parse errors:
  syntax error near t3
  If you would like to analyze your own data with BayesDB, please contact
  bayesdb@mit.edu to participate in our research project.


In [5]:
# Leo Casarsa - Dec 21 2015
# First test on how to compute mutual information between columns using the Composer class
# Mutual information between columns that should give a high value in the satellites example is here low.

# Using one model and 1000 samplees
# 	MI('Country_of_Operator'; 'Operator_Owner') = 8.32664164641e-07
# Using one model and 100 samples
# 	MI('Operator_Owner';'Excentricity') = 4.4408920985e-17

os.environ['BAYESDB_WIZARD_MODE'] = '1'

# Use satellites for all tests.
PATH_TESTS = os.path.dirname('/home/casarsa/Git/bdbcontrib/tests/test_composer.py')
PATH_ROOT = os.path.dirname(PATH_TESTS)
PATH_EXAMPLES = os.path.join(PATH_ROOT, 'examples')
PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites')
PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv')

bdb = bayeslite.bayesdb_open()
bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
header=True, create=True)
composer = Composer(n_samples=5)
bayeslite.bayesdb_register_metamodel(bdb, composer)
# Using crosscat and default to specify models should work.
bdb.execute('''
CREATE GENERATOR t1 FOR satellites USING composer(
    default (
        Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
        Users CATEGORICAL, Purpose CATEGORICAL,
        Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
        Apogee_km NUMERICAL, Eccentricity NUMERICAL
    ),
    crosscat (
        Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL
    )
);''')

generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1')
composer = bayeslite.core.bayesdb_generator_metamodel(bdb, generator_id)
bdb.execute('INITIALIZE 1 MODELS FOR t1')

c1 = bayeslite.core.bayesdb_table_column_number(bdb,'satellites','Country_of_Operator')
c2 = bayeslite.core.bayesdb_table_column_number(bdb,'satellites','Operator_Owner')
c3 = bayeslite.core.bayesdb_table_column_number(bdb,'satellites','Eccentricity')


MI_first = composer.column_mutual_information(bdb, generator_id, 0, c1, c2, numsamples=1000)
MI_second = composer.column_mutual_information(bdb, generator_id, 0, c2, c3, numsamples=100)

print MI_first, MI_second

8.32664164641e-07 4.4408920985e-17
