Browse files

Add debugging gems and refactor Stats module

  • Loading branch information...
1 parent d5455d3 commit c4d81e3ca754fa521079fbb2c049f24def82d0da Brian Stanwyck committed Aug 30, 2011
Showing with 211 additions and 145 deletions.
  1. +1 −1 Gemfile
  2. +2 −0 glymour.gemspec
  3. +150 −101 lib/glymour.rb
  4. +22 −16 spec/statistics_spec.rb
  5. +36 −27 spec/structure_learning_spec.rb
View
2 Gemfile
@@ -1,4 +1,4 @@
source "http://rubygems.org"
# Specify your gem's dependencies in glymour.gemspec
-gemspec
+gemspec
View
2 glymour.gemspec
@@ -12,6 +12,8 @@ Gem::Specification.new do |s|
s.description = %q{Implements supervised Bayesian structure learning, as well as extra tools to help train a Bayesian net using ActiveRecord data}
s.add_development_dependency "rspec"
+ s.add_development_dependency "pry"
+ s.add_development_dependency "ruby-debug"
s.add_dependency 'rgl'
s.add_dependency 'sbn'
View
251 lib/glymour.rb
@@ -1,9 +1,12 @@
require "glymour"
+require "pry"
+require "rinruby"
+#require "ruby-debug/debugger"
@vanstee
vanstee added a note Aug 30, 2011

Oops. Probably didn't want to commit that.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
# Generates the complete graph on n vertices if n is an integer, otherwise
# the complete graph on the vertices in the enumerable given
def complete_graph(n)
- set = (n.class == 'Integer' || n.class == 'Fixnum') ? (1..n) : n
+ set = n.is_a?(Integer) ? (1..n) : n
RGL::ImplicitGraph.new do |g|
g.vertex_iterator { |b| set.each(&b) }
g.adjacent_iterator do |x, b|
@@ -12,7 +15,141 @@ def complete_graph(n)
end
end
+def remove_edge(orig, e)
+ new_graph = RGL::ImplicitGraph.new do |g|
+ g.vertex_iterator { |b| orig.vertices.each(&b) }
+ g.adjacent_iterator do |x, b|
+ new_adj = orig.adjacent_vertices(x).reject { |v| e.source == v or e.target == v }
+ new_adj.each { |y| b.call(y) }
+ end
+ end
+ new_graph
+end
+
+def cartprod(*args)
@vanstee
vanstee added a note Aug 30, 2011

Any reason Array#product didn't work?

Oh, man. Seriously? It's always in the last method name you look for :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+ result = [[]]
+ while [] != args
+ t, result = result, []
+ b, *args = args
+ t.each do |a|
+ b.each do |n|
+ result << a + [n]
+ end
+ end
+ end
+ result
+end
+
module Glymour
+ module Statistics
+ # Grabs variable data from a table (mostly for quantizing continous vars)
+ # block determines the variable value for a given row of table, e.g. { |row| row[:first_seen_at] } or &:first_seen_at
+ class Variable
+ attr_accessor :intervals, :table
+
+ def initialize(table, num_classes=nil, &block)
+ @table = table
+ @block = Proc.new &block
+ @intervals = num_classes ? to_intervals(num_classes) : nil
+ end
+
+ # Apply @block to each column value, and
+ # return a list of evenly divided intervals [x1, x2, ..., x(n_classes)]
+ # So that x1 is the minimum, xn is the max
+ def to_intervals(num_classes)
+ step = (values.max - values.min)/(num_classes-1).to_f
+ (0..(num_classes-1)).map { |k| values.min + k*step }
+ end
+
+ def value_at(row)
+ @block.call(row)
+ end
+
+ # Gives an array of all variable values in table
+ def values
+ @intervals ? @table.map { |row| location_in_interval(row) } : @table.map(&@block)
+ end
+
+ # Gives the location of a column value within a finite set of interval values (i.e. gives discrete state after classing a continuous variable)
+ def location_in_interval(row)
+ intervals.each_with_index do |x, i|
+ return i if value_at(row) <= x
+ end
+
+ # Return -1 if value is not within intervals
+ -1
+ end
+ end
+
+ # Accepts a list of variables and a row of data and generates a learning row for a Bayes net
+ def learning_row(variables, row)
+ var_values = {}
+
+ variables.each do |var|
+ if var.intervals
+ var_values[var] = var.value_at(row).location_in_interval
+ else
+ var_values[var] = var.value_at row
+ end
+ end
+
+ var_values
+ end
+
+ # Takes two or more Variables
+ # Returns true if first two variables are coindependent given the rest
+ def coindependent?(p_val, *variables)
+ #TODO: Raise an exception if variables have different tables?
+ R.echo(false)
+ # Push variable data into R
+ n_vars = variables.length
+ var_names = (1..n_vars).map { |k| "var#{k}" }
+ R.eval "library(vcd)"
+ variables.each_with_index do |var, k|
+ # Rinruby can't handle true and false values, so use 1 and 0 resp. instead
+ sanitized_values = var.values.map do |value|
+ if value.is_a?(TrueClass) || value.is_a?(FalseClass)
+ # 1 if true, 0 if false
+ (value && 1) || 0
+ else
+ value
+ end
+ end
+
+ R.assign "var#{k+1}", sanitized_values
+ end
+
+ R.eval "cond_data <- data.frame(#{var_names.join(', ')})\n"
+ R.eval "t <-table(cond_data)"
+ s_vals =[]
+
+ cond_vars = variables[2..(variables.length-1)]
+
+ # If no conditioning variables are given, just return the chi square test for the first two
+ if cond_vars.length == 0
+ R.eval "chisq <- chisq.test(t)"
+ observed_p = R.pull "chisq$p.value"
+ return observed_p > p_val
+ end
+
+ cond_values = cond_vars.map { |var| (1..var.values.uniq.length).collect }
+
+ cartprod(*cond_values).each do |value|
+ R.eval "chisq <- chisq.test(t[,,#{value.join(',')}])"
+ R.eval "s <- chisq$statistic"
+ if R.pull("s") == (1/0.0)
+ puts "s is Infinity"
+ return true
+ else
+ observed_s = R.pull("s").to_f
+ s_vals << observed_s
+ end
+ end
+ chisq_sum = s_vals.compact.reduce(0, :+)
+ R.pull("pchisq(#{chisq_sum}, #{cond_vars.length})").to_f > p_val
+ end
+ end
+
# Provides graph structures and algorithms for determining edge structure of a Bayesian net
module StructureLearning
module PowerSet
@@ -30,6 +167,10 @@ def power_set
end
module GraphAlgorithms
+ def has_edge?(e)
+ self.edges.include? e
+ end
+
# Returns a (unique) list of vertices adjacent to vertex a or b.
# This is denoted "Aab" in Spirtes-Glymour's paper.
def adjacent_either(a, b)
@@ -70,7 +211,6 @@ def non_transitive
end
end
-
# Takes a list of vertices and a hash of source => [targets] pairs and generates a directed graph
def make_directed(vertices, directed_edges)
RGL::ImplicitGraph.new do |g|
@@ -83,7 +223,7 @@ def make_directed(vertices, directed_edges)
end
class LearningNet
-
+ include Glymour::Statistics
attr_accessor :net, :directed_edges, :n
def initialize(variables)
@net = complete_graph(variables).extend(GraphAlgorithms)
@@ -104,14 +244,12 @@ def step
next
else
# Are a and b independent conditioned on any subsets of Aab ^ Uab of cardinality n+1?
- if intersect.power_set.select {|s| s.length == n+1}.any? { |subset|
- coindependent?(a, b, subset)
- }
- g.remove_vertex e
+ if intersect.power_set.select {|s| s.length == n+1}.any? { |subset| coindependent?(0.05, a, b, *subset) }
+ @net = remove_edge(net, e)
any_independent = true
end
end
- n += 1
+ @n += 1
end
any_independent
end
@@ -122,15 +260,15 @@ def learn_structure
# set final_net to the _second-to-last_ state of @net
begin
final_net = net
- end while step
+ end while self.step
# Direct remaining edges in @net as much as possible
final_net.non_transitive.each do |triple|
- a, b, c = triple[0], triple[1], triple[2]
- intersect = (final_net.adjacent_either(a, c) & final_net.verts_on_path(a, c)).extend(PowerSet)
+ a, b, c = *triple
+ intersect = (final_net.extend(StructureLearning::GraphAlgorithms).adjacent_either(a, c) & final_net.extend(StructureLearning::GraphAlgorithms).verts_on_paths(a, c)).extend(PowerSet)
if intersect.power_set.select {|s| s.include? b}.all? { |subset|
- # TODO: Are a and c dependent conditioned on subset?
+ coindependent?(a, c, *subset)
}
directed_edges[a] << b
directed_edges[c] << b
@@ -189,95 +327,6 @@ def to_bn(title)
end
end
- module Statistics
- require 'rinruby'
- # Grabs variable data from a table (mostly for quantizing continous vars)
- # block determines the variable value for a given row of table, e.g. { |row| row[:first_seen_at] } or &:first_seen_at
- class Variable
- attr_accessor :intervals, :table
-
- def initialize(table, num_classes=nil, &block)
- @table = table
- @block = Proc.new &block
- @intervals = num_classes ? to_intervals(num_classes) : nil
- end
-
- # Apply @block to each column value, and
- # return a list of evenly divided intervals [x1, x2, ..., x(n_classes)]
- # So that x1 is the minimum, xn is the max
- def to_intervals(num_classes)
- step = (values.max - values.min)/(num_classes-1).to_f
- (0..(num_classes-1)).map { |k| values.min + k*step }
- end
-
- def value_at(row)
- @block.call(row)
- end
-
- # Gives an array of all variable values in table
- def values
- @intervals ? @table.map { |row| location_in_interval(row) } : @table.map(&@block)
- end
-
- # Gives the location of a column value within a finite set of interval values (i.e. gives discrete state after classing a continuous variable)
- def location_in_interval(row)
- intervals.each_with_index do |x, i|
- return i if value_at(row) <= x
- end
-
- # Return -1 if value is not within intervals
- -1
- end
- end
-
- # Accepts a list of variables and a row of data and generates a learning row for a Bayes net
- def learning_row(variables, row)
- var_values = {}
-
- variables.each do |var|
- if var.intervals
- var_values[var] = var.value_at(row).location_in_interval
- else
- var_values[var] = var.value_at row
- end
- end
-
- var_values
- end
-
- # Takes two or more Variables
- # Returns true if first two variables are coindependent given the rest
- def coindependent?(p_val, *variables)
- #TODO: Raise an exception if variables have different tables?
-
- # Push variable data into R
- n_vars = variables.length
- var_names = (1..n_vars).map { |k| "var#{k}" }
- R.eval "library(vcd)"
- variables.each_with_index do |var, k|
- # Rinruby can't handle true and false values, so use 1 and 0 resp. instead
-
- sanitized_values = var.values.map do |value|
- if value.is_a?(TrueClass) || value.is_a?(FalseClass)
- # 1 if true, 0 if false
- (value && 1) || 0
- else
- value
- end
- end
-
- R.assign "var#{k+1}", sanitized_values
- end
-
- R.eval "cond_data <- data.frame(#{var_names.join(', ')})\n"
- R.eval "indep <- coindep_test(cond_data)"
- R.eval "p <- indep$p.value"
-
- observed_p = R.pull("p").to_f
- observed_p > p_val
- end
- end
-
module Correlator
end
View
38 spec/statistics_spec.rb
@@ -1,28 +1,34 @@
+require 'glymour'
+require 'rgl/implicit'
+require 'rgl/dot'
require 'spec_helper'
describe Glymour::Statistics do
before(:all) do
+ R.echo(true)
Stats = StatsDummy.new
- @table_data = []
-
- 500.times do
- rain = rand < 0.5
- temp = rain ? 65 + 10 * (rand - 0.5) : 75 + 10 * (rand - 0.5)
- sprinklers = rain ? rand < 0.1 : rand < 0.5
- cat_out = rain || sprinklers ? 0.05 : 0.4
- grass_wet = (rain && (rand < 0.9)) || (sprinklers && (rand < 0.7)) || (cat_out && (rand < 0.01))
- @table_data << { :rain => rain, :sprinklers => sprinklers, :cat_out => cat_out, :grass_wet => grass_wet, :temp => temp }
+ @alarm_data = []
+ 10000.times do
+ e = rand < 0.002
+ b = rand < 0.001
+ a = b ? (e ? rand < 0.95 : rand < 0.94) : (e ? rand < 0.29 : rand < 0.001)
+ j = a ? rand < 0.90 : rand < 0.05
+ m = a ? rand < 0.70 : rand < 0.01
+ @alarm_data << { :e => e, :b => b, :a => a, :j => j, :m => m}
end
-
- @rain_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:rain] }
- @temp_var = Glymour::Statistics::Variable.new(@table_data, 10) { |r| r[:temp] }
- @grass_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:grass_wet] }
- @sprinklers_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:sprinklers] }
+ @e = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:e] }
+ @b = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:b] }
+ @a = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:a] }
+ @j = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:j] }
+ @m = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:m] }
+ alarm_vars = [@e, @b, @a, @j, @m]
+ @alarm_net = Glymour::StructureLearning::LearningNet.new(alarm_vars)
+ binding.pry
end
it 'should give coindependence data for two variable' do
- Stats.coindependent?(0.05, @temp_var, @sprinklers_var, @rain_var).should be_true
- Stats.coindependent?(0.05, @rain_var, @sprinklers_var).should be_false
+ Stats.coindependent?(0.05, @e, @a).should be_false
+ Stats.coindependent?(0.05, @j, @m, @a).should be_true
end
end
View
63 spec/structure_learning_spec.rb
@@ -1,5 +1,6 @@
require 'glymour'
require 'rgl/implicit'
+require 'rgl/dot'
describe Glymour::StructureLearning do
before(:each) do
@@ -16,7 +17,7 @@
end
describe 'Within GraphAlgorithms' do
- before(:each) do
+ before(:each) do
class RGL::ImplicitGraph
include Glymour::StructureLearning::GraphAlgorithms
end
@@ -29,6 +30,12 @@ class RGL::ImplicitGraph
@g = make_directed(vertices, edges)
end
+ it 'should remove an edge from a graph' do
+ g = complete_graph(4)
+ orig_edge_count = g.edges.length
+ remove_edge(g, g.edges.first).edges.length.should_not eq orig_edge_count
+ end
+
it 'should compute the vertices on all paths between two vertices' do
path_verts = @g.verts_on_paths(4, 6)
[4, 3, 7, 5, 6].each { |v| path_verts.should include v }
@@ -52,35 +59,37 @@ class RGL::ImplicitGraph
describe Glymour::StructureLearning::LearningNet do
before(:all) do
- @table_data = []
-
- 500.times do
- rain = rand < 0.5
- temp = rain ? 65 + 10 * (rand - 0.5) : 75 + 10 * (rand - 0.5)
- sprinklers = rain ? rand < 0.1 : rand < 0.5
- cat_out = rain || sprinklers ? 0.05 : 0.4
- grass_wet = (rain && (rand < 0.9)) || (sprinklers && (rand < 0.7)) || (cat_out && (rand < 0.01))
- @table_data << { :rain => rain, :sprinklers => sprinklers, :cat_out => cat_out, :grass_wet => grass_wet, :temp => temp }
+ @alarm_data = []
+ 100000.times do
+ e = rand < 0.002
+ b = rand < 0.001
+ a = b ? (e ? rand < 0.95 : rand < 0.94) : (e ? rand < 0.29 : rand < 0.001)
+ j = a ? rand < 0.90 : rand < 0.05
+ m = a ? rand < 0.70 : rand < 0.01
+ @alarm_data << { :e => e, :b => b, :a => a, :j => j, :m => m}
end
-
- @rain_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:rain] }
- @temp_var = Glymour::Statistics::Variable.new(@table_data, 10) { |r| r[:temp] }
- @grass_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:grass_wet] }
- @sprinklers_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:sprinklers] }
- @cat_var = Glymour::Statistics::Variable.new(@table_data) { |r| r[:cat_out] }
-
- @vars = [@rain_var, @temp_var, @grass_var, @sprinklers_var, @cat_var]
- @lnet = Glymour::StructureLearning::LearningNet.new(@vars)
+ @e = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:e] }
+ @b = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:b] }
+ @a = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:a] }
+ @j = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:j] }
+ @m = Glymour::Statistics::Variable.new(@alarm_data) { |r| r[:m] }
+ alarm_vars = [@e, @b, @a, @j, @m]
+ @v_hash = { @e => 'e', @b => 'b', @a => 'a', @j => 'j', @m => 'm' }
+ @alarm_net = Glymour::StructureLearning::LearningNet.new(alarm_vars)
end
- it 'should initialize a LearningNet on a set of variables' do
- @lnet.should_not be_nil
- end
-
- it 'should perform a step of the structure learning algorithm' do
- prev_net = @lnet.net
- @lnet.step
- prev_net.should_not eq @lnet.net
+ # it 'should perform a step of the structure learning algorithm' do
+ # prev_net = @lnet.net
+ # @lnet.step
+ # prev_net.should_not eq @lnet.net
+ # end
+ #
+ it 'should perform the structure learning algorithm' do
+ @alarm_net.learn_structure
+
+ @alarm_net.net.edges.each do |e|
+ puts "#{@v_hash[e.source]} => #{@v_hash[e.target]}"
+ end
end
end
end

0 comments on commit c4d81e3

Please sign in to comment.