ENH documentation

jmschrei · Mar 24, 2016 · 925aa98 · 925aa98
1 parent 5f1d3e1
commit 925aa98
Show file tree

Hide file tree

Showing 9 changed files with 607 additions and 171 deletions.
diff --git a/pomegranate/BayesianNetwork.pyx b/pomegranate/BayesianNetwork.pyx
@@ -18,6 +18,19 @@ cdef class BayesianNetwork( Model ):
 	represent conditional dependencies of the children on their parents, and the
 	lack of an edge represents a conditional independence. 
 
+	Parameters
+	----------
+	name : str, optional
+		The name of the model. Default is None
+
+	Attributes
+	----------
+	states : list, shape (n_states,)
+		A list of all the state objects in the model
+
+	graph : networkx.DiGraph
+		The underlying graph object.
+
 	Example
 	-------
 	>>> from pomegranate import *
@@ -61,11 +74,22 @@ cdef class BayesianNetwork( Model ):
 	[['B', 'A']]
 	"""
 
-	def bake( self, verbose=False ): 
-		"""
-		The Bayesian Network is going to be mostly a wrapper for the Factor
-		Graph, as probabilities, inference, and training can be done more
-		efficiently on them.
+	def bake( self ): 
+		"""Finalize the topology of the model.
+
+		Assign a numerical index to every state and create the underlying arrays
+		corresponding to the states and edges between the states. This method 
+		must be called before any of the probability-calculating methods. This
+		includes converting conditional probability tables into joint probability
+		tables and creating a list of both marginal and table nodes.
+
+		Parameters
+		----------
+		None
+
+		Returns
+		-------
+		None
 		"""
 
 		# Initialize the factor graph
@@ -205,17 +229,19 @@ cdef class BayesianNetwork( Model ):
 			array with the values being ordered according to the nodes incorporation
 			in the graph (the order fed into .add_states/add_nodes) and None for
 			variables which are unknown. If nothing is fed in then calculate the
-			marginal of the graph.
+			marginal of the graph. Default is {}.
+
 		max_iterations : int, optional
 			The number of iterations with which to do loopy belief propogation.
-			Usually requires only 1.
+			Usually requires only 1. Default is 100.
+
 		check_input : bool, optional
 			Check to make sure that the observed symbol is a valid symbol for that
-			distribution to produce.
+			distribution to produce. Default is True.
 
 		Returns
 		-------
-		probabilitie : array-like, shape (n_nodes)
+		probabilities : array-like, shape (n_nodes)
 			An array of univariate distribution objects showing the probabilities
 			of each variable.
 		"""
@@ -239,17 +265,19 @@ cdef class BayesianNetwork( Model ):
 			array with the values being ordered according to the nodes incorporation
 			in the graph (the order fed into .add_states/add_nodes) and None for
 			variables which are unknown. If nothing is fed in then calculate the
-			marginal of the graph.
+			marginal of the graph. Default is {}.
+
 		max_iterations : int, optional
 			The number of iterations with which to do loopy belief propogation.
-			Usually requires only 1.
+			Usually requires only 1. Default is 100.
+
 		check_input : bool, optional
 			Check to make sure that the observed symbol is a valid symbol for that
-			distribution to produce.
+			distribution to produce. Default is True.
 
 		Returns
 		-------
-		probabilitie : array-like, shape (n_nodes)
+		probabilities : array-like, shape (n_nodes)
 			An array of univariate distribution objects showing the probabilities
 			of each variable.
 		"""
@@ -278,11 +306,13 @@ cdef class BayesianNetwork( Model ):
 		items : array-like, shape (n_samples, n_nodes)
 			The data to train on, where each row is a sample and each column
 			corresponds to the associated variable.
+
 		weights : array-like, shape (n_nodes), optional
-			The weight of each sample as a positive double
+			The weight of each sample as a positive double. Default is None.
+
 		inertia : double, optional
 			The inertia for updating the distributions, passed along to the
-			distribution method.
+			distribution method. Default is 0.0.
 
 		Returns
 		-------
@@ -298,9 +328,9 @@ cdef class BayesianNetwork( Model ):
 			if isinstance( state.distribution, ConditionalProbabilityTable ):
 				idx = [ indices[ dist ] for dist in state.distribution.parameters[1] ] + [i]
 				data = [ [ item[i] for i in idx ] for item in items ]
-				state.distribution.from_sample( data, weights, inertia )
+				state.distribution.fit( data, weights, inertia )
 			else:
-				state.distribution.from_sample( [ item[i] for item in items ], weights, inertia )
+				state.distribution.fit( [ item[i] for item in items ], weights, inertia )
 
 		self.bake()
 		return self
@@ -319,8 +349,10 @@ cdef class BayesianNetwork( Model ):
 			Data matrix to impute. Missing values must be either None (if lists)
 			or np.nan (if numpy.ndarray). Will fill in these values with the
 			maximally likely ones.
+
 		max_iterations : int, optional
-			Number of iterations to run loopy belief propogation for.
+			Number of iterations to run loopy belief propagation for. Default
+			is 100.
 
 		Returns
 		-------

diff --git a/pomegranate/FactorGraph.pyx b/pomegranate/FactorGraph.pyx
@@ -14,9 +14,16 @@ if sys.version_info[0] > 2:
 	xrange = range
 
 cdef class FactorGraph( Model ):
-	"""
-	A biparte graph between factors and conditional probability
-	distributions.
+	"""A Factor Graph model.
+
+	A bipartite graph where conditional probability tables are on one side,
+	and marginals for each of the variables involved are on the other
+	side.
+
+	Parameters
+	----------
+	name : str, optional
+		The name of the model. Default is None.
 	"""
 
 	cdef numpy.ndarray transitions, edge_count, marginals
@@ -27,27 +34,44 @@ cdef class FactorGraph( Model ):
 		the model when output. Name may not contain spaces or newlines.
 		"""
 
-		# Save the name or make up a name.
 		self.name = name or str( id(self) )
 		self.states = []
 		self.edges = []
 
 	def add_node( self, n ):
-		"""
-		Add a node to the graph.
+		"""Add a node to the given model. 
+		
+		The node must not already be in the model, nor may it be part of any 
+		other model that will eventually be combined with this one.
+
+		Parameters
+		----------
+		n : Node
+			A node object to be added to the model.
+
+		Returns
+		-------
+		None
 		"""
 
 		self.states.append( n )
 
-	def bake( self, verbose=False ): 
-		"""
-		Finalize the topology of the model, and assign a numerical index to
-		every node. This method must be called before any of the probability-
-		calculating or sampling methods.
-		
-		This fills in self.states (a list of all states in order), the sparse
-		matrices of transitions and their weights, and also will merge silent
-		states.
+	def bake( self ): 
+		"""Finalize the topology of the model.
+
+		Assign a numerical index to every state and create the underlying arrays
+		corresponding to the states and edges between the states. This method 
+		must be called before any of the probability-calculating methods. This 
+		is the same as the HMM bake, except that at the end it sets current
+		state information.
+
+		Parameters
+		----------
+		None
+
+		Returns
+		-------
+		None
 		"""
 
 		n, m = len(self.states), len(self.edges)
@@ -127,17 +151,55 @@ cdef class FactorGraph( Model ):
 		self.edges = []  
 
 	def marginal( self ):
-		"""
-		Return the marginal of the graph.
+		"""Return the marginal probabilities of each variable in the graph.
+
+		This is equivalent to a pass of belief propagation on a graph where
+		no data has been given. This will calculate the probability of each
+		variable being in each possible emission when nothing is known.
+
+		Parameters
+		----------
+		None
+
+		Returns
+		-------
+		marginals : array-like, shape (n_nodes)
+			An array of univariate distribution objects showing the marginal
+			probabilities of that variable.
 		"""
 
 		return self.forward_backward( {} )
 
 	def forward_backward( self, data, max_iterations=10, verbose=False ):
-		"""
-		Perform the sum-product algorithm. The term 'marginal node' and 'variable node'
-		are used interchangably as I wrote this method while very excited over the course
-		of several days.
+		"""Returns the probabilities of each variable in the graph given evidence.
+
+		This calculates the marginal probability distributions for each state given
+		the evidence provided through loopy belief propagation. Loopy belief
+		propagation is an approximate algorithm which is exact for certain graph
+		structures.
+
+		Parameters
+		----------
+		data : dict or array-like, shape <= n_nodes
+			The evidence supplied to the graph. This can either be a dictionary
+			with keys being state names and values being the observed values
+			(either the emissions or a distribution over the emissions) or an
+			array with the values being ordered according to the nodes incorporation
+			in the graph (the order fed into .add_states/add_nodes) and None for
+			variables which are unknown. If nothing is fed in then calculate the
+			marginal of the graph.
+		max_iterations : int, optional
+			The number of iterations with which to do loopy belief propagation.
+			Usually requires only 1. Default is 10.
+		verbose : bool, optional
+			Verbosity flag accepted by the method (presumably toggles progress
+			output while running; confirm against the body). Default is False.
+
+		Returns
+		-------
+		probabilities : array-like, shape (n_nodes)
+			An array of univariate distribution objects showing the probabilities
+			of each variable.
 		"""
 
 		n, m = len( self.states ), len( self.transitions )
@@ -288,4 +350,4 @@ cdef class FactorGraph( Model ):
 
 		# We've already computed the current belief about the marginals, so
 		# we can just return that.
-		return current_distributions[ numpy.where( self.marginals == 1 ) ]
+		return current_distributions[ numpy.where( self.marginals == 1 ) ]
diff --git a/pomegranate/MarkovChain.pyx b/pomegranate/MarkovChain.pyx
@@ -4,6 +4,9 @@
 # Contact: Jacob Schreiber <jmschreiber91@gmail.com>
 
 import numpy
+import json
+
+from .distributions import Distribution
 
 cdef class MarkovChain(object):
 	"""A Markov Chain.
@@ -89,13 +92,13 @@ cdef class MarkovChain(object):
 
 		weights : array-like, shape (n_samples,), optional
 			The initial weights of each sample. If nothing is passed in then 
-			each sample is assumed to be the same weight.
+			each sample is assumed to be the same weight. Default is None.
 
 		inertia : double, optional
 			The weight of the previous parameters of the model. The new
 			parameters will roughly be old_param*inertia + new_param*(1-inertia), 
 			so an inertia of 0 means ignore the old parameters, whereas an
-			inertia of 1 means ignore the new parameters.
+			inertia of 1 means ignore the new parameters. Default is 0.0.
 
 		Returns
 		-------
@@ -119,7 +122,7 @@ cdef class MarkovChain(object):
 
 		weights : array-like, shape (n_samples,), optional
 			The initial weights of each sample. If nothing is passed in then 
-			each sample is assumed to be the same weight.
+			each sample is assumed to be the same weight. Default is None.
 
 		Returns
 		-------
@@ -159,7 +162,7 @@ cdef class MarkovChain(object):
 			The weight of the previous parameters of the model. The new
 			parameters will roughly be old_param*inertia + new_param*(1-inertia), 
 			so an inertia of 0 means ignore the old parameters, whereas an
-			inertia of 1 means ignore the new parameters.
+			inertia of 1 means ignore the new parameters. Default is 0.0.
 
 		Returns
 		-------
@@ -168,3 +171,49 @@ cdef class MarkovChain(object):
 
 		for i in range(self.k+1):
 			self.distributions[i].from_summaries( inertia=inertia )
+
+	def to_json( self, separators=(',', ' : '), indent=4 ):
+		"""Serialize the model to a JSON.
+
+		Parameters
+		----------
+		separators : tuple, optional 
+		    The two separators to pass to the json.dumps function for formatting.
+		    Default is (',', ' : ').
+
+		indent : int, optional
+		    The indentation to use at each level. Passed to json.dumps for
+		    formatting. Default is 4.
+
+		Returns
+		-------
+		json : str
+		    A properly formatted JSON object.
+		"""
+
+		model = { 
+		            'class' : 'MarkovChain',
+		            'distributions'  : [ json.loads( d.to_json() ) for d in self.distributions ]
+		        }
+
+		return json.dumps( model, separators=separators, indent=indent )
+
+	@classmethod
+	def from_json( cls, s ):
+		"""Read in a serialized model and return the corresponding MarkovChain.
+
+		Parameters
+		----------
+		s : str
+		    A JSON formatted string containing the file.
+
+		Returns
+		-------
+		model : object
+		    A properly initialized and baked model.
+		"""
+
+		d = json.loads( s )
+		distributions = [ Distribution.from_json( json.dumps(j) ) for j in d['distributions'] ] 
+		model = MarkovChain( distributions )
+		return model