Skip to content

Commit

Permalink
[SPARK-5477] refactor stat.py
Browse files Browse the repository at this point in the history
There is only a single `stat.py` file for the `mllib.stat` package. We recently added `MultivariateGaussian` under `mllib.stat.distribution` in Scala/Java. It would be nice to refactor `stat.py` and make it easy to expand. Note that `ChiSqTestResult` is moved from `mllib.stat` to `mllib.stat.test`. The latter is used in Scala/Java. It is only used in the return value of `Statistics.chiSqTest`, so this should be an okay change.

davies

Author: Xiangrui Meng <meng@databricks.com>

Closes apache#4266 from mengxr/py-stat-refactor and squashes the following commits:

1a5e1db [Xiangrui Meng] refactor stat.py
  • Loading branch information
mengxr committed Jan 29, 2015
1 parent 5ad78f6 commit a3dc618
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 54 deletions.
1 change: 1 addition & 0 deletions mllib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@
<directory>../python</directory>
<includes>
<include>pyspark/mllib/*.py</include>
<include>pyspark/mllib/stat/*.py</include>
<include>pyspark/ml/*.py</include>
<include>pyspark/ml/param/*.py</include>
</includes>
Expand Down
24 changes: 24 additions & 0 deletions python/pyspark/mllib/stat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Python package for statistical functions in MLlib.
"""

from pyspark.mllib.stat._statistics import *

__all__ = ["Statistics", "MultivariateStatisticalSummary"]
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,14 @@
# limitations under the License.
#

"""
Python package for statistical functions in MLlib.
"""

from pyspark import RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import Matrix, _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat.test import ChiSqTestResult


__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics']
__all__ = ['MultivariateStatisticalSummary', 'Statistics']


class MultivariateStatisticalSummary(JavaModelWrapper):
Expand Down Expand Up @@ -53,54 +50,6 @@ def min(self):
return self.call("min").toArray()


class ChiSqTestResult(JavaModelWrapper):
    """
    .. note:: Experimental

    Holds the outcome of a chi-squared hypothesis test.

    Every property below forwards to the same-named method of the
    wrapped Java object (``self._java_model``).
    """

    @property
    def method(self):
        """
        Name of the test method.
        """
        java_result = self._java_model
        return java_result.method()

    @property
    def pValue(self):
        """
        Probability, under the assumption that the null hypothesis is
        true, of obtaining a test statistic at least as extreme as the
        one that was actually observed.
        """
        java_result = self._java_model
        return java_result.pValue()

    @property
    def degreesOfFreedom(self):
        """
        Degree(s) of freedom of the hypothesis test.

        Return type should be Number (e.g. Int, Double) or tuples of
        Numbers.
        """
        java_result = self._java_model
        return java_result.degreesOfFreedom()

    @property
    def statistic(self):
        """
        Test statistic.
        """
        java_result = self._java_model
        return java_result.statistic()

    @property
    def nullHypothesis(self):
        """
        Null hypothesis of the test.
        """
        java_result = self._java_model
        return java_result.nullHypothesis()

    def __str__(self):
        # Reuse the wrapped Java object's own toString() rendering.
        java_result = self._java_model
        return java_result.toString()

class Statistics(object):

@staticmethod
Expand Down
69 changes: 69 additions & 0 deletions python/pyspark/mllib/stat/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.mllib.common import JavaModelWrapper


__all__ = ["ChiSqTestResult"]


class ChiSqTestResult(JavaModelWrapper):
    """
    .. note:: Experimental

    Holds the outcome of a chi-squared hypothesis test.

    Every property below forwards to the same-named method of the
    wrapped Java object (``self._java_model``).
    """

    @property
    def method(self):
        """
        Name of the test method.
        """
        java_result = self._java_model
        return java_result.method()

    @property
    def pValue(self):
        """
        Probability, under the assumption that the null hypothesis is
        true, of obtaining a test statistic at least as extreme as the
        one that was actually observed.
        """
        java_result = self._java_model
        return java_result.pValue()

    @property
    def degreesOfFreedom(self):
        """
        Degree(s) of freedom of the hypothesis test.

        Return type should be Number (e.g. Int, Double) or tuples of
        Numbers.
        """
        java_result = self._java_model
        return java_result.degreesOfFreedom()

    @property
    def statistic(self):
        """
        Test statistic.
        """
        java_result = self._java_model
        return java_result.statistic()

    @property
    def nullHypothesis(self):
        """
        Null hypothesis of the test.
        """
        java_result = self._java_model
        return java_result.nullHypothesis()

    def __str__(self):
        # Reuse the wrapped Java object's own toString() rendering.
        java_result = self._java_model
        return java_result.toString()
2 changes: 1 addition & 1 deletion python/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ function run_mllib_tests() {
run_test "pyspark/mllib/rand.py"
run_test "pyspark/mllib/recommendation.py"
run_test "pyspark/mllib/regression.py"
run_test "pyspark/mllib/stat.py"
run_test "pyspark/mllib/stat/_statistics.py"
run_test "pyspark/mllib/tree.py"
run_test "pyspark/mllib/util.py"
run_test "pyspark/mllib/tests.py"
Expand Down

0 comments on commit a3dc618

Please sign in to comment.