
Commit d10babb: addressed comments v0.2
brkyvz committed May 2, 2015
1 parent 4b74b24 commit d10babb
Showing 3 changed files with 10 additions and 10 deletions.
python/pyspark/sql/dataframe.py (5 additions & 3 deletions)
@@ -875,9 +875,9 @@ def fillna(self, value, subset=None):

         return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx)

-    def corr(self, col1, col2, method="pearson"):
+    def corr(self, col1, col2, method=None):
         """
-        Calculate the correlation of two columns of a DataFrame as a double value. Currently only
+        Calculates the correlation of two columns of a DataFrame as a double value. Currently only
         supports the Pearson Correlation Coefficient.
         :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases.
@@ -889,6 +889,8 @@ def corr(self, col1, col2, method="pearson"):
             raise ValueError("col1 should be a string.")
         if not isinstance(col2, str):
             raise ValueError("col2 should be a string.")
+        if not method:
+            method = "pearson"
         if not method == "pearson":
             raise ValueError("Currently only the calculation of the Pearson Correlation " +
                              "coefficient is supported.")
@@ -1378,7 +1380,7 @@ class DataFrameStatFunctions(object):
     def __init__(self, df):
         self.df = df

-    def corr(self, col1, col2, method="pearson"):
+    def corr(self, col1, col2, method=None):
         return self.df.corr(col1, col2, method)

     corr.__doc__ = DataFrame.corr.__doc__
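Net effect of the dataframe.py change: method now defaults to None and is normalized to "pearson" inside corr, so callers may omit it entirely. A minimal sketch of the resulting call pattern, assuming an existing SQLContext named sqlContext (as in the PySpark shell) and illustrative column names:

# A minimal sketch, assuming an existing SQLContext named sqlContext (as in the
# PySpark shell); the DataFrame and column names are illustrative, not part of
# the patch.
df = sqlContext.createDataFrame(
    [(i, 2 * i, -1.0 * i) for i in range(10)], ["a", "b", "c"])

df.stat.corr("a", "b")               # method omitted, treated as "pearson"; ~1.0
df.corr("a", "c", method="pearson")  # explicit method still accepted; ~-1.0
# df.corr("a", "b", method="spearman") would raise ValueError, since only the
# Pearson coefficient is supported.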
DataFrameStatFunctions.scala (1 addition & 1 deletion)
@@ -37,7 +37,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * @return The Pearson Correlation Coefficient as a Double.
    */
   def corr(col1: String, col2: String, method: String): Double = {
-    assert(method == "pearson", "Currently only the calculation of the Pearson Correlation " +
+    require(method == "pearson", "Currently only the calculation of the Pearson Correlation " +
       "coefficient is supported.")
     StatFunctions.pearsonCorrelation(df, Seq(col1, col2))
   }
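The DataFrameStatFunctions.scala change swaps assert for require: require throws IllegalArgumentException and documents a precondition on caller-supplied arguments, whereas assert throws AssertionError, is meant for internal invariants, and can be elided by the Scala compiler. The Python side of the patch follows the same idea by raising ValueError instead of using assert (which python -O strips); a hypothetical standalone sketch of that validation pattern:

# Hypothetical helper, not part of the patch: validate caller input by raising
# an exception rather than using assert, which is stripped under python -O.
def check_corr_method(method=None):
    method = method or "pearson"
    if method != "pearson":
        raise ValueError("Currently only the calculation of the Pearson Correlation "
                         "coefficient is supported.")
    return method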
DataFrameStatSuite.scala (4 additions & 6 deletions)
@@ -30,10 +30,10 @@ class DataFrameStatSuite extends FunSuite {
   def toLetter(i: Int): String = (i + 97).toChar.toString

   test("Frequent Items") {
-    val rows = Array.tabulate(1000) { i =>
+    val rows = Seq.tabulate(1000) { i =>
       if (i % 3 == 0) (1, toLetter(1), -1.0) else (i, toLetter(i), i * -1.0)
     }
-    val df = sqlCtx.sparkContext.parallelize(rows).toDF("numbers", "letters", "negDoubles")
+    val df = rows.toDF("numbers", "letters", "negDoubles")

     val results = df.stat.freqItems(Array("numbers", "letters"), 0.1)
     val items = results.collect().head
@@ -46,17 +46,15 @@
   }

   test("pearson correlation") {
-    val df = sqlCtx.sparkContext.parallelize(
-      Array.tabulate(10)(i => (i, 2 * i, i * -1.0))).toDF("a", "b", "c")
+    val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c")
     val corr1 = df.stat.corr("a", "b", "pearson")
     assert(math.abs(corr1 - 1.0) < 1e-6)
     val corr2 = df.stat.corr("a", "c", "pearson")
     assert(math.abs(corr2 + 1.0) < 1e-6)
   }

   test("covariance") {
-    val rows = Array.tabulate(10)(i => (i, 2.0 * i, toLetter(i)))
-    val df = sqlCtx.sparkContext.parallelize(rows).toDF("singles", "doubles", "letters")
+    val df = Seq.tabulate(10)(i => (i, 2.0 * i, toLetter(i))).toDF("singles", "doubles", "letters")

     val results = df.stat.cov("singles", "doubles")
     assert(math.abs(results - 55.0 / 3) < 1e-6)
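As a sanity check on the 55.0 / 3 expected by the covariance test: for x_i = i and y_i = 2 * i with i = 0..9, the sample covariance is 2 * 82.5 / 9 = 55 / 3, roughly 18.33. A small sketch reproducing the number in plain Python, independent of Spark:

# Sample covariance of x_i = i and y_i = 2 * i for i = 0..9; matches the
# 55.0 / 3 the "covariance" test asserts against.
xs = list(range(10))
ys = [2.0 * i for i in xs]
mean_x, mean_y = sum(xs) / 10.0, sum(ys) / 10.0
cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) / (10 - 1)
print(cov, 55.0 / 3)  # both print 18.333...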
