From 77409967008415b7ea57e9349d296350e6519687 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Wed, 6 May 2015 14:48:25 -0700 Subject: [PATCH 01/48] [HOT-FIX] Move HiveWindowFunctionQuerySuite.scala to hive compatibility dir. Author: Yin Huai Closes #5951 from yhuai/fixBuildMaven and squashes the following commits: fdde183 [Yin Huai] Move HiveWindowFunctionQuerySuite.scala to hive compatibility dir. --- .../spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sql/hive/{ => compatibility}/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala (100%) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala similarity index 100% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala rename to sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala From 1ad04dae038673a448f529c39b17817b78d6acd0 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Wed, 6 May 2015 16:15:51 -0700 Subject: [PATCH 02/48] [SPARK-5995] [ML] Make Prediction dev API public Changes: * Update protected prediction methods, following design doc. **<--most interesting change** * Changed abstract classes for Estimator and Model to be public. Added DeveloperApi tag. (I kept the traits for Estimator/Model Params private.) * Changed ProbabilisticClassificationModel method names to use probability instead of probabilities. CC: mengxr shivaram etrain Author: Joseph K. Bradley Closes #5913 from jkbradley/public-dev-api and squashes the following commits: e9aa0ea [Joseph K. Bradley] moved findMax to DenseVector and renamed to argmax. fixed bug for vector of length 0 15b9957 [Joseph K. Bradley] renamed probabilities to probability in method names 5cda84d [Joseph K. Bradley] regenerated sharedParams 7d1877a [Joseph K. Bradley] Made spark.ml prediction abstractions public. Organized their prediction methods for efficient computation of multiple output columns. 
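For reference, a standalone sketch of two behaviors this patch introduces: the `DenseVector.argmax` helper (first maximal index wins ties; -1 for a length-0 vector) and the log-odds thresholding behind `LogisticRegressionModel.raw2prediction`. Plain Scala with no Spark dependencies; the object name `PredictionSketch` and the sample values are hypothetical:

```scala
object PredictionSketch extends App {

  // Mirror of the DenseVector.argmax added in Vectors.scala below:
  // the first maximal element wins ties; a length-0 input yields -1.
  def argmax(values: Array[Double]): Int = {
    if (values.isEmpty) {
      -1
    } else {
      var maxIdx = 0
      var maxValue = values(0)
      var i = 1
      while (i < values.length) {
        if (values(i) > maxValue) {
          maxIdx = i
          maxValue = values(i)
        }
        i += 1
      }
      maxIdx
    }
  }
  assert(argmax(Array(0.3, 0.9, 0.9, 0.1)) == 1) // first max wins the tie
  assert(argmax(Array.empty[Double]) == -1)      // the length-0 case

  // Thresholding the sigmoid of the margin at t is equivalent to thresholding
  // the raw margin at log(t / (1 - t)), so raw2prediction can skip the sigmoid.
  def sigmoid(m: Double): Double = 1.0 / (1.0 + math.exp(-m))
  val t = 0.7
  val rawThreshold = math.log(t / (1.0 - t))
  for (m <- Seq(-2.0, 0.0, 0.5, 1.0, 2.0)) {
    assert((sigmoid(m) > t) == (m > rawThreshold))
  }
  println("argmax and threshold checks passed")
}
```

This equivalence is what lets the reorganized prediction methods compute only the requested output columns without redundant work.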
--- .../ml/{impl/estimator => }/Predictor.scala | 50 ++------ .../spark/ml/classification/Classifier.scala | 110 ++++++------------ .../DecisionTreeClassifier.scala | 5 +- .../ml/classification/GBTClassifier.scala | 5 +- .../classification/LogisticRegression.scala | 100 ++++++---------- .../ProbabilisticClassifier.scala | 100 +++++++++++----- .../RandomForestClassifier.scala | 5 +- .../ml/param/shared/SharedParamsCodeGen.scala | 6 +- .../spark/ml/param/shared/sharedParams.scala | 4 +- .../ml/regression/DecisionTreeRegressor.scala | 5 +- .../spark/ml/regression/GBTRegressor.scala | 5 +- .../ml/regression/LinearRegression.scala | 5 +- .../ml/regression/RandomForestRegressor.scala | 5 +- .../spark/ml/regression/Regressor.scala | 42 ++----- .../spark/ml/{impl => }/tree/treeParams.scala | 4 +- .../apache/spark/mllib/linalg/Vectors.scala | 22 ++++ 16 files changed, 206 insertions(+), 267 deletions(-) rename mllib/src/main/scala/org/apache/spark/ml/{impl/estimator => }/Predictor.scala (86%) rename mllib/src/main/scala/org/apache/spark/ml/{impl => }/tree/treeParams.scala (99%) diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala similarity index 86% rename from mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala rename to mllib/src/main/scala/org/apache/spark/ml/Predictor.scala index e8b3628140e99..0e53877de92db 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala @@ -15,29 +15,23 @@ * limitations under the License. */ -package org.apache.spark.ml.impl.estimator +package org.apache.spark.ml -import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} -import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.{DataFrame, Row} /** - * :: DeveloperApi :: - * - * Trait for parameters for prediction (regression and classification). - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. + * (private[ml]) Trait for parameters for prediction (regression and classification). */ -@DeveloperApi -private[spark] trait PredictorParams extends Params +private[ml] trait PredictorParams extends Params with HasLabelCol with HasFeaturesCol with HasPredictionCol { /** @@ -63,7 +57,7 @@ private[spark] trait PredictorParams extends Params } /** - * :: AlphaComponent :: + * :: DeveloperApi :: * * Abstraction for prediction problems (regression and classification). * @@ -73,11 +67,9 @@ private[spark] trait PredictorParams extends Params * parameter to specify the concrete type. * @tparam M Specialization of [[PredictionModel]]. If you subclass this type, use this type * parameter to specify the concrete type for the corresponding model. - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. 
*/ -@AlphaComponent -private[spark] abstract class Predictor[ +@DeveloperApi +abstract class Predictor[ FeaturesType, Learner <: Predictor[FeaturesType, Learner, M], M <: PredictionModel[FeaturesType, M]] @@ -104,8 +96,6 @@ private[spark] abstract class Predictor[ } /** - * :: DeveloperApi :: - * * Train a model using the given dataset and parameters. * Developers can implement this instead of [[fit()]] to avoid dealing with schema validation * and copying parameters into the model. @@ -113,12 +103,9 @@ private[spark] abstract class Predictor[ * @param dataset Training dataset * @return Fitted model */ - @DeveloperApi protected def train(dataset: DataFrame): M /** - * :: DeveloperApi :: - * * Returns the SQL DataType corresponding to the FeaturesType type parameter. * * This is used by [[validateAndTransformSchema()]]. @@ -126,7 +113,6 @@ private[spark] abstract class Predictor[ * * The default value is VectorUDT, but it may be overridden if FeaturesType is not Vector. */ - @DeveloperApi protected def featuresDataType: DataType = new VectorUDT override def transformSchema(schema: StructType): StructType = { @@ -146,7 +132,7 @@ private[spark] abstract class Predictor[ } /** - * :: AlphaComponent :: + * :: DeveloperApi :: * * Abstraction for a model for prediction tasks (regression and classification). * @@ -154,11 +140,9 @@ private[spark] abstract class Predictor[ * E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features. * @tparam M Specialization of [[PredictionModel]]. If you subclass this type, use this type * parameter to specify the concrete type for the corresponding model. - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. */ -@AlphaComponent -private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]] +@DeveloperApi +abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]] extends Model[M] with PredictorParams { /** @group setParam */ @@ -168,8 +152,6 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel def setPredictionCol(value: String): M = set(predictionCol, value).asInstanceOf[M] /** - * :: DeveloperApi :: - * * Returns the SQL DataType corresponding to the FeaturesType type parameter. * * This is used by [[validateAndTransformSchema()]]. @@ -177,7 +159,6 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel * * The default value is VectorUDT, but it may be overridden if FeaturesType is not Vector. */ - @DeveloperApi protected def featuresDataType: DataType = new VectorUDT override def transformSchema(schema: StructType): StructType = { @@ -192,12 +173,8 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel * @return transformed dataset with [[predictionCol]] of type [[Double]] */ override def transform(dataset: DataFrame): DataFrame = { - // This default implementation should be overridden as needed. - - // Check schema transformSchema(dataset.schema, logging = true) - - if ($(predictionCol) != "") { + if ($(predictionCol).nonEmpty) { dataset.withColumn($(predictionCol), callUDF(predict _, DoubleType, col($(featuresCol)))) } else { this.logWarning(s"$uid: Predictor.transform() was called as NOOP" + @@ -207,11 +184,8 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel } /** - * :: DeveloperApi :: - * * Predict label for the given features. * This internal method is used to implement [[transform()]] and output [[predictionCol]]. 
*/ - @DeveloperApi protected def predict(features: FeaturesType): Double } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index d3361e24705c8..263d580fe2dd3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -17,8 +17,8 @@ package org.apache.spark.ml.classification -import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.{PredictionModel, PredictorParams, Predictor} import org.apache.spark.ml.param.shared.HasRawPredictionCol import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.mllib.linalg.{Vector, VectorUDT} @@ -26,15 +26,12 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} + /** - * :: DeveloperApi :: - * Params for classification. - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. + * (private[spark]) Params for classification. */ -@DeveloperApi -private[spark] trait ClassifierParams extends PredictorParams - with HasRawPredictionCol { +private[spark] trait ClassifierParams + extends PredictorParams with HasRawPredictionCol { override protected def validateAndTransformSchema( schema: StructType, @@ -46,23 +43,21 @@ private[spark] trait ClassifierParams extends PredictorParams } /** - * :: AlphaComponent :: + * :: DeveloperApi :: + * * Single-label binary or multiclass classification. * Classes are indexed {0, 1, ..., numClasses - 1}. * * @tparam FeaturesType Type of input features. E.g., [[Vector]] * @tparam E Concrete Estimator type * @tparam M Concrete Model type - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. */ -@AlphaComponent -private[spark] abstract class Classifier[ +@DeveloperApi +abstract class Classifier[ FeaturesType, E <: Classifier[FeaturesType, E, M], M <: ClassificationModel[FeaturesType, M]] - extends Predictor[FeaturesType, E, M] - with ClassifierParams { + extends Predictor[FeaturesType, E, M] with ClassifierParams { /** @group setParam */ def setRawPredictionCol(value: String): E = set(rawPredictionCol, value).asInstanceOf[E] @@ -71,17 +66,15 @@ private[spark] abstract class Classifier[ } /** - * :: AlphaComponent :: + * :: DeveloperApi :: + * * Model produced by a [[Classifier]]. * Classes are indexed {0, 1, ..., numClasses - 1}. * * @tparam FeaturesType Type of input features. E.g., [[Vector]] * @tparam M Concrete Model type - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. */ -@AlphaComponent -private[spark] +@DeveloperApi abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]] extends PredictionModel[FeaturesType, M] with ClassifierParams { @@ -101,13 +94,27 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur * @return transformed dataset */ override def transform(dataset: DataFrame): DataFrame = { - // This default implementation should be overridden as needed. 
- - // Check schema transformSchema(dataset.schema, logging = true) - val (numColsOutput, outputData) = - ClassificationModel.transformColumnsImpl[FeaturesType](dataset, this) + // Output selected columns only. + // This is a bit complicated since it tries to avoid repeated computation. + var outputData = dataset + var numColsOutput = 0 + if (getRawPredictionCol != "") { + outputData = outputData.withColumn(getRawPredictionCol, + callUDF(predictRaw _, new VectorUDT, col(getFeaturesCol))) + numColsOutput += 1 + } + if (getPredictionCol != "") { + val predUDF = if (getRawPredictionCol != "") { + callUDF(raw2prediction _, DoubleType, col(getRawPredictionCol)) + } else { + callUDF(predict _, DoubleType, col(getFeaturesCol)) + } + outputData = outputData.withColumn(getPredictionCol, predUDF) + numColsOutput += 1 + } + if (numColsOutput == 0) { logWarning(s"$uid: ClassificationModel.transform() was called as NOOP" + " since no output columns were set.") @@ -116,22 +123,17 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur } /** - * :: DeveloperApi :: - * * Predict label for the given features. * This internal method is used to implement [[transform()]] and output [[predictionCol]]. * * This default implementation for classification predicts the index of the maximum value * from [[predictRaw()]]. */ - @DeveloperApi override protected def predict(features: FeaturesType): Double = { - predictRaw(features).toArray.zipWithIndex.maxBy(_._1)._2 + raw2prediction(predictRaw(features)) } /** - * :: DeveloperApi :: - * * Raw prediction for each possible label. * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives * a measure of confidence in each possible label (where larger = more confident). @@ -141,48 +143,12 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur * This raw prediction may be any real number, where a larger value indicates greater * confidence for that label. */ - @DeveloperApi protected def predictRaw(features: FeaturesType): Vector -} - -private[ml] object ClassificationModel { /** - * Added prediction column(s). This is separated from [[ClassificationModel.transform()]] - * since it is used by [[org.apache.spark.ml.classification.ProbabilisticClassificationModel]]. - * @param dataset Input dataset - * @return (number of columns added, transformed dataset) + * Given a vector of raw predictions, select the predicted label. + * This may be overridden to support thresholds which favor particular labels. + * @return predicted label */ - def transformColumnsImpl[FeaturesType]( - dataset: DataFrame, - model: ClassificationModel[FeaturesType, _]): (Int, DataFrame) = { - - // Output selected columns only. - // This is a bit complicated since it tries to avoid repeated computation. 
- var tmpData = dataset - var numColsOutput = 0 - if (model.getRawPredictionCol != "") { - // output raw prediction - val features2raw: FeaturesType => Vector = model.predictRaw - tmpData = tmpData.withColumn(model.getRawPredictionCol, - callUDF(features2raw, new VectorUDT, col(model.getFeaturesCol))) - numColsOutput += 1 - if (model.getPredictionCol != "") { - val raw2pred: Vector => Double = (rawPred) => { - rawPred.toArray.zipWithIndex.maxBy(_._1)._2 - } - tmpData = tmpData.withColumn(model.getPredictionCol, - callUDF(raw2pred, DoubleType, col(model.getRawPredictionCol))) - numColsOutput += 1 - } - } else if (model.getPredictionCol != "") { - // output prediction - val features2pred: FeaturesType => Double = model.predict - tmpData = tmpData.withColumn(model.getPredictionCol, - callUDF(features2pred, DoubleType, col(model.getFeaturesCol))) - numColsOutput += 1 - } - (numColsOutput, tmpData) - } - + protected def raw2prediction(rawPrediction: Vector): Double = rawPrediction.toDense.argmax } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 419e5ba05d38a..dcebea1d4b015 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -18,10 +18,9 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor} -import org.apache.spark.ml.impl.tree._ +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.tree.{DecisionTreeModel, Node} +import org.apache.spark.ml.tree.{TreeClassifierParams, DecisionTreeParams, DecisionTreeModel, Node} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 534ea95b1c538..ae51b05a0c42d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -21,11 +21,10 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor} -import org.apache.spark.ml.impl.tree._ +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.regression.DecisionTreeRegressionModel -import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel} +import org.apache.spark.ml.tree.{GBTParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index b73be035e29b5..550369d18cfec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -21,9 +21,8 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors} +import org.apache.spark.mllib.linalg._ import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions._ import org.apache.spark.storage.StorageLevel /** @@ -99,76 +98,17 @@ class LogisticRegressionModel private[ml] ( /** @group setParam */ def setThreshold(value: Double): this.type = set(threshold, value) + /** Margin (rawPrediction) for class label 1. For binary classification only. */ private val margin: Vector => Double = (features) => { BLAS.dot(features, weights) + intercept } + /** Score (probability) for class label 1. For binary classification only. */ private val score: Vector => Double = (features) => { val m = margin(features) 1.0 / (1.0 + math.exp(-m)) } - override def transform(dataset: DataFrame): DataFrame = { - // This is overridden (a) to be more efficient (avoiding re-computing values when creating - // multiple output columns) and (b) to handle threshold, which the abstractions do not use. - // TODO: We should abstract away the steps defined by UDFs below so that the abstractions - // can call whichever UDFs are needed to create the output columns. - - // Check schema - transformSchema(dataset.schema, logging = true) - - // Output selected columns only. - // This is a bit complicated since it tries to avoid repeated computation. - // rawPrediction (-margin, margin) - // probability (1.0-score, score) - // prediction (max margin) - var tmpData = dataset - var numColsOutput = 0 - if ($(rawPredictionCol) != "") { - val features2raw: Vector => Vector = (features) => predictRaw(features) - tmpData = tmpData.withColumn($(rawPredictionCol), - callUDF(features2raw, new VectorUDT, col($(featuresCol)))) - numColsOutput += 1 - } - if ($(probabilityCol) != "") { - if ($(rawPredictionCol) != "") { - val raw2prob = udf { (rawPreds: Vector) => - val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1))) - Vectors.dense(1.0 - prob1, prob1): Vector - } - tmpData = tmpData.withColumn($(probabilityCol), raw2prob(col($(rawPredictionCol)))) - } else { - val features2prob = udf { (features: Vector) => predictProbabilities(features) : Vector } - tmpData = tmpData.withColumn($(probabilityCol), features2prob(col($(featuresCol)))) - } - numColsOutput += 1 - } - if ($(predictionCol) != "") { - val t = $(threshold) - if ($(probabilityCol) != "") { - val predict = udf { probs: Vector => - if (probs(1) > t) 1.0 else 0.0 - } - tmpData = tmpData.withColumn($(predictionCol), predict(col($(probabilityCol)))) - } else if ($(rawPredictionCol) != "") { - val predict = udf { rawPreds: Vector => - val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1))) - if (prob1 > t) 1.0 else 0.0 - } - tmpData = tmpData.withColumn($(predictionCol), predict(col($(rawPredictionCol)))) - } else { - val predict = udf { features: Vector => this.predict(features) } - tmpData = tmpData.withColumn($(predictionCol), predict(col($(featuresCol)))) - } - numColsOutput += 1 - } - if (numColsOutput == 0) { - this.logWarning(s"$uid: LogisticRegressionModel.transform() was called as NOOP" + - " since no output columns were set.") - } - tmpData - } - override val numClasses: Int = 2 /** @@ -179,17 +119,43 @@ class LogisticRegressionModel private[ml] ( if (score(features) > 
getThreshold) 1 else 0 } - override protected def predictProbabilities(features: Vector): Vector = { - val s = score(features) - Vectors.dense(1.0 - s, s) + override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { + rawPrediction match { + case dv: DenseVector => + var i = 0 + while (i < dv.size) { + dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i))) + i += 1 + } + dv + case sv: SparseVector => + throw new RuntimeException("Unexpected error in LogisticRegressionModel:" + + " raw2probabilitiesInPlace encountered SparseVector") + } } override protected def predictRaw(features: Vector): Vector = { val m = margin(features) - Vectors.dense(0.0, m) + Vectors.dense(-m, m) } override def copy(extra: ParamMap): LogisticRegressionModel = { copyValues(new LogisticRegressionModel(parent, weights, intercept), extra) } + + override protected def raw2prediction(rawPrediction: Vector): Double = { + val t = getThreshold + val rawThreshold = if (t == 0.0) { + Double.NegativeInfinity + } else if (t == 1.0) { + Double.PositiveInfinity + } else { + Math.log(t / (1.0 - t)) + } + if (rawPrediction(1) > rawThreshold) 1 else 0 + } + + override protected def probability2prediction(probability: Vector): Double = { + if (probability(1) > getThreshold) 1 else 0 + } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 8519841c5c26c..330ae2938f4e0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -17,16 +17,16 @@ package org.apache.spark.ml.classification -import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.types.{DoubleType, DataType, StructType} /** - * Params for probabilistic classification. + * (private[classification]) Params for probabilistic classification. */ private[classification] trait ProbabilisticClassifierParams extends ClassifierParams with HasProbabilityCol { @@ -42,17 +42,15 @@ private[classification] trait ProbabilisticClassifierParams /** - * :: AlphaComponent :: + * :: DeveloperApi :: * * Single-label binary or multiclass classifier which can output class conditional probabilities. * * @tparam FeaturesType Type of input features. E.g., [[Vector]] * @tparam E Concrete Estimator type * @tparam M Concrete Model type - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. */ -@AlphaComponent +@DeveloperApi private[spark] abstract class ProbabilisticClassifier[ FeaturesType, E <: ProbabilisticClassifier[FeaturesType, E, M], @@ -65,17 +63,15 @@ private[spark] abstract class ProbabilisticClassifier[ /** - * :: AlphaComponent :: + * :: DeveloperApi :: * * Model produced by a [[ProbabilisticClassifier]]. * Classes are indexed {0, 1, ..., numClasses - 1}. * * @tparam FeaturesType Type of input features. E.g., [[Vector]] * @tparam M Concrete Model type - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. 
*/ -@AlphaComponent +@DeveloperApi private[spark] abstract class ProbabilisticClassificationModel[ FeaturesType, M <: ProbabilisticClassificationModel[FeaturesType, M]] @@ -95,39 +91,79 @@ private[spark] abstract class ProbabilisticClassificationModel[ * @return transformed dataset */ override def transform(dataset: DataFrame): DataFrame = { - // This default implementation should be overridden as needed. - - // Check schema transformSchema(dataset.schema, logging = true) - val (numColsOutput, outputData) = - ClassificationModel.transformColumnsImpl[FeaturesType](dataset, this) - // Output selected columns only. - if ($(probabilityCol) != "") { - // output probabilities - outputData.withColumn($(probabilityCol), - callUDF(predictProbabilities _, new VectorUDT, col($(featuresCol)))) - } else { - if (numColsOutput == 0) { - this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" + - " since no output columns were set.") + // This is a bit complicated since it tries to avoid repeated computation. + var outputData = dataset + var numColsOutput = 0 + if ($(rawPredictionCol).nonEmpty) { + outputData = outputData.withColumn(getRawPredictionCol, + callUDF(predictRaw _, new VectorUDT, col(getFeaturesCol))) + numColsOutput += 1 + } + if ($(probabilityCol).nonEmpty) { + val probUDF = if ($(rawPredictionCol).nonEmpty) { + callUDF(raw2probability _, new VectorUDT, col($(rawPredictionCol))) + } else { + callUDF(predictProbability _, new VectorUDT, col($(featuresCol))) + } + outputData = outputData.withColumn($(probabilityCol), probUDF) + numColsOutput += 1 + } + if ($(predictionCol).nonEmpty) { + val predUDF = if ($(rawPredictionCol).nonEmpty) { + callUDF(raw2prediction _, DoubleType, col($(rawPredictionCol))) + } else if ($(probabilityCol).nonEmpty) { + callUDF(probability2prediction _, DoubleType, col($(probabilityCol))) + } else { + callUDF(predict _, DoubleType, col($(featuresCol))) } - outputData + outputData = outputData.withColumn($(predictionCol), predUDF) + numColsOutput += 1 + } + + if (numColsOutput == 0) { + this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" + + " since no output columns were set.") } + outputData } /** - * :: DeveloperApi :: + * Estimate the probability of each class given the raw prediction, + * doing the computation in-place. + * These predictions are also called class conditional probabilities. + * + * This internal method is used to implement [[transform()]] and output [[probabilityCol]]. * + * @return Estimated class conditional probabilities (modified input vector) + */ + protected def raw2probabilityInPlace(rawPrediction: Vector): Vector + + /** Non-in-place version of [[raw2probabilityInPlace()]] */ + protected def raw2probability(rawPrediction: Vector): Vector = { + val probs = rawPrediction.copy + raw2probabilityInPlace(probs) + } + + /** * Predict the probability of each class given the features. * These predictions are also called class conditional probabilities. * - * WARNING: Not all models output well-calibrated probability estimates! These probabilities - * should be treated as confidences, not precise probabilities. - * * This internal method is used to implement [[transform()]] and output [[probabilityCol]]. 
+ * + * @return Estimated class conditional probabilities + */ + protected def predictProbability(features: FeaturesType): Vector = { + val rawPreds = predictRaw(features) + raw2probabilityInPlace(rawPreds) + } + + /** + * Given a vector of class conditional probabilities, select the predicted label. + * This may be overridden to support thresholds which favor particular labels. + * @return predicted label */ - @DeveloperApi - protected def predictProbabilities(features: FeaturesType): Vector + protected def probability2prediction(probability: Vector): Double = probability.toDense.argmax } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 17f59bb42e129..9954893f14359 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -20,10 +20,9 @@ package org.apache.spark.ml.classification import scala.collection.mutable import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor} -import org.apache.spark.ml.impl.tree._ +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel} +import org.apache.spark.ml.tree.{RandomForestParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index d379172e0bf53..0e1ff97a8bf60 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -40,8 +40,10 @@ private[shared] object SharedParamsCodeGen { ParamDesc[String]("predictionCol", "prediction column name", Some("\"prediction\"")), ParamDesc[String]("rawPredictionCol", "raw prediction (a.k.a. confidence) column name", Some("\"rawPrediction\"")), - ParamDesc[String]("probabilityCol", - "column name for predicted class conditional probabilities", Some("\"probability\"")), + ParamDesc[String]("probabilityCol", "Column name for predicted class conditional" + + " probabilities. Note: Not all models output well-calibrated probability estimates!" + + " These probabilities should be treated as confidences, not precise probabilities.", + Some("\"probability\"")), ParamDesc[Double]("threshold", "threshold in binary classification prediction, in range [0, 1]", isValid = "ParamValidators.inRange(0, 1)"), diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index fb1874ccfc8dc..87f86807c3c91 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -128,10 +128,10 @@ private[ml] trait HasRawPredictionCol extends Params { private[ml] trait HasProbabilityCol extends Params { /** - * Param for column name for predicted class conditional probabilities. + * Param for Column name for predicted class conditional probabilities. 
Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.. * @group param */ - final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "column name for predicted class conditional probabilities") + final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.") setDefault(probabilityCol, "probability") diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index b07c26fe79b36..f8f0b161a4812 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -18,10 +18,9 @@ package org.apache.spark.ml.regression import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor} -import org.apache.spark.ml.impl.tree._ +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.tree.{DecisionTreeModel, Node} +import org.apache.spark.ml.tree.{TreeRegressorParams, DecisionTreeParams, DecisionTreeModel, Node} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index bc796958e4545..461905c12701a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -21,10 +21,9 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor} -import org.apache.spark.ml.impl.tree._ +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.{Param, ParamMap} -import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel} +import org.apache.spark.ml.tree.{GBTParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 66c475f2d9840..e63c9a3eead52 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -25,6 +25,7 @@ import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent +import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol} import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -39,7 +40,7 @@ import 
org.apache.spark.util.StatCounter /** * Params for linear regression. */ -private[regression] trait LinearRegressionParams extends RegressorParams +private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol /** @@ -240,7 +241,7 @@ class LinearRegressionModel private[ml] ( * + \bar{y} / \hat{y}||^2 * = 1/2n ||\sum_i w_i^\prime x_i - y / \hat{y} + offset||^2 = 1/2n diff^2 * }}} - * where w_i^\prime is the effective weights defined by w_i/\hat{x_i}, offset is + * where w_i^\prime^ is the effective weights defined by w_i/\hat{x_i}, offset is * {{{ * - \sum_i (w_i/\hat{x_i})\bar{x_i} + \bar{y} / \hat{y}. * }}}, and diff is diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 0468a1be1ba74..dbc628927433d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -18,10 +18,9 @@ package org.apache.spark.ml.regression import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor} -import org.apache.spark.ml.impl.tree.{RandomForestParams, TreeRegressorParams} +import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel} +import org.apache.spark.ml.tree.{RandomForestParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala index c6b3327db6ad3..c72ef29680329 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala @@ -17,62 +17,40 @@ package org.apache.spark.ml.regression -import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} -import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.{PredictionModel, PredictorParams, Predictor} -/** - * :: DeveloperApi :: - * Params for regression. - * Currently empty, but may add functionality later. - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. - */ -@DeveloperApi -private[spark] trait RegressorParams extends PredictorParams /** - * :: AlphaComponent :: + * :: DeveloperApi :: * * Single-label regression * * @tparam FeaturesType Type of input features. E.g., [[org.apache.spark.mllib.linalg.Vector]] * @tparam Learner Concrete Estimator type * @tparam M Concrete Model type - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. 
*/ -@AlphaComponent +@DeveloperApi private[spark] abstract class Regressor[ FeaturesType, Learner <: Regressor[FeaturesType, Learner, M], M <: RegressionModel[FeaturesType, M]] - extends Predictor[FeaturesType, Learner, M] - with RegressorParams { + extends Predictor[FeaturesType, Learner, M] with PredictorParams { // TODO: defaultEvaluator (follow-up PR) } /** - * :: AlphaComponent :: + * :: DeveloperApi :: * * Model produced by a [[Regressor]]. * * @tparam FeaturesType Type of input features. E.g., [[org.apache.spark.mllib.linalg.Vector]] * @tparam M Concrete Model type. - * - * NOTE: This is currently private[spark] but will be made public later once it is stabilized. */ -@AlphaComponent -private[spark] abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]] - extends PredictionModel[FeaturesType, M] with RegressorParams { - - /** - * :: DeveloperApi :: - * - * Predict real-valued label for the given features. - * This internal method is used to implement [[transform()]] and output [[predictionCol]]. - */ - @DeveloperApi - protected def predict(features: FeaturesType): Double +@DeveloperApi +abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]] + extends PredictionModel[FeaturesType, M] with PredictorParams { + // TODO: defaultEvaluator (follow-up PR) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala similarity index 99% rename from mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala rename to mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 0e225627d4ee3..816fcedf2efb3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -15,10 +15,10 @@ * limitations under the License. */ -package org.apache.spark.ml.impl.tree +package org.apache.spark.ml.tree import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.ml.impl.estimator.PredictorParams +import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasMaxIter, HasSeed} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, BoostingStrategy => OldBoostingStrategy, Strategy => OldStrategy} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 188d1e542b5b5..f6bcdf83cd337 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -587,6 +587,28 @@ class DenseVector(val values: Array[Double]) extends Vector { } new SparseVector(size, ii, vv) } + + /** + * Find the index of a maximal element. Returns the first maximal element in case of a tie. + * Returns -1 if vector has length 0. 
+ */ + private[spark] def argmax: Int = { + if (size == 0) { + -1 + } else { + var maxIdx = 0 + var maxValue = values(0) + var i = 1 + while (i < size) { + if (values(i) > maxValue) { + maxIdx = i + maxValue = values(i) + } + i += 1 + } + maxIdx + } + } } object DenseVector { From fbf1f342a02af65f69e0ee770a2b983c69e7c089 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 6 May 2015 17:08:39 -0700 Subject: [PATCH 03/48] [HOT FIX] [SPARK-7418] Ignore flaky SparkSubmitUtilsSuite test --- .../scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index 2df2597e058cd..da9578478bed9 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -104,7 +104,7 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll { assert(jarPath.indexOf(ivyPath) >= 0, "should use non-default ivy path") } - test("search for artifact at other repositories") { + ignore("search for artifact at other repositories") { val path = SparkSubmitUtils.resolveMavenCoordinates("com.agimatec:agimatec-validation:0.9.3", Option("https://oss.sonatype.org/content/repositories/agimatec/"), None, true) assert(path.indexOf("agimatec-validation") >= 0, "should find package. If it doesn't, check" + From 4e930420c19ae7773b138dfc7db8fc03b4660251 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Wed, 6 May 2015 17:28:11 -0700 Subject: [PATCH 04/48] [SPARK-6799] [SPARKR] Remove SparkR RDD examples, add dataframe examples This PR also makes some of the DataFrame to RDD methods private as the RDD class is private in 1.4 cc rxin pwendell Author: Shivaram Venkataraman Closes #5949 from shivaram/sparkr-examples and squashes the following commits: 6c42fdc [Shivaram Venkataraman] Remove SparkR RDD examples, add dataframe examples --- R/pkg/NAMESPACE | 4 - R/pkg/R/DataFrame.R | 2 +- examples/src/main/r/dataframe.R | 54 +++++++++++ examples/src/main/r/kmeans.R | 93 ------------------- examples/src/main/r/linear_solver_mnist.R | 107 ---------------------- examples/src/main/r/logistic_regression.R | 62 ------------- examples/src/main/r/pi.R | 46 ---------- examples/src/main/r/wordcount.R | 42 --------- 8 files changed, 55 insertions(+), 355 deletions(-) create mode 100644 examples/src/main/r/dataframe.R delete mode 100644 examples/src/main/r/kmeans.R delete mode 100644 examples/src/main/r/linear_solver_mnist.R delete mode 100644 examples/src/main/r/logistic_regression.R delete mode 100644 examples/src/main/r/pi.R delete mode 100644 examples/src/main/r/wordcount.R diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 528e6608c3c82..3fb92be0940b7 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -45,8 +45,6 @@ exportMethods("cache", "showDF", "sortDF", "take", - "toJSON", - "toRDD", "unionAll", "unpersist", "where", @@ -95,14 +93,12 @@ export("cacheTable", "createExternalTable", "dropTempTable", "jsonFile", - "jsonRDD", "loadDF", "parquetFile", "sql", "table", "tableNames", "tables", - "toDF", "uncacheTable") export("sparkRSQL.init", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 56c305d912587..47d92f141cc7d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -272,7 +272,7 @@ setMethod("names", setMethod("registerTempTable", signature(x = "DataFrame", tableName = "character"), function(x, 
tableName) { - callJMethod(x@sdf, "registerTempTable", tableName) + invisible(callJMethod(x@sdf, "registerTempTable", tableName)) }) #' insertInto diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R new file mode 100644 index 0000000000000..53b817144f6ac --- /dev/null +++ b/examples/src/main/r/dataframe.R @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(SparkR) + +# Initialize SparkContext and SQLContext +sc <- sparkR.init(appName="SparkR-DataFrame-example") +sqlContext <- sparkRSQL.init(sc) + +# Create a simple local data.frame +localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18)) + +# Convert local data frame to a SparkR DataFrame +df <- createDataFrame(sqlContext, localDF) + +# Print its schema +printSchema(df) +# root +# |-- name: string (nullable = true) +# |-- age: double (nullable = true) + +# Create a DataFrame from a JSON file +path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json") +peopleDF <- jsonFile(sqlContext, path) +printSchema(peopleDF) + +# Register this DataFrame as a table. +registerTempTable(peopleDF, "people") + +# SQL statements can be run by using the sql methods provided by sqlContext +teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19") + +# Call collect to get a local data.frame +teenagersLocalDF <- collect(teenagers) + +# Print the teenagers in our dataset +print(teenagersLocalDF) + +# Stop the SparkContext now +sparkR.stop() diff --git a/examples/src/main/r/kmeans.R b/examples/src/main/r/kmeans.R deleted file mode 100644 index 6e6b5cb93789c..0000000000000 --- a/examples/src/main/r/kmeans.R +++ /dev/null @@ -1,93 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -library(SparkR) - -# Logistic regression in Spark. -# Note: unlike the example in Scala, a point here is represented as a vector of -# doubles. 
- -parseVectors <- function(lines) { - lines <- strsplit(as.character(lines) , " ", fixed = TRUE) - list(matrix(as.numeric(unlist(lines)), ncol = length(lines[[1]]))) -} - -dist.fun <- function(P, C) { - apply( - C, - 1, - function(x) { - colSums((t(P) - x)^2) - } - ) -} - -closestPoint <- function(P, C) { - max.col(-dist.fun(P, C)) -} -# Main program - -args <- commandArgs(trailing = TRUE) - -if (length(args) != 3) { - print("Usage: kmeans ") - q("no") -} - -sc <- sparkR.init(appName = "RKMeans") -K <- as.integer(args[[2]]) -convergeDist <- as.double(args[[3]]) - -lines <- textFile(sc, args[[1]]) -points <- cache(lapplyPartition(lines, parseVectors)) -# kPoints <- take(points, K) -kPoints <- do.call(rbind, takeSample(points, FALSE, K, 16189L)) -tempDist <- 1.0 - -while (tempDist > convergeDist) { - closest <- lapplyPartition( - lapply(points, - function(p) { - cp <- closestPoint(p, kPoints); - mapply(list, unique(cp), split.data.frame(cbind(1, p), cp), SIMPLIFY=FALSE) - }), - function(x) {do.call(c, x) - }) - - pointStats <- reduceByKey(closest, - function(p1, p2) { - t(colSums(rbind(p1, p2))) - }, - 2L) - - newPoints <- do.call( - rbind, - collect(lapply(pointStats, - function(tup) { - point.sum <- tup[[2]][, -1] - point.count <- tup[[2]][, 1] - point.sum/point.count - }))) - - D <- dist.fun(kPoints, newPoints) - tempDist <- sum(D[cbind(1:3, max.col(-D))]) - kPoints <- newPoints - cat("Finished iteration (delta = ", tempDist, ")\n") -} - -cat("Final centers:\n") -writeLines(unlist(lapply(kPoints, paste, collapse = " "))) diff --git a/examples/src/main/r/linear_solver_mnist.R b/examples/src/main/r/linear_solver_mnist.R deleted file mode 100644 index c864a4232d010..0000000000000 --- a/examples/src/main/r/linear_solver_mnist.R +++ /dev/null @@ -1,107 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Instructions: https://github.com/amplab-extras/SparkR-pkg/wiki/SparkR-Example:-Digit-Recognition-on-EC2 - -library(SparkR) -library(Matrix) - -args <- commandArgs(trailing = TRUE) - -# number of random features; default to 1100 -D <- ifelse(length(args) > 0, as.integer(args[[1]]), 1100) -# number of partitions for training dataset -trainParts <- 12 -# dimension of digits -d <- 784 -# number of test examples -NTrain <- 60000 -# number of training examples -NTest <- 10000 -# scale of features -gamma <- 4e-4 - -sc <- sparkR.init(appName = "SparkR-LinearSolver") - -# You can also use HDFS path to speed things up: -# hdfs:///train-mnist-dense-with-labels.data -file <- textFile(sc, "/data/train-mnist-dense-with-labels.data", trainParts) - -W <- gamma * matrix(nrow=D, ncol=d, data=rnorm(D*d)) -b <- 2 * pi * matrix(nrow=D, ncol=1, data=runif(D)) -broadcastW <- broadcast(sc, W) -broadcastB <- broadcast(sc, b) - -includePackage(sc, Matrix) -numericLines <- lapplyPartitionsWithIndex(file, - function(split, part) { - matList <- sapply(part, function(line) { - as.numeric(strsplit(line, ",", fixed=TRUE)[[1]]) - }, simplify=FALSE) - mat <- Matrix(ncol=d+1, data=unlist(matList, F, F), - sparse=T, byrow=T) - mat - }) - -featureLabels <- cache(lapplyPartition( - numericLines, - function(part) { - label <- part[,1] - mat <- part[,-1] - ones <- rep(1, nrow(mat)) - features <- cos( - mat %*% t(value(broadcastW)) + (matrix(ncol=1, data=ones) %*% t(value(broadcastB)))) - onesMat <- Matrix(ones) - featuresPlus <- cBind(features, onesMat) - labels <- matrix(nrow=nrow(mat), ncol=10, data=-1) - for (i in 1:nrow(mat)) { - labels[i, label[i]] <- 1 - } - list(label=labels, features=featuresPlus) - })) - -FTF <- Reduce("+", collect(lapplyPartition(featureLabels, - function(part) { - t(part$features) %*% part$features - }), flatten=F)) - -FTY <- Reduce("+", collect(lapplyPartition(featureLabels, - function(part) { - t(part$features) %*% part$label - }), flatten=F)) - -# solve for the coefficient matrix -C <- solve(FTF, FTY) - -test <- Matrix(as.matrix(read.csv("/data/test-mnist-dense-with-labels.data", - header=F), sparse=T)) -testData <- test[,-1] -testLabels <- matrix(ncol=1, test[,1]) - -err <- 0 - -# contstruct the feature maps for all examples from this digit -featuresTest <- cos(testData %*% t(value(broadcastW)) + - (matrix(ncol=1, data=rep(1, NTest)) %*% t(value(broadcastB)))) -featuresTest <- cBind(featuresTest, Matrix(rep(1, NTest))) - -# extract the one vs. all assignment -results <- featuresTest %*% C -labelsGot <- apply(results, 1, which.max) -err <- sum(testLabels != labelsGot) / nrow(testLabels) - -cat("\nFinished running. The error rate is: ", err, ".\n") diff --git a/examples/src/main/r/logistic_regression.R b/examples/src/main/r/logistic_regression.R deleted file mode 100644 index 2a86aa98160d3..0000000000000 --- a/examples/src/main/r/logistic_regression.R +++ /dev/null @@ -1,62 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -library(SparkR) - -args <- commandArgs(trailing = TRUE) - -if (length(args) != 3) { - print("Usage: logistic_regression ") - q("no") -} - -# Initialize Spark context -sc <- sparkR.init(appName = "LogisticRegressionR") -iterations <- as.integer(args[[2]]) -D <- as.integer(args[[3]]) - -readPartition <- function(part){ - part = strsplit(part, " ", fixed = T) - list(matrix(as.numeric(unlist(part)), ncol = length(part[[1]]))) -} - -# Read data points and convert each partition to a matrix -points <- cache(lapplyPartition(textFile(sc, args[[1]]), readPartition)) - -# Initialize w to a random value -w <- runif(n=D, min = -1, max = 1) -cat("Initial w: ", w, "\n") - -# Compute logistic regression gradient for a matrix of data points -gradient <- function(partition) { - partition = partition[[1]] - Y <- partition[, 1] # point labels (first column of input file) - X <- partition[, -1] # point coordinates - - # For each point (x, y), compute gradient function - dot <- X %*% w - logit <- 1 / (1 + exp(-Y * dot)) - grad <- t(X) %*% ((logit - 1) * Y) - list(grad) -} - -for (i in 1:iterations) { - cat("On iteration ", i, "\n") - w <- w - reduce(lapplyPartition(points, gradient), "+") -} - -cat("Final w: ", w, "\n") diff --git a/examples/src/main/r/pi.R b/examples/src/main/r/pi.R deleted file mode 100644 index aa7a833e147a0..0000000000000 --- a/examples/src/main/r/pi.R +++ /dev/null @@ -1,46 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/examples/src/main/r/pi.R b/examples/src/main/r/pi.R
deleted file mode 100644
index aa7a833e147a0..0000000000000
--- a/examples/src/main/r/pi.R
+++ /dev/null
@@ -1,46 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(SparkR)
-
-args <- commandArgs(trailing = TRUE)
-
-sc <- sparkR.init(appName = "PiR")
-
-slices <- ifelse(length(args) > 1, as.integer(args[[2]]), 2)
-
-n <- 100000 * slices
-
-piFunc <- function(elem) {
-  rands <- runif(n = 2, min = -1, max = 1)
-  val <- ifelse((rands[1]^2 + rands[2]^2) < 1, 1.0, 0.0)
-  val
-}
-
-
-piFuncVec <- function(elems) {
-  message(length(elems))
-  rands1 <- runif(n = length(elems), min = -1, max = 1)
-  rands2 <- runif(n = length(elems), min = -1, max = 1)
-  val <- ifelse((rands1^2 + rands2^2) < 1, 1.0, 0.0)
-  sum(val)
-}
-
-rdd <- parallelize(sc, 1:n, slices)
-count <- reduce(lapplyPartition(rdd, piFuncVec), sum)
-cat("Pi is roughly", 4.0 * count / n, "\n")
-cat("Num elements in RDD ", count(rdd), "\n")
diff --git a/examples/src/main/r/wordcount.R b/examples/src/main/r/wordcount.R
deleted file mode 100644
index b734cb0ecf55b..0000000000000
--- a/examples/src/main/r/wordcount.R
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(SparkR)
-
-args <- commandArgs(trailing = TRUE)
-
-if (length(args) != 1) {
-  print("Usage: wordcount <file>")
-  q("no")
-}
-
-# Initialize Spark context
-sc <- sparkR.init(appName = "RwordCount")
-lines <- textFile(sc, args[[1]])
-
-words <- flatMap(lines,
-                 function(line) {
-                   strsplit(line, " ")[[1]]
-                 })
-wordCount <- lapply(words, function(word) { list(word, 1L) })
-
-counts <- reduceByKey(wordCount, "+", 2L)
-output <- collect(counts)
-
-for (wordcount in output) {
-  cat(wordcount[[1]], ": ", wordcount[[2]], "\n")
-}

From 316a5c0423ba3688cacd3acc3c5b5571e8a71d1d Mon Sep 17 00:00:00 2001
From: jerryshao
Date: Wed, 6 May 2015 17:44:43 -0700
Subject: [PATCH 05/48] [SPARK-7396] [STREAMING] [EXAMPLE] Update KafkaWordCountProducer to use new Producer API

Otherwise it will throw an exception:

```
Exception in thread "main" kafka.common.FailedToSendMessageException: Failed to send messages after 3 tries.
    at kafka.producer.async.DefaultEventHandler.handle(DefaultEventHandler.scala:90)
    at kafka.producer.Producer.send(Producer.scala:77)
    at org.apache.spark.examples.streaming.KafkaWordCountProducer$.main(KafkaWordCount.scala:96)
    at org.apache.spark.examples.streaming.KafkaWordCountProducer.main(KafkaWordCount.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:623)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:169)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:192)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:111)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
```

Author: jerryshao

Closes #5936 from jerryshao/SPARK-7396 and squashes the following commits:

270bbe2 [jerryshao] Fix Kafka Produce throw Exception issue
---
 .../examples/streaming/KafkaWordCount.scala   | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
index 387c0e421334b..f407367a54f6c 100644
--- a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
+++ b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala
@@ -17,9 +17,9 @@
 
 package org.apache.spark.examples.streaming
 
-import java.util.Properties
+import java.util.HashMap
 
-import kafka.producer._
+import org.apache.kafka.clients.producer.{ProducerConfig, KafkaProducer, ProducerRecord}
 
 import org.apache.spark.streaming._
 import org.apache.spark.streaming.kafka._
@@ -77,23 +77,25 @@ object KafkaWordCountProducer {
 
     val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args
 
     // Zookeeper connection properties
-    val props = new Properties()
-    props.put("metadata.broker.list", brokers)
-    props.put("serializer.class", "kafka.serializer.StringEncoder")
+    val props = new HashMap[String, Object]()
+    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
+    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
+      "org.apache.kafka.common.serialization.StringSerializer")
+    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
+      "org.apache.kafka.common.serialization.StringSerializer")
 
-    val config = new ProducerConfig(props)
-    val producer = new Producer[String, String](config)
+    val producer = new KafkaProducer[String, String](props)
 
     // Send some messages
     while(true) {
-      val messages = (1 to messagesPerSec.toInt).map { messageNum =>
+      (1 to messagesPerSec.toInt).foreach { messageNum =>
         val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(10).toString)
           .mkString(" ")
 
-        new KeyedMessage[String, String](topic, str)
-      }.toArray
+        val message = new ProducerRecord[String, String](topic, null, str)
+        producer.send(message)
+      }
 
-      producer.send(messages: _*)
       Thread.sleep(100)
   }
 }
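For readers coming from the old `kafka.producer.Producer`, the new-producer pattern this patch adopts can be summarized in a minimal standalone sketch; the broker address `localhost:9092`, topic `test`, and object name `NewProducerSketch` below are placeholder values, not taken from the patch:

```scala
import java.util.HashMap

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

object NewProducerSketch {
  def main(args: Array[String]): Unit = {
    // Configure the new producer; key/value serializers are mandatory settings.
    val props = new HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    // send() is asynchronous and returns a java.util.concurrent.Future;
    // a null key lets the default partitioner spread records across partitions.
    producer.send(new ProducerRecord[String, String]("test", null, "hello"))
    producer.close()
  }
}
```

From 8fa6829f5e6d8b8f33097c3f20fcb96198d4031b Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Wed, 6 May 2015 17:52:34 -0700
Subject: [PATCH 06/48] [SPARK-7371] [SPARK-7377] [SPARK-7408] DAG visualization addendum (#5729)

This is a follow-up patch for #5729.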
**[SPARK-7408]** Move as much style code from JS to CSS as possible **[SPARK-7377]** Fix JS error if a job / stage contains only one RDD **[SPARK-7371]** Decrease emphasis on RDD on stage page as requested by mateiz pwendell This patch also includes general code clean up. Author: Andrew Or Closes #5954 from andrewor14/viz-emphasize-rdd and squashes the following commits: 3c0d4f0 [Andrew Or] Guard against JS error by rendering arrows only if needed f23e15b [Andrew Or] Merge branch 'master' of github.com:apache/spark into viz-emphasize-rdd 565801f [Andrew Or] Clean up code 9dab5f0 [Andrew Or] Move styling from JS to CSS + clean up code 107c0b6 [Andrew Or] Tweak background color, stroke width, font size etc. 1610c62 [Andrew Or] Implement cluster padding for stage page --- .../apache/spark/ui/static/dagre-d3.min.js | 14 +- .../apache/spark/ui/static/spark-dag-viz.css | 83 ++++ .../apache/spark/ui/static/spark-dag-viz.js | 357 ++++++++++-------- .../scala/org/apache/spark/ui/UIUtils.scala | 14 +- .../spark/ui/scope/RDDOperationGraph.scala | 19 +- 5 files changed, 310 insertions(+), 177 deletions(-) create mode 100644 core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css diff --git a/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js b/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js index 6d2da25024a83..6beb7db855ca0 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js +++ b/core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js @@ -1,4 +1,4 @@ -/*v0.4.3 with 1 additional commit (see http://github.com/andrewor14/dagre-d3)*/(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.dagreD3=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o0}},{}],14:[function(require,module,exports){module.exports=intersectNode;function intersectNode(node,point){return node.intersect(point)}},{}],15:[function(require,module,exports){var intersectLine=require("./intersect-line");module.exports=intersectPolygon;function intersectPolygon(node,polyPoints,point){var x1=node.x;var y1=node.y;var intersections=[];var minX=Number.POSITIVE_INFINITY,minY=Number.POSITIVE_INFINITY;polyPoints.forEach(function(entry){minX=Math.min(minX,entry.x);minY=Math.min(minY,entry.y)});var left=x1-node.width/2-minX;var top=y1-node.height/2-minY;for(var i=0;i1){intersections.sort(function(p,q){var pdx=p.x-point.x,pdy=p.y-point.y,distp=Math.sqrt(pdx*pdx+pdy*pdy),qdx=q.x-point.x,qdy=q.y-point.y,distq=Math.sqrt(qdx*qdx+qdy*qdy);return distpMath.abs(dx)*h){if(dy<0){h=-h}sx=dy===0?0:h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=dx===0?0:w*dy/dx}return{x:x+sx,y:y+sy}}},{}],17:[function(require,module,exports){var util=require("../util");module.exports=addHtmlLabel;function addHtmlLabel(root,node){var fo=root.append("foreignObject").attr("width","100000");var div=fo.append("xhtml:div");var label=node.label;switch(typeof 
label){case"function":div.insert(label);break;case"object":div.insert(function(){return label});break;default:div.html(label)}util.applyStyle(div,node.labelStyle);div.style("display","inline-block");div.style("white-space","nowrap");var w,h;div.each(function(){w=this.clientWidth;h=this.clientHeight});fo.attr("width",w).attr("height",h);return fo}},{"../util":25}],18:[function(require,module,exports){var addTextLabel=require("./add-text-label"),addHtmlLabel=require("./add-html-label");module.exports=addLabel;function addLabel(root,node){var label=node.label;var labelSvg=root.append("g");if(typeof label!=="string"||node.labelType==="html"){addHtmlLabel(labelSvg,node)}else{addTextLabel(labelSvg,node)}var labelBBox=labelSvg.node().getBBox();labelSvg.attr("transform","translate("+-labelBBox.width/2+","+-labelBBox.height/2+")");return labelSvg}},{"./add-html-label":17,"./add-text-label":19}],19:[function(require,module,exports){var util=require("../util");module.exports=addTextLabel;function addTextLabel(root,node){var domNode=root.append("text");var lines=processEscapeSequences(node.label).split("\n");for(var i=0;i0;--i){entry=buckets[i].dequeue();if(entry){results=results.concat(removeNode(g,buckets,zeroIdx,entry,true));break}}}}return results}function removeNode(g,buckets,zeroIdx,entry,collectPredecessors){var results=collectPredecessors?[]:undefined;_.each(g.inEdges(entry.v),function(edge){var weight=g.edge(edge),uEntry=g.node(edge.v);if(collectPredecessors){results.push({v:edge.v,w:edge.w})}uEntry.out-=weight;assignBucket(buckets,zeroIdx,uEntry)});_.each(g.outEdges(entry.v),function(edge){var weight=g.edge(edge),w=edge.w,wEntry=g.node(w);wEntry["in"]-=weight;assignBucket(buckets,zeroIdx,wEntry)});g.removeNode(entry.v);return results}function buildState(g,weightFn){var fasGraph=new Graph,maxIn=0,maxOut=0;_.each(g.nodes(),function(v){fasGraph.setNode(v,{v:v,"in":0,out:0})});_.each(g.edges(),function(e){var prevWeight=fasGraph.edge(e.v,e.w)||0,weight=weightFn(e),edgeWeight=prevWeight+weight;fasGraph.setEdge(e.v,e.w,edgeWeight);maxOut=Math.max(maxOut,fasGraph.node(e.v).out+=weight);maxIn=Math.max(maxIn,fasGraph.node(e.w)["in"]+=weight)});var buckets=_.range(maxOut+maxIn+3).map(function(){return new List});var zeroIdx=maxIn+1;_.each(fasGraph.nodes(),function(v){assignBucket(buckets,zeroIdx,fasGraph.node(v))});return{graph:fasGraph,buckets:buckets,zeroIdx:zeroIdx}}function assignBucket(buckets,zeroIdx,entry){if(!entry.out){buckets[0].enqueue(entry)}else if(!entry["in"]){buckets[buckets.length-1].enqueue(entry)}else{buckets[entry.out-entry["in"]+zeroIdx].enqueue(entry)}}},{"./data/list":31,"./graphlib":33,"./lodash":36}],35:[function(require,module,exports){"use strict";var _=require("./lodash"),acyclic=require("./acyclic"),normalize=require("./normalize"),rank=require("./rank"),normalizeRanks=require("./util").normalizeRanks,parentDummyChains=require("./parent-dummy-chains"),removeEmptyRanks=require("./util").removeEmptyRanks,nestingGraph=require("./nesting-graph"),addBorderSegments=require("./add-border-segments"),coordinateSystem=require("./coordinate-system"),order=require("./order"),position=require("./position"),util=require("./util"),Graph=require("./graphlib").Graph;module.exports=layout;function layout(g,opts){var time=opts&&opts.debugTiming?util.time:util.notime;time("layout",function(){var layoutGraph=time(" buildLayoutGraph",function(){return buildLayoutGraph(g)});time(" runLayout",function(){runLayout(layoutGraph,time)});time(" 
updateInputGraph",function(){updateInputGraph(g,layoutGraph)})})}function runLayout(g,time){time(" makeSpaceForEdgeLabels",function(){makeSpaceForEdgeLabels(g)});time(" removeSelfEdges",function(){removeSelfEdges(g)});time(" acyclic",function(){acyclic.run(g)});time(" nestingGraph.run",function(){nestingGraph.run(g)});time(" rank",function(){rank(util.asNonCompoundGraph(g))});time(" injectEdgeLabelProxies",function(){injectEdgeLabelProxies(g)});time(" removeEmptyRanks",function(){removeEmptyRanks(g)});time(" nestingGraph.cleanup",function(){nestingGraph.cleanup(g)});time(" normalizeRanks",function(){normalizeRanks(g)});time(" assignRankMinMax",function(){assignRankMinMax(g)});time(" removeEdgeLabelProxies",function(){removeEdgeLabelProxies(g)});time(" normalize.run",function(){normalize.run(g)});time(" parentDummyChains",function(){parentDummyChains(g)});time(" addBorderSegments",function(){addBorderSegments(g)});time(" order",function(){order(g)});time(" insertSelfEdges",function(){insertSelfEdges(g)});time(" adjustCoordinateSystem",function(){coordinateSystem.adjust(g)});time(" position",function(){position(g)});time(" positionSelfEdges",function(){positionSelfEdges(g)});time(" removeBorderNodes",function(){removeBorderNodes(g)});time(" normalize.undo",function(){normalize.undo(g)});time(" fixupEdgeLabelCoords",function(){fixupEdgeLabelCoords(g)});time(" undoCoordinateSystem",function(){coordinateSystem.undo(g)});time(" translateGraph",function(){translateGraph(g)});time(" assignNodeIntersects",function(){assignNodeIntersects(g)});time(" reversePoints",function(){reversePointsForReversedEdges(g)});time(" acyclic.undo",function(){acyclic.undo(g)})}function updateInputGraph(inputGraph,layoutGraph){_.each(inputGraph.nodes(),function(v){var inputLabel=inputGraph.node(v),layoutLabel=layoutGraph.node(v);if(inputLabel){inputLabel.x=layoutLabel.x;inputLabel.y=layoutLabel.y;if(layoutGraph.children(v).length){inputLabel.width=layoutLabel.width;inputLabel.height=layoutLabel.height}}});_.each(inputGraph.edges(),function(e){var inputLabel=inputGraph.edge(e),layoutLabel=layoutGraph.edge(e);inputLabel.points=layoutLabel.points;if(_.has(layoutLabel,"x")){inputLabel.x=layoutLabel.x;inputLabel.y=layoutLabel.y}});inputGraph.graph().width=layoutGraph.graph().width;inputGraph.graph().height=layoutGraph.graph().height}var graphNumAttrs=["nodesep","edgesep","ranksep","marginx","marginy"],graphDefaults={ranksep:50,edgesep:20,nodesep:50,rankdir:"tb"},graphAttrs=["acyclicer","ranker","rankdir","align"],nodeNumAttrs=["width","height"],nodeDefaults={width:0,height:0},edgeNumAttrs=["minlen","weight","width","height","labeloffset"],edgeDefaults={minlen:1,weight:1,width:0,height:0,labeloffset:10,labelpos:"r"},edgeAttrs=["labelpos"];function buildLayoutGraph(inputGraph){var g=new Graph({multigraph:true,compound:true}),graph=canonicalize(inputGraph.graph());g.setGraph(_.merge({},graphDefaults,selectNumberAttrs(graph,graphNumAttrs),_.pick(graph,graphAttrs)));_.each(inputGraph.nodes(),function(v){var node=canonicalize(inputGraph.node(v));g.setNode(v,_.defaults(selectNumberAttrs(node,nodeNumAttrs),nodeDefaults));g.setParent(v,inputGraph.parent(v))});_.each(inputGraph.edges(),function(e){var edge=canonicalize(inputGraph.edge(e));g.setEdge(e,_.merge({},edgeDefaults,selectNumberAttrs(edge,edgeNumAttrs),_.pick(edge,edgeAttrs)))});return g}function makeSpaceForEdgeLabels(g){var graph=g.graph(); 
+module.exports={graphlib:require("./lib/graphlib"),dagre:require("./lib/dagre"),intersect:require("./lib/intersect"),render:require("./lib/render"),util:require("./lib/util"),version:require("./lib/version")}},{"./lib/dagre":8,"./lib/graphlib":9,"./lib/intersect":10,"./lib/render":23,"./lib/util":25,"./lib/version":26}],2:[function(require,module,exports){var util=require("./util");module.exports={"default":normal,normal:normal,vee:vee,undirected:undirected};function normal(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 0 L 10 5 L 0 10 z").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}function vee(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 0 L 10 5 L 0 10 L 4 5 z").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}function undirected(parent,id,edge,type){var marker=parent.append("marker").attr("id",id).attr("viewBox","0 0 10 10").attr("refX",9).attr("refY",5).attr("markerUnits","strokeWidth").attr("markerWidth",8).attr("markerHeight",6).attr("orient","auto");var path=marker.append("path").attr("d","M 0 5 L 10 5").style("stroke-width",1).style("stroke-dasharray","1,0");util.applyStyle(path,edge[type+"Style"])}},{"./util":25}],3:[function(require,module,exports){var util=require("./util");module.exports=createClusters;function createClusters(selection,g){var clusters=g.nodes().filter(function(v){return util.isSubgraph(g,v)}),svgClusters=selection.selectAll("g.cluster").data(clusters,function(v){return v});svgClusters.enter().append("g").attr("id",function(v){return"cluster_"+v.replace(/^cluster/,"")}).attr("name",function(v){return g.node(v).label}).attr("class","cluster").style("opacity",0).append("rect");util.applyTransition(svgClusters.exit(),g).style("opacity",0).remove();util.applyTransition(svgClusters,g).style("opacity",1);util.applyTransition(svgClusters.selectAll("rect"),g).attr("width",function(v){var node=g.node(v);return node.width+node.paddingLeft+node.paddingRight}).attr("height",function(v){var node=g.node(v);return node.height+node.paddingTop+node.paddingBottom}).attr("x",function(v){var node=g.node(v);return node.x-node.width/2-node.paddingLeft}).attr("y",function(v){var node=g.node(v);return node.y-node.height/2-node.paddingTop})}},{"./util":25}],4:[function(require,module,exports){"use strict";var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util"),d3=require("./d3");module.exports=createEdgeLabels;function createEdgeLabels(selection,g){var svgEdgeLabels=selection.selectAll("g.edgeLabel").data(g.edges(),function(e){return util.edgeToId(e)}).classed("update",true);svgEdgeLabels.selectAll("*").remove();svgEdgeLabels.enter().append("g").classed("edgeLabel",true).style("opacity",0);svgEdgeLabels.each(function(e){var 
edge=g.edge(e),label=addLabel(d3.select(this),g.edge(e),0,0).classed("label",true),bbox=label.node().getBBox();if(edge.labelId){label.attr("id",edge.labelId)}if(!_.has(edge,"width")){edge.width=bbox.width}if(!_.has(edge,"height")){edge.height=bbox.height}});util.applyTransition(svgEdgeLabels.exit(),g).style("opacity",0).remove();return svgEdgeLabels}},{"./d3":7,"./label/add-label":18,"./lodash":20,"./util":25}],5:[function(require,module,exports){"use strict";var _=require("./lodash"),intersectNode=require("./intersect/intersect-node"),util=require("./util"),d3=require("./d3");module.exports=createEdgePaths;function createEdgePaths(selection,g,arrows){var svgPaths=selection.selectAll("g.edgePath").data(g.edges(),function(e){return util.edgeToId(e)}).classed("update",true);enter(svgPaths,g);exit(svgPaths,g);util.applyTransition(svgPaths,g).style("opacity",1);svgPaths.each(function(e){var domEdge=d3.select(this);var edge=g.edge(e);edge.elem=this;if(edge.id){domEdge.attr("id",edge.id)}util.applyClass(domEdge,edge["class"],(domEdge.classed("update")?"update ":"")+"edgePath")});svgPaths.selectAll("path.path").each(function(e){var edge=g.edge(e);edge.arrowheadId=_.uniqueId("arrowhead");var domEdge=d3.select(this).attr("marker-end",function(){return"url(#"+edge.arrowheadId+")"}).style("fill","none");util.applyTransition(domEdge,g).attr("d",function(e){return calcPoints(g,e)});util.applyStyle(domEdge,edge.style)});svgPaths.selectAll("defs *").remove();svgPaths.selectAll("defs").each(function(e){var edge=g.edge(e),arrowhead=arrows[edge.arrowhead];arrowhead(d3.select(this),edge.arrowheadId,edge,"arrowhead")});return svgPaths}function calcPoints(g,e){var edge=g.edge(e),tail=g.node(e.v),head=g.node(e.w),points=edge.points.slice(1,edge.points.length-1);points.unshift(intersectNode(tail,points[0]));points.push(intersectNode(head,points[points.length-1]));return createLine(edge,points)}function createLine(edge,points){var line=d3.svg.line().x(function(d){return d.x}).y(function(d){return d.y});if(_.has(edge,"lineInterpolate")){line.interpolate(edge.lineInterpolate)}if(_.has(edge,"lineTension")){line.tension(Number(edge.lineTension))}return line(points)}function getCoords(elem){var bbox=elem.getBBox(),matrix=elem.getTransformToElement(elem.ownerSVGElement).translate(bbox.width/2,bbox.height/2);return{x:matrix.e,y:matrix.f}}function enter(svgPaths,g){var svgPathsEnter=svgPaths.enter().append("g").attr("class","edgePath").style("opacity",0);svgPathsEnter.append("path").attr("class","path").attr("d",function(e){var edge=g.edge(e),sourceElem=g.node(e.v).elem,points=_.range(edge.points.length).map(function(){return getCoords(sourceElem)});return createLine(edge,points)});svgPathsEnter.append("defs")}function exit(svgPaths,g){var svgPathExit=svgPaths.exit();util.applyTransition(svgPathExit,g).style("opacity",0).remove();util.applyTransition(svgPathExit.select("path.path"),g).attr("d",function(e){var source=g.node(e.v);if(source){var points=_.range(this.pathSegList.length).map(function(){return source});return createLine({},points)}else{return d3.select(this).attr("d")}})}},{"./d3":7,"./intersect/intersect-node":14,"./lodash":20,"./util":25}],6:[function(require,module,exports){"use strict";var _=require("./lodash"),addLabel=require("./label/add-label"),util=require("./util"),d3=require("./d3");module.exports=createNodes;function createNodes(selection,g,shapes){var simpleNodes=g.nodes().filter(function(v){return!util.isSubgraph(g,v)});var svgNodes=selection.selectAll("g.node").data(simpleNodes,function(v){return 
v}).classed("update",true);svgNodes.selectAll("*").remove();svgNodes.enter().append("g").attr("id",function(v){return"node_"+v}).attr("name",function(v){return g.node(v).label}).attr("class","node").style("opacity",0);svgNodes.each(function(v){var node=g.node(v),thisGroup=d3.select(this),labelGroup=thisGroup.append("g").attr("class","label"),labelDom=addLabel(labelGroup,node),shape=shapes[node.shape],bbox=_.pick(labelDom.node().getBBox(),"width","height");node.elem=this;if(node.id){thisGroup.attr("id",node.id)}if(node.labelId){labelGroup.attr("id",node.labelId)}util.applyClass(thisGroup,node["class"],(thisGroup.classed("update")?"update ":"")+"node");if(_.has(node,"width")){bbox.width=node.width}if(_.has(node,"height")){bbox.height=node.height}bbox.width+=node.paddingLeft+node.paddingRight;bbox.height+=node.paddingTop+node.paddingBottom;labelGroup.attr("transform","translate("+(node.paddingLeft-node.paddingRight)/2+","+(node.paddingTop-node.paddingBottom)/2+")");var shapeSvg=shape(d3.select(this),bbox,node);util.applyStyle(shapeSvg,node.style);var shapeBBox=shapeSvg.node().getBBox();node.width=shapeBBox.width;node.height=shapeBBox.height});util.applyTransition(svgNodes.exit(),g).style("opacity",0).remove();return svgNodes}},{"./d3":7,"./label/add-label":18,"./lodash":20,"./util":25}],7:[function(require,module,exports){module.exports=window.d3},{}],8:[function(require,module,exports){var dagre;if(require){try{dagre=require("dagre")}catch(e){}}if(!dagre){dagre=window.dagre}module.exports=dagre},{dagre:27}],9:[function(require,module,exports){var graphlib;if(require){try{graphlib=require("graphlib")}catch(e){}}if(!graphlib){graphlib=window.graphlib}module.exports=graphlib},{graphlib:57}],10:[function(require,module,exports){module.exports={node:require("./intersect-node"),circle:require("./intersect-circle"),ellipse:require("./intersect-ellipse"),polygon:require("./intersect-polygon"),rect:require("./intersect-rect")}},{"./intersect-circle":11,"./intersect-ellipse":12,"./intersect-node":14,"./intersect-polygon":15,"./intersect-rect":16}],11:[function(require,module,exports){var intersectEllipse=require("./intersect-ellipse");module.exports=intersectCircle;function intersectCircle(node,rx,point){return intersectEllipse(node,rx,rx,point)}},{"./intersect-ellipse":12}],12:[function(require,module,exports){module.exports=intersectEllipse;function intersectEllipse(node,rx,ry,point){var cx=node.x;var cy=node.y;var px=cx-point.x;var py=cy-point.y;var det=Math.sqrt(rx*rx*py*py+ry*ry*px*px);var dx=Math.abs(rx*ry*px/det);if(point.x0}},{}],14:[function(require,module,exports){module.exports=intersectNode;function intersectNode(node,point){return node.intersect(point)}},{}],15:[function(require,module,exports){var intersectLine=require("./intersect-line");module.exports=intersectPolygon;function intersectPolygon(node,polyPoints,point){var x1=node.x;var y1=node.y;var intersections=[];var minX=Number.POSITIVE_INFINITY,minY=Number.POSITIVE_INFINITY;polyPoints.forEach(function(entry){minX=Math.min(minX,entry.x);minY=Math.min(minY,entry.y)});var left=x1-node.width/2-minX;var top=y1-node.height/2-minY;for(var i=0;i1){intersections.sort(function(p,q){var pdx=p.x-point.x,pdy=p.y-point.y,distp=Math.sqrt(pdx*pdx+pdy*pdy),qdx=q.x-point.x,qdy=q.y-point.y,distq=Math.sqrt(qdx*qdx+qdy*qdy);return distpMath.abs(dx)*h){if(dy<0){h=-h}sx=dy===0?0:h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=dx===0?0:w*dy/dx}return{x:x+sx,y:y+sy}}},{}],17:[function(require,module,exports){var 
util=require("../util");module.exports=addHtmlLabel;function addHtmlLabel(root,node){var fo=root.append("foreignObject").attr("width","100000");var div=fo.append("xhtml:div");var label=node.label;switch(typeof label){case"function":div.insert(label);break;case"object":div.insert(function(){return label});break;default:div.html(label)}util.applyStyle(div,node.labelStyle);div.style("display","inline-block");div.style("white-space","nowrap");var w,h;div.each(function(){w=this.clientWidth;h=this.clientHeight});fo.attr("width",w).attr("height",h);return fo}},{"../util":25}],18:[function(require,module,exports){var addTextLabel=require("./add-text-label"),addHtmlLabel=require("./add-html-label");module.exports=addLabel;function addLabel(root,node){var label=node.label;var labelSvg=root.append("g");if(typeof label!=="string"||node.labelType==="html"){addHtmlLabel(labelSvg,node)}else{addTextLabel(labelSvg,node)}var labelBBox=labelSvg.node().getBBox();labelSvg.attr("transform","translate("+-labelBBox.width/2+","+-labelBBox.height/2+")");return labelSvg}},{"./add-html-label":17,"./add-text-label":19}],19:[function(require,module,exports){var util=require("../util");module.exports=addTextLabel;function addTextLabel(root,node){var domNode=root.append("text");var lines=processEscapeSequences(node.label).split("\n");for(var i=0;i0;--i){entry=buckets[i].dequeue();if(entry){results=results.concat(removeNode(g,buckets,zeroIdx,entry,true));break}}}}return results}function removeNode(g,buckets,zeroIdx,entry,collectPredecessors){var results=collectPredecessors?[]:undefined;_.each(g.inEdges(entry.v),function(edge){var weight=g.edge(edge),uEntry=g.node(edge.v);if(collectPredecessors){results.push({v:edge.v,w:edge.w})}uEntry.out-=weight;assignBucket(buckets,zeroIdx,uEntry)});_.each(g.outEdges(entry.v),function(edge){var weight=g.edge(edge),w=edge.w,wEntry=g.node(w);wEntry["in"]-=weight;assignBucket(buckets,zeroIdx,wEntry)});g.removeNode(entry.v);return results}function buildState(g,weightFn){var fasGraph=new Graph,maxIn=0,maxOut=0;_.each(g.nodes(),function(v){fasGraph.setNode(v,{v:v,"in":0,out:0})});_.each(g.edges(),function(e){var prevWeight=fasGraph.edge(e.v,e.w)||0,weight=weightFn(e),edgeWeight=prevWeight+weight;fasGraph.setEdge(e.v,e.w,edgeWeight);maxOut=Math.max(maxOut,fasGraph.node(e.v).out+=weight);maxIn=Math.max(maxIn,fasGraph.node(e.w)["in"]+=weight)});var buckets=_.range(maxOut+maxIn+3).map(function(){return new List});var zeroIdx=maxIn+1;_.each(fasGraph.nodes(),function(v){assignBucket(buckets,zeroIdx,fasGraph.node(v))});return{graph:fasGraph,buckets:buckets,zeroIdx:zeroIdx}}function assignBucket(buckets,zeroIdx,entry){if(!entry.out){buckets[0].enqueue(entry)}else if(!entry["in"]){buckets[buckets.length-1].enqueue(entry)}else{buckets[entry.out-entry["in"]+zeroIdx].enqueue(entry)}}},{"./data/list":31,"./graphlib":33,"./lodash":36}],35:[function(require,module,exports){"use strict";var _=require("./lodash"),acyclic=require("./acyclic"),normalize=require("./normalize"),rank=require("./rank"),normalizeRanks=require("./util").normalizeRanks,parentDummyChains=require("./parent-dummy-chains"),removeEmptyRanks=require("./util").removeEmptyRanks,nestingGraph=require("./nesting-graph"),addBorderSegments=require("./add-border-segments"),coordinateSystem=require("./coordinate-system"),order=require("./order"),position=require("./position"),util=require("./util"),Graph=require("./graphlib").Graph;module.exports=layout;function layout(g,opts){var 
time=opts&&opts.debugTiming?util.time:util.notime;time("layout",function(){var layoutGraph=time(" buildLayoutGraph",function(){return buildLayoutGraph(g)});time(" runLayout",function(){runLayout(layoutGraph,time)});time(" updateInputGraph",function(){updateInputGraph(g,layoutGraph)})})}function runLayout(g,time){time(" makeSpaceForEdgeLabels",function(){makeSpaceForEdgeLabels(g)});time(" removeSelfEdges",function(){removeSelfEdges(g)});time(" acyclic",function(){acyclic.run(g)});time(" nestingGraph.run",function(){nestingGraph.run(g)});time(" rank",function(){rank(util.asNonCompoundGraph(g))});time(" injectEdgeLabelProxies",function(){injectEdgeLabelProxies(g)});time(" removeEmptyRanks",function(){removeEmptyRanks(g)});time(" nestingGraph.cleanup",function(){nestingGraph.cleanup(g)});time(" normalizeRanks",function(){normalizeRanks(g)});time(" assignRankMinMax",function(){assignRankMinMax(g)});time(" removeEdgeLabelProxies",function(){removeEdgeLabelProxies(g)});time(" normalize.run",function(){normalize.run(g)});time(" parentDummyChains",function(){parentDummyChains(g)});time(" addBorderSegments",function(){addBorderSegments(g)});time(" order",function(){order(g)});time(" insertSelfEdges",function(){insertSelfEdges(g)});time(" adjustCoordinateSystem",function(){coordinateSystem.adjust(g)});time(" position",function(){position(g)});time(" positionSelfEdges",function(){positionSelfEdges(g)});time(" removeBorderNodes",function(){removeBorderNodes(g)});time(" normalize.undo",function(){normalize.undo(g)});time(" fixupEdgeLabelCoords",function(){fixupEdgeLabelCoords(g)});time(" undoCoordinateSystem",function(){coordinateSystem.undo(g)});time(" translateGraph",function(){translateGraph(g)});time(" assignNodeIntersects",function(){assignNodeIntersects(g)});time(" reversePoints",function(){reversePointsForReversedEdges(g)});time(" acyclic.undo",function(){acyclic.undo(g)})}function updateInputGraph(inputGraph,layoutGraph){_.each(inputGraph.nodes(),function(v){var inputLabel=inputGraph.node(v),layoutLabel=layoutGraph.node(v);if(inputLabel){inputLabel.x=layoutLabel.x;inputLabel.y=layoutLabel.y;if(layoutGraph.children(v).length){inputLabel.width=layoutLabel.width;inputLabel.height=layoutLabel.height}}});_.each(inputGraph.edges(),function(e){var inputLabel=inputGraph.edge(e),layoutLabel=layoutGraph.edge(e);inputLabel.points=layoutLabel.points;if(_.has(layoutLabel,"x")){inputLabel.x=layoutLabel.x;inputLabel.y=layoutLabel.y}});inputGraph.graph().width=layoutGraph.graph().width;inputGraph.graph().height=layoutGraph.graph().height}var graphNumAttrs=["nodesep","edgesep","ranksep","marginx","marginy"],graphDefaults={ranksep:50,edgesep:20,nodesep:50,rankdir:"tb"},graphAttrs=["acyclicer","ranker","rankdir","align"],nodeNumAttrs=["width","height"],nodeDefaults={width:0,height:0},edgeNumAttrs=["minlen","weight","width","height","labeloffset"],edgeDefaults={minlen:1,weight:1,width:0,height:0,labeloffset:10,labelpos:"r"},edgeAttrs=["labelpos"];function buildLayoutGraph(inputGraph){var g=new Graph({multigraph:true,compound:true}),graph=canonicalize(inputGraph.graph());g.setGraph(_.merge({},graphDefaults,selectNumberAttrs(graph,graphNumAttrs),_.pick(graph,graphAttrs))); -graph.ranksep/=2;_.each(g.edges(),function(e){var edge=g.edge(e);edge.minlen*=2;if(edge.labelpos.toLowerCase()!=="c"){if(graph.rankdir==="TB"||graph.rankdir==="BT"){edge.width+=edge.labeloffset}else{edge.height+=edge.labeloffset}}})}function injectEdgeLabelProxies(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.width&&edge.height){var 
v=g.node(e.v),w=g.node(e.w),label={rank:(w.rank-v.rank)/2+v.rank,e:e};util.addDummyNode(g,"edge-proxy",label,"_ep")}})}function assignRankMinMax(g){var maxRank=0;_.each(g.nodes(),function(v){var node=g.node(v);if(node.borderTop){node.minRank=g.node(node.borderTop).rank;node.maxRank=g.node(node.borderBottom).rank;maxRank=_.max(maxRank,node.maxRank)}});g.graph().maxRank=maxRank}function removeEdgeLabelProxies(g){_.each(g.nodes(),function(v){var node=g.node(v);if(node.dummy==="edge-proxy"){g.edge(node.e).labelRank=node.rank;g.removeNode(v)}})}function translateGraph(g){var minX=Number.POSITIVE_INFINITY,maxX=0,minY=Number.POSITIVE_INFINITY,maxY=0,graphLabel=g.graph(),marginX=graphLabel.marginx||0,marginY=graphLabel.marginy||0;function getExtremes(attrs){var x=attrs.x,y=attrs.y,w=attrs.width,h=attrs.height;minX=Math.min(minX,x-w/2);maxX=Math.max(maxX,x+w/2);minY=Math.min(minY,y-h/2);maxY=Math.max(maxY,y+h/2)}_.each(g.nodes(),function(v){getExtremes(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);if(_.has(edge,"x")){getExtremes(edge)}});minX-=marginX;minY-=marginY;_.each(g.nodes(),function(v){var node=g.node(v);node.x-=minX;node.y-=minY});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,function(p){p.x-=minX;p.y-=minY});if(_.has(edge,"x")){edge.x-=minX}if(_.has(edge,"y")){edge.y-=minY}});graphLabel.width=maxX-minX+marginX;graphLabel.height=maxY-minY+marginY}function assignNodeIntersects(g){_.each(g.edges(),function(e){var edge=g.edge(e),nodeV=g.node(e.v),nodeW=g.node(e.w),p1,p2;if(!edge.points){edge.points=[];p1=nodeW;p2=nodeV}else{p1=edge.points[0];p2=edge.points[edge.points.length-1]}edge.points.unshift(util.intersectRect(nodeV,p1));edge.points.push(util.intersectRect(nodeW,p2))})}function fixupEdgeLabelCoords(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(_.has(edge,"x")){if(edge.labelpos==="l"||edge.labelpos==="r"){edge.width-=edge.labeloffset}switch(edge.labelpos){case"l":edge.x-=edge.width/2+edge.labeloffset;break;case"r":edge.x+=edge.width/2+edge.labeloffset;break}}})}function reversePointsForReversedEdges(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.reversed){edge.points.reverse()}})}function removeBorderNodes(g){_.each(g.nodes(),function(v){if(g.children(v).length){var node=g.node(v),t=g.node(node.borderTop),b=g.node(node.borderBottom),l=g.node(_.last(node.borderLeft)),r=g.node(_.last(node.borderRight));node.width=Math.abs(r.x-l.x);node.height=Math.abs(b.y-t.y);node.x=l.x+node.width/2;node.y=t.y+node.height/2}});_.each(g.nodes(),function(v){if(g.node(v).dummy==="border"){g.removeNode(v)}})}function removeSelfEdges(g){_.each(g.edges(),function(e){if(e.v===e.w){var node=g.node(e.v);if(!node.selfEdges){node.selfEdges=[]}node.selfEdges.push({e:e,label:g.edge(e)});g.removeEdge(e)}})}function insertSelfEdges(g){var layers=util.buildLayerMatrix(g);_.each(layers,function(layer){var orderShift=0;_.each(layer,function(v,i){var node=g.node(v);node.order=i+orderShift;_.each(node.selfEdges,function(selfEdge){util.addDummyNode(g,"selfedge",{width:selfEdge.label.width,height:selfEdge.label.height,rank:node.rank,order:i+ ++orderShift,e:selfEdge.e,label:selfEdge.label},"_se")});delete node.selfEdges})})}function positionSelfEdges(g){_.each(g.nodes(),function(v){var node=g.node(v);if(node.dummy==="selfedge"){var 
selfNode=g.node(node.e.v),x=selfNode.x+selfNode.width/2,y=selfNode.y,dx=node.x-x,dy=selfNode.height/2;g.setEdge(node.e,node.label);g.removeNode(v);node.label.points=[{x:x+2*dx/3,y:y-dy},{x:x+5*dx/6,y:y-dy},{x:x+dx,y:y},{x:x+5*dx/6,y:y+dy},{x:x+2*dx/3,y:y+dy}];node.label.x=node.x;node.label.y=node.y}})}function selectNumberAttrs(obj,attrs){return _.mapValues(_.pick(obj,attrs),Number)}function canonicalize(attrs){var newAttrs={};_.each(attrs,function(v,k){newAttrs[k.toLowerCase()]=v});return newAttrs}},{"./acyclic":28,"./add-border-segments":29,"./coordinate-system":30,"./graphlib":33,"./lodash":36,"./nesting-graph":37,"./normalize":38,"./order":43,"./parent-dummy-chains":48,"./position":50,"./rank":52,"./util":55}],36:[function(require,module,exports){arguments[4][20][0].apply(exports,arguments)},{dup:20,lodash:77}],37:[function(require,module,exports){var _=require("./lodash"),util=require("./util");module.exports={run:run,cleanup:cleanup};function run(g){var root=util.addDummyNode(g,"root",{},"_root"),depths=treeDepths(g),height=_.max(depths)-1,nodeSep=2*height+1;g.graph().nestingRoot=root;_.each(g.edges(),function(e){g.edge(e).minlen*=nodeSep});var weight=sumWeights(g)+1;_.each(g.children(),function(child){dfs(g,root,nodeSep,weight,height,depths,child)});g.graph().nodeRankFactor=nodeSep}function dfs(g,root,nodeSep,weight,height,depths,v){var children=g.children(v);if(!children.length){if(v!==root){g.setEdge(root,v,{weight:0,minlen:nodeSep})}return}var top=util.addBorderNode(g,"_bt"),bottom=util.addBorderNode(g,"_bb"),label=g.node(v);g.setParent(top,v);label.borderTop=top;g.setParent(bottom,v);label.borderBottom=bottom;_.each(children,function(child){dfs(g,root,nodeSep,weight,height,depths,child);var childNode=g.node(child),childTop=childNode.borderTop?childNode.borderTop:child,childBottom=childNode.borderBottom?childNode.borderBottom:child,thisWeight=childNode.borderTop?weight:2*weight,minlen=childTop!==childBottom?1:height-depths[v]+1;g.setEdge(top,childTop,{weight:thisWeight,minlen:minlen,nestingEdge:true});g.setEdge(childBottom,bottom,{weight:thisWeight,minlen:minlen,nestingEdge:true})});if(!g.parent(v)){g.setEdge(root,top,{weight:0,minlen:height+depths[v]})}}function treeDepths(g){var depths={};function dfs(v,depth){var children=g.children(v);if(children&&children.length){_.each(children,function(child){dfs(child,depth+1)})}depths[v]=depth}_.each(g.children(),function(v){dfs(v,1)});return depths}function sumWeights(g){return _.reduce(g.edges(),function(acc,e){return acc+g.edge(e).weight},0)}function cleanup(g){var graphLabel=g.graph();g.removeNode(graphLabel.nestingRoot);delete graphLabel.nestingRoot;_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.nestingEdge){g.removeEdge(e)}})}},{"./lodash":36,"./util":55}],38:[function(require,module,exports){"use strict";var _=require("./lodash"),util=require("./util");module.exports={run:run,undo:undo};function run(g){g.graph().dummyChains=[];_.each(g.edges(),function(edge){normalizeEdge(g,edge)})}function normalizeEdge(g,e){var v=e.v,vRank=g.node(v).rank,w=e.w,wRank=g.node(w).rank,name=e.name,edgeLabel=g.edge(e),labelRank=edgeLabel.labelRank;if(wRank===vRank+1)return;g.removeEdge(e);var dummy,attrs,i;for(i=0,++vRank;vRank0){if(index%2){weightSum+=tree[index+1]}index=index-1>>1;tree[index]+=entry.weight}cc+=entry.weight*weightSum}));return cc}},{"../lodash":36}],43:[function(require,module,exports){"use strict";var 
_=require("../lodash"),initOrder=require("./init-order"),crossCount=require("./cross-count"),sortSubgraph=require("./sort-subgraph"),buildLayerGraph=require("./build-layer-graph"),addSubgraphConstraints=require("./add-subgraph-constraints"),Graph=require("../graphlib").Graph,util=require("../util");module.exports=order;function order(g){var maxRank=util.maxRank(g),downLayerGraphs=buildLayerGraphs(g,_.range(1,maxRank+1),"inEdges"),upLayerGraphs=buildLayerGraphs(g,_.range(maxRank-1,-1,-1),"outEdges");var layering=initOrder(g);assignOrder(g,layering);var bestCC=Number.POSITIVE_INFINITY,best;for(var i=0,lastBest=0;lastBest<4;++i,++lastBest){sweepLayerGraphs(i%2?downLayerGraphs:upLayerGraphs,i%4>=2);layering=util.buildLayerMatrix(g);var cc=crossCount(g,layering);if(cc=vEntry.barycenter){mergeEntries(vEntry,uEntry)}}}function handleOut(vEntry){return function(wEntry){wEntry["in"].push(vEntry);if(--wEntry.indegree===0){sourceSet.push(wEntry)}}}while(sourceSet.length){var entry=sourceSet.pop();entries.push(entry);_.each(entry["in"].reverse(),handleIn(entry));_.each(entry.out,handleOut(entry))}return _.chain(entries).filter(function(entry){return!entry.merged}).map(function(entry){return _.pick(entry,["vs","i","barycenter","weight"])}).value()}function mergeEntries(target,source){var sum=0,weight=0;if(target.weight){sum+=target.barycenter*target.weight;weight+=target.weight}if(source.weight){sum+=source.barycenter*source.weight;weight+=source.weight}target.vs=source.vs.concat(target.vs);target.barycenter=sum/weight;target.weight=weight;target.i=Math.min(source.i,target.i);source.merged=true}},{"../lodash":36}],46:[function(require,module,exports){var _=require("../lodash"),barycenter=require("./barycenter"),resolveConflicts=require("./resolve-conflicts"),sort=require("./sort");module.exports=sortSubgraph;function sortSubgraph(g,v,cg,biasRight){var movable=g.children(v),node=g.node(v),bl=node?node.borderLeft:undefined,br=node?node.borderRight:undefined,subgraphs={};if(bl){movable=_.filter(movable,function(w){return w!==bl&&w!==br})}var barycenters=barycenter(g,movable);_.each(barycenters,function(entry){if(g.children(entry.v).length){var subgraphResult=sortSubgraph(g,entry.v,cg,biasRight);subgraphs[entry.v]=subgraphResult;if(_.has(subgraphResult,"barycenter")){mergeBarycenters(entry,subgraphResult)}}});var entries=resolveConflicts(barycenters,cg);expandSubgraphs(entries,subgraphs);var result=sort(entries,biasRight);if(bl){result.vs=_.flatten([bl,result.vs,br],true);if(g.predecessors(bl).length){var blPred=g.node(g.predecessors(bl)[0]),brPred=g.node(g.predecessors(br)[0]);if(!_.has(result,"barycenter")){result.barycenter=0;result.weight=0}result.barycenter=(result.barycenter*result.weight+blPred.order+brPred.order)/(result.weight+2);result.weight+=2}}return result}function expandSubgraphs(entries,subgraphs){_.each(entries,function(entry){entry.vs=_.flatten(entry.vs.map(function(v){if(subgraphs[v]){return subgraphs[v].vs}return v}),true)})}function mergeBarycenters(target,other){if(!_.isUndefined(target.barycenter)){target.barycenter=(target.barycenter*target.weight+other.barycenter*other.weight)/(target.weight+other.weight);target.weight+=other.weight}else{target.barycenter=other.barycenter;target.weight=other.weight}}},{"../lodash":36,"./barycenter":40,"./resolve-conflicts":45,"./sort":47}],47:[function(require,module,exports){var _=require("../lodash"),util=require("../util");module.exports=sort;function sort(entries,biasRight){var parts=util.partition(entries,function(entry){return 
_.has(entry,"barycenter")});var sortable=parts.lhs,unsortable=_.sortBy(parts.rhs,function(entry){return-entry.i}),vs=[],sum=0,weight=0,vsIndex=0;sortable.sort(compareWithBias(!!biasRight));vsIndex=consumeUnsortable(vs,unsortable,vsIndex);_.each(sortable,function(entry){vsIndex+=entry.vs.length;vs.push(entry.vs);sum+=entry.barycenter*entry.weight;weight+=entry.weight;vsIndex=consumeUnsortable(vs,unsortable,vsIndex)});var result={vs:_.flatten(vs,true)};if(weight){result.barycenter=sum/weight;result.weight=weight}return result}function consumeUnsortable(vs,unsortable,index){var last;while(unsortable.length&&(last=_.last(unsortable)).i<=index){unsortable.pop();vs.push(last.vs);index++}return index}function compareWithBias(bias){return function(entryV,entryW){if(entryV.barycenterentryW.barycenter){return 1}return!bias?entryV.i-entryW.i:entryW.i-entryV.i}}},{"../lodash":36,"../util":55}],48:[function(require,module,exports){var _=require("./lodash");module.exports=parentDummyChains;function parentDummyChains(g){var postorderNums=postorder(g);_.each(g.graph().dummyChains,function(v){var node=g.node(v),edgeObj=node.edgeObj,pathData=findPath(g,postorderNums,edgeObj.v,edgeObj.w),path=pathData.path,lca=pathData.lca,pathIdx=0,pathV=path[pathIdx],ascending=true;while(v!==edgeObj.w){node=g.node(v);if(ascending){while((pathV=path[pathIdx])!==lca&&g.node(pathV).maxRanklow||lim>postorderNums[parent].lim));lca=parent;parent=w;while((parent=g.parent(parent))!==lca){wPath.push(parent)}return{path:vPath.concat(wPath.reverse()),lca:lca}}function postorder(g){var result={},lim=0;function dfs(v){var low=lim;_.each(g.children(v),dfs);result[v]={low:low,lim:lim++}}_.each(g.children(),dfs);return result}},{"./lodash":36}],49:[function(require,module,exports){"use strict";var _=require("../lodash"),Graph=require("../graphlib").Graph,util=require("../util");module.exports={positionX:positionX,findType1Conflicts:findType1Conflicts,findType2Conflicts:findType2Conflicts,addConflict:addConflict,hasConflict:hasConflict,verticalAlignment:verticalAlignment,horizontalCompaction:horizontalCompaction,alignCoordinates:alignCoordinates,findSmallestWidthAlignment:findSmallestWidthAlignment,balance:balance};function findType1Conflicts(g,layering){var conflicts={};function visitLayer(prevLayer,layer){var k0=0,scanPos=0,prevLayerLength=prevLayer.length,lastNode=_.last(layer);_.each(layer,function(v,i){var w=findOtherInnerSegmentNode(g,v),k1=w?g.node(w).order:prevLayerLength;if(w||v===lastNode){_.each(layer.slice(scanPos,i+1),function(scanNode){_.each(g.predecessors(scanNode),function(u){var uLabel=g.node(u),uPos=uLabel.order;if((uPosnextNorthBorder)){addConflict(conflicts,u,v)}})}})}function visitLayer(north,south){var prevNorthPos=-1,nextNorthPos,southPos=0;_.each(south,function(v,southLookahead){if(g.node(v).dummy==="border"){var predecessors=g.predecessors(v);if(predecessors.length){nextNorthPos=g.node(predecessors[0]).order;scan(south,southPos,southLookahead,prevNorthPos,nextNorthPos);southPos=southLookahead;prevNorthPos=nextNorthPos}}scan(south,southPos,south.length,nextNorthPos,north.length)});return south}_.reduce(layering,visitLayer);return conflicts}function findOtherInnerSegmentNode(g,v){if(g.node(v).dummy){return _.find(g.predecessors(v),function(u){return g.node(u).dummy})}}function addConflict(conflicts,v,w){if(v>w){var tmp=v;v=w;w=tmp}var conflictsV=conflicts[v];if(!conflictsV){conflicts[v]=conflictsV={}}conflictsV[w]=true}function hasConflict(conflicts,v,w){if(v>w){var tmp=v;v=w;w=tmp}return 
_.has(conflicts[v],w)}function verticalAlignment(g,layering,conflicts,neighborFn){var root={},align={},pos={};_.each(layering,function(layer){_.each(layer,function(v,order){root[v]=v;align[v]=v;pos[v]=order})});_.each(layering,function(layer){var prevIdx=-1;_.each(layer,function(v){var ws=neighborFn(v);if(ws.length){ws=_.sortBy(ws,function(w){return pos[w]});var mp=(ws.length-1)/2;for(var i=Math.floor(mp),il=Math.ceil(mp);i<=il;++i){var w=ws[i];if(align[v]===v&&prevIdxwLabel.lim){tailLabel=wLabel;flip=true}var candidates=_.filter(g.edges(),function(edge){return flip===isDescendant(t,t.node(edge.v),tailLabel)&&flip!==isDescendant(t,t.node(edge.w),tailLabel)});return _.min(candidates,function(edge){return slack(g,edge)})}function exchangeEdges(t,g,e,f){var v=e.v,w=e.w;t.removeEdge(v,w);t.setEdge(f.v,f.w,{});initLowLimValues(t);initCutValues(t,g);updateRanks(t,g)}function updateRanks(t,g){var root=_.find(t.nodes(),function(v){return!g.node(v).parent}),vs=preorder(t,root);vs=vs.slice(1);_.each(vs,function(v){var parent=t.node(v).parent,edge=g.edge(v,parent),flipped=false;if(!edge){edge=g.edge(parent,v);flipped=true}g.node(v).rank=g.node(parent).rank+(flipped?edge.minlen:-edge.minlen)})}function isTreeEdge(tree,u,v){return tree.hasEdge(u,v)}function isDescendant(tree,vLabel,rootLabel){return rootLabel.low<=vLabel.lim&&vLabel.lim<=rootLabel.lim}},{"../graphlib":33,"../lodash":36,"../util":55,"./feasible-tree":51,"./util":54}],54:[function(require,module,exports){"use strict";var _=require("../lodash");module.exports={longestPath:longestPath,slack:slack};function longestPath(g){var visited={};function dfs(v){var label=g.node(v);if(_.has(visited,v)){return label.rank}visited[v]=true;var rank=_.min(_.map(g.outEdges(v),function(e){return dfs(e.w)-g.edge(e).minlen}));if(rank===Number.POSITIVE_INFINITY){rank=0}return label.rank=rank}_.each(g.sources(),dfs)}function slack(g,e){return g.node(e.w).rank-g.node(e.v).rank-g.edge(e).minlen}},{"../lodash":36}],55:[function(require,module,exports){"use strict";var _=require("./lodash"),Graph=require("./graphlib").Graph;module.exports={addDummyNode:addDummyNode,simplify:simplify,asNonCompoundGraph:asNonCompoundGraph,successorWeights:successorWeights,predecessorWeights:predecessorWeights,intersectRect:intersectRect,buildLayerMatrix:buildLayerMatrix,normalizeRanks:normalizeRanks,removeEmptyRanks:removeEmptyRanks,addBorderNode:addBorderNode,maxRank:maxRank,partition:partition,time:time,notime:notime};function addDummyNode(g,type,attrs,name){var v;do{v=_.uniqueId(name)}while(g.hasNode(v));attrs.dummy=type;g.setNode(v,attrs);return v}function simplify(g){var simplified=(new Graph).setGraph(g.graph());_.each(g.nodes(),function(v){simplified.setNode(v,g.node(v))});_.each(g.edges(),function(e){var simpleLabel=simplified.edge(e.v,e.w)||{weight:0,minlen:1},label=g.edge(e);simplified.setEdge(e.v,e.w,{weight:simpleLabel.weight+label.weight,minlen:Math.max(simpleLabel.minlen,label.minlen)})});return simplified}function asNonCompoundGraph(g){ -var simplified=new Graph({multigraph:g.isMultigraph()}).setGraph(g.graph());_.each(g.nodes(),function(v){if(!g.children(v).length){simplified.setNode(v,g.node(v))}});_.each(g.edges(),function(e){simplified.setEdge(e,g.edge(e))});return simplified}function successorWeights(g){var weightMap=_.map(g.nodes(),function(v){var sucs={};_.each(g.outEdges(v),function(e){sucs[e.w]=(sucs[e.w]||0)+g.edge(e).weight});return sucs});return _.zipObject(g.nodes(),weightMap)}function predecessorWeights(g){var weightMap=_.map(g.nodes(),function(v){var 
preds={};_.each(g.inEdges(v),function(e){preds[e.v]=(preds[e.v]||0)+g.edge(e).weight});return preds});return _.zipObject(g.nodes(),weightMap)}function intersectRect(rect,point){var x=rect.x;var y=rect.y;var dx=point.x-x;var dy=point.y-y;var w=rect.width/2;var h=rect.height/2;if(!dx&&!dy){throw new Error("Not possible to find intersection inside of the rectangle")}var sx,sy;if(Math.abs(dy)*w>Math.abs(dx)*h){if(dy<0){h=-h}sx=h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=w*dy/dx}return{x:x+sx,y:y+sy}}function buildLayerMatrix(g){var layering=_.map(_.range(maxRank(g)+1),function(){return[]});_.each(g.nodes(),function(v){var node=g.node(v),rank=node.rank;if(!_.isUndefined(rank)){layering[rank][node.order]=v}});return layering}function normalizeRanks(g){var min=_.min(_.map(g.nodes(),function(v){return g.node(v).rank}));_.each(g.nodes(),function(v){var node=g.node(v);if(_.has(node,"rank")){node.rank-=min}})}function removeEmptyRanks(g){var offset=_.min(_.map(g.nodes(),function(v){return g.node(v).rank}));var layers=[];_.each(g.nodes(),function(v){var rank=g.node(v).rank-offset;if(!_.has(layers,rank)){layers[rank]=[]}layers[rank].push(v)});var delta=0,nodeRankFactor=g.graph().nodeRankFactor;_.each(layers,function(vs,i){if(_.isUndefined(vs)&&i%nodeRankFactor!==0){--delta}else if(delta){_.each(vs,function(v){g.node(v).rank+=delta})}})}function addBorderNode(g,prefix,rank,order){var node={width:0,height:0};if(arguments.length>=4){node.rank=rank;node.order=order}return addDummyNode(g,"border",node,prefix)}function maxRank(g){return _.max(_.map(g.nodes(),function(v){var rank=g.node(v).rank;if(!_.isUndefined(rank)){return rank}}))}function partition(collection,fn){var result={lhs:[],rhs:[]};_.each(collection,function(value){if(fn(value)){result.lhs.push(value)}else{result.rhs.push(value)}});return result}function time(name,fn){var start=_.now();try{return fn()}finally{console.log(name+" time: "+(_.now()-start)+"ms")}}function notime(name,fn){return fn()}},{"./graphlib":33,"./lodash":36}],56:[function(require,module,exports){module.exports="0.7.1"},{}],57:[function(require,module,exports){var lib=require("./lib");module.exports={Graph:lib.Graph,json:require("./lib/json"),alg:require("./lib/alg"),version:lib.version}},{"./lib":73,"./lib/alg":64,"./lib/json":74}],58:[function(require,module,exports){var _=require("../lodash");module.exports=components;function components(g){var visited={},cmpts=[],cmpt;function dfs(v){if(_.has(visited,v))return;visited[v]=true;cmpt.push(v);_.each(g.successors(v),dfs);_.each(g.predecessors(v),dfs)}_.each(g.nodes(),function(v){cmpt=[];dfs(v);if(cmpt.length){cmpts.push(cmpt)}});return cmpts}},{"../lodash":75}],59:[function(require,module,exports){var _=require("../lodash");module.exports=dfs;function dfs(g,vs,order){if(!_.isArray(vs)){vs=[vs]}var acc=[],visited={};_.each(vs,function(v){if(!g.hasNode(v)){throw new Error("Graph does not have node: "+v)}doDfs(g,v,order==="post",visited,acc)});return acc}function doDfs(g,v,postorder,visited,acc){if(!_.has(visited,v)){visited[v]=true;if(!postorder){acc.push(v)}_.each(g.neighbors(v),function(w){doDfs(g,w,postorder,visited,acc)});if(postorder){acc.push(v)}}}},{"../lodash":75}],60:[function(require,module,exports){var dijkstra=require("./dijkstra"),_=require("../lodash");module.exports=dijkstraAll;function dijkstraAll(g,weightFunc,edgeFunc){return _.transform(g.nodes(),function(acc,v){acc[v]=dijkstra(g,v,weightFunc,edgeFunc)},{})}},{"../lodash":75,"./dijkstra":61}],61:[function(require,module,exports){var 
_=require("../lodash"),PriorityQueue=require("../data/priority-queue");module.exports=dijkstra;var DEFAULT_WEIGHT_FUNC=_.constant(1);function dijkstra(g,source,weightFn,edgeFn){return runDijkstra(g,String(source),weightFn||DEFAULT_WEIGHT_FUNC,edgeFn||function(v){return g.outEdges(v)})}function runDijkstra(g,source,weightFn,edgeFn){var results={},pq=new PriorityQueue,v,vEntry;var updateNeighbors=function(edge){var w=edge.v!==v?edge.v:edge.w,wEntry=results[w],weight=weightFn(edge),distance=vEntry.distance+weight;if(weight<0){throw new Error("dijkstra does not allow negative edge weights. "+"Bad edge: "+edge+" Weight: "+weight)}if(distance0){v=pq.removeMin();vEntry=results[v];if(vEntry.distance===Number.POSITIVE_INFINITY){break}edgeFn(v).forEach(updateNeighbors)}return results}},{"../data/priority-queue":71,"../lodash":75}],62:[function(require,module,exports){var _=require("../lodash"),tarjan=require("./tarjan");module.exports=findCycles;function findCycles(g){return _.filter(tarjan(g),function(cmpt){return cmpt.length>1})}},{"../lodash":75,"./tarjan":69}],63:[function(require,module,exports){var _=require("../lodash");module.exports=floydWarshall;var DEFAULT_WEIGHT_FUNC=_.constant(1);function floydWarshall(g,weightFn,edgeFn){return runFloydWarshall(g,weightFn||DEFAULT_WEIGHT_FUNC,edgeFn||function(v){return g.outEdges(v)})}function runFloydWarshall(g,weightFn,edgeFn){var results={},nodes=g.nodes();nodes.forEach(function(v){results[v]={};results[v][v]={distance:0};nodes.forEach(function(w){if(v!==w){results[v][w]={distance:Number.POSITIVE_INFINITY}}});edgeFn(v).forEach(function(edge){var w=edge.v===v?edge.w:edge.v,d=weightFn(edge);results[v][w]={distance:d,predecessor:v}})});nodes.forEach(function(k){var rowK=results[k];nodes.forEach(function(i){var rowI=results[i];nodes.forEach(function(j){var ik=rowI[k];var kj=rowK[j];var ij=rowI[j];var altDistance=ik.distance+kj.distance;if(altDistance0){v=pq.removeMin();if(_.has(parents,v)){result.setEdge(v,parents[v])}else if(init){throw new Error("Input graph is not connected: "+g)}else{init=true}g.nodeEdges(v).forEach(updateNeighbors)}return result}},{"../data/priority-queue":71,"../graph":72,"../lodash":75}],69:[function(require,module,exports){var _=require("../lodash");module.exports=tarjan;function tarjan(g){var index=0,stack=[],visited={},results=[];function dfs(v){var entry=visited[v]={onStack:true,lowlink:index,index:index++};stack.push(v);g.successors(v).forEach(function(w){if(!_.has(visited,w)){dfs(w);entry.lowlink=Math.min(entry.lowlink,visited[w].lowlink)}else if(visited[w].onStack){entry.lowlink=Math.min(entry.lowlink,visited[w].index)}});if(entry.lowlink===entry.index){var cmpt=[],w;do{w=stack.pop();visited[w].onStack=false;cmpt.push(w)}while(v!==w);results.push(cmpt)}}g.nodes().forEach(function(v){if(!_.has(visited,v)){dfs(v)}});return results}},{"../lodash":75}],70:[function(require,module,exports){var _=require("../lodash");module.exports=topsort;topsort.CycleException=CycleException;function topsort(g){var visited={},stack={},results=[];function visit(node){if(_.has(stack,node)){throw new CycleException}if(!_.has(visited,node)){stack[node]=true;visited[node]=true;_.each(g.predecessors(node),visit);delete stack[node];results.push(node)}}_.each(g.sinks(),visit);if(_.size(visited)!==g.nodeCount()){throw new CycleException}return results}function CycleException(){}},{"../lodash":75}],71:[function(require,module,exports){var _=require("../lodash");module.exports=PriorityQueue;function 
PriorityQueue(){this._arr=[];this._keyIndices={}}PriorityQueue.prototype.size=function(){return this._arr.length};PriorityQueue.prototype.keys=function(){return this._arr.map(function(x){return x.key})};PriorityQueue.prototype.has=function(key){return _.has(this._keyIndices,key)};PriorityQueue.prototype.priority=function(key){var index=this._keyIndices[key];if(index!==undefined){return this._arr[index].priority}};PriorityQueue.prototype.min=function(){if(this.size()===0){throw new Error("Queue underflow")}return this._arr[0].key};PriorityQueue.prototype.add=function(key,priority){var keyIndices=this._keyIndices;key=String(key);if(!_.has(keyIndices,key)){var arr=this._arr;var index=arr.length;keyIndices[key]=index;arr.push({key:key,priority:priority});this._decrease(index);return true}return false};PriorityQueue.prototype.removeMin=function(){this._swap(0,this._arr.length-1);var min=this._arr.pop();delete this._keyIndices[min.key];this._heapify(0);return min.key};PriorityQueue.prototype.decrease=function(key,priority){var index=this._keyIndices[key];if(priority>this._arr[index].priority){throw new Error("New priority is greater than current priority. "+"Key: "+key+" Old: "+this._arr[index].priority+" New: "+priority)}this._arr[index].priority=priority;this._decrease(index)};PriorityQueue.prototype._heapify=function(i){var arr=this._arr;var l=2*i,r=l+1,largest=i;if(l>1;if(arr[parent].priority1){this.setNode(v,value)}else{this.setNode(v)}},this);return this};Graph.prototype.setNode=function(v,value){if(_.has(this._nodes,v)){if(arguments.length>1){this._nodes[v]=value}return this}this._nodes[v]=arguments.length>1?value:this._defaultNodeLabelFn(v);if(this._isCompound){this._parent[v]=GRAPH_NODE;this._children[v]={};this._children[GRAPH_NODE][v]=true}this._in[v]={};this._preds[v]={};this._out[v]={};this._sucs[v]={};++this._nodeCount;return this};Graph.prototype.node=function(v){return this._nodes[v]};Graph.prototype.hasNode=function(v){return _.has(this._nodes,v)};Graph.prototype.removeNode=function(v){var self=this;if(_.has(this._nodes,v)){var removeEdge=function(e){self.removeEdge(self._edgeObjs[e])};delete this._nodes[v];if(this._isCompound){this._removeFromParentsChildList(v);delete this._parent[v];_.each(this.children(v),function(child){this.setParent(child)},this);delete this._children[v]}_.each(_.keys(this._in[v]),removeEdge);delete this._in[v];delete this._preds[v];_.each(_.keys(this._out[v]),removeEdge);delete this._out[v];delete this._sucs[v];--this._nodeCount}return this};Graph.prototype.setParent=function(v,parent){if(!this._isCompound){throw new Error("Cannot set parent in a non-compound graph")}if(_.isUndefined(parent)){parent=GRAPH_NODE}else{for(var ancestor=parent;!_.isUndefined(ancestor);ancestor=this.parent(ancestor)){if(ancestor===v){throw new Error("Setting "+parent+" as parent of "+v+" would create create a cycle")}}this.setNode(parent)}this.setNode(v);this._removeFromParentsChildList(v);this._parent[v]=parent;this._children[parent][v]=true;return this};Graph.prototype._removeFromParentsChildList=function(v){delete this._children[this._parent[v]][v]};Graph.prototype.parent=function(v){if(this._isCompound){var parent=this._parent[v];if(parent!==GRAPH_NODE){return parent}}};Graph.prototype.children=function(v){if(_.isUndefined(v)){v=GRAPH_NODE}if(this._isCompound){var children=this._children[v];if(children){return _.keys(children)}}else if(v===GRAPH_NODE){return this.nodes()}else if(this.hasNode(v)){return[]}};Graph.prototype.predecessors=function(v){var 
predsV=this._preds[v];if(predsV){return _.keys(predsV)}};Graph.prototype.successors=function(v){var sucsV=this._sucs[v];if(sucsV){return _.keys(sucsV)}};Graph.prototype.neighbors=function(v){var preds=this.predecessors(v);if(preds){return _.union(preds,this.successors(v))}};Graph.prototype.setDefaultEdgeLabel=function(newDefault){if(!_.isFunction(newDefault)){newDefault=_.constant(newDefault)}this._defaultEdgeLabelFn=newDefault;return this};Graph.prototype.edgeCount=function(){return this._edgeCount};Graph.prototype.edges=function(){return _.values(this._edgeObjs)};Graph.prototype.setPath=function(vs,value){var self=this,args=arguments;_.reduce(vs,function(v,w){if(args.length>1){self.setEdge(v,w,value)}else{self.setEdge(v,w)}return w});return this};Graph.prototype.setEdge=function(){var v,w,name,value,valueSpecified=false;if(_.isPlainObject(arguments[0])){v=arguments[0].v;w=arguments[0].w;name=arguments[0].name;if(arguments.length===2){value=arguments[1];valueSpecified=true}}else{v=arguments[0];w=arguments[1];name=arguments[3];if(arguments.length>2){value=arguments[2];valueSpecified=true}}v=""+v;w=""+w;if(!_.isUndefined(name)){name=""+name}var e=edgeArgsToId(this._isDirected,v,w,name);if(_.has(this._edgeLabels,e)){if(valueSpecified){this._edgeLabels[e]=value}return this}if(!_.isUndefined(name)&&!this._isMultigraph){throw new Error("Cannot set a named edge when isMultigraph = false")}this.setNode(v);this.setNode(w);this._edgeLabels[e]=valueSpecified?value:this._defaultEdgeLabelFn(v,w,name);var edgeObj=edgeArgsToObj(this._isDirected,v,w,name);v=edgeObj.v;w=edgeObj.w;Object.freeze(edgeObj);this._edgeObjs[e]=edgeObj;incrementOrInitEntry(this._preds[w],v);incrementOrInitEntry(this._sucs[v],w);this._in[w][e]=edgeObj;this._out[v][e]=edgeObj;this._edgeCount++;return this};Graph.prototype.edge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name);return this._edgeLabels[e]};Graph.prototype.hasEdge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name);return _.has(this._edgeLabels,e)};Graph.prototype.removeEdge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name),edge=this._edgeObjs[e];if(edge){v=edge.v;w=edge.w;delete this._edgeLabels[e];delete this._edgeObjs[e];decrementOrRemoveEntry(this._preds[w],v);decrementOrRemoveEntry(this._sucs[v],w);delete this._in[w][e];delete this._out[v][e];this._edgeCount--}return this};Graph.prototype.inEdges=function(v,u){var inV=this._in[v];if(inV){var edges=_.values(inV);if(!u){return edges}return _.filter(edges,function(edge){return edge.v===u})}};Graph.prototype.outEdges=function(v,w){var outV=this._out[v];if(outV){var edges=_.values(outV);if(!w){return edges}return _.filter(edges,function(edge){return edge.w===w})}};Graph.prototype.nodeEdges=function(v,w){var inEdges=this.inEdges(v,w);if(inEdges){return inEdges.concat(this.outEdges(v,w))}};function incrementOrInitEntry(map,k){if(_.has(map,k)){map[k]++}else{map[k]=1}}function decrementOrRemoveEntry(map,k){if(!--map[k]){delete map[k]}}function edgeArgsToId(isDirected,v,w,name){if(!isDirected&&v>w){var tmp=v;v=w;w=tmp}return v+EDGE_KEY_DELIM+w+EDGE_KEY_DELIM+(_.isUndefined(name)?DEFAULT_EDGE_NAME:name)}function edgeArgsToObj(isDirected,v,w,name){if(!isDirected&&v>w){var tmp=v;v=w;w=tmp}var edgeObj={v:v,w:w};if(name){edgeObj.name=name}return edgeObj}function 
edgeObjToId(isDirected,edgeObj){return edgeArgsToId(isDirected,edgeObj.v,edgeObj.w,edgeObj.name)}},{"./lodash":75}],73:[function(require,module,exports){module.exports={Graph:require("./graph"),version:require("./version")}},{"./graph":72,"./version":76}],74:[function(require,module,exports){var _=require("./lodash"),Graph=require("./graph");module.exports={write:write,read:read};function write(g){var json={options:{directed:g.isDirected(),multigraph:g.isMultigraph(),compound:g.isCompound()},nodes:writeNodes(g),edges:writeEdges(g)};if(!_.isUndefined(g.graph())){json.value=_.clone(g.graph())}return json}function writeNodes(g){return _.map(g.nodes(),function(v){var nodeValue=g.node(v),parent=g.parent(v),node={v:v};if(!_.isUndefined(nodeValue)){node.value=nodeValue}if(!_.isUndefined(parent)){node.parent=parent}return node})}function writeEdges(g){return _.map(g.edges(),function(e){var edgeValue=g.edge(e),edge={v:e.v,w:e.w};if(!_.isUndefined(e.name)){edge.name=e.name}if(!_.isUndefined(edgeValue)){edge.value=edgeValue}return edge})}function read(json){var g=new Graph(json.options).setGraph(json.value);_.each(json.nodes,function(entry){g.setNode(entry.v,entry.value);if(entry.parent){g.setParent(entry.v,entry.parent)}});_.each(json.edges,function(entry){g.setEdge({v:entry.v,w:entry.w,name:entry.name},entry.value)});return g}},{"./graph":72,"./lodash":75}],75:[function(require,module,exports){arguments[4][20][0].apply(exports,arguments)},{dup:20,lodash:77}],76:[function(require,module,exports){module.exports="1.0.1"},{}],77:[function(require,module,exports){(function(global){(function(){var undefined;var arrayPool=[],objectPool=[];var idCounter=0;var keyPrefix=+new Date+"";var largeArraySize=75;var maxPoolSize=40;var whitespace=" \f \ufeff"+"\n\r\u2028\u2029"+" ᠎              ";var reEmptyStringLeading=/\b__p \+= '';/g,reEmptyStringMiddle=/\b(__p \+=) '' \+/g,reEmptyStringTrailing=/(__e\(.*?\)|\b__t\)) \+\n'';/g;var reEsTemplate=/\$\{([^\\}]*(?:\\.[^\\}]*)*)\}/g;var reFlags=/\w*$/;var reFuncName=/^\s*function[ \n\r\t]+\w/;var reInterpolate=/<%=([\s\S]+?)%>/g;var reLeadingSpacesAndZeros=RegExp("^["+whitespace+"]*0+(?=.$)");var reNoMatch=/($^)/;var reThis=/\bthis\b/;var reUnescapedString=/['\n\r\t\u2028\u2029\\]/g;var contextProps=["Array","Boolean","Date","Function","Math","Number","Object","RegExp","String","_","attachEvent","clearTimeout","isFinite","isNaN","parseInt","setTimeout"];var templateCounter=0;var argsClass="[object Arguments]",arrayClass="[object Array]",boolClass="[object Boolean]",dateClass="[object Date]",funcClass="[object Function]",numberClass="[object Number]",objectClass="[object Object]",regexpClass="[object RegExp]",stringClass="[object String]";var cloneableClasses={};cloneableClasses[funcClass]=false;cloneableClasses[argsClass]=cloneableClasses[arrayClass]=cloneableClasses[boolClass]=cloneableClasses[dateClass]=cloneableClasses[numberClass]=cloneableClasses[objectClass]=cloneableClasses[regexpClass]=cloneableClasses[stringClass]=true;var debounceOptions={leading:false,maxWait:0,trailing:false};var descriptor={configurable:false,enumerable:false,value:null,writable:false};var objectTypes={"boolean":false,"function":true,object:true,number:false,string:false,undefined:false};var stringEscapes={"\\":"\\","'":"'","\n":"n","\r":"r"," ":"t","\u2028":"u2028","\u2029":"u2029"};var root=objectTypes[typeof window]&&window||this;var freeExports=objectTypes[typeof exports]&&exports&&!exports.nodeType&&exports;var freeModule=objectTypes[typeof 
module]&&module&&!module.nodeType&&module;var moduleExports=freeModule&&freeModule.exports===freeExports&&freeExports;var freeGlobal=objectTypes[typeof global]&&global;if(freeGlobal&&(freeGlobal.global===freeGlobal||freeGlobal.window===freeGlobal)){root=freeGlobal}function baseIndexOf(array,value,fromIndex){var index=(fromIndex||0)-1,length=array?array.length:0;while(++index-1?0:-1:cache?0:-1}function cachePush(value){var cache=this.cache,type=typeof value;if(type=="boolean"||value==null){cache[value]=true}else{if(type!="number"&&type!="string"){type="object"}var key=type=="number"?value:keyPrefix+value,typeCache=cache[type]||(cache[type]={});if(type=="object"){(typeCache[key]||(typeCache[key]=[])).push(value)}else{typeCache[key]=true}}}function charAtCallback(value){return value.charCodeAt(0)}function compareAscending(a,b){var ac=a.criteria,bc=b.criteria,index=-1,length=ac.length;while(++indexother||typeof value=="undefined"){return 1}if(value/g,evaluate:/<%([\s\S]+?)%>/g,interpolate:reInterpolate,variable:"",imports:{_:lodash}};function baseBind(bindData){var func=bindData[0],partialArgs=bindData[2],thisArg=bindData[4];function bound(){if(partialArgs){var args=slice(partialArgs);push.apply(args,arguments)}if(this instanceof bound){var thisBinding=baseCreate(func.prototype),result=func.apply(thisBinding,args||arguments);return isObject(result)?result:thisBinding}return func.apply(thisArg,args||arguments)}setBindData(bound,bindData);return bound}function baseClone(value,isDeep,callback,stackA,stackB){if(callback){var result=callback(value);if(typeof result!="undefined"){return result}}var isObj=isObject(value);if(isObj){var className=toString.call(value);if(!cloneableClasses[className]){return value}var ctor=ctorByClass[className];switch(className){case boolClass:case dateClass:return new ctor(+value);case numberClass:case stringClass:return new ctor(value);case regexpClass:result=ctor(value.source,reFlags.exec(value));result.lastIndex=value.lastIndex;return result}}else{return value}var isArr=isArray(value);if(isDeep){var initedStack=!stackA;stackA||(stackA=getArray());stackB||(stackB=getArray());var length=stackA.length;while(length--){if(stackA[length]==value){return stackB[length]}}result=isArr?ctor(value.length):{}}else{result=isArr?slice(value):assign({},value)}if(isArr){if(hasOwnProperty.call(value,"index")){result.index=value.index}if(hasOwnProperty.call(value,"input")){result.input=value.input}}if(!isDeep){return result}stackA.push(value);stackB.push(result);(isArr?forEach:forOwn)(value,function(objValue,key){result[key]=baseClone(objValue,isDeep,callback,stackA,stackB)});if(initedStack){releaseArray(stackA);releaseArray(stackB)}return result}function baseCreate(prototype,properties){return isObject(prototype)?nativeCreate(prototype):{}}if(!nativeCreate){baseCreate=function(){function Object(){}return function(prototype){if(isObject(prototype)){Object.prototype=prototype;var result=new Object;Object.prototype=null}return result||context.Object()}}()}function baseCreateCallback(func,thisArg,argCount){if(typeof func!="function"){return identity}if(typeof thisArg=="undefined"||!("prototype"in func)){return func}var bindData=func.__bindData__;if(typeof bindData=="undefined"){if(support.funcNames){bindData=!func.name}bindData=bindData||!support.funcDecomp;if(!bindData){var 
source=fnToString.call(func);if(!support.funcNames){bindData=!reFuncName.test(source)}if(!bindData){bindData=reThis.test(source);setBindData(func,bindData)}}}if(bindData===false||bindData!==true&&bindData[1]&1){return func}switch(argCount){case 1:return function(value){return func.call(thisArg,value)};case 2:return function(a,b){return func.call(thisArg,a,b)};case 3:return function(value,index,collection){return func.call(thisArg,value,index,collection)};case 4:return function(accumulator,value,index,collection){return func.call(thisArg,accumulator,value,index,collection)}}return bind(func,thisArg)}function baseCreateWrapper(bindData){var func=bindData[0],bitmask=bindData[1],partialArgs=bindData[2],partialRightArgs=bindData[3],thisArg=bindData[4],arity=bindData[5];var isBind=bitmask&1,isBindKey=bitmask&2,isCurry=bitmask&4,isCurryBound=bitmask&8,key=func;function bound(){var thisBinding=isBind?thisArg:this;if(partialArgs){var args=slice(partialArgs);push.apply(args,arguments)}if(partialRightArgs||isCurry){args||(args=slice(arguments));if(partialRightArgs){push.apply(args,partialRightArgs)}if(isCurry&&args.length=largeArraySize&&indexOf===baseIndexOf,result=[];if(isLarge){var cache=createCache(values);if(cache){indexOf=cacheIndexOf;values=cache}else{isLarge=false}}while(++index-1}})}}stackA.pop();stackB.pop();if(initedStack){releaseArray(stackA);releaseArray(stackB)}return result}function baseMerge(object,source,callback,stackA,stackB){(isArray(source)?forEach:forOwn)(source,function(source,key){var found,isArr,result=source,value=object[key];if(source&&((isArr=isArray(source))||isPlainObject(source))){var stackLength=stackA.length;while(stackLength--){if(found=stackA[stackLength]==source){value=stackB[stackLength];break}}if(!found){var isShallow;if(callback){result=callback(value,source);if(isShallow=typeof result!="undefined"){value=result}}if(!isShallow){value=isArr?isArray(value)?value:[]:isPlainObject(value)?value:{}}stackA.push(source);stackB.push(value);if(!isShallow){baseMerge(value,source,callback,stackA,stackB)}}}else{if(callback){result=callback(value,source);if(typeof result=="undefined"){result=source}}if(typeof result!="undefined"){value=result}}object[key]=value})}function baseRandom(min,max){return min+floor(nativeRandom()*(max-min+1))}function baseUniq(array,isSorted,callback){var index=-1,indexOf=getIndexOf(),length=array?array.length:0,result=[];var isLarge=!isSorted&&length>=largeArraySize&&indexOf===baseIndexOf,seen=callback||isLarge?getArray():result;if(isLarge){var cache=createCache(seen);indexOf=cacheIndexOf;seen=cache}while(++index":">",'"':""","'":"'"};var htmlUnescapes=invert(htmlEscapes);var reEscapedHtml=RegExp("("+keys(htmlUnescapes).join("|")+")","g"),reUnescapedHtml=RegExp("["+keys(htmlEscapes).join("")+"]","g");var assign=function(object,source,guard){var index,iterable=object,result=iterable;if(!iterable)return result;var args=arguments,argsIndex=0,argsLength=typeof guard=="number"?2:args.length;if(argsLength>3&&typeof args[argsLength-2]=="function"){var callback=baseCreateCallback(args[--argsLength-1],args[argsLength--],2)}else if(argsLength>2&&typeof args[argsLength-1]=="function"){callback=args[--argsLength]}while(++argsIndex3&&typeof args[length-2]=="function"){var callback=baseCreateCallback(args[--length-1],args[length--],2)}else if(length>2&&typeof args[length-1]=="function"){callback=args[--length]}var sources=slice(arguments,1,length),index=-1,stackA=getArray(),stackB=getArray();while(++index-1}else if(typeof 
length=="number"){result=(isString(collection)?collection.indexOf(target,fromIndex):indexOf(collection,target,fromIndex))>-1}else{forOwn(collection,function(value){if(++index>=fromIndex){return!(result=value===target)}})}return result}var countBy=createAggregator(function(result,value,key){hasOwnProperty.call(result,key)?result[key]++:result[key]=1});function every(collection,callback,thisArg){var result=true;callback=lodash.createCallback(callback,thisArg,3);var index=-1,length=collection?collection.length:0;if(typeof length=="number"){while(++indexresult){result=value}}}else{callback=callback==null&&isString(collection)?charAtCallback:lodash.createCallback(callback,thisArg,3);forEach(collection,function(value,index,collection){var current=callback(value,index,collection);if(current>computed){computed=current;result=value}})}return result}function min(collection,callback,thisArg){var computed=Infinity,result=computed;if(typeof callback!="function"&&thisArg&&thisArg[callback]===collection){callback=null}if(callback==null&&isArray(collection)){var index=-1,length=collection.length;while(++index=largeArraySize&&createCache(argsIndex?args[argsIndex]:seen))}}var array=args[0],index=-1,length=array?array.length:0,result=[];outer:while(++index>>1;callback(array[mid])1?arguments:arguments[0],index=-1,length=array?max(pluck(array,"length")):0,result=Array(length<0?0:length);while(++index2?createWrapper(func,17,slice(arguments,2),null,thisArg):createWrapper(func,1,null,null,thisArg)}function bindAll(object){var funcs=arguments.length>1?baseFlatten(arguments,true,false,1):functions(object),index=-1,length=funcs.length;while(++index2?createWrapper(key,19,slice(arguments,2),null,object):createWrapper(key,3,null,null,object)}function compose(){var funcs=arguments,length=funcs.length;while(length--){if(!isFunction(funcs[length])){throw new TypeError}}return function(){var args=arguments,length=funcs.length;while(length--){args=[funcs[length].apply(this,args)]}return args[0]}}function curry(func,arity){arity=typeof arity=="number"?arity:+arity||func.length;return createWrapper(func,4,null,null,null,arity)}function debounce(func,wait,options){var args,maxTimeoutId,result,stamp,thisArg,timeoutId,trailingCall,lastCalled=0,maxWait=false,trailing=true;if(!isFunction(func)){throw new TypeError}wait=nativeMax(0,wait)||0;if(options===true){var leading=true;trailing=false}else if(isObject(options)){leading=options.leading;maxWait="maxWait"in options&&(nativeMax(wait,options.maxWait)||0);trailing="trailing"in options?options.trailing:trailing}var delayed=function(){var remaining=wait-(now()-stamp);if(remaining<=0){if(maxTimeoutId){clearTimeout(maxTimeoutId)}var isCalled=trailingCall;maxTimeoutId=timeoutId=trailingCall=undefined;if(isCalled){lastCalled=now();result=func.apply(thisArg,args);if(!timeoutId&&!maxTimeoutId){args=thisArg=null}}}else{timeoutId=setTimeout(delayed,remaining)}};var maxDelayed=function(){if(timeoutId){clearTimeout(timeoutId)}maxTimeoutId=timeoutId=trailingCall=undefined;if(trailing||maxWait!==wait){lastCalled=now();result=func.apply(thisArg,args);if(!timeoutId&&!maxTimeoutId){args=thisArg=null}}};return function(){args=arguments; +_.each(inputGraph.nodes(),function(v){var node=canonicalize(inputGraph.node(v));g.setNode(v,_.defaults(selectNumberAttrs(node,nodeNumAttrs),nodeDefaults));g.setParent(v,inputGraph.parent(v))});_.each(inputGraph.edges(),function(e){var 
edge=canonicalize(inputGraph.edge(e));g.setEdge(e,_.merge({},edgeDefaults,selectNumberAttrs(edge,edgeNumAttrs),_.pick(edge,edgeAttrs)))});return g}function makeSpaceForEdgeLabels(g){var graph=g.graph();graph.ranksep/=2;_.each(g.edges(),function(e){var edge=g.edge(e);edge.minlen*=2;if(edge.labelpos.toLowerCase()!=="c"){if(graph.rankdir==="TB"||graph.rankdir==="BT"){edge.width+=edge.labeloffset}else{edge.height+=edge.labeloffset}}})}function injectEdgeLabelProxies(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.width&&edge.height){var v=g.node(e.v),w=g.node(e.w),label={rank:(w.rank-v.rank)/2+v.rank,e:e};util.addDummyNode(g,"edge-proxy",label,"_ep")}})}function assignRankMinMax(g){var maxRank=0;_.each(g.nodes(),function(v){var node=g.node(v);if(node.borderTop){node.minRank=g.node(node.borderTop).rank;node.maxRank=g.node(node.borderBottom).rank;maxRank=_.max(maxRank,node.maxRank)}});g.graph().maxRank=maxRank}function removeEdgeLabelProxies(g){_.each(g.nodes(),function(v){var node=g.node(v);if(node.dummy==="edge-proxy"){g.edge(node.e).labelRank=node.rank;g.removeNode(v)}})}function translateGraph(g){var minX=Number.POSITIVE_INFINITY,maxX=0,minY=Number.POSITIVE_INFINITY,maxY=0,graphLabel=g.graph(),marginX=graphLabel.marginx||0,marginY=graphLabel.marginy||0;function getExtremes(attrs){var x=attrs.x,y=attrs.y,w=attrs.width,h=attrs.height;minX=Math.min(minX,x-w/2);maxX=Math.max(maxX,x+w/2);minY=Math.min(minY,y-h/2);maxY=Math.max(maxY,y+h/2)}_.each(g.nodes(),function(v){getExtremes(g.node(v))});_.each(g.edges(),function(e){var edge=g.edge(e);if(_.has(edge,"x")){getExtremes(edge)}});minX-=marginX;minY-=marginY;_.each(g.nodes(),function(v){var node=g.node(v);node.x-=minX;node.y-=minY});_.each(g.edges(),function(e){var edge=g.edge(e);_.each(edge.points,function(p){p.x-=minX;p.y-=minY});if(_.has(edge,"x")){edge.x-=minX}if(_.has(edge,"y")){edge.y-=minY}});graphLabel.width=maxX-minX+marginX;graphLabel.height=maxY-minY+marginY}function assignNodeIntersects(g){_.each(g.edges(),function(e){var edge=g.edge(e),nodeV=g.node(e.v),nodeW=g.node(e.w),p1,p2;if(!edge.points){edge.points=[];p1=nodeW;p2=nodeV}else{p1=edge.points[0];p2=edge.points[edge.points.length-1]}edge.points.unshift(util.intersectRect(nodeV,p1));edge.points.push(util.intersectRect(nodeW,p2))})}function fixupEdgeLabelCoords(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(_.has(edge,"x")){if(edge.labelpos==="l"||edge.labelpos==="r"){edge.width-=edge.labeloffset}switch(edge.labelpos){case"l":edge.x-=edge.width/2+edge.labeloffset;break;case"r":edge.x+=edge.width/2+edge.labeloffset;break}}})}function reversePointsForReversedEdges(g){_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.reversed){edge.points.reverse()}})}function removeBorderNodes(g){_.each(g.nodes(),function(v){if(g.children(v).length){var node=g.node(v),t=g.node(node.borderTop),b=g.node(node.borderBottom),l=g.node(_.last(node.borderLeft)),r=g.node(_.last(node.borderRight));node.width=Math.abs(r.x-l.x);node.height=Math.abs(b.y-t.y);node.x=l.x+node.width/2;node.y=t.y+node.height/2}});_.each(g.nodes(),function(v){if(g.node(v).dummy==="border"){g.removeNode(v)}})}function removeSelfEdges(g){_.each(g.edges(),function(e){if(e.v===e.w){var node=g.node(e.v);if(!node.selfEdges){node.selfEdges=[]}node.selfEdges.push({e:e,label:g.edge(e)});g.removeEdge(e)}})}function insertSelfEdges(g){var layers=util.buildLayerMatrix(g);_.each(layers,function(layer){var orderShift=0;_.each(layer,function(v,i){var 
node=g.node(v);node.order=i+orderShift;_.each(node.selfEdges,function(selfEdge){util.addDummyNode(g,"selfedge",{width:selfEdge.label.width,height:selfEdge.label.height,rank:node.rank,order:i+ ++orderShift,e:selfEdge.e,label:selfEdge.label},"_se")});delete node.selfEdges})})}function positionSelfEdges(g){_.each(g.nodes(),function(v){var node=g.node(v);if(node.dummy==="selfedge"){var selfNode=g.node(node.e.v),x=selfNode.x+selfNode.width/2,y=selfNode.y,dx=node.x-x,dy=selfNode.height/2;g.setEdge(node.e,node.label);g.removeNode(v);node.label.points=[{x:x+2*dx/3,y:y-dy},{x:x+5*dx/6,y:y-dy},{x:x+dx,y:y},{x:x+5*dx/6,y:y+dy},{x:x+2*dx/3,y:y+dy}];node.label.x=node.x;node.label.y=node.y}})}function selectNumberAttrs(obj,attrs){return _.mapValues(_.pick(obj,attrs),Number)}function canonicalize(attrs){var newAttrs={};_.each(attrs,function(v,k){newAttrs[k.toLowerCase()]=v});return newAttrs}},{"./acyclic":28,"./add-border-segments":29,"./coordinate-system":30,"./graphlib":33,"./lodash":36,"./nesting-graph":37,"./normalize":38,"./order":43,"./parent-dummy-chains":48,"./position":50,"./rank":52,"./util":55}],36:[function(require,module,exports){module.exports=require(20)},{"/Users/andrew/Documents/dev/dagre-d3/lib/lodash.js":20,lodash:77}],37:[function(require,module,exports){var _=require("./lodash"),util=require("./util");module.exports={run:run,cleanup:cleanup};function run(g){var root=util.addDummyNode(g,"root",{},"_root"),depths=treeDepths(g),height=_.max(depths)-1,nodeSep=2*height+1;g.graph().nestingRoot=root;_.each(g.edges(),function(e){g.edge(e).minlen*=nodeSep});var weight=sumWeights(g)+1;_.each(g.children(),function(child){dfs(g,root,nodeSep,weight,height,depths,child)});g.graph().nodeRankFactor=nodeSep}function dfs(g,root,nodeSep,weight,height,depths,v){var children=g.children(v);if(!children.length){if(v!==root){g.setEdge(root,v,{weight:0,minlen:nodeSep})}return}var top=util.addBorderNode(g,"_bt"),bottom=util.addBorderNode(g,"_bb"),label=g.node(v);g.setParent(top,v);label.borderTop=top;g.setParent(bottom,v);label.borderBottom=bottom;_.each(children,function(child){dfs(g,root,nodeSep,weight,height,depths,child);var childNode=g.node(child),childTop=childNode.borderTop?childNode.borderTop:child,childBottom=childNode.borderBottom?childNode.borderBottom:child,thisWeight=childNode.borderTop?weight:2*weight,minlen=childTop!==childBottom?1:height-depths[v]+1;g.setEdge(top,childTop,{weight:thisWeight,minlen:minlen,nestingEdge:true});g.setEdge(childBottom,bottom,{weight:thisWeight,minlen:minlen,nestingEdge:true})});if(!g.parent(v)){g.setEdge(root,top,{weight:0,minlen:height+depths[v]})}}function treeDepths(g){var depths={};function dfs(v,depth){var children=g.children(v);if(children&&children.length){_.each(children,function(child){dfs(child,depth+1)})}depths[v]=depth}_.each(g.children(),function(v){dfs(v,1)});return depths}function sumWeights(g){return _.reduce(g.edges(),function(acc,e){return acc+g.edge(e).weight},0)}function cleanup(g){var graphLabel=g.graph();g.removeNode(graphLabel.nestingRoot);delete graphLabel.nestingRoot;_.each(g.edges(),function(e){var edge=g.edge(e);if(edge.nestingEdge){g.removeEdge(e)}})}},{"./lodash":36,"./util":55}],38:[function(require,module,exports){"use strict";var _=require("./lodash"),util=require("./util");module.exports={run:run,undo:undo};function run(g){g.graph().dummyChains=[];_.each(g.edges(),function(edge){normalizeEdge(g,edge)})}function normalizeEdge(g,e){var 
v=e.v,vRank=g.node(v).rank,w=e.w,wRank=g.node(w).rank,name=e.name,edgeLabel=g.edge(e),labelRank=edgeLabel.labelRank;if(wRank===vRank+1)return;g.removeEdge(e);var dummy,attrs,i;for(i=0,++vRank;vRank0){if(index%2){weightSum+=tree[index+1]}index=index-1>>1;tree[index]+=entry.weight}cc+=entry.weight*weightSum}));return cc}},{"../lodash":36}],43:[function(require,module,exports){"use strict";var _=require("../lodash"),initOrder=require("./init-order"),crossCount=require("./cross-count"),sortSubgraph=require("./sort-subgraph"),buildLayerGraph=require("./build-layer-graph"),addSubgraphConstraints=require("./add-subgraph-constraints"),Graph=require("../graphlib").Graph,util=require("../util");module.exports=order;function order(g){var maxRank=util.maxRank(g),downLayerGraphs=buildLayerGraphs(g,_.range(1,maxRank+1),"inEdges"),upLayerGraphs=buildLayerGraphs(g,_.range(maxRank-1,-1,-1),"outEdges");var layering=initOrder(g);assignOrder(g,layering);var bestCC=Number.POSITIVE_INFINITY,best;for(var i=0,lastBest=0;lastBest<4;++i,++lastBest){sweepLayerGraphs(i%2?downLayerGraphs:upLayerGraphs,i%4>=2);layering=util.buildLayerMatrix(g);var cc=crossCount(g,layering);if(cc=vEntry.barycenter){mergeEntries(vEntry,uEntry)}}}function handleOut(vEntry){return function(wEntry){wEntry["in"].push(vEntry);if(--wEntry.indegree===0){sourceSet.push(wEntry)}}}while(sourceSet.length){var entry=sourceSet.pop();entries.push(entry);_.each(entry["in"].reverse(),handleIn(entry));_.each(entry.out,handleOut(entry))}return _.chain(entries).filter(function(entry){return!entry.merged}).map(function(entry){return _.pick(entry,["vs","i","barycenter","weight"])}).value()}function mergeEntries(target,source){var sum=0,weight=0;if(target.weight){sum+=target.barycenter*target.weight;weight+=target.weight}if(source.weight){sum+=source.barycenter*source.weight;weight+=source.weight}target.vs=source.vs.concat(target.vs);target.barycenter=sum/weight;target.weight=weight;target.i=Math.min(source.i,target.i);source.merged=true}},{"../lodash":36}],46:[function(require,module,exports){var _=require("../lodash"),barycenter=require("./barycenter"),resolveConflicts=require("./resolve-conflicts"),sort=require("./sort");module.exports=sortSubgraph;function sortSubgraph(g,v,cg,biasRight){var movable=g.children(v),node=g.node(v),bl=node?node.borderLeft:undefined,br=node?node.borderRight:undefined,subgraphs={};if(bl){movable=_.filter(movable,function(w){return w!==bl&&w!==br})}var barycenters=barycenter(g,movable);_.each(barycenters,function(entry){if(g.children(entry.v).length){var subgraphResult=sortSubgraph(g,entry.v,cg,biasRight);subgraphs[entry.v]=subgraphResult;if(_.has(subgraphResult,"barycenter")){mergeBarycenters(entry,subgraphResult)}}});var entries=resolveConflicts(barycenters,cg);expandSubgraphs(entries,subgraphs);var result=sort(entries,biasRight);if(bl){result.vs=_.flatten([bl,result.vs,br],true);if(g.predecessors(bl).length){var blPred=g.node(g.predecessors(bl)[0]),brPred=g.node(g.predecessors(br)[0]);if(!_.has(result,"barycenter")){result.barycenter=0;result.weight=0}result.barycenter=(result.barycenter*result.weight+blPred.order+brPred.order)/(result.weight+2);result.weight+=2}}return result}function expandSubgraphs(entries,subgraphs){_.each(entries,function(entry){entry.vs=_.flatten(entry.vs.map(function(v){if(subgraphs[v]){return subgraphs[v].vs}return v}),true)})}function 
mergeBarycenters(target,other){if(!_.isUndefined(target.barycenter)){target.barycenter=(target.barycenter*target.weight+other.barycenter*other.weight)/(target.weight+other.weight);target.weight+=other.weight}else{target.barycenter=other.barycenter;target.weight=other.weight}}},{"../lodash":36,"./barycenter":40,"./resolve-conflicts":45,"./sort":47}],47:[function(require,module,exports){var _=require("../lodash"),util=require("../util");module.exports=sort;function sort(entries,biasRight){var parts=util.partition(entries,function(entry){return _.has(entry,"barycenter")});var sortable=parts.lhs,unsortable=_.sortBy(parts.rhs,function(entry){return-entry.i}),vs=[],sum=0,weight=0,vsIndex=0;sortable.sort(compareWithBias(!!biasRight));vsIndex=consumeUnsortable(vs,unsortable,vsIndex);_.each(sortable,function(entry){vsIndex+=entry.vs.length;vs.push(entry.vs);sum+=entry.barycenter*entry.weight;weight+=entry.weight;vsIndex=consumeUnsortable(vs,unsortable,vsIndex)});var result={vs:_.flatten(vs,true)};if(weight){result.barycenter=sum/weight;result.weight=weight}return result}function consumeUnsortable(vs,unsortable,index){var last;while(unsortable.length&&(last=_.last(unsortable)).i<=index){unsortable.pop();vs.push(last.vs);index++}return index}function compareWithBias(bias){return function(entryV,entryW){if(entryV.barycenterentryW.barycenter){return 1}return!bias?entryV.i-entryW.i:entryW.i-entryV.i}}},{"../lodash":36,"../util":55}],48:[function(require,module,exports){var _=require("./lodash");module.exports=parentDummyChains;function parentDummyChains(g){var postorderNums=postorder(g);_.each(g.graph().dummyChains,function(v){var node=g.node(v),edgeObj=node.edgeObj,pathData=findPath(g,postorderNums,edgeObj.v,edgeObj.w),path=pathData.path,lca=pathData.lca,pathIdx=0,pathV=path[pathIdx],ascending=true;while(v!==edgeObj.w){node=g.node(v);if(ascending){while((pathV=path[pathIdx])!==lca&&g.node(pathV).maxRanklow||lim>postorderNums[parent].lim));lca=parent;parent=w;while((parent=g.parent(parent))!==lca){wPath.push(parent)}return{path:vPath.concat(wPath.reverse()),lca:lca}}function postorder(g){var result={},lim=0;function dfs(v){var low=lim;_.each(g.children(v),dfs);result[v]={low:low,lim:lim++}}_.each(g.children(),dfs);return result}},{"./lodash":36}],49:[function(require,module,exports){"use strict";var _=require("../lodash"),Graph=require("../graphlib").Graph,util=require("../util");module.exports={positionX:positionX,findType1Conflicts:findType1Conflicts,findType2Conflicts:findType2Conflicts,addConflict:addConflict,hasConflict:hasConflict,verticalAlignment:verticalAlignment,horizontalCompaction:horizontalCompaction,alignCoordinates:alignCoordinates,findSmallestWidthAlignment:findSmallestWidthAlignment,balance:balance};function findType1Conflicts(g,layering){var conflicts={};function visitLayer(prevLayer,layer){var k0=0,scanPos=0,prevLayerLength=prevLayer.length,lastNode=_.last(layer);_.each(layer,function(v,i){var w=findOtherInnerSegmentNode(g,v),k1=w?g.node(w).order:prevLayerLength;if(w||v===lastNode){_.each(layer.slice(scanPos,i+1),function(scanNode){_.each(g.predecessors(scanNode),function(u){var uLabel=g.node(u),uPos=uLabel.order;if((uPosnextNorthBorder)){addConflict(conflicts,u,v)}})}})}function visitLayer(north,south){var prevNorthPos=-1,nextNorthPos,southPos=0;_.each(south,function(v,southLookahead){if(g.node(v).dummy==="border"){var 
predecessors=g.predecessors(v);if(predecessors.length){nextNorthPos=g.node(predecessors[0]).order;scan(south,southPos,southLookahead,prevNorthPos,nextNorthPos);southPos=southLookahead;prevNorthPos=nextNorthPos}}scan(south,southPos,south.length,nextNorthPos,north.length)});return south}_.reduce(layering,visitLayer);return conflicts}function findOtherInnerSegmentNode(g,v){if(g.node(v).dummy){return _.find(g.predecessors(v),function(u){return g.node(u).dummy})}}function addConflict(conflicts,v,w){if(v>w){var tmp=v;v=w;w=tmp}var conflictsV=conflicts[v];if(!conflictsV){conflicts[v]=conflictsV={}}conflictsV[w]=true}function hasConflict(conflicts,v,w){if(v>w){var tmp=v;v=w;w=tmp}return _.has(conflicts[v],w)}function verticalAlignment(g,layering,conflicts,neighborFn){var root={},align={},pos={};_.each(layering,function(layer){_.each(layer,function(v,order){root[v]=v;align[v]=v;pos[v]=order})});_.each(layering,function(layer){var prevIdx=-1;_.each(layer,function(v){var ws=neighborFn(v);if(ws.length){ws=_.sortBy(ws,function(w){return pos[w]});var mp=(ws.length-1)/2;for(var i=Math.floor(mp),il=Math.ceil(mp);i<=il;++i){var w=ws[i];if(align[v]===v&&prevIdxwLabel.lim){tailLabel=wLabel;flip=true}var candidates=_.filter(g.edges(),function(edge){return flip===isDescendant(t,t.node(edge.v),tailLabel)&&flip!==isDescendant(t,t.node(edge.w),tailLabel)});return _.min(candidates,function(edge){return slack(g,edge)})}function exchangeEdges(t,g,e,f){var v=e.v,w=e.w;t.removeEdge(v,w);t.setEdge(f.v,f.w,{});initLowLimValues(t);initCutValues(t,g);updateRanks(t,g)}function updateRanks(t,g){var root=_.find(t.nodes(),function(v){return!g.node(v).parent}),vs=preorder(t,root);vs=vs.slice(1);_.each(vs,function(v){var parent=t.node(v).parent,edge=g.edge(v,parent),flipped=false;if(!edge){edge=g.edge(parent,v);flipped=true}g.node(v).rank=g.node(parent).rank+(flipped?edge.minlen:-edge.minlen)})}function isTreeEdge(tree,u,v){return tree.hasEdge(u,v)}function isDescendant(tree,vLabel,rootLabel){return rootLabel.low<=vLabel.lim&&vLabel.lim<=rootLabel.lim}},{"../graphlib":33,"../lodash":36,"../util":55,"./feasible-tree":51,"./util":54}],54:[function(require,module,exports){"use strict";var _=require("../lodash");module.exports={longestPath:longestPath,slack:slack};function longestPath(g){var visited={};function dfs(v){var label=g.node(v);if(_.has(visited,v)){return label.rank}visited[v]=true;var rank=_.min(_.map(g.outEdges(v),function(e){return dfs(e.w)-g.edge(e).minlen}));if(rank===Number.POSITIVE_INFINITY){rank=0}return label.rank=rank}_.each(g.sources(),dfs)}function slack(g,e){return g.node(e.w).rank-g.node(e.v).rank-g.edge(e).minlen}},{"../lodash":36}],55:[function(require,module,exports){"use strict";var _=require("./lodash"),Graph=require("./graphlib").Graph;module.exports={addDummyNode:addDummyNode,simplify:simplify,asNonCompoundGraph:asNonCompoundGraph,successorWeights:successorWeights,predecessorWeights:predecessorWeights,intersectRect:intersectRect,buildLayerMatrix:buildLayerMatrix,normalizeRanks:normalizeRanks,removeEmptyRanks:removeEmptyRanks,addBorderNode:addBorderNode,maxRank:maxRank,partition:partition,time:time,notime:notime};function addDummyNode(g,type,attrs,name){var v; -stamp=now();thisArg=this;trailingCall=trailing&&(timeoutId||!leading);if(maxWait===false){var leadingCall=leading&&!timeoutId}else{if(!maxTimeoutId&&!leading){lastCalled=stamp}var 
remaining=maxWait-(stamp-lastCalled),isCalled=remaining<=0;if(isCalled){if(maxTimeoutId){maxTimeoutId=clearTimeout(maxTimeoutId)}lastCalled=stamp;result=func.apply(thisArg,args)}else if(!maxTimeoutId){maxTimeoutId=setTimeout(maxDelayed,remaining)}}if(isCalled&&timeoutId){timeoutId=clearTimeout(timeoutId)}else if(!timeoutId&&wait!==maxWait){timeoutId=setTimeout(delayed,wait)}if(leadingCall){isCalled=true;result=func.apply(thisArg,args)}if(isCalled&&!timeoutId&&!maxTimeoutId){args=thisArg=null}return result}}function defer(func){if(!isFunction(func)){throw new TypeError}var args=slice(arguments,1);return setTimeout(function(){func.apply(undefined,args)},1)}function delay(func,wait){if(!isFunction(func)){throw new TypeError}var args=slice(arguments,2);return setTimeout(function(){func.apply(undefined,args)},wait)}function memoize(func,resolver){if(!isFunction(func)){throw new TypeError}var memoized=function(){var cache=memoized.cache,key=resolver?resolver.apply(this,arguments):keyPrefix+arguments[0];return hasOwnProperty.call(cache,key)?cache[key]:cache[key]=func.apply(this,arguments)};memoized.cache={};return memoized}function once(func){var ran,result;if(!isFunction(func)){throw new TypeError}return function(){if(ran){return result}ran=true;result=func.apply(this,arguments);func=null;return result}}function partial(func){return createWrapper(func,16,slice(arguments,1))}function partialRight(func){return createWrapper(func,32,null,slice(arguments,1))}function throttle(func,wait,options){var leading=true,trailing=true;if(!isFunction(func)){throw new TypeError}if(options===false){leading=false}else if(isObject(options)){leading="leading"in options?options.leading:leading;trailing="trailing"in options?options.trailing:trailing}debounceOptions.leading=leading;debounceOptions.maxWait=wait;debounceOptions.trailing=trailing;return debounce(func,wait,debounceOptions)}function wrap(value,wrapper){return createWrapper(wrapper,16,[value])}function constant(value){return function(){return value}}function createCallback(func,thisArg,argCount){var type=typeof func;if(func==null||type=="function"){return baseCreateCallback(func,thisArg,argCount)}if(type!="object"){return property(func)}var props=keys(func),key=props[0],a=func[key];if(props.length==1&&a===a&&!isObject(a)){return function(object){var b=object[key];return a===b&&(a!==0||1/a==1/b)}}return function(object){var length=props.length,result=false;while(length--){if(!(result=baseIsEqual(object[props[length]],func[props[length]],null,true))){break}}return result}}function escape(string){return string==null?"":String(string).replace(reUnescapedHtml,escapeHtmlChar)}function identity(value){return value}function mixin(object,source,options){var chain=true,methodNames=source&&functions(source);if(!source||!options&&!methodNames.length){if(options==null){options=source}ctor=lodashWrapper;source=object;object=lodash;methodNames=functions(source)}if(options===false){chain=false}else if(isObject(options)&&"chain"in options){chain=options.chain}var ctor=object,isFunc=isFunction(ctor);forEach(methodNames,function(methodName){var func=object[methodName]=source[methodName];if(isFunc){ctor.prototype[methodName]=function(){var chainAll=this.__chain__,value=this.__wrapped__,args=[value];push.apply(args,arguments);var result=func.apply(object,args);if(chain||chainAll){if(value===result&&isObject(result)){return this}result=new ctor(result);result.__chain__=chainAll}return result}}})}function noConflict(){context._=oldDash;return this}function noop(){}var 
now=isNative(now=Date.now)&&now||function(){return(new Date).getTime()};var parseInt=nativeParseInt(whitespace+"08")==8?nativeParseInt:function(value,radix){return nativeParseInt(isString(value)?value.replace(reLeadingSpacesAndZeros,""):value,radix||0)};function property(key){return function(object){return object[key]}}function random(min,max,floating){var noMin=min==null,noMax=max==null;if(floating==null){if(typeof min=="boolean"&&noMax){floating=min;min=1}else if(!noMax&&typeof max=="boolean"){floating=max;noMax=true}}if(noMin&&noMax){max=1}min=+min||0;if(noMax){max=min;min=0}else{max=+max||0}if(floating||min%1||max%1){var rand=nativeRandom();return nativeMin(min+rand*(max-min+parseFloat("1e-"+((rand+"").length-1))),max)}return baseRandom(min,max)}function result(object,key){if(object){var value=object[key];return isFunction(value)?object[key]():value}}function template(text,data,options){var settings=lodash.templateSettings;text=String(text||"");options=defaults({},options,settings);var imports=defaults({},options.imports,settings.imports),importsKeys=keys(imports),importsValues=values(imports);var isEvaluating,index=0,interpolate=options.interpolate||reNoMatch,source="__p += '";var reDelimiters=RegExp((options.escape||reNoMatch).source+"|"+interpolate.source+"|"+(interpolate===reInterpolate?reEsTemplate:reNoMatch).source+"|"+(options.evaluate||reNoMatch).source+"|$","g");text.replace(reDelimiters,function(match,escapeValue,interpolateValue,esTemplateValue,evaluateValue,offset){interpolateValue||(interpolateValue=esTemplateValue);source+=text.slice(index,offset).replace(reUnescapedString,escapeStringChar);if(escapeValue){source+="' +\n__e("+escapeValue+") +\n'"}if(evaluateValue){isEvaluating=true;source+="';\n"+evaluateValue+";\n__p += '"}if(interpolateValue){source+="' +\n((__t = ("+interpolateValue+")) == null ? 
'' : __t) +\n'"}index=offset+match.length;return match});source+="';\n";var variable=options.variable,hasVariable=variable;if(!hasVariable){variable="obj";source="with ("+variable+") {\n"+source+"\n}\n"}source=(isEvaluating?source.replace(reEmptyStringLeading,""):source).replace(reEmptyStringMiddle,"$1").replace(reEmptyStringTrailing,"$1;");source="function("+variable+") {\n"+(hasVariable?"":variable+" || ("+variable+" = {});\n")+"var __t, __p = '', __e = _.escape"+(isEvaluating?", __j = Array.prototype.join;\n"+"function print() { __p += __j.call(arguments, '') }\n":";\n")+source+"return __p\n}";var sourceURL="\n/*\n//# sourceURL="+(options.sourceURL||"/lodash/template/source["+templateCounter++ +"]")+"\n*/";try{var result=Function(importsKeys,"return "+source+sourceURL).apply(undefined,importsValues)}catch(e){e.source=source;throw e}if(data){return result(data)}result.source=source;return result}function times(n,callback,thisArg){n=(n=+n)>-1?n:0;var index=-1,result=Array(n);callback=baseCreateCallback(callback,thisArg,1);while(++indexMath.abs(dx)*h){if(dy<0){h=-h}sx=h*dx/dy;sy=h}else{if(dx<0){w=-w}sx=w;sy=w*dy/dx}return{x:x+sx,y:y+sy}}function buildLayerMatrix(g){var layering=_.map(_.range(maxRank(g)+1),function(){return[]});_.each(g.nodes(),function(v){var node=g.node(v),rank=node.rank;if(!_.isUndefined(rank)){layering[rank][node.order]=v}});return layering}function normalizeRanks(g){var min=_.min(_.map(g.nodes(),function(v){return g.node(v).rank}));_.each(g.nodes(),function(v){var node=g.node(v);if(_.has(node,"rank")){node.rank-=min}})}function removeEmptyRanks(g){var offset=_.min(_.map(g.nodes(),function(v){return g.node(v).rank}));var layers=[];_.each(g.nodes(),function(v){var rank=g.node(v).rank-offset;if(!_.has(layers,rank)){layers[rank]=[]}layers[rank].push(v)});var delta=0,nodeRankFactor=g.graph().nodeRankFactor;_.each(layers,function(vs,i){if(_.isUndefined(vs)&&i%nodeRankFactor!==0){--delta}else if(delta){_.each(vs,function(v){g.node(v).rank+=delta})}})}function addBorderNode(g,prefix,rank,order){var node={width:0,height:0};if(arguments.length>=4){node.rank=rank;node.order=order}return addDummyNode(g,"border",node,prefix)}function maxRank(g){return _.max(_.map(g.nodes(),function(v){var rank=g.node(v).rank;if(!_.isUndefined(rank)){return rank}}))}function partition(collection,fn){var result={lhs:[],rhs:[]};_.each(collection,function(value){if(fn(value)){result.lhs.push(value)}else{result.rhs.push(value)}});return result}function time(name,fn){var start=_.now();try{return fn()}finally{console.log(name+" time: "+(_.now()-start)+"ms")}}function notime(name,fn){return fn()}},{"./graphlib":33,"./lodash":36}],56:[function(require,module,exports){module.exports="0.7.1"},{}],57:[function(require,module,exports){var lib=require("./lib");module.exports={Graph:lib.Graph,json:require("./lib/json"),alg:require("./lib/alg"),version:lib.version}},{"./lib":73,"./lib/alg":64,"./lib/json":74}],58:[function(require,module,exports){var _=require("../lodash");module.exports=components;function components(g){var visited={},cmpts=[],cmpt;function dfs(v){if(_.has(visited,v))return;visited[v]=true;cmpt.push(v);_.each(g.successors(v),dfs);_.each(g.predecessors(v),dfs)}_.each(g.nodes(),function(v){cmpt=[];dfs(v);if(cmpt.length){cmpts.push(cmpt)}});return cmpts}},{"../lodash":75}],59:[function(require,module,exports){var _=require("../lodash");module.exports=dfs;function dfs(g,vs,order){if(!_.isArray(vs)){vs=[vs]}var acc=[],visited={};_.each(vs,function(v){if(!g.hasNode(v)){throw new Error("Graph does not 
have node: "+v)}doDfs(g,v,order==="post",visited,acc)});return acc}function doDfs(g,v,postorder,visited,acc){if(!_.has(visited,v)){visited[v]=true;if(!postorder){acc.push(v)}_.each(g.neighbors(v),function(w){doDfs(g,w,postorder,visited,acc)});if(postorder){acc.push(v)}}}},{"../lodash":75}],60:[function(require,module,exports){var dijkstra=require("./dijkstra"),_=require("../lodash");module.exports=dijkstraAll;function dijkstraAll(g,weightFunc,edgeFunc){return _.transform(g.nodes(),function(acc,v){acc[v]=dijkstra(g,v,weightFunc,edgeFunc)},{})}},{"../lodash":75,"./dijkstra":61}],61:[function(require,module,exports){var _=require("../lodash"),PriorityQueue=require("../data/priority-queue");module.exports=dijkstra;var DEFAULT_WEIGHT_FUNC=_.constant(1);function dijkstra(g,source,weightFn,edgeFn){return runDijkstra(g,String(source),weightFn||DEFAULT_WEIGHT_FUNC,edgeFn||function(v){return g.outEdges(v)})}function runDijkstra(g,source,weightFn,edgeFn){var results={},pq=new PriorityQueue,v,vEntry;var updateNeighbors=function(edge){var w=edge.v!==v?edge.v:edge.w,wEntry=results[w],weight=weightFn(edge),distance=vEntry.distance+weight;if(weight<0){throw new Error("dijkstra does not allow negative edge weights. "+"Bad edge: "+edge+" Weight: "+weight)}if(distance0){v=pq.removeMin();vEntry=results[v];if(vEntry.distance===Number.POSITIVE_INFINITY){break}edgeFn(v).forEach(updateNeighbors)}return results}},{"../data/priority-queue":71,"../lodash":75}],62:[function(require,module,exports){var _=require("../lodash"),tarjan=require("./tarjan");module.exports=findCycles;function findCycles(g){return _.filter(tarjan(g),function(cmpt){return cmpt.length>1})}},{"../lodash":75,"./tarjan":69}],63:[function(require,module,exports){var _=require("../lodash");module.exports=floydWarshall;var DEFAULT_WEIGHT_FUNC=_.constant(1);function floydWarshall(g,weightFn,edgeFn){return runFloydWarshall(g,weightFn||DEFAULT_WEIGHT_FUNC,edgeFn||function(v){return g.outEdges(v)})}function runFloydWarshall(g,weightFn,edgeFn){var results={},nodes=g.nodes();nodes.forEach(function(v){results[v]={};results[v][v]={distance:0};nodes.forEach(function(w){if(v!==w){results[v][w]={distance:Number.POSITIVE_INFINITY}}});edgeFn(v).forEach(function(edge){var w=edge.v===v?edge.w:edge.v,d=weightFn(edge);results[v][w]={distance:d,predecessor:v}})});nodes.forEach(function(k){var rowK=results[k];nodes.forEach(function(i){var rowI=results[i];nodes.forEach(function(j){var ik=rowI[k];var kj=rowK[j];var ij=rowI[j];var altDistance=ik.distance+kj.distance;if(altDistance0){v=pq.removeMin();if(_.has(parents,v)){result.setEdge(v,parents[v])}else if(init){throw new Error("Input graph is not connected: "+g)}else{init=true}g.nodeEdges(v).forEach(updateNeighbors)}return result}},{"../data/priority-queue":71,"../graph":72,"../lodash":75}],69:[function(require,module,exports){var _=require("../lodash");module.exports=tarjan;function tarjan(g){var index=0,stack=[],visited={},results=[];function dfs(v){var entry=visited[v]={onStack:true,lowlink:index,index:index++};stack.push(v);g.successors(v).forEach(function(w){if(!_.has(visited,w)){dfs(w);entry.lowlink=Math.min(entry.lowlink,visited[w].lowlink)}else if(visited[w].onStack){entry.lowlink=Math.min(entry.lowlink,visited[w].index)}});if(entry.lowlink===entry.index){var cmpt=[],w;do{w=stack.pop();visited[w].onStack=false;cmpt.push(w)}while(v!==w);results.push(cmpt)}}g.nodes().forEach(function(v){if(!_.has(visited,v)){dfs(v)}});return results}},{"../lodash":75}],70:[function(require,module,exports){var 
_=require("../lodash");module.exports=topsort;topsort.CycleException=CycleException;function topsort(g){var visited={},stack={},results=[];function visit(node){if(_.has(stack,node)){throw new CycleException}if(!_.has(visited,node)){stack[node]=true;visited[node]=true;_.each(g.predecessors(node),visit);delete stack[node];results.push(node)}}_.each(g.sinks(),visit);if(_.size(visited)!==g.nodeCount()){throw new CycleException}return results}function CycleException(){}},{"../lodash":75}],71:[function(require,module,exports){var _=require("../lodash");module.exports=PriorityQueue;function PriorityQueue(){this._arr=[];this._keyIndices={}}PriorityQueue.prototype.size=function(){return this._arr.length};PriorityQueue.prototype.keys=function(){return this._arr.map(function(x){return x.key})};PriorityQueue.prototype.has=function(key){return _.has(this._keyIndices,key)};PriorityQueue.prototype.priority=function(key){var index=this._keyIndices[key];if(index!==undefined){return this._arr[index].priority}};PriorityQueue.prototype.min=function(){if(this.size()===0){throw new Error("Queue underflow")}return this._arr[0].key};PriorityQueue.prototype.add=function(key,priority){var keyIndices=this._keyIndices;key=String(key);if(!_.has(keyIndices,key)){var arr=this._arr;var index=arr.length;keyIndices[key]=index;arr.push({key:key,priority:priority});this._decrease(index);return true}return false};PriorityQueue.prototype.removeMin=function(){this._swap(0,this._arr.length-1);var min=this._arr.pop();delete this._keyIndices[min.key];this._heapify(0);return min.key};PriorityQueue.prototype.decrease=function(key,priority){var index=this._keyIndices[key];if(priority>this._arr[index].priority){throw new Error("New priority is greater than current priority. "+"Key: "+key+" Old: "+this._arr[index].priority+" New: "+priority)}this._arr[index].priority=priority;this._decrease(index)};PriorityQueue.prototype._heapify=function(i){var arr=this._arr;var l=2*i,r=l+1,largest=i;if(l>1;if(arr[parent].priority1){this.setNode(v,value)}else{this.setNode(v)}},this);return this};Graph.prototype.setNode=function(v,value){if(_.has(this._nodes,v)){if(arguments.length>1){this._nodes[v]=value}return this}this._nodes[v]=arguments.length>1?value:this._defaultNodeLabelFn(v);if(this._isCompound){this._parent[v]=GRAPH_NODE;this._children[v]={};this._children[GRAPH_NODE][v]=true}this._in[v]={};this._preds[v]={};this._out[v]={};this._sucs[v]={};++this._nodeCount;return this};Graph.prototype.node=function(v){return this._nodes[v]};Graph.prototype.hasNode=function(v){return _.has(this._nodes,v)};Graph.prototype.removeNode=function(v){var self=this;if(_.has(this._nodes,v)){var removeEdge=function(e){self.removeEdge(self._edgeObjs[e])};delete this._nodes[v];if(this._isCompound){this._removeFromParentsChildList(v);delete this._parent[v];_.each(this.children(v),function(child){this.setParent(child)},this);delete this._children[v]}_.each(_.keys(this._in[v]),removeEdge);delete this._in[v];delete this._preds[v];_.each(_.keys(this._out[v]),removeEdge);delete this._out[v];delete this._sucs[v];--this._nodeCount}return this};Graph.prototype.setParent=function(v,parent){if(!this._isCompound){throw new Error("Cannot set parent in a non-compound graph")}if(_.isUndefined(parent)){parent=GRAPH_NODE}else{for(var ancestor=parent;!_.isUndefined(ancestor);ancestor=this.parent(ancestor)){if(ancestor===v){throw new Error("Setting "+parent+" as parent of "+v+" would create create a 
cycle")}}this.setNode(parent)}this.setNode(v);this._removeFromParentsChildList(v);this._parent[v]=parent;this._children[parent][v]=true;return this};Graph.prototype._removeFromParentsChildList=function(v){delete this._children[this._parent[v]][v]};Graph.prototype.parent=function(v){if(this._isCompound){var parent=this._parent[v];if(parent!==GRAPH_NODE){return parent}}};Graph.prototype.children=function(v){if(_.isUndefined(v)){v=GRAPH_NODE}if(this._isCompound){var children=this._children[v];if(children){return _.keys(children)}}else if(v===GRAPH_NODE){return this.nodes()}else if(this.hasNode(v)){return[]}};Graph.prototype.predecessors=function(v){var predsV=this._preds[v];if(predsV){return _.keys(predsV)}};Graph.prototype.successors=function(v){var sucsV=this._sucs[v];if(sucsV){return _.keys(sucsV)}};Graph.prototype.neighbors=function(v){var preds=this.predecessors(v);if(preds){return _.union(preds,this.successors(v))}};Graph.prototype.setDefaultEdgeLabel=function(newDefault){if(!_.isFunction(newDefault)){newDefault=_.constant(newDefault)}this._defaultEdgeLabelFn=newDefault;return this};Graph.prototype.edgeCount=function(){return this._edgeCount};Graph.prototype.edges=function(){return _.values(this._edgeObjs)};Graph.prototype.setPath=function(vs,value){var self=this,args=arguments;_.reduce(vs,function(v,w){if(args.length>1){self.setEdge(v,w,value)}else{self.setEdge(v,w)}return w});return this};Graph.prototype.setEdge=function(){var v,w,name,value,valueSpecified=false;if(_.isPlainObject(arguments[0])){v=arguments[0].v;w=arguments[0].w;name=arguments[0].name;if(arguments.length===2){value=arguments[1];valueSpecified=true}}else{v=arguments[0];w=arguments[1];name=arguments[3];if(arguments.length>2){value=arguments[2];valueSpecified=true}}v=""+v;w=""+w;if(!_.isUndefined(name)){name=""+name}var e=edgeArgsToId(this._isDirected,v,w,name);if(_.has(this._edgeLabels,e)){if(valueSpecified){this._edgeLabels[e]=value}return this}if(!_.isUndefined(name)&&!this._isMultigraph){throw new Error("Cannot set a named edge when isMultigraph = false")}this.setNode(v);this.setNode(w);this._edgeLabels[e]=valueSpecified?value:this._defaultEdgeLabelFn(v,w,name);var edgeObj=edgeArgsToObj(this._isDirected,v,w,name);v=edgeObj.v;w=edgeObj.w;Object.freeze(edgeObj);this._edgeObjs[e]=edgeObj;incrementOrInitEntry(this._preds[w],v);incrementOrInitEntry(this._sucs[v],w);this._in[w][e]=edgeObj;this._out[v][e]=edgeObj;this._edgeCount++;return this};Graph.prototype.edge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name);return this._edgeLabels[e]};Graph.prototype.hasEdge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name);return _.has(this._edgeLabels,e)};Graph.prototype.removeEdge=function(v,w,name){var e=arguments.length===1?edgeObjToId(this._isDirected,arguments[0]):edgeArgsToId(this._isDirected,v,w,name),edge=this._edgeObjs[e];if(edge){v=edge.v;w=edge.w;delete this._edgeLabels[e];delete this._edgeObjs[e];decrementOrRemoveEntry(this._preds[w],v);decrementOrRemoveEntry(this._sucs[v],w);delete this._in[w][e];delete this._out[v][e];this._edgeCount--}return this};Graph.prototype.inEdges=function(v,u){var inV=this._in[v];if(inV){var edges=_.values(inV);if(!u){return edges}return _.filter(edges,function(edge){return edge.v===u})}};Graph.prototype.outEdges=function(v,w){var outV=this._out[v];if(outV){var edges=_.values(outV);if(!w){return edges}return 
_.filter(edges,function(edge){return edge.w===w})}};Graph.prototype.nodeEdges=function(v,w){var inEdges=this.inEdges(v,w);if(inEdges){return inEdges.concat(this.outEdges(v,w))}};function incrementOrInitEntry(map,k){if(_.has(map,k)){map[k]++}else{map[k]=1}}function decrementOrRemoveEntry(map,k){if(!--map[k]){delete map[k]}}function edgeArgsToId(isDirected,v,w,name){if(!isDirected&&v>w){var tmp=v;v=w;w=tmp}return v+EDGE_KEY_DELIM+w+EDGE_KEY_DELIM+(_.isUndefined(name)?DEFAULT_EDGE_NAME:name)}function edgeArgsToObj(isDirected,v,w,name){if(!isDirected&&v>w){var tmp=v;v=w;w=tmp}var edgeObj={v:v,w:w};if(name){edgeObj.name=name}return edgeObj}function edgeObjToId(isDirected,edgeObj){return edgeArgsToId(isDirected,edgeObj.v,edgeObj.w,edgeObj.name)}},{"./lodash":75}],73:[function(require,module,exports){module.exports={Graph:require("./graph"),version:require("./version")}},{"./graph":72,"./version":76}],74:[function(require,module,exports){var _=require("./lodash"),Graph=require("./graph");module.exports={write:write,read:read};function write(g){var json={options:{directed:g.isDirected(),multigraph:g.isMultigraph(),compound:g.isCompound()},nodes:writeNodes(g),edges:writeEdges(g)};if(!_.isUndefined(g.graph())){json.value=_.clone(g.graph())}return json}function writeNodes(g){return _.map(g.nodes(),function(v){var nodeValue=g.node(v),parent=g.parent(v),node={v:v};if(!_.isUndefined(nodeValue)){node.value=nodeValue}if(!_.isUndefined(parent)){node.parent=parent}return node})}function writeEdges(g){return _.map(g.edges(),function(e){var edgeValue=g.edge(e),edge={v:e.v,w:e.w};if(!_.isUndefined(e.name)){edge.name=e.name}if(!_.isUndefined(edgeValue)){edge.value=edgeValue}return edge})}function read(json){var g=new Graph(json.options).setGraph(json.value);_.each(json.nodes,function(entry){g.setNode(entry.v,entry.value);if(entry.parent){g.setParent(entry.v,entry.parent)}});_.each(json.edges,function(entry){g.setEdge({v:entry.v,w:entry.w,name:entry.name},entry.value)});return g}},{"./graph":72,"./lodash":75}],75:[function(require,module,exports){module.exports=require(20)},{"/Users/andrew/Documents/dev/dagre-d3/lib/lodash.js":20,lodash:77}],76:[function(require,module,exports){module.exports="1.0.1"},{}],77:[function(require,module,exports){(function(global){(function(){var undefined;var arrayPool=[],objectPool=[];var idCounter=0;var keyPrefix=+new Date+"";var largeArraySize=75;var maxPoolSize=40;var whitespace=" \f \ufeff"+"\n\r\u2028\u2029"+" ᠎              ";var reEmptyStringLeading=/\b__p \+= '';/g,reEmptyStringMiddle=/\b(__p \+=) '' \+/g,reEmptyStringTrailing=/(__e\(.*?\)|\b__t\)) \+\n'';/g;var reEsTemplate=/\$\{([^\\}]*(?:\\.[^\\}]*)*)\}/g;var reFlags=/\w*$/;var reFuncName=/^\s*function[ \n\r\t]+\w/;var reInterpolate=/<%=([\s\S]+?)%>/g;var reLeadingSpacesAndZeros=RegExp("^["+whitespace+"]*0+(?=.$)");var reNoMatch=/($^)/;var reThis=/\bthis\b/;var reUnescapedString=/['\n\r\t\u2028\u2029\\]/g;var contextProps=["Array","Boolean","Date","Function","Math","Number","Object","RegExp","String","_","attachEvent","clearTimeout","isFinite","isNaN","parseInt","setTimeout"];var templateCounter=0;var argsClass="[object Arguments]",arrayClass="[object Array]",boolClass="[object Boolean]",dateClass="[object Date]",funcClass="[object Function]",numberClass="[object Number]",objectClass="[object Object]",regexpClass="[object RegExp]",stringClass="[object String]";var 
cloneableClasses={};cloneableClasses[funcClass]=false;cloneableClasses[argsClass]=cloneableClasses[arrayClass]=cloneableClasses[boolClass]=cloneableClasses[dateClass]=cloneableClasses[numberClass]=cloneableClasses[objectClass]=cloneableClasses[regexpClass]=cloneableClasses[stringClass]=true;var debounceOptions={leading:false,maxWait:0,trailing:false};var descriptor={configurable:false,enumerable:false,value:null,writable:false};var objectTypes={"boolean":false,"function":true,object:true,number:false,string:false,undefined:false};var stringEscapes={"\\":"\\","'":"'","\n":"n","\r":"r"," ":"t","\u2028":"u2028","\u2029":"u2029"};var root=objectTypes[typeof window]&&window||this;var freeExports=objectTypes[typeof exports]&&exports&&!exports.nodeType&&exports;var freeModule=objectTypes[typeof module]&&module&&!module.nodeType&&module;var moduleExports=freeModule&&freeModule.exports===freeExports&&freeExports;var freeGlobal=objectTypes[typeof global]&&global;if(freeGlobal&&(freeGlobal.global===freeGlobal||freeGlobal.window===freeGlobal)){root=freeGlobal}function baseIndexOf(array,value,fromIndex){var index=(fromIndex||0)-1,length=array?array.length:0;while(++index-1?0:-1:cache?0:-1}function cachePush(value){var cache=this.cache,type=typeof value;if(type=="boolean"||value==null){cache[value]=true}else{if(type!="number"&&type!="string"){type="object"}var key=type=="number"?value:keyPrefix+value,typeCache=cache[type]||(cache[type]={});if(type=="object"){(typeCache[key]||(typeCache[key]=[])).push(value)}else{typeCache[key]=true}}}function charAtCallback(value){return value.charCodeAt(0)}function compareAscending(a,b){var ac=a.criteria,bc=b.criteria,index=-1,length=ac.length;while(++indexother||typeof value=="undefined"){return 1}if(value/g,evaluate:/<%([\s\S]+?)%>/g,interpolate:reInterpolate,variable:"",imports:{_:lodash}};function baseBind(bindData){var func=bindData[0],partialArgs=bindData[2],thisArg=bindData[4];function bound(){if(partialArgs){var args=slice(partialArgs);push.apply(args,arguments)}if(this instanceof bound){var thisBinding=baseCreate(func.prototype),result=func.apply(thisBinding,args||arguments);return isObject(result)?result:thisBinding}return func.apply(thisArg,args||arguments)}setBindData(bound,bindData);return bound}function baseClone(value,isDeep,callback,stackA,stackB){if(callback){var result=callback(value);if(typeof result!="undefined"){return result}}var isObj=isObject(value);if(isObj){var className=toString.call(value);if(!cloneableClasses[className]){return value}var ctor=ctorByClass[className];switch(className){case boolClass:case dateClass:return new ctor(+value);case numberClass:case stringClass:return new ctor(value);case regexpClass:result=ctor(value.source,reFlags.exec(value));result.lastIndex=value.lastIndex;return result}}else{return value}var isArr=isArray(value);if(isDeep){var initedStack=!stackA;stackA||(stackA=getArray());stackB||(stackB=getArray());var length=stackA.length;while(length--){if(stackA[length]==value){return stackB[length]}}result=isArr?ctor(value.length):{}}else{result=isArr?slice(value):assign({},value)}if(isArr){if(hasOwnProperty.call(value,"index")){result.index=value.index}if(hasOwnProperty.call(value,"input")){result.input=value.input}}if(!isDeep){return result}stackA.push(value);stackB.push(result);(isArr?forEach:forOwn)(value,function(objValue,key){result[key]=baseClone(objValue,isDeep,callback,stackA,stackB)});if(initedStack){releaseArray(stackA);releaseArray(stackB)}return result}function baseCreate(prototype,properties){return 
isObject(prototype)?nativeCreate(prototype):{}}if(!nativeCreate){baseCreate=function(){function Object(){}return function(prototype){if(isObject(prototype)){Object.prototype=prototype;var result=new Object;Object.prototype=null}return result||context.Object()}}()}function baseCreateCallback(func,thisArg,argCount){if(typeof func!="function"){return identity}if(typeof thisArg=="undefined"||!("prototype"in func)){return func}var bindData=func.__bindData__;if(typeof bindData=="undefined"){if(support.funcNames){bindData=!func.name}bindData=bindData||!support.funcDecomp;if(!bindData){var source=fnToString.call(func);if(!support.funcNames){bindData=!reFuncName.test(source)}if(!bindData){bindData=reThis.test(source);setBindData(func,bindData)}}}if(bindData===false||bindData!==true&&bindData[1]&1){return func}switch(argCount){case 1:return function(value){return func.call(thisArg,value)};case 2:return function(a,b){return func.call(thisArg,a,b)};case 3:return function(value,index,collection){return func.call(thisArg,value,index,collection)};case 4:return function(accumulator,value,index,collection){return func.call(thisArg,accumulator,value,index,collection)}}return bind(func,thisArg)}function baseCreateWrapper(bindData){var func=bindData[0],bitmask=bindData[1],partialArgs=bindData[2],partialRightArgs=bindData[3],thisArg=bindData[4],arity=bindData[5];var isBind=bitmask&1,isBindKey=bitmask&2,isCurry=bitmask&4,isCurryBound=bitmask&8,key=func;function bound(){var thisBinding=isBind?thisArg:this;if(partialArgs){var args=slice(partialArgs);push.apply(args,arguments)}if(partialRightArgs||isCurry){args||(args=slice(arguments));if(partialRightArgs){push.apply(args,partialRightArgs)}if(isCurry&&args.length=largeArraySize&&indexOf===baseIndexOf,result=[]; + +if(isLarge){var cache=createCache(values);if(cache){indexOf=cacheIndexOf;values=cache}else{isLarge=false}}while(++index-1}})}}stackA.pop();stackB.pop();if(initedStack){releaseArray(stackA);releaseArray(stackB)}return result}function baseMerge(object,source,callback,stackA,stackB){(isArray(source)?forEach:forOwn)(source,function(source,key){var found,isArr,result=source,value=object[key];if(source&&((isArr=isArray(source))||isPlainObject(source))){var stackLength=stackA.length;while(stackLength--){if(found=stackA[stackLength]==source){value=stackB[stackLength];break}}if(!found){var isShallow;if(callback){result=callback(value,source);if(isShallow=typeof result!="undefined"){value=result}}if(!isShallow){value=isArr?isArray(value)?value:[]:isPlainObject(value)?value:{}}stackA.push(source);stackB.push(value);if(!isShallow){baseMerge(value,source,callback,stackA,stackB)}}}else{if(callback){result=callback(value,source);if(typeof result=="undefined"){result=source}}if(typeof result!="undefined"){value=result}}object[key]=value})}function baseRandom(min,max){return min+floor(nativeRandom()*(max-min+1))}function baseUniq(array,isSorted,callback){var index=-1,indexOf=getIndexOf(),length=array?array.length:0,result=[];var isLarge=!isSorted&&length>=largeArraySize&&indexOf===baseIndexOf,seen=callback||isLarge?getArray():result;if(isLarge){var cache=createCache(seen);indexOf=cacheIndexOf;seen=cache}while(++index":">",'"':""","'":"'"};var htmlUnescapes=invert(htmlEscapes);var reEscapedHtml=RegExp("("+keys(htmlUnescapes).join("|")+")","g"),reUnescapedHtml=RegExp("["+keys(htmlEscapes).join("")+"]","g");var assign=function(object,source,guard){var index,iterable=object,result=iterable;if(!iterable)return result;var args=arguments,argsIndex=0,argsLength=typeof 
guard=="number"?2:args.length;if(argsLength>3&&typeof args[argsLength-2]=="function"){var callback=baseCreateCallback(args[--argsLength-1],args[argsLength--],2)}else if(argsLength>2&&typeof args[argsLength-1]=="function"){callback=args[--argsLength]}while(++argsIndex3&&typeof args[length-2]=="function"){var callback=baseCreateCallback(args[--length-1],args[length--],2)}else if(length>2&&typeof args[length-1]=="function"){callback=args[--length]}var sources=slice(arguments,1,length),index=-1,stackA=getArray(),stackB=getArray();while(++index-1}else if(typeof length=="number"){result=(isString(collection)?collection.indexOf(target,fromIndex):indexOf(collection,target,fromIndex))>-1}else{forOwn(collection,function(value){if(++index>=fromIndex){return!(result=value===target)}})}return result}var countBy=createAggregator(function(result,value,key){hasOwnProperty.call(result,key)?result[key]++:result[key]=1});function every(collection,callback,thisArg){var result=true;callback=lodash.createCallback(callback,thisArg,3);var index=-1,length=collection?collection.length:0;if(typeof length=="number"){while(++indexresult){result=value}}}else{callback=callback==null&&isString(collection)?charAtCallback:lodash.createCallback(callback,thisArg,3);forEach(collection,function(value,index,collection){var current=callback(value,index,collection);if(current>computed){computed=current;result=value}})}return result}function min(collection,callback,thisArg){var computed=Infinity,result=computed;if(typeof callback!="function"&&thisArg&&thisArg[callback]===collection){callback=null}if(callback==null&&isArray(collection)){var index=-1,length=collection.length;while(++index=largeArraySize&&createCache(argsIndex?args[argsIndex]:seen))}}var array=args[0],index=-1,length=array?array.length:0,result=[];outer:while(++index>>1;callback(array[mid])1?arguments:arguments[0],index=-1,length=array?max(pluck(array,"length")):0,result=Array(length<0?0:length);while(++index2?createWrapper(func,17,slice(arguments,2),null,thisArg):createWrapper(func,1,null,null,thisArg)}function bindAll(object){var funcs=arguments.length>1?baseFlatten(arguments,true,false,1):functions(object),index=-1,length=funcs.length;while(++index2?createWrapper(key,19,slice(arguments,2),null,object):createWrapper(key,3,null,null,object)}function compose(){var funcs=arguments,length=funcs.length;while(length--){if(!isFunction(funcs[length])){throw new TypeError}}return function(){var args=arguments,length=funcs.length;while(length--){args=[funcs[length].apply(this,args)]}return args[0]}}function curry(func,arity){arity=typeof arity=="number"?arity:+arity||func.length;return createWrapper(func,4,null,null,null,arity)}function debounce(func,wait,options){var args,maxTimeoutId,result,stamp,thisArg,timeoutId,trailingCall,lastCalled=0,maxWait=false,trailing=true;if(!isFunction(func)){throw new TypeError}wait=nativeMax(0,wait)||0;if(options===true){var leading=true;trailing=false}else if(isObject(options)){leading=options.leading;maxWait="maxWait"in options&&(nativeMax(wait,options.maxWait)||0);trailing="trailing"in options?options.trailing:trailing}var delayed=function(){var remaining=wait-(now()-stamp);if(remaining<=0){if(maxTimeoutId){clearTimeout(maxTimeoutId); + +}var isCalled=trailingCall;maxTimeoutId=timeoutId=trailingCall=undefined;if(isCalled){lastCalled=now();result=func.apply(thisArg,args);if(!timeoutId&&!maxTimeoutId){args=thisArg=null}}}else{timeoutId=setTimeout(delayed,remaining)}};var 
maxDelayed=function(){if(timeoutId){clearTimeout(timeoutId)}maxTimeoutId=timeoutId=trailingCall=undefined;if(trailing||maxWait!==wait){lastCalled=now();result=func.apply(thisArg,args);if(!timeoutId&&!maxTimeoutId){args=thisArg=null}}};return function(){args=arguments;stamp=now();thisArg=this;trailingCall=trailing&&(timeoutId||!leading);if(maxWait===false){var leadingCall=leading&&!timeoutId}else{if(!maxTimeoutId&&!leading){lastCalled=stamp}var remaining=maxWait-(stamp-lastCalled),isCalled=remaining<=0;if(isCalled){if(maxTimeoutId){maxTimeoutId=clearTimeout(maxTimeoutId)}lastCalled=stamp;result=func.apply(thisArg,args)}else if(!maxTimeoutId){maxTimeoutId=setTimeout(maxDelayed,remaining)}}if(isCalled&&timeoutId){timeoutId=clearTimeout(timeoutId)}else if(!timeoutId&&wait!==maxWait){timeoutId=setTimeout(delayed,wait)}if(leadingCall){isCalled=true;result=func.apply(thisArg,args)}if(isCalled&&!timeoutId&&!maxTimeoutId){args=thisArg=null}return result}}function defer(func){if(!isFunction(func)){throw new TypeError}var args=slice(arguments,1);return setTimeout(function(){func.apply(undefined,args)},1)}function delay(func,wait){if(!isFunction(func)){throw new TypeError}var args=slice(arguments,2);return setTimeout(function(){func.apply(undefined,args)},wait)}function memoize(func,resolver){if(!isFunction(func)){throw new TypeError}var memoized=function(){var cache=memoized.cache,key=resolver?resolver.apply(this,arguments):keyPrefix+arguments[0];return hasOwnProperty.call(cache,key)?cache[key]:cache[key]=func.apply(this,arguments)};memoized.cache={};return memoized}function once(func){var ran,result;if(!isFunction(func)){throw new TypeError}return function(){if(ran){return result}ran=true;result=func.apply(this,arguments);func=null;return result}}function partial(func){return createWrapper(func,16,slice(arguments,1))}function partialRight(func){return createWrapper(func,32,null,slice(arguments,1))}function throttle(func,wait,options){var leading=true,trailing=true;if(!isFunction(func)){throw new TypeError}if(options===false){leading=false}else if(isObject(options)){leading="leading"in options?options.leading:leading;trailing="trailing"in options?options.trailing:trailing}debounceOptions.leading=leading;debounceOptions.maxWait=wait;debounceOptions.trailing=trailing;return debounce(func,wait,debounceOptions)}function wrap(value,wrapper){return createWrapper(wrapper,16,[value])}function constant(value){return function(){return value}}function createCallback(func,thisArg,argCount){var type=typeof func;if(func==null||type=="function"){return baseCreateCallback(func,thisArg,argCount)}if(type!="object"){return property(func)}var props=keys(func),key=props[0],a=func[key];if(props.length==1&&a===a&&!isObject(a)){return function(object){var b=object[key];return a===b&&(a!==0||1/a==1/b)}}return function(object){var length=props.length,result=false;while(length--){if(!(result=baseIsEqual(object[props[length]],func[props[length]],null,true))){break}}return result}}function escape(string){return string==null?"":String(string).replace(reUnescapedHtml,escapeHtmlChar)}function identity(value){return value}function mixin(object,source,options){var chain=true,methodNames=source&&functions(source);if(!source||!options&&!methodNames.length){if(options==null){options=source}ctor=lodashWrapper;source=object;object=lodash;methodNames=functions(source)}if(options===false){chain=false}else if(isObject(options)&&"chain"in options){chain=options.chain}var 
ctor=object,isFunc=isFunction(ctor);forEach(methodNames,function(methodName){var func=object[methodName]=source[methodName];if(isFunc){ctor.prototype[methodName]=function(){var chainAll=this.__chain__,value=this.__wrapped__,args=[value];push.apply(args,arguments);var result=func.apply(object,args);if(chain||chainAll){if(value===result&&isObject(result)){return this}result=new ctor(result);result.__chain__=chainAll}return result}}})}function noConflict(){context._=oldDash;return this}function noop(){}var now=isNative(now=Date.now)&&now||function(){return(new Date).getTime()};var parseInt=nativeParseInt(whitespace+"08")==8?nativeParseInt:function(value,radix){return nativeParseInt(isString(value)?value.replace(reLeadingSpacesAndZeros,""):value,radix||0)};function property(key){return function(object){return object[key]}}function random(min,max,floating){var noMin=min==null,noMax=max==null;if(floating==null){if(typeof min=="boolean"&&noMax){floating=min;min=1}else if(!noMax&&typeof max=="boolean"){floating=max;noMax=true}}if(noMin&&noMax){max=1}min=+min||0;if(noMax){max=min;min=0}else{max=+max||0}if(floating||min%1||max%1){var rand=nativeRandom();return nativeMin(min+rand*(max-min+parseFloat("1e-"+((rand+"").length-1))),max)}return baseRandom(min,max)}function result(object,key){if(object){var value=object[key];return isFunction(value)?object[key]():value}}function template(text,data,options){var settings=lodash.templateSettings;text=String(text||"");options=defaults({},options,settings);var imports=defaults({},options.imports,settings.imports),importsKeys=keys(imports),importsValues=values(imports);var isEvaluating,index=0,interpolate=options.interpolate||reNoMatch,source="__p += '";var reDelimiters=RegExp((options.escape||reNoMatch).source+"|"+interpolate.source+"|"+(interpolate===reInterpolate?reEsTemplate:reNoMatch).source+"|"+(options.evaluate||reNoMatch).source+"|$","g");text.replace(reDelimiters,function(match,escapeValue,interpolateValue,esTemplateValue,evaluateValue,offset){interpolateValue||(interpolateValue=esTemplateValue);source+=text.slice(index,offset).replace(reUnescapedString,escapeStringChar);if(escapeValue){source+="' +\n__e("+escapeValue+") +\n'"}if(evaluateValue){isEvaluating=true;source+="';\n"+evaluateValue+";\n__p += '"}if(interpolateValue){source+="' +\n((__t = ("+interpolateValue+")) == null ? 
'' : __t) +\n'"}index=offset+match.length;return match});source+="';\n";var variable=options.variable,hasVariable=variable;if(!hasVariable){variable="obj";source="with ("+variable+") {\n"+source+"\n}\n"}source=(isEvaluating?source.replace(reEmptyStringLeading,""):source).replace(reEmptyStringMiddle,"$1").replace(reEmptyStringTrailing,"$1;");source="function("+variable+") {\n"+(hasVariable?"":variable+" || ("+variable+" = {});\n")+"var __t, __p = '', __e = _.escape"+(isEvaluating?", __j = Array.prototype.join;\n"+"function print() { __p += __j.call(arguments, '') }\n":";\n")+source+"return __p\n}";var sourceURL="\n/*\n//# sourceURL="+(options.sourceURL||"/lodash/template/source["+templateCounter++ +"]")+"\n*/";try{var result=Function(importsKeys,"return "+source+sourceURL).apply(undefined,importsValues)}catch(e){e.source=source;throw e}if(data){return result(data)}result.source=source;return result}function times(n,callback,thisArg){n=(n=+n)>-1?n:0;var index=-1,result=Array(n);callback=baseCreateCallback(callback,thisArg,1);while(++index 0) { - // Take into account the position and width of the last stage's container - var existingStages = stageClusters(); + var existingStages = svgContainer + .selectAll("g.cluster") + .filter("[id*=\"" + VizConstants.stageClusterPrefix + "\"]"); if (!existingStages.empty()) { - var lastStage = existingStages[0].pop(); - var lastStageId = d3.select(lastStage).attr("id"); - var lastStageWidth = toFloat(d3.select("#" + lastStageId + " rect").attr("width")); - var lastStagePosition = getAbsolutePosition(lastStageId); + var lastStage = d3.select(existingStages[0].pop()); + var lastStageId = lastStage.attr("id"); + var lastStageWidth = toFloat(svgContainer + .select("#" + lastStageId) + .select("rect") + .attr("width")); + var lastStagePosition = getAbsolutePosition(lastStage); var offset = lastStagePosition.x + lastStageWidth + VizConstants.stageSep; container.attr("transform", "translate(" + offset + ", 0)"); } } + + // Actually render the stage renderDot(dot, container); + // If there are any incoming edges into this graph, keep track of them to render // them separately later. Note that we cannot draw them now because we need to // put these edges in a separate container that is on top of all stage graphs. @@ -221,15 +218,7 @@ function renderDagVizForJob(svgContainer) { }); }); - // Draw edges that cross stages - if (crossStageEdges.length > 0) { - var container = svgContainer.append("g").attr("id", "cross-stage-edges"); - for (var i = 0; i < crossStageEdges.length; i++) { - var fromRDDId = crossStageEdges[i][0]; - var toRDDId = crossStageEdges[i][1]; - connectRDDs(fromRDDId, toRDDId, container); - } - } + drawCrossStageEdges(crossStageEdges, svgContainer); } /* Render the dot file as an SVG in the given container. */ @@ -243,99 +232,156 @@ function renderDot(dot, container) { renderer(container, g); } -/* Style the visualization we just rendered. 
*/
-function styleDagViz(forJob) {
- graphContainer().selectAll("svg g.cluster rect")
- .style("fill", "white")
- .style("stroke", VizConstants.rddOperationColor)
- .style("stroke-width", "4px")
- .style("stroke-opacity", "0.5");
- graphContainer().selectAll("svg g.cluster text")
- .attr("fill", VizConstants.clusterLabelColor)
- .attr("font-size", "11px");
- graphContainer().selectAll("svg path")
- .style("stroke", VizConstants.edgeColor)
- .style("stroke-width", VizConstants.edgeWidth);
- stageClusters()
- .select("rect")
- .style("stroke", VizConstants.stageColor)
- .style("strokeWidth", "6px");
-
- // Put an arrow at the end of every edge
- // We need to do this because we manually render some edges ourselves
- // For these edges, we borrow the arrow marker generated by dagre-d3
- var dagreD3Marker = graphContainer().select("svg g.edgePaths marker").node();
- graphContainer().select("svg")
- .append(function() { return dagreD3Marker.cloneNode(true); })
- .attr("id", "marker-arrow")
- .select("path")
- .attr("fill", VizConstants.edgeColor)
- .attr("strokeWidth", "0px");
- graphContainer().selectAll("svg g > path").attr("marker-end", "url(#marker-arrow)");
- graphContainer().selectAll("svg g.edgePaths def").remove(); // We no longer need these
-
- // Apply any job or stage specific styles
+/* -------------------- *
+ * | Helper functions | *
+ * -------------------- */
+
+// Helper d3 accessors
+function graphContainer() { return d3.select("#dag-viz-graph"); }
+function metadataContainer() { return d3.select("#dag-viz-metadata"); }
+
+/*
+ * Helper function to draw a label for each cluster.
+ *
+ * We need to do this manually because dagre-d3 does not support labeling clusters.
+ * In general, the clustering support for dagre-d3 is quite limited at this point.
+ */
+function drawClusterLabels(svgContainer, forJob) {
 if (forJob) {
- styleDagVizForJob();
+ var clusterLabelSize = JobPageVizConstants.clusterLabelSize;
+ var stageClusterLabelSize = JobPageVizConstants.stageClusterLabelSize;
 } else {
- styleDagVizForStage();
+ var clusterLabelSize = StagePageVizConstants.clusterLabelSize;
+ var stageClusterLabelSize = StagePageVizConstants.stageClusterLabelSize;
 }
+ svgContainer.selectAll("g.cluster").each(function() {
+ var cluster = d3.select(this);
+ var isStage = cluster.attr("id").indexOf(VizConstants.stageClusterPrefix) > -1;
+ var labelSize = isStage ? stageClusterLabelSize : clusterLabelSize;
+ drawClusterLabel(cluster, labelSize);
+ });
 }
-/* Apply job-page-specific style to the visualization. */
-function styleDagVizForJob() {
- graphContainer().selectAll("svg g.node circle")
- .style("fill", VizConstants.rddColor);
- // TODO: add a legend to explain what a highlighted dot means
- graphContainer().selectAll("svg g.cached circle")
- .style("fill", VizConstants.rddCachedColor);
- graphContainer().selectAll("svg g#cross-stage-edges path")
- .style("fill", "none");
+/*
+ * Helper function to draw a label for the given cluster element based on its name.
+ *
+ * In the process, we need to expand the bounding box to make room for the label.
+ * We need to do this because dagre-d3 did not take this into account when it first
+ * rendered the bounding boxes. Note that this means we need to adjust the view box
+ * of the SVG afterwards since we shifted a few boxes around.
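+ * (The view box is recomputed in the resizeSvg helper defined below.)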
+ */
+function drawClusterLabel(d3cluster, fontSize) {
+ var cluster = d3cluster;
+ var rect = d3cluster.select("rect");
+ rect.attr("y", toFloat(rect.attr("y")) - fontSize);
+ rect.attr("height", toFloat(rect.attr("height")) + fontSize);
+ var labelX = toFloat(rect.attr("x")) + toFloat(rect.attr("width")) - fontSize / 2;
+ var labelY = toFloat(rect.attr("y")) + fontSize * 1.5;
+ var labelText = cluster.attr("name").replace(VizConstants.clusterPrefix, "");
+ cluster.append("text")
+ .attr("x", labelX)
+ .attr("y", labelY)
+ .attr("text-anchor", "end")
+ .style("font-size", fontSize)
+ .text(labelText);
 }
-/* Apply stage-page-specific style to the visualization. */
-function styleDagVizForStage() {
- graphContainer().selectAll("svg g.node rect")
- .style("fill", "none")
- .style("stroke", VizConstants.rddColor)
- .style("stroke-width", "2px")
- .attr("rx", "5") // round corners
- .attr("ry", "5");
- // TODO: add a legend to explain what a highlighted RDD means
- graphContainer().selectAll("svg g.cached rect")
- .style("stroke", VizConstants.rddCachedColor);
- graphContainer().selectAll("svg g.node g.label text tspan")
- .style("fill", VizConstants.rddColor);
+/*
+ * Helper function to size the SVG appropriately such that all elements are displayed.
+ * This assumes that all outermost elements are clusters (rectangles).
+ */
+function resizeSvg(svg) {
+ var allClusters = svg.selectAll("g.cluster rect")[0];
+ var startX = -VizConstants.svgMarginX +
+ toFloat(d3.min(allClusters, function(e) {
+ return getAbsolutePosition(d3.select(e)).x;
+ }));
+ var startY = -VizConstants.svgMarginY +
+ toFloat(d3.min(allClusters, function(e) {
+ return getAbsolutePosition(d3.select(e)).y;
+ }));
+ var endX = VizConstants.svgMarginX +
+ toFloat(d3.max(allClusters, function(e) {
+ var t = d3.select(e);
+ return getAbsolutePosition(t).x + toFloat(t.attr("width"));
+ }));
+ var endY = VizConstants.svgMarginY +
+ toFloat(d3.max(allClusters, function(e) {
+ var t = d3.select(e);
+ return getAbsolutePosition(t).y + toFloat(t.attr("height"));
+ }));
+ var width = endX - startX;
+ var height = endY - startY;
+ svg.attr("viewBox", startX + " " + startY + " " + width + " " + height)
+ .attr("width", width)
+ .attr("height", height);
 }
 /*
- * (Job page only) Helper method to compute the absolute
- * position of the group element identified by the given ID.
+ * (Job page only) Helper function to draw edges that cross stage boundaries.
+ * We need to do this manually because we render each stage separately in dagre-d3.
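+ * The edges are drawn last, in their own container, so that they end up on top
+ * of all the stage graphs.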
*/
-function getAbsolutePosition(groupId) {
- var obj = d3.select("#" + groupId).filter("g");
- var _x = 0, _y = 0;
+function drawCrossStageEdges(edges, svgContainer) {
+ if (edges.length == 0) {
+ return;
+ }
+ // Draw the paths first
+ var edgesContainer = svgContainer.append("g").attr("id", "cross-stage-edges");
+ for (var i = 0; i < edges.length; i++) {
+ var fromRDDId = edges[i][0];
+ var toRDDId = edges[i][1];
+ connectRDDs(fromRDDId, toRDDId, edgesContainer, svgContainer);
+ }
+ // Now draw the arrows by borrowing the arrow marker generated by dagre-d3.
+ // Note: .node() returns a raw DOM element (or null), not a d3 selection,
+ // so we null-check it rather than calling the selection method .empty().
+ var dagreD3Marker = svgContainer.select("g.edgePaths marker").node();
+ if (dagreD3Marker) {
+ svgContainer
+ .append(function() { return dagreD3Marker.cloneNode(true); })
+ .attr("id", "marker-arrow");
+ svgContainer.selectAll("g > path").attr("marker-end", "url(#marker-arrow)");
+ svgContainer.selectAll("g.edgePaths def").remove(); // We no longer need these
+ }
+}
+
+/*
+ * (Job page only) Helper function to compute the absolute
+ * position of the specified element in our graph.
+ */
+function getAbsolutePosition(d3selection) {
+ if (d3selection.empty()) {
+ throw "Attempted to get absolute position of an empty selection.";
+ }
+ var obj = d3selection;
+ var _x = toFloat(obj.attr("x")) || 0;
+ var _y = toFloat(obj.attr("y")) || 0;
 while (!obj.empty()) {
 var transformText = obj.attr("transform");
- var translate = d3.transform(transformText).translate
- _x += translate[0];
- _y += translate[1];
- obj = d3.select(obj.node().parentNode).filter("g")
+ if (transformText) {
+ var translate = d3.transform(transformText).translate;
+ _x += toFloat(translate[0]);
+ _y += toFloat(translate[1]);
+ }
+ // Climb upwards to find how our parents are translated
+ obj = d3.select(obj.node().parentNode);
+ // Stop when we've reached the graph container itself
+ if (obj.node() == graphContainer().node()) {
+ break;
+ }
 }
 return { x: _x, y: _y };
 }
-/* (Job page only) Connect two RDD nodes with a curved edge. */
-function connectRDDs(fromRDDId, toRDDId, container) {
+/* (Job page only) Helper function to connect two RDDs with a curved edge. */
+function connectRDDs(fromRDDId, toRDDId, edgesContainer, svgContainer) {
 var fromNodeId = VizConstants.nodePrefix + fromRDDId;
- var toNodeId = VizConstants.nodePrefix + toRDDId
- var fromPos = getAbsolutePosition(fromNodeId);
- var toPos = getAbsolutePosition(toNodeId);
+ var toNodeId = VizConstants.nodePrefix + toRDDId;
+ var fromPos = getAbsolutePosition(svgContainer.select("#" + fromNodeId));
+ var toPos = getAbsolutePosition(svgContainer.select("#" + toNodeId));
 // On the job page, RDDs are rendered as dots (circles). When rendering the path,
 // we need to account for the radii of these circles. Otherwise the arrow heads
 // will bleed into the circle itself.
- var delta = toFloat(graphContainer()
+ var delta = toFloat(svgContainer
 .select("g.node#" + toNodeId)
 .select("circle")
 .attr("r"));
@@ -375,18 +421,15 @@ function connectRDDs(fromRDDId, toRDDId, container) {
 }
 var line = d3.svg.line().interpolate("basis");
- container.append("path").datum(points).attr("d", line);
+ edgesContainer.append("path").datum(points).attr("d", line);
 }
-/* Helper d3 accessor to clusters that represent stages. */
-function stageClusters() {
- return graphContainer().selectAll("g.cluster").filter(function() {
- return d3.select(this).attr("id").indexOf(VizConstants.stageClusterPrefix) > -1;
- });
-}
-
-/* Helper method to convert attributes to numeric values.
*/ +/* Helper function to convert attributes to numeric values. */ function toFloat(f) { - return parseFloat(f.replace(/px$/, "")); + if (f) { + return parseFloat(f.toString().replace(/px$/, "")); + } else { + return f; + } } diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 2f3fb181e4026..e2d03f8342315 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -156,13 +156,10 @@ private[spark] object UIUtils extends Logging { def commonHeaderNodes: Seq[Node] = { - - - - + + + + @@ -174,6 +171,7 @@ private[spark] object UIUtils extends Logging { } def vizHeaderNodes: Seq[Node] = { + @@ -358,7 +356,7 @@ private[spark] object UIUtils extends Logging {
{ graphs.map { g =>
-
+## ElementwiseProduct
+
+ElementwiseProduct multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector `v` and the transforming vector `w`, yielding a result vector.
+
+`\[ \begin{pmatrix}
+v_1 \\
+\vdots \\
+v_N
+\end{pmatrix} \circ \begin{pmatrix}
+ w_1 \\
+ \vdots \\
+ w_N
+ \end{pmatrix}
+= \begin{pmatrix}
+ v_1 w_1 \\
+ \vdots \\
+ v_N w_N
+ \end{pmatrix}
+\]`
+
+[`ElementwiseProduct`](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) has the following parameter in the constructor:
+
+* `w`: the transforming vector.
+
+`ElementwiseProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer), which can apply the weighting to a single `Vector` to produce a transformed `Vector`, or to an `RDD[Vector]` to produce a transformed `RDD[Vector]`.
+
+### Example
+
+The example below demonstrates how to load a simple vectors file, extract a set of vectors, and then transform those vectors using a transforming vector.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+
+// Load and parse the data:
+val data = sc.textFile("data/mllib/kmeans_data.txt")
+val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
+
+val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+val transformer = new ElementwiseProduct(transformingVector)
+
+// Batch transform and per-row transform give the same results:
+val transformedData = transformer.transform(parsedData)
+val transformedData2 = parsedData.map(x => transformer.transform(x))
+
+{% endhighlight %}
+</div>
+</div>
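+The same patch also adds a `spark.ml` wrapper, `org.apache.spark.ml.feature.ElementwiseProduct` (see the new file below). As a sketch of how that wrapper would be used on a DataFrame with a vector column, assuming the standard `UnaryTransformer` column setters (`setInputCol`/`setOutputCol`), which the doc above does not show, and a `sqlContext` in scope:
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+
+// Assumed setup: a SQLContext named sqlContext, as the other examples assume sc.
+val dataset = sqlContext.createDataFrame(Seq(
+  (0, Vectors.dense(1.0, 2.0, 3.0))
+)).toDF("id", "features")
+
+val transformer = new ElementwiseProduct()
+  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures")
+
+// Each "features" vector is multiplied element-wise by the scaling vector.
+transformer.transform(dataset).show()
+{% endhighlight %}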
+ + diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala new file mode 100644 index 0000000000000..f8b56293e3ccc --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.AlphaComponent +import org.apache.spark.ml.UnaryTransformer +import org.apache.spark.ml.param.Param +import org.apache.spark.mllib.feature +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.sql.types.DataType + +/** + * :: AlphaComponent :: + * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a + * provided "weight" vector. In other words, it scales each column of the dataset by a scalar + * multiplier. + */ +@AlphaComponent +class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] { + + /** + * the vector to multiply with input vectors + * @group param + */ + val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product") + + /** @group setParam */ + def setScalingVec(value: Vector): this.type = set(scalingVec, value) + + /** @group getParam */ + def getScalingVec: Vector = getOrDefault(scalingVec) + + override protected def createTransformFunc: Vector => Vector = { + require(params.contains(scalingVec), s"transformation requires a weight vector") + val elemScaler = new feature.ElementwiseProduct($(scalingVec)) + elemScaler.transform + } + + override protected def outputDataType: DataType = new VectorUDT() +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala new file mode 100644 index 0000000000000..b0985baf9b278 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.mllib.linalg._ + +/** + * :: Experimental :: + * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a + * provided "weight" vector. In other words, it scales each column of the dataset by a scalar + * multiplier. + * @param scalingVector The values used to scale the reference vector's individual components. + */ +@Experimental +class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer { + + /** + * Does the hadamard product transformation. + * + * @param vector vector to be transformed. + * @return transformed vector. + */ + override def transform(vector: Vector): Vector = { + require(vector.size == scalingVector.size, + s"vector sizes do not match: Expected ${scalingVector.size} but found ${vector.size}") + vector match { + case dv: DenseVector => + val values: Array[Double] = dv.values.clone() + val dim = scalingVector.size + var i = 0 + while (i < dim) { + values(i) *= scalingVector(i) + i += 1 + } + Vectors.dense(values) + case SparseVector(size, indices, vs) => + val values = vs.clone() + val dim = values.length + var i = 0 + while (i < dim) { + values(i) *= scalingVector(indices(i)) + i += 1 + } + Vectors.sparse(size, indices, values) + case v => throw new IllegalArgumentException("Does not support vector type " + v.getClass) + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala new file mode 100644 index 0000000000000..f3a482abda873 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.feature + +import org.scalatest.FunSuite + +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ + +class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext { + + test("elementwise (hadamard) product should properly apply vector to dense data set") { + val denseData = Array( + Vectors.dense(1.0, 4.0, 1.9, -9.0) + ) + val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) + val transformer = new ElementwiseProduct(scalingVec) + val transformedData = transformer.transform(sc.makeRDD(denseData)) + val transformedVecs = transformedData.collect() + val transformedVec = transformedVecs(0) + val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) + assert(transformedVec ~== expectedVec absTol 1E-5, + s"Expected transformed vector $expectedVec but found $transformedVec") + } + + test("elementwise (hadamard) product should properly apply vector to sparse data set") { + val sparseData = Array( + Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) + ) + val dataRDD = sc.parallelize(sparseData, 3) + val scalingVec = Vectors.dense(1.0, 0.0, 0.5) + val transformer = new ElementwiseProduct(scalingVec) + val data2 = sparseData.map(transformer.transform) + val data2RDD = transformer.transform(dataRDD) + + assert((sparseData, data2, data2RDD.collect()).zipped.forall { + case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true + case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true + case _ => false + }, "The vector type should be preserved after hadamard product") + + assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) + assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) + } +} From e43803b8f477b2c8d28836ac163cb54328d13f1a Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 7 May 2015 15:45:37 -0700 Subject: [PATCH 31/48] [SPARK-6948] [MLLIB] compress vectors in VectorAssembler The compression is based on storage. 
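For context, a minimal sketch of what storage-based compression means here, assuming only the `compressed` helper on MLlib vectors that this patch relies on: it keeps whichever of the dense and sparse encodings of a vector is smaller.

{% highlight scala %}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}

// Mostly zeros: the sparse encoding (indices plus values) is smaller,
// so compression keeps a SparseVector.
val mostlyZero = Vectors.dense(0.0, 0.0, 0.0, 4.0).compressed
assert(mostlyZero.isInstanceOf[SparseVector])

// No zeros at all: a plain value array is smaller, so the same call
// returns a DenseVector instead.
val allNonzero = Vectors.sparse(4, Array(0, 1, 2, 3), Array(1.0, 2.0, 3.0, 4.0)).compressed
assert(allNonzero.isInstanceOf[DenseVector])
{% endhighlight %}

This is the same dense-in/sparse-out and sparse-in/dense-out contract asserted by the new `VectorAssemblerSuite` test and the updated Python doctests below.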
brkyvz

Author: Xiangrui Meng

Closes #5985 from mengxr/SPARK-6948 and squashes the following commits:

df56a00 [Xiangrui Meng] update python tests
6d90d45 [Xiangrui Meng] compress vectors in VectorAssembler
---
 .../org/apache/spark/ml/feature/VectorAssembler.scala | 2 +-
 .../apache/spark/ml/feature/VectorAssemblerSuite.scala | 10 +++++++++-
 python/pyspark/ml/feature.py | 6 +++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index b5a69cee6daf3..796758a70ef18 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -102,6 +102,6 @@ object VectorAssembler {
 case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
 }
- Vectors.sparse(cur, indices.result(), values.result())
+ Vectors.sparse(cur, indices.result(), values.result()).compressed
 }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
index 57d0278e03639..0db27607bc274 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
 import org.scalatest.FunSuite
 import org.apache.spark.SparkException
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{Row, SQLContext}
@@ -48,6 +48,14 @@ class VectorAssemblerSuite extends FunSuite with MLlibTestSparkContext {
 }
 }
+ test("assemble should compress vectors") {
+ import org.apache.spark.ml.feature.VectorAssembler.assemble
+ val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0))
+ assert(v1.isInstanceOf[SparseVector])
+ val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0)))
+ assert(v2.isInstanceOf[DenseVector])
+ }
+
 test("VectorAssembler") {
 val df = sqlContext.createDataFrame(Seq(
 (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8a0fdddd2d9b5..705a368192c24 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -121,12 +121,12 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol):
 >>> df = sc.parallelize([Row(a=1, b=0, c=3)]).toDF()
 >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")
 >>> vecAssembler.transform(df).head().features
- SparseVector(3, {0: 1.0, 2: 3.0})
+ DenseVector([1.0, 0.0, 3.0])
 >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs
- SparseVector(3, {0: 1.0, 2: 3.0})
+ DenseVector([1.0, 0.0, 3.0])
 >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"}
 >>> vecAssembler.transform(df, params).head().vector
- SparseVector(2, {1: 1.0})
+ DenseVector([0.0, 1.0])
 """
 _java_class = "org.apache.spark.ml.feature.VectorAssembler"

From 97d1182af63d55abab44521171652c81c56c6af6 Mon Sep 17 00:00:00 2001
From: scwf
Date: Thu, 7 May 2015 16:21:24 -0700
Subject: [PATCH 32/48] [SQL] [MINOR] make star and multialias extend NamedExpression

`Star` and `MultiAlias` are only used in the `analyzer` and will be
substituted after analysis, so, just like `Alias`, they do not need to extend `Attribute`.

Author: scwf

Closes #5928 from scwf/attribute and squashes the following commits:

73a0560 [scwf] star and multialias do not need extend attribute
---
 .../sql/catalyst/analysis/unresolved.scala | 20 +++++--------------
 .../expressions/namedExpressions.scala | 2 +-
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index 3f567e3e8b2a6..eb736ac3290e8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -95,7 +95,7 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E
 * Represents all of the input attributes to a given relational operator, for example in
 * "SELECT * FROM ...". A [[Star]] gets automatically expanded during analysis.
 */
-trait Star extends Attribute with trees.LeafNode[Expression] {
+trait Star extends NamedExpression with trees.LeafNode[Expression] {
 self: Product =>
 override def name: String = throw new UnresolvedException(this, "name")
@@ -103,13 +103,9 @@ trait Star extends Attribute with trees.LeafNode[Expression] {
 override def dataType: DataType = throw new UnresolvedException(this, "dataType")
 override def nullable: Boolean = throw new UnresolvedException(this, "nullable")
 override def qualifiers: Seq[String] = throw new UnresolvedException(this, "qualifiers")
+ override def toAttribute: Attribute = throw new UnresolvedException(this, "toAttribute")
 override lazy val resolved = false
- override def newInstance(): Star = this
- override def withNullability(newNullability: Boolean): Star = this
- override def withQualifiers(newQualifiers: Seq[String]): Star = this
- override def withName(newName: String): Star = this
-
 // Star gets expanded at runtime so we never evaluate a Star.
 override def eval(input: Row = null): EvaluatedType =
 throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}")
@@ -154,7 +150,7 @@ case class UnresolvedStar(table: Option[String]) extends Star {
 * @param names the names to be associated with each output of computing [[child]].
 */
 case class MultiAlias(child: Expression, names: Seq[String])
- extends Attribute with trees.UnaryNode[Expression] {
+ extends NamedExpression with trees.UnaryNode[Expression] {
 override def name: String = throw new UnresolvedException(this, "name")
@@ -166,15 +162,9 @@ case class MultiAlias(child: Expression, names: Seq[String])
 override def qualifiers: Seq[String] = throw new UnresolvedException(this, "qualifiers")
- override lazy val resolved = false
-
- override def newInstance(): MultiAlias = this
+ override def toAttribute: Attribute = throw new UnresolvedException(this, "toAttribute")
- override def withNullability(newNullability: Boolean): MultiAlias = this
-
- override def withQualifiers(newQualifiers: Seq[String]): MultiAlias = this
-
- override def withName(newName: String): MultiAlias = this
+ override lazy val resolved = false
 override def eval(input: Row = null): EvaluatedType =
 throw new TreeNodeException(this, s"No function to evaluate expression.
type: ${this.nodeName}") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 57ace2a14f77c..a9170589f8c6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -86,7 +86,7 @@ abstract class Attribute extends NamedExpression { def withQualifiers(newQualifiers: Seq[String]): Attribute def withName(newName: String): Attribute - def toAttribute: Attribute = this + override def toAttribute: Attribute = this def newInstance(): Attribute } From ea3077f19c18b5556a3632b36771aeb153746ff5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 7 May 2015 16:22:45 -0700 Subject: [PATCH 33/48] [SPARK-7277] [SQL] Throw exception if the property mapred.reduce.tasks is set to -1 JIRA: https://issues.apache.org/jira/browse/SPARK-7277 As automatically determining the number of reducers is not supported (`mapred.reduce.tasks` is set to `-1`), we should throw exception to users. Author: Liang-Chi Hsieh Closes #5811 from viirya/no_neg_reduce_tasks and squashes the following commits: e518f96 [Liang-Chi Hsieh] Consider other wrong setting values. fd9c817 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into no_neg_reduce_tasks 4ede705 [Liang-Chi Hsieh] Throw exception instead of warning message. 68a1c70 [Liang-Chi Hsieh] Show warning message if mapred.reduce.tasks is set to -1. --- .../org/apache/spark/sql/execution/commands.scala | 10 ++++++++-- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 98df5bef34efa..65687db4e6235 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -84,8 +84,14 @@ case class SetCommand( logWarning( s"Property ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} is deprecated, " + s"automatically converted to ${SQLConf.SHUFFLE_PARTITIONS} instead.") - sqlContext.setConf(SQLConf.SHUFFLE_PARTITIONS, value) - Seq(Row(s"${SQLConf.SHUFFLE_PARTITIONS}=$value")) + if (value.toInt < 1) { + val msg = s"Setting negative ${SQLConf.Deprecated.MAPRED_REDUCE_TASKS} for automatically " + + "determining the number of reducers is not supported." + throw new IllegalArgumentException(msg) + } else { + sqlContext.setConf(SQLConf.SHUFFLE_PARTITIONS, value) + Seq(Row(s"${SQLConf.SHUFFLE_PARTITIONS}=$value")) + } // Configures a single property. 
case Some((key, Some(value))) =>

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 208cec6a32d4d..77be3b8b206c0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -871,6 +871,16 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
 conf.clear()
 }
+ test("SET commands with illegal or inappropriate argument") {
+ conf.clear()
+ // Setting negative mapred.reduce.tasks to automatically determine
+ // the number of reducers is not supported
+ intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-1"))
+ intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-01"))
+ intercept[IllegalArgumentException](sql(s"SET mapred.reduce.tasks=-2"))
+ conf.clear()
+ }
+
 test("apply schema") {
 val schema1 = StructType(
 StructField("f1", IntegerType, false) ::

From 937ba798c56770ec54276b9259e47ae65ee93967 Mon Sep 17 00:00:00 2001
From: Iulian Dragos
Date: Thu, 7 May 2015 16:24:11 -0700
Subject: [PATCH 34/48] [SPARK-5281] [SQL] Registering table on RDD is giving MissingRequirementError

Go through the context classloader when reflecting on user types in ScalaReflection.

Replaced calls to `typeOf` with `typeTag[T].in(mirror)`. The convenience method assumes all types can be found in the classloader that loaded scala-reflect (the primordial classloader). This assumption is not valid in all contexts (sbt console, Eclipse launchers).

Fixed SPARK-5281

Author: Iulian Dragos

Closes #5981 from dragos/issue/mirrors-missing-requirement-error and squashes the following commits:

d103e70 [Iulian Dragos] Go through the context classloader when reflecting on user types in ScalaReflection
---
 .../spark/sql/catalyst/ScalaReflection.scala | 57 ++++++++++++-------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
index c52965507c715..6998cc8d9666d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
@@ -27,6 +27,7 @@ import org.apache.spark.sql.types._
 */
 object ScalaReflection extends ScalaReflection {
 val universe: scala.reflect.runtime.universe.type = scala.reflect.runtime.universe
+ val mirror: universe.Mirror = universe.runtimeMirror(Thread.currentThread().getContextClassLoader)
 }
 /**
@@ -36,6 +37,9 @@ trait ScalaReflection {
 /** The universe we work in (runtime or macro) */
 val universe: scala.reflect.api.Universe
+ /** The mirror used to access types in the universe */
+ val mirror: universe.Mirror
+
 import universe._
 // The Predef.Map is scala.collection.immutable.Map.
@@ -52,7 +56,19 @@ trait ScalaReflection {
 /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */
 def schemaFor[T: TypeTag]: Schema =
- ScalaReflectionLock.synchronized { schemaFor(typeOf[T]) }
+ ScalaReflectionLock.synchronized { schemaFor(localTypeOf[T]) }
+
+ /**
+ * Return the Scala Type for `T` in the current classloader mirror.
+ *
+ * Use this method instead of the convenience method `universe.typeOf`, which
+ * assumes that all types can be found in the classloader that loaded scala-reflect classes.
+ * That's not necessarily the case when running using Eclipse launchers or even + * Sbt console or test (without `fork := true`). + * + * @see SPARK-5281 + */ + private def localTypeOf[T: TypeTag]: `Type` = typeTag[T].in(mirror).tpe /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */ def schemaFor(tpe: `Type`): Schema = ScalaReflectionLock.synchronized { @@ -67,25 +83,25 @@ trait ScalaReflection { val udt = Utils.classForName(className) .getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance() Schema(udt, nullable = true) - case t if t <:< typeOf[Option[_]] => + case t if t <:< localTypeOf[Option[_]] => val TypeRef(_, _, Seq(optType)) = t Schema(schemaFor(optType).dataType, nullable = true) // Need to decide if we actually need a special type here. - case t if t <:< typeOf[Array[Byte]] => Schema(BinaryType, nullable = true) - case t if t <:< typeOf[Array[_]] => + case t if t <:< localTypeOf[Array[Byte]] => Schema(BinaryType, nullable = true) + case t if t <:< localTypeOf[Array[_]] => val TypeRef(_, _, Seq(elementType)) = t val Schema(dataType, nullable) = schemaFor(elementType) Schema(ArrayType(dataType, containsNull = nullable), nullable = true) - case t if t <:< typeOf[Seq[_]] => + case t if t <:< localTypeOf[Seq[_]] => val TypeRef(_, _, Seq(elementType)) = t val Schema(dataType, nullable) = schemaFor(elementType) Schema(ArrayType(dataType, containsNull = nullable), nullable = true) - case t if t <:< typeOf[Map[_, _]] => + case t if t <:< localTypeOf[Map[_, _]] => val TypeRef(_, _, Seq(keyType, valueType)) = t val Schema(valueDataType, valueNullable) = schemaFor(valueType) Schema(MapType(schemaFor(keyType).dataType, valueDataType, valueContainsNull = valueNullable), nullable = true) - case t if t <:< typeOf[Product] => + case t if t <:< localTypeOf[Product] => val formalTypeArgs = t.typeSymbol.asClass.typeParams val TypeRef(_, _, actualTypeArgs) = t val constructorSymbol = t.member(nme.CONSTRUCTOR) @@ -107,19 +123,20 @@ trait ScalaReflection { schemaFor(p.typeSignature.substituteTypes(formalTypeArgs, actualTypeArgs)) StructField(p.name.toString, dataType, nullable) }), nullable = true) - case t if t <:< typeOf[String] => Schema(StringType, nullable = true) - case t if t <:< typeOf[java.sql.Timestamp] => Schema(TimestampType, nullable = true) - case t if t <:< typeOf[java.sql.Date] => Schema(DateType, nullable = true) - case t if t <:< typeOf[BigDecimal] => Schema(DecimalType.Unlimited, nullable = true) - case t if t <:< typeOf[java.math.BigDecimal] => Schema(DecimalType.Unlimited, nullable = true) - case t if t <:< typeOf[Decimal] => Schema(DecimalType.Unlimited, nullable = true) - case t if t <:< typeOf[java.lang.Integer] => Schema(IntegerType, nullable = true) - case t if t <:< typeOf[java.lang.Long] => Schema(LongType, nullable = true) - case t if t <:< typeOf[java.lang.Double] => Schema(DoubleType, nullable = true) - case t if t <:< typeOf[java.lang.Float] => Schema(FloatType, nullable = true) - case t if t <:< typeOf[java.lang.Short] => Schema(ShortType, nullable = true) - case t if t <:< typeOf[java.lang.Byte] => Schema(ByteType, nullable = true) - case t if t <:< typeOf[java.lang.Boolean] => Schema(BooleanType, nullable = true) + case t if t <:< localTypeOf[String] => Schema(StringType, nullable = true) + case t if t <:< localTypeOf[java.sql.Timestamp] => Schema(TimestampType, nullable = true) + case t if t <:< localTypeOf[java.sql.Date] => Schema(DateType, nullable = true) + case t if t <:< localTypeOf[BigDecimal] => 
Schema(DecimalType.Unlimited, nullable = true)
+    case t if t <:< localTypeOf[java.math.BigDecimal] =>
+      Schema(DecimalType.Unlimited, nullable = true)
+    case t if t <:< localTypeOf[Decimal] => Schema(DecimalType.Unlimited, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Integer] => Schema(IntegerType, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Long] => Schema(LongType, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Double] => Schema(DoubleType, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Float] => Schema(FloatType, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Short] => Schema(ShortType, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Byte] => Schema(ByteType, nullable = true)
+    case t if t <:< localTypeOf[java.lang.Boolean] => Schema(BooleanType, nullable = true)
     case t if t <:< definitions.IntTpe => Schema(IntegerType, nullable = false)
     case t if t <:< definitions.LongTpe => Schema(LongType, nullable = false)
     case t if t <:< definitions.DoubleTpe => Schema(DoubleType, nullable = false)

From 35f0173b8f67e2e506fc4575be6430cfb66e2238 Mon Sep 17 00:00:00 2001
From: Wenchen Fan
Date: Thu, 7 May 2015 16:26:49 -0700
Subject: [PATCH 35/48] [SPARK-2155] [SQL] [WHEN D THEN E] [ELSE F] add
 CaseKeyWhen for "CASE a WHEN b THEN c * END"

Avoid translating to CaseWhen, which would evaluate the key expression many times.

Author: Wenchen Fan

Closes #5979 from cloud-fan/condition and squashes the following commits:

3ce54e1 [Wenchen Fan] add CaseKeyWhen
---
 .../apache/spark/sql/catalyst/SqlParser.scala |  10 +-
 .../catalyst/analysis/HiveTypeCoercion.scala  |  43 +++---
 .../sql/catalyst/expressions/Expression.scala |   2 +-
 .../sql/catalyst/expressions/predicates.scala | 135 ++++++++++++------
 .../ExpressionEvaluationSuite.scala           |  26 ++++
 .../spark/sql/DataFrameNaFunctions.scala      |   9 +-
 .../org/apache/spark/sql/hive/HiveQl.scala    |  12 +-
 .../sql/hive/execution/SQLQuerySuite.scala    |   7 +
 8 files changed, 159 insertions(+), 85 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index 1d3a2dc0d9bb0..b06bfb2ce8c8e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -296,13 +296,13 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
     | LOWER ~ "(" ~> expression <~ ")" ^^ { case exp => Lower(exp) }
     | IF ~ "(" ~> expression ~ ("," ~> expression) ~ ("," ~> expression) <~ ")" ^^
       { case c ~ t ~ f => If(c, t, f) }
-    | CASE ~> expression.? ~ (WHEN ~> expression ~ (THEN ~> expression)).* ~
+    | CASE ~> expression.? ~ rep1(WHEN ~> expression ~ (THEN ~> expression)) ~
       (ELSE ~> expression).?
<~ END ^^ { case casePart ~ altPart ~ elsePart => - val altExprs = altPart.flatMap { case whenExpr ~ thenExpr => - Seq(casePart.fold(whenExpr)(EqualTo(_, whenExpr)), thenExpr) - } - CaseWhen(altExprs ++ elsePart.toList) + val branches = altPart.flatMap { case whenExpr ~ thenExpr => + Seq(whenExpr, thenExpr) + } ++ elsePart + casePart.map(CaseKeyWhen(_, branches)).getOrElse(CaseWhen(branches)) } | (SUBSTR | SUBSTRING) ~ "(" ~> expression ~ ("," ~> expression) <~ ")" ^^ { case s ~ p => Substring(s, p, Literal(Integer.MAX_VALUE)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 873c75c525c3b..168a4e30eab86 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -631,31 +631,24 @@ trait HiveTypeCoercion { import HiveTypeCoercion._ def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { - case cw @ CaseWhen(branches) if !cw.resolved && !branches.exists(!_.resolved) => - val valueTypes = branches.sliding(2, 2).map { - case Seq(_, value) => value.dataType - case Seq(elseVal) => elseVal.dataType - }.toSeq - - logDebug(s"Input values for null casting ${valueTypes.mkString(",")}") - - if (valueTypes.distinct.size > 1) { - val commonType = valueTypes.reduce { (v1, v2) => - findTightestCommonType(v1, v2) - .getOrElse(sys.error( - s"Types in CASE WHEN must be the same or coercible to a common type: $v1 != $v2")) - } - val transformedBranches = branches.sliding(2, 2).map { - case Seq(cond, value) if value.dataType != commonType => - Seq(cond, Cast(value, commonType)) - case Seq(elseVal) if elseVal.dataType != commonType => - Seq(Cast(elseVal, commonType)) - case s => s - }.reduce(_ ++ _) - CaseWhen(transformedBranches) - } else { - // Types match up. Hopefully some other rule fixes whatever is wrong with resolution. - cw + case cw: CaseWhenLike if !cw.resolved && cw.childrenResolved && !cw.valueTypesEqual => + logDebug(s"Input values for null casting ${cw.valueTypes.mkString(",")}") + val commonType = cw.valueTypes.reduce { (v1, v2) => + findTightestCommonType(v1, v2).getOrElse(sys.error( + s"Types in CASE WHEN must be the same or coercible to a common type: $v1 != $v2")) + } + val transformedBranches = cw.branches.sliding(2, 2).map { + case Seq(when, value) if value.dataType != commonType => + Seq(when, Cast(value, commonType)) + case Seq(elseVal) if elseVal.dataType != commonType => + Seq(Cast(elseVal, commonType)) + case s => s + }.reduce(_ ++ _) + cw match { + case _: CaseWhen => + CaseWhen(transformedBranches) + case CaseKeyWhen(key, _) => + CaseKeyWhen(key, transformedBranches) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 4fd1bc4dd642d..0837a3179d897 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -64,7 +64,7 @@ abstract class Expression extends TreeNode[Expression] { * Returns true if all the children of this expression have been resolved to a specific schema * and false if any still contains any unresolved placeholders. 
*/ - def childrenResolved: Boolean = !children.exists(!_.resolved) + def childrenResolved: Boolean = children.forall(_.resolved) /** * Returns a string representation of this expression that does not have developer centric diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 26c38c56c04f5..50b0f3ee5f93f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -353,79 +353,134 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi override def toString: String = s"if ($predicate) $trueValue else $falseValue" } +trait CaseWhenLike extends Expression { + self: Product => + + type EvaluatedType = Any + + // Note that `branches` are considered in consecutive pairs (cond, val), and the optional last + // element is the value for the default catch-all case (if provided). + // Hence, `branches` consists of at least two elements, and can have an odd or even length. + def branches: Seq[Expression] + + @transient lazy val whenList = + branches.sliding(2, 2).collect { case Seq(whenExpr, _) => whenExpr }.toSeq + @transient lazy val thenList = + branches.sliding(2, 2).collect { case Seq(_, thenExpr) => thenExpr }.toSeq + val elseValue = if (branches.length % 2 == 0) None else Option(branches.last) + + // both then and else val should be considered. + def valueTypes: Seq[DataType] = (thenList ++ elseValue).map(_.dataType) + def valueTypesEqual: Boolean = valueTypes.distinct.size <= 1 + + override def dataType: DataType = { + if (!resolved) { + throw new UnresolvedException(this, "cannot resolve due to differing types in some branches") + } + valueTypes.head + } + + override def nullable: Boolean = { + // If no value is nullable and no elseValue is provided, the whole statement defaults to null. + thenList.exists(_.nullable) || (elseValue.map(_.nullable).getOrElse(true)) + } +} + // scalastyle:off /** * Case statements of the form "CASE WHEN a THEN b [WHEN c THEN d]* [ELSE e] END". * Refer to this link for the corresponding semantics: * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions - * - * The other form of case statements "CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END" gets - * translated to this form at parsing time. Namely, such a statement gets translated to - * "CASE WHEN a=b THEN c [WHEN a=d THEN e]* [ELSE f] END". - * - * Note that `branches` are considered in consecutive pairs (cond, val), and the optional last - * element is the value for the default catch-all case (if provided). Hence, `branches` consists of - * at least two elements, and can have an odd or even length. */ // scalastyle:on -case class CaseWhen(branches: Seq[Expression]) extends Expression { - type EvaluatedType = Any +case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike { + + // Use private[this] Array to speed up evaluation. 
+ @transient private[this] lazy val branchesArr = branches.toArray override def children: Seq[Expression] = branches - override def dataType: DataType = { - if (!resolved) { - throw new UnresolvedException(this, "cannot resolve due to differing types in some branches") + override lazy val resolved: Boolean = + childrenResolved && + whenList.forall(_.dataType == BooleanType) && + valueTypesEqual + + /** Written in imperative fashion for performance considerations. */ + override def eval(input: Row): Any = { + val len = branchesArr.length + var i = 0 + // If all branches fail and an elseVal is not provided, the whole statement + // defaults to null, according to Hive's semantics. + while (i < len - 1) { + if (branchesArr(i).eval(input) == true) { + return branchesArr(i + 1).eval(input) + } + i += 2 + } + var res: Any = null + if (i == len - 1) { + res = branchesArr(i).eval(input) } - branches(1).dataType + return res } + override def toString: String = { + "CASE" + branches.sliding(2, 2).map { + case Seq(cond, value) => s" WHEN $cond THEN $value" + case Seq(elseValue) => s" ELSE $elseValue" + }.mkString + } +} + +// scalastyle:off +/** + * Case statements of the form "CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END". + * Refer to this link for the corresponding semantics: + * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions + */ +// scalastyle:on +case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseWhenLike { + + // Use private[this] Array to speed up evaluation. @transient private[this] lazy val branchesArr = branches.toArray - @transient private[this] lazy val predicates = - branches.sliding(2, 2).collect { case Seq(cond, _) => cond }.toSeq - @transient private[this] lazy val values = - branches.sliding(2, 2).collect { case Seq(_, value) => value }.toSeq - @transient private[this] lazy val elseValue = - if (branches.length % 2 == 0) None else Option(branches.last) - override def nullable: Boolean = { - // If no value is nullable and no elseValue is provided, the whole statement defaults to null. - values.exists(_.nullable) || (elseValue.map(_.nullable).getOrElse(true)) - } + override def children: Seq[Expression] = key +: branches - override lazy val resolved: Boolean = { - if (!childrenResolved) { - false - } else { - val allCondBooleans = predicates.forall(_.dataType == BooleanType) - // both then and else val should be considered. - val dataTypesEqual = (values ++ elseValue).map(_.dataType).distinct.size <= 1 - allCondBooleans && dataTypesEqual - } - } + override lazy val resolved: Boolean = + childrenResolved && valueTypesEqual /** Written in imperative fashion for performance considerations. */ override def eval(input: Row): Any = { + val evaluatedKey = key.eval(input) val len = branchesArr.length var i = 0 // If all branches fail and an elseVal is not provided, the whole statement // defaults to null, according to Hive's semantics. 
- var res: Any = null while (i < len - 1) { - if (branchesArr(i).eval(input) == true) { - res = branchesArr(i + 1).eval(input) - return res + if (equalNullSafe(evaluatedKey, branchesArr(i).eval(input))) { + return branchesArr(i + 1).eval(input) } i += 2 } + var res: Any = null if (i == len - 1) { res = branchesArr(i).eval(input) } - res + return res + } + + private def equalNullSafe(l: Any, r: Any) = { + if (l == null && r == null) { + true + } else if (l == null || r == null) { + false + } else { + l == r + } } override def toString: String = { - "CASE" + branches.sliding(2, 2).map { + s"CASE $key" + branches.sliding(2, 2).map { case Seq(cond, value) => s" WHEN $cond THEN $value" case Seq(elseValue) => s" ELSE $elseValue" }.mkString diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index faaa55aa5e6f1..88d36d153ceea 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -850,6 +850,32 @@ class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite { assert(CaseWhen(Seq(c2, c4_notNull, c3, c5)).nullable === true) } + test("case key when") { + val row = create_row(null, 1, 2, "a", "b", "c") + val c1 = 'a.int.at(0) + val c2 = 'a.int.at(1) + val c3 = 'a.int.at(2) + val c4 = 'a.string.at(3) + val c5 = 'a.string.at(4) + val c6 = 'a.string.at(5) + + val literalNull = Literal.create(null, BooleanType) + val literalInt = Literal(1) + val literalString = Literal("a") + + checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, c5)), "b", row) + checkEvaluation(CaseKeyWhen(c1, Seq(c2, c4, literalNull, c5, c6)), "b", row) + checkEvaluation(CaseKeyWhen(c2, Seq(literalInt, c4, c5)), "a", row) + checkEvaluation(CaseKeyWhen(c2, Seq(c1, c4, c5)), "b", row) + checkEvaluation(CaseKeyWhen(c4, Seq(literalString, c2, c3)), 1, row) + checkEvaluation(CaseKeyWhen(c4, Seq(c1, c3, c5, c2, Literal(3))), 3, row) + + checkEvaluation(CaseKeyWhen(literalInt, Seq(c2, c4, c5)), "a", row) + checkEvaluation(CaseKeyWhen(literalString, Seq(c5, c2, c4, c3)), 2, row) + checkEvaluation(CaseKeyWhen(literalInt, Seq(c5, c2, c4, c3)), null, row) + checkEvaluation(CaseKeyWhen(literalNull, Seq(c5, c2, c1, c3)), 2, row) + } + test("complex type") { val row = create_row( "^Ba*n", // 0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index 481ed4924857e..4a54120ba86f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -357,11 +357,12 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * TODO: This can be optimized to use broadcast join when replacementMap is large. 
*/ private def replaceCol(col: StructField, replacementMap: Map[_, _]): Column = { - val branches: Seq[Expression] = replacementMap.flatMap { case (source, target) => - df.col(col.name).equalTo(lit(source).cast(col.dataType)).expr :: - lit(target).cast(col.dataType).expr :: Nil + val keyExpr = df.col(col.name).expr + def buildExpr(v: Any) = Cast(Literal(v), keyExpr.dataType) + val branches = replacementMap.flatMap { case (source, target) => + Seq(buildExpr(source), buildExpr(target)) }.toSeq - new Column(CaseWhen(branches ++ Seq(df.col(col.name).expr))).as(col.name) + new Column(CaseKeyWhen(keyExpr, branches :+ keyExpr)).as(col.name) } private def convertToDouble(v: Any): Double = v match { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 4e51473979a2a..6176aee25e7a4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -1246,16 +1246,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token("TOK_FUNCTION", Token(WHEN(), Nil) :: branches) => CaseWhen(branches.map(nodeToExpr)) case Token("TOK_FUNCTION", Token(CASE(), Nil) :: branches) => - val transformed = branches.drop(1).sliding(2, 2).map { - case Seq(condVal, value) => - // FIXME (SPARK-2155): the key will get evaluated for multiple times in CaseWhen's eval(). - // Hence effectful / non-deterministic key expressions are *not* supported at the moment. - // We should consider adding new Expressions to get around this. - Seq(EqualTo(nodeToExpr(branches(0)), nodeToExpr(condVal)), - nodeToExpr(value)) - case Seq(elseVal) => Seq(nodeToExpr(elseVal)) - }.toSeq.reduce(_ ++ _) - CaseWhen(transformed) + val keyExpr = nodeToExpr(branches.head) + CaseKeyWhen(keyExpr, branches.drop(1).map(nodeToExpr)) /* Complex datatype manipulation */ case Token("[", child :: ordinal :: Nil) => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 616352d223c5b..c605f1017547a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -751,4 +751,11 @@ class SQLQuerySuite extends QueryTest { (6, "c", 0, 6) ).map(i => Row(i._1, i._2, i._3, i._4))) } + + test("test case key when") { + (1 to 5).map(i => (i, i.toString)).toDF("k", "v").registerTempTable("t") + checkAnswer( + sql("SELECT CASE k WHEN 2 THEN 22 WHEN 4 THEN 44 ELSE 0 END, v FROM t"), + Row(0, "1") :: Row(22, "2") :: Row(0, "3") :: Row(44, "4") :: Row(0, "5") :: Nil) + } } From 88063c62689135da56ee6678b2e826b88c382732 Mon Sep 17 00:00:00 2001 From: tedyu Date: Thu, 7 May 2015 16:53:59 -0700 Subject: [PATCH 36/48] [SPARK-7450] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() Author: tedyu Closes #5897 from tedyu/master and squashes the following commits: 473bf9d [tedyu] Address Josh's review comments 1719c5b [tedyu] Correct upper bound in for loop b51dcaf [tedyu] Add unit test in BitSetSuite for BitSet#anySet() 83f9f87 [tedyu] Merge branch 'master' of github.com:apache/spark 817e3f9 [tedyu] Replace constant 8 with SIZE_OF_LONG 75a467b [tedyu] Correct offset for UNSAFE.getLong() 855374b [tedyu] Remove second loop since bitSetWidthInBytes is WORD aligned 093b7a4 [tedyu] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() 
63ee050 [tedyu] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() 4ca0ef6 [tedyu] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() 3e9b6919 [tedyu] Use UNSAFE.getLong() to speed up BitSetMethods#anySet() --- .../main/java/org/apache/spark/unsafe/bitset/BitSet.java | 8 ++++++++ .../org/apache/spark/unsafe/bitset/BitSetMethods.java | 7 ++++--- .../java/org/apache/spark/unsafe/bitset/BitSetSuite.java | 7 +++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java index f72e07fce92fd..28e23da108ebe 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java @@ -102,4 +102,12 @@ public boolean isSet(int index) { public int nextSetBit(int fromIndex) { return BitSetMethods.nextSetBit(baseObject, baseOffset, fromIndex, numWords); } + + /** + * Returns {@code true} if any bit is set. + */ + public boolean anySet() { + return BitSetMethods.anySet(baseObject, baseOffset, numWords); + } + } diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java index f30626d8f4317..0987191c1c636 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSetMethods.java @@ -70,9 +70,10 @@ public static boolean isSet(Object baseObject, long baseOffset, int index) { /** * Returns {@code true} if any bit is set. */ - public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInBytes) { - for (int i = 0; i <= bitSetWidthInBytes; i++) { - if (PlatformDependent.UNSAFE.getByte(baseObject, baseOffset + i) != 0) { + public static boolean anySet(Object baseObject, long baseOffset, long bitSetWidthInWords) { + long addr = baseOffset; + for (int i = 0; i < bitSetWidthInWords; i++, addr += WORD_SIZE) { + if (PlatformDependent.UNSAFE.getLong(baseObject, addr) != 0) { return true; } } diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java index e3a824e29b768..18393db9f382f 100644 --- a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java +++ b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java @@ -39,6 +39,8 @@ public void basicOps() { for (int i = 0; i < bs.capacity(); i++) { Assert.assertFalse(bs.isSet(i)); } + // another form of asserting that the bit set is empty + Assert.assertFalse(bs.anySet()); // Set every bit and check it. for (int i = 0; i < bs.capacity(); i++) { @@ -52,6 +54,11 @@ public void basicOps() { bs.unset(i); Assert.assertFalse(bs.isSet(i)); } + + // Make sure anySet() can detect any set bit + bs = createBitSet(256); + bs.set(64); + Assert.assertTrue(bs.anySet()); } @Test From 22ab70e06ede65ca865073fe36c859042a920aa3 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Thu, 7 May 2015 17:34:44 -0700 Subject: [PATCH 37/48] [SPARK-7305] [STREAMING] [WEBUI] Make BatchPage show friendly information when jobs are dropped by SparkListener If jobs are dropped by SparkListener, at least we can show the job ids in BatchPage. 
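The row-generation change can be sketched standalone: each Spark job id is paired with the `JobUIData` that the listener may have dropped, and rendering falls back to placeholder cells when it is gone. A minimal, hedged sketch of the pattern (simplified names; not the actual BatchPage code):

```scala
// Simplified stand-in for SparkJobIdWithUIData: keep the id even when the
// listener has evicted the UI data for the job.
case class JobIdWithUIData(jobId: Int, uiData: Option[String])

def renderJobRow(job: JobIdWithUIData): String = job.uiData match {
  case Some(data) => s"Job ${job.jobId}: $data"
  case None       => s"Job ${job.jobId}: -" // data dropped; show the id only
}

// Example: the second job's UI data was dropped by the listener.
Seq(JobIdWithUIData(0, Some("succeeded")), JobIdWithUIData(1, None))
  .map(renderJobRow)
  .foreach(println)
```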
Screenshot:

![b1](https://cloud.githubusercontent.com/assets/1000778/7434968/f19aa784-eff3-11e4-8f86-36a073873574.png)

Author: zsxwing

Closes #5840 from zsxwing/SPARK-7305 and squashes the following commits:

aca0ba6 [zsxwing] Fix the code style
718765e [zsxwing] Make generateNormalJobRow private
8073b03 [zsxwing] Merge branch 'master' into SPARK-7305
83dec11 [zsxwing] Make BatchPage show friendly information when jobs are dropped by SparkListener
---
 .../apache/spark/streaming/ui/BatchPage.scala | 136 ++++++++++++++----
 1 file changed, 106 insertions(+), 30 deletions(-)

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala
index 3f1cab69068dc..831f60e870f74 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala
@@ -19,7 +19,7 @@ package org.apache.spark.streaming.ui

 import javax.servlet.http.HttpServletRequest

-import scala.xml.{NodeSeq, Node}
+import scala.xml.{NodeSeq, Node, Text}

 import org.apache.commons.lang3.StringEscapeUtils

@@ -28,6 +28,7 @@ import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage}
 import org.apache.spark.streaming.ui.StreamingJobProgressListener.{SparkJobId, OutputOpId}
 import org.apache.spark.ui.jobs.UIData.JobUIData

+private case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData])

 private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") {
   private val streamingListener = parent.listener
@@ -44,25 +45,33 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") {
       Error
   }

+  private def generateJobRow(
+      outputOpId: OutputOpId,
+      outputOpDescription: Seq[Node],
+      formattedOutputOpDuration: String,
+      numSparkJobRowsInOutputOp: Int,
+      isFirstRow: Boolean,
+      sparkJob: SparkJobIdWithUIData): Seq[Node] = {
+    if (sparkJob.jobUIData.isDefined) {
+      generateNormalJobRow(outputOpId, outputOpDescription, formattedOutputOpDuration,
+        numSparkJobRowsInOutputOp, isFirstRow, sparkJob.jobUIData.get)
+    } else {
+      generateDroppedJobRow(outputOpId, outputOpDescription, formattedOutputOpDuration,
+        numSparkJobRowsInOutputOp, isFirstRow, sparkJob.sparkJobId)
+    }
+  }
+
   /**
    * Generate a row for a Spark Job. Because duplicated output op infos need to be collapsed into
    * one cell, we use "rowspan" for the first row of an output op.
*/ - def generateJobRow( + private def generateNormalJobRow( outputOpId: OutputOpId, + outputOpDescription: Seq[Node], formattedOutputOpDuration: String, numSparkJobRowsInOutputOp: Int, isFirstRow: Boolean, sparkJob: JobUIData): Seq[Node] = { - val lastStageInfo = Option(sparkJob.stageIds) - .filter(_.nonEmpty) - .flatMap { ids => sparkListener.stageIdToInfo.get(ids.max) } - val lastStageData = lastStageInfo.flatMap { s => - sparkListener.stageIdToData.get((s.stageId, s.attemptId)) - } - - val lastStageName = lastStageInfo.map(_.name).getOrElse("(Unknown Stage Name)") - val lastStageDescription = lastStageData.flatMap(_.description).getOrElse("") val duration: Option[Long] = { sparkJob.submissionTime.map { start => val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis()) @@ -83,9 +92,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { if (isFirstRow) { {outputOpId.toString} - - {lastStageDescription} - {lastStageName} + {outputOpDescription} {formattedOutputOpDuration} } else { @@ -122,27 +129,96 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } - private def generateOutputOpIdRow( - outputOpId: OutputOpId, sparkJobs: Seq[JobUIData]): Seq[Node] = { - val sparkjobDurations = sparkJobs.map(sparkJob => { - sparkJob.submissionTime.map { start => - val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis()) - end - start + /** + * If a job is dropped by sparkListener due to exceeding the limitation, we only show the job id + * with "-" cells. + */ + private def generateDroppedJobRow( + outputOpId: OutputOpId, + outputOpDescription: Seq[Node], + formattedOutputOpDuration: String, + numSparkJobRowsInOutputOp: Int, + isFirstRow: Boolean, + jobId: Int): Seq[Node] = { + // In the first row, output op id and its information needs to be shown. In other rows, these + // cells will be taken up due to "rowspan". + // scalastyle:off + val prefixCells = + if (isFirstRow) { + {outputOpId.toString} + {outputOpDescription} + {formattedOutputOpDuration} + } else { + Nil } - }) + // scalastyle:on + + + {prefixCells} + + {jobId.toString} + + + - + + - + + - + + - + + } + + private def generateOutputOpIdRow( + outputOpId: OutputOpId, sparkJobs: Seq[SparkJobIdWithUIData]): Seq[Node] = { + // We don't count the durations of dropped jobs + val sparkJobDurations = sparkJobs.filter(_.jobUIData.nonEmpty).map(_.jobUIData.get). 
+      map(sparkJob => {
+        sparkJob.submissionTime.map { start =>
+          val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis())
+          end - start
+        }
+      })
     val formattedOutputOpDuration =
-      if (sparkjobDurations.exists(_ == None)) {
-        // If any job does not finish, set "formattedOutputOpDuration" to "-"
+      if (sparkJobDurations.isEmpty || sparkJobDurations.exists(_ == None)) {
+        // If no job or any job does not finish, set "formattedOutputOpDuration" to "-"
         "-"
       } else {
-        SparkUIUtils.formatDuration(sparkjobDurations.flatMap(x => x).sum)
+        SparkUIUtils.formatDuration(sparkJobDurations.flatMap(x => x).sum)
       }
-    generateJobRow(outputOpId, formattedOutputOpDuration, sparkJobs.size, true, sparkJobs.head) ++
+
+    val description = generateOutputOpDescription(sparkJobs)
+
+    generateJobRow(
+      outputOpId, description, formattedOutputOpDuration, sparkJobs.size, true, sparkJobs.head) ++
       sparkJobs.tail.map { sparkJob =>
-        generateJobRow(outputOpId, formattedOutputOpDuration, sparkJobs.size, false, sparkJob)
+        generateJobRow(
+          outputOpId, description, formattedOutputOpDuration, sparkJobs.size, false, sparkJob)
       }.flatMap(x => x)
   }

+  private def generateOutputOpDescription(sparkJobs: Seq[SparkJobIdWithUIData]): Seq[Node] = {
+    val lastStageInfo =
+      sparkJobs.flatMap(_.jobUIData).headOption. // Get the first JobUIData
+        flatMap { sparkJob => // For the first job, get the latest Stage info
+          if (sparkJob.stageIds.isEmpty) {
+            None
+          } else {
+            sparkListener.stageIdToInfo.get(sparkJob.stageIds.max)
+          }
+        }
+    val lastStageData = lastStageInfo.flatMap { s =>
+      sparkListener.stageIdToData.get((s.stageId, s.attemptId))
+    }
+
+    val lastStageName = lastStageInfo.map(_.name).getOrElse("(Unknown Stage Name)")
+    val lastStageDescription = lastStageData.flatMap(_.description).getOrElse("")
+
+
+ {lastStageDescription}
+ ++ Text(lastStageName)
+  }
+
   private def failureReasonCell(failureReason: String): Seq[Node] = {
     val isMultiline = failureReason.indexOf('\n') >= 0
     // Display the first line by default
@@ -187,10 +263,10 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") {
         (outputOpId, outputOpIdAndSparkJobIds.map(_.sparkJobId).sorted)
       }
     sparkListener.synchronized {
-      val outputOpIdWithJobs: Seq[(OutputOpId, Seq[JobUIData])] =
+      val outputOpIdWithJobs: Seq[(OutputOpId, Seq[SparkJobIdWithUIData])] =
         outputOpIdToSparkJobIds.map { case (outputOpId, sparkJobIds) =>
-          // Filter out spark Job ids that don't exist in sparkListener
-          (outputOpId, sparkJobIds.flatMap(getJobData))
+          (outputOpId,
+            sparkJobIds.map(sparkJobId => SparkJobIdWithUIData(sparkJobId, getJobData(sparkJobId))))
         }

     {
       outputOpIdWithJobs.map {
-        case (outputOpId, jobs) => generateOutputOpIdRow(outputOpId, jobs)
+        case (outputOpId, sparkJobIds) => generateOutputOpIdRow(outputOpId, sparkJobIds)
       }
     }

From cd1d4110cfffb413ab585cf1cc8f1264243cb393 Mon Sep 17 00:00:00 2001
From: Michael Armbrust
Date: Thu, 7 May 2015 19:36:24 -0700
Subject: [PATCH 38/48] [SPARK-6908] [SQL] Use isolated Hive client

This PR switches Spark SQL's Hive support to use the isolated hive client interface introduced by #5851, instead of directly interacting with the client. By using this isolated client we can now allow users to dynamically configure the version of Hive that they are connecting to by setting `spark.sql.hive.metastore.version` without the need to recompile.
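As a hedged usage sketch of what this enables (the configuration keys are the ones named in this message; the version values and initialization order are illustrative assumptions, not taken from this patch):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

val sc = new SparkContext(new SparkConf().setAppName("metastore-version-demo"))
val hiveContext = new HiveContext(sc)

// Assumed example values: talk to an older metastore while executing with the
// built-in Hive 0.13.1, fetching matching client jars from maven on demand.
hiveContext.setConf("spark.sql.hive.metastore.version", "0.12.0")
hiveContext.setConf("spark.sql.hive.metastore.jars", "maven")

// The isolated metastore client is created lazily, on first metadata access.
hiveContext.sql("SHOW TABLES").collect().foreach(println)
```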
This also greatly reduces the surface area for our interaction with the hive libraries, hopefully making it easier to support other versions in the future. Jars for the desired hive version can be configured using `spark.sql.hive.metastore.jars`, which accepts the following options: - a colon-separated list of jar files or directories for hive and hadoop. - `builtin` - attempt to discover the jars that were used to load Spark SQL and use those. This option is only valid when using the execution version of Hive. - `maven` - download the correct version of hive on demand from maven. By default, `builtin` is used for Hive 13. This PR also removes the test step for building against Hive 12, as this will no longer be required to talk to Hive 12 metastores. However, the full removal of the Shim is deferred until a later PR. Remaining TODOs: - Remove the Hive Shims and inline code for Hive 13. - Several HiveCompatibility tests are not yet passing. - `nullformatCTAS` - As detailed below, we now are handling CTAS parsing ourselves instead of hacking into the Hive semantic analyzer. However, we currently only handle the common cases and not things like CTAS where the null format is specified. - `combine1` now leaks state about compression somehow, breaking all subsequent tests. As such we currently add it to the blacklist - `part_inherit_tbl_props` and `part_inherit_tbl_props_with_star` do not work anymore. We are correctly propagating the information - "load_dyn_part14.*" - These tests pass when run on their own, but fail when run with all other tests. It seems our `RESET` mechanism may not be as robust as it used to be? Other required changes: - `CreateTableAsSelect` no longer carries parts of the HiveQL AST with it through the query execution pipeline. Instead, we parse CTAS during the HiveQL conversion and construct a `HiveTable`. The full parsing here is not yet complete as detailed above in the remaining TODOs. Since the operator is Hive specific, it is moved to the hive package. - `Command` is simplified to be a trait that simply acts as a marker for a LogicalPlan that should be eagerly evaluated. Author: Michael Armbrust Closes #5876 from marmbrus/useIsolatedClient and squashes the following commits: 258d000 [Michael Armbrust] really really correct path handling e56fd4a [Michael Armbrust] getAbsolutePath 5a259f5 [Michael Armbrust] fix typos 81bb366 [Michael Armbrust] comments from vanzin 5f3945e [Michael Armbrust] Merge remote-tracking branch 'origin/master' into useIsolatedClient 4b5cd41 [Michael Armbrust] yin's comments f5de7de [Michael Armbrust] cleanup 11e9c72 [Michael Armbrust] better coverage in versions suite 7e8f010 [Michael Armbrust] better error messages and jar handling e7b3941 [Michael Armbrust] more permisive checking for function registration da91ba7 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into useIsolatedClient 5fe5894 [Michael Armbrust] fix serialization suite 81711c4 [Michael Armbrust] Initial support for running without maven 1d8ae44 [Michael Armbrust] fix final tests? 
1c50813 [Michael Armbrust] more comments a3bee70 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into useIsolatedClient a6f5df1 [Michael Armbrust] style ab07f7e [Michael Armbrust] WIP 4d8bf02 [Michael Armbrust] Remove hive 12 compilation 8843a25 [Michael Armbrust] [SPARK-6908] [SQL] Use isolated Hive client --- dev/run-tests | 23 - project/MimaExcludes.scala | 2 + project/SparkBuild.scala | 9 +- .../plans/logical/basicOperators.scala | 16 +- .../sql/catalyst/plans/logical/commands.scala | 8 +- .../spark/sql/catalyst/SqlParserSuite.scala | 6 +- .../org/apache/spark/sql/DataFrame.scala | 1 - .../org/apache/spark/sql/SQLContext.scala | 11 +- .../apache/spark/sql/execution/commands.scala | 4 +- .../org/apache/spark/sql/sources/ddl.scala | 16 +- .../hive/thriftserver/SparkSQLCLIDriver.scala | 26 +- .../sql/hive/thriftserver/SparkSQLEnv.scala | 9 +- .../execution/HiveCompatibilitySuite.scala | 12 +- .../apache/spark/sql/hive/HiveContext.scala | 283 ++++++------ .../spark/sql/hive/HiveMetastoreCatalog.scala | 415 ++++++------------ .../org/apache/spark/sql/hive/HiveQl.scala | 126 +++++- .../apache/spark/sql/hive/TableReader.scala | 11 +- .../sql/hive/client/ClientInterface.scala | 41 +- .../spark/sql/hive/client/ClientWrapper.scala | 99 +++-- .../hive/client/IsolatedClientLoader.scala | 23 +- .../sql/hive/client/ReflectionMagic.scala | 8 + .../hive/execution/CreateTableAsSelect.scala | 33 +- .../hive/execution/InsertIntoHiveTable.scala | 33 +- .../spark/sql/hive/execution/commands.scala | 13 + .../apache/spark/sql/hive/test/TestHive.scala | 72 ++- sql/hive/src/test/resources/log4j.properties | 2 +- .../spark/sql/hive/ErrorPositionSuite.scala | 22 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 26 +- .../spark/sql/hive/SerializationSuite.scala | 6 +- .../spark/sql/hive/client/VersionsSuite.scala | 78 +++- .../hive/execution/HiveComparisonTest.scala | 2 + .../sql/hive/execution/HiveQuerySuite.scala | 2 +- .../sql/hive/execution/PruningSuite.scala | 15 +- 33 files changed, 782 insertions(+), 671 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 05c63bce4d40d..ef587a1a5988c 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -142,29 +142,6 @@ CURRENT_BLOCK=$BLOCK_BUILD { HIVE_BUILD_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - HIVE_12_BUILD_ARGS="$HIVE_BUILD_ARGS -Phive-0.12.0" - - # First build with Hive 0.12.0 to ensure patches do not break the Hive 0.12.0 build - echo "[info] Compile with Hive 0.12.0" - [ -d "lib_managed" ] && rm -rf lib_managed - echo "[info] Building Spark with these arguments: $HIVE_12_BUILD_ARGS" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn $HIVE_12_BUILD_ARGS clean package -DskipTests - else - # NOTE: echo "q" is needed because sbt on encountering a build file with failure - # (either resolution or compilation) prompts the user for input either q, r, etc - # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $BUILD_MVN_PROFILE_ARGS or else it will be interpreted as a - # single argument! - # QUESTION: Why doesn't 'yes "q"' work? - # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? 
- echo -e "q\n" \ - | build/sbt $HIVE_12_BUILD_ARGS clean hive/compile hive-thriftserver/compile \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi - - # Then build with default Hive version (0.13.1) because tests are based on this version echo "[info] Compile with Hive 0.13.1" [ -d "lib_managed" ] && rm -rf lib_managed echo "[info] Building Spark with these arguments: $HIVE_BUILD_ARGS" diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index bf343d4b7e40b..cfe387faec14b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -89,6 +89,8 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Vector.numActives") ) ++ Seq( + // Execution should never be included as its always internal. + MimaBuild.excludeSparkPackage("sql.execution"), // This `protected[sql]` method was removed in 1.3.1 ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.sql.SQLContext.checkAnalysis"), diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index b4431c7ee05b6..026855f8f6a5a 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -193,6 +193,7 @@ object SparkBuild extends PomBuild { * Usage: `build/sbt sparkShell` */ val sparkShell = taskKey[Unit]("start a spark-shell.") + val sparkSql = taskKey[Unit]("starts the spark sql CLI.") enable(Seq( connectInput in run := true, @@ -203,6 +204,12 @@ object SparkBuild extends PomBuild { sparkShell := { (runMain in Compile).toTask(" org.apache.spark.repl.Main -usejavacp").value + }, + + javaOptions in Compile += "-Dspark.master=local", + + sparkSql := { + (runMain in Compile).toTask(" org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver").value } ))(assembly) @@ -497,7 +504,7 @@ object TestSettings { // Setting SPARK_DIST_CLASSPATH is a simple way to make sure any child processes // launched by the tests have access to the correct test-time classpath. envVars in Test ++= Map( - "SPARK_DIST_CLASSPATH" -> + "SPARK_DIST_CLASSPATH" -> (fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"), "JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))), javaOptions in Test += "-Dspark.test.home=" + sparkHome, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index ba0abb2df596c..0f349f9d11415 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -149,16 +149,6 @@ case class InsertIntoTable( } } -case class CreateTableAsSelect[T]( - databaseName: Option[String], - tableName: String, - child: LogicalPlan, - allowExisting: Boolean, - desc: Option[T] = None) extends UnaryNode { - override def output: Seq[Attribute] = Seq.empty[Attribute] - override lazy val resolved: Boolean = databaseName != None && childrenResolved -} - /** * A container for holding named common table expressions (CTEs) and a query plan. * This operator will be removed during analysis and the relations will be substituted into child. 
@@ -184,10 +174,10 @@ case class WriteToFile( } /** - * @param order The ordering expressions - * @param global True means global sorting apply for entire data set, + * @param order The ordering expressions + * @param global True means global sorting apply for entire data set, * False means sorting only apply within the partition. - * @param child Child logical plan + * @param child Child logical plan */ case class Sort( order: Seq[SortOrder], diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala index 45905f8ef98c5..246f4d7e34d3d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala @@ -21,9 +21,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute /** * A logical node that represents a non-query command to be executed by the system. For example, - * commands can be used by parsers to represent DDL operations. + * commands can be used by parsers to represent DDL operations. Commands, unlike queries, are + * eagerly executed. */ -abstract class Command extends LeafNode { - self: Product => - def output: Seq[Attribute] = Seq.empty -} +trait Command diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala index a652c70560990..890ea2a84b82e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SqlParserSuite.scala @@ -17,11 +17,15 @@ package org.apache.spark.sql.catalyst +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.logical.Command import org.scalatest.FunSuite -private[sql] case class TestCommand(cmd: String) extends Command +private[sql] case class TestCommand(cmd: String) extends LogicalPlan with Command { + override def output: Seq[Attribute] = Seq.empty + override def children: Seq[LogicalPlan] = Seq.empty +} private[sql] class SuperLongKeywordTestParser extends AbstractSparkSQLParser { protected val EXECUTE = Keyword("THISISASUPERLONGKEYWORDTEST") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 79fbf50300d4d..7947042c14299 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -143,7 +143,6 @@ class DataFrame private[sql]( // happen right away to let these side effects take place eagerly. 
case _: Command | _: InsertIntoTable | - _: CreateTableAsSelect[_] | _: CreateTableUsingAsSelect | _: WriteToFile => LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 0563430a6fdc7..0ac0936f0f592 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -70,7 +70,7 @@ import org.apache.spark.{Partition, SparkContext} * spark-sql> SELECT * FROM src LIMIT 1; * *-- Exception will be thrown and switch to dialect - *-- "sql" (for SQLContext) or + *-- "sql" (for SQLContext) or *-- "hiveql" (for HiveContext) * }}} */ @@ -107,7 +107,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * @return Spark SQL configuration */ - protected[sql] def conf = tlSession.get().conf + protected[sql] def conf = currentSession().conf /** * Set Spark SQL configuration properties. @@ -1197,13 +1197,17 @@ class SQLContext(@transient val sparkContext: SparkContext) |${stringOrError(executedPlan)} """.stripMargin.trim - override def toString: String = + override def toString: String = { + def output = + analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ") + // TODO previously will output RDD details by run (${stringOrError(toRdd.toDebugString)}) // however, the `toRdd` will cause the real execution, which is not what we want. // We need to think about how to avoid the side effect. s"""== Parsed Logical Plan == |${stringOrError(logical)} |== Analyzed Logical Plan == + |${stringOrError(output)} |${stringOrError(analyzed)} |== Optimized Logical Plan == |${stringOrError(optimizedPlan)} @@ -1212,6 +1216,7 @@ class SQLContext(@transient val sparkContext: SparkContext) |Code Generation: ${stringOrError(executedPlan.codegenEnabled)} |== RDD == """.stripMargin.trim + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 65687db4e6235..388a8184e4cfe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -32,9 +32,11 @@ import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext} * A logical command that is executed for its side-effects. `RunnableCommand`s are * wrapped in `ExecutedCommand` during execution. 
*/ -trait RunnableCommand extends logical.Command { +private[sql] trait RunnableCommand extends LogicalPlan with logical.Command { self: Product => + override def output: Seq[Attribute] = Seq.empty + override def children: Seq[LogicalPlan] = Seq.empty def run(sqlContext: SQLContext): Seq[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 1abf3aa51cb25..06c64f2bdd59e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -269,8 +269,10 @@ private[sql] case class ResolvedDataSource(provider: Class[_], relation: BaseRel */ private[sql] case class DescribeCommand( table: LogicalPlan, - isExtended: Boolean) extends Command { - override val output = Seq( + isExtended: Boolean) extends LogicalPlan with Command { + + override def children: Seq[LogicalPlan] = Seq.empty + override val output: Seq[Attribute] = Seq( // Column names are based on Hive. AttributeReference("col_name", StringType, nullable = false, new MetadataBuilder().putString("comment", "name of the column").build())(), @@ -292,7 +294,11 @@ private[sql] case class CreateTableUsing( temporary: Boolean, options: Map[String, String], allowExisting: Boolean, - managedIfNoPath: Boolean) extends Command + managedIfNoPath: Boolean) extends LogicalPlan with Command { + + override def output: Seq[Attribute] = Seq.empty + override def children: Seq[LogicalPlan] = Seq.empty +} /** * A node used to support CTAS statements and saveAsTable for the data source API. @@ -318,7 +324,7 @@ private[sql] case class CreateTempTableUsing( provider: String, options: Map[String, String]) extends RunnableCommand { - def run(sqlContext: SQLContext): Seq[Row] = { + override def run(sqlContext: SQLContext): Seq[Row] = { val resolved = ResolvedDataSource(sqlContext, userSpecifiedSchema, provider, options) sqlContext.registerDataFrameAsTable( DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) @@ -333,7 +339,7 @@ private[sql] case class CreateTempTableUsingAsSelect( options: Map[String, String], query: LogicalPlan) extends RunnableCommand { - def run(sqlContext: SQLContext): Seq[Row] = { + override def run(sqlContext: SQLContext): Seq[Row] = { val df = DataFrame(sqlContext, query) val resolved = ResolvedDataSource(sqlContext, provider, mode, options, df) sqlContext.registerDataFrameAsTable( diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index b7b6925aa87f7..deb1008c468bf 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -37,7 +37,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.thrift.transport.TSocket import org.apache.spark.Logging -import org.apache.spark.sql.hive.HiveShim +import org.apache.spark.sql.hive.{HiveContext, HiveShim} import org.apache.spark.util.Utils private[hive] object SparkSQLCLIDriver { @@ -74,7 +74,12 @@ private[hive] object SparkSQLCLIDriver { System.exit(1) } - val sessionState = new CliSessionState(new HiveConf(classOf[SessionState])) + val cliConf = new HiveConf(classOf[SessionState]) + // Override the location of the metastore since this is only used for local 
execution. + HiveContext.newTemporaryConfiguration().foreach { + case (key, value) => cliConf.set(key, value) + } + val sessionState = new CliSessionState(cliConf) sessionState.in = System.in try { @@ -91,10 +96,14 @@ private[hive] object SparkSQLCLIDriver { // Set all properties specified via command line. val conf: HiveConf = sessionState.getConf - sessionState.cmdProperties.entrySet().foreach { item: java.util.Map.Entry[Object, Object] => - conf.set(item.getKey.asInstanceOf[String], item.getValue.asInstanceOf[String]) - sessionState.getOverriddenConfigurations.put( - item.getKey.asInstanceOf[String], item.getValue.asInstanceOf[String]) + sessionState.cmdProperties.entrySet().foreach { item => + val key = item.getKey.asInstanceOf[String] + val value = item.getValue.asInstanceOf[String] + // We do not propagate metastore options to the execution copy of hive. + if (key != "javax.jdo.option.ConnectionURL") { + conf.set(key, value) + sessionState.getOverriddenConfigurations.put(key, value) + } } SessionState.start(sessionState) @@ -138,8 +147,9 @@ private[hive] object SparkSQLCLIDriver { case e: UnsupportedEncodingException => System.exit(3) } - // use the specified database if specified - cli.processSelectDatabase(sessionState); + if (sessionState.database != null) { + SparkSQLEnv.hiveContext.runSqlHive(s"USE ${sessionState.database}") + } // Execute -i init files (always in silent mode) cli.processInitFiles(sessionState) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 97b46a01ba5b4..7c0c505e2d61e 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive.thriftserver +import java.io.PrintStream + import scala.collection.JavaConversions._ import org.apache.spark.scheduler.StatsReportListener @@ -39,7 +41,6 @@ private[hive] object SparkSQLEnv extends Logging { sparkConf .setAppName(s"SparkSQL::${Utils.localHostName()}") - .set("spark.sql.hive.version", HiveShim.version) .set( "spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) @@ -51,6 +52,12 @@ private[hive] object SparkSQLEnv extends Logging { sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new HiveContext(sparkContext) + hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) + hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) + hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) + + hiveContext.setConf("spark.sql.hive.version", HiveShim.version) + if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 5e411c2fdba9d..b6245a57074c8 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -240,7 +240,17 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { 
// It has a bug and it has been fixed by // https://issues.apache.org/jira/browse/HIVE-7673 (in Hive 0.14 and trunk). - "input46" + "input46", + + // These tests were broken by the hive client isolation PR. + "part_inherit_tbl_props", + "part_inherit_tbl_props_with_star", + + "nullformatCTAS", // SPARK-7411: need to finish CTAS parser + + // The isolated classloader seemed to make some of our test reset mechanisms less robust. + "combine1", // This test changes compression settings in a way that breaks all subsequent tests. + "load_dyn_part14.*" // These work alone but fail when run with other tests... ) ++ HiveShim.compatibilityBlackList /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index f25723e53f07c..538c6c7f0a200 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -17,8 +17,9 @@ package org.apache.spark.sql.hive -import java.io.{BufferedReader, InputStreamReader, PrintStream} +import java.io.{BufferedReader, File, InputStreamReader, PrintStream} import java.sql.Timestamp +import java.util.{ArrayList => JArrayList} import org.apache.hadoop.hive.ql.parse.VariableSubstitution import org.apache.spark.sql.catalyst.Dialect @@ -35,15 +36,19 @@ import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable} -import org.apache.spark.SparkContext +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.Experimental +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubQueries, OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, QueryExecutionException, SetCommand} +import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand} import org.apache.spark.sql.sources.{DDLParser, DataSourceStrategy} import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + /** * This is the HiveQL Dialect, this dialect is strongly bind with HiveContext @@ -61,6 +66,8 @@ private[hive] class HiveQLDialect extends Dialect { class HiveContext(sc: SparkContext) extends SQLContext(sc) { self => + import HiveContext._ + /** * When true, enables an experimental feature where metastore tables that use the parquet SerDe * are automatically converted to use the Spark SQL parquet table scan, instead of the Hive @@ -93,9 +100,118 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { protected[sql] def convertCTAS: Boolean = getConf("spark.sql.hive.convertCTAS", "false").toBoolean + /** + * The version of the hive client that will be used to communicate with the metastore. Note that + * this does not necessarily need to be the same version of Hive that is used internally by + * Spark SQL for execution. + */ + protected[hive] def hiveMetastoreVersion: String = + getConf(HIVE_METASTORE_VERSION, hiveExecutionVersion) + + /** + * The location of the jars that should be used to instantiate the HiveMetastoreClient. This + * property can be one of three options: + * - a classpath in the standard format for both hive and hadoop. 
+ * - builtin - attempt to discover the jars that were used to load Spark SQL and use those. This
+ * option is only valid when using the execution version of Hive.
+ * - maven - download the correct version of hive on demand from maven.
+ */
+  protected[hive] def hiveMetastoreJars: String =
+    getConf(HIVE_METASTORE_JARS, "builtin")
+
   @transient
   protected[sql] lazy val substitutor = new VariableSubstitution()

+  /**
+   * The copy of the hive client that is used for execution. Currently this must always be
+   * Hive 13 as this is the version of Hive that is packaged with Spark SQL. This copy of the
+   * client is used for execution related tasks like registering temporary functions or ensuring
+   * that the ThreadLocal SessionState is correctly populated. This copy of Hive is *not* used
+   * for storing persistent metadata, and only points to a dummy metastore in a temporary directory.
+   */
+  @transient
+  protected[hive] lazy val executionHive: ClientWrapper = {
+    logInfo(s"Initializing execution hive, version $hiveExecutionVersion")
+    new ClientWrapper(
+      version = IsolatedClientLoader.hiveVersion(hiveExecutionVersion),
+      config = newTemporaryConfiguration())
+  }
+  SessionState.setCurrentSessionState(executionHive.state)
+
+  /**
+   * The copy of the Hive client that is used to retrieve metadata from the Hive MetaStore.
+   * The version of the Hive client that is used here must match the metastore that is configured
+   * in the hive-site.xml file.
+   */
+  @transient
+  protected[hive] lazy val metadataHive: ClientInterface = {
+    val metaVersion = IsolatedClientLoader.hiveVersion(hiveMetastoreVersion)
+
+    // We instantiate a HiveConf here to read in the hive-site.xml file and then pass the options
+    // into the isolated client loader
+    val metadataConf = new HiveConf()
+    // `configure` goes second to override other settings.
+    val allConfig = metadataConf.iterator.map(e => e.getKey -> e.getValue).toMap ++ configure
+
+    val isolatedLoader = if (hiveMetastoreJars == "builtin") {
+      if (hiveExecutionVersion != hiveMetastoreVersion) {
+        throw new IllegalArgumentException(
+          "Builtin jars can only be used when hive execution version == hive metastore version. " +
+          s"Execution: ${hiveExecutionVersion} != Metastore: ${hiveMetastoreVersion}. " +
+          "Specify a valid path to the correct hive jars using $HIVE_METASTORE_JARS " +
+          s"or change $HIVE_METASTORE_VERSION to $hiveExecutionVersion.")
+      }
+      val jars = getClass.getClassLoader match {
+        case urlClassLoader: java.net.URLClassLoader => urlClassLoader.getURLs
+        case other =>
+          throw new IllegalArgumentException(
+            "Unable to locate hive jars to connect to metastore " +
+            s"using classloader ${other.getClass.getName}. " +
+            "Please set spark.sql.hive.metastore.jars")
+      }
+
+      logInfo(
+        s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using Spark classes.")
+      new IsolatedClientLoader(
+        version = metaVersion,
+        execJars = jars.toSeq,
+        config = allConfig,
+        isolationOn = true)
+    } else if (hiveMetastoreJars == "maven") {
+      // TODO: Support for loading the jars from an already downloaded location.
+      logInfo(
+        s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using maven.")
+      IsolatedClientLoader.forVersion(hiveMetastoreVersion, allConfig)
+    } else {
+      // Convert to files and expand any directories.
+ val jars = + hiveMetastoreJars + .split(File.pathSeparator) + .flatMap { + case path if new File(path).getName() == "*" => + val files = new File(path).getParentFile().listFiles() + if (files == null) { + logWarning(s"Hive jar path '$path' does not exist.") + Nil + } else { + files.filter(_.getName().toLowerCase().endsWith(".jar")) + } + case path => + new File(path) :: Nil + } + .map(_.toURI.toURL) + + logInfo( + s"Initializing HiveMetastoreConnection version $hiveMetastoreVersion using $jars") + new IsolatedClientLoader( + version = metaVersion, + execJars = jars.toSeq, + config = allConfig, + isolationOn = true) + } + isolatedLoader.client + } + protected[sql] override def parseSql(sql: String): LogicalPlan = { super.parseSql(substitutor.substitute(hiveconf, sql)) } @@ -178,15 +294,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { // recorded in the Hive metastore. // This logic is based on org.apache.hadoop.hive.ql.exec.StatsTask.aggregateStats(). if (newTotalSize > 0 && newTotalSize != oldTotalSize) { - tableParameters.put(HiveShim.getStatsSetupConstTotalSize, newTotalSize.toString) - val hiveTTable = relation.hiveQlTable.getTTable - hiveTTable.setParameters(tableParameters) - val tableFullName = - relation.hiveQlTable.getDbName + "." + relation.hiveQlTable.getTableName - - catalog.synchronized { - catalog.client.alterTable(tableFullName, new Table(hiveTTable)) - } + catalog.client.alterTable( + relation.table.copy( + properties = relation.table.properties + + (HiveShim.getStatsSetupConstTotalSize -> newTotalSize.toString))) } case otherRelation => throw new UnsupportedOperationException( @@ -194,47 +305,19 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } } - // Circular buffer to hold what hive prints to STDOUT and ERR. Only printed when failures occur. - @transient - protected lazy val outputBuffer = new java.io.OutputStream { - var pos: Int = 0 - var buffer = new Array[Int](10240) - def write(i: Int): Unit = { - buffer(pos) = i - pos = (pos + 1) % buffer.size - } - - override def toString: String = { - val (end, start) = buffer.splitAt(pos) - val input = new java.io.InputStream { - val iterator = (start ++ end).iterator - - def read(): Int = if (iterator.hasNext) iterator.next() else -1 - } - val reader = new BufferedReader(new InputStreamReader(input)) - val stringBuilder = new StringBuilder - var line = reader.readLine() - while(line != null) { - stringBuilder.append(line) - stringBuilder.append("\n") - line = reader.readLine() - } - stringBuilder.toString() - } - } - - protected[hive] def sessionState = tlSession.get().asInstanceOf[this.SQLSession].sessionState - protected[hive] def hiveconf = tlSession.get().asInstanceOf[this.SQLSession].hiveconf override def setConf(key: String, value: String): Unit = { super.setConf(key, value) - runSqlHive(s"SET $key=$value") + hiveconf.set(key, value) + executionHive.runSqlHive(s"SET $key=$value") + metadataHive.runSqlHive(s"SET $key=$value") } /* A catalyst metadata catalog that points to the Hive Metastore. */ @transient - override protected[sql] lazy val catalog = new HiveMetastoreCatalog(this) with OverrideCatalog + override protected[sql] lazy val catalog = + new HiveMetastoreCatalog(metadataHive, this) with OverrideCatalog // Note that HiveUDFs will be overridden by functions registered in this context. 
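Read in isolation, the directory expansion at the top of this hunk amounts to the following self-contained helper (a sketch; expandJarPaths is a hypothetical name, not in this patch):

    import java.io.File
    import java.net.URL

    // Splits on File.pathSeparator; a trailing "*" component means
    // "every .jar file in the enclosing directory".
    def expandJarPaths(metastoreJars: String): Seq[URL] =
      metastoreJars
        .split(File.pathSeparator)
        .toSeq
        .flatMap {
          case path if new File(path).getName == "*" =>
            val files = new File(path).getParentFile.listFiles()
            if (files == null) Nil
            else files.filter(_.getName.toLowerCase.endsWith(".jar")).toSeq
          case path => Seq(new File(path))
        }
        .map(_.toURI.toURL)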
@transient @@ -261,16 +344,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { new this.SQLSession() } + /** Overridden by child classes that need to set configuration before the client init. */ + protected def configure(): Map[String, String] = Map.empty + protected[hive] class SQLSession extends super.SQLSession { protected[sql] override lazy val conf: SQLConf = new SQLConf { override def dialect: String = getConf(SQLConf.DIALECT, "hiveql") } - protected[hive] lazy val hiveconf: HiveConf = { - setConf(sessionState.getConf.getAllProperties) - sessionState.getConf - } - /** * SQLConf and HiveConf contracts: * @@ -285,78 +366,12 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { state = new SessionState(new HiveConf(classOf[SessionState])) SessionState.start(state) } - if (state.out == null) { - state.out = new PrintStream(outputBuffer, true, "UTF-8") - } - if (state.err == null) { - state.err = new PrintStream(outputBuffer, true, "UTF-8") - } state } - } - - /** - * Runs the specified SQL query using Hive. - */ - protected[sql] def runSqlHive(sql: String): Seq[String] = { - val maxResults = 100000 - val results = runHive(sql, maxResults) - // It is very confusing when you only get back some of the results... - if (results.size == maxResults) sys.error("RESULTS POSSIBLY TRUNCATED") - results - } - - /** - * Execute the command using Hive and return the results as a sequence. Each element - * in the sequence is one row. - */ - protected def runHive(cmd: String, maxRows: Int = 1000): Seq[String] = synchronized { - try { - val cmd_trimmed: String = cmd.trim() - val tokens: Array[String] = cmd_trimmed.split("\\s+") - val cmd_1: String = cmd_trimmed.substring(tokens(0).length()).trim() - val proc: CommandProcessor = HiveShim.getCommandProcessor(Array(tokens(0)), hiveconf) - - // Makes sure the session represented by the `sessionState` field is activated. This implies - // Spark SQL Hive support uses a single `SessionState` for all Hive operations and breaks - // session isolation under multi-user scenarios (i.e. HiveThriftServer2). - // TODO Fix session isolation - if (SessionState.get() != sessionState) { - SessionState.start(sessionState) - } - proc match { - case driver: Driver => - val results = HiveShim.createDriverResultsArray - val response: CommandProcessorResponse = driver.run(cmd) - // Throw an exception if there is an error in query processing. 
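Subclasses get one hook into client construction: the configure() method added in this hunk, whose result is merged in after the settings read from hive-site.xml (so later entries win). For illustration, a hypothetical subclass:

    import org.apache.spark.SparkContext
    import org.apache.spark.sql.hive.HiveContext

    class WarehouseOverridingContext(sc: SparkContext) extends HiveContext(sc) {
      // Merged into the metastore client's configuration before it is initialized.
      protected override def configure(): Map[String, String] =
        Map("hive.metastore.warehouse.dir" -> "/tmp/demo-warehouse")
    }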
- if (response.getResponseCode != 0) { - driver.close() - throw new QueryExecutionException(response.getErrorMessage) - } - driver.setMaxRows(maxRows) - driver.getResults(results) - driver.close() - HiveShim.processResults(results) - case _ => - if (sessionState.out != null) { - sessionState.out.println(tokens(0) + " " + cmd_1) - } - Seq(proc.run(cmd_1).getResponseCode.toString) - } - } catch { - case e: Exception => - logError( - s""" - |====================== - |HIVE FAILURE OUTPUT - |====================== - |${outputBuffer.toString} - |====================== - |END HIVE FAILURE OUTPUT - |====================== - """.stripMargin) - throw e + protected[hive] lazy val hiveconf: HiveConf = { + setConf(sessionState.getConf.getAllProperties) + sessionState.getConf } } @@ -391,17 +406,23 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { ) } + protected[hive] def runSqlHive(sql: String): Seq[String] = { + if (sql.toLowerCase.contains("create temporary function")) { + executionHive.runSqlHive(sql) + } else if (sql.trim.toLowerCase.startsWith("set")) { + metadataHive.runSqlHive(sql) + executionHive.runSqlHive(sql) + } else { + metadataHive.runSqlHive(sql) + } + } + @transient override protected[sql] val planner = hivePlanner /** Extends QueryExecution with hive specific features. */ protected[sql] class QueryExecution(logicalPlan: LogicalPlan) extends super.QueryExecution(logicalPlan) { - // Like what we do in runHive, makes sure the session represented by the - // `sessionState` field is activated. - if (SessionState.get() != sessionState) { - SessionState.start(sessionState) - } /** * Returns the result as a hive compatible sequence of strings. For native commands, the @@ -439,7 +460,21 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } -private object HiveContext { +private[hive] object HiveContext { + /** The version of hive used internally by Spark SQL. */ + val hiveExecutionVersion: String = "0.13.1" + + val HIVE_METASTORE_VERSION: String = "spark.sql.hive.metastore.version" + val HIVE_METASTORE_JARS: String = "spark.sql.hive.metastore.jars" + + /** Constructs a configuration for hive, where the metastore is located in a temp directory. 
*/ + def newTemporaryConfiguration(): Map[String, String] = { + val tempDir = Utils.createTempDir() + val localMetastore = new File(tempDir, "metastore").getAbsolutePath + Map( + "javax.jdo.option.ConnectionURL" -> s"jdbc:derby:;databaseName=$localMetastore;create=true") + } + protected val primitiveTypes = Seq(StringType, IntegerType, LongType, DoubleType, FloatType, BooleanType, ByteType, ShortType, DateType, TimestampType, BinaryType) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 4d222cf88e5e8..8fcdf3d0ab119 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -22,6 +22,8 @@ import java.util.{List => JList} import com.google.common.base.Objects import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} + +import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.metastore.api.{FieldSchema, Partition => TPartition, Table => TTable} import org.apache.hadoop.hive.metastore.{TableType, Warehouse} import org.apache.hadoop.hive.ql.metadata._ @@ -32,6 +34,7 @@ import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.Logging +import org.apache.spark.sql.hive.client.IsolatedClientLoader import org.apache.spark.sql.{SaveMode, AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NoSuchTableException, Catalog, OverrideCatalog} import org.apache.spark.sql.catalyst.expressions._ @@ -39,6 +42,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.parquet.{ParquetRelation2, Partition => ParquetPartition, PartitionSpec} import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, DDLParser, LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.types._ @@ -47,11 +51,10 @@ import org.apache.spark.util.Utils /* Implicit conversions */ import scala.collection.JavaConversions._ -private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with Logging { - import org.apache.spark.sql.hive.HiveMetastoreTypes._ +private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: HiveContext) + extends Catalog with Logging { - /** Connection to hive metastore. Usages should lock on `this`. */ - protected[hive] val client = Hive.get(hive.hiveconf) + import org.apache.spark.sql.hive.HiveMetastoreTypes._ /** Usages should lock on `this`. 
*/ protected[hive] lazy val hiveWarehouse = new Warehouse(hive.hiveconf) @@ -67,14 +70,12 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val cacheLoader = new CacheLoader[QualifiedTableName, LogicalPlan]() { override def load(in: QualifiedTableName): LogicalPlan = { logDebug(s"Creating new cached data source for $in") - val table = HiveMetastoreCatalog.this.synchronized { - client.getTable(in.database, in.name) - } + val table = client.getTable(in.database, in.name) def schemaStringFromParts: Option[String] = { - Option(table.getProperty("spark.sql.sources.schema.numParts")).map { numParts => + table.properties.get("spark.sql.sources.schema.numParts").map { numParts => val parts = (0 until numParts.toInt).map { index => - val part = table.getProperty(s"spark.sql.sources.schema.part.${index}") + val part = table.properties.get(s"spark.sql.sources.schema.part.${index}").orNull if (part == null) { throw new AnalysisException( s"Could not read schema from the metastore because it is corrupted " + @@ -92,20 +93,20 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with // After SPARK-6024, we removed this flag. // Although we are not using spark.sql.sources.schema any more, we need to still support. val schemaString = - Option(table.getProperty("spark.sql.sources.schema")).orElse(schemaStringFromParts) + table.properties.get("spark.sql.sources.schema").orElse(schemaStringFromParts) val userSpecifiedSchema = schemaString.map(s => DataType.fromJson(s).asInstanceOf[StructType]) // It does not appear that the ql client for the metastore has a way to enumerate all the // SerDe properties directly... - val options = table.getTTable.getSd.getSerdeInfo.getParameters.toMap + val options = table.serdeProperties val resolvedRelation = ResolvedDataSource( hive, userSpecifiedSchema, - table.getProperty("spark.sql.sources.provider"), + table.properties("spark.sql.sources.provider"), options) LogicalRelation(resolvedRelation.relation) @@ -144,49 +145,53 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with options: Map[String, String], isExternal: Boolean): Unit = { val (dbName, tblName) = processDatabaseAndTableName("default", tableName) - val tbl = new Table(dbName, tblName) - - tbl.setProperty("spark.sql.sources.provider", provider) + val tableProperties = new scala.collection.mutable.HashMap[String, String] + tableProperties.put("spark.sql.sources.provider", provider) if (userSpecifiedSchema.isDefined) { val threshold = hive.conf.schemaStringLengthThreshold val schemaJsonString = userSpecifiedSchema.get.json // Split the JSON string. 
val parts = schemaJsonString.grouped(threshold).toSeq - tbl.setProperty("spark.sql.sources.schema.numParts", parts.size.toString) + tableProperties.put("spark.sql.sources.schema.numParts", parts.size.toString) parts.zipWithIndex.foreach { case (part, index) => - tbl.setProperty(s"spark.sql.sources.schema.part.${index}", part) + tableProperties.put(s"spark.sql.sources.schema.part.${index}", part) } } - options.foreach { case (key, value) => tbl.setSerdeParam(key, value) } - if (isExternal) { - tbl.setProperty("EXTERNAL", "TRUE") - tbl.setTableType(TableType.EXTERNAL_TABLE) + val tableType = if (isExternal) { + tableProperties.put("EXTERNAL", "TRUE") + ExternalTable } else { - tbl.setProperty("EXTERNAL", "FALSE") - tbl.setTableType(TableType.MANAGED_TABLE) - } - - // create the table - synchronized { - client.createTable(tbl, false) - } + tableProperties.put("EXTERNAL", "FALSE") + ManagedTable + } + + client.createTable( + HiveTable( + specifiedDatabase = Option(dbName), + name = tblName, + schema = Seq.empty, + partitionColumns = Seq.empty, + tableType = tableType, + properties = tableProperties.toMap, + serdeProperties = options)) } - def hiveDefaultTableFilePath(tableName: String): String = synchronized { - val currentDatabase = client.getDatabase(hive.sessionState.getCurrentDatabase) - - hiveWarehouse.getTablePath(currentDatabase, tableName).toString + def hiveDefaultTableFilePath(tableName: String): String = { + // Code based on: hiveWarehouse.getTablePath(currentDatabase, tableName) + new Path( + new Path(client.getDatabase(client.currentDatabase).location), + tableName.toLowerCase).toString } - def tableExists(tableIdentifier: Seq[String]): Boolean = synchronized { + def tableExists(tableIdentifier: Seq[String]): Boolean = { val tableIdent = processTableIdentifier(tableIdentifier) val databaseName = tableIdent .lift(tableIdent.size - 2) - .getOrElse(hive.sessionState.getCurrentDatabase) + .getOrElse(client.currentDatabase) val tblName = tableIdent.last - client.getTable(databaseName, tblName, false) != null + client.getTableOption(databaseName, tblName).isDefined } def lookupRelation( @@ -194,18 +199,11 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with alias: Option[String]): LogicalPlan = { val tableIdent = processTableIdentifier(tableIdentifier) val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse( - hive.sessionState.getCurrentDatabase) + client.currentDatabase) val tblName = tableIdent.last - val table = try { - synchronized { - client.getTable(databaseName, tblName) - } - } catch { - case te: org.apache.hadoop.hive.ql.metadata.InvalidTableException => - throw new NoSuchTableException - } + val table = client.getTable(databaseName, tblName) - if (table.getProperty("spark.sql.sources.provider") != null) { + if (table.properties.get("spark.sql.sources.provider").isDefined) { val dataSourceTable = cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase) // Then, if alias is specified, wrap the table with a Subquery using the alias. @@ -215,22 +213,16 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with Subquery(tableIdent.last, dataSourceTable)) withAlias - } else if (table.isView) { - // if the unresolved relation is from hive view - // parse the text into logic node. 
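The part-splitting convention used above round-trips as follows (a sketch with hypothetical helper names; the threshold corresponds to hive.conf.schemaStringLengthThreshold):

    // Writing: one JSON schema string becomes several bounded-size properties.
    def toSchemaProperties(schemaJson: String, threshold: Int): Map[String, String] = {
      val parts = schemaJson.grouped(threshold).toSeq
      Map("spark.sql.sources.schema.numParts" -> parts.size.toString) ++
        parts.zipWithIndex.map { case (part, index) =>
          s"spark.sql.sources.schema.part.$index" -> part
        }
    }

    // Reading: reassemble in order, failing loudly on a missing part.
    def fromSchemaProperties(props: Map[String, String]): Option[String] =
      props.get("spark.sql.sources.schema.numParts").map { numParts =>
        (0 until numParts.toInt).map { index =>
          props.getOrElse(s"spark.sql.sources.schema.part.$index",
            sys.error(s"missing schema part $index"))
        }.mkString
      }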
-      HiveQl.createPlanForView(table, alias)
+    } else if (table.tableType == VirtualView) {
+      val viewText = table.viewText.getOrElse(sys.error("Invalid view without text."))
+      alias match {
+        // because hive uses things like `_c0` to build the expanded text
+        // currently we cannot support views created with "create view v1(c1) as ..."
+        case None => Subquery(table.name, HiveQl.createPlan(viewText))
+        case Some(aliasText) => Subquery(aliasText, HiveQl.createPlan(viewText))
+      }
     } else {
-      val partitions: Seq[Partition] =
-        if (table.isPartitioned) {
-          synchronized {
-            HiveShim.getAllPartitionsOf(client, table).toSeq
-          }
-        } else {
-          Nil
-        }
-
-      MetastoreRelation(databaseName, tblName, alias)(
-        table.getTTable, partitions.map(part => part.getTPartition))(hive)
+      MetastoreRelation(databaseName, tblName, alias)(table)(hive)
     }
   }
@@ -318,178 +310,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
     result.newInstance()
   }
 
-  override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = synchronized {
-    val dbName = if (!caseSensitive) {
-      if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None
-    } else {
-      databaseName
-    }
-    val db = dbName.getOrElse(hive.sessionState.getCurrentDatabase)
-
-    client.getAllTables(db).map(tableName => (tableName, false))
-  }
-
-  /**
-   * Create table with specified database, table name, table description and schema
-   * @param databaseName Database Name
-   * @param tableName Table Name
-   * @param schema Schema of the new table, if not specified, will use the schema
-   *               specified in crtTbl
-   * @param allowExisting if true, ignore AlreadyExistsException
-   * @param desc CreateTableDesc object which contains the SerDe info. Currently
-   *             we support most of the features except the bucket.
-   */
-  def createTable(
-      databaseName: String,
-      tableName: String,
-      schema: Seq[Attribute],
-      allowExisting: Boolean = false,
-      desc: Option[CreateTableDesc] = None) {
-    val hconf = hive.hiveconf
-
-    val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName)
-    val tbl = new Table(dbName, tblName)
-
-    val crtTbl: CreateTableDesc = desc.getOrElse(null)
-
-    // We should respect the passed in schema, unless it's not set
-    val hiveSchema: JList[FieldSchema] = if (schema == null || schema.isEmpty) {
-      crtTbl.getCols
-    } else {
-      schema.map(attr => new FieldSchema(attr.name, toMetastoreType(attr.dataType), null))
-    }
-    tbl.setFields(hiveSchema)
-
-    // Most of code are similar with the DDLTask.createTable() of Hive,
-    if (crtTbl != null && crtTbl.getTblProps() != null) {
-      tbl.getTTable().getParameters().putAll(crtTbl.getTblProps())
-    }
-
-    if (crtTbl != null && crtTbl.getPartCols() != null) {
-      tbl.setPartCols(crtTbl.getPartCols())
-    }
-
-    if (crtTbl != null && crtTbl.getStorageHandler() != null) {
-      tbl.setProperty(
-        org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE,
-        crtTbl.getStorageHandler())
-    }
-
-    /*
-     * We use LazySimpleSerDe by default.
-     *
-     * If the user didn't specify a SerDe, and any of the columns are not simple
-     * types, we will have to use DynamicSerDe instead.
- */ - if (crtTbl == null || crtTbl.getSerName() == null) { - val storageHandler = tbl.getStorageHandler() - if (storageHandler == null) { - logInfo(s"Default to LazySimpleSerDe for table $dbName.$tblName") - tbl.setSerializationLib(classOf[LazySimpleSerDe].getName()) - - import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - import org.apache.hadoop.io.Text - import org.apache.hadoop.mapred.TextInputFormat - - tbl.setInputFormatClass(classOf[TextInputFormat]) - tbl.setOutputFormatClass(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]]) - tbl.setSerializationLib("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") - } else { - val serDeClassName = storageHandler.getSerDeClass().getName() - logInfo(s"Use StorageHandler-supplied $serDeClassName for table $dbName.$tblName") - tbl.setSerializationLib(serDeClassName) - } - } else { - // let's validate that the serde exists - val serdeName = crtTbl.getSerName() - try { - val d = ReflectionUtils.newInstance(hconf.getClassByName(serdeName), hconf) - if (d != null) { - logDebug("Found class for $serdeName") - } - } catch { - case e: SerDeException => throw new HiveException("Cannot validate serde: " + serdeName, e) - } - tbl.setSerializationLib(serdeName) - } - - if (crtTbl != null && crtTbl.getFieldDelim() != null) { - tbl.setSerdeParam(serdeConstants.FIELD_DELIM, crtTbl.getFieldDelim()) - tbl.setSerdeParam(serdeConstants.SERIALIZATION_FORMAT, crtTbl.getFieldDelim()) - } - if (crtTbl != null && crtTbl.getFieldEscape() != null) { - tbl.setSerdeParam(serdeConstants.ESCAPE_CHAR, crtTbl.getFieldEscape()) - } - - if (crtTbl != null && crtTbl.getCollItemDelim() != null) { - tbl.setSerdeParam(serdeConstants.COLLECTION_DELIM, crtTbl.getCollItemDelim()) - } - if (crtTbl != null && crtTbl.getMapKeyDelim() != null) { - tbl.setSerdeParam(serdeConstants.MAPKEY_DELIM, crtTbl.getMapKeyDelim()) - } - if (crtTbl != null && crtTbl.getLineDelim() != null) { - tbl.setSerdeParam(serdeConstants.LINE_DELIM, crtTbl.getLineDelim()) - } - HiveShim.setTblNullFormat(crtTbl, tbl) - - if (crtTbl != null && crtTbl.getSerdeProps() != null) { - val iter = crtTbl.getSerdeProps().entrySet().iterator() - while (iter.hasNext()) { - val m = iter.next() - tbl.setSerdeParam(m.getKey(), m.getValue()) - } - } - - if (crtTbl != null && crtTbl.getComment() != null) { - tbl.setProperty("comment", crtTbl.getComment()) - } - - if (crtTbl != null && crtTbl.getLocation() != null) { - HiveShim.setLocation(tbl, crtTbl) - } - - if (crtTbl != null && crtTbl.getSkewedColNames() != null) { - tbl.setSkewedColNames(crtTbl.getSkewedColNames()) - } - if (crtTbl != null && crtTbl.getSkewedColValues() != null) { - tbl.setSkewedColValues(crtTbl.getSkewedColValues()) - } - - if (crtTbl != null) { - tbl.setStoredAsSubDirectories(crtTbl.isStoredAsSubDirectories()) - tbl.setInputFormatClass(crtTbl.getInputFormat()) - tbl.setOutputFormatClass(crtTbl.getOutputFormat()) - } - - tbl.getTTable().getSd().setInputFormat(tbl.getInputFormatClass().getName()) - tbl.getTTable().getSd().setOutputFormat(tbl.getOutputFormatClass().getName()) - - if (crtTbl != null && crtTbl.isExternal()) { - tbl.setProperty("EXTERNAL", "TRUE") - tbl.setTableType(TableType.EXTERNAL_TABLE) - } - - // set owner - try { - tbl.setOwner(hive.hiveconf.getUser) - } catch { - case e: IOException => throw new HiveException("Unable to get current user", e) - } - - // set create time - tbl.setCreateTime((System.currentTimeMillis() / 1000).asInstanceOf[Int]) - - // TODO add bucket support - // TODO set more info if Hive upgrade + override 
def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { + val db = databaseName.getOrElse(client.currentDatabase) - // create the table - synchronized { - try client.createTable(tbl, allowExisting) catch { - case e: org.apache.hadoop.hive.metastore.api.AlreadyExistsException - if allowExisting => // Do nothing - case e: Throwable => throw e - } - } + client.listTables(db).map(tableName => (tableName, false)) } protected def processDatabaseAndTableName( @@ -598,42 +422,11 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with // Wait until children are resolved. case p: LogicalPlan if !p.childrenResolved => p - // TODO extra is in type of ASTNode which means the logical plan is not resolved - // Need to think about how to implement the CreateTableAsSelect.resolved - case CreateTableAsSelect(db, tableName, child, allowExisting, Some(extra: ASTNode)) => - val (dbName, tblName) = processDatabaseAndTableName(db, tableName) - val databaseName = dbName.getOrElse(hive.sessionState.getCurrentDatabase) - - // Get the CreateTableDesc from Hive SemanticAnalyzer - val desc: Option[CreateTableDesc] = if (tableExists(Seq(databaseName, tblName))) { - None - } else { - val sa = new SemanticAnalyzer(hive.hiveconf) { - override def analyzeInternal(ast: ASTNode) { - // A hack to intercept the SemanticAnalyzer.analyzeInternal, - // to ignore the SELECT clause of the CTAS - val method = classOf[SemanticAnalyzer].getDeclaredMethod( - "analyzeCreateTable", classOf[ASTNode], classOf[QB]) - method.setAccessible(true) - method.invoke(this, ast, this.getQB) - } - } - - sa.analyze(extra, new Context(hive.hiveconf)) - Some(sa.getQB().getTableDesc) - } - - // Check if the query specifies file format or storage handler. - val hasStorageSpec = desc match { - case Some(crtTbl) => - crtTbl != null && (crtTbl.getSerName != null || crtTbl.getStorageHandler != null) - case None => false - } - - if (hive.convertCTAS && !hasStorageSpec) { + case CreateTableAsSelect(desc, child, allowExisting) => + if (hive.convertCTAS && !desc.serde.isDefined) { // Do the conversion when spark.sql.hive.convertCTAS is true and the query // does not specify any storage format (file format and storage handler). 
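Concretely, the branch above behaves like this (illustrative; assumes a constructed HiveContext named hiveContext):

    // With the pre-existing flag enabled, a CTAS that names no SerDe is planned
    // against the default data source rather than as a Hive table:
    hiveContext.setConf("spark.sql.hive.convertCTAS", "true")
    hiveContext.sql("CREATE TABLE ctas_demo AS SELECT 1 AS id")

    // ...while qualifying the table name with a database is rejected:
    // hiveContext.sql("CREATE TABLE somedb.ctas_demo AS SELECT 1 AS id")  // AnalysisException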
- if (dbName.isDefined) { + if (desc.specifiedDatabase.isDefined) { throw new AnalysisException( "Cannot specify database name in a CTAS statement " + "when spark.sql.hive.convertCTAS is set to true.") @@ -641,7 +434,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val mode = if (allowExisting) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTableUsingAsSelect( - tblName, + desc.name, hive.conf.defaultDataSourceName, temporary = false, mode, @@ -650,19 +443,19 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with ) } else { execution.CreateTableAsSelect( - databaseName, - tableName, + desc.copy( + specifiedDatabase = Option(desc.specifiedDatabase.getOrElse(client.currentDatabase))), child, - allowExisting, - desc) + allowExisting) } case p: LogicalPlan if p.resolved => p - case p @ CreateTableAsSelect(db, tableName, child, allowExisting, None) => - val (dbName, tblName) = processDatabaseAndTableName(db, tableName) + case p @ CreateTableAsSelect(desc, child, allowExisting) => + val (dbName, tblName) = processDatabaseAndTableName(desc.database, desc.name) + if (hive.convertCTAS) { - if (dbName.isDefined) { + if (desc.specifiedDatabase.isDefined) { throw new AnalysisException( "Cannot specify database name in a CTAS statement " + "when spark.sql.hive.convertCTAS is set to true.") @@ -678,13 +471,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with child ) } else { - val databaseName = dbName.getOrElse(hive.sessionState.getCurrentDatabase) execution.CreateTableAsSelect( - databaseName, - tableName, + desc, child, - allowExisting, - None) + allowExisting) } } } @@ -767,7 +557,7 @@ private[hive] case class InsertIntoHiveTable( private[hive] case class MetastoreRelation (databaseName: String, tableName: String, alias: Option[String]) - (val table: TTable, val partitions: Seq[TPartition]) + (val table: HiveTable) (@transient sqlContext: SQLContext) extends LeafNode with MultiInstanceRelation { @@ -786,16 +576,63 @@ private[hive] case class MetastoreRelation Objects.hashCode(databaseName, tableName, alias, output) } - // TODO: Can we use org.apache.hadoop.hive.ql.metadata.Table as the type of table and - // use org.apache.hadoop.hive.ql.metadata.Partition as the type of elements of partitions. - // Right now, using org.apache.hadoop.hive.ql.metadata.Table and - // org.apache.hadoop.hive.ql.metadata.Partition will cause a NotSerializableException - // which indicates the SerDe we used is not Serializable. + @transient val hiveQlTable: Table = { + // We start by constructing an API table as Hive performs several important transformations + // internally when converting an API table to a QL table. 
+ val tTable = new org.apache.hadoop.hive.metastore.api.Table() + tTable.setTableName(table.name) + tTable.setDbName(table.database) + + val tableParameters = new java.util.HashMap[String, String]() + tTable.setParameters(tableParameters) + table.properties.foreach { case (k, v) => tableParameters.put(k, v) } + + tTable.setTableType(table.tableType.name) + + val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor() + tTable.setSd(sd) + sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + tTable.setPartitionKeys( + table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + + table.location.foreach(sd.setLocation) + table.inputFormat.foreach(sd.setInputFormat) + table.outputFormat.foreach(sd.setOutputFormat) + + val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo + sd.setSerdeInfo(serdeInfo) + table.serde.foreach(serdeInfo.setSerializationLib) + val serdeParameters = new java.util.HashMap[String, String]() + serdeInfo.setParameters(serdeParameters) + table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } + + new Table(tTable) + } + + @transient val hiveQlPartitions: Seq[Partition] = table.getAllPartitions.map { p => + val tPartition = new org.apache.hadoop.hive.metastore.api.Partition + tPartition.setDbName(databaseName) + tPartition.setTableName(tableName) + tPartition.setValues(p.values) + + val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor() + tPartition.setSd(sd) + sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + + sd.setLocation(p.storage.location) + sd.setInputFormat(p.storage.inputFormat) + sd.setOutputFormat(p.storage.outputFormat) + + val serdeInfo = new org.apache.hadoop.hive.metastore.api.SerDeInfo + sd.setSerdeInfo(serdeInfo) + serdeInfo.setSerializationLib(p.storage.serde) - @transient val hiveQlTable: Table = new Table(table) + val serdeParameters = new java.util.HashMap[String, String]() + serdeInfo.setParameters(serdeParameters) + table.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } + p.storage.serdeProperties.foreach { case (k, v) => serdeParameters.put(k, v) } - @transient val hiveQlPartitions: Seq[Partition] = partitions.map { p => - new Partition(hiveQlTable, p) + new Partition(hiveQlTable, tPartition) } @transient override lazy val statistics: Statistics = Statistics( @@ -865,7 +702,7 @@ private[hive] case class MetastoreRelation val columnOrdinals = AttributeMap(attributes.zipWithIndex) override def newInstance(): MetastoreRelation = { - MetastoreRelation(databaseName, tableName, alias)(table, partitions)(sqlContext) + MetastoreRelation(databaseName, tableName, alias)(table)(sqlContext) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 6176aee25e7a4..f30b196734c40 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.sources.DescribeCommand +import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema} import org.apache.spark.sql.types._ import org.apache.spark.util.random.RandomSampler @@ -50,7 +51,19 @@ import 
scala.collection.JavaConversions._ * back for Hive to execute natively. Will be replaced with a native command that contains the * cmd string. */ -private[hive] case object NativePlaceholder extends Command +private[hive] case object NativePlaceholder extends LogicalPlan { + override def children: Seq[LogicalPlan] = Seq.empty + override def output: Seq[Attribute] = Seq.empty +} + +case class CreateTableAsSelect( + tableDesc: HiveTable, + child: LogicalPlan, + allowExisting: Boolean) extends UnaryNode with Command { + + override def output: Seq[Attribute] = Seq.empty[Attribute] + override lazy val resolved: Boolean = tableDesc.specifiedDatabase.isDefined && childrenResolved +} /** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */ private[hive] object HiveQl { @@ -78,16 +91,16 @@ private[hive] object HiveQl { "TOK_ALTERVIEW_DROPPARTS", "TOK_ALTERVIEW_PROPERTIES", "TOK_ALTERVIEW_RENAME", - + "TOK_CREATEDATABASE", "TOK_CREATEFUNCTION", "TOK_CREATEINDEX", "TOK_CREATEROLE", "TOK_CREATEVIEW", - + "TOK_DESCDATABASE", "TOK_DESCFUNCTION", - + "TOK_DROPDATABASE", "TOK_DROPFUNCTION", "TOK_DROPINDEX", @@ -95,22 +108,22 @@ private[hive] object HiveQl { "TOK_DROPTABLE_PROPERTIES", "TOK_DROPVIEW", "TOK_DROPVIEW_PROPERTIES", - + "TOK_EXPORT", - + "TOK_GRANT", "TOK_GRANT_ROLE", - + "TOK_IMPORT", - + "TOK_LOAD", - + "TOK_LOCKTABLE", - + "TOK_MSCK", - + "TOK_REVOKE", - + "TOK_SHOW_COMPACTIONS", "TOK_SHOW_CREATETABLE", "TOK_SHOW_GRANT", @@ -127,9 +140,9 @@ private[hive] object HiveQl { "TOK_SHOWINDEXES", "TOK_SHOWLOCKS", "TOK_SHOWPARTITIONS", - + "TOK_SWITCHDATABASE", - + "TOK_UNLOCKTABLE" ) @@ -259,6 +272,7 @@ private[hive] object HiveQl { case otherMessage => throw new AnalysisException(otherMessage) } + case e: MatchError => throw e case e: Exception => throw new AnalysisException(e.getMessage) case e: NotImplementedError => @@ -272,14 +286,6 @@ private[hive] object HiveQl { } } - /** Creates LogicalPlan for a given VIEW */ - def createPlanForView(view: Table, alias: Option[String]): Subquery = alias match { - // because hive use things like `_c0` to build the expanded text - // currently we cannot support view from "create view v1(c1) as ..." - case None => Subquery(view.getTableName, createPlan(view.getViewExpandedText)) - case Some(aliasText) => Subquery(aliasText, createPlan(view.getViewExpandedText)) - } - def parseDdl(ddl: String): Seq[Attribute] = { val tree = try { @@ -453,6 +459,14 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C (keys, bitmasks) } + protected def getProperties(node: Node): Seq[(String, String)] = node match { + case Token("TOK_TABLEPROPLIST", list) => + list.map { + case Token("TOK_TABLEPROPERTY", Token(key, Nil) :: Token(value, Nil) :: Nil) => + (unquoteString(key) -> unquoteString(value)) + } + } + protected def nodeToPlan(node: Node): LogicalPlan = node match { // Special drop table that also uncaches. 
case Token("TOK_DROPTABLE", @@ -562,7 +576,62 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C children) val (db, tableName) = extractDbNameTableName(tableNameParts) - CreateTableAsSelect(db, tableName, nodeToPlan(query), allowExisting != None, Some(node)) + var tableDesc = + HiveTable( + specifiedDatabase = db, + name = tableName, + schema = Seq.empty, + partitionColumns = Seq.empty, + properties = Map.empty, + serdeProperties = Map.empty, + tableType = ManagedTable, + location = None, + inputFormat = None, + outputFormat = None, + serde = None) + + // TODO: Handle all the cases here... + children.foreach { + case Token("TOK_TBLRCFILE", Nil) => + import org.apache.hadoop.hive.ql.io.{RCFileInputFormat, RCFileOutputFormat} + tableDesc = tableDesc.copy( + outputFormat = Option(classOf[RCFileOutputFormat].getName), + inputFormat = Option(classOf[RCFileInputFormat[_, _]].getName)) + + if (tableDesc.serde.isEmpty) { + tableDesc = tableDesc.copy( + serde = Option("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) + } + case Token("TOK_TBLORCFILE", Nil) => + tableDesc = tableDesc.copy( + inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"), + serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + + case Token("TOK_TBLPARQUETFILE", Nil) => + tableDesc = tableDesc.copy( + inputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"), + outputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"), + serde = Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")) + + case Token("TOK_TABLESERIALIZER", + Token("TOK_SERDENAME", Token(serdeName, Nil) :: otherProps) :: Nil) => + tableDesc = tableDesc.copy(serde = Option(unquoteString(serdeName))) + + otherProps match { + case Token("TOK_TABLEPROPERTIES", list :: Nil) :: Nil => + tableDesc = tableDesc.copy( + serdeProperties = tableDesc.serdeProperties ++ getProperties(list)) + case Nil => + } + + case Token("TOK_TABLEPROPERTIES", list :: Nil) => + tableDesc = tableDesc.copy(properties = tableDesc.properties ++ getProperties(list)) + + case _ => + } + + CreateTableAsSelect(tableDesc, nodeToPlan(query), allowExisting != None) // If its not a "CREATE TABLE AS" like above then just pass it back to hive as a native command. 
case Token("TOK_CREATETABLE", _) => NativePlaceholder @@ -759,7 +828,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token("TOK_CUBE_GROUPBY", children) => Cube(children.map(nodeToExpr), withLateralView, selectExpressions) case _ => sys.error("Expect WITH CUBE") - }), + }), Some(Project(selectExpressions, withLateralView))).flatten.head } @@ -1077,6 +1146,15 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } protected val escapedIdentifier = "`([^`]+)`".r + protected val doubleQuotedString = "\"([^\"]+)\"".r + protected val singleQuotedString = "'([^']+)'".r + + protected def unquoteString(str: String) = str match { + case singleQuotedString(s) => s + case doubleQuotedString(s) => s + case other => other + } + /** Strips backticks from ident if present */ protected def cleanIdentifier(ident: String): String = ident match { case escapedIdentifier(i) => i diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index e556c74ffb015..b69312f0f8717 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -32,6 +32,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf} import org.apache.spark.SerializableWritable import org.apache.spark.broadcast.Broadcast +import org.apache.spark.Logging import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, RDD, UnionRDD} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.DateUtils @@ -57,7 +58,7 @@ class HadoopTableReader( @transient relation: MetastoreRelation, @transient sc: HiveContext, @transient hiveExtraConf: HiveConf) - extends TableReader { + extends TableReader with Logging { // Hadoop honors "mapred.map.tasks" as hint, but will ignore when mapred.job.tracker is "local". // https://hadoop.apache.org/docs/r1.0.4/mapred-default.html @@ -78,7 +79,7 @@ class HadoopTableReader( makeRDDForTable( hiveTable, Class.forName( - relation.tableDesc.getSerdeClassName, true, sc.sessionState.getConf.getClassLoader) + relation.tableDesc.getSerdeClassName, true, Utils.getSparkClassLoader) .asInstanceOf[Class[Deserializer]], filterOpt = None) @@ -145,7 +146,7 @@ class HadoopTableReader( partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]], filterOpt: Option[PathFilter]): RDD[Row] = { - + // SPARK-5068:get FileStatus and do the filtering locally when the path is not exists def verifyPartitionPath( partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]): @@ -288,7 +289,7 @@ class HadoopTableReader( } } -private[hive] object HadoopTableReader extends HiveInspectors { +private[hive] object HadoopTableReader extends HiveInspectors with Logging { /** * Curried. After given an argument for 'path', the resulting JobConf => Unit closure is used to * instantiate a HadoopRDD. 
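For reference, the new unquoteString helper in the HiveQl.scala hunk above behaves as follows (a behavior sketch; the regex extractors only strip quotes that wrap the entire string):

    unquoteString("'field.delim'")  // => field.delim
    unquoteString("\"snappy\"")     // => snappy
    unquoteString("bare")           // => bare (returned unchanged)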
@@ -329,6 +330,8 @@ private[hive] object HadoopTableReader extends HiveInspectors { tableDeser.getObjectInspector).asInstanceOf[StructObjectInspector] } + logDebug(soi.toString) + val (fieldRefs, fieldOrdinals) = nonPartitionKeyAttrs.map { case (attr, ordinal) => soi.getStructFieldRef(attr.name) -> ordinal }.unzip diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala index a863aa77cb7e0..0a1d761a52f88 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala @@ -17,30 +17,35 @@ package org.apache.spark.sql.hive.client +import java.io.PrintStream +import java.util.{Map => JMap} + import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException} -case class HiveDatabase( +private[hive] case class HiveDatabase( name: String, location: String) -abstract class TableType { val name: String } -case object ExternalTable extends TableType { override val name = "EXTERNAL_TABLE" } -case object IndexTable extends TableType { override val name = "INDEX_TABLE" } -case object ManagedTable extends TableType { override val name = "MANAGED_TABLE" } -case object VirtualView extends TableType { override val name = "VIRTUAL_VIEW" } +private[hive] abstract class TableType { val name: String } +private[hive] case object ExternalTable extends TableType { override val name = "EXTERNAL_TABLE" } +private[hive] case object IndexTable extends TableType { override val name = "INDEX_TABLE" } +private[hive] case object ManagedTable extends TableType { override val name = "MANAGED_TABLE" } +private[hive] case object VirtualView extends TableType { override val name = "VIRTUAL_VIEW" } -case class HiveStorageDescriptor( +// TODO: Use this for Tables and Partitions +private[hive] case class HiveStorageDescriptor( location: String, inputFormat: String, outputFormat: String, - serde: String) + serde: String, + serdeProperties: Map[String, String]) -case class HivePartition( +private[hive] case class HivePartition( values: Seq[String], storage: HiveStorageDescriptor) -case class HiveColumn(name: String, hiveType: String, comment: String) -case class HiveTable( +private[hive] case class HiveColumn(name: String, hiveType: String, comment: String) +private[hive] case class HiveTable( specifiedDatabase: Option[String], name: String, schema: Seq[HiveColumn], @@ -51,7 +56,8 @@ case class HiveTable( location: Option[String] = None, inputFormat: Option[String] = None, outputFormat: Option[String] = None, - serde: Option[String] = None) { + serde: Option[String] = None, + viewText: Option[String] = None) { @transient private[client] var client: ClientInterface = _ @@ -76,13 +82,17 @@ case class HiveTable( * internal and external classloaders for a given version of Hive and thus must expose only * shared classes. */ -trait ClientInterface { +private[hive] trait ClientInterface { /** * Runs a HiveQL command using Hive, returning the results as a list of strings. Each row will * result in one string. */ def runSqlHive(sql: String): Seq[String] + def setOut(stream: PrintStream): Unit + def setInfo(stream: PrintStream): Unit + def setError(stream: PrintStream): Unit + /** Returns the names of all tables in the given database. */ def listTables(dbName: String): Seq[String] @@ -114,6 +124,11 @@ trait ClientInterface { /** Creates a new database with the given name. 
   */
  def createDatabase(database: HiveDatabase): Unit
 
+  /** Returns the specified partition or None if it does not exist. */
+  def getPartitionOption(
+      hTable: HiveTable,
+      partitionSpec: JMap[String, String]): Option[HivePartition]
+
   /** Returns all partitions for the given table. */
   def getAllPartitions(hTable: HiveTable): Seq[HivePartition]
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
index ea52fea037f1f..6bca9d0179fe3 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.hive.client
 
 import java.io.{BufferedReader, InputStreamReader, File, PrintStream}
 import java.net.URI
-import java.util.{ArrayList => JArrayList}
+import java.util.{ArrayList => JArrayList, Map => JMap, List => JList, Set => JSet}
 
 import scala.collection.JavaConversions._
 import scala.language.reflectiveCalls
@@ -27,6 +27,7 @@ import scala.language.reflectiveCalls
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.hive.metastore.api.Database
 import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.metastore.TableType
 import org.apache.hadoop.hive.metastore.api
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 import org.apache.hadoop.hive.ql.metadata
@@ -54,19 +55,13 @@ import org.apache.spark.sql.execution.QueryExecutionException
  * @param config a collection of configuration options that will be added to the hive conf before
  *               opening the hive client.
  */
-class ClientWrapper(
+private[hive] class ClientWrapper(
     version: HiveVersion,
     config: Map[String, String])
   extends ClientInterface
   with Logging
   with ReflectionMagic {
 
-  private val conf = new HiveConf(classOf[SessionState])
-  config.foreach { case (k, v) =>
-    logDebug(s"Hive Config: $k=$v")
-    conf.set(k, v)
-  }
-
   // Circular buffer to hold what hive prints to STDOUT and ERR. Only printed when failures occur.
   private val outputBuffer = new java.io.OutputStream {
     var pos: Int = 0
@@ -99,17 +94,31 @@ class ClientWrapper(
     val original = Thread.currentThread().getContextClassLoader
     Thread.currentThread().setContextClassLoader(getClass.getClassLoader)
     val ret = try {
-      val newState = new SessionState(conf)
-      SessionState.start(newState)
-      newState.out = new PrintStream(outputBuffer, true, "UTF-8")
-      newState.err = new PrintStream(outputBuffer, true, "UTF-8")
-      newState
+      val oldState = SessionState.get()
+      if (oldState == null) {
+        val initialConf = new HiveConf(classOf[SessionState])
+        config.foreach { case (k, v) =>
+          logDebug(s"Hive Config: $k=$v")
+          initialConf.set(k, v)
+        }
+        val newState = new SessionState(initialConf)
+        SessionState.start(newState)
+        newState.out = new PrintStream(outputBuffer, true, "UTF-8")
+        newState.err = new PrintStream(outputBuffer, true, "UTF-8")
+        newState
+      } else {
+        oldState
+      }
     } finally {
       Thread.currentThread().setContextClassLoader(original)
     }
     ret
   }
 
+  /** Returns the configuration for the current session. */
+  def conf: HiveConf = SessionState.get().getConf
+
+  // TODO: should be a def?
   private val client = Hive.get(conf)
 
   /**
@@ -133,6 +142,18 @@ class ClientWrapper(
     ret
   }
 
+  def setOut(stream: PrintStream): Unit = withHiveState {
+    state.out = stream
+  }
+
+  def setInfo(stream: PrintStream): Unit = withHiveState {
+    state.info = stream
+  }
+
+  def setError(stream: PrintStream): Unit = withHiveState {
+    state.err = stream
+  }
+
   override def currentDatabase: String = withHiveState {
     state.getCurrentDatabase
   }
@@ -171,14 +192,20 @@ class ClientWrapper(
         partitionColumns = h.getPartCols.map(f => HiveColumn(f.getName, f.getType, f.getComment)),
         properties = h.getParameters.toMap,
         serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.toMap,
-        tableType = ManagedTable, // TODO
+        tableType = h.getTableType match {
+          case TableType.MANAGED_TABLE => ManagedTable
+          case TableType.EXTERNAL_TABLE => ExternalTable
+          case TableType.VIRTUAL_VIEW => VirtualView
+          case TableType.INDEX_TABLE => IndexTable
+        },
         location = version match {
           case hive.v12 => Option(h.call[URI]("getDataLocation")).map(_.toString)
           case hive.v13 => Option(h.call[Path]("getDataLocation")).map(_.toString)
         },
         inputFormat = Option(h.getInputFormatClass).map(_.getName),
         outputFormat = Option(h.getOutputFormatClass).map(_.getName),
-        serde = Option(h.getSerializationLib)).withClient(this)
+        serde = Option(h.getSerializationLib),
+        viewText = Option(h.getViewExpandedText)).withClient(this)
     }
     converted
   }
@@ -223,27 +250,40 @@ class ClientWrapper(
     client.alterTable(table.qualifiedName, qlTable)
   }
 
+  private def toHivePartition(partition: metadata.Partition): HivePartition = {
+    val apiPartition = partition.getTPartition
+    HivePartition(
+      values = Option(apiPartition.getValues).map(_.toSeq).getOrElse(Seq.empty),
+      storage = HiveStorageDescriptor(
+        location = apiPartition.getSd.getLocation,
+        inputFormat = apiPartition.getSd.getInputFormat,
+        outputFormat = apiPartition.getSd.getOutputFormat,
+        serde = apiPartition.getSd.getSerdeInfo.getSerializationLib,
+        serdeProperties = apiPartition.getSd.getSerdeInfo.getParameters.toMap))
+  }
+
+  override def getPartitionOption(
+      table: HiveTable,
+      partitionSpec: JMap[String, String]): Option[HivePartition] = withHiveState {
+
+    val qlTable = toQlTable(table)
+    val qlPartition = client.getPartition(qlTable, partitionSpec, false)
+    Option(qlPartition).map(toHivePartition)
+  }
+
   override def getAllPartitions(hTable: HiveTable): Seq[HivePartition] = withHiveState {
     val qlTable = toQlTable(hTable)
     val qlPartitions = version match {
       case hive.v12 =>
-        client.call[metadata.Table, Set[metadata.Partition]]("getAllPartitionsForPruner", qlTable)
+        client.call[metadata.Table, JSet[metadata.Partition]]("getAllPartitionsForPruner", qlTable)
       case hive.v13 =>
-        client.call[metadata.Table, Set[metadata.Partition]]("getAllPartitionsOf", qlTable)
+        client.call[metadata.Table, JSet[metadata.Partition]]("getAllPartitionsOf", qlTable)
     }
-    qlPartitions.map(_.getTPartition).map { p =>
-      HivePartition(
-        values = Option(p.getValues).map(_.toSeq).getOrElse(Seq.empty),
-        storage = HiveStorageDescriptor(
-          location = p.getSd.getLocation,
-          inputFormat = p.getSd.getInputFormat,
-          outputFormat = p.getSd.getOutputFormat,
-          serde = p.getSd.getSerdeInfo.getSerializationLib))
-    }.toSeq
+    qlPartitions.toSeq.map(toHivePartition)
   }
 
   override def listTables(dbName: String): Seq[String] = withHiveState {
-    client.getAllTables
+    client.getAllTables(dbName)
   }
 
   /**
@@ -267,11 +307,12 @@ class ClientWrapper(
     try {
       val cmd_trimmed: String
= cmd.trim() val tokens: Array[String] = cmd_trimmed.split("\\s+") + // The remainder of the command. val cmd_1: String = cmd_trimmed.substring(tokens(0).length()).trim() val proc: CommandProcessor = version match { case hive.v12 => classOf[CommandProcessorFactory] - .callStatic[String, HiveConf, CommandProcessor]("get", cmd_1, conf) + .callStatic[String, HiveConf, CommandProcessor]("get", tokens(0), conf) case hive.v13 => classOf[CommandProcessorFactory] .callStatic[Array[String], HiveConf, CommandProcessor]("get", Array(tokens(0)), conf) @@ -294,7 +335,7 @@ class ClientWrapper( res.toSeq case hive.v13 => val res = new JArrayList[Object] - driver.call[JArrayList[Object], Boolean]("getResults", res) + driver.call[JList[Object], Boolean]("getResults", res) res.map { r => r match { case s: String => s diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 710dbca6e3c66..7f94c93ba49c1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive.client import java.io.File -import java.net.URLClassLoader +import java.net.{URL, URLClassLoader} import java.util import scala.language.reflectiveCalls @@ -30,9 +30,10 @@ import org.apache.spark.Logging import org.apache.spark.deploy.SparkSubmitUtils import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.hive.HiveContext /** Factory for `IsolatedClientLoader` with specific versions of hive. */ -object IsolatedClientLoader { +private[hive] object IsolatedClientLoader { /** * Creates isolated Hive client loaders by downloading the requested version from maven. */ @@ -49,7 +50,7 @@ object IsolatedClientLoader { case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13 } - private def downloadVersion(version: HiveVersion): Seq[File] = { + private def downloadVersion(version: HiveVersion): Seq[URL] = { val hiveArtifacts = (Seq("hive-metastore", "hive-exec", "hive-common", "hive-serde") ++ (if (version.hasBuiltinsJar) "hive-builtins" :: Nil else Nil)) @@ -72,10 +73,10 @@ object IsolatedClientLoader { tempDir.mkdir() allFiles.foreach(f => FileUtils.copyFileToDirectory(f, tempDir)) - tempDir.listFiles() + tempDir.listFiles().map(_.toURL) } - private def resolvedVersions = new scala.collection.mutable.HashMap[HiveVersion, Seq[File]] + private def resolvedVersions = new scala.collection.mutable.HashMap[HiveVersion, Seq[URL]] } /** @@ -99,9 +100,9 @@ object IsolatedClientLoader { * @param baseClassLoader The spark classloader that is used to load shared classes. * */ -class IsolatedClientLoader( +private[hive] class IsolatedClientLoader( val version: HiveVersion, - val execJars: Seq[File] = Seq.empty, + val execJars: Seq[URL] = Seq.empty, val config: Map[String, String] = Map.empty, val isolationOn: Boolean = true, val rootClassLoader: ClassLoader = ClassLoader.getSystemClassLoader.getParent.getParent, @@ -112,7 +113,7 @@ class IsolatedClientLoader( assert(Try(baseClassLoader.loadClass("org.apache.hive.HiveConf")).isFailure) /** All jars used by the hive specific classloader. 
*/ - protected def allJars = execJars.map(_.toURI.toURL).toArray + protected def allJars = execJars.toArray protected def isSharedClass(name: String): Boolean = name.contains("slf4j") || @@ -166,6 +167,12 @@ class IsolatedClientLoader( .getConstructors.head .newInstance(version, config) .asInstanceOf[ClientInterface] + } catch { + case ReflectionException(cnf: NoClassDefFoundError) => + throw new ClassNotFoundException( + s"$cnf when creating Hive client using classpath: ${execJars.mkString(", ")}\n" + + "Please make sure that jars for your version of hive and hadoop are included in the " + + s"paths passed to ${HiveContext.HIVE_METASTORE_JARS}.") } finally { Thread.currentThread.setContextClassLoader(baseClassLoader) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala index 90d03049356b5..c600b158c5460 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ReflectionMagic.scala @@ -19,6 +19,14 @@ package org.apache.spark.sql.hive.client import scala.reflect._ +/** Unwraps reflection exceptions. */ +private[client] object ReflectionException { + def unapply(a: Throwable): Option[Throwable] = a match { + case ite: java.lang.reflect.InvocationTargetException => Option(ite.getCause) + case _ => None + } +} + /** * Provides implicit functions on any object for calling methods reflectively. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala index 76a1965f3cb25..91e6ac4032204 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala @@ -24,8 +24,8 @@ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.RunnableCommand -import org.apache.spark.sql.hive.HiveContext -import org.apache.spark.sql.hive.MetastoreRelation +import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn} +import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes} /** * Create table and insert the query result into it. 
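Tying the loader pieces together, a hedged sketch of constructing an isolated client directly, mirroring HiveContext's "maven" path above (usable only from within org.apache.spark.sql.hive now that these classes are private[hive]; the Derby URL mirrors newTemporaryConfiguration()):

    import org.apache.spark.sql.hive.client.{ClientInterface, IsolatedClientLoader}

    // Downloads the requested hive artifacts via maven and backs the client
    // with a throwaway local Derby metastore.
    val loader = IsolatedClientLoader.forVersion(
      "0.13.1",
      Map("javax.jdo.option.ConnectionURL" ->
        "jdbc:derby:;databaseName=/tmp/demo_metastore;create=true"))
    val client: ClientInterface = loader.client
    client.listTables("default").foreach(println)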
@@ -39,17 +39,34 @@ import org.apache.spark.sql.hive.MetastoreRelation */ private[hive] case class CreateTableAsSelect( - database: String, - tableName: String, + tableDesc: HiveTable, query: LogicalPlan, - allowExisting: Boolean, - desc: Option[CreateTableDesc]) extends RunnableCommand { + allowExisting: Boolean) + extends RunnableCommand { + + def database: String = tableDesc.database + def tableName: String = tableDesc.name override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] lazy val metastoreRelation: MetastoreRelation = { - // Create Hive Table - hiveContext.catalog.createTable(database, tableName, query.output, allowExisting, desc) + import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe + import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + import org.apache.hadoop.io.Text + import org.apache.hadoop.mapred.TextInputFormat + + val withSchema = + tableDesc.copy( + schema = + query.output.map(c => + HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)), + inputFormat = + tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), + outputFormat = + tableDesc.outputFormat + .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), + serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName()))) + hiveContext.catalog.client.createTable(withSchema) // Get the Metastore Relation hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 89995a91b1a92..de8954d5dec99 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -200,9 +200,7 @@ case class InsertIntoHiveTable( orderedPartitionSpec.put(entry.getName,partitionSpec.get(entry.getName).getOrElse("")) } val partVals = MetaStoreUtils.getPvals(table.hiveQlTable.getPartCols, partitionSpec) - catalog.synchronized { - catalog.client.validatePartitionNameCharacters(partVals) - } + // inheritTableSpecs is set to true. It should be set to false for a IMPORT query // which is currently considered as a Hive native command. 
val inheritTableSpecs = true @@ -211,7 +209,7 @@ case class InsertIntoHiveTable( if (numDynamicPartitions > 0) { catalog.synchronized { catalog.client.loadDynamicPartitions( - outputPath, + outputPath.toString, qualifiedTableName, orderedPartitionSpec, overwrite, @@ -224,31 +222,28 @@ case class InsertIntoHiveTable( // ifNotExists is only valid with static partition, refer to // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries // scalastyle:on - val oldPart = catalog.synchronized { - catalog.client.getPartition( - catalog.client.getTable(qualifiedTableName), partitionSpec, false) - } - if (oldPart == null || !ifNotExists) { - catalog.synchronized { + val oldPart = + catalog.client.getPartitionOption( + catalog.client.getTable(table.databaseName, table.tableName), + partitionSpec) + + if (oldPart.isEmpty || !ifNotExists) { catalog.client.loadPartition( - outputPath, + outputPath.toString, qualifiedTableName, orderedPartitionSpec, overwrite, holdDDLTime, inheritTableSpecs, isSkewedStoreAsSubdir) - } } } } else { - catalog.synchronized { - catalog.client.loadTable( - outputPath, - qualifiedTableName, - overwrite, - holdDDLTime) - } + catalog.client.loadTable( + outputPath.toString, // TODO: URI + qualifiedTableName, + overwrite, + holdDDLTime) } // Invalidate the cache. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index a40a1e53117cd..abab1a223a43a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils /** * Analyzes the given table in the current database to generate statistics, which will be @@ -84,8 +85,20 @@ case class AddJar(path: String) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] + val currentClassLoader = Utils.getContextOrSparkClassLoader + + // Add jar to current context + val jarURL = new java.io.File(path).toURL + val newClassLoader = new java.net.URLClassLoader(Array(jarURL), currentClassLoader) + Thread.currentThread.setContextClassLoader(newClassLoader) + org.apache.hadoop.hive.ql.metadata.Hive.get().getConf().setClassLoader(newClassLoader) + + // Add jar to isolated hive classloader hiveContext.runSqlHive(s"ADD JAR $path") + + // Add jar to executors hiveContext.sparkContext.addJar(path) + Seq(Row(0)) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index ca84b43a998b8..1f40a5340c2ce 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.test import java.io.File import java.util.{Set => JavaSet} +import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.exec.FunctionRegistry import org.apache.hadoop.hive.ql.io.avro.{AvroContainerInputFormat, AvroContainerOutputFormat} import org.apache.hadoop.hive.ql.metadata.Table @@ -62,6 +63,8 @@ object TestHive class 
TestHiveContext(sc: SparkContext) extends HiveContext(sc) { self => + import HiveContext._ + // By clearing the port we force Spark to pick a new one. This allows us to rerun tests // without restarting the JVM. System.clearProperty("spark.hostPort") @@ -70,24 +73,16 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { hiveconf.set("hive.plan.serialization.format", "javaXML") lazy val warehousePath = Utils.createTempDir() - lazy val metastorePath = Utils.createTempDir() /** Sets up the system initially or after a RESET command */ - protected def configure(): Unit = { - warehousePath.delete() - metastorePath.delete() - setConf("javax.jdo.option.ConnectionURL", - s"jdbc:derby:;databaseName=$metastorePath;create=true") - setConf("hive.metastore.warehouse.dir", warehousePath.toString) - } + protected override def configure(): Map[String, String] = + newTemporaryConfiguration() ++ Map("hive.metastore.warehouse.dir" -> warehousePath.toString) val testTempDir = Utils.createTempDir() // For some hive test case which contain ${system:test.tmp.dir} System.setProperty("test.tmp.dir", testTempDir.getCanonicalPath) - configure() // Must be called before initializing the catalog below. - /** The location of the compiled hive distribution */ lazy val hiveHome = envVarToFile("HIVE_HOME") /** The location of the hive source code. */ @@ -195,6 +190,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { * A list of test tables and the DDL required to initialize them. A test table is loaded on * demand when a query are run against it. */ + @transient lazy val testTables = new mutable.HashMap[String, TestTable]() def registerTestTable(testTable: TestTable): Unit = { @@ -204,6 +200,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { // The test tables that are defined in the Hive QTestUtil. // /itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java // https://github.com/apache/hive/blob/branch-0.13/data/scripts/q_test_init.sql + @transient val hiveQTestUtilTables = Seq( TestTable("src", "CREATE TABLE src (key INT, value STRING)".cmd, @@ -236,16 +233,18 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { import org.apache.hadoop.mapred.{SequenceFileInputFormat, SequenceFileOutputFormat} import org.apache.thrift.protocol.TBinaryProtocol - val srcThrift = new Table("default", "src_thrift") - srcThrift.setFields(Nil) - srcThrift.setInputFormatClass(classOf[SequenceFileInputFormat[_,_]].getName) - // In Hive, SequenceFileOutputFormat will be substituted by HiveSequenceFileOutputFormat. 
- srcThrift.setOutputFormatClass(classOf[SequenceFileOutputFormat[_,_]].getName) - srcThrift.setSerializationLib(classOf[ThriftDeserializer].getName) - srcThrift.setSerdeParam("serialization.class", classOf[Complex].getName) - srcThrift.setSerdeParam("serialization.format", classOf[TBinaryProtocol].getName) - catalog.client.createTable(srcThrift) - + runSqlHive( + s""" + |CREATE TABLE src_thrift(fake INT) + |ROW FORMAT SERDE '${classOf[ThriftDeserializer].getName}' + |WITH SERDEPROPERTIES( + | 'serialization.class'='${classOf[Complex].getName}', + | 'serialization.format'='${classOf[TBinaryProtocol].getName}' + |) + |STORED AS + |INPUTFORMAT '${classOf[SequenceFileInputFormat[_,_]].getName}' + |OUTPUTFORMAT '${classOf[SequenceFileOutputFormat[_,_]].getName}' + """.stripMargin) runSqlHive( s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/complex.seq")}' INTO TABLE src_thrift") @@ -367,7 +366,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { if (!(loadedTables contains name)) { // Marks the table as loaded first to prevent infinite mutually recursive table loading. loadedTables += name - logInfo(s"Loading test table $name") + logDebug(s"Loading test table $name") val createCmds = testTables.get(name).map(_.commands).getOrElse(sys.error(s"Unknown test table $name")) createCmds.foreach(_()) @@ -384,9 +383,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { */ protected val originalUdfs: JavaSet[String] = FunctionRegistry.getFunctionNames - // Database default may not exist in 0.13.1, create it if not exist - HiveShim.createDefaultDBIfNeeded(this) - /** * Resets the test instance by deleting any tables that have been created. * TODO: also clear out UDFs, views, etc. @@ -401,24 +397,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { cacheManager.clearCache() loadedTables.clear() catalog.cachedDataSourceTables.invalidateAll() - catalog.client.getAllTables("default").foreach { t => - logDebug(s"Deleting table $t") - val table = catalog.client.getTable("default", t) - - catalog.client.getIndexes("default", t, 255).foreach { index => - catalog.client.dropIndex("default", t, index.getIndexName, true) - } - - if (!table.isIndexTable) { - catalog.client.dropTable("default", t) - } - } - - catalog.client.getAllDatabases.filterNot(_ == "default").foreach { db => - logDebug(s"Dropping Database: $db") - catalog.client.dropDatabase(db, true, false, true) - } - + catalog.client.reset() catalog.unregisterAllTables() FunctionRegistry.getFunctionNames.filterNot(originalUdfs.contains(_)).foreach { udfName => @@ -429,7 +408,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { hiveconf.set("fs.default.name", new File(".").toURI.toString) // It is important that we RESET first as broken hooks that might have been set could break // other sql exec here. - runSqlHive("RESET") + executionHive.runSqlHive("RESET") + metadataHive.runSqlHive("RESET") // For some reason, RESET does not reset the following variables... // https://issues.apache.org/jira/browse/HIVE-9004 runSqlHive("set hive.table.parameters.default=") @@ -437,7 +417,11 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { runSqlHive("set datanucleus.cache.collections.lazy=true") // Lots of tests fail if we do not change the partition whitelist from the default. 
runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") - configure() + + configure().foreach { + case (k, v) => + metadataHive.runSqlHive(s"SET $k=$v") + } runSqlHive("USE default") diff --git a/sql/hive/src/test/resources/log4j.properties b/sql/hive/src/test/resources/log4j.properties index 5bc08062d30eb..92eaf1f2795ba 100644 --- a/sql/hive/src/test/resources/log4j.properties +++ b/sql/hive/src/test/resources/log4j.properties @@ -33,7 +33,7 @@ log4j.appender.FA.layout=org.apache.log4j.PatternLayout log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %t %p %c{1}: %m%n # Set the logger level of File Appender to WARN -log4j.appender.FA.Threshold = INFO +log4j.appender.FA.Threshold = DEBUG # Some packages are noisy for no good reason. log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala index d960a30e00738..30f5313d2b812 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala @@ -17,12 +17,11 @@ package org.apache.spark.sql.hive -import java.io.{OutputStream, PrintStream} - import scala.util.Try import org.scalatest.BeforeAndAfter +import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.{AnalysisException, QueryTest} @@ -109,25 +108,6 @@ class ErrorPositionSuite extends QueryTest with BeforeAndAfter { "SELECT 1 + array(1)", "1 + array") } - /** Hive can be very noisy, messing up the output of our tests. */ - private def quietly[A](f: => A): A = { - val origErr = System.err - val origOut = System.out - try { - System.setErr(new PrintStream(new OutputStream { - def write(b: Int) = {} - })) - System.setOut(new PrintStream(new OutputStream { - def write(b: Int) = {} - })) - - f - } finally { - System.setErr(origErr) - System.setOut(origOut) - } - } - /** * Creates a test that checks to see if the error thrown when analyzing a given query includes * the location of the given token in the query string. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 0538aa203c5a0..47c60f651d14c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -31,6 +31,7 @@ import org.apache.hadoop.mapred.InvalidInputException import org.apache.spark.sql._ import org.apache.spark.util.Utils import org.apache.spark.sql.types._ +import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.parquet.ParquetRelation2 @@ -686,16 +687,21 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { test("SPARK-6655 still support a schema stored in spark.sql.sources.schema") { val tableName = "spark6655" val schema = StructType(StructField("int", IntegerType, true) :: Nil) - // Manually create the metadata in metastore. 
- val tbl = new Table("default", tableName) - tbl.setProperty("spark.sql.sources.provider", "json") - tbl.setProperty("spark.sql.sources.schema", schema.json) - tbl.setProperty("EXTERNAL", "FALSE") - tbl.setTableType(TableType.MANAGED_TABLE) - tbl.setSerdeParam("path", catalog.hiveDefaultTableFilePath(tableName)) - catalog.synchronized { - catalog.client.createTable(tbl) - } + + val hiveTable = HiveTable( + specifiedDatabase = Some("default"), + name = tableName, + schema = Seq.empty, + partitionColumns = Seq.empty, + properties = Map( + "spark.sql.sources.provider" -> "json", + "spark.sql.sources.schema" -> schema.json, + "EXTERNAL" -> "FALSE"), + tableType = ManagedTable, + serdeProperties = Map( + "path" -> catalog.hiveDefaultTableFilePath(tableName))) + + catalog.client.createTable(hiveTable) invalidateTable(tableName) val actualSchema = table(tableName).schema diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala index d6ddd539d159d..8afe5459d4f1b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala @@ -26,8 +26,10 @@ import org.apache.spark.sql.hive.test.TestHive class SerializationSuite extends FunSuite { test("[SPARK-5840] HiveContext should be serializable") { - val hiveContext = new HiveContext(TestHive.sparkContext) + val hiveContext = TestHive hiveContext.hiveconf - new JavaSerializer(new SparkConf()).newInstance().serialize(hiveContext) + val serializer = new JavaSerializer(new SparkConf()).newInstance() + val bytes = serializer.serialize(hiveContext) + val deSer = serializer.deserialize[AnyRef](bytes) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 81e77ba257bf1..321dc8d7322b8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -22,9 +22,13 @@ import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.util.Utils import org.scalatest.FunSuite +/** + * A simple set of tests that call the methods of a hive ClientInterface, loading different version + * of hive from maven central. These tests are simple in that they are mostly just testing to make + * sure that reflective calls are not throwing NoSuchMethod error, but the actually functionallity + * is not fully tested. + */ class VersionsSuite extends FunSuite with Logging { - val testType = "derby" - private def buildConf() = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() @@ -50,6 +54,14 @@ class VersionsSuite extends FunSuite with Logging { causes } + private val emptyDir = Utils.createTempDir().getCanonicalPath + + private def partSpec = { + val hashMap = new java.util.LinkedHashMap[String, String] + hashMap.put("key", "1") + hashMap + } + // Its actually pretty easy to mess things up and have all of your tests "pass" by accidentally // connecting to an auto-populated, in-process metastore. Let's make sure we are getting the // versions right by forcing a known compatibility failure. 
@@ -66,10 +78,9 @@ class VersionsSuite extends FunSuite with Logging { private var client: ClientInterface = null versions.foreach { version => - test(s"$version: listTables") { + test(s"$version: create client") { client = null client = IsolatedClientLoader.forVersion(version, buildConf()).client - client.listTables("default") } test(s"$version: createDatabase") { @@ -101,5 +112,64 @@ class VersionsSuite extends FunSuite with Logging { test(s"$version: getTable") { client.getTable("default", "src") } + + test(s"$version: listTables") { + assert(client.listTables("default") === Seq("src")) + } + + test(s"$version: currentDatabase") { + assert(client.currentDatabase === "default") + } + + test(s"$version: getDatabase") { + client.getDatabase("default") + } + + test(s"$version: alterTable") { + client.alterTable(client.getTable("default", "src")) + } + + test(s"$version: set command") { + client.runSqlHive("SET spark.sql.test.key=1") + } + + test(s"$version: create partitioned table DDL") { + client.runSqlHive("CREATE TABLE src_part (value INT) PARTITIONED BY (key INT)") + client.runSqlHive("ALTER TABLE src_part ADD PARTITION (key = '1')") + } + + test(s"$version: getPartitions") { + client.getAllPartitions(client.getTable("default", "src_part")) + } + + test(s"$version: loadPartition") { + client.loadPartition( + emptyDir, + "default.src_part", + partSpec, + false, + false, + false, + false) + } + + test(s"$version: loadTable") { + client.loadTable( + emptyDir, + "src", + false, + false) + } + + test(s"$version: loadDynamicPartitions") { + client.loadDynamicPartitions( + emptyDir, + "default.src_part", + partSpec, + false, + 1, + false, + false) + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index a3eacbd4e3981..9c056e493bfde 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -300,6 +300,8 @@ abstract class HiveComparisonTest val hiveQueries = queryList.map(new TestHive.QueryExecution(_)) // Make sure we can at least parse everything before attempting hive execution. + // Note this must only look at the logical plan as we might not be able to analyze if + // other DDL has not been executed yet. 
hiveQueries.foreach(_.logical)
 val computedResults = (queryList.zipWithIndex, hiveQueries, hiveCacheFiles).zipped.map {
 case ((queryString, i), hiveQuery, cachedAnswerFile)=>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index ac10b173307d8..7d728fe87bda7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -900,7 +900,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
 |DROP TABLE IF EXISTS dynamic_part_table;
 """.stripMargin)
- test("Dynamic partition folder layout") {
+ ignore("Dynamic partition folder layout") {
 sql("DROP TABLE IF EXISTS dynamic_part_table")
 sql("CREATE TABLE dynamic_part_table(intcol INT) PARTITIONED BY (partcol1 INT, partcol2 INT)")
 sql("SET hive.exec.dynamic.partition.mode=nonstrict")
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
index 45f10e2fe64aa..de6a41ce5bfcb 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala
@@ -150,20 +150,21 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter {
 val (actualScannedColumns, actualPartValues) = plan.collect {
 case p @ HiveTableScan(columns, relation, _) =>
 val columnNames = columns.map(_.name)
- val partValues = p.prunePartitions(relation.hiveQlPartitions).map(_.getValues)
+ val partValues = if (relation.table.isPartitioned) {
+ p.prunePartitions(relation.hiveQlPartitions).map(_.getValues)
+ } else {
+ Seq.empty
+ }
 (columnNames, partValues)
 }.head
 assert(actualOutputColumns === expectedOutputColumns, "Output columns mismatch")
 assert(actualScannedColumns === expectedScannedColumns, "Scanned columns mismatch")
- assert(
- actualPartValues.length === expectedPartValues.length,
- "Partition value count mismatches")
+ val actualPartitions = actualPartValues.map(_.toSeq.mkString(",")).sorted
+ val expectedPartitions = expectedPartValues.map(_.mkString(",")).sorted
- for ((actual, expected) <- actualPartValues.zip(expectedPartValues)) {
- assert(actual sameElements expected, "Partition values mismatch")
- }
+ assert(actualPartitions === expectedPartitions, "Partitions selected do not match")
 }
 // Creates a query test to compare query results generated by Hive and Catalyst.
From 92f8f803a68e0c16771e9793098c6d76dfdf99af Mon Sep 17 00:00:00 2001
From: Shuo Xiang
Date: Thu, 7 May 2015 20:55:08 -0700
Subject: [PATCH 39/48] [SPARK-7452] [MLLIB] fix bug in topBykey and update test

The toArray function of BoundedPriorityQueue does not necessarily preserve order, so topByKey must sort explicitly rather than merely reverse. This adds a counter-example test that fails under the original implementation; see the sketch below.
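For completeness, a minimal editorial sketch of the underlying issue (illustrative only: it uses Scala's standard PriorityQueue rather than Spark's BoundedPriorityQueue, and the exact heap layout shown is hypothetical, since toArray's order is unspecified):

```
import scala.collection.mutable.PriorityQueue

// A min-heap keeping the "top" elements: dequeue order is well defined,
// but the backing array layout returned by toArray is not.
val heap = PriorityQueue(7, 3, 6, 1, 2)(Ordering.Int.reverse)
heap.toArray                              // heap layout, e.g. Array(1, 2, 6, 7, 3) -- not sorted
heap.toArray.reverse                      // therefore not sorted either
heap.toArray.sorted(Ordering.Int.reverse) // Array(7, 6, 3, 2, 1) -- sort explicitly
```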
Author: Shuo Xiang

Closes #5990 from coderxiang/topbykey-test and squashes the following commits:

98804c9 [Shuo Xiang] fix bug in topBykey and update test
---
 .../org/apache/spark/mllib/rdd/MLPairRDDFunctions.scala | 2 +-
 .../apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctions.scala
index 5af55aaf84802..1b93e2d764c69 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctions.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctions.scala
@@ -46,7 +46,7 @@ class MLPairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)]) extends Se
 combOp = (queue1, queue2) => {
 queue1 ++= queue2
 }
- ).mapValues(_.toArray.reverse) // This is an min-heap, so we reverse the order.
+ ).mapValues(_.toArray.sorted(ord.reverse)) // This is a min-heap, so sort by the reversed ordering.
 }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala
index cb8fe4dba96f5..57216e8eb4a55 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/MLPairRDDFunctionsSuite.scala
@@ -24,13 +24,14 @@ import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
 class MLPairRDDFunctionsSuite extends FunSuite with MLlibTestSparkContext {
 test("topByKey") {
- val topMap = sc.parallelize(Array((1, 1), (1, 2), (3, 2), (3, 7), (5, 1), (3, 5)), 2)
- .topByKey(2)
+ val topMap = sc.parallelize(Array((1, 7), (1, 3), (1, 6), (1, 1), (1, 2), (3, 2), (3, 7), (5,
+ 1), (3, 5)), 2)
+ .topByKey(5)
 .collectAsMap()
 assert(topMap.size === 3)
- assert(topMap(1) === Array(2, 1))
- assert(topMap(3) === Array(7, 5))
+ assert(topMap(1) === Array(7, 6, 3, 2, 1))
+ assert(topMap(3) === Array(7, 5, 2))
 assert(topMap(5) === Array(1))
 }
 }

From 3af423c92f117b5dd4dc6832dc50911cedb29abc Mon Sep 17 00:00:00 2001
From: Yin Huai
Date: Thu, 7 May 2015 20:59:42 -0700
Subject: [PATCH 40/48] [SPARK-6986] [SQL] Use Serializer2 in more cases.

With https://github.com/apache/spark/commit/0a2b15ce43cf6096e1a7ae060b7c8a4010ce3b92, the serialization stream and deserialization stream have enough information to determine whether they are handling a key-value pair, a key, or a value. It is safe to use `SparkSqlSerializer2` in more cases.

Author: Yin Huai

Closes #5849 from yhuai/serializer2MoreCases and squashes the following commits:

53a5eaa [Yin Huai] Josh's comments.
487f540 [Yin Huai] Use BufferedOutputStream.
8385f95 [Yin Huai] Always create a new row at the deserialization side to work with sort merge join.
c7e2129 [Yin Huai] Update tests.
4513d13 [Yin Huai] Use Serializer2 in more places.
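A note on the "Always create a new row at the deserialization side" commit above: when the shuffle specifies a key ordering, downstream operators such as sort merge join buffer deserialized rows, so handing out a single reused mutable row would silently corrupt the buffered entries. A minimal editorial sketch of that aliasing hazard (hypothetical stand-in code, not Spark's API):

```
// A single reused buffer stands in for a reused mutable row.
val reused = new Array[Any](1)
def readNext(v: Int): Array[Any] = { reused(0) = v; reused }

// A buffering consumer keeps references, not copies, so every buffered
// entry ends up aliasing the same object.
val buffered = Seq(1, 2, 3).map(readNext)
assert(buffered.forall(_(0) == 3)) // all entries read back the last value written
```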
--- .../apache/spark/sql/execution/Exchange.scala | 23 ++---- .../sql/execution/SparkSqlSerializer2.scala | 74 ++++++++++++------- .../execution/SparkSqlSerializer2Suite.scala | 30 ++++---- 3 files changed, 69 insertions(+), 58 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala index 5b2e46962cd3b..f0d54cd6cd94f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala @@ -84,18 +84,8 @@ case class Exchange( def serializer( keySchema: Array[DataType], valueSchema: Array[DataType], + hasKeyOrdering: Boolean, numPartitions: Int): Serializer = { - // In ExternalSorter's spillToMergeableFile function, key-value pairs are written out - // through write(key) and then write(value) instead of write((key, value)). Because - // SparkSqlSerializer2 assumes that objects passed in are Product2, we cannot safely use - // it when spillToMergeableFile in ExternalSorter will be used. - // So, we will not use SparkSqlSerializer2 when - // - Sort-based shuffle is enabled and the number of reducers (numPartitions) is greater - // then the bypassMergeThreshold; or - // - newOrdering is defined. - val cannotUseSqlSerializer2 = - (sortBasedShuffleOn && numPartitions > bypassMergeThreshold) || newOrdering.nonEmpty - // It is true when there is no field that needs to be write out. // For now, we will not use SparkSqlSerializer2 when noField is true. val noField = @@ -104,14 +94,13 @@ case class Exchange( val useSqlSerializer2 = child.sqlContext.conf.useSqlSerializer2 && // SparkSqlSerializer2 is enabled. - !cannotUseSqlSerializer2 && // Safe to use Serializer2. SparkSqlSerializer2.support(keySchema) && // The schema of key is supported. SparkSqlSerializer2.support(valueSchema) && // The schema of value is supported. 
!noField val serializer = if (useSqlSerializer2) { logInfo("Using SparkSqlSerializer2.") - new SparkSqlSerializer2(keySchema, valueSchema) + new SparkSqlSerializer2(keySchema, valueSchema, hasKeyOrdering) } else { logInfo("Using SparkSqlSerializer.") new SparkSqlSerializer(sparkConf) @@ -154,7 +143,8 @@ case class Exchange( } val keySchema = expressions.map(_.dataType).toArray val valueSchema = child.output.map(_.dataType).toArray - shuffled.setSerializer(serializer(keySchema, valueSchema, numPartitions)) + shuffled.setSerializer( + serializer(keySchema, valueSchema, newOrdering.nonEmpty, numPartitions)) shuffled.map(_._2) @@ -179,7 +169,8 @@ case class Exchange( new ShuffledRDD[Row, Null, Null](rdd, part) } val keySchema = child.output.map(_.dataType).toArray - shuffled.setSerializer(serializer(keySchema, null, numPartitions)) + shuffled.setSerializer( + serializer(keySchema, null, newOrdering.nonEmpty, numPartitions)) shuffled.map(_._1) @@ -199,7 +190,7 @@ case class Exchange( val partitioner = new HashPartitioner(1) val shuffled = new ShuffledRDD[Null, Row, Row](rdd, partitioner) val valueSchema = child.output.map(_.dataType).toArray - shuffled.setSerializer(serializer(null, valueSchema, 1)) + shuffled.setSerializer(serializer(null, valueSchema, false, 1)) shuffled.map(_._2) case _ => sys.error(s"Exchange not implemented for $newPartitioning") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala index 35ad987eb1a63..256d527d7b636 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlSerializer2.scala @@ -27,7 +27,7 @@ import scala.reflect.ClassTag import org.apache.spark.serializer._ import org.apache.spark.Logging import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, MutableRow, GenericMutableRow} import org.apache.spark.sql.types._ /** @@ -49,9 +49,9 @@ private[sql] class Serializer2SerializationStream( out: OutputStream) extends SerializationStream with Logging { - val rowOut = new DataOutputStream(out) - val writeKeyFunc = SparkSqlSerializer2.createSerializationFunction(keySchema, rowOut) - val writeValueFunc = SparkSqlSerializer2.createSerializationFunction(valueSchema, rowOut) + private val rowOut = new DataOutputStream(new BufferedOutputStream(out)) + private val writeKeyFunc = SparkSqlSerializer2.createSerializationFunction(keySchema, rowOut) + private val writeValueFunc = SparkSqlSerializer2.createSerializationFunction(valueSchema, rowOut) override def writeObject[T: ClassTag](t: T): SerializationStream = { val kv = t.asInstanceOf[Product2[Row, Row]] @@ -86,31 +86,44 @@ private[sql] class Serializer2SerializationStream( private[sql] class Serializer2DeserializationStream( keySchema: Array[DataType], valueSchema: Array[DataType], + hasKeyOrdering: Boolean, in: InputStream) extends DeserializationStream with Logging { - val rowIn = new DataInputStream(new BufferedInputStream(in)) + private val rowIn = new DataInputStream(new BufferedInputStream(in)) + + private def rowGenerator(schema: Array[DataType]): () => (MutableRow) = { + if (schema == null) { + () => null + } else { + if (hasKeyOrdering) { + // We have key ordering specified in a ShuffledRDD, it is not safe to reuse a mutable row. 
+ () => new GenericMutableRow(schema.length) + } else { + // It is safe to reuse the mutable row. + val mutableRow = new SpecificMutableRow(schema) + () => mutableRow + } + } + } - val key = if (keySchema != null) new SpecificMutableRow(keySchema) else null - val value = if (valueSchema != null) new SpecificMutableRow(valueSchema) else null - val readKeyFunc = SparkSqlSerializer2.createDeserializationFunction(keySchema, rowIn, key) - val readValueFunc = SparkSqlSerializer2.createDeserializationFunction(valueSchema, rowIn, value) + // Functions used to return rows for key and value. + private val getKey = rowGenerator(keySchema) + private val getValue = rowGenerator(valueSchema) + // Functions used to read a serialized row from the InputStream and deserialize it. + private val readKeyFunc = SparkSqlSerializer2.createDeserializationFunction(keySchema, rowIn) + private val readValueFunc = SparkSqlSerializer2.createDeserializationFunction(valueSchema, rowIn) override def readObject[T: ClassTag](): T = { - readKeyFunc() - readValueFunc() - - (key, value).asInstanceOf[T] + (readKeyFunc(getKey()), readValueFunc(getValue())).asInstanceOf[T] } override def readKey[T: ClassTag](): T = { - readKeyFunc() - key.asInstanceOf[T] + readKeyFunc(getKey()).asInstanceOf[T] } override def readValue[T: ClassTag](): T = { - readValueFunc() - value.asInstanceOf[T] + readValueFunc(getValue()).asInstanceOf[T] } override def close(): Unit = { @@ -118,9 +131,10 @@ private[sql] class Serializer2DeserializationStream( } } -private[sql] class ShuffleSerializerInstance( +private[sql] class SparkSqlSerializer2Instance( keySchema: Array[DataType], - valueSchema: Array[DataType]) + valueSchema: Array[DataType], + hasKeyOrdering: Boolean) extends SerializerInstance { def serialize[T: ClassTag](t: T): ByteBuffer = @@ -137,7 +151,7 @@ private[sql] class ShuffleSerializerInstance( } def deserializeStream(s: InputStream): DeserializationStream = { - new Serializer2DeserializationStream(keySchema, valueSchema, s) + new Serializer2DeserializationStream(keySchema, valueSchema, hasKeyOrdering, s) } } @@ -148,12 +162,16 @@ private[sql] class ShuffleSerializerInstance( * The schema of keys is represented by `keySchema` and that of values is represented by * `valueSchema`. */ -private[sql] class SparkSqlSerializer2(keySchema: Array[DataType], valueSchema: Array[DataType]) +private[sql] class SparkSqlSerializer2( + keySchema: Array[DataType], + valueSchema: Array[DataType], + hasKeyOrdering: Boolean) extends Serializer with Logging with Serializable{ - def newInstance(): SerializerInstance = new ShuffleSerializerInstance(keySchema, valueSchema) + def newInstance(): SerializerInstance = + new SparkSqlSerializer2Instance(keySchema, valueSchema, hasKeyOrdering) override def supportsRelocationOfSerializedObjects: Boolean = { // SparkSqlSerializer2 is stateless and writes no stream headers @@ -323,11 +341,11 @@ private[sql] object SparkSqlSerializer2 { */ def createDeserializationFunction( schema: Array[DataType], - in: DataInputStream, - mutableRow: SpecificMutableRow): () => Unit = { - () => { - // If the schema is null, the returned function does nothing when it get called. 
- if (schema != null) { + in: DataInputStream): (MutableRow) => Row = { + if (schema == null) { + (mutableRow: MutableRow) => null + } else { + (mutableRow: MutableRow) => { var i = 0 while (i < schema.length) { schema(i) match { @@ -440,6 +458,8 @@ private[sql] object SparkSqlSerializer2 { } i += 1 } + + mutableRow } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala index 27f063d73a9a9..15337c4045436 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlSerializer2Suite.scala @@ -148,6 +148,15 @@ abstract class SparkSqlSerializer2Suite extends QueryTest with BeforeAndAfterAll table("shuffle").collect()) } + test("key schema is null") { + val aggregations = allColumns.split(",").map(c => s"COUNT($c)").mkString(",") + val df = sql(s"SELECT $aggregations FROM shuffle") + checkSerializer(df.queryExecution.executedPlan, serializerClass) + checkAnswer( + df, + Row(1000, 1000, 0, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000)) + } + test("value schema is null") { val df = sql(s"SELECT col0 FROM shuffle ORDER BY col0") checkSerializer(df.queryExecution.executedPlan, serializerClass) @@ -167,29 +176,20 @@ class SparkSqlSerializer2SortShuffleSuite extends SparkSqlSerializer2Suite { override def beforeAll(): Unit = { super.beforeAll() // Sort merge will not be triggered. - sql("set spark.sql.shuffle.partitions = 200") - } - - test("key schema is null") { - val aggregations = allColumns.split(",").map(c => s"COUNT($c)").mkString(",") - val df = sql(s"SELECT $aggregations FROM shuffle") - checkSerializer(df.queryExecution.executedPlan, serializerClass) - checkAnswer( - df, - Row(1000, 1000, 0, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000)) + val bypassMergeThreshold = + sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) + sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold-1}") } } /** For now, we will use SparkSqlSerializer for sort based shuffle with sort merge. */ class SparkSqlSerializer2SortMergeShuffleSuite extends SparkSqlSerializer2Suite { - // We are expecting SparkSqlSerializer. - override val serializerClass: Class[Serializer] = - classOf[SparkSqlSerializer].asInstanceOf[Class[Serializer]] - override def beforeAll(): Unit = { super.beforeAll() // To trigger the sort merge. - sql("set spark.sql.shuffle.partitions = 201") + val bypassMergeThreshold = + sparkContext.conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) + sql(s"set spark.sql.shuffle.partitions=${bypassMergeThreshold + 1}") } } From 714db2ef52c0fe34418e252e5a6f220337022046 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Thu, 7 May 2015 22:32:13 -0700 Subject: [PATCH 41/48] [SPARK-7470] [SQL] Spark shell SQLContext crashes without hive This only happens if you have `SPARK_PREPEND_CLASSES` set. Then I built it with `build/sbt clean assembly compile` and just ran it with `bin/spark-shell`. ``` ... 15/05/07 17:07:30 INFO EventLoggingListener: Logging events to file:/tmp/spark-events/local-1431043649919 15/05/07 17:07:30 INFO SparkILoop: Created spark context.. Spark context available as sc. 
java.lang.NoClassDefFoundError: org/apache/hadoop/hive/conf/HiveConf
at java.lang.Class.getDeclaredConstructors0(Native Method)
at java.lang.Class.privateGetDeclaredConstructors(Class.java:2493)
at java.lang.Class.getConstructor0(Class.java:2803)
...
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.conf.HiveConf
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
... 52 more
:10: error: not found: value sqlContext
import sqlContext.implicits._
^
:10: error: not found: value sqlContext
import sqlContext.sql
^
```

yhuai marmbrus

Author: Andrew Or

Closes #5997 from andrewor14/sql-shell-crash and squashes the following commits:

61147e6 [Andrew Or] Also expect NoClassDefFoundError
---
 .../src/main/scala/org/apache/spark/repl/SparkILoop.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 8dc0e0c965923..488f3a9f33256 100644
--- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -1028,7 +1028,7 @@ class SparkILoop(
 logInfo("Created sql context (with Hive support)..")
 }
 catch {
- case cnf: java.lang.ClassNotFoundException =>
+ case _: java.lang.ClassNotFoundException | _: java.lang.NoClassDefFoundError =>
 sqlContext = new SQLContext(sparkContext)
 logInfo("Created sql context..")
 }

From f496bf3c539a873ffdf3aa803847ef7b50135bd7 Mon Sep 17 00:00:00 2001
From: wangfei
Date: Thu, 7 May 2015 22:55:42 -0700
Subject: [PATCH 42/48] [SPARK-7232] [SQL] Add a Substitution batch for spark sql analyzer

Added a new batch named `Substitution` before the `Resolution` batch. The motivation is that there are cases where we want to perform substitutions on the parsed logical plan before resolving it. Consider these two cases:

1 CTE: for a CTE we first build a raw logical plan
```
'With Map(q1 -> 'Subquery q1
  'Project ['key]
   'UnresolvedRelation [src], None)
 'Project [*]
  'Filter ('key = 5)
   'UnresolvedRelation [q1], None
```
In the `With` logical plan there is a map storing (`q1 -> subquery`); we first want to strip off the With command and substitute the `q1` in `UnresolvedRelation` with the subquery.

2 Another example is window functions: in a window function a user may define named windows, and we likewise need to substitute each window name in the child plan with its concrete window definition. This should also be done in the Substitution batch.
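For reference, the first plan above corresponds to a query like the following (an illustrative sketch; `sqlContext` is assumed to be an available SQLContext):

```
// Builds the 'With plan sketched above: q1 is recorded in the CTE map, and the
// outer 'UnresolvedRelation [q1] must be substituted before resolution.
sqlContext.sql("WITH q1 AS (SELECT key FROM src) SELECT * FROM q1 WHERE key = 5")
```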
Author: wangfei Closes #5776 from scwf/addbatch and squashes the following commits: d4b962f [wangfei] added WindowsSubstitution 70f6932 [wangfei] Merge branch 'master' of https://github.com/apache/spark into addbatch ecaeafb [wangfei] address yhuai's comments 553005a [wangfei] fix test case 0c54798 [wangfei] address comments 29aaaaf [wangfei] fix compile 1c9a092 [wangfei] added Substitution bastch --- .../sql/catalyst/analysis/Analyzer.scala | 98 ++++++++++++------- 1 file changed, 60 insertions(+), 38 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 7e46ad851cdd3..bb7913e186a85 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -55,6 +55,10 @@ class Analyzer( val extendedResolutionRules: Seq[Rule[LogicalPlan]] = Nil lazy val batches: Seq[Batch] = Seq( + Batch("Substitution", fixedPoint, + CTESubstitution :: + WindowsSubstitution :: + Nil : _*), Batch("Resolution", fixedPoint, ResolveRelations :: ResolveReferences :: @@ -71,6 +75,55 @@ class Analyzer( extendedResolutionRules : _*) ) + /** + * Substitute child plan with cte definitions + */ + object CTESubstitution extends Rule[LogicalPlan] { + // TODO allow subquery to define CTE + def apply(plan: LogicalPlan): LogicalPlan = plan match { + case With(child, relations) => substituteCTE(child, relations) + case other => other + } + + def substituteCTE(plan: LogicalPlan, cteRelations: Map[String, LogicalPlan]): LogicalPlan = { + plan transform { + // In hive, if there is same table name in database and CTE definition, + // hive will use the table in database, not the CTE one. + // Taking into account the reasonableness and the implementation complexity, + // here use the CTE definition first, check table name only and ignore database name + // see https://github.com/apache/spark/pull/4929#discussion_r27186638 for more info + case u : UnresolvedRelation => + val substituted = cteRelations.get(u.tableIdentifier.last).map { relation => + val withAlias = u.alias.map(Subquery(_, relation)) + withAlias.getOrElse(relation) + } + substituted.getOrElse(u) + } + } + } + + /** + * Substitute child plan with WindowSpecDefinitions. + */ + object WindowsSubstitution extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + // Lookup WindowSpecDefinitions. This rule works with unresolved children. + case WithWindowDefinition(windowDefinitions, child) => + child.transform { + case plan => plan.transformExpressions { + case UnresolvedWindowExpression(c, WindowSpecReference(windowName)) => + val errorMessage = + s"Window specification $windowName is not defined in the WINDOW clause." + val windowSpecDefinition = + windowDefinitions + .get(windowName) + .getOrElse(failAnalysis(errorMessage)) + WindowExpression(c, windowSpecDefinition) + } + } + } + } + /** * Removes no-op Alias expressions from the plan. */ @@ -172,36 +225,20 @@ class Analyzer( * Replaces [[UnresolvedRelation]]s with concrete relations from the catalog. 
*/ object ResolveRelations extends Rule[LogicalPlan] { - def getTable(u: UnresolvedRelation, cteRelations: Map[String, LogicalPlan]): LogicalPlan = { + def getTable(u: UnresolvedRelation): LogicalPlan = { try { - // In hive, if there is same table name in database and CTE definition, - // hive will use the table in database, not the CTE one. - // Taking into account the reasonableness and the implementation complexity, - // here use the CTE definition first, check table name only and ignore database name - cteRelations.get(u.tableIdentifier.last) - .map(relation => u.alias.map(Subquery(_, relation)).getOrElse(relation)) - .getOrElse(catalog.lookupRelation(u.tableIdentifier, u.alias)) + catalog.lookupRelation(u.tableIdentifier, u.alias) } catch { case _: NoSuchTableException => u.failAnalysis(s"no such table ${u.tableName}") } } - def apply(plan: LogicalPlan): LogicalPlan = { - val (realPlan, cteRelations) = plan match { - // TODO allow subquery to define CTE - // Add cte table to a temp relation map,drop `with` plan and keep its child - case With(child, relations) => (child, relations) - case other => (other, Map.empty[String, LogicalPlan]) - } - - realPlan transform { - case i@InsertIntoTable(u: UnresolvedRelation, _, _, _, _) => - i.copy( - table = EliminateSubQueries(getTable(u, cteRelations))) - case u: UnresolvedRelation => - getTable(u, cteRelations) - } + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case i@InsertIntoTable(u: UnresolvedRelation, _, _, _, _) => + i.copy(table = EliminateSubQueries(getTable(u))) + case u: UnresolvedRelation => + getTable(u) } } @@ -664,21 +701,6 @@ class Analyzer( // We have to use transformDown at here to make sure the rule of // "Aggregate with Having clause" will be triggered. def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { - // Lookup WindowSpecDefinitions. This rule works with unresolved children. - case WithWindowDefinition(windowDefinitions, child) => - child.transform { - case plan => plan.transformExpressions { - case UnresolvedWindowExpression(c, WindowSpecReference(windowName)) => - val errorMessage = - s"Window specification $windowName is not defined in the WINDOW clause." - val windowSpecDefinition = - windowDefinitions - .get(windowName) - .getOrElse(failAnalysis(errorMessage)) - WindowExpression(c, windowSpecDefinition) - } - } - // Aggregate with Having clause. This rule works with an unresolved Aggregate because // a resolved Aggregate will not have Window Functions. 
case f @ Filter(condition, a @ Aggregate(groupingExprs, aggregateExprs, child)) From c2f0821aad3b82dcd327e914c9b297e92526649d Mon Sep 17 00:00:00 2001 From: "Zhang, Liye" Date: Fri, 8 May 2015 09:10:58 +0100 Subject: [PATCH 43/48] [SPARK-7392] [CORE] bugfix: Kryo buffer size cannot be larger than 2M Author: Zhang, Liye Closes #5934 from liyezhang556520/kryoBufSize and squashes the following commits: 5707e04 [Zhang, Liye] fix import order 8693288 [Zhang, Liye] replace multiplier with ByteUnit methods 9bf93e9 [Zhang, Liye] add tests d91e5ed [Zhang, Liye] change kb to mb --- .../spark/serializer/KryoSerializer.scala | 11 +++---- .../serializer/KryoSerializerSuite.scala | 30 +++++++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index f9f78852f032b..64ba27f34d2f1 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -32,6 +32,7 @@ import org.apache.spark._ import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.broadcast.HttpBroadcast import org.apache.spark.network.nio.{GetBlock, GotBlock, PutBlock} +import org.apache.spark.network.util.ByteUnit import org.apache.spark.scheduler.{CompressedMapStatus, HighlyCompressedMapStatus} import org.apache.spark.storage._ import org.apache.spark.util.BoundedPriorityQueue @@ -51,18 +52,18 @@ class KryoSerializer(conf: SparkConf) private val bufferSizeKb = conf.getSizeAsKb("spark.kryoserializer.buffer", "64k") - if (bufferSizeKb >= 2048) { + if (bufferSizeKb >= ByteUnit.GiB.toKiB(2)) { throw new IllegalArgumentException("spark.kryoserializer.buffer must be less than " + - s"2048 mb, got: + $bufferSizeKb mb.") + s"2048 mb, got: + ${ByteUnit.KiB.toMiB(bufferSizeKb)} mb.") } - private val bufferSize = (bufferSizeKb * 1024).toInt + private val bufferSize = ByteUnit.KiB.toBytes(bufferSizeKb).toInt val maxBufferSizeMb = conf.getSizeAsMb("spark.kryoserializer.buffer.max", "64m").toInt - if (maxBufferSizeMb >= 2048) { + if (maxBufferSizeMb >= ByteUnit.GiB.toMiB(2)) { throw new IllegalArgumentException("spark.kryoserializer.buffer.max must be less than " + s"2048 mb, got: + $maxBufferSizeMb mb.") } - private val maxBufferSize = maxBufferSizeMb * 1024 * 1024 + private val maxBufferSize = ByteUnit.MiB.toBytes(maxBufferSizeMb).toInt private val referenceTracking = conf.getBoolean("spark.kryo.referenceTracking", true) private val registrationRequired = conf.getBoolean("spark.kryo.registrationRequired", false) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 778a7eee73b23..c7369de24b81f 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -32,6 +32,36 @@ class KryoSerializerSuite extends FunSuite with SharedSparkContext { conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryo.registrator", classOf[MyRegistrator].getName) + test("configuration limits") { + val conf1 = conf.clone() + val kryoBufferProperty = "spark.kryoserializer.buffer" + val kryoBufferMaxProperty = "spark.kryoserializer.buffer.max" + conf1.set(kryoBufferProperty, "64k") + conf1.set(kryoBufferMaxProperty, "64m") + new 
KryoSerializer(conf1).newInstance()
+ // 2048m = 2097152k
+ conf1.set(kryoBufferProperty, "2097151k")
+ conf1.set(kryoBufferMaxProperty, "64m")
+ // should not throw exception when kryoBufferMaxProperty < kryoBufferProperty
+ new KryoSerializer(conf1).newInstance()
+ conf1.set(kryoBufferMaxProperty, "2097151k")
+ new KryoSerializer(conf1).newInstance()
+ val conf2 = conf.clone()
+ conf2.set(kryoBufferProperty, "2048m")
+ val thrown1 = intercept[IllegalArgumentException](new KryoSerializer(conf2).newInstance())
+ assert(thrown1.getMessage.contains(kryoBufferProperty))
+ val conf3 = conf.clone()
+ conf3.set(kryoBufferMaxProperty, "2048m")
+ val thrown2 = intercept[IllegalArgumentException](new KryoSerializer(conf3).newInstance())
+ assert(thrown2.getMessage.contains(kryoBufferMaxProperty))
+ val conf4 = conf.clone()
+ conf4.set(kryoBufferProperty, "2g")
+ conf4.set(kryoBufferMaxProperty, "3g")
+ val thrown3 = intercept[IllegalArgumentException](new KryoSerializer(conf4).newInstance())
+ assert(thrown3.getMessage.contains(kryoBufferProperty))
+ assert(!thrown3.getMessage.contains(kryoBufferMaxProperty))
+ }
+
 test("basic types") {
 val ser = new KryoSerializer(conf).newInstance()
 def check[T: ClassTag](t: T) {

From ebff7327af5efa9f57c605284de4fae6b050ae0f Mon Sep 17 00:00:00 2001
From: Lianhui Wang
Date: Fri, 8 May 2015 08:44:46 -0500
Subject: [PATCH 44/48] [SPARK-6869] [PYSPARK] Add pyspark archives path to PYTHONPATH

Based on https://github.com/apache/spark/pull/5478, which provides a PYSPARK_ARCHIVES_PATH env variable. With this PR, we only need to export PYSPARK_ARCHIVES_PATH=/user/spark/pyspark.zip,/user/spark/python/lib/py4j-0.8.2.1-src.zip in conf/spark-env.sh when PySpark is not installed on each node of the YARN cluster (see the snippet below). I ran Python applications successfully in both yarn-client and yarn-cluster mode with this PR. andrewor14 sryza Sephiroth-Lin Can you take a look at this? Thanks.
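Concretely, the workaround above is a single line in conf/spark-env.sh (paths exactly as in the description; adjust them to where the archives actually live):

```
export PYSPARK_ARCHIVES_PATH=/user/spark/pyspark.zip,/user/spark/python/lib/py4j-0.8.2.1-src.zip
```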
Author: Lianhui Wang Closes #5580 from lianhuiwang/SPARK-6869 and squashes the following commits: 66ffa43 [Lianhui Wang] Update Client.scala c2ad0f9 [Lianhui Wang] Update Client.scala 1c8f664 [Lianhui Wang] Merge remote-tracking branch 'remotes/apache/master' into SPARK-6869 008850a [Lianhui Wang] Merge remote-tracking branch 'remotes/apache/master' into SPARK-6869 f0b4ed8 [Lianhui Wang] Merge remote-tracking branch 'remotes/apache/master' into SPARK-6869 150907b [Lianhui Wang] Merge remote-tracking branch 'remotes/apache/master' into SPARK-6869 20402cd [Lianhui Wang] use ZipEntry 9d87c3f [Lianhui Wang] update scala style e7bd971 [Lianhui Wang] address vanzin's comments 4b8a3ed [Lianhui Wang] use pyArchivesEnvOpt e6b573b [Lianhui Wang] address vanzin's comments f11f84a [Lianhui Wang] zip pyspark archives 5192cca [Lianhui Wang] update import path 3b1e4c8 [Lianhui Wang] address tgravescs's comments 9396346 [Lianhui Wang] put zip to make-distribution.sh 0d2baf7 [Lianhui Wang] update import paths e0179be [Lianhui Wang] add zip pyspark archives in build or sparksubmit 31e8e06 [Lianhui Wang] update code style 9f31dac [Lianhui Wang] update code and add comments f72987c [Lianhui Wang] add archives path to PYTHONPATH --- assembly/pom.xml | 21 ++++++++++ .../org/apache/spark/deploy/SparkSubmit.scala | 41 +++++++++++++++++++ project/SparkBuild.scala | 37 ++++++++++++++++- .../org/apache/spark/deploy/yarn/Client.scala | 23 ++++++++--- 4 files changed, 114 insertions(+), 8 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 2b4d0a990bf22..626c8577e31fe 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -92,6 +92,27 @@ true + + + org.apache.maven.plugins + maven-antrun-plugin + + + package + + run + + + + + + + + + + + + org.apache.maven.plugins diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 8a0327984e195..329fa06ba8ba5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -332,6 +332,47 @@ object SparkSubmit { } } + // In yarn mode for a python app, add pyspark archives to files + // that can be distributed with the job + if (args.isPython && clusterManager == YARN) { + var pyArchives: String = null + val pyArchivesEnvOpt = sys.env.get("PYSPARK_ARCHIVES_PATH") + if (pyArchivesEnvOpt.isDefined) { + pyArchives = pyArchivesEnvOpt.get + } else { + if (!sys.env.contains("SPARK_HOME")) { + printErrorAndExit("SPARK_HOME does not exist for python application in yarn mode.") + } + val pythonPath = new ArrayBuffer[String] + for (sparkHome <- sys.env.get("SPARK_HOME")) { + val pyLibPath = Seq(sparkHome, "python", "lib").mkString(File.separator) + val pyArchivesFile = new File(pyLibPath, "pyspark.zip") + if (!pyArchivesFile.exists()) { + printErrorAndExit("pyspark.zip does not exist for python application in yarn mode.") + } + val py4jFile = new File(pyLibPath, "py4j-0.8.2.1-src.zip") + if (!py4jFile.exists()) { + printErrorAndExit("py4j-0.8.2.1-src.zip does not exist for python application " + + "in yarn mode.") + } + pythonPath += pyArchivesFile.getAbsolutePath() + pythonPath += py4jFile.getAbsolutePath() + } + pyArchives = pythonPath.mkString(",") + } + + pyArchives = pyArchives.split(",").map { localPath=> + val localURI = Utils.resolveURI(localPath) + if (localURI.getScheme != "local") { + args.files = mergeFileLists(args.files, localURI.toString) + new Path(localPath).getName + } else { + 
localURI.getPath + } + }.mkString(File.pathSeparator) + sysProps("spark.submit.pyArchives") = pyArchives + } + // If we're running a R app, set the main class to our specific R runner if (args.isR && deployMode == CLIENT) { if (args.primaryResource == SPARKR_SHELL) { diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 026855f8f6a5a..186345af0e60e 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -370,6 +370,7 @@ object Assembly { object PySparkAssembly { import sbtassembly.Plugin._ import AssemblyKeys._ + import java.util.zip.{ZipOutputStream, ZipEntry} lazy val settings = Seq( unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" }, @@ -377,16 +378,48 @@ object PySparkAssembly { // to be included in the assembly. We can't just add "python/" to the assembly's resource dir // list since that will copy unneeded / unwanted files. resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File => + val src = new File(BuildCommons.sparkHome, "python/pyspark") + + val zipFile = new File(BuildCommons.sparkHome , "python/lib/pyspark.zip") + zipFile.delete() + zipRecursive(src, zipFile) + val dst = new File(outDir, "pyspark") if (!dst.isDirectory()) { require(dst.mkdirs()) } - - val src = new File(BuildCommons.sparkHome, "python/pyspark") copy(src, dst) } ) + private def zipRecursive(source: File, destZipFile: File) = { + val destOutput = new ZipOutputStream(new FileOutputStream(destZipFile)) + addFilesToZipStream("", source, destOutput) + destOutput.flush() + destOutput.close() + } + + private def addFilesToZipStream(parent: String, source: File, output: ZipOutputStream): Unit = { + if (source.isDirectory()) { + output.putNextEntry(new ZipEntry(parent + source.getName())) + for (file <- source.listFiles()) { + addFilesToZipStream(parent + source.getName() + File.separator, file, output) + } + } else { + val in = new FileInputStream(source) + output.putNextEntry(new ZipEntry(parent + source.getName())) + val buf = new Array[Byte](8192) + var n = 0 + while (n != -1) { + n = in.read(buf) + if (n != -1) { + output.write(buf, 0, n) + } + } + in.close() + } + } + private def copy(src: File, dst: File): Seq[File] = { src.listFiles().flatMap { f => val child = new File(dst, f.getName()) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 20ecaf092e3f8..d21a7393478ce 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -468,6 +468,17 @@ private[spark] class Client( env("SPARK_YARN_USER_ENV") = userEnvs } + // if spark.submit.pyArchives is in sparkConf, append pyArchives to PYTHONPATH + // that can be passed on to the ApplicationMaster and the executors. + if (sparkConf.contains("spark.submit.pyArchives")) { + var pythonPath = sparkConf.get("spark.submit.pyArchives") + if (env.contains("PYTHONPATH")) { + pythonPath = Seq(env.get("PYTHONPATH"), pythonPath).mkString(File.pathSeparator) + } + env("PYTHONPATH") = pythonPath + sparkConf.setExecutorEnv("PYTHONPATH", pythonPath) + } + // In cluster mode, if the deprecated SPARK_JAVA_OPTS is set, we need to propagate it to // executors. 
But we can't just set spark.executor.extraJavaOptions, because the driver's // SparkContext will not let that set spark* system properties, which is expected behavior for @@ -1074,7 +1085,7 @@ object Client extends Logging { val hiveConf = hiveClass.getMethod("getConf").invoke(hive) val hiveConfClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.conf.HiveConf") - val hiveConfGet = (param:String) => Option(hiveConfClass + val hiveConfGet = (param: String) => Option(hiveConfClass .getMethod("get", classOf[java.lang.String]) .invoke(hiveConf, param)) @@ -1096,7 +1107,7 @@ object Client extends Logging { val hive2Token = new Token[DelegationTokenIdentifier]() hive2Token.decodeFromUrlString(tokenStr) - credentials.addToken(new Text("hive.server2.delegation.token"),hive2Token) + credentials.addToken(new Text("hive.server2.delegation.token"), hive2Token) logDebug("Added hive.Server2.delegation.token to conf.") hiveClass.getMethod("closeCurrent").invoke(null) } else { @@ -1141,13 +1152,13 @@ object Client extends Logging { logInfo("Added HBase security token to credentials.") } catch { - case e:java.lang.NoSuchMethodException => + case e: java.lang.NoSuchMethodException => logInfo("HBase Method not found: " + e) - case e:java.lang.ClassNotFoundException => + case e: java.lang.ClassNotFoundException => logDebug("HBase Class not found: " + e) - case e:java.lang.NoClassDefFoundError => + case e: java.lang.NoClassDefFoundError => logDebug("HBase Class not found: " + e) - case e:Exception => + case e: Exception => logError("Exception when obtaining HBase security token: " + e) } } From c796be70f36e262b6a2ce75924bd970f40bf4045 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 8 May 2015 16:54:32 +0100 Subject: [PATCH 45/48] [SPARK-3454] separate json endpoints for data in the UI Exposes data available in the UI as json over http. Key points: * new endpoints, handled independently of existing XyzPage classes. Root entrypoint is `JsonRootResource` * Uses jersey + jackson for routing & converting POJOs into json * tests against known results in `HistoryServerSuite` * also fixes some minor issues w/ the UI -- synchronizing on access to `StorageListener` & `StorageStatusListener`, and fixing some inconsistencies w/ the way we handle retained jobs & stages. 
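As a quick illustration of the new API, the endpoints can be exercised with a few lines of Scala. This is a sketch under assumptions: the /api/v1 mount point and the history server's default port 18080 follow the monitoring.md additions in this patch, which are not reproduced here, so check that file for the authoritative routes.

    // Hypothetical smoke test: fetch the completed applications from a
    // running history server and print the raw JSON response. The URL is
    // an assumption, not taken from code shown in this patch.
    object JsonApiSmokeTest {
      def main(args: Array[String]): Unit = {
        val url = "http://localhost:18080/api/v1/applications?status=completed"
        println(scala.io.Source.fromURL(url).mkString)
      }
    }

The same resources are also served by the master UI and by live application UIs, since all of them attach the JsonRootResource servlet added below.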
Author: Imran Rashid Closes #5940 from squito/SPARK-3454_better_test_files and squashes the following commits: 1a72ed6 [Imran Rashid] rats 85fdb3e [Imran Rashid] Merge branch 'no_php' into SPARK-3454 1fc65b0 [Imran Rashid] Revert "Revert "[SPARK-3454] separate json endpoints for data in the UI"" 1276900 [Imran Rashid] get rid of giant event file, replace w/ smaller one; check both shuffle read & shuffle write 4e12013 [Imran Rashid] just use test case name for expectation file name 863ef64 [Imran Rashid] rename json files to avoid strange file names and not look like php --- .rat-excludes | 7 + core/pom.xml | 8 + .../org/apache/spark/JobExecutionStatus.java | 8 +- .../status/api/v1/ApplicationStatus.java | 30 + .../spark/status/api/v1/StageStatus.java | 31 + .../spark/status/api/v1/TaskSorting.java | 48 + .../java/org/apache/spark/util/EnumUtil.java | 38 + .../scala/org/apache/spark/SparkContext.scala | 2 +- .../history/ApplicationHistoryProvider.scala | 4 +- .../deploy/history/FsHistoryProvider.scala | 14 +- .../spark/deploy/history/HistoryServer.scala | 20 +- .../spark/deploy/master/ApplicationInfo.scala | 2 +- .../apache/spark/deploy/master/Master.scala | 14 +- .../deploy/master/ui/ApplicationPage.scala | 19 +- .../spark/deploy/master/ui/MasterPage.scala | 12 +- .../spark/deploy/master/ui/MasterWebUI.scala | 24 +- .../spark/status/api/v1/AllJobsResource.scala | 98 ++ .../spark/status/api/v1/AllRDDResource.scala | 104 ++ .../status/api/v1/AllStagesResource.scala | 309 ++++ .../api/v1/ApplicationListResource.scala | 94 ++ .../status/api/v1/ExecutorListResource.scala | 36 + .../status/api/v1/JacksonMessageWriter.scala | 93 ++ .../status/api/v1/JsonRootResource.scala | 255 +++ .../api/v1/OneApplicationResource.scala | 31 + .../spark/status/api/v1/OneJobResource.scala | 41 + .../spark/status/api/v1/OneRDDResource.scala | 34 + .../status/api/v1/OneStageResource.scala | 150 ++ .../spark/status/api/v1/SecurityFilter.scala | 38 + .../spark/status/api/v1/SimpleDateParam.scala | 55 + .../org/apache/spark/status/api/v1/api.scala | 228 +++ .../spark/storage/StorageStatusListener.scala | 6 +- .../scala/org/apache/spark/ui/SparkUI.scala | 49 +- .../scala/org/apache/spark/ui/WebUI.scala | 8 +- .../apache/spark/ui/exec/ExecutorsPage.scala | 17 +- .../apache/spark/ui/jobs/AllJobsPage.scala | 14 +- .../apache/spark/ui/jobs/AllStagesPage.scala | 12 +- .../org/apache/spark/ui/jobs/JobPage.scala | 2 +- .../spark/ui/jobs/JobProgressListener.scala | 4 + .../org/apache/spark/ui/jobs/PoolPage.scala | 2 +- .../org/apache/spark/ui/jobs/StagePage.scala | 19 +- .../org/apache/spark/ui/storage/RDDPage.scala | 73 +- .../apache/spark/ui/storage/StoragePage.scala | 2 +- .../apache/spark/ui/storage/StorageTab.scala | 6 +- .../application_list_json_expectation.json | 53 + .../complete_stage_list_json_expectation.json | 67 + .../completed_app_list_json_expectation.json | 53 + .../executor_list_json_expectation.json | 17 + .../failed_stage_list_json_expectation.json | 23 + ...multi_attempt_app_json_1__expectation.json | 15 + ...multi_attempt_app_json_2__expectation.json | 15 + .../job_list_json_expectation.json | 43 + .../maxDate2_app_list_json_expectation.json | 10 + .../maxDate_app_list_json_expectation.json | 19 + .../minDate_app_list_json_expectation.json | 35 + .../one_app_json_expectation.json | 10 + ...ne_app_multi_attempt_json_expectation.json | 17 + .../one_job_json_expectation.json | 15 + .../one_rdd_storage_json_expectation.json | 64 + .../one_stage_attempt_json_expectation.json | 270 ++++ 
.../one_stage_json_expectation.json | 270 ++++ .../rdd_list_storage_json_expectation.json | 9 + .../running_app_list_json_expectation.json | 1 + .../stage_list_json_expectation.json | 89 ++ ...ist_with_accumulable_json_expectation.json | 27 + .../stage_task_list_expectation.json | 561 +++++++ ...multi_attempt_app_json_1__expectation.json | 193 +++ ...multi_attempt_app_json_2__expectation.json | 193 +++ ...k_list_w__offset___length_expectation.json | 1401 +++++++++++++++++ ...stage_task_list_w__sortBy_expectation.json | 561 +++++++ ...tBy_short_names___runtime_expectation.json | 561 +++++++ ...rtBy_short_names__runtime_expectation.json | 561 +++++++ ...mmary_w__custom_quantiles_expectation.json | 19 + ...sk_summary_w_shuffle_read_expectation.json | 19 + ...k_summary_w_shuffle_write_expectation.json | 19 + ...age_with_accumulable_json_expectation.json | 242 +++ ...eded_failed_job_list_json_expectation.json | 43 + .../succeeded_job_list_json_expectation.json | 29 + .../local-1422981759269/APPLICATION_COMPLETE | 0 .../local-1422981759269/EVENT_LOG_1 | 88 ++ .../local-1422981759269/SPARK_VERSION_1.2.0 | 0 .../local-1422981780767/APPLICATION_COMPLETE | 0 .../local-1422981780767/EVENT_LOG_1 | 82 + .../local-1422981780767/SPARK_VERSION_1.2.0 | 0 .../local-1425081759269/APPLICATION_COMPLETE | 0 .../local-1425081759269/EVENT_LOG_1 | 88 ++ .../local-1425081759269/SPARK_VERSION_1.2.0 | 0 .../local-1426533911241/APPLICATION_COMPLETE | 0 .../local-1426533911241/EVENT_LOG_1 | 24 + .../local-1426533911241/SPARK_VERSION_1.2.0 | 0 .../local-1426633911242/APPLICATION_COMPLETE | 0 .../local-1426633911242/EVENT_LOG_1 | 24 + .../local-1426633911242/SPARK_VERSION_1.2.0 | 0 .../spark-events/local-1430917381534 | 231 +++ .../org/apache/spark/JsonTestUtils.scala | 34 + .../spark/deploy/JsonProtocolSuite.scala | 14 +- .../deploy/history/HistoryServerSuite.scala | 231 ++- .../status/api/v1/SimpleDateParamTest.scala | 29 + .../org/apache/spark/ui/UISeleniumSuite.scala | 264 +++- docs/monitoring.md | 74 + pom.xml | 12 + 100 files changed, 8608 insertions(+), 172 deletions(-) create mode 100644 core/src/main/java/org/apache/spark/status/api/v1/ApplicationStatus.java create mode 100644 core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java create mode 100644 core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java create mode 100644 core/src/main/java/org/apache/spark/util/EnumUtil.java create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/AllRDDResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/ExecutorListResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/OneJobResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala 
create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala create mode 100644 core/src/main/scala/org/apache/spark/status/api/v1/api.scala create mode 100644 core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/complete_stage_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/failed_stage_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/maxDate2_app_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/maxDate_app_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/one_app_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/one_app_multi_attempt_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/one_rdd_storage_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/one_stage_attempt_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/one_stage_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/running_app_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_list_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_list_with_accumulable_json_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_1__expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_2__expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_w__offset___length_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names___runtime_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names__runtime_expectation.json create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json create mode 100644 
core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json
 create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json
 create mode 100644 core/src/test/resources/HistoryServerExpectations/stage_with_accumulable_json_expectation.json
 create mode 100644 core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json
 create mode 100644 core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json
 create mode 100755 core/src/test/resources/spark-events/local-1422981759269/APPLICATION_COMPLETE
 create mode 100755 core/src/test/resources/spark-events/local-1422981759269/EVENT_LOG_1
 create mode 100755 core/src/test/resources/spark-events/local-1422981759269/SPARK_VERSION_1.2.0
 create mode 100755 core/src/test/resources/spark-events/local-1422981780767/APPLICATION_COMPLETE
 create mode 100755 core/src/test/resources/spark-events/local-1422981780767/EVENT_LOG_1
 create mode 100755 core/src/test/resources/spark-events/local-1422981780767/SPARK_VERSION_1.2.0
 create mode 100755 core/src/test/resources/spark-events/local-1425081759269/APPLICATION_COMPLETE
 create mode 100755 core/src/test/resources/spark-events/local-1425081759269/EVENT_LOG_1
 create mode 100755 core/src/test/resources/spark-events/local-1425081759269/SPARK_VERSION_1.2.0
 create mode 100755 core/src/test/resources/spark-events/local-1426533911241/APPLICATION_COMPLETE
 create mode 100755 core/src/test/resources/spark-events/local-1426533911241/EVENT_LOG_1
 create mode 100755 core/src/test/resources/spark-events/local-1426533911241/SPARK_VERSION_1.2.0
 create mode 100755 core/src/test/resources/spark-events/local-1426633911242/APPLICATION_COMPLETE
 create mode 100755 core/src/test/resources/spark-events/local-1426633911242/EVENT_LOG_1
 create mode 100755 core/src/test/resources/spark-events/local-1426633911242/SPARK_VERSION_1.2.0
 create mode 100755 core/src/test/resources/spark-events/local-1430917381534
 create mode 100644 core/src/test/scala/org/apache/spark/JsonTestUtils.scala
 create mode 100644 core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala

diff --git a/.rat-excludes b/.rat-excludes
index dccf2db8055ce..dc08c4a3673d2 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -74,5 +74,12 @@ logs
 .*scalastyle-output.xml
 .*dependency-reduced-pom.xml
 known_translations
+json_expectation
+local-1422981759269/*
+local-1422981780767/*
+local-1425081759269/*
+local-1426533911241/*
+local-1426633911242/*
+local-1430917381534/*
 DESCRIPTION
 NAMESPACE
diff --git a/core/pom.xml b/core/pom.xml
index 164a836cb8f0b..fc42f48973fe9 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -228,6 +228,14 @@
       <artifactId>json4s-jackson_${scala.binary.version}</artifactId>
       <version>3.2.10</version>
     </dependency>
+    <dependency>
+      <groupId>com.sun.jersey</groupId>
+      <artifactId>jersey-server</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.sun.jersey</groupId>
+      <artifactId>jersey-core</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.mesos</groupId>
       <artifactId>mesos</artifactId>
diff --git a/core/src/main/java/org/apache/spark/JobExecutionStatus.java b/core/src/main/java/org/apache/spark/JobExecutionStatus.java
index 6e161313702bb..0287fb79f8dd2 100644
--- a/core/src/main/java/org/apache/spark/JobExecutionStatus.java
+++ b/core/src/main/java/org/apache/spark/JobExecutionStatus.java
@@ -17,9 +17,15 @@
 
 package org.apache.spark;
 
+import org.apache.spark.util.EnumUtil;
+
 public enum JobExecutionStatus {
   RUNNING,
   SUCCEEDED,
   FAILED,
-  UNKNOWN
+  UNKNOWN;
+
+  public static JobExecutionStatus fromString(String str) {
+    return EnumUtil.parseIgnoreCase(JobExecutionStatus.class, str);
+  }
 }
diff --git
a/core/src/main/java/org/apache/spark/status/api/v1/ApplicationStatus.java b/core/src/main/java/org/apache/spark/status/api/v1/ApplicationStatus.java new file mode 100644 index 0000000000000..8c7dcf776fda8 --- /dev/null +++ b/core/src/main/java/org/apache/spark/status/api/v1/ApplicationStatus.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status.api.v1; + +import org.apache.spark.util.EnumUtil; + +public enum ApplicationStatus { + COMPLETED, + RUNNING; + + public static ApplicationStatus fromString(String str) { + return EnumUtil.parseIgnoreCase(ApplicationStatus.class, str); + } + +} diff --git a/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java b/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java new file mode 100644 index 0000000000000..9dbb565aab707 --- /dev/null +++ b/core/src/main/java/org/apache/spark/status/api/v1/StageStatus.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status.api.v1; + +import org.apache.spark.util.EnumUtil; + +public enum StageStatus { + ACTIVE, + COMPLETE, + FAILED, + PENDING; + + public static StageStatus fromString(String str) { + return EnumUtil.parseIgnoreCase(StageStatus.class, str); + } +} diff --git a/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java b/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java new file mode 100644 index 0000000000000..f19ed01d5aebf --- /dev/null +++ b/core/src/main/java/org/apache/spark/status/api/v1/TaskSorting.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.status.api.v1;
+
+import org.apache.spark.util.EnumUtil;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public enum TaskSorting {
+  ID,
+  INCREASING_RUNTIME("runtime"),
+  DECREASING_RUNTIME("-runtime");
+
+  private final Set<String> alternateNames;
+  private TaskSorting(String... names) {
+    alternateNames = new HashSet<String>();
+    for (String n: names) {
+      alternateNames.add(n);
+    }
+  }
+
+  public static TaskSorting fromString(String str) {
+    String lower = str.toLowerCase();
+    for (TaskSorting t: values()) {
+      if (t.alternateNames.contains(lower)) {
+        return t;
+      }
+    }
+    return EnumUtil.parseIgnoreCase(TaskSorting.class, str);
+  }
+
+}
diff --git a/core/src/main/java/org/apache/spark/util/EnumUtil.java b/core/src/main/java/org/apache/spark/util/EnumUtil.java
new file mode 100644
index 0000000000000..c40c7e727613c
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/util/EnumUtil.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.util;
+
+import com.google.common.base.Joiner;
+import org.apache.spark.annotation.Private;
+
+@Private
+public class EnumUtil {
+  public static <E extends Enum<E>> E parseIgnoreCase(Class<E> clz, String str) {
+    E[] constants = clz.getEnumConstants();
+    if (str == null) {
+      return null;
+    }
+    for (E e : constants) {
+      if (e.name().equalsIgnoreCase(str)) {
+        return e;
+      }
+    }
+    throw new IllegalArgumentException(
+      String.format("Illegal type='%s'.
Supported type values: %s", + str, Joiner.on(", ").join(constants))); + } +} diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b5f040ceb15ca..b59f562d05ead 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -430,7 +430,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli _ui = if (conf.getBoolean("spark.ui.enabled", true)) { Some(SparkUI.createLiveUI(this, _conf, listenerBus, _jobProgressListener, - _env.securityManager,appName)) + _env.securityManager,appName, startTime = startTime)) } else { // For tests, do not enable the UI None diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index 6a5011af17458..298a8201960d1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.history import org.apache.spark.ui.SparkUI -private[history] case class ApplicationAttemptInfo( +private[spark] case class ApplicationAttemptInfo( attemptId: Option[String], startTime: Long, endTime: Long, @@ -27,7 +27,7 @@ private[history] case class ApplicationAttemptInfo( sparkUser: String, completed: Boolean = false) -private[history] case class ApplicationHistoryInfo( +private[spark] case class ApplicationHistoryInfo( id: String, name: String, attempts: List[ApplicationAttemptInfo]) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 993763f3aa092..45c2be34c8680 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -17,23 +17,21 @@ package org.apache.spark.deploy.history -import java.io.{IOException, BufferedInputStream, FileNotFoundException, InputStream} +import java.io.{BufferedInputStream, FileNotFoundException, IOException, InputStream} import java.util.concurrent.{ExecutorService, Executors, TimeUnit} import scala.collection.mutable -import scala.concurrent.duration.Duration -import com.google.common.util.concurrent.ThreadFactoryBuilder - -import com.google.common.util.concurrent.MoreExecutors -import org.apache.hadoop.fs.permission.AccessControlException +import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.permission.AccessControlException + +import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.io.CompressionCodec import org.apache.spark.scheduler._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} -import org.apache.spark.{Logging, SecurityManager, SparkConf} /** * A class that provides application history from event logs stored in the file system. 
@@ -151,7 +149,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val conf = this.conf.clone() val appSecManager = new SecurityManager(conf) SparkUI.createHistoryUI(conf, replayBus, appSecManager, appId, - HistoryServer.getAttemptURI(appId, attempt.attemptId)) + HistoryServer.getAttemptURI(appId, attempt.attemptId), attempt.startTime) // Do not call ui.bind() to avoid creating a new server for each application } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 754c8e9b6668b..50522e69dc519 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -25,6 +25,7 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.status.api.v1.{ApplicationInfo, ApplicationsListResource, JsonRootResource, UIRoot} import org.apache.spark.ui.{SparkUI, UIUtils, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.{SignalLogger, Utils} @@ -45,7 +46,7 @@ class HistoryServer( provider: ApplicationHistoryProvider, securityManager: SecurityManager, port: Int) - extends WebUI(securityManager, port, conf) with Logging { + extends WebUI(securityManager, port, conf) with Logging with UIRoot { // How many applications to retain private val retainedApplications = conf.getInt("spark.history.retainedApplications", 50) @@ -56,7 +57,7 @@ class HistoryServer( require(parts.length == 1 || parts.length == 2, s"Invalid app key $key") val ui = provider .getAppUI(parts(0), if (parts.length > 1) Some(parts(1)) else None) - .getOrElse(throw new NoSuchElementException()) + .getOrElse(throw new NoSuchElementException(s"no app with key $key")) attachSparkUI(ui) ui } @@ -113,6 +114,10 @@ class HistoryServer( } } + def getSparkUI(appKey: String): Option[SparkUI] = { + Option(appCache.get(appKey)) + } + initialize() /** @@ -123,6 +128,9 @@ class HistoryServer( */ def initialize() { attachPage(new HistoryPage(this)) + + attachHandler(JsonRootResource.getJsonServlet(this)) + attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static")) val contextHandler = new ServletContextHandler @@ -160,7 +168,13 @@ class HistoryServer( * * @return List of all known applications. */ - def getApplicationList(): Iterable[ApplicationHistoryInfo] = provider.getListing() + def getApplicationList(): Iterable[ApplicationHistoryInfo] = { + provider.getListing() + } + + def getApplicationInfoList: Iterator[ApplicationInfo] = { + getApplicationList().iterator.map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) + } /** * Returns the provider configuration to show in the listing page. 
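HistoryServer above and MasterWebUI further below are the two UIRoot implementations shown in this section, both wired into JsonRootResource. Inferred from those implementations only, the contract looks roughly like the sketch below; the authoritative definition lives in JsonRootResource.scala, which is not shown here, and may carry extra members or defaults.

    import org.apache.spark.status.api.v1.ApplicationInfo
    import org.apache.spark.ui.SparkUI

    // Approximate shape of UIRoot, pieced together from its implementors
    // in this patch; illustrative only.
    trait UIRoot {
      def getSparkUI(appKey: String): Option[SparkUI]
      def getApplicationInfoList: Iterator[ApplicationInfo]
    }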
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala index f59d550d4f3b3..1620e95bea218 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala @@ -28,7 +28,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.ApplicationDescription import org.apache.spark.util.Utils -private[deploy] class ApplicationInfo( +private[spark] class ApplicationInfo( val startTime: Long, val id: String, val desc: ApplicationDescription, diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 0fac3cdcf55e7..53e1903a3d125 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -754,9 +754,9 @@ private[master] class Master( /** * Rebuild a new SparkUI from the given application's event logs. - * Return whether this is successful. + * Return the UI if successful, else None */ - private def rebuildSparkUI(app: ApplicationInfo): Boolean = { + private[master] def rebuildSparkUI(app: ApplicationInfo): Option[SparkUI] = { val appName = app.desc.name val notFoundBasePath = HistoryServer.UI_PATH_PREFIX + "/not-found" try { @@ -764,7 +764,7 @@ private[master] class Master( .getOrElse { // Event logging is not enabled for this application app.desc.appUiUrl = notFoundBasePath - return false + return None } val eventLogFilePrefix = EventLoggingListener.getLogPath( @@ -787,7 +787,7 @@ private[master] class Master( val logInput = EventLoggingListener.openEventLog(new Path(eventLogFile), fs) val replayBus = new ReplayListenerBus() val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf), - appName + status, HistoryServer.UI_PATH_PREFIX + s"/${app.id}") + appName + status, HistoryServer.UI_PATH_PREFIX + s"/${app.id}", app.startTime) val maybeTruncated = eventLogFile.endsWith(EventLoggingListener.IN_PROGRESS) try { replayBus.replay(logInput, eventLogFile, maybeTruncated) @@ -798,7 +798,7 @@ private[master] class Master( webUi.attachSparkUI(ui) // Application UI is successfully rebuilt, so link the Master UI to it app.desc.appUiUrl = ui.basePath - true + Some(ui) } catch { case fnf: FileNotFoundException => // Event logging is enabled for this application, but no event logs are found @@ -808,7 +808,7 @@ private[master] class Master( msg += " Did you specify the correct logging directory?" 
msg = URLEncoder.encode(msg, "UTF-8") app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&title=$title" - false + None case e: Exception => // Relay exception message to application UI page val title = s"Application history load error (${app.id})" @@ -817,7 +817,7 @@ private[master] class Master( logError(msg, e) msg = URLEncoder.encode(msg, "UTF-8") app.desc.appUiUrl = notFoundBasePath + s"?msg=$msg&exception=$exception&title=$title" - false + None } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 273f077bd8f57..06e265f99e231 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -23,10 +23,8 @@ import scala.concurrent.Await import scala.xml.Node import akka.pattern.ask -import org.json4s.JValue -import org.json4s.JsonAST.JNothing -import org.apache.spark.deploy.{ExecutorState, JsonProtocol} +import org.apache.spark.deploy.ExecutorState import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.master.ExecutorDesc import org.apache.spark.ui.{UIUtils, WebUIPage} @@ -37,21 +35,6 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") private val master = parent.masterActorRef private val timeout = parent.timeout - /** Executor details for a particular application */ - override def renderJson(request: HttpServletRequest): JValue = { - val appId = request.getParameter("appId") - val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse] - val state = Await.result(stateFuture, timeout) - val app = state.activeApps.find(_.id == appId).getOrElse({ - state.completedApps.find(_.id == appId).getOrElse(null) - }) - if (app == null) { - JNothing - } else { - JsonProtocol.writeApplicationInfo(app) - } - } - /** Executor details for a particular application */ def render(request: HttpServletRequest): Seq[Node] = { val appId = request.getParameter("appId") diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index 1f2c3fdbfb2bc..756927682cd24 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -35,10 +35,13 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { private val master = parent.masterActorRef private val timeout = parent.timeout - override def renderJson(request: HttpServletRequest): JValue = { + def getMasterState: MasterStateResponse = { val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse] - val state = Await.result(stateFuture, timeout) - JsonProtocol.writeMasterState(state) + Await.result(stateFuture, timeout) + } + + override def renderJson(request: HttpServletRequest): JValue = { + JsonProtocol.writeMasterState(getMasterState) } def handleAppKillRequest(request: HttpServletRequest): Unit = { @@ -68,8 +71,7 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { /** Index view listing applications and executors */ def render(request: HttpServletRequest): Seq[Node] = { - val stateFuture = (master ? 
RequestMasterState)(timeout).mapTo[MasterStateResponse] - val state = Await.result(stateFuture, timeout) + val state = getMasterState val workerHeaders = Seq("Worker Id", "Address", "State", "Cores", "Memory") val workers = state.workers.sortBy(_.id) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala index dea0a65eeeaa6..eb26e9f99c70b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala @@ -19,6 +19,7 @@ package org.apache.spark.deploy.master.ui import org.apache.spark.Logging import org.apache.spark.deploy.master.Master +import org.apache.spark.status.api.v1.{ApplicationsListResource, ApplicationInfo, JsonRootResource, UIRoot} import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils @@ -28,12 +29,15 @@ import org.apache.spark.util.RpcUtils */ private[master] class MasterWebUI(val master: Master, requestedPort: Int) - extends WebUI(master.securityMgr, requestedPort, master.conf, name = "MasterUI") with Logging { + extends WebUI(master.securityMgr, requestedPort, master.conf, name = "MasterUI") with Logging + with UIRoot { val masterActorRef = master.self val timeout = RpcUtils.askTimeout(master.conf) val killEnabled = master.conf.getBoolean("spark.ui.killEnabled", true) + val masterPage = new MasterPage(this) + initialize() /** Initialize all components of the server. */ @@ -43,6 +47,7 @@ class MasterWebUI(val master: Master, requestedPort: Int) attachPage(new HistoryNotFoundPage(this)) attachPage(masterPage) attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static")) + attachHandler(JsonRootResource.getJsonServlet(this)) attachHandler(createRedirectHandler( "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST"))) attachHandler(createRedirectHandler( @@ -60,6 +65,23 @@ class MasterWebUI(val master: Master, requestedPort: Int) assert(serverInfo.isDefined, "Master UI must be bound to a server before detaching SparkUIs") ui.getHandlers.foreach(detachHandler) } + + def getApplicationInfoList: Iterator[ApplicationInfo] = { + val state = masterPage.getMasterState + val activeApps = state.activeApps.sortBy(_.startTime).reverse + val completedApps = state.completedApps.sortBy(_.endTime).reverse + activeApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, false) } ++ + completedApps.iterator.map { ApplicationsListResource.convertApplicationInfo(_, true) } + } + + def getSparkUI(appId: String): Option[SparkUI] = { + val state = masterPage.getMasterState + val activeApps = state.activeApps.sortBy(_.startTime).reverse + val completedApps = state.completedApps.sortBy(_.endTime).reverse + (activeApps ++ completedApps).find { _.id == appId }.flatMap { + master.rebuildSparkUI + } + } } private[master] object MasterWebUI { diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala new file mode 100644 index 0000000000000..5783df5d8220c --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllJobsResource.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import java.util.{Arrays, Date, List => JList} +import javax.ws.rs._ +import javax.ws.rs.core.MediaType + +import org.apache.spark.JobExecutionStatus +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.jobs.JobProgressListener +import org.apache.spark.ui.jobs.UIData.JobUIData + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class AllJobsResource(ui: SparkUI) { + + @GET + def jobsList(@QueryParam("status") statuses: JList[JobExecutionStatus]): Seq[JobData] = { + val statusToJobs: Seq[(JobExecutionStatus, Seq[JobUIData])] = + AllJobsResource.getStatusToJobs(ui) + val adjStatuses: JList[JobExecutionStatus] = { + if (statuses.isEmpty) { + Arrays.asList(JobExecutionStatus.values(): _*) + } else { + statuses + } + } + val jobInfos = for { + (status, jobs) <- statusToJobs + job <- jobs if adjStatuses.contains(status) + } yield { + AllJobsResource.convertJobData(job, ui.jobProgressListener, false) + } + jobInfos.sortBy{- _.jobId} + } + +} + +private[v1] object AllJobsResource { + + def getStatusToJobs(ui: SparkUI): Seq[(JobExecutionStatus, Seq[JobUIData])] = { + val statusToJobs = ui.jobProgressListener.synchronized { + Seq( + JobExecutionStatus.RUNNING -> ui.jobProgressListener.activeJobs.values.toSeq, + JobExecutionStatus.SUCCEEDED -> ui.jobProgressListener.completedJobs.toSeq, + JobExecutionStatus.FAILED -> ui.jobProgressListener.failedJobs.reverse.toSeq + ) + } + statusToJobs + } + + def convertJobData( + job: JobUIData, + listener: JobProgressListener, + includeStageDetails: Boolean): JobData = { + listener.synchronized { + val lastStageInfo = listener.stageIdToInfo.get(job.stageIds.max) + val lastStageData = lastStageInfo.flatMap { s => + listener.stageIdToData.get((s.stageId, s.attemptId)) + } + val lastStageName = lastStageInfo.map { _.name }.getOrElse("(Unknown Stage Name)") + val lastStageDescription = lastStageData.flatMap { _.description } + new JobData( + jobId = job.jobId, + name = lastStageName, + description = lastStageDescription, + submissionTime = job.submissionTime.map{new Date(_)}, + completionTime = job.completionTime.map{new Date(_)}, + stageIds = job.stageIds, + jobGroup = job.jobGroup, + status = job.status, + numTasks = job.numTasks, + numActiveTasks = job.numActiveTasks, + numCompletedTasks = job.numCompletedTasks, + numSkippedTasks = job.numCompletedTasks, + numFailedTasks = job.numFailedTasks, + numActiveStages = job.numActiveStages, + numCompletedStages = job.completedStageIndices.size, + numSkippedStages = job.numSkippedStages, + numFailedStages = job.numFailedStages + ) + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllRDDResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllRDDResource.scala new file mode 100644 index 0000000000000..645ede26a0879 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllRDDResource.scala @@ -0,0 +1,104 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.{GET, Produces} +import javax.ws.rs.core.MediaType + +import org.apache.spark.storage.{RDDInfo, StorageStatus, StorageUtils} +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.storage.StorageListener + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class AllRDDResource(ui: SparkUI) { + + @GET + def rddList(): Seq[RDDStorageInfo] = { + val storageStatusList = ui.storageListener.storageStatusList + val rddInfos = ui.storageListener.rddInfoList + rddInfos.map{rddInfo => + AllRDDResource.getRDDStorageInfo(rddInfo.id, rddInfo, storageStatusList, + includeDetails = false) + } + } + +} + +private[spark] object AllRDDResource { + + def getRDDStorageInfo( + rddId: Int, + listener: StorageListener, + includeDetails: Boolean): Option[RDDStorageInfo] = { + val storageStatusList = listener.storageStatusList + listener.rddInfoList.find { _.id == rddId }.map { rddInfo => + getRDDStorageInfo(rddId, rddInfo, storageStatusList, includeDetails) + } + } + + def getRDDStorageInfo( + rddId: Int, + rddInfo: RDDInfo, + storageStatusList: Seq[StorageStatus], + includeDetails: Boolean): RDDStorageInfo = { + val workers = storageStatusList.map { (rddId, _) } + val blockLocations = StorageUtils.getRddBlockLocations(rddId, storageStatusList) + val blocks = storageStatusList + .flatMap { _.rddBlocksById(rddId) } + .sortWith { _._1.name < _._1.name } + .map { case (blockId, status) => + (blockId, status, blockLocations.get(blockId).getOrElse(Seq[String]("Unknown"))) + } + + val dataDistribution = if (includeDetails) { + Some(storageStatusList.map { status => + new RDDDataDistribution( + address = status.blockManagerId.hostPort, + memoryUsed = status.memUsedByRdd(rddId), + memoryRemaining = status.memRemaining, + diskUsed = status.diskUsedByRdd(rddId) + ) } ) + } else { + None + } + val partitions = if (includeDetails) { + Some(blocks.map { case (id, block, locations) => + new RDDPartitionInfo( + blockName = id.name, + storageLevel = block.storageLevel.description, + memoryUsed = block.memSize, + diskUsed = block.diskSize, + executors = locations + ) + } ) + } else { + None + } + + new RDDStorageInfo( + id = rddId, + name = rddInfo.name, + numPartitions = rddInfo.numPartitions, + numCachedPartitions = rddInfo.numCachedPartitions, + storageLevel = rddInfo.storageLevel.description, + memoryUsed = rddInfo.memSize, + diskUsed = rddInfo.diskSize, + dataDistribution = dataDistribution, + partitions = partitions + ) + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala new file mode 100644 index 0000000000000..50608588f09ae --- /dev/null +++ 
b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import java.util.{Arrays, Date, List => JList} +import javax.ws.rs.{GET, PathParam, Produces, QueryParam} +import javax.ws.rs.core.MediaType + +import org.apache.spark.executor.{InputMetrics => InternalInputMetrics, OutputMetrics => InternalOutputMetrics, ShuffleReadMetrics => InternalShuffleReadMetrics, ShuffleWriteMetrics => InternalShuffleWriteMetrics, TaskMetrics => InternalTaskMetrics} +import org.apache.spark.scheduler.{AccumulableInfo => InternalAccumulableInfo, StageInfo} +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.jobs.UIData.{StageUIData, TaskUIData} +import org.apache.spark.util.Distribution + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class AllStagesResource(ui: SparkUI) { + + @GET + def stageList(@QueryParam("status") statuses: JList[StageStatus]): Seq[StageData] = { + val listener = ui.jobProgressListener + val stageAndStatus = AllStagesResource.stagesAndStatus(ui) + val adjStatuses = { + if (statuses.isEmpty()) { + Arrays.asList(StageStatus.values(): _*) + } else { + statuses + } + } + for { + (status, stageList) <- stageAndStatus + stageInfo: StageInfo <- stageList if adjStatuses.contains(status) + stageUiData: StageUIData <- listener.synchronized { + listener.stageIdToData.get((stageInfo.stageId, stageInfo.attemptId)) + } + } yield { + AllStagesResource.stageUiToStageData(status, stageInfo, stageUiData, includeDetails = false) + } + } +} + +private[v1] object AllStagesResource { + def stageUiToStageData( + status: StageStatus, + stageInfo: StageInfo, + stageUiData: StageUIData, + includeDetails: Boolean): StageData = { + + val taskData = if (includeDetails) { + Some(stageUiData.taskData.map { case (k, v) => k -> convertTaskData(v) } ) + } else { + None + } + val executorSummary = if (includeDetails) { + Some(stageUiData.executorSummary.map { case (k, summary) => + k -> new ExecutorStageSummary( + taskTime = summary.taskTime, + failedTasks = summary.failedTasks, + succeededTasks = summary.succeededTasks, + inputBytes = summary.inputBytes, + outputBytes = summary.outputBytes, + shuffleRead = summary.shuffleRead, + shuffleWrite = summary.shuffleWrite, + memoryBytesSpilled = summary.memoryBytesSpilled, + diskBytesSpilled = summary.diskBytesSpilled + ) + }) + } else { + None + } + + val accumulableInfo = stageUiData.accumulables.values.map { convertAccumulableInfo }.toSeq + + new StageData( + status = status, + stageId = stageInfo.stageId, + attemptId = stageInfo.attemptId, + numActiveTasks = stageUiData.numActiveTasks, + numCompleteTasks = stageUiData.numCompleteTasks, + numFailedTasks = 
stageUiData.numFailedTasks, + executorRunTime = stageUiData.executorRunTime, + inputBytes = stageUiData.inputBytes, + inputRecords = stageUiData.inputRecords, + outputBytes = stageUiData.outputBytes, + outputRecords = stageUiData.outputRecords, + shuffleReadBytes = stageUiData.shuffleReadTotalBytes, + shuffleReadRecords = stageUiData.shuffleReadRecords, + shuffleWriteBytes = stageUiData.shuffleWriteBytes, + shuffleWriteRecords = stageUiData.shuffleWriteRecords, + memoryBytesSpilled = stageUiData.memoryBytesSpilled, + diskBytesSpilled = stageUiData.diskBytesSpilled, + schedulingPool = stageUiData.schedulingPool, + name = stageInfo.name, + details = stageInfo.details, + accumulatorUpdates = accumulableInfo, + tasks = taskData, + executorSummary = executorSummary + ) + } + + def stagesAndStatus(ui: SparkUI): Seq[(StageStatus, Seq[StageInfo])] = { + val listener = ui.jobProgressListener + listener.synchronized { + Seq( + StageStatus.ACTIVE -> listener.activeStages.values.toSeq, + StageStatus.COMPLETE -> listener.completedStages.reverse.toSeq, + StageStatus.FAILED -> listener.failedStages.reverse.toSeq, + StageStatus.PENDING -> listener.pendingStages.values.toSeq + ) + } + } + + def convertTaskData(uiData: TaskUIData): TaskData = { + new TaskData( + taskId = uiData.taskInfo.taskId, + index = uiData.taskInfo.index, + attempt = uiData.taskInfo.attempt, + launchTime = new Date(uiData.taskInfo.launchTime), + executorId = uiData.taskInfo.executorId, + host = uiData.taskInfo.host, + taskLocality = uiData.taskInfo.taskLocality.toString(), + speculative = uiData.taskInfo.speculative, + accumulatorUpdates = uiData.taskInfo.accumulables.map { convertAccumulableInfo }, + errorMessage = uiData.errorMessage, + taskMetrics = uiData.taskMetrics.map { convertUiTaskMetrics } + ) + } + + def taskMetricDistributions( + allTaskData: Iterable[TaskUIData], + quantiles: Array[Double]): TaskMetricDistributions = { + + val rawMetrics = allTaskData.flatMap{_.taskMetrics}.toSeq + + def metricQuantiles(f: InternalTaskMetrics => Double): IndexedSeq[Double] = + Distribution(rawMetrics.map { d => f(d) }).get.getQuantiles(quantiles) + + // We need to do a lot of similar munging to nested metrics here. For each one, + // we want (a) extract the values for nested metrics (b) make a distribution for each metric + // (c) shove the distribution into the right field in our return type and (d) only return + // a result if the option is defined for any of the tasks. MetricHelper is a little util + // to make it a little easier to deal w/ all of the nested options. Mostly it lets us just + // implement one "build" method, which just builds the quantiles for each field. 
+ + val inputMetrics: Option[InputMetricDistributions] = + new MetricHelper[InternalInputMetrics, InputMetricDistributions](rawMetrics, quantiles) { + def getSubmetrics(raw: InternalTaskMetrics): Option[InternalInputMetrics] = { + raw.inputMetrics + } + + def build: InputMetricDistributions = new InputMetricDistributions( + bytesRead = submetricQuantiles(_.bytesRead), + recordsRead = submetricQuantiles(_.recordsRead) + ) + }.metricOption + + val outputMetrics: Option[OutputMetricDistributions] = + new MetricHelper[InternalOutputMetrics, OutputMetricDistributions](rawMetrics, quantiles) { + def getSubmetrics(raw:InternalTaskMetrics): Option[InternalOutputMetrics] = { + raw.outputMetrics + } + def build: OutputMetricDistributions = new OutputMetricDistributions( + bytesWritten = submetricQuantiles(_.bytesWritten), + recordsWritten = submetricQuantiles(_.recordsWritten) + ) + }.metricOption + + val shuffleReadMetrics: Option[ShuffleReadMetricDistributions] = + new MetricHelper[InternalShuffleReadMetrics, ShuffleReadMetricDistributions](rawMetrics, + quantiles) { + def getSubmetrics(raw: InternalTaskMetrics): Option[InternalShuffleReadMetrics] = { + raw.shuffleReadMetrics + } + def build: ShuffleReadMetricDistributions = new ShuffleReadMetricDistributions( + readBytes = submetricQuantiles(_.totalBytesRead), + readRecords = submetricQuantiles(_.recordsRead), + remoteBytesRead = submetricQuantiles(_.remoteBytesRead), + remoteBlocksFetched = submetricQuantiles(_.remoteBlocksFetched), + localBlocksFetched = submetricQuantiles(_.localBlocksFetched), + totalBlocksFetched = submetricQuantiles(_.totalBlocksFetched), + fetchWaitTime = submetricQuantiles(_.fetchWaitTime) + ) + }.metricOption + + val shuffleWriteMetrics: Option[ShuffleWriteMetricDistributions] = + new MetricHelper[InternalShuffleWriteMetrics, ShuffleWriteMetricDistributions](rawMetrics, + quantiles) { + def getSubmetrics(raw: InternalTaskMetrics): Option[InternalShuffleWriteMetrics] = { + raw.shuffleWriteMetrics + } + def build: ShuffleWriteMetricDistributions = new ShuffleWriteMetricDistributions( + writeBytes = submetricQuantiles(_.shuffleBytesWritten), + writeRecords = submetricQuantiles(_.shuffleRecordsWritten), + writeTime = submetricQuantiles(_.shuffleWriteTime) + ) + }.metricOption + + new TaskMetricDistributions( + quantiles = quantiles, + executorDeserializeTime = metricQuantiles(_.executorDeserializeTime), + executorRunTime = metricQuantiles(_.executorRunTime), + resultSize = metricQuantiles(_.resultSize), + jvmGcTime = metricQuantiles(_.jvmGCTime), + resultSerializationTime = metricQuantiles(_.resultSerializationTime), + memoryBytesSpilled = metricQuantiles(_.memoryBytesSpilled), + diskBytesSpilled = metricQuantiles(_.diskBytesSpilled), + inputMetrics = inputMetrics, + outputMetrics = outputMetrics, + shuffleReadMetrics = shuffleReadMetrics, + shuffleWriteMetrics = shuffleWriteMetrics + ) + } + + def convertAccumulableInfo(acc: InternalAccumulableInfo): AccumulableInfo = { + new AccumulableInfo(acc.id, acc.name, acc.update, acc.value) + } + + def convertUiTaskMetrics(internal: InternalTaskMetrics): TaskMetrics = { + new TaskMetrics( + executorDeserializeTime = internal.executorDeserializeTime, + executorRunTime = internal.executorRunTime, + resultSize = internal.resultSize, + jvmGcTime = internal.jvmGCTime, + resultSerializationTime = internal.resultSerializationTime, + memoryBytesSpilled = internal.memoryBytesSpilled, + diskBytesSpilled = internal.diskBytesSpilled, + inputMetrics = internal.inputMetrics.map { 
convertInputMetrics }, + outputMetrics = Option(internal.outputMetrics).flatten.map { convertOutputMetrics }, + shuffleReadMetrics = internal.shuffleReadMetrics.map { convertShuffleReadMetrics }, + shuffleWriteMetrics = internal.shuffleWriteMetrics.map { convertShuffleWriteMetrics } + ) + } + + def convertInputMetrics(internal: InternalInputMetrics): InputMetrics = { + new InputMetrics( + bytesRead = internal.bytesRead, + recordsRead = internal.recordsRead + ) + } + + def convertOutputMetrics(internal: InternalOutputMetrics): OutputMetrics = { + new OutputMetrics( + bytesWritten = internal.bytesWritten, + recordsWritten = internal.recordsWritten + ) + } + + def convertShuffleReadMetrics(internal: InternalShuffleReadMetrics): ShuffleReadMetrics = { + new ShuffleReadMetrics( + remoteBlocksFetched = internal.remoteBlocksFetched, + localBlocksFetched = internal.localBlocksFetched, + fetchWaitTime = internal.fetchWaitTime, + remoteBytesRead = internal.remoteBytesRead, + totalBlocksFetched = internal.totalBlocksFetched, + recordsRead = internal.recordsRead + ) + } + + def convertShuffleWriteMetrics(internal: InternalShuffleWriteMetrics): ShuffleWriteMetrics = { + new ShuffleWriteMetrics( + bytesWritten = internal.shuffleBytesWritten, + writeTime = internal.shuffleWriteTime, + recordsWritten = internal.shuffleRecordsWritten + ) + } +} + +/** + * Helper for getting distributions from nested metric types. Many of the metrics we want are + * contained in options inside TaskMetrics (eg., ShuffleWriteMetrics). This makes it easy to handle + * the options (returning None if the metrics are all empty), and extract the quantiles for each + * metric. After creating an instance, call metricOption to get the result type. + */ +private[v1] abstract class MetricHelper[I,O]( + rawMetrics: Seq[InternalTaskMetrics], + quantiles: Array[Double]) { + + def getSubmetrics(raw: InternalTaskMetrics): Option[I] + + def build: O + + val data: Seq[I] = rawMetrics.flatMap(getSubmetrics) + + /** applies the given function to all input metrics, and returns the quantiles */ + def submetricQuantiles(f: I => Double): IndexedSeq[Double] = { + Distribution(data.map { d => f(d) }).get.getQuantiles(quantiles) + } + + def metricOption: Option[O] = { + if (data.isEmpty) { + None + } else { + Some(build) + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala new file mode 100644 index 0000000000000..17b521f3e1d41 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status.api.v1 + +import java.util.{Arrays, Date, List => JList} +import javax.ws.rs.{DefaultValue, GET, Produces, QueryParam} +import javax.ws.rs.core.MediaType + +import org.apache.spark.deploy.history.ApplicationHistoryInfo +import org.apache.spark.deploy.master.{ApplicationInfo => InternalApplicationInfo} + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class ApplicationListResource(uiRoot: UIRoot) { + + @GET + def appList( + @QueryParam("status") status: JList[ApplicationStatus], + @DefaultValue("2010-01-01") @QueryParam("minDate") minDate: SimpleDateParam, + @DefaultValue("3000-01-01") @QueryParam("maxDate") maxDate: SimpleDateParam) + : Iterator[ApplicationInfo] = { + val allApps = uiRoot.getApplicationInfoList + val adjStatus = { + if (status.isEmpty) { + Arrays.asList(ApplicationStatus.values(): _*) + } else { + status + } + } + val includeCompleted = adjStatus.contains(ApplicationStatus.COMPLETED) + val includeRunning = adjStatus.contains(ApplicationStatus.RUNNING) + allApps.filter { app => + val anyRunning = app.attempts.exists(!_.completed) + // if any attempt is still running, we consider the app to also still be running + val statusOk = (!anyRunning && includeCompleted) || + (anyRunning && includeRunning) + // keep the app if *any* attempts fall in the right time window + val dateOk = app.attempts.exists { attempt => + attempt.startTime.getTime >= minDate.timestamp && + attempt.startTime.getTime <= maxDate.timestamp + } + statusOk && dateOk + } + } +} + +private[spark] object ApplicationsListResource { + def appHistoryInfoToPublicAppInfo(app: ApplicationHistoryInfo): ApplicationInfo = { + new ApplicationInfo( + id = app.id, + name = app.name, + attempts = app.attempts.map { internalAttemptInfo => + new ApplicationAttemptInfo( + attemptId = internalAttemptInfo.attemptId, + startTime = new Date(internalAttemptInfo.startTime), + endTime = new Date(internalAttemptInfo.endTime), + sparkUser = internalAttemptInfo.sparkUser, + completed = internalAttemptInfo.completed + ) + } + ) + } + + def convertApplicationInfo( + internal: InternalApplicationInfo, + completed: Boolean): ApplicationInfo = { + // standalone application info always has just one attempt + new ApplicationInfo( + id = internal.id, + name = internal.desc.name, + attempts = Seq(new ApplicationAttemptInfo( + attemptId = None, + startTime = new Date(internal.startTime), + endTime = new Date(internal.endTime), + sparkUser = internal.desc.user, + completed = completed + )) + ) + } + +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ExecutorListResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ExecutorListResource.scala new file mode 100644 index 0000000000000..8ad4656b4dada --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ExecutorListResource.scala @@ -0,0 +1,36 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.{GET, PathParam, Produces} +import javax.ws.rs.core.MediaType + +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.exec.ExecutorsPage + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class ExecutorListResource(ui: SparkUI) { + + @GET + def executorList(): Seq[ExecutorSummary] = { + val listener = ui.executorsListener + val storageStatusList = listener.storageStatusList + (0 until storageStatusList.size).map { statusId => + ExecutorsPage.getExecInfo(listener, statusId) + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala new file mode 100644 index 0000000000000..202a5191ad57d --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import java.io.OutputStream +import java.lang.annotation.Annotation +import java.lang.reflect.Type +import java.text.SimpleDateFormat +import java.util.{Calendar, SimpleTimeZone} +import javax.ws.rs.Produces +import javax.ws.rs.core.{MediaType, MultivaluedMap} +import javax.ws.rs.ext.{MessageBodyWriter, Provider} + +import com.fasterxml.jackson.annotation.JsonInclude +import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} + +/** + * This class converts the POJO metric responses into json, using jackson. + * + * This doesn't follow the standard jersey-jackson plugin options, because we want to stick + * with an old version of jersey (since we have it from yarn anyway) and don't want to pull in lots + * of dependencies from a new plugin. + * + * Note that jersey automatically discovers this class based on its package and its annotations. 
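+ *
+ * As a rough illustration (values borrowed from the golden files added in this patch), the
+ * mapper configured below pretty-prints, drops null fields, and pins dates to GMT, so a
+ * serialized attempt looks like:
+ *
+ * {{{
+ *   { "startTime" : "2015-05-06T13:03:00.893GMT", "sparkUser" : "irashid", "completed" : true }
+ * }}}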
+ */ +@Provider +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class JacksonMessageWriter extends MessageBodyWriter[Object]{ + + val mapper = new ObjectMapper() { + override def writeValueAsString(t: Any): String = { + super.writeValueAsString(t) + } + } + mapper.registerModule(com.fasterxml.jackson.module.scala.DefaultScalaModule) + mapper.enable(SerializationFeature.INDENT_OUTPUT) + mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL) + mapper.setDateFormat(JacksonMessageWriter.makeISODateFormat) + + override def isWriteable( + aClass: Class[_], + `type`: Type, + annotations: Array[Annotation], + mediaType: MediaType): Boolean = { + true + } + + override def writeTo( + t: Object, + aClass: Class[_], + `type`: Type, + annotations: Array[Annotation], + mediaType: MediaType, + multivaluedMap: MultivaluedMap[String, AnyRef], + outputStream: OutputStream): Unit = { + t match { + case ErrorWrapper(err) => outputStream.write(err.getBytes("utf-8")) + case _ => mapper.writeValue(outputStream, t) + } + } + + override def getSize( + t: Object, + aClass: Class[_], + `type`: Type, + annotations: Array[Annotation], + mediaType: MediaType): Long = { + -1L + } +} + +private[spark] object JacksonMessageWriter { + def makeISODateFormat: SimpleDateFormat = { + val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'") + val cal = Calendar.getInstance(new SimpleTimeZone(0, "GMT")) + iso8601.setCalendar(cal) + iso8601 + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala new file mode 100644 index 0000000000000..c3ec45f54681b --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/JsonRootResource.scala @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import javax.servlet.ServletContext +import javax.ws.rs._ +import javax.ws.rs.core.{Context, Response} + +import com.sun.jersey.api.core.ResourceConfig +import com.sun.jersey.spi.container.servlet.ServletContainer +import org.eclipse.jetty.server.handler.ContextHandler +import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} + +import org.apache.spark.SecurityManager +import org.apache.spark.ui.SparkUI + +/** + * Main entry point for serving spark application metrics as json, using JAX-RS. + * + * Each resource should have endpoints that return **public** classes defined in api.scala. Mima + * binary compatibility checks ensure that we don't inadvertently make changes that break the api. + * The returned objects are automatically converted to json by jackson with JacksonMessageWriter. 
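+ * For example (paths follow the @Path annotations declared below; shown here only as an
+ * illustration), the servlet registered at /json serves urls such as:
+ *
+ * {{{
+ *   /json/v1/applications
+ *   /json/v1/applications/{appId}/jobs
+ *   /json/v1/applications/{appId}/{attemptId}/stages
+ * }}}
+ *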
+ * In addition, there are a number of tests in HistoryServerSuite that compare the json to "golden + * files". Any changes and additions should be reflected there as well -- see the notes in + * HistoryServerSuite. + */ +@Path("/v1") +private[v1] class JsonRootResource extends UIRootFromServletContext { + + @Path("applications") + def getApplicationList(): ApplicationListResource = { + new ApplicationListResource(uiRoot) + } + + @Path("applications/{appId}") + def getApplication(): OneApplicationResource = { + new OneApplicationResource(uiRoot) + } + + @Path("applications/{appId}/{attemptId}/jobs") + def getJobs( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): AllJobsResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new AllJobsResource(ui) + } + } + + @Path("applications/{appId}/jobs") + def getJobs(@PathParam("appId") appId: String): AllJobsResource = { + uiRoot.withSparkUI(appId, None) { ui => + new AllJobsResource(ui) + } + } + + @Path("applications/{appId}/jobs/{jobId: \\d+}") + def getJob(@PathParam("appId") appId: String): OneJobResource = { + uiRoot.withSparkUI(appId, None) { ui => + new OneJobResource(ui) + } + } + + @Path("applications/{appId}/{attemptId}/jobs/{jobId: \\d+}") + def getJob( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): OneJobResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new OneJobResource(ui) + } + } + + @Path("applications/{appId}/executors") + def getExecutors(@PathParam("appId") appId: String): ExecutorListResource = { + uiRoot.withSparkUI(appId, None) { ui => + new ExecutorListResource(ui) + } + } + + @Path("applications/{appId}/{attemptId}/executors") + def getExecutors( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): ExecutorListResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new ExecutorListResource(ui) + } + } + + + @Path("applications/{appId}/stages") + def getStages(@PathParam("appId") appId: String): AllStagesResource= { + uiRoot.withSparkUI(appId, None) { ui => + new AllStagesResource(ui) + } + } + + @Path("applications/{appId}/{attemptId}/stages") + def getStages( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): AllStagesResource= { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new AllStagesResource(ui) + } + } + + @Path("applications/{appId}/stages/{stageId: \\d+}") + def getStage(@PathParam("appId") appId: String): OneStageResource= { + uiRoot.withSparkUI(appId, None) { ui => + new OneStageResource(ui) + } + } + + @Path("applications/{appId}/{attemptId}/stages/{stageId: \\d+}") + def getStage( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): OneStageResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new OneStageResource(ui) + } + } + + @Path("applications/{appId}/storage/rdd") + def getRdds(@PathParam("appId") appId: String): AllRDDResource = { + uiRoot.withSparkUI(appId, None) { ui => + new AllRDDResource(ui) + } + } + + @Path("applications/{appId}/{attemptId}/storage/rdd") + def getRdds( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): AllRDDResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new AllRDDResource(ui) + } + } + + @Path("applications/{appId}/storage/rdd/{rddId: \\d+}") + def getRdd(@PathParam("appId") appId: String): OneRDDResource = { + uiRoot.withSparkUI(appId, None) { ui => + new OneRDDResource(ui) + } + } + + 
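+  // Each resource is registered twice: without an attemptId (above) for single-attempt
+  // applications, and with one (below) for applications that have multiple attempts, e.g.
+  // /json/v1/applications/local-1426533911241/2/storage/rdd/0 (illustrative url only).
+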
@Path("applications/{appId}/{attemptId}/storage/rdd/{rddId: \\d+}") + def getRdd( + @PathParam("appId") appId: String, + @PathParam("attemptId") attemptId: String): OneRDDResource = { + uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + new OneRDDResource(ui) + } + } + +} + +private[spark] object JsonRootResource { + + def getJsonServlet(uiRoot: UIRoot): ServletContextHandler = { + val jerseyContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS) + jerseyContext.setContextPath("/json") + val holder:ServletHolder = new ServletHolder(classOf[ServletContainer]) + holder.setInitParameter("com.sun.jersey.config.property.resourceConfigClass", + "com.sun.jersey.api.core.PackagesResourceConfig") + holder.setInitParameter("com.sun.jersey.config.property.packages", + "org.apache.spark.status.api.v1") + holder.setInitParameter(ResourceConfig.PROPERTY_CONTAINER_REQUEST_FILTERS, + classOf[SecurityFilter].getCanonicalName) + UIRootFromServletContext.setUiRoot(jerseyContext, uiRoot) + jerseyContext.addServlet(holder, "/*") + jerseyContext + } +} + +/** + * This trait is shared by the all the root containers for application UI information -- + * the HistoryServer, the Master UI, and the application UI. This provides the common + * interface needed for them all to expose application info as json. + */ +private[spark] trait UIRoot { + def getSparkUI(appKey: String): Option[SparkUI] + def getApplicationInfoList: Iterator[ApplicationInfo] + + /** + * Get the spark UI with the given appID, and apply a function + * to it. If there is no such app, throw an appropriate exception + */ + def withSparkUI[T](appId: String, attemptId: Option[String])(f: SparkUI => T): T = { + val appKey = attemptId.map(appId + "/" + _).getOrElse(appId) + getSparkUI(appKey) match { + case Some(ui) => + f(ui) + case None => throw new NotFoundException("no such app: " + appId) + } + } + def securityManager: SecurityManager +} + +private[v1] object UIRootFromServletContext { + + private val attribute = getClass.getCanonicalName + + def setUiRoot(contextHandler: ContextHandler, uiRoot: UIRoot): Unit = { + contextHandler.setAttribute(attribute, uiRoot) + } + + def getUiRoot(context: ServletContext): UIRoot = { + context.getAttribute(attribute).asInstanceOf[UIRoot] + } +} + +private[v1] trait UIRootFromServletContext { + @Context + var servletContext: ServletContext = _ + + def uiRoot: UIRoot = UIRootFromServletContext.getUiRoot(servletContext) +} + +private[v1] class NotFoundException(msg: String) extends WebApplicationException( + new NoSuchElementException(msg), + Response + .status(Response.Status.NOT_FOUND) + .entity(ErrorWrapper(msg)) + .build() +) + +private[v1] class BadParameterException(msg: String) extends WebApplicationException( + new IllegalArgumentException(msg), + Response + .status(Response.Status.BAD_REQUEST) + .entity(ErrorWrapper(msg)) + .build() +) { + def this(param: String, exp: String, actual: String) = { + this(raw"""Bad value for parameter "$param". Expected a $exp, got "$actual"""") + } +} + +/** + * Signal to JacksonMessageWriter to not convert the message into json (which would result in an + * extra set of quotes). 
+ */ +private[v1] case class ErrorWrapper(s: String) diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala new file mode 100644 index 0000000000000..b5ef72649e295 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.core.MediaType +import javax.ws.rs.{Produces, PathParam, GET} + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class OneApplicationResource(uiRoot: UIRoot) { + + @GET + def getApp(@PathParam("appId") appId: String): ApplicationInfo = { + val apps = uiRoot.getApplicationInfoList.find { _.id == appId } + apps.getOrElse(throw new NotFoundException("unknown app: " + appId)) + } + +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneJobResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneJobResource.scala new file mode 100644 index 0000000000000..6d8a60d480aed --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneJobResource.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.{PathParam, GET, Produces} +import javax.ws.rs.core.MediaType + +import org.apache.spark.JobExecutionStatus +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.jobs.UIData.JobUIData + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class OneJobResource(ui: SparkUI) { + + @GET + def oneJob(@PathParam("jobId") jobId: Int): JobData = { + val statusToJobs: Seq[(JobExecutionStatus, Seq[JobUIData])] = + AllJobsResource.getStatusToJobs(ui) + val jobOpt = statusToJobs.map {_._2} .flatten.find { jobInfo => jobInfo.jobId == jobId} + jobOpt.map { job => + AllJobsResource.convertJobData(job, ui.jobProgressListener, false) + }.getOrElse { + throw new NotFoundException("unknown job: " + jobId) + } + } + +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala new file mode 100644 index 0000000000000..07b224fac4786 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneRDDResource.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.{PathParam, GET, Produces} +import javax.ws.rs.core.MediaType + +import org.apache.spark.ui.SparkUI + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class OneRDDResource(ui: SparkUI) { + + @GET + def rddData(@PathParam("rddId") rddId: Int): RDDStorageInfo = { + AllRDDResource.getRDDStorageInfo(rddId, ui.storageListener, true).getOrElse( + throw new NotFoundException(s"no rdd found w/ id $rddId") + ) + } + +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala new file mode 100644 index 0000000000000..fd24aea63a8a1 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneStageResource.scala @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs._ +import javax.ws.rs.core.MediaType + +import org.apache.spark.SparkException +import org.apache.spark.scheduler.StageInfo +import org.apache.spark.status.api.v1.StageStatus._ +import org.apache.spark.status.api.v1.TaskSorting._ +import org.apache.spark.ui.SparkUI +import org.apache.spark.ui.jobs.JobProgressListener +import org.apache.spark.ui.jobs.UIData.StageUIData + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class OneStageResource(ui: SparkUI) { + + @GET + @Path("") + def stageData(@PathParam("stageId") stageId: Int): Seq[StageData] = { + withStage(stageId){ stageAttempts => + stageAttempts.map { stage => + AllStagesResource.stageUiToStageData(stage.status, stage.info, stage.ui, + includeDetails = true) + } + } + } + + @GET + @Path("/{stageAttemptId: \\d+}") + def oneAttemptData( + @PathParam("stageId") stageId: Int, + @PathParam("stageAttemptId") stageAttemptId: Int): StageData = { + withStageAttempt(stageId, stageAttemptId) { stage => + AllStagesResource.stageUiToStageData(stage.status, stage.info, stage.ui, + includeDetails = true) + } + } + + @GET + @Path("/{stageAttemptId: \\d+}/taskSummary") + def taskSummary( + @PathParam("stageId") stageId: Int, + @PathParam("stageAttemptId") stageAttemptId: Int, + @DefaultValue("0.05,0.25,0.5,0.75,0.95") @QueryParam("quantiles") quantileString: String) + : TaskMetricDistributions = { + withStageAttempt(stageId, stageAttemptId) { stage => + val quantiles = quantileString.split(",").map { s => + try { + s.toDouble + } catch { + case nfe: NumberFormatException => + throw new BadParameterException("quantiles", "double", s) + } + } + AllStagesResource.taskMetricDistributions(stage.ui.taskData.values, quantiles) + } + } + + @GET + @Path("/{stageAttemptId: \\d+}/taskList") + def taskList( + @PathParam("stageId") stageId: Int, + @PathParam("stageAttemptId") stageAttemptId: Int, + @DefaultValue("0") @QueryParam("offset") offset: Int, + @DefaultValue("20") @QueryParam("length") length: Int, + @DefaultValue("ID") @QueryParam("sortBy") sortBy: TaskSorting): Seq[TaskData] = { + withStageAttempt(stageId, stageAttemptId) { stage => + val tasks = stage.ui.taskData.values.map{AllStagesResource.convertTaskData}.toIndexedSeq + .sorted(OneStageResource.ordering(sortBy)) + tasks.slice(offset, offset + length) + } + } + + private case class StageStatusInfoUi(status: StageStatus, info: StageInfo, ui: StageUIData) + + private def withStage[T](stageId: Int)(f: Seq[StageStatusInfoUi] => T): T = { + val stageAttempts = findStageStatusUIData(ui.jobProgressListener, stageId) + if (stageAttempts.isEmpty) { + throw new NotFoundException("unknown stage: " + stageId) + } else { + f(stageAttempts) + } + } + + private def findStageStatusUIData( + listener: JobProgressListener, + stageId: Int): Seq[StageStatusInfoUi] = { + listener.synchronized { + def getStatusInfoUi(status: StageStatus, infos: Seq[StageInfo]): Seq[StageStatusInfoUi] = { + infos.filter { _.stageId == stageId }.map { info => + val ui = listener.stageIdToData.getOrElse((info.stageId, info.attemptId), + // this is an internal error -- we should always have uiData + throw new SparkException( + s"no stage ui data found for stage: ${info.stageId}:${info.attemptId}") + ) + StageStatusInfoUi(status, info, ui) + } + } + getStatusInfoUi(ACTIVE, listener.activeStages.values.toSeq) ++ + getStatusInfoUi(COMPLETE, listener.completedStages) ++ + getStatusInfoUi(FAILED, listener.failedStages) ++ + getStatusInfoUi(PENDING, 
listener.pendingStages.values.toSeq) + } + } + + private def withStageAttempt[T]( + stageId: Int, + stageAttemptId: Int) + (f: StageStatusInfoUi => T): T = { + withStage(stageId) { attempts => + val oneAttempt = attempts.find { stage => stage.info.attemptId == stageAttemptId } + oneAttempt match { + case Some(stage) => + f(stage) + case None => + val stageAttempts = attempts.map { _.info.attemptId } + throw new NotFoundException(s"unknown attempt for stage $stageId. " + + s"Found attempts: ${stageAttempts.mkString("[", ",", "]")}") + } + } + } +} + +object OneStageResource { + def ordering(taskSorting: TaskSorting): Ordering[TaskData] = { + val extractor: (TaskData => Long) = td => + taskSorting match { + case ID => td.taskId + case INCREASING_RUNTIME => td.taskMetrics.map{_.executorRunTime}.getOrElse(-1L) + case DECREASING_RUNTIME => -td.taskMetrics.map{_.executorRunTime}.getOrElse(-1L) + } + Ordering.by(extractor) + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala new file mode 100644 index 0000000000000..95fbd96ade5ab --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs.WebApplicationException +import javax.ws.rs.core.Response + +import com.sun.jersey.spi.container.{ContainerRequest, ContainerRequestFilter} + +private[v1] class SecurityFilter extends ContainerRequestFilter with UIRootFromServletContext { + def filter(req: ContainerRequest): ContainerRequest = { + val user = Option(req.getUserPrincipal).map { _.getName }.orNull + if (uiRoot.securityManager.checkUIViewPermissions(user)) { + req + } else { + throw new WebApplicationException( + Response + .status(Response.Status.FORBIDDEN) + .entity(raw"""user "$user"is not authorized""") + .build() + ) + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala new file mode 100644 index 0000000000000..cee29786c3019 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import java.text.SimpleDateFormat +import java.util.TimeZone +import javax.ws.rs.WebApplicationException +import javax.ws.rs.core.Response +import javax.ws.rs.core.Response.Status + +import scala.util.Try + +private[v1] class SimpleDateParam(val originalValue: String) { + val timestamp: Long = { + SimpleDateParam.formats.collectFirst { + case fmt if Try(fmt.parse(originalValue)).isSuccess => + fmt.parse(originalValue).getTime() + }.getOrElse( + throw new WebApplicationException( + Response + .status(Status.BAD_REQUEST) + .entity("Couldn't parse date: " + originalValue) + .build() + ) + ) + } +} + +private[v1] object SimpleDateParam { + + val formats: Seq[SimpleDateFormat] = { + + val gmtDay = new SimpleDateFormat("yyyy-MM-dd") + gmtDay.setTimeZone(TimeZone.getTimeZone("GMT")) + + Seq( + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz"), + gmtDay + ) + } +} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala new file mode 100644 index 0000000000000..ef3c8570d8186 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status.api.v1 + +import java.util.Date + +import scala.collection.Map + +import org.apache.spark.JobExecutionStatus + +class ApplicationInfo private[spark]( + val id: String, + val name: String, + val attempts: Seq[ApplicationAttemptInfo]) + +class ApplicationAttemptInfo private[spark]( + val attemptId: Option[String], + val startTime: Date, + val endTime: Date, + val sparkUser: String, + val completed: Boolean = false) + +class ExecutorStageSummary private[spark]( + val taskTime : Long, + val failedTasks : Int, + val succeededTasks : Int, + val inputBytes : Long, + val outputBytes : Long, + val shuffleRead : Long, + val shuffleWrite : Long, + val memoryBytesSpilled : Long, + val diskBytesSpilled : Long) + +class ExecutorSummary private[spark]( + val id: String, + val hostPort: String, + val rddBlocks: Int, + val memoryUsed: Long, + val diskUsed: Long, + val activeTasks: Int, + val failedTasks: Int, + val completedTasks: Int, + val totalTasks: Int, + val totalDuration: Long, + val totalInputBytes: Long, + val totalShuffleRead: Long, + val totalShuffleWrite: Long, + val maxMemory: Long, + val executorLogs: Map[String, String]) + +class JobData private[spark]( + val jobId: Int, + val name: String, + val description: Option[String], + val submissionTime: Option[Date], + val completionTime: Option[Date], + val stageIds: Seq[Int], + val jobGroup: Option[String], + val status: JobExecutionStatus, + val numTasks: Int, + val numActiveTasks: Int, + val numCompletedTasks: Int, + val numSkippedTasks: Int, + val numFailedTasks: Int, + val numActiveStages: Int, + val numCompletedStages: Int, + val numSkippedStages: Int, + val numFailedStages: Int) + +// Q: should Tachyon size go in here as well? currently the UI only shows it on the overall storage +// page ... does anybody pay attention to it? 
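+
+// These are deliberately simple classes exposing public vals: jackson serializes them
+// directly (via JacksonMessageWriter), and keeping them public lets the Mima binary
+// compatibility checks catch incompatible changes to the api.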
+class RDDStorageInfo private[spark]( + val id: Int, + val name: String, + val numPartitions: Int, + val numCachedPartitions: Int, + val storageLevel: String, + val memoryUsed: Long, + val diskUsed: Long, + val dataDistribution: Option[Seq[RDDDataDistribution]], + val partitions: Option[Seq[RDDPartitionInfo]]) + +class RDDDataDistribution private[spark]( + val address: String, + val memoryUsed: Long, + val memoryRemaining: Long, + val diskUsed: Long) + +class RDDPartitionInfo private[spark]( + val blockName: String, + val storageLevel: String, + val memoryUsed: Long, + val diskUsed: Long, + val executors: Seq[String]) + +class StageData private[spark]( + val status: StageStatus, + val stageId: Int, + val attemptId: Int, + val numActiveTasks: Int , + val numCompleteTasks: Int, + val numFailedTasks: Int, + + val executorRunTime: Long, + + val inputBytes: Long, + val inputRecords: Long, + val outputBytes: Long, + val outputRecords: Long, + val shuffleReadBytes: Long, + val shuffleReadRecords: Long, + val shuffleWriteBytes: Long, + val shuffleWriteRecords: Long, + val memoryBytesSpilled: Long, + val diskBytesSpilled: Long, + + val name: String, + val details: String, + val schedulingPool: String, + + val accumulatorUpdates: Seq[AccumulableInfo], + val tasks: Option[Map[Long, TaskData]], + val executorSummary:Option[Map[String,ExecutorStageSummary]]) + +class TaskData private[spark]( + val taskId: Long, + val index: Int, + val attempt: Int, + val launchTime: Date, + val executorId: String, + val host: String, + val taskLocality: String, + val speculative: Boolean, + val accumulatorUpdates: Seq[AccumulableInfo], + val errorMessage: Option[String] = None, + val taskMetrics: Option[TaskMetrics] = None) + +class TaskMetrics private[spark]( + val executorDeserializeTime: Long, + val executorRunTime: Long, + val resultSize: Long, + val jvmGcTime: Long, + val resultSerializationTime: Long, + val memoryBytesSpilled: Long, + val diskBytesSpilled: Long, + val inputMetrics: Option[InputMetrics], + val outputMetrics: Option[OutputMetrics], + val shuffleReadMetrics: Option[ShuffleReadMetrics], + val shuffleWriteMetrics: Option[ShuffleWriteMetrics]) + +class InputMetrics private[spark]( + val bytesRead: Long, + val recordsRead: Long) + +class OutputMetrics private[spark]( + val bytesWritten: Long, + val recordsWritten: Long) + +class ShuffleReadMetrics private[spark]( + val remoteBlocksFetched: Int, + val localBlocksFetched: Int, + val fetchWaitTime: Long, + val remoteBytesRead: Long, + val totalBlocksFetched: Int, + val recordsRead: Long) + +class ShuffleWriteMetrics private[spark]( + val bytesWritten: Long, + val writeTime: Long, + val recordsWritten: Long) + +class TaskMetricDistributions private[spark]( + val quantiles: IndexedSeq[Double], + + val executorDeserializeTime: IndexedSeq[Double], + val executorRunTime: IndexedSeq[Double], + val resultSize: IndexedSeq[Double], + val jvmGcTime: IndexedSeq[Double], + val resultSerializationTime: IndexedSeq[Double], + val memoryBytesSpilled: IndexedSeq[Double], + val diskBytesSpilled: IndexedSeq[Double], + + val inputMetrics: Option[InputMetricDistributions], + val outputMetrics: Option[OutputMetricDistributions], + val shuffleReadMetrics: Option[ShuffleReadMetricDistributions], + val shuffleWriteMetrics: Option[ShuffleWriteMetricDistributions]) + +class InputMetricDistributions private[spark]( + val bytesRead: IndexedSeq[Double], + val recordsRead: IndexedSeq[Double]) + +class OutputMetricDistributions private[spark]( + val bytesWritten: IndexedSeq[Double], + 
val recordsWritten: IndexedSeq[Double]) + +class ShuffleReadMetricDistributions private[spark]( + val readBytes: IndexedSeq[Double], + val readRecords: IndexedSeq[Double], + val remoteBlocksFetched: IndexedSeq[Double], + val localBlocksFetched: IndexedSeq[Double], + val fetchWaitTime: IndexedSeq[Double], + val remoteBytesRead: IndexedSeq[Double], + val totalBlocksFetched: IndexedSeq[Double]) + +class ShuffleWriteMetricDistributions private[spark]( + val writeBytes: IndexedSeq[Double], + val writeRecords: IndexedSeq[Double], + val writeTime: IndexedSeq[Double]) + +class AccumulableInfo private[spark]( + val id: Long, + val name: String, + val update: Option[String], + val value: String) diff --git a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala index 7d75929b96f75..ec711480ebf30 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala @@ -25,13 +25,17 @@ import org.apache.spark.scheduler._ /** * :: DeveloperApi :: * A SparkListener that maintains executor storage status. + * + * This class is thread-safe (unlike JobProgressListener) */ @DeveloperApi class StorageStatusListener extends SparkListener { // This maintains only blocks that are cached (i.e. storage level is not StorageLevel.NONE) private[storage] val executorIdToStorageStatus = mutable.Map[String, StorageStatus]() - def storageStatusList: Seq[StorageStatus] = executorIdToStorageStatus.values.toSeq + def storageStatusList: Seq[StorageStatus] = synchronized { + executorIdToStorageStatus.values.toSeq + } /** Update storage status list to reflect updated block statuses */ private def updateStorageStatus(execId: String, updatedBlocks: Seq[(BlockId, BlockStatus)]) { diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index a5271f0574e6c..bfe4a180e8a6f 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -17,6 +17,9 @@ package org.apache.spark.ui +import java.util.Date + +import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo, JsonRootResource, UIRoot} import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext} import org.apache.spark.scheduler._ import org.apache.spark.storage.StorageStatusListener @@ -33,7 +36,7 @@ import org.apache.spark.ui.scope.RDDOperationGraphListener private[spark] class SparkUI private ( val sc: Option[SparkContext], val conf: SparkConf, - val securityManager: SecurityManager, + securityManager: SecurityManager, val environmentListener: EnvironmentListener, val storageStatusListener: StorageStatusListener, val executorsListener: ExecutorsListener, @@ -41,22 +44,27 @@ private[spark] class SparkUI private ( val storageListener: StorageListener, val operationGraphListener: RDDOperationGraphListener, var appName: String, - val basePath: String) + val basePath: String, + val startTime: Long) extends WebUI(securityManager, SparkUI.getUIPort(conf), conf, basePath, "SparkUI") - with Logging { + with Logging + with UIRoot { val killEnabled = sc.map(_.conf.getBoolean("spark.ui.killEnabled", true)).getOrElse(false) + + val stagesTab = new StagesTab(this) + /** Initialize all components of the server. 
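+   * Along with the usual html tabs, this now also attaches the json api servlet (see
+   * JsonRootResource), so the same listener data backs both the web pages and the api.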
*/ def initialize() { attachTab(new JobsTab(this)) - val stagesTab = new StagesTab(this) attachTab(stagesTab) attachTab(new StorageTab(this)) attachTab(new EnvironmentTab(this)) attachTab(new ExecutorsTab(this)) attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static")) attachHandler(createRedirectHandler("/", "/jobs", basePath = basePath)) + attachHandler(JsonRootResource.getJsonServlet(this)) // This should be POST only, but, the YARN AM proxy won't proxy POSTs attachHandler(createRedirectHandler( "/stages/stage/kill", "/stages", stagesTab.handleKillRequest, @@ -83,6 +91,24 @@ private[spark] class SparkUI private ( private[spark] def appUIHostPort = publicHostName + ":" + boundPort private[spark] def appUIAddress = s"http://$appUIHostPort" + + def getSparkUI(appId: String): Option[SparkUI] = { + if (appId == appName) Some(this) else None + } + + def getApplicationInfoList: Iterator[ApplicationInfo] = { + Iterator(new ApplicationInfo( + id = appName, + name = appName, + attempts = Seq(new ApplicationAttemptInfo( + attemptId = None, + startTime = new Date(startTime), + endTime = new Date(-1), + sparkUser = "", + completed = false + )) + )) + } } private[spark] abstract class SparkUITab(parent: SparkUI, prefix: String) @@ -109,9 +135,10 @@ private[spark] object SparkUI { listenerBus: SparkListenerBus, jobProgressListener: JobProgressListener, securityManager: SecurityManager, - appName: String): SparkUI = { + appName: String, + startTime: Long): SparkUI = { create(Some(sc), conf, listenerBus, securityManager, appName, - jobProgressListener = Some(jobProgressListener)) + jobProgressListener = Some(jobProgressListener), startTime = startTime) } def createHistoryUI( @@ -119,8 +146,9 @@ private[spark] object SparkUI { listenerBus: SparkListenerBus, securityManager: SecurityManager, appName: String, - basePath: String): SparkUI = { - create(None, conf, listenerBus, securityManager, appName, basePath) + basePath: String, + startTime: Long): SparkUI = { + create(None, conf, listenerBus, securityManager, appName, basePath, startTime = startTime) } /** @@ -137,7 +165,8 @@ private[spark] object SparkUI { securityManager: SecurityManager, appName: String, basePath: String = "", - jobProgressListener: Option[JobProgressListener] = None): SparkUI = { + jobProgressListener: Option[JobProgressListener] = None, + startTime: Long): SparkUI = { val _jobProgressListener: JobProgressListener = jobProgressListener.getOrElse { val listener = new JobProgressListener(conf) @@ -159,6 +188,6 @@ private[spark] object SparkUI { new SparkUI(sc, conf, securityManager, environmentListener, storageStatusListener, executorsListener, _jobProgressListener, storageListener, operationGraphListener, - appName, basePath) + appName, basePath, startTime) } } diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala index f9860d1a5ce76..384f2ad26e281 100644 --- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala @@ -37,7 +37,7 @@ import org.apache.spark.{Logging, SecurityManager, SparkConf} * pages. The use of tabs is optional, however; a WebUI may choose to include pages directly. 
*/ private[spark] abstract class WebUI( - securityManager: SecurityManager, + val securityManager: SecurityManager, port: Int, conf: SparkConf, basePath: String = "", @@ -77,15 +77,9 @@ private[spark] abstract class WebUI( val pagePath = "/" + page.prefix val renderHandler = createServletHandler(pagePath, (request: HttpServletRequest) => page.render(request), securityManager, basePath) - val renderJsonHandler = createServletHandler(pagePath.stripSuffix("/") + "/json", - (request: HttpServletRequest) => page.renderJson(request), securityManager, basePath) attachHandler(renderHandler) - attachHandler(renderJsonHandler) pageToHandlers.getOrElseUpdate(page, ArrayBuffer[ServletContextHandler]()) .append(renderHandler) - pageToHandlers.getOrElseUpdate(page, ArrayBuffer[ServletContextHandler]()) - .append(renderJsonHandler) - } /** Attach a handler to this UI. */ diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 956608d7c0cbe..b247e4cdc3bd4 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -22,11 +22,11 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node +import org.apache.spark.status.api.v1.ExecutorSummary import org.apache.spark.ui.{ToolTips, UIUtils, WebUIPage} import org.apache.spark.util.Utils -/** Summary information about an executor to display in the UI. */ -// Needs to be private[ui] because of a false positive MiMa failure. +// This isn't even used anymore -- but we need to keep it b/c of a MiMa false positive private[ui] case class ExecutorSummaryInfo( id: String, hostPort: String, @@ -44,6 +44,7 @@ private[ui] case class ExecutorSummaryInfo( maxMemory: Long, executorLogs: Map[String, String]) + private[ui] class ExecutorsPage( parent: ExecutorsTab, threadDumpEnabled: Boolean) @@ -55,7 +56,8 @@ private[ui] class ExecutorsPage( val maxMem = storageStatusList.map(_.maxMem).sum val memUsed = storageStatusList.map(_.memUsed).sum val diskUsed = storageStatusList.map(_.diskUsed).sum - val execInfo = for (statusId <- 0 until storageStatusList.size) yield getExecInfo(statusId) + val execInfo = for (statusId <- 0 until storageStatusList.size) yield + ExecutorsPage.getExecInfo(listener, statusId) val execInfoSorted = execInfo.sortBy(_.id) val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty @@ -111,7 +113,7 @@ private[ui] class ExecutorsPage( } /** Render an HTML row representing an executor */ - private def execRow(info: ExecutorSummaryInfo, logsExist: Boolean): Seq[Node] = { + private def execRow(info: ExecutorSummary, logsExist: Boolean): Seq[Node] = { val maximumMemory = info.maxMemory val memoryUsed = info.memoryUsed val diskUsed = info.diskUsed @@ -170,8 +172,11 @@ private[ui] class ExecutorsPage( } +} + +private[spark] object ExecutorsPage { /** Represent an executor's info as a map given a storage status index */ - private def getExecInfo(statusId: Int): ExecutorSummaryInfo = { + def getExecInfo(listener: ExecutorsListener, statusId: Int): ExecutorSummary = { val status = listener.storageStatusList(statusId) val execId = status.blockManagerId.executorId val hostPort = status.blockManagerId.hostPort @@ -189,7 +194,7 @@ private[ui] class ExecutorsPage( val totalShuffleWrite = listener.executorToShuffleWrite.getOrElse(execId, 0L) val executorLogs = listener.executorToLogUrls.getOrElse(execId, Map.empty) - new ExecutorSummaryInfo( + new ExecutorSummary( execId, 
hostPort, rddBlocks, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index f6abf27db49dd..09323d1d80ad6 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -271,6 +271,12 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { val shouldShowCompletedJobs = completedJobs.nonEmpty val shouldShowFailedJobs = failedJobs.nonEmpty + val completedJobNumStr = if (completedJobs.size == listener.numCompletedJobs) { + s"${completedJobs.size}" + } else { + s"${listener.numCompletedJobs}, only showing ${completedJobs.size}" + } + val summary: NodeSeq =
      <div>
        <ul class="unstyled">
@@ -295,9 +301,9 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") {
          }
          {
            if (shouldShowCompletedJobs) {
-              <li>
+              <li id="completed-summary">
                <a href="#completed"><strong>Completed Jobs:</strong></a>
-                {completedJobs.size}
+                {completedJobNumStr}
              </li>
            }
          }
@@ -305,7 +311,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") {
            if (shouldShowFailedJobs) {
              <li>
                <a href="#failed"><strong>Failed Jobs:</strong></a>
-                {failedJobs.size}
+                {listener.numFailedJobs}
              </li>
            }
          }
@@ -322,7 +328,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") {
      activeJobsTable
    }
    if (shouldShowCompletedJobs) {
-      content ++= <h4 id="completed">Completed Jobs ({completedJobs.size})</h4> ++
+      content ++= <h4 id="completed">Completed Jobs ({completedJobNumStr})</h4> ++
      completedJobsTable
    }
    if (shouldShowFailedJobs) {
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
index 236bc8ea92879..a37f739ab9c66 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
@@ -64,6 +64,12 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
    val shouldShowCompletedStages = completedStages.nonEmpty
    val shouldShowFailedStages = failedStages.nonEmpty

+    val completedStageNumStr = if (numCompletedStages == completedStages.size) {
+      s"$numCompletedStages"
+    } else {
+      s"$numCompletedStages, only showing ${completedStages.size}"
+    }
+
    val summary: NodeSeq =
      <div>
        <ul class="unstyled">
@@ -98,9 +104,9 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
          }
          {
            if (shouldShowCompletedStages) {
-              <li>
+              <li id="completed-summary">
                <a href="#completed"><strong>Completed Stages:</strong></a>
-                {numCompletedStages}
+                {completedStageNumStr}
              </li>
            }
          }
@@ -132,7 +138,7 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
      pendingStagesTable.toNodeSeq
    }
    if (shouldShowCompletedStages) {
-      content ++= <h4 id="completed">Completed Stages ({numCompletedStages})</h4> ++
+      content ++= <h4 id="completed">Completed Stages ({completedStageNumStr})</h4> ++
      completedStagesTable.toNodeSeq
    }
    if (shouldShowFailedStages) {
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala
index 96cc3d78d0f15..7163217e1fed0 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala
@@ -187,7 +187,7 @@ private[ui] class JobPage(parent: JobsTab) extends WebUIPage("job") {
    val jobDataOption = listener.jobIdToData.get(jobId)
    if (jobDataOption.isEmpty) {
      val content =
-        <div>
+        <div id="no-info">
          <p>No information to display for job {jobId}</p>
        </div>
      return UIUtils.headerSparkPage(
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
index 8f9aa9fdec819..246e191d64776 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala
@@ -74,6 +74,8 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
  // JobProgressListener's retention limits.
  var numCompletedStages = 0
  var numFailedStages = 0
+  var numCompletedJobs = 0
+  var numFailedJobs = 0

  // Misc:
  val executorIdToBlockManagerId = HashMap[ExecutorId, BlockManagerId]()
@@ -217,10 +219,12 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
        completedJobs += jobData
        trimJobsIfNecessary(completedJobs)
        jobData.status = JobExecutionStatus.SUCCEEDED
+        numCompletedJobs += 1
      case JobFailed(exception) =>
        failedJobs += jobData
        trimJobsIfNecessary(failedJobs)
        jobData.status = JobExecutionStatus.FAILED
+        numFailedJobs += 1
    }
    for (stageId <- jobData.stageIds) {
      stageIdToActiveJobIds.get(stageId).foreach { jobsUsingStage =>
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/PoolPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/PoolPage.scala
index d725b9d8565ac..f3e0b38523f32 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/PoolPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/PoolPage.scala
@@ -21,7 +21,7 @@ import javax.servlet.http.HttpServletRequest

 import scala.xml.Node

-import org.apache.spark.scheduler.{Schedulable, StageInfo}
+import org.apache.spark.scheduler.StageInfo
 import org.apache.spark.ui.{WebUIPage, UIUtils}

 /** Page showing specific pool details */
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index 6c4305873cbd9..b01fad8e453c8 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -25,11 +25,11 @@ import scala.xml.{Elem, Node, Unparsed}
 import org.apache.commons.lang3.StringEscapeUtils

 import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo}
 import org.apache.spark.ui.{ToolTips, WebUIPage, UIUtils}
 import org.apache.spark.ui.jobs.UIData._
 import org.apache.spark.ui.scope.RDDOperationGraph
 import org.apache.spark.util.{Utils, Distribution}
-import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo}

 /** Page showing statistics and task list for a given stage */
 private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
@@ -52,14 +52,22 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
    val stageAttemptId = parameterAttempt.toInt
    val stageDataOption = progressListener.stageIdToData.get((stageId, stageAttemptId))

-    if (stageDataOption.isEmpty || stageDataOption.get.taskData.isEmpty) {
+    val stageHeader = s"Details for Stage $stageId (Attempt $stageAttemptId)"
+    if (stageDataOption.isEmpty) {
+      val content =
+        <div id="no-info">
+          <p>No information to display for Stage {stageId} (Attempt {stageAttemptId})</p>
+        </div>
+      return UIUtils.headerSparkPage(stageHeader, content, parent)
+
+    }
+    if (stageDataOption.get.taskData.isEmpty) {
      val content =
        <div>
          <h4>Summary Metrics</h4> No tasks have started yet
          <h4>Tasks</h4> No tasks have started yet
        </div>
-      return UIUtils.headerSparkPage(
-        s"Details for Stage $stageId (Attempt $stageAttemptId)", content, parent)
+      return UIUtils.headerSparkPage(stageHeader, content, parent)
    }

    val stageData = stageDataOption.get
@@ -458,8 +466,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
      maybeAccumulableTable ++
      <h4>Tasks</h4> ++ taskTable

-    UIUtils.headerSparkPage(
-      "Details for Stage %d".format(stageId), content, parent, showVisualization = true)
+    UIUtils.headerSparkPage(stageHeader, content, parent, showVisualization = true)
  }
}
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
index 199f731b92bcc..05f94a7507f4f 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
@@ -21,8 +21,8 @@ import javax.servlet.http.HttpServletRequest

 import scala.xml.Node

-import org.apache.spark.storage.{BlockId, BlockStatus, StorageStatus, StorageUtils}
-import org.apache.spark.ui.{WebUIPage, UIUtils}
+import org.apache.spark.status.api.v1.{AllRDDResource, RDDDataDistribution, RDDPartitionInfo}
+import org.apache.spark.ui.{UIUtils, WebUIPage}
 import org.apache.spark.util.Utils

 /** Page showing storage details for a given RDD */
@@ -32,28 +32,19 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
  def render(request: HttpServletRequest): Seq[Node] = {
    val parameterId = request.getParameter("id")
    require(parameterId != null && parameterId.nonEmpty, "Missing id parameter")
-
    val rddId = parameterId.toInt
-    val storageStatusList = listener.storageStatusList
-    val rddInfo = listener.rddInfoList.find(_.id == rddId).getOrElse {
-      // Rather than crashing, render an "RDD Not Found" page
-      return UIUtils.headerSparkPage("RDD Not Found", Seq[Node](), parent)
-    }
+    val rddStorageInfo = AllRDDResource.getRDDStorageInfo(rddId, listener, includeDetails = true)
+      .getOrElse {
+        // Rather than crashing, render an "RDD Not Found" page
+        return UIUtils.headerSparkPage("RDD Not Found", Seq[Node](), parent)
+      }

    // Worker table
-    val workers = storageStatusList.map((rddId, _))
-    val workerTable = UIUtils.listingTable(workerHeader, workerRow, workers,
-      id = Some("rdd-storage-by-worker-table"))
+    val workerTable = UIUtils.listingTable(workerHeader, workerRow,
+      rddStorageInfo.dataDistribution.get, id = Some("rdd-storage-by-worker-table"))

    // Block table
-    val blockLocations = StorageUtils.getRddBlockLocations(rddId, storageStatusList)
-    val blocks = storageStatusList
-      .flatMap(_.rddBlocksById(rddId))
-      .sortWith(_._1.name < _._1.name)
-      .map { case (blockId, status) =>
-        (blockId, status, blockLocations.get(blockId).getOrElse(Seq[String]("Unknown")))
-      }
-    val blockTable = UIUtils.listingTable(blockHeader, blockRow, blocks,
+    val blockTable = UIUtils.listingTable(blockHeader, blockRow, rddStorageInfo.partitions.get,
      id = Some("rdd-storage-by-block-table"))

    val content =
@@ -62,23 +53,23 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
          <li>
            <strong>Storage Level:</strong>
-            {rddInfo.storageLevel.description}
+            {rddStorageInfo.storageLevel}
          </li>
          <li>
            <strong>Cached Partitions:</strong>
-            {rddInfo.numCachedPartitions}
+            {rddStorageInfo.numCachedPartitions}
          </li>
          <li>
            <strong>Total Partitions:</strong>
-            {rddInfo.numPartitions}
+            {rddStorageInfo.numPartitions}
          </li>
          <li>
            <strong>Memory Size:</strong>
-            {Utils.bytesToString(rddInfo.memSize)}
+            {Utils.bytesToString(rddStorageInfo.memoryUsed)}
          </li>
          <li>
            <strong>Disk Size:</strong>
-            {Utils.bytesToString(rddInfo.diskSize)}
+            {Utils.bytesToString(rddStorageInfo.diskUsed)}
          </li>
      @@ -86,19 +77,19 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
-          <h4>
-            Data Distribution on {workers.size} Executors
-          </h4>
+          <h4>
+            Data Distribution on {rddStorageInfo.dataDistribution.size} Executors
+          </h4>
           {workerTable}
-          <h4>
-            {blocks.size} Partitions
-          </h4>
+          <h4>
+            {rddStorageInfo.partitions.size} Partitions
+          </h4>
           {blockTable}
      ; - UIUtils.headerSparkPage("RDD Storage Info for " + rddInfo.name, content, parent) + UIUtils.headerSparkPage("RDD Storage Info for " + rddStorageInfo.name, content, parent) } /** Header fields for the worker table */ @@ -116,34 +107,32 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") { "Executors") /** Render an HTML row representing a worker */ - private def workerRow(worker: (Int, StorageStatus)): Seq[Node] = { - val (rddId, status) = worker + private def workerRow(worker: RDDDataDistribution): Seq[Node] = {
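The new row renderers take the v1 API types directly instead of raw storage tuples. A minimal sketch of what such renderers look like, assuming only the fields the v1 types expose in the JSON fixtures later in this patch (address, memoryUsed, memoryRemaining, diskUsed on a distribution; blockName, storageLevel, memoryUsed, diskUsed, executors on a partition); this is illustrative, not the patch's verbatim markup:

    // Illustrative sketch only: one worker row from the v1 RDDDataDistribution
    // type and one block row from RDDPartitionInfo, using just the fields
    // visible in the JSON fixtures in this patch. Assumes the file's existing
    // imports (scala.xml.Node, org.apache.spark.util.Utils).
    private def workerRowSketch(worker: RDDDataDistribution): Seq[Node] = {
      <tr>
        <td>{worker.address}</td>
        <td>
          {Utils.bytesToString(worker.memoryUsed)}
          ({Utils.bytesToString(worker.memoryRemaining)} Remaining)
        </td>
        <td>{Utils.bytesToString(worker.diskUsed)}</td>
      </tr>
    }

    private def blockRowSketch(row: RDDPartitionInfo): Seq[Node] = {
      <tr>
        <td>{row.blockName}</td>
        <td>{row.storageLevel}</td>
        <td>{Utils.bytesToString(row.memoryUsed)}</td>
        <td>{Utils.bytesToString(row.diskUsed)}</td>
        <td>{row.executors.map(host => <span>{host}<br/></span>)}</td>
      </tr>
    }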
- + - + } /** Render an HTML row representing a block */ - private def blockRow(row: (BlockId, BlockStatus, Seq[String])): Seq[Node] = { - val (id, block, locations) = row + private def blockRow(row: RDDPartitionInfo): Seq[Node] = { - + - - } diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala b/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala index 59dc6b547c1d8..07db783c572cf 100644 --- a/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala @@ -22,7 +22,7 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node import org.apache.spark.storage.RDDInfo -import org.apache.spark.ui.{WebUIPage, UIUtils} +import org.apache.spark.ui.{UIUtils, WebUIPage} import org.apache.spark.util.Utils /** Page showing list of RDD's currently stored in the cluster */ diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala index 045bd784990d1..0351749700962 100644 --- a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala @@ -35,6 +35,8 @@ private[ui] class StorageTab(parent: SparkUI) extends SparkUITab(parent, "storag /** * :: DeveloperApi :: * A SparkListener that prepares information to be displayed on the BlockManagerUI. + * + * This class is thread-safe (unlike JobProgressListener) */ @DeveloperApi class StorageListener(storageStatusListener: StorageStatusListener) extends SparkListener { @@ -43,7 +45,9 @@ class StorageListener(storageStatusListener: StorageStatusListener) extends Spar def storageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList /** Filter RDD info to include only those with cached partitions */ - def rddInfoList: Seq[RDDInfo] = _rddInfoMap.values.filter(_.numCachedPartitions > 0).toSeq + def rddInfoList: Seq[RDDInfo] = synchronized { + _rddInfoMap.values.filter(_.numCachedPartitions > 0).toSeq + } /** Update the storage info of the RDDs whose blocks are among the given updated blocks */ private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = { diff --git a/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json new file mode 100644 index 0000000000000..ce4fe80b66aa5 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json @@ -0,0 +1,53 @@ +[ { + "id" : "local-1430917381534", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-05-06T13:03:00.893GMT", + "endTime" : "2015-05-06T13:03:11.398GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1426533911241", + "name" : "Spark shell", + "attempts" : [ { + "attemptId" : "2", + "startTime" : "2015-03-17T23:11:50.242GMT", + "endTime" : "2015-03-17T23:12:25.177GMT", + "sparkUser" : "irashid", + "completed" : true + }, { + "attemptId" : "1", + "startTime" : "2015-03-16T19:25:10.242GMT", + "endTime" : "2015-03-16T19:25:45.177GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1425081759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-28T00:02:38.277GMT", + "endTime" : "2015-02-28T00:02:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1422981780767", + "name" : "Spark shell", + 
"attempts" : [ { + "startTime" : "2015-02-03T16:42:59.720GMT", + "endTime" : "2015-02-03T16:43:08.731GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1422981759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:38.277GMT", + "endTime" : "2015-02-03T16:42:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/complete_stage_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/complete_stage_list_json_expectation.json new file mode 100644 index 0000000000000..31ac9beea8788 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/complete_stage_list_json_expectation.json @@ -0,0 +1,67 @@ +[ { + "status" : "COMPLETE", + "stageId" : 3, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 162, + "inputBytes" : 160, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "count at :17", + "details" : "org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line19.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line19.$read$$iwC$$iwC$$iwC.(:22)\n$line19.$read$$iwC$$iwC.(:24)\n$line19.$read$$iwC.(:26)\n$line19.$read.(:28)\n$line19.$read$.(:32)\n$line19.$read$.()\n$line19.$eval$.(:7)\n$line19.$eval$.()\n$line19.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +}, { + "status" : "COMPLETE", + "stageId" : 1, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 3476, + "inputBytes" : 28000128, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 13180, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "map at :14", + "details" : "org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +}, { + "status" : "COMPLETE", + "stageId" : 0, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 4338, + "inputBytes" : 0, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "count at :15", + "details" : "org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json new file mode 100644 index 0000000000000..ce4fe80b66aa5 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json @@ -0,0 +1,53 @@ +[ { + "id" : "local-1430917381534", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-05-06T13:03:00.893GMT", + "endTime" : "2015-05-06T13:03:11.398GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1426533911241", + "name" : "Spark shell", + "attempts" : [ { + "attemptId" : "2", + "startTime" : "2015-03-17T23:11:50.242GMT", + "endTime" : "2015-03-17T23:12:25.177GMT", + "sparkUser" : "irashid", + "completed" : true + }, { + "attemptId" : "1", + "startTime" : "2015-03-16T19:25:10.242GMT", + "endTime" : "2015-03-16T19:25:45.177GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1425081759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-28T00:02:38.277GMT", + "endTime" : "2015-02-28T00:02:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1422981780767", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:59.720GMT", + "endTime" : "2015-02-03T16:43:08.731GMT", + 
"sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1422981759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:38.277GMT", + "endTime" : "2015-02-03T16:42:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json new file mode 100644 index 0000000000000..cb622e147249e --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json @@ -0,0 +1,17 @@ +[ { + "id" : "", + "hostPort" : "localhost:57971", + "rddBlocks" : 8, + "memoryUsed" : 28000128, + "diskUsed" : 0, + "activeTasks" : 0, + "failedTasks" : 1, + "completedTasks" : 31, + "totalTasks" : 32, + "totalDuration" : 8820, + "totalInputBytes" : 28000288, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 13180, + "maxMemory" : 278302556, + "executorLogs" : { } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/failed_stage_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/failed_stage_list_json_expectation.json new file mode 100644 index 0000000000000..bff6a4f69d077 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/failed_stage_list_json_expectation.json @@ -0,0 +1,23 @@ +[ { + "status" : "FAILED", + "stageId" : 2, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 7, + "numFailedTasks" : 1, + "executorRunTime" : 278, + "inputBytes" : 0, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "count at :20", + "details" : "org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:20)\n$line11.$read$$iwC$$iwC$$iwC.(:25)\n$line11.$read$$iwC$$iwC.(:27)\n$line11.$read$$iwC.(:29)\n$line11.$read.(:31)\n$line11.$read$.(:35)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json new file mode 100644 index 0000000000000..2e92e1fa0ec23 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_1__expectation.json @@ -0,0 +1,15 @@ +[ { + "jobId" : 0, + "name" : "foreach at :15", + "stageIds" : [ 0 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + 
"numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json new file mode 100644 index 0000000000000..2e92e1fa0ec23 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/job_list_from_multi_attempt_app_json_2__expectation.json @@ -0,0 +1,15 @@ +[ { + "jobId" : 0, + "name" : "foreach at :15", + "stageIds" : [ 0 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json new file mode 100644 index 0000000000000..cab4750270dfa --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/job_list_json_expectation.json @@ -0,0 +1,43 @@ +[ { + "jobId" : 2, + "name" : "count at :17", + "stageIds" : [ 3 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +}, { + "jobId" : 1, + "name" : "count at :20", + "stageIds" : [ 1, 2 ], + "status" : "FAILED", + "numTasks" : 16, + "numActiveTasks" : 0, + "numCompletedTasks" : 15, + "numSkippedTasks" : 15, + "numFailedTasks" : 1, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 1 +}, { + "jobId" : 0, + "name" : "count at :15", + "stageIds" : [ 0 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/maxDate2_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/maxDate2_app_list_json_expectation.json new file mode 100644 index 0000000000000..483632a3956ed --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/maxDate2_app_list_json_expectation.json @@ -0,0 +1,10 @@ +[ { + "id" : "local-1422981759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:38.277GMT", + "endTime" : "2015-02-03T16:42:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/maxDate_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/maxDate_app_list_json_expectation.json new file mode 100644 index 0000000000000..4b85690fd9199 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/maxDate_app_list_json_expectation.json @@ -0,0 +1,19 @@ +[ { + "id" : "local-1422981780767", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:59.720GMT", + "endTime" : "2015-02-03T16:43:08.731GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : 
"local-1422981759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:38.277GMT", + "endTime" : "2015-02-03T16:42:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json new file mode 100644 index 0000000000000..dca86fe5f7e6a --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json @@ -0,0 +1,35 @@ +[ { + "id" : "local-1430917381534", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-05-06T13:03:00.893GMT", + "endTime" : "2015-05-06T13:03:11.398GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1426533911241", + "name" : "Spark shell", + "attempts" : [ { + "attemptId" : "2", + "startTime" : "2015-03-17T23:11:50.242GMT", + "endTime" : "2015-03-17T23:12:25.177GMT", + "sparkUser" : "irashid", + "completed" : true + }, { + "attemptId" : "1", + "startTime" : "2015-03-16T19:25:10.242GMT", + "endTime" : "2015-03-16T19:25:45.177GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +}, { + "id" : "local-1425081759269", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-28T00:02:38.277GMT", + "endTime" : "2015-02-28T00:02:46.912GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/one_app_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_app_json_expectation.json new file mode 100644 index 0000000000000..07489ad96414a --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/one_app_json_expectation.json @@ -0,0 +1,10 @@ +{ + "id" : "local-1422981780767", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2015-02-03T16:42:59.720GMT", + "endTime" : "2015-02-03T16:43:08.731GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/one_app_multi_attempt_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_app_multi_attempt_json_expectation.json new file mode 100644 index 0000000000000..8f3d7160c723f --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/one_app_multi_attempt_json_expectation.json @@ -0,0 +1,17 @@ +{ + "id" : "local-1426533911241", + "name" : "Spark shell", + "attempts" : [ { + "attemptId" : "2", + "startTime" : "2015-03-17T23:11:50.242GMT", + "endTime" : "2015-03-17T23:12:25.177GMT", + "sparkUser" : "irashid", + "completed" : true + }, { + "attemptId" : "1", + "startTime" : "2015-03-16T19:25:10.242GMT", + "endTime" : "2015-03-16T19:25:45.177GMT", + "sparkUser" : "irashid", + "completed" : true + } ] +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json new file mode 100644 index 0000000000000..4a29072bdb6e4 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/one_job_json_expectation.json @@ -0,0 +1,15 @@ +{ + "jobId" : 0, + "name" : "count at :15", + "stageIds" : [ 0 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + 
"numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/one_rdd_storage_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_rdd_storage_json_expectation.json new file mode 100644 index 0000000000000..38b5328ffbb03 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/one_rdd_storage_json_expectation.json @@ -0,0 +1,64 @@ +{ + "id" : 0, + "name" : "0", + "numPartitions" : 8, + "numCachedPartitions" : 8, + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 28000128, + "diskUsed" : 0, + "dataDistribution" : [ { + "address" : "localhost:57971", + "memoryUsed" : 28000128, + "memoryRemaining" : 250302428, + "diskUsed" : 0 + } ], + "partitions" : [ { + "blockName" : "rdd_0_0", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_1", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_2", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_3", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_4", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_5", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_6", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + }, { + "blockName" : "rdd_0_7", + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 3500016, + "diskUsed" : 0, + "executors" : [ "localhost:57971" ] + } ] +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/one_stage_attempt_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_stage_attempt_json_expectation.json new file mode 100644 index 0000000000000..111cb8163eb3d --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/one_stage_attempt_json_expectation.json @@ -0,0 +1,270 @@ +{ + "status" : "COMPLETE", + "stageId" : 1, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 3476, + "inputBytes" : 28000128, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 13180, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "map at :14", + "details" : "org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ], + "tasks" : { + "8" : { + "taskId" : 8, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.829GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 435, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 94000, + "recordsWritten" : 0 + } + } + }, + "11" : { + "taskId" : 11, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.830GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1647, + "writeTime" : 83000, + "recordsWritten" : 0 + } + } + }, + "14" : { + "taskId" : 14, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.832GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 88000, + "recordsWritten" : 0 + } + } + }, + "13" : { + "taskId" : 13, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.831GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 73000, + "recordsWritten" : 0 + } + } + }, + "10" : { + "taskId" : 10, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.830GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + 
"jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 76000, + "recordsWritten" : 0 + } + } + }, + "9" : { + "taskId" : 9, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.830GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 436, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 98000, + "recordsWritten" : 0 + } + } + }, + "12" : { + "taskId" : 12, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.831GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1645, + "writeTime" : 101000, + "recordsWritten" : 0 + } + } + }, + "15" : { + "taskId" : 15, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.833GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 435, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 79000, + "recordsWritten" : 0 + } + } + } + }, + "executorSummary" : { + "" : { + "taskTime" : 3624, + "failedTasks" : 0, + "succeededTasks" : 8, + "inputBytes" : 28000128, + "outputBytes" : 0, + "shuffleRead" : 0, + "shuffleWrite" : 13180, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + } +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/one_stage_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/one_stage_json_expectation.json new file mode 100644 index 0000000000000..ef339f89afa45 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/one_stage_json_expectation.json @@ -0,0 +1,270 @@ +[ { + "status" : "COMPLETE", + "stageId" : 1, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 3476, + "inputBytes" : 28000128, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 13180, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "map at :14", + "details" : 
"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ], + "tasks" : { + "8" : { + "taskId" : 8, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.829GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 435, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 94000, + "recordsWritten" : 0 + } + } + }, + "11" : { + "taskId" : 11, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.830GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1647, + "writeTime" : 83000, + "recordsWritten" : 0 + } + } + }, + "14" : { + "taskId" : 14, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.832GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 88000, + "recordsWritten" : 0 + } + } + }, + "13" : { + "taskId" : 13, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.831GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 73000, + "recordsWritten" : 0 + } + } + 
}, + "10" : { + "taskId" : 10, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.830GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 76000, + "recordsWritten" : 0 + } + } + }, + "9" : { + "taskId" : 9, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.830GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 436, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 98000, + "recordsWritten" : 0 + } + } + }, + "12" : { + "taskId" : 12, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.831GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 434, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1645, + "writeTime" : 101000, + "recordsWritten" : 0 + } + } + }, + "15" : { + "taskId" : 15, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-02-03T16:43:05.833GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 435, + "resultSize" : 1902, + "jvmGcTime" : 19, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 3500016, + "recordsRead" : 0 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1648, + "writeTime" : 79000, + "recordsWritten" : 0 + } + } + } + }, + "executorSummary" : { + "" : { + "taskTime" : 3624, + "failedTasks" : 0, + "succeededTasks" : 8, + "inputBytes" : 28000128, + "outputBytes" : 0, + "shuffleRead" : 0, + "shuffleWrite" : 13180, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json new file mode 100644 index 0000000000000..f79a31022d214 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json @@ -0,0 +1,9 @@ +[ { + "id" : 0, + "name" : "0", + "numPartitions" : 8, + "numCachedPartitions" : 8, + "storageLevel" : "Memory Deserialized 1x Replicated", + "memoryUsed" : 28000128, + "diskUsed" : 0 +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/running_app_list_json_expectation.json 
b/core/src/test/resources/HistoryServerExpectations/running_app_list_json_expectation.json new file mode 100644 index 0000000000000..8878e547a7984 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/running_app_list_json_expectation.json @@ -0,0 +1 @@ +[ ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_list_json_expectation.json new file mode 100644 index 0000000000000..056fac7088594 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_list_json_expectation.json @@ -0,0 +1,89 @@ +[ { + "status" : "COMPLETE", + "stageId" : 3, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 162, + "inputBytes" : 160, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "count at :17", + "details" : "org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line19.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line19.$read$$iwC$$iwC$$iwC.(:22)\n$line19.$read$$iwC$$iwC.(:24)\n$line19.$read$$iwC.(:26)\n$line19.$read.(:28)\n$line19.$read$.(:32)\n$line19.$read$.()\n$line19.$eval$.(:7)\n$line19.$eval$.()\n$line19.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +}, { + "status" : "COMPLETE", + "stageId" : 1, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 3476, + "inputBytes" : 28000128, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 13180, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "map at :14", + "details" : "org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : 
"default", + "accumulatorUpdates" : [ ] +}, { + "status" : "COMPLETE", + "stageId" : 0, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 4338, + "inputBytes" : 0, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "count at :15", + "details" : "org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +}, { + "status" : "FAILED", + "stageId" : 2, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 7, + "numFailedTasks" : 1, + "executorRunTime" : 278, + "inputBytes" : 0, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "count at :20", + "details" : "org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:20)\n$line11.$read$$iwC$$iwC$$iwC.(:25)\n$line11.$read$$iwC$$iwC.(:27)\n$line11.$read$$iwC.(:29)\n$line11.$read.(:31)\n$line11.$read$.(:35)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_list_with_accumulable_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_list_with_accumulable_json_expectation.json new file mode 100644 index 0000000000000..79ccacd309693 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_list_with_accumulable_json_expectation.json @@ -0,0 +1,27 @@ +[ { + "status" : "COMPLETE", + "stageId" : 0, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 
120, + "inputBytes" : 0, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "foreach at :15", + "details" : "org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "value" : "5050" + } ] +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_expectation.json new file mode 100644 index 0000000000000..f2cb29b31c85f --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_expectation.json @@ -0,0 +1,561 @@ +[ { + "taskId" : 0, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.494GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 32, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 49294, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3842811, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 1, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.502GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 350, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3934399, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 2, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.503GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 32, + "executorRunTime" : 348, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + 
"shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 89885, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 3, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.504GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 1311694, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 4, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.504GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 83022, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 5, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.505GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 30, + "executorRunTime" : 350, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3675510, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 6, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.505GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 29, + "executorRunTime" : 351, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 4016617, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 7, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.506GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 2579051, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 8, + "index" : 8, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.914GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + 
"executorDeserializeTime" : 3, + "executorRunTime" : 80, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 121551, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 9, + "index" : 9, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.915GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60489, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 101664, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 10, + "index" : 10, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.916GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 8, + "executorRunTime" : 73, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94709, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 11, + "index" : 11, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.918GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 75, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94507, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 12, + "index" : 12, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.923GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 77, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102476, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 13, + "index" : 13, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.924GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 76, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95004, + "recordsWritten" : 10 + } + } +}, { + "taskId" 
: 14, + "index" : 14, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.925GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95646, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 15, + "index" : 15, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.928GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 76, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 602780, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 16, + "index" : 16, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.001GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 108320, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 17, + "index" : 17, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.005GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 11, + "executorRunTime" : 91, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 99944, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 18, + "index" : 18, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.010GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 92, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100836, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 19, + "index" : 19, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.012GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 
0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95788, + "recordsWritten" : 10 + } + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_1__expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_1__expectation.json new file mode 100644 index 0000000000000..c3febc5fc9447 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_1__expectation.json @@ -0,0 +1,193 @@ +[ { + "taskId" : 0, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.515GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "78", + "value" : "5050" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 14, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 1, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.521GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "247", + "value" : "2175" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 14, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 2, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.522GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "378", + "value" : "378" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 3, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.522GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "572", + "value" : "950" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 4, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.522GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "678", + "value" : "2853" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 5, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.523GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + 
"speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "897", + "value" : "3750" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 6, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.523GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "978", + "value" : "1928" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 7, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.524GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "1222", + "value" : "4972" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_2__expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_2__expectation.json new file mode 100644 index 0000000000000..56d667d88917c --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_from_multi_attempt_app_json_2__expectation.json @@ -0,0 +1,193 @@ +[ { + "taskId" : 0, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.515GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "78", + "value" : "5050" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 14, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 1, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.521GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "247", + "value" : "2175" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 14, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 2, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.522GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "378", + "value" : "378" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 3, + "index" : 3, + "attempt" : 0, + "launchTime" : 
"2015-03-17T23:12:16.522GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "572", + "value" : "950" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 4, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.522GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "678", + "value" : "2853" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 5, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.523GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "897", + "value" : "3750" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 6, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.523GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "978", + "value" : "1928" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +}, { + "taskId" : 7, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-03-17T23:12:16.524GMT", + "executorId" : "", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "1222", + "value" : "4972" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__offset___length_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__offset___length_expectation.json new file mode 100644 index 0000000000000..e5ec3bc4c7126 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__offset___length_expectation.json @@ -0,0 +1,1401 @@ +[ { + "taskId" : 10, + "index" : 10, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.916GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 8, + "executorRunTime" : 73, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + 
"shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94709, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 11, + "index" : 11, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.918GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 75, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94507, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 12, + "index" : 12, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.923GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 77, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102476, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 13, + "index" : 13, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.924GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 76, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95004, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 14, + "index" : 14, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.925GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95646, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 15, + "index" : 15, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.928GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 76, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 602780, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 16, + "index" : 16, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.001GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + 
"executorDeserializeTime" : 10, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 108320, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 17, + "index" : 17, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.005GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 11, + "executorRunTime" : 91, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 99944, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 18, + "index" : 18, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.010GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 92, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100836, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 19, + "index" : 19, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.012GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95788, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 20, + "index" : 20, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.014GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 97716, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 21, + "index" : 21, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.015GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 88, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100270, + "recordsWritten" : 10 + } + } +}, { + 
"taskId" : 22, + "index" : 22, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.018GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 93, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 143427, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 23, + "index" : 23, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.031GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 65, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 91844, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 24, + "index" : 24, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.098GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 43, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 157194, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 25, + "index" : 25, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.103GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 49, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94134, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 26, + "index" : 26, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.105GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 38, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 108213, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 27, + "index" : 27, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.110GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 32, + "resultSize" : 2065, + "jvmGcTime" : 0, + 
"resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102019, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 28, + "index" : 28, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.113GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 29, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 104299, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 29, + "index" : 29, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.114GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 39, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 114938, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 30, + "index" : 30, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.118GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 34, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 119770, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 31, + "index" : 31, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.127GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 36, + "executorRunTime" : 24, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 92619, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 32, + "index" : 32, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.148GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 89603, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 33, + "index" : 33, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.149GMT", + 
"executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 43, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 118329, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 34, + "index" : 34, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.156GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 27, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 127746, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 35, + "index" : 35, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.161GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 35, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 160963, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 36, + "index" : 36, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.164GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 29, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 123855, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 37, + "index" : 37, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.165GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 32, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 111869, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 38, + "index" : 38, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.166GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 31, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + 
"bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 131158, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 39, + "index" : 39, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.180GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 98748, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 40, + "index" : 40, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.197GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 14, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94792, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 41, + "index" : 41, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.200GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 90765, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 42, + "index" : 42, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.203GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 103713, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 43, + "index" : 43, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.204GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 171516, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 44, + "index" : 44, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.205GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + 
"accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 98293, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 45, + "index" : 45, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.206GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 19, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 92985, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 46, + "index" : 46, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.210GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 1, + "executorRunTime" : 31, + "resultSize" : 2065, + "jvmGcTime" : 6, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 113322, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 47, + "index" : 47, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.212GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 103015, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 48, + "index" : 48, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.220GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 24, + "resultSize" : 2065, + "jvmGcTime" : 6, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 139844, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 49, + "index" : 49, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.223GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 7, + "executorRunTime" : 23, + "resultSize" : 2065, + "jvmGcTime" : 6, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 
94984, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 50, + "index" : 50, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.240GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 90836, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 51, + "index" : 51, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.242GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 96013, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 52, + "index" : 52, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.243GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 89664, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 53, + "index" : 53, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.244GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 92835, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 54, + "index" : 54, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.244GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 90506, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 55, + "index" : 55, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.246GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 21, + "resultSize" : 
2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 108309, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 56, + "index" : 56, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.249GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 20, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 90329, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 57, + "index" : 57, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.257GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 96849, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 58, + "index" : 58, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.263GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 97521, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 59, + "index" : 59, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.265GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100753, + "recordsWritten" : 10 + } + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_expectation.json new file mode 100644 index 0000000000000..5657123a2db15 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_expectation.json @@ -0,0 +1,561 @@ +[ { + "taskId" : 6, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.505GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 29, + 
"executorRunTime" : 351, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 4016617, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 5, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.505GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 30, + "executorRunTime" : 350, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3675510, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 1, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.502GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 350, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3934399, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 4, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.504GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 83022, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 7, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.506GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 2579051, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 3, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.504GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 1311694, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 0, + "index" : 0, + 
"attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.494GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 32, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 49294, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3842811, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 2, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.503GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 32, + "executorRunTime" : 348, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 89885, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 22, + "index" : 22, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.018GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 93, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 143427, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 18, + "index" : 18, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.010GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 92, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100836, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 17, + "index" : 17, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.005GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 11, + "executorRunTime" : 91, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 99944, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 21, + "index" : 21, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.015GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 88, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + 
"memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100270, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 16, + "index" : 16, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.001GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 108320, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 19, + "index" : 19, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.012GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95788, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 9, + "index" : 9, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.915GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60489, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 101664, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 20, + "index" : 20, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.014GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 97716, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 14, + "index" : 14, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.925GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95646, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 8, + "index" : 8, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.914GMT", + "executorId" : "driver", + "host" : 
"localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 80, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 121551, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 12, + "index" : 12, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.923GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 77, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102476, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 13, + "index" : 13, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.924GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 76, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95004, + "recordsWritten" : 10 + } + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names___runtime_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names___runtime_expectation.json new file mode 100644 index 0000000000000..5657123a2db15 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names___runtime_expectation.json @@ -0,0 +1,561 @@ +[ { + "taskId" : 6, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.505GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 29, + "executorRunTime" : 351, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 4016617, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 5, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.505GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 30, + "executorRunTime" : 350, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3675510, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 1, + "index" : 
1, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.502GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 350, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3934399, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 4, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.504GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 83022, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 7, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.506GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 2579051, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 3, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.504GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 31, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 1311694, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 0, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.494GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 32, + "executorRunTime" : 349, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 49294, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 3842811, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 2, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.503GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 32, + "executorRunTime" : 348, + "resultSize" : 2010, + "jvmGcTime" : 7, + "resultSerializationTime" : 2, + 
"memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 89885, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 22, + "index" : 22, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.018GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 93, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 143427, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 18, + "index" : 18, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.010GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 92, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100836, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 17, + "index" : 17, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.005GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 11, + "executorRunTime" : 91, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 99944, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 21, + "index" : 21, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.015GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 88, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100270, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 16, + "index" : 16, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.001GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 108320, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 19, + "index" : 19, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.012GMT", + "executorId" : "driver", + "host" : 
"localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 5, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95788, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 9, + "index" : 9, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.915GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 84, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60489, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 101664, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 20, + "index" : 20, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.014GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 97716, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 14, + "index" : 14, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.925GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 83, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95646, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 8, + "index" : 8, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.914GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 80, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 60488, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 121551, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 12, + "index" : 12, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.923GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 77, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + 
"shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102476, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 13, + "index" : 13, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:06.924GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 9, + "executorRunTime" : 76, + "resultSize" : 2010, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95004, + "recordsWritten" : 10 + } + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names__runtime_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names__runtime_expectation.json new file mode 100644 index 0000000000000..72fe017e9f85d --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_list_w__sortBy_short_names__runtime_expectation.json @@ -0,0 +1,561 @@ +[ { + "taskId" : 40, + "index" : 40, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.197GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 14, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 94792, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 86, + "index" : 86, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.374GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 95848, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 41, + "index" : 41, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.200GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 90765, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 68, + "index" : 68, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.306GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + 
"diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 101750, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 58, + "index" : 58, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.263GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 97521, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 43, + "index" : 43, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.204GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 171516, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 57, + "index" : 57, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.257GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 16, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 96849, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 59, + "index" : 59, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.265GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 100753, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 32, + "index" : 32, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.148GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 89603, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 87, + "index" : 87, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.374GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : 
"PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102159, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 99, + "index" : 99, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.426GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70565, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 133964, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 63, + "index" : 63, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.276GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 20, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 102779, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 90, + "index" : 90, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.385GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 98472, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 39, + "index" : 39, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.180GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 98748, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 42, + "index" : 42, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.203GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 10, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : 
{ + "bytesWritten" : 1710, + "writeTime" : 103713, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 51, + "index" : 51, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.242GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 2, + "executorRunTime" : 17, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 96013, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 50, + "index" : 50, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.240GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 4, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 90836, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 53, + "index" : 53, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.244GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 6, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 92835, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 44, + "index" : 44, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.205GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 3, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 0, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 98293, + "recordsWritten" : 10 + } + } +}, { + "taskId" : 80, + "index" : 80, + "attempt" : 0, + "launchTime" : "2015-05-06T13:03:07.341GMT", + "executorId" : "driver", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 18, + "resultSize" : 2065, + "jvmGcTime" : 5, + "resultSerializationTime" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "inputMetrics" : { + "bytesRead" : 70564, + "recordsRead" : 10000 + }, + "shuffleWriteMetrics" : { + "bytesWritten" : 1710, + "writeTime" : 98069, + "recordsWritten" : 10 + } + } +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json new file mode 100644 index 
0000000000000..bc3c302813de2 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json @@ -0,0 +1,19 @@ +{ + "quantiles" : [ 0.01, 0.5, 0.99 ], + "executorDeserializeTime" : [ 1.0, 3.0, 36.0 ], + "executorRunTime" : [ 16.0, 28.0, 351.0 ], + "resultSize" : [ 2010.0, 2065.0, 2065.0 ], + "jvmGcTime" : [ 0.0, 0.0, 7.0 ], + "resultSerializationTime" : [ 0.0, 0.0, 2.0 ], + "memoryBytesSpilled" : [ 0.0, 0.0, 0.0 ], + "diskBytesSpilled" : [ 0.0, 0.0, 0.0 ], + "inputMetrics" : { + "bytesRead" : [ 60488.0, 70564.0, 70565.0 ], + "recordsRead" : [ 10000.0, 10000.0, 10000.0 ] + }, + "shuffleWriteMetrics" : { + "writeBytes" : [ 1710.0, 1710.0, 1710.0 ], + "writeRecords" : [ 10.0, 10.0, 10.0 ], + "writeTime" : [ 89437.0, 102159.0, 4016617.0 ] + } +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json new file mode 100644 index 0000000000000..e084c839f1d5a --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json @@ -0,0 +1,19 @@ +{ + "quantiles" : [ 0.05, 0.25, 0.5, 0.75, 0.95 ], + "executorDeserializeTime" : [ 1.0, 2.0, 2.0, 2.0, 3.0 ], + "executorRunTime" : [ 30.0, 74.0, 75.0, 76.0, 79.0 ], + "resultSize" : [ 1034.0, 1034.0, 1034.0, 1034.0, 1034.0 ], + "jvmGcTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "shuffleReadMetrics" : { + "readBytes" : [ 17100.0, 17100.0, 17100.0, 17100.0, 17100.0 ], + "readRecords" : [ 100.0, 100.0, 100.0, 100.0, 100.0 ], + "remoteBlocksFetched" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "localBlocksFetched" : [ 100.0, 100.0, 100.0, 100.0, 100.0 ], + "fetchWaitTime" : [ 0.0, 0.0, 0.0, 1.0, 1.0 ], + "remoteBytesRead" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "totalBlocksFetched" : [ 100.0, 100.0, 100.0, 100.0, 100.0 ] + } +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json new file mode 100644 index 0000000000000..6ac7811ce691b --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json @@ -0,0 +1,19 @@ +{ + "quantiles" : [ 0.05, 0.25, 0.5, 0.75, 0.95 ], + "executorDeserializeTime" : [ 2.0, 2.0, 3.0, 7.0, 31.0 ], + "executorRunTime" : [ 16.0, 18.0, 28.0, 49.0, 349.0 ], + "resultSize" : [ 2010.0, 2065.0, 2065.0, 2065.0, 2065.0 ], + "jvmGcTime" : [ 0.0, 0.0, 0.0, 5.0, 7.0 ], + "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 1.0 ], + "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "inputMetrics" : { + "bytesRead" : [ 60488.0, 70564.0, 70564.0, 70564.0, 70564.0 ], + "recordsRead" : [ 10000.0, 10000.0, 10000.0, 10000.0, 10000.0 ] + }, + "shuffleWriteMetrics" : { + "writeBytes" : [ 1710.0, 1710.0, 1710.0, 1710.0, 1710.0 ], + "writeRecords" : [ 10.0, 10.0, 10.0, 10.0, 10.0 ], + "writeTime" : [ 90329.0, 95848.0, 102159.0, 121551.0, 2579051.0 ] + } +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/stage_with_accumulable_json_expectation.json 
b/core/src/test/resources/HistoryServerExpectations/stage_with_accumulable_json_expectation.json new file mode 100644 index 0000000000000..32d5731676ad5 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/stage_with_accumulable_json_expectation.json @@ -0,0 +1,242 @@ +{ + "status" : "COMPLETE", + "stageId" : 0, + "attemptId" : 0, + "numActiveTasks" : 0, + "numCompleteTasks" : 8, + "numFailedTasks" : 0, + "executorRunTime" : 120, + "inputBytes" : 0, + "inputRecords" : 0, + "outputBytes" : 0, + "outputRecords" : 0, + "shuffleReadBytes" : 0, + "shuffleReadRecords" : 0, + "shuffleWriteBytes" : 0, + "shuffleWriteRecords" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0, + "name" : "foreach at <console>:15", + "details" : "org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.<init>(<console>:15)\n$line9.$read$$iwC$$iwC$$iwC.<init>(<console>:20)\n$line9.$read$$iwC$$iwC.<init>(<console>:22)\n$line9.$read$$iwC.<init>(<console>:24)\n$line9.$read.<init>(<console>:26)\n$line9.$read$.<init>(<console>:30)\n$line9.$read$.<clinit>()\n$line9.$eval$.<init>(<console>:7)\n$line9.$eval$.<clinit>()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)", + "schedulingPool" : "default", + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "value" : "5050" + } ], + "tasks" : { + "2" : { + "taskId" : 2, + "index" : 2, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.522GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "378", + "value" : "378" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "5" : { + "taskId" : 5, + "index" : 5, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.523GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "897", + "value" : "3750" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "4" : { + "taskId" : 4, + "index" : 4, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.522GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "678", + "value" : "2853" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 1, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "7" : { + "taskId" : 7, + "index" : 7, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.524GMT", + "executorId" :
"<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "1222", + "value" : "4972" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "1" : { + "taskId" : 1, + "index" : 1, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.521GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "247", + "value" : "2175" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 14, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "3" : { + "taskId" : 3, + "index" : 3, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.522GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "572", + "value" : "950" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 13, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "6" : { + "taskId" : 6, + "index" : 6, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.523GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "978", + "value" : "1928" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 12, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + }, + "0" : { + "taskId" : 0, + "index" : 0, + "attempt" : 0, + "launchTime" : "2015-03-16T19:25:36.515GMT", + "executorId" : "<driver>", + "host" : "localhost", + "taskLocality" : "PROCESS_LOCAL", + "speculative" : false, + "accumulatorUpdates" : [ { + "id" : 1, + "name" : "my counter", + "update" : "78", + "value" : "5050" + } ], + "taskMetrics" : { + "executorDeserializeTime" : 14, + "executorRunTime" : 15, + "resultSize" : 697, + "jvmGcTime" : 0, + "resultSerializationTime" : 2, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + } + }, + "executorSummary" : { + "<driver>" : { + "taskTime" : 418, + "failedTasks" : 0, + "succeededTasks" : 8, + "inputBytes" : 0, + "outputBytes" : 0, + "shuffleRead" : 0, + "shuffleWrite" : 0, + "memoryBytesSpilled" : 0, + "diskBytesSpilled" : 0 + } + } +} \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json new file mode 100644 index 0000000000000..cab4750270dfa --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/succeeded_failed_job_list_json_expectation.json @@ -0,0 +1,43 @@ +[ { + "jobId" : 2, + "name" : "count at <console>:17", + "stageIds" : [ 3 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, +
"numFailedStages" : 0 +}, { + "jobId" : 1, + "name" : "count at <console>:20", + "stageIds" : [ 1, 2 ], + "status" : "FAILED", + "numTasks" : 16, + "numActiveTasks" : 0, + "numCompletedTasks" : 15, + "numSkippedTasks" : 15, + "numFailedTasks" : 1, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 1 +}, { + "jobId" : 0, + "name" : "count at <console>:15", + "stageIds" : [ 0 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +} ] \ No newline at end of file diff --git a/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json new file mode 100644 index 0000000000000..6fd25befbf7e8 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/succeeded_job_list_json_expectation.json @@ -0,0 +1,29 @@ +[ { + "jobId" : 2, + "name" : "count at <console>:17", + "stageIds" : [ 3 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +}, { + "jobId" : 0, + "name" : "count at <console>:15", + "stageIds" : [ 0 ], + "status" : "SUCCEEDED", + "numTasks" : 8, + "numActiveTasks" : 0, + "numCompletedTasks" : 8, + "numSkippedTasks" : 8, + "numFailedTasks" : 0, + "numActiveStages" : 0, + "numCompletedStages" : 1, + "numSkippedStages" : 0, + "numFailedStages" : 0 +} ] \ No newline at end of file diff --git a/core/src/test/resources/spark-events/local-1422981759269/APPLICATION_COMPLETE b/core/src/test/resources/spark-events/local-1422981759269/APPLICATION_COMPLETE new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1422981759269/EVENT_LOG_1 b/core/src/test/resources/spark-events/local-1422981759269/EVENT_LOG_1 new file mode 100755 index 0000000000000..4794e56d1107a --- /dev/null +++ b/core/src/test/resources/spark-events/local-1422981759269/EVENT_LOG_1 @@ -0,0 +1,88 @@ +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"<driver>","Host":"localhost","Port":57967},"Maximum Memory":278302556,"Timestamp":1422981759407} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre","Java Version":"1.7.0_67 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.103","spark.eventLog.enabled":"true","spark.driver.port":"57965","spark.repl.class.uri":"http://192.168.1.103:57964","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.fileserver.uri":"http://192.168.1.103:57966","spark.tachyonStore.folderName":"spark-fd6c823a-8a18-4113-8306-1fa7bb623a7f","spark.app.id":"local-1422981759269"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.7","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle
Corporation","java.vm.specification.version":"1.7","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"24.65-b04","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.7.0_67-b01","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"51.0","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --conf spark.eventLog.enabled=true spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre","java.version":"1.7.0_67","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/spark-assembly-1.2.0-hadoop2.4.0.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-core-3.2.10.jar":"System 
Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/conf":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1422981759269","Timestamp":1422981758277,"User":"irashid"} +{"Event":"SparkListenerJobStart","Job ID":0,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[0]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1422981762075,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1422981762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish 
Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1422981762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1422981762082,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1422981762083,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1422981762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1422981762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1422981762085,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1422981762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762632,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":521,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_6","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1422981762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762633,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":520,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_2","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task 
End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1422981762082,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762634,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":521,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_3","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1422981762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762634,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_5","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1422981762083,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762635,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_4","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1422981762075,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762636,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_0","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1422981762085,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762636,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize 
Time":9,"Executor Run Time":521,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_7","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1422981762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981762637,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_1","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":8,"Memory Size":28000128,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981762069,"Completion Time":1422981762637,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":1,"Stage Infos":[{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk 
Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]},{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"first at :17","Number of Tasks":1,"RDD Info":[{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.first(RDD.scala:1093)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line11.$read$$iwC$$iwC$$iwC.(:22)\n$line11.$read$$iwC$$iwC.(:24)\n$line11.$read$$iwC.(:26)\n$line11.$read.(:28)\n$line11.$read$.(:32)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[1,2]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1422981763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":1,"Attempt":0,"Launch Time":1422981763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":10,"Index":2,"Attempt":0,"Launch Time":1422981763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":3,"Attempt":0,"Launch Time":1422981763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":4,"Attempt":0,"Launch Time":1422981763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":5,"Attempt":0,"Launch Time":1422981763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":6,"Attempt":0,"Launch Time":1422981763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":7,"Attempt":0,"Launch Time":1422981763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":10,"Index":2,"Attempt":0,"Launch Time":1422981763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764001,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":406,"Result Size":1902,"JVM GC Time":18,"Result Serialization 
Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":138000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":1,"Attempt":0,"Launch Time":1422981763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764002,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":106000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":4,"Attempt":0,"Launch Time":1422981763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764002,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1645,"Shuffle Write Time":99000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1422981763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764003,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":123000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":7,"Attempt":0,"Launch Time":1422981763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764003,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":406,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":108000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":3,"Attempt":0,"Launch Time":1422981763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764004,"Failed":false,"Accumulables":[]},"Task 
Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1647,"Shuffle Write Time":97000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":6,"Attempt":0,"Launch Time":1422981763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764004,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":132000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":13,"Index":5,"Attempt":0,"Launch Time":1422981763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764005,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":81000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981763578,"Completion Time":1422981764005,"Accumulables":[]}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage 
ID":2,"Stage Attempt ID":0,"Stage Name":"first at :17","Number of Tasks":1,"RDD Info":[{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.first(RDD.scala:1093)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line11.$read$$iwC$$iwC$$iwC.(:22)\n$line11.$read$$iwC$$iwC.(:24)\n$line11.$read$$iwC.(:26)\n$line11.$read.(:28)\n$line11.$read$.(:32)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1422981764014,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1422981764014,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764045,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":0,"Executor Run Time":28,"Result Size":1013,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"first at :17","Number of Tasks":1,"RDD Info":[{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.first(RDD.scala:1093)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line11.$read$$iwC$$iwC$$iwC.(:22)\n$line11.$read$$iwC$$iwC.(:24)\n$line11.$read$$iwC.(:26)\n$line11.$read.(:28)\n$line11.$read$.(:32)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981764014,"Completion Time":1422981764045,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":1,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":2,"Stage Infos":[{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]},{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"saveAsTextFile at :19","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1164)\n$line13.$read$$iwC$$iwC$$iwC$$iwC.(:19)\n$line13.$read$$iwC$$iwC$$iwC.(:24)\n$line13.$read$$iwC$$iwC.(:26)\n$line13.$read$$iwC.(:28)\n$line13.$read.(:30)\n$line13.$read$.(:34)\n$line13.$read$.()\n$line13.$eval$.(:7)\n$line13.$eval$.()\n$line13.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[3,4]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"saveAsTextFile at :19","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1164)\n$line13.$read$$iwC$$iwC$$iwC$$iwC.(:19)\n$line13.$read$$iwC$$iwC$$iwC.(:24)\n$line13.$read$$iwC$$iwC.(:26)\n$line13.$read$$iwC.(:28)\n$line13.$read.(:30)\n$line13.$read$.(:34)\n$line13.$read$.()\n$line13.$eval$.(:7)\n$line13.$eval$.()\n$line13.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":0,"Attempt":0,"Launch Time":1422981764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":1,"Attempt":0,"Launch Time":1422981764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":2,"Attempt":0,"Launch Time":1422981764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":3,"Attempt":0,"Launch Time":1422981764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task 
ID":21,"Index":4,"Attempt":0,"Launch Time":1422981764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":5,"Attempt":0,"Launch Time":1422981764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":23,"Index":6,"Attempt":0,"Launch Time":1422981764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":24,"Index":7,"Attempt":0,"Launch Time":1422981764399,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":24,"Index":7,"Attempt":0,"Launch Time":1422981764399,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764642,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":240,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":19,"Index":2,"Attempt":0,"Launch Time":1422981764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764643,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":241,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":4,"Attempt":0,"Launch Time":1422981764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764643,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":240,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":5,"Attempt":0,"Launch Time":1422981764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish 
Time":1422981764645,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":241,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":6,"Attempt":0,"Launch Time":1422981764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764645,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":241,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":0,"Attempt":0,"Launch Time":1422981764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764646,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":243,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":1,"Attempt":0,"Launch Time":1422981764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764646,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":243,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":3,"Attempt":0,"Launch Time":1422981764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981764648,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":247,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"saveAsTextFile at :19","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory 
Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1164)\n$line13.$read$$iwC$$iwC$$iwC$$iwC.(:19)\n$line13.$read$$iwC$$iwC$$iwC.(:24)\n$line13.$read$$iwC$$iwC.(:26)\n$line13.$read$$iwC.(:28)\n$line13.$read.(:30)\n$line13.$read$.(:34)\n$line13.$read$.()\n$line13.$eval$.(:7)\n$line13.$eval$.()\n$line13.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981764396,"Completion Time":1422981764648,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":2,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":3,"Stage Infos":[{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":4,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line15.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line15.$read$$iwC$$iwC$$iwC.(:22)\n$line15.$read$$iwC$$iwC.(:24)\n$line15.$read$$iwC.(:26)\n$line15.$read.(:28)\n$line15.$read$.(:32)\n$line15.$read$.()\n$line15.$eval$.(:7)\n$line15.$eval$.()\n$line15.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[5]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD 
ID":4,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line15.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line15.$read$$iwC$$iwC$$iwC.(:22)\n$line15.$read$$iwC$$iwC.(:24)\n$line15.$read$$iwC.(:26)\n$line15.$read.(:28)\n$line15.$read$.(:32)\n$line15.$read$.()\n$line15.$eval$.(:7)\n$line15.$eval$.()\n$line15.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":25,"Index":0,"Attempt":0,"Launch Time":1422981765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":26,"Index":1,"Attempt":0,"Launch Time":1422981765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":27,"Index":2,"Attempt":0,"Launch Time":1422981765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":28,"Index":3,"Attempt":0,"Launch Time":1422981765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":29,"Index":4,"Attempt":0,"Launch Time":1422981765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":30,"Index":5,"Attempt":0,"Launch Time":1422981765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":31,"Index":6,"Attempt":0,"Launch Time":1422981765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":32,"Index":7,"Attempt":0,"Launch Time":1422981765029,"Executor 
ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":27,"Index":2,"Attempt":0,"Launch Time":1422981765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765045,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":30,"Index":5,"Attempt":0,"Launch Time":1422981765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765046,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":25,"Index":0,"Attempt":0,"Launch Time":1422981765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765046,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":12,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":32}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":29,"Index":4,"Attempt":0,"Launch Time":1422981765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765047,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":28,"Index":3,"Attempt":0,"Launch Time":1422981765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765047,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":12,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task 
ID":32,"Index":7,"Attempt":0,"Launch Time":1422981765029,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765048,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":31,"Index":6,"Attempt":0,"Launch Time":1422981765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765048,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":14,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":26,"Index":1,"Attempt":0,"Launch Time":1422981765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981765049,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":12,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":32}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":4,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line15.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line15.$read$$iwC$$iwC$$iwC.(:22)\n$line15.$read$$iwC$$iwC.(:24)\n$line15.$read$$iwC.(:26)\n$line15.$read.(:28)\n$line15.$read$.(:32)\n$line15.$read$.()\n$line15.$eval$.(:7)\n$line15.$eval$.()\n$line15.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981765026,"Completion Time":1422981765050,"Accumulables":[]}} 
+{"Event":"SparkListenerJobEnd","Job ID":3,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerApplicationEnd","Timestamp":1422981766912} diff --git a/core/src/test/resources/spark-events/local-1422981759269/SPARK_VERSION_1.2.0 b/core/src/test/resources/spark-events/local-1422981759269/SPARK_VERSION_1.2.0 new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1422981780767/APPLICATION_COMPLETE b/core/src/test/resources/spark-events/local-1422981780767/APPLICATION_COMPLETE new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1422981780767/EVENT_LOG_1 b/core/src/test/resources/spark-events/local-1422981780767/EVENT_LOG_1 new file mode 100755 index 0000000000000..f14a000bf2c28 --- /dev/null +++ b/core/src/test/resources/spark-events/local-1422981780767/EVENT_LOG_1 @@ -0,0 +1,82 @@ +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"","Host":"localhost","Port":57971},"Maximum Memory":278302556,"Timestamp":1422981780906} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre","Java Version":"1.7.0_67 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.103","spark.eventLog.enabled":"true","spark.driver.port":"57969","spark.repl.class.uri":"http://192.168.1.103:57968","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.fileserver.uri":"http://192.168.1.103:57970","spark.tachyonStore.folderName":"spark-3f19daee-844c-41d0-a3fc-5e3e508f9731","spark.app.id":"local-1422981780767"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.7","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.vm.specification.version":"1.7","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"24.65-b04","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.7.0_67-b01","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"51.0","java.specification.name":"Java Platform API 
Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --conf spark.eventLog.enabled=true spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre","java.version":"1.7.0_67","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/spark-assembly-1.2.0-hadoop2.4.0.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-core-3.2.10.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/conf":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1422981780767","Timestamp":1422981779720,"User":"irashid"} +{"Event":"SparkListenerJobStart","Job ID":0,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[0]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1422981784234,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1422981784240,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1422981784240,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1422981784241,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1422981784241,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task 
Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1422981784242,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1422981784242,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1422981784243,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1422981784241,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784812,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":543,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_3","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1422981784240,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784814,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":542,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_1","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1422981784234,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784816,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":542,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_0","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1422981784243,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish 
Time":1422981784816,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":543,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_7","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1422981784242,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784817,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":541,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_5","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1422981784241,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784817,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":542,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_4","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1422981784242,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784818,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":543,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_6","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1422981784240,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981784818,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":542,"Result Size":1268,"JVM GC Time":25,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_2","Status":{"Storage Level":{"Use 
Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":8,"Memory Size":28000128,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981784228,"Completion Time":1422981784819,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":1,"Stage Infos":[{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]},{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"count at :20","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon 
Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:20)\n$line11.$read$$iwC$$iwC$$iwC.(:25)\n$line11.$read$$iwC$$iwC.(:27)\n$line11.$read$$iwC.(:29)\n$line11.$read.(:31)\n$line11.$read$.(:35)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[1,2]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1422981785829,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":1,"Attempt":0,"Launch Time":1422981785830,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task 
ID":10,"Index":2,"Attempt":0,"Launch Time":1422981785830,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":3,"Attempt":0,"Launch Time":1422981785830,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":4,"Attempt":0,"Launch Time":1422981785831,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":5,"Attempt":0,"Launch Time":1422981785831,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":6,"Attempt":0,"Launch Time":1422981785832,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":7,"Attempt":0,"Launch Time":1422981785833,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":6,"Attempt":0,"Launch Time":1422981785832,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786282,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":434,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":88000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1422981785829,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786283,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":435,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":94000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":7,"Attempt":0,"Launch Time":1422981785833,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish 
Time":1422981786283,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":435,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":79000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":13,"Index":5,"Attempt":0,"Launch Time":1422981785831,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786283,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":434,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":73000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":3,"Attempt":0,"Launch Time":1422981785830,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786284,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":434,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1647,"Shuffle Write Time":83000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":1,"Attempt":0,"Launch Time":1422981785830,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786284,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":436,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":98000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":4,"Attempt":0,"Launch Time":1422981785831,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786285,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":434,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1645,"Shuffle Write Time":101000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task 
ID":10,"Index":2,"Attempt":0,"Launch Time":1422981785830,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786286,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":434,"Result Size":1902,"JVM GC Time":19,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":76000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981785829,"Completion Time":1422981786286,"Accumulables":[]}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"count at :20","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:20)\n$line11.$read$$iwC$$iwC$$iwC.(:25)\n$line11.$read$$iwC$$iwC.(:27)\n$line11.$read$$iwC.(:29)\n$line11.$read.(:31)\n$line11.$read$.(:35)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1422981786296,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":1,"Attempt":0,"Launch Time":1422981786297,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":2,"Attempt":0,"Launch Time":1422981786297,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":3,"Attempt":0,"Launch Time":1422981786297,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":4,"Attempt":0,"Launch Time":1422981786298,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":21,"Index":5,"Attempt":0,"Launch Time":1422981786298,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":6,"Attempt":0,"Launch Time":1422981786298,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":23,"Index":7,"Attempt":0,"Launch Time":1422981786299,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":4,"Attempt":0,"Launch Time":1422981786298,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786337,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":34,"Result Size":862,"JVM GC Time":0,"Result Serialization 
Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":5,"Attempt":0,"Launch Time":1422981786298,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786339,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":35,"Result Size":862,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":1,"Attempt":0,"Launch Time":1422981786297,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786340,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":35,"Result Size":862,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":1,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":2,"Attempt":0,"Launch Time":1422981786297,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786340,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":34,"Result Size":862,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1422981786296,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786340,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":35,"Result Size":862,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":7,"Attempt":0,"Launch Time":1422981786299,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786341,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":35,"Result Size":862,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes 
Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":6,"Attempt":0,"Launch Time":1422981786298,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786342,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":34,"Result Size":862,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"ExceptionFailure","Class Name":"java.lang.RuntimeException","Description":"got a 3, failing","Stack Trace":[{"Declaring Class":"$line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1","Method Name":"apply","File Name":"","Line Number":18},{"Declaring Class":"$line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1","Method Name":"apply","File Name":"","Line Number":17},{"Declaring Class":"scala.collection.Iterator$$anon$11","Method Name":"next","File Name":"Iterator.scala","Line Number":328},{"Declaring Class":"org.apache.spark.util.Utils$","Method Name":"getIteratorSize","File Name":"Utils.scala","Line Number":1311},{"Declaring Class":"org.apache.spark.rdd.RDD$$anonfun$count$1","Method Name":"apply","File Name":"RDD.scala","Line Number":910},{"Declaring Class":"org.apache.spark.rdd.RDD$$anonfun$count$1","Method Name":"apply","File Name":"RDD.scala","Line Number":910},{"Declaring Class":"org.apache.spark.SparkContext$$anonfun$runJob$4","Method Name":"apply","File Name":"SparkContext.scala","Line Number":1314},{"Declaring Class":"org.apache.spark.SparkContext$$anonfun$runJob$4","Method Name":"apply","File Name":"SparkContext.scala","Line Number":1314},{"Declaring Class":"org.apache.spark.scheduler.ResultTask","Method Name":"runTask","File Name":"ResultTask.scala","Line Number":61},{"Declaring Class":"org.apache.spark.scheduler.Task","Method Name":"run","File Name":"Task.scala","Line Number":56},{"Declaring Class":"org.apache.spark.executor.Executor$TaskRunner","Method Name":"run","File Name":"Executor.scala","Line Number":196},{"Declaring Class":"java.util.concurrent.ThreadPoolExecutor","Method Name":"runWorker","File Name":"ThreadPoolExecutor.java","Line Number":1145},{"Declaring Class":"java.util.concurrent.ThreadPoolExecutor$Worker","Method Name":"run","File Name":"ThreadPoolExecutor.java","Line Number":615},{"Declaring Class":"java.lang.Thread","Method Name":"run","File Name":"Thread.java","Line Number":745}],"Full Stack Trace":"java.lang.RuntimeException: got a 3, failing\n\tat $line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(:18)\n\tat $line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(:17)\n\tat scala.collection.Iterator$$anon$11.next(Iterator.scala:328)\n\tat org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1311)\n\tat org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:910)\n\tat org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:910)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1314)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1314)\n\tat 
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:56)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\n","Metrics":{"Host Name":"localhost","Executor Deserialize Time":0,"Executor Run Time":36,"Result Size":0,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}},"Task Info":{"Task ID":19,"Index":3,"Attempt":0,"Launch Time":1422981786297,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981786343,"Failed":true,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":0,"Executor Run Time":36,"Result Size":0,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"count at :20","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:20)\n$line11.$read$$iwC$$iwC$$iwC.(:25)\n$line11.$read$$iwC$$iwC.(:27)\n$line11.$read$$iwC.(:29)\n$line11.$read.(:31)\n$line11.$read$.(:35)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981786296,"Completion Time":1422981786347,"Failure Reason":"Job aborted due to stage failure: Task 3 in stage 2.0 failed 1 times, most recent failure: Lost task 3.0 in stage 2.0 (TID 19, localhost): java.lang.RuntimeException: got a 3, failing\n\tat $line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(:18)\n\tat $line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(:17)\n\tat scala.collection.Iterator$$anon$11.next(Iterator.scala:328)\n\tat org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1311)\n\tat org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:910)\n\tat org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:910)\n\tat 
org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1314)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1314)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:56)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\n\nDriver stacktrace:","Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":1,"Job Result":{"Result":"JobFailed","Exception":{"Message":"Job aborted due to stage failure: Task 3 in stage 2.0 failed 1 times, most recent failure: Lost task 3.0 in stage 2.0 (TID 19, localhost): java.lang.RuntimeException: got a 3, failing\n\tat $line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(:18)\n\tat $line11.$read$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(:17)\n\tat scala.collection.Iterator$$anon$11.next(Iterator.scala:328)\n\tat org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1311)\n\tat org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:910)\n\tat org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:910)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1314)\n\tat org.apache.spark.SparkContext$$anonfun$runJob$4.apply(SparkContext.scala:1314)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:56)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)\n\tat java.lang.Thread.run(Thread.java:745)\n\nDriver stacktrace:","Stack Trace":[{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler","Method Name":"org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages","File Name":"DAGScheduler.scala","Line Number":1214},{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1","Method Name":"apply","File Name":"DAGScheduler.scala","Line Number":1203},{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1","Method Name":"apply","File Name":"DAGScheduler.scala","Line Number":1202},{"Declaring Class":"scala.collection.mutable.ResizableArray$class","Method Name":"foreach","File Name":"ResizableArray.scala","Line Number":59},{"Declaring Class":"scala.collection.mutable.ArrayBuffer","Method Name":"foreach","File Name":"ArrayBuffer.scala","Line Number":47},{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler","Method Name":"abortStage","File Name":"DAGScheduler.scala","Line Number":1202},{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1","Method Name":"apply","File Name":"DAGScheduler.scala","Line Number":696},{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1","Method Name":"apply","File Name":"DAGScheduler.scala","Line Number":696},{"Declaring Class":"scala.Option","Method Name":"foreach","File Name":"Option.scala","Line Number":236},{"Declaring Class":"org.apache.spark.scheduler.DAGScheduler","Method Name":"handleTaskSetFailed","File Name":"DAGScheduler.scala","Line Number":696},{"Declaring 
Class":"org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2","Method Name":"applyOrElse","File Name":"DAGScheduler.scala","Line Number":1420},{"Declaring Class":"akka.actor.Actor$class","Method Name":"aroundReceive","File Name":"Actor.scala","Line Number":465},{"Declaring Class":"org.apache.spark.scheduler.DAGSchedulerEventProcessActor","Method Name":"aroundReceive","File Name":"DAGScheduler.scala","Line Number":1375},{"Declaring Class":"akka.actor.ActorCell","Method Name":"receiveMessage","File Name":"ActorCell.scala","Line Number":516},{"Declaring Class":"akka.actor.ActorCell","Method Name":"invoke","File Name":"ActorCell.scala","Line Number":487},{"Declaring Class":"akka.dispatch.Mailbox","Method Name":"processMailbox","File Name":"Mailbox.scala","Line Number":238},{"Declaring Class":"akka.dispatch.Mailbox","Method Name":"run","File Name":"Mailbox.scala","Line Number":220},{"Declaring Class":"akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask","Method Name":"exec","File Name":"AbstractDispatcher.scala","Line Number":393},{"Declaring Class":"scala.concurrent.forkjoin.ForkJoinTask","Method Name":"doExec","File Name":"ForkJoinTask.java","Line Number":260},{"Declaring Class":"scala.concurrent.forkjoin.ForkJoinPool$WorkQueue","Method Name":"runTask","File Name":"ForkJoinPool.java","Line Number":1339},{"Declaring Class":"scala.concurrent.forkjoin.ForkJoinPool","Method Name":"runWorker","File Name":"ForkJoinPool.java","Line Number":1979},{"Declaring Class":"scala.concurrent.forkjoin.ForkJoinWorkerThread","Method Name":"run","File Name":"ForkJoinWorkerThread.java","Line Number":107}]}}} +{"Event":"SparkListenerJobStart","Job ID":2,"Stage Infos":[{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":6,"Name":"/Users/irashid/spark-examples/tmp_data/sums_2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums_2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line19.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line19.$read$$iwC$$iwC$$iwC.(:22)\n$line19.$read$$iwC$$iwC.(:24)\n$line19.$read$$iwC.(:26)\n$line19.$read.(:28)\n$line19.$read$.(:32)\n$line19.$read$.()\n$line19.$eval$.(:7)\n$line19.$eval$.()\n$line19.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[3]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD 
ID":6,"Name":"/Users/irashid/spark-examples/tmp_data/sums_2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums_2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line19.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line19.$read$$iwC$$iwC$$iwC.(:22)\n$line19.$read$$iwC$$iwC.(:24)\n$line19.$read$$iwC.(:26)\n$line19.$read.(:28)\n$line19.$read$.(:32)\n$line19.$read$.()\n$line19.$eval$.(:7)\n$line19.$eval$.()\n$line19.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":24,"Index":0,"Attempt":0,"Launch Time":1422981787191,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":25,"Index":1,"Attempt":0,"Launch Time":1422981787191,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":26,"Index":2,"Attempt":0,"Launch Time":1422981787192,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":27,"Index":3,"Attempt":0,"Launch Time":1422981787192,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":28,"Index":4,"Attempt":0,"Launch Time":1422981787193,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":29,"Index":5,"Attempt":0,"Launch Time":1422981787193,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":30,"Index":6,"Attempt":0,"Launch Time":1422981787194,"Executor 
ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":31,"Index":7,"Attempt":0,"Launch Time":1422981787194,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":25,"Index":1,"Attempt":0,"Launch Time":1422981787191,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787222,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":20,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":32}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":24,"Index":0,"Attempt":0,"Launch Time":1422981787191,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787223,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":20,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":32}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":26,"Index":2,"Attempt":0,"Launch Time":1422981787192,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787223,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":22,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":31,"Index":7,"Attempt":0,"Launch Time":1422981787194,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787223,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":19,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":29,"Index":5,"Attempt":0,"Launch Time":1422981787193,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787224,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":19,"Result Size":1812,"JVM GC Time":0,"Result 
Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":27,"Index":3,"Attempt":0,"Launch Time":1422981787192,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787224,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":6,"Executor Run Time":19,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":28,"Index":4,"Attempt":0,"Launch Time":1422981787193,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787225,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":22,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":30,"Index":6,"Attempt":0,"Launch Time":1422981787194,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1422981787225,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":21,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":6,"Name":"/Users/irashid/spark-examples/tmp_data/sums_2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums_2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line19.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line19.$read$$iwC$$iwC$$iwC.(:22)\n$line19.$read$$iwC$$iwC.(:24)\n$line19.$read$$iwC.(:26)\n$line19.$read.(:28)\n$line19.$read$.(:32)\n$line19.$read$.()\n$line19.$eval$.(:7)\n$line19.$eval$.()\n$line19.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1422981787191,"Completion Time":1422981787226,"Accumulables":[]}}
+{"Event":"SparkListenerJobEnd","Job ID":2,"Job Result":{"Result":"JobSucceeded"}}
+{"Event":"SparkListenerApplicationEnd","Timestamp":1422981788731}
diff --git a/core/src/test/resources/spark-events/local-1422981780767/SPARK_VERSION_1.2.0 b/core/src/test/resources/spark-events/local-1422981780767/SPARK_VERSION_1.2.0
new file mode 100755
index 0000000000000..e69de29bb2d1d
diff --git a/core/src/test/resources/spark-events/local-1425081759269/APPLICATION_COMPLETE b/core/src/test/resources/spark-events/local-1425081759269/APPLICATION_COMPLETE
new file mode 100755
index 0000000000000..e69de29bb2d1d
diff --git a/core/src/test/resources/spark-events/local-1425081759269/EVENT_LOG_1 b/core/src/test/resources/spark-events/local-1425081759269/EVENT_LOG_1
new file mode 100755
index 0000000000000..9745b36b09e44
--- /dev/null
+++ b/core/src/test/resources/spark-events/local-1425081759269/EVENT_LOG_1
@@ -0,0 +1,88 @@
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"<driver>","Host":"localhost","Port":57967},"Maximum Memory":278302556,"Timestamp":1425081759407}
+{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre","Java Version":"1.7.0_67 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.103","spark.eventLog.enabled":"true","spark.driver.port":"57965","spark.repl.class.uri":"http://192.168.1.103:57964","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.fileserver.uri":"http://192.168.1.103:57966","spark.tachyonStore.folderName":"spark-fd6c823a-8a18-4113-8306-1fa7bb623a7f","spark.app.id":"local-1425081759269"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.7","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle
Corporation","java.vm.specification.version":"1.7","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"24.65-b04","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.7.0_67-b01","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"51.0","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --conf spark.eventLog.enabled=true spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.7.0_67.jdk/Contents/Home/jre","java.version":"1.7.0_67","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/spark-assembly-1.2.0-hadoop2.4.0.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-core-3.2.10.jar":"System 
Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/conf":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1425081759269","Timestamp":1425081758277,"User":"irashid"} +{"Event":"SparkListenerJobStart","Job ID":0,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[0]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1425081762075,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1425081762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish 
Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1425081762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1425081762082,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1425081762083,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1425081762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1425081762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1425081762085,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1425081762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762632,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":521,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_6","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1425081762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762633,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":520,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_2","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task 
End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1425081762082,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762634,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":521,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_3","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1425081762084,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762634,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_5","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1425081762083,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762635,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_4","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1425081762075,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762636,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_0","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1425081762085,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762636,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize 
Time":9,"Executor Run Time":521,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_7","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1425081762081,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081762637,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":522,"Result Size":1268,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Updated Blocks":[{"Block ID":"rdd_0_1","Status":{"Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Memory Size":3500016,"Tachyon Size":0,"Disk Size":0}}]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":8,"Memory Size":28000128,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1425081762069,"Completion Time":1425081762637,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":1,"Stage Infos":[{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk 
Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]},{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"first at :17","Number of Tasks":1,"RDD Info":[{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.first(RDD.scala:1093)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line11.$read$$iwC$$iwC$$iwC.(:22)\n$line11.$read$$iwC$$iwC.(:24)\n$line11.$read$$iwC.(:26)\n$line11.$read.(:28)\n$line11.$read$.(:32)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[1,2]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1425081763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":1,"Attempt":0,"Launch Time":1425081763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":10,"Index":2,"Attempt":0,"Launch Time":1425081763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":3,"Attempt":0,"Launch Time":1425081763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":4,"Attempt":0,"Launch Time":1425081763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":5,"Attempt":0,"Launch Time":1425081763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":6,"Attempt":0,"Launch Time":1425081763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":7,"Attempt":0,"Launch Time":1425081763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":10,"Index":2,"Attempt":0,"Launch Time":1425081763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764001,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":406,"Result Size":1902,"JVM GC Time":18,"Result Serialization 
Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":138000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":1,"Attempt":0,"Launch Time":1425081763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764002,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":106000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":4,"Attempt":0,"Launch Time":1425081763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764002,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1645,"Shuffle Write Time":99000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1425081763578,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764003,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":123000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":7,"Attempt":0,"Launch Time":1425081763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764003,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":406,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":108000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":3,"Attempt":0,"Launch Time":1425081763579,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764004,"Failed":false,"Accumulables":[]},"Task 
Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1647,"Shuffle Write Time":97000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":6,"Attempt":0,"Launch Time":1425081763581,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764004,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":132000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":13,"Index":5,"Attempt":0,"Launch Time":1425081763580,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764005,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":407,"Result Size":1902,"JVM GC Time":18,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1648,"Shuffle Write Time":81000},"Input Metrics":{"Data Read Method":"Memory","Bytes Read":3500016}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1425081763578,"Completion Time":1425081764005,"Accumulables":[]}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage 
ID":2,"Stage Attempt ID":0,"Stage Name":"first at :17","Number of Tasks":1,"RDD Info":[{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.first(RDD.scala:1093)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line11.$read$$iwC$$iwC$$iwC.(:22)\n$line11.$read$$iwC$$iwC.(:24)\n$line11.$read$$iwC.(:26)\n$line11.$read.(:28)\n$line11.$read$.(:32)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1425081764014,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1425081764014,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764045,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":0,"Executor Run Time":28,"Result Size":1013,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"first at :17","Number of Tasks":1,"RDD Info":[{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.first(RDD.scala:1093)\n$line11.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line11.$read$$iwC$$iwC$$iwC.(:22)\n$line11.$read$$iwC$$iwC.(:24)\n$line11.$read$$iwC.(:26)\n$line11.$read.(:28)\n$line11.$read$.(:32)\n$line11.$read$.()\n$line11.$eval$.(:7)\n$line11.$eval$.()\n$line11.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1425081764014,"Completion Time":1425081764045,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":1,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":2,"Stage Infos":[{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"map at :14","Number of Tasks":8,"RDD Info":[{"RDD ID":1,"Name":"1","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":true,"Use Tachyon":false,"Deserialized":true,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:271)\n$line10.$read$$iwC$$iwC$$iwC$$iwC.(:14)\n$line10.$read$$iwC$$iwC$$iwC.(:19)\n$line10.$read$$iwC$$iwC.(:21)\n$line10.$read$$iwC.(:23)\n$line10.$read.(:25)\n$line10.$read$.(:29)\n$line10.$read$.()\n$line10.$eval$.(:7)\n$line10.$eval$.()\n$line10.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]},{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"saveAsTextFile at :19","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1164)\n$line13.$read$$iwC$$iwC$$iwC$$iwC.(:19)\n$line13.$read$$iwC$$iwC$$iwC.(:24)\n$line13.$read$$iwC$$iwC.(:26)\n$line13.$read$$iwC.(:28)\n$line13.$read.(:30)\n$line13.$read$.(:34)\n$line13.$read$.()\n$line13.$eval$.(:7)\n$line13.$eval$.()\n$line13.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[3,4]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"saveAsTextFile at :19","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1164)\n$line13.$read$$iwC$$iwC$$iwC$$iwC.(:19)\n$line13.$read$$iwC$$iwC$$iwC.(:24)\n$line13.$read$$iwC$$iwC.(:26)\n$line13.$read$$iwC.(:28)\n$line13.$read.(:30)\n$line13.$read$.(:34)\n$line13.$read$.()\n$line13.$eval$.(:7)\n$line13.$eval$.()\n$line13.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":0,"Attempt":0,"Launch Time":1425081764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":1,"Attempt":0,"Launch Time":1425081764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":2,"Attempt":0,"Launch Time":1425081764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":3,"Attempt":0,"Launch Time":1425081764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task 
ID":21,"Index":4,"Attempt":0,"Launch Time":1425081764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":5,"Attempt":0,"Launch Time":1425081764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":23,"Index":6,"Attempt":0,"Launch Time":1425081764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":24,"Index":7,"Attempt":0,"Launch Time":1425081764399,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":24,"Index":7,"Attempt":0,"Launch Time":1425081764399,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764642,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":240,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":19,"Index":2,"Attempt":0,"Launch Time":1425081764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764643,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":241,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":4,"Attempt":0,"Launch Time":1425081764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764643,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":240,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":5,"Attempt":0,"Launch Time":1425081764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish 
Time":1425081764645,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":241,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":6,"Attempt":0,"Launch Time":1425081764398,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764645,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":241,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":0,"Attempt":0,"Launch Time":1425081764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764646,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":243,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":1,"Attempt":0,"Launch Time":1425081764396,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764646,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":243,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":3,"Attempt":0,"Launch Time":1425081764397,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081764648,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":247,"Result Size":824,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":8,"Fetch Wait Time":0,"Remote Bytes Read":0}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"saveAsTextFile at :19","Number of Tasks":8,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory 
Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1164)\n$line13.$read$$iwC$$iwC$$iwC$$iwC.(:19)\n$line13.$read$$iwC$$iwC$$iwC.(:24)\n$line13.$read$$iwC$$iwC.(:26)\n$line13.$read$$iwC.(:28)\n$line13.$read.(:30)\n$line13.$read$.(:34)\n$line13.$read$.()\n$line13.$eval$.(:7)\n$line13.$eval$.()\n$line13.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1425081764396,"Completion Time":1425081764648,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":2,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerJobStart","Job ID":3,"Stage Infos":[{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":4,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line15.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line15.$read$$iwC$$iwC$$iwC.(:22)\n$line15.$read$$iwC$$iwC.(:24)\n$line15.$read$$iwC.(:26)\n$line15.$read.(:28)\n$line15.$read$.(:32)\n$line15.$read$.()\n$line15.$eval$.(:7)\n$line15.$eval$.()\n$line15.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[5]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD 
ID":4,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line15.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line15.$read$$iwC$$iwC$$iwC.(:22)\n$line15.$read$$iwC$$iwC.(:24)\n$line15.$read$$iwC.(:26)\n$line15.$read.(:28)\n$line15.$read$.(:32)\n$line15.$read$.()\n$line15.$eval$.(:7)\n$line15.$eval$.()\n$line15.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":25,"Index":0,"Attempt":0,"Launch Time":1425081765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":26,"Index":1,"Attempt":0,"Launch Time":1425081765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":27,"Index":2,"Attempt":0,"Launch Time":1425081765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":28,"Index":3,"Attempt":0,"Launch Time":1425081765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":29,"Index":4,"Attempt":0,"Launch Time":1425081765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":30,"Index":5,"Attempt":0,"Launch Time":1425081765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":31,"Index":6,"Attempt":0,"Launch Time":1425081765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":32,"Index":7,"Attempt":0,"Launch Time":1425081765029,"Executor 
ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":27,"Index":2,"Attempt":0,"Launch Time":1425081765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765045,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":30,"Index":5,"Attempt":0,"Launch Time":1425081765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765046,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":25,"Index":0,"Attempt":0,"Launch Time":1425081765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765046,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":12,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":32}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":29,"Index":4,"Attempt":0,"Launch Time":1425081765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765047,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":28,"Index":3,"Attempt":0,"Launch Time":1425081765027,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765047,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":12,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task 
ID":32,"Index":7,"Attempt":0,"Launch Time":1425081765029,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765048,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":13,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":31,"Index":6,"Attempt":0,"Launch Time":1425081765028,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765048,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":14,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":16}}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":26,"Index":1,"Attempt":0,"Launch Time":1425081765026,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1425081765049,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":12,"Result Size":1812,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":32}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"count at :17","Number of Tasks":8,"RDD Info":[{"RDD ID":5,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":4,"Name":"/Users/irashid/spark-examples/tmp_data/sums","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:910)\n$line15.$read$$iwC$$iwC$$iwC$$iwC.(:17)\n$line15.$read$$iwC$$iwC$$iwC.(:22)\n$line15.$read$$iwC$$iwC.(:24)\n$line15.$read$$iwC.(:26)\n$line15.$read.(:28)\n$line15.$read$.(:32)\n$line15.$read$.()\n$line15.$eval$.(:7)\n$line15.$eval$.()\n$line15.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:606)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1425081765026,"Completion Time":1425081765050,"Accumulables":[]}} 
+{"Event":"SparkListenerJobEnd","Job ID":3,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerApplicationEnd","Timestamp":1425081766912} diff --git a/core/src/test/resources/spark-events/local-1425081759269/SPARK_VERSION_1.2.0 b/core/src/test/resources/spark-events/local-1425081759269/SPARK_VERSION_1.2.0 new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1426533911241/APPLICATION_COMPLETE b/core/src/test/resources/spark-events/local-1426533911241/APPLICATION_COMPLETE new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1426533911241/EVENT_LOG_1 b/core/src/test/resources/spark-events/local-1426533911241/EVENT_LOG_1 new file mode 100755 index 0000000000000..9ef5bd5d92de5 --- /dev/null +++ b/core/src/test/resources/spark-events/local-1426533911241/EVENT_LOG_1 @@ -0,0 +1,24 @@ +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"","Host":"localhost","Port":58610},"Maximum Memory":278019440,"Timestamp":1426533911361} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","Java Version":"1.8.0_25 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.105","spark.eventLog.enabled":"true","spark.driver.port":"58608","spark.repl.class.uri":"http://192.168.1.105:58607","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.fileserver.uri":"http://192.168.1.105:58609","spark.tachyonStore.folderName":"spark-5e9b7f26-8e97-4b43-82d6-25c141530da9","spark.app.id":"local-1426533911241"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.vm.specification.version":"1.8","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"25.25-b02","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.8.0_25-b17","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","java.specification.name":"Java Platform API 
Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --conf spark.eventLog.enabled=true spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","java.version":"1.8.0_25","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/spark-assembly-1.2.0-hadoop2.4.0.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/etc/hadoop":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-core-3.2.10.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/conf":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1426533911241","Timestamp":1426533910242,"User":"irashid","App Attempt ID":"1"} +{"Event":"SparkListenerJobStart","Job ID":0,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"foreach at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[0]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"foreach at <console>:15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.<init>(<console>:15)\n$line9.$read$$iwC$$iwC$$iwC.<init>(<console>:20)\n$line9.$read$$iwC$$iwC.<init>(<console>:22)\n$line9.$read$$iwC.<init>(<console>:24)\n$line9.$read.<init>(<console>:26)\n$line9.$read$.<init>(<console>:30)\n$line9.$read$.<clinit>(<console>)\n$line9.$eval$.<init>(<console>:7)\n$line9.$eval$.<clinit>(<console>)\n$line9.$eval.$print(<console>)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1426533936515,"Executor ID":"<driver>","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1426533936521,"Executor ID":"<driver>","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1426533936522,"Executor ID":"<driver>","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1426533936522,"Executor ID":"<driver>","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1426533936522,"Executor ID":"<driver>","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task
Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1426533936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1426533936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1426533936524,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1426533936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936570,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"378","Value":"378"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":13,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1426533936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936572,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"572","Value":"950"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":13,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1426533936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936574,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"978","Value":"1928"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1426533936521,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936574,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"247","Value":"2175"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":14,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task 
ID":4,"Index":4,"Attempt":0,"Launch Time":1426533936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936574,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"678","Value":"2853"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1426533936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936575,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"897","Value":"3750"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1426533936524,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936575,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"1222","Value":"4972"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1426533936515,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426533936576,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"78","Value":"5050"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":14,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"foreach at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1426533936103,"Completion Time":1426533936579,"Accumulables":[{"ID":1,"Name":"my counter","Value":"5050"}]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerApplicationEnd","Timestamp":1426533945177} diff --git a/core/src/test/resources/spark-events/local-1426533911241/SPARK_VERSION_1.2.0 b/core/src/test/resources/spark-events/local-1426533911241/SPARK_VERSION_1.2.0 new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1426633911242/APPLICATION_COMPLETE b/core/src/test/resources/spark-events/local-1426633911242/APPLICATION_COMPLETE new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1426633911242/EVENT_LOG_1 b/core/src/test/resources/spark-events/local-1426633911242/EVENT_LOG_1 new file mode 100755 index 0000000000000..e7043282107d3 --- /dev/null +++ b/core/src/test/resources/spark-events/local-1426633911242/EVENT_LOG_1 @@ -0,0 +1,24 @@ +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"<driver>","Host":"localhost","Port":58610},"Maximum Memory":278019440,"Timestamp":1426633911361} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","Java Version":"1.8.0_25 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.105","spark.eventLog.enabled":"true","spark.driver.port":"58608","spark.repl.class.uri":"http://192.168.1.105:58607","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.fileserver.uri":"http://192.168.1.105:58609","spark.tachyonStore.folderName":"spark-5e9b7f26-8e97-4b43-82d6-25c141530da9","spark.app.id":"local-1426633911241"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle
Corporation","java.vm.specification.version":"1.8","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"25.25-b02","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.8.0_25-b17","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --conf spark.eventLog.enabled=true spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","java.version":"1.8.0_25","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/spark-assembly-1.2.0-hadoop2.4.0.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/etc/hadoop":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/lib/datanucleus-core-3.2.10.jar":"System 
Classpath","/Users/irashid/spark-examples/releases/spark-1.2.0-bin-hadoop2.4/conf":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1426533911241","Timestamp":1426633910242,"User":"irashid","App Attempt ID":"2"} +{"Event":"SparkListenerJobStart","Job ID":0,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"foreach at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}],"Stage IDs":[0]} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"foreach at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1426633936515,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1426633936521,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result 
Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1426633936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1426633936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1426633936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1426633936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1426633936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1426633936524,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1426633936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936570,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"378","Value":"378"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":13,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1426633936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936572,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"572","Value":"950"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":13,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1426633936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936574,"Failed":false,"Accumulables":[{"ID":1,"Name":"my 
counter","Update":"978","Value":"1928"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1426633936521,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936574,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"247","Value":"2175"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":14,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1426633936522,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936574,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"678","Value":"2853"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1426633936523,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936575,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"897","Value":"3750"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1426633936524,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936575,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"1222","Value":"4972"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1426633936515,"Executor ID":"","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1426633936576,"Failed":false,"Accumulables":[{"ID":1,"Name":"my counter","Update":"78","Value":"5050"}]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":14,"Executor Run Time":15,"Result Size":697,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage 
Name":"foreach at :15","Number of Tasks":8,"RDD Info":[{"RDD ID":0,"Name":"0","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":8,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.foreach(RDD.scala:765)\n$line9.$read$$iwC$$iwC$$iwC$$iwC.(:15)\n$line9.$read$$iwC$$iwC$$iwC.(:20)\n$line9.$read$$iwC$$iwC.(:22)\n$line9.$read$$iwC.(:24)\n$line9.$read.(:26)\n$line9.$read$.(:30)\n$line9.$read$.()\n$line9.$eval$.(:7)\n$line9.$eval$.()\n$line9.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)\norg.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)\norg.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)\norg.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)","Submission Time":1426633936103,"Completion Time":1426633936579,"Accumulables":[{"ID":1,"Name":"my counter","Value":"5050"}]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerApplicationEnd","Timestamp":1426633945177} diff --git a/core/src/test/resources/spark-events/local-1426633911242/SPARK_VERSION_1.2.0 b/core/src/test/resources/spark-events/local-1426633911242/SPARK_VERSION_1.2.0 new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/core/src/test/resources/spark-events/local-1430917381534 b/core/src/test/resources/spark-events/local-1430917381534 new file mode 100755 index 0000000000000..24ac478073e3d --- /dev/null +++ b/core/src/test/resources/spark-events/local-1430917381534 @@ -0,0 +1,231 @@ +{"Event":"SparkListenerLogStart","Spark Version":"1.4.0-SNAPSHOT"} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"localhost","Port":61103},"Maximum Memory":278019440,"Timestamp":1430917381651} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","Java Version":"1.8.0_25 (Oracle Corporation)","Scala Version":"version 2.10.4"},"Spark Properties":{"spark.driver.host":"192.168.1.102","spark.eventLog.enabled":"true","spark.driver.port":"61101","spark.repl.class.uri":"http://192.168.1.102:61100","spark.jars":"","spark.app.name":"Spark shell","spark.scheduler.mode":"FIFO","spark.executor.id":"driver","spark.master":"local[*]","spark.eventLog.dir":"/Users/irashid/github/kraps/core/src/test/resources/spark-events","spark.fileserver.uri":"http://192.168.1.102:61102","spark.tachyonStore.folderName":"spark-aaaf41b3-d1dd-447f-8951-acf51490758b","spark.app.id":"local-1430917381534"},"System Properties":{"java.io.tmpdir":"/var/folders/36/m29jw1z95qv4ywb1c4n0rz000000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle 
Corporation","java.vm.specification.version":"1.8","user.home":"/Users/irashid","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib","user.dir":"/Users/irashid/github/spark","java.library.path":"/Users/irashid/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"25.25-b02","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.8.0_25-b17","java.vm.info":"mixed mode","java.ext.dirs":"/Users/irashid/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","scala.usejavacp":"true","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.9.5","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"irashid","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --conf spark.eventLog.enabled=true --conf spark.eventLog.dir=/Users/irashid/github/kraps/core/src/test/resources/spark-events --class org.apache.spark.repl.Main spark-shell","java.home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home/jre","java.version":"1.8.0_25","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/etc/hadoop":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-rdbms-3.2.9.jar":"System Classpath","/Users/irashid/github/spark/conf/":"System Classpath","/Users/irashid/github/spark/assembly/target/scala-2.10/spark-assembly-1.4.0-SNAPSHOT-hadoop2.5.0.jar":"System Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-core-3.2.10.jar":"System 
Classpath","/Users/irashid/github/spark/lib_managed/jars/datanucleus-api-jdo-3.2.6.jar":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"local-1430917381534","Timestamp":1430917380893,"User":"irashid"} +{"Event":"SparkListenerJobStart","Job ID":0,"Submission Time":1430917386422,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"map at :26","Number of Tasks":100,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"/Users/irashid/spark-examples/tmp_data/raw_data","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":1,"Name":"/Users/irashid/spark-examples/tmp_data/raw_data","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:287)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:26)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:31)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:33)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC.(:35)\n$line20.$read$$iwC$$iwC$$iwC$$iwC.(:37)\n$line20.$read$$iwC$$iwC$$iwC.(:39)\n$line20.$read$$iwC$$iwC.(:41)\n$line20.$read$$iwC.(:43)\n$line20.$read.(:45)\n$line20.$read$.(:49)\n$line20.$read$.()\n$line20.$eval$.(:7)\n$line20.$eval$.()\n$line20.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)","Accumulables":[]},{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"collect at :26","Number of Tasks":10,"RDD Info":[{"RDD ID":4,"Name":"4","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":10,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.collect(RDD.scala:819)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:26)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:31)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:33)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC.(:35)\n$line20.$read$$iwC$$iwC$$iwC$$iwC.(:37)\n$line20.$read$$iwC$$iwC$$iwC.(:39)\n$line20.$read$$iwC$$iwC.(:41)\n$line20.$read$$iwC.(:43)\n$line20.$read.(:45)\n$line20.$read$.(:49)\n$line20.$read$.()\n$line20.$eval$.(:7)\n$line20.$eval$.()\n$line20.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)","Accumulables":[]}],"Stage IDs":[0,1],"Properties":{}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"map at <console>:26","Number of Tasks":100,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"/Users/irashid/spark-examples/tmp_data/raw_data","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":1,"Name":"/Users/irashid/spark-examples/tmp_data/raw_data","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:287)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:31)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:33)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)\n$line20.$read$$iwC$$iwC$$iwC$$iwC.<init>(<console>:37)\n$line20.$read$$iwC$$iwC$$iwC.<init>(<console>:39)\n$line20.$read$$iwC$$iwC.<init>(<console>:41)\n$line20.$read$$iwC.<init>(<console>:43)\n$line20.$read.<init>(<console>:45)\n$line20.$read$.<init>(<console>:49)\n$line20.$read$.<clinit>(<console>)\n$line20.$eval$.<init>(<console>:7)\n$line20.$eval$.<clinit>(<console>)\n$line20.$eval.$print(<console>)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)","Accumulables":[]},"Properties":{}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1430917386494,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1430917386502,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1430917386503,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch
Time":1430917386504,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1430917386504,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1430917386505,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1430917386505,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1430917386506,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":8,"Attempt":0,"Launch Time":1430917386914,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":9,"Attempt":0,"Launch Time":1430917386915,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":10,"Index":10,"Attempt":0,"Launch Time":1430917386916,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":11,"Attempt":0,"Launch Time":1430917386918,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":5,"Attempt":0,"Launch Time":1430917386505,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386919,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":30,"Executor Run Time":350,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":3675510,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End 
Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1430917386503,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386922,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":32,"Executor Run Time":348,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":89885,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1430917386502,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386923,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":31,"Executor Run Time":350,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":3934399,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":4,"Attempt":0,"Launch Time":1430917386504,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386923,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":31,"Executor Run Time":349,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":83022,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":12,"Attempt":0,"Launch Time":1430917386923,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387016,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":6,"Attempt":0,"Launch Time":1430917386505,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386924,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":29,"Executor Run Time":351,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":4016617,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":13,"Attempt":0,"Launch Time":1430917386924,"Executor 
ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387062,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":14,"Attempt":0,"Launch Time":1430917386925,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387019,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1430917386504,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386927,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":31,"Executor Run Time":349,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":1311694,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":15,"Attempt":0,"Launch Time":1430917386928,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387011,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1430917386494,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386929,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":32,"Executor Run Time":349,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":3842811,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":49294,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":7,"Attempt":0,"Launch Time":1430917386506,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917386929,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":31,"Executor Run Time":349,"Result Size":2010,"JVM GC Time":7,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":2579051,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":16,"Attempt":0,"Launch Time":1430917387001,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387099,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage 
ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":8,"Attempt":0,"Launch Time":1430917386914,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387002,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":80,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":121551,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60488,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":17,"Attempt":0,"Launch Time":1430917387005,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387128,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":11,"Attempt":0,"Launch Time":1430917386918,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387007,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":6,"Executor Run Time":75,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94507,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":18,"Attempt":0,"Launch Time":1430917387010,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387115,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":15,"Attempt":0,"Launch Time":1430917386928,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387011,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":76,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":602780,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":19,"Attempt":0,"Launch Time":1430917387012,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387106,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":20,"Attempt":0,"Launch Time":1430917387014,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result 
Time":0,"Finish Time":1430917387104,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":21,"Index":21,"Attempt":0,"Launch Time":1430917387015,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387111,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":10,"Index":10,"Attempt":0,"Launch Time":1430917386916,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387015,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":8,"Executor Run Time":73,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94709,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":9,"Attempt":0,"Launch Time":1430917386915,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387016,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":9,"Executor Run Time":84,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":101664,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":60489,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":12,"Attempt":0,"Launch Time":1430917386923,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387016,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":9,"Executor Run Time":77,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":102476,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":22,"Attempt":0,"Launch Time":1430917387018,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387119,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":14,"Attempt":0,"Launch Time":1430917386925,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387019,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize 
Time":6,"Executor Run Time":83,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":95646,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":23,"Index":23,"Attempt":0,"Launch Time":1430917387031,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387115,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":13,"Index":13,"Attempt":0,"Launch Time":1430917386924,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387062,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":9,"Executor Run Time":76,"Result Size":2010,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":95004,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":24,"Index":24,"Attempt":0,"Launch Time":1430917387098,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":16,"Attempt":0,"Launch Time":1430917387001,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387099,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":84,"Result Size":2010,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":108320,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":25,"Index":25,"Attempt":0,"Launch Time":1430917387103,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":20,"Attempt":0,"Launch Time":1430917387014,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387104,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":83,"Result Size":2010,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":97716,"Shuffle 
Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":26,"Index":26,"Attempt":0,"Launch Time":1430917387105,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":19,"Index":19,"Attempt":0,"Launch Time":1430917387012,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387106,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":84,"Result Size":2010,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":95788,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":27,"Index":27,"Attempt":0,"Launch Time":1430917387110,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":21,"Attempt":0,"Launch Time":1430917387015,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387111,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":88,"Result Size":2010,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100270,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":28,"Index":28,"Attempt":0,"Launch Time":1430917387113,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":29,"Index":29,"Attempt":0,"Launch Time":1430917387114,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":23,"Attempt":0,"Launch Time":1430917387031,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387115,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":65,"Result Size":2010,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write 
Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":91844,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":18,"Attempt":0,"Launch Time":1430917387010,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387115,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":92,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100836,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":30,"Index":30,"Attempt":0,"Launch Time":1430917387118,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":22,"Attempt":0,"Launch Time":1430917387018,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387119,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":93,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":143427,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":31,"Index":31,"Attempt":0,"Launch Time":1430917387127,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":17,"Attempt":0,"Launch Time":1430917387005,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387128,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":11,"Executor Run Time":91,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":99944,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":32,"Index":32,"Attempt":0,"Launch Time":1430917387148,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage 
Attempt ID":0,"Task Info":{"Task ID":33,"Index":33,"Attempt":0,"Launch Time":1430917387149,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":24,"Index":24,"Attempt":0,"Launch Time":1430917387098,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387150,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":43,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":157194,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":27,"Index":27,"Attempt":0,"Launch Time":1430917387110,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387151,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":32,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":102019,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":34,"Index":34,"Attempt":0,"Launch Time":1430917387156,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":26,"Index":26,"Attempt":0,"Launch Time":1430917387105,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387157,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":6,"Executor Run Time":38,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":108213,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":35,"Index":35,"Attempt":0,"Launch Time":1430917387161,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":28,"Index":28,"Attempt":0,"Launch Time":1430917387113,"Executor 
ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387162,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":29,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":104299,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":36,"Index":36,"Attempt":0,"Launch Time":1430917387164,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":25,"Index":25,"Attempt":0,"Launch Time":1430917387103,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387164,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":49,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94134,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":37,"Index":37,"Attempt":0,"Launch Time":1430917387165,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":38,"Index":38,"Attempt":0,"Launch Time":1430917387166,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":29,"Index":29,"Attempt":0,"Launch Time":1430917387114,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387166,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":39,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":114938,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":39,"Index":39,"Attempt":0,"Launch Time":1430917387180,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task 
ID":30,"Index":30,"Attempt":0,"Launch Time":1430917387118,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387180,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":34,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":119770,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":32,"Index":32,"Attempt":0,"Launch Time":1430917387148,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387181,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":89603,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":40,"Index":40,"Attempt":0,"Launch Time":1430917387197,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":34,"Index":34,"Attempt":0,"Launch Time":1430917387156,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387198,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":6,"Executor Run Time":27,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":127746,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":41,"Index":41,"Attempt":0,"Launch Time":1430917387200,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":31,"Index":31,"Attempt":0,"Launch Time":1430917387127,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387201,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":36,"Executor Run Time":24,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":92619,"Shuffle Records 
Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":42,"Index":42,"Attempt":0,"Launch Time":1430917387203,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":36,"Index":36,"Attempt":0,"Launch Time":1430917387164,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387204,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":29,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":123855,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":43,"Index":43,"Attempt":0,"Launch Time":1430917387204,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":44,"Index":44,"Attempt":0,"Launch Time":1430917387205,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":45,"Index":45,"Attempt":0,"Launch Time":1430917387206,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":37,"Index":37,"Attempt":0,"Launch Time":1430917387165,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387207,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":32,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":111869,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":33,"Index":33,"Attempt":0,"Launch Time":1430917387149,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387207,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":43,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle 
Bytes Written":1710,"Shuffle Write Time":118329,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":46,"Index":46,"Attempt":0,"Launch Time":1430917387210,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":35,"Index":35,"Attempt":0,"Launch Time":1430917387161,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387211,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":35,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":160963,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":47,"Index":47,"Attempt":0,"Launch Time":1430917387212,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":39,"Index":39,"Attempt":0,"Launch Time":1430917387180,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387212,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":98748,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":38,"Index":38,"Attempt":0,"Launch Time":1430917387166,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387213,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":31,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":131158,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":48,"Index":48,"Attempt":0,"Launch Time":1430917387220,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task 
Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":40,"Index":40,"Attempt":0,"Launch Time":1430917387197,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387221,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":14,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94792,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":49,"Index":49,"Attempt":0,"Launch Time":1430917387223,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":41,"Index":41,"Attempt":0,"Launch Time":1430917387200,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387224,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":16,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":90765,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":50,"Index":50,"Attempt":0,"Launch Time":1430917387240,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":44,"Index":44,"Attempt":0,"Launch Time":1430917387205,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387242,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":98293,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":51,"Index":51,"Attempt":0,"Launch Time":1430917387242,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":52,"Index":52,"Attempt":0,"Launch Time":1430917387243,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} 
+{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":43,"Index":43,"Attempt":0,"Launch Time":1430917387204,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387243,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":16,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":171516,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":45,"Index":45,"Attempt":0,"Launch Time":1430917387206,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387243,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":19,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":92985,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":53,"Index":53,"Attempt":0,"Launch Time":1430917387244,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":54,"Index":54,"Attempt":0,"Launch Time":1430917387244,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":42,"Index":42,"Attempt":0,"Launch Time":1430917387203,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387245,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":10,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":103713,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":47,"Index":47,"Attempt":0,"Launch Time":1430917387212,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387245,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory 
Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":103015,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":55,"Index":55,"Attempt":0,"Launch Time":1430917387246,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":56,"Index":56,"Attempt":0,"Launch Time":1430917387249,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":48,"Index":48,"Attempt":0,"Launch Time":1430917387220,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387250,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":24,"Result Size":2065,"JVM GC Time":6,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":139844,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":46,"Index":46,"Attempt":0,"Launch Time":1430917387210,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387253,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":31,"Result Size":2065,"JVM GC Time":6,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":113322,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":57,"Index":57,"Attempt":0,"Launch Time":1430917387257,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":49,"Index":49,"Attempt":0,"Launch Time":1430917387223,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387257,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":7,"Executor Run Time":23,"Result Size":2065,"JVM GC Time":6,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94984,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} 
+{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":58,"Index":58,"Attempt":0,"Launch Time":1430917387263,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":51,"Index":51,"Attempt":0,"Launch Time":1430917387242,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387263,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":96013,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":59,"Index":59,"Attempt":0,"Launch Time":1430917387265,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":50,"Index":50,"Attempt":0,"Launch Time":1430917387240,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387266,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":90836,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":60,"Index":60,"Attempt":0,"Launch Time":1430917387269,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":61,"Index":61,"Attempt":0,"Launch Time":1430917387270,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":52,"Index":52,"Attempt":0,"Launch Time":1430917387243,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387271,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":89664,"Shuffle Records Written":10},"Input Metrics":{"Data 
Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":62,"Index":62,"Attempt":0,"Launch Time":1430917387272,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":53,"Index":53,"Attempt":0,"Launch Time":1430917387244,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387273,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":6,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":92835,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":63,"Index":63,"Attempt":0,"Launch Time":1430917387276,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":55,"Index":55,"Attempt":0,"Launch Time":1430917387246,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387276,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":21,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":108309,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":64,"Index":64,"Attempt":0,"Launch Time":1430917387277,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":57,"Index":57,"Attempt":0,"Launch Time":1430917387257,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387278,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":16,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":96849,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":65,"Index":65,"Attempt":0,"Launch Time":1430917387279,"Executor 
ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":56,"Index":56,"Attempt":0,"Launch Time":1430917387249,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387280,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":20,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":90329,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":66,"Index":66,"Attempt":0,"Launch Time":1430917387285,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":58,"Index":58,"Attempt":0,"Launch Time":1430917387263,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387286,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":16,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":97521,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":67,"Index":67,"Attempt":0,"Launch Time":1430917387288,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":59,"Index":59,"Attempt":0,"Launch Time":1430917387265,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387288,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100753,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":54,"Index":54,"Attempt":0,"Launch Time":1430917387244,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387303,"Failed":false,"Accumulables":[]},"Task 
Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":90506,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":68,"Index":68,"Attempt":0,"Launch Time":1430917387306,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":60,"Index":60,"Attempt":0,"Launch Time":1430917387269,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387307,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":8,"Executor Run Time":28,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":156916,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":69,"Index":69,"Attempt":0,"Launch Time":1430917387308,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":62,"Index":62,"Attempt":0,"Launch Time":1430917387272,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387309,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":32,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100396,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":70,"Index":70,"Attempt":0,"Launch Time":1430917387311,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":64,"Index":64,"Attempt":0,"Launch Time":1430917387277,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387311,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":25,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes 
Written":1710,"Shuffle Write Time":104087,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":71,"Index":71,"Attempt":0,"Launch Time":1430917387313,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":72,"Index":72,"Attempt":0,"Launch Time":1430917387314,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":61,"Index":61,"Attempt":0,"Launch Time":1430917387270,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387315,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":4,"Executor Run Time":36,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100426,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":73,"Index":73,"Attempt":0,"Launch Time":1430917387316,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":63,"Index":63,"Attempt":0,"Launch Time":1430917387276,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387316,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":20,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":102779,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":74,"Index":74,"Attempt":0,"Launch Time":1430917387318,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":66,"Index":66,"Attempt":0,"Launch Time":1430917387285,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387319,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":27,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes 
Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100823,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":65,"Index":65,"Attempt":0,"Launch Time":1430917387279,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387321,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":27,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":99123,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":75,"Index":75,"Attempt":0,"Launch Time":1430917387323,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":67,"Index":67,"Attempt":0,"Launch Time":1430917387288,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387323,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":14,"Executor Run Time":19,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":92741,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":76,"Index":76,"Attempt":0,"Launch Time":1430917387327,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":68,"Index":68,"Attempt":0,"Launch Time":1430917387306,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387328,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":16,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":101750,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":77,"Index":77,"Attempt":0,"Launch Time":1430917387329,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} 
+{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":69,"Index":69,"Attempt":0,"Launch Time":1430917387308,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387331,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":89437,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":78,"Index":78,"Attempt":0,"Launch Time":1430917387338,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":71,"Index":71,"Attempt":0,"Launch Time":1430917387313,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387339,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":22,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":183323,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":79,"Index":79,"Attempt":0,"Launch Time":1430917387339,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":70,"Index":70,"Attempt":0,"Launch Time":1430917387311,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387340,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":24,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":105933,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":80,"Index":80,"Attempt":0,"Launch Time":1430917387341,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":74,"Index":74,"Attempt":0,"Launch Time":1430917387318,"Executor 
ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387341,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":98047,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":81,"Index":81,"Attempt":0,"Launch Time":1430917387342,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":72,"Index":72,"Attempt":0,"Launch Time":1430917387314,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387343,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":22,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":139584,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":82,"Index":82,"Attempt":0,"Launch Time":1430917387349,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":75,"Index":75,"Attempt":0,"Launch Time":1430917387323,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387359,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":20,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":134733,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":83,"Index":83,"Attempt":0,"Launch Time":1430917387360,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":73,"Index":73,"Attempt":0,"Launch Time":1430917387316,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387360,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":16,"Executor 
Run Time":25,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":197728,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":84,"Index":84,"Attempt":0,"Launch Time":1430917387368,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":77,"Index":77,"Attempt":0,"Launch Time":1430917387329,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387368,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":30,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":104879,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":85,"Index":85,"Attempt":0,"Launch Time":1430917387371,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":76,"Index":76,"Attempt":0,"Launch Time":1430917387327,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387371,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":38,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":117198,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":86,"Index":86,"Attempt":0,"Launch Time":1430917387374,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":87,"Index":87,"Attempt":0,"Launch Time":1430917387374,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":79,"Index":79,"Attempt":0,"Launch Time":1430917387339,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387375,"Failed":false,"Accumulables":[]},"Task 
Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":28,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":760569,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":80,"Index":80,"Attempt":0,"Launch Time":1430917387341,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387375,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":13,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":98069,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":88,"Index":88,"Attempt":0,"Launch Time":1430917387376,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":78,"Index":78,"Attempt":0,"Launch Time":1430917387338,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387377,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":31,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94029,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":89,"Index":89,"Attempt":0,"Launch Time":1430917387384,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":82,"Index":82,"Attempt":0,"Launch Time":1430917387349,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387385,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":23,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":103837,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":90,"Index":90,"Attempt":0,"Launch Time":1430917387385,"Executor 
ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":81,"Index":81,"Attempt":0,"Launch Time":1430917387342,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387386,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":38,"Result Size":2065,"JVM GC Time":5,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":102935,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":91,"Index":91,"Attempt":0,"Launch Time":1430917387394,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":84,"Index":84,"Attempt":0,"Launch Time":1430917387368,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387395,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":22,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":97176,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":92,"Index":92,"Attempt":0,"Launch Time":1430917387400,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":85,"Index":85,"Attempt":0,"Launch Time":1430917387371,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387401,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":19,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":103743,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":93,"Index":93,"Attempt":0,"Launch Time":1430917387402,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task 
Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":86,"Index":86,"Attempt":0,"Launch Time":1430917387374,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387402,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":16,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":95848,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":94,"Index":94,"Attempt":0,"Launch Time":1430917387404,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":83,"Index":83,"Attempt":0,"Launch Time":1430917387360,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387405,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":37,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":134049,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":95,"Index":95,"Attempt":0,"Launch Time":1430917387406,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":96,"Index":96,"Attempt":0,"Launch Time":1430917387406,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":97,"Index":97,"Attempt":0,"Launch Time":1430917387407,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":88,"Index":88,"Attempt":0,"Launch Time":1430917387376,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387407,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":19,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":94269,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} 
+{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":90,"Index":90,"Attempt":0,"Launch Time":1430917387385,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387408,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":98472,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":98,"Index":98,"Attempt":0,"Launch Time":1430917387410,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":87,"Index":87,"Attempt":0,"Launch Time":1430917387374,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387410,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":12,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":102159,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":89,"Index":89,"Attempt":0,"Launch Time":1430917387384,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387417,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":18,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":97226,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":99,"Index":99,"Attempt":0,"Launch Time":1430917387426,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":91,"Index":91,"Attempt":0,"Launch Time":1430917387394,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387427,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":8,"Executor Run Time":21,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":1,"Memory 
Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":98301,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":94,"Index":94,"Attempt":0,"Launch Time":1430917387404,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387441,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":5,"Executor Run Time":22,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":102455,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":92,"Index":92,"Attempt":0,"Launch Time":1430917387400,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387442,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":31,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":114766,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":96,"Index":96,"Attempt":0,"Launch Time":1430917387406,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387443,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":26,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":100678,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":93,"Index":93,"Attempt":0,"Launch Time":1430917387402,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387443,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":32,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":95818,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task 
Info":{"Task ID":95,"Index":95,"Attempt":0,"Launch Time":1430917387406,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387444,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":28,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":117803,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":98,"Index":98,"Attempt":0,"Launch Time":1430917387410,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387444,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":7,"Executor Run Time":22,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":105039,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":99,"Index":99,"Attempt":0,"Launch Time":1430917387426,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387448,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":17,"Result Size":2065,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":133964,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70565,"Records Read":10000}}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":97,"Index":97,"Attempt":0,"Launch Time":1430917387407,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387450,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":38,"Result Size":2065,"JVM GC Time":4,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Write Metrics":{"Shuffle Bytes Written":1710,"Shuffle Write Time":131987,"Shuffle Records Written":10},"Input Metrics":{"Data Read Method":"Hadoop","Bytes Read":70564,"Records Read":10000}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"map at :26","Number of Tasks":100,"RDD Info":[{"RDD ID":3,"Name":"3","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":2,"Name":"2","Storage Level":{"Use Disk":false,"Use Memory":false,"Use 
Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":0,"Name":"/Users/irashid/spark-examples/tmp_data/raw_data","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0},{"RDD ID":1,"Name":"/Users/irashid/spark-examples/tmp_data/raw_data","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":100,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.map(RDD.scala:287)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:26)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:31)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:33)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC.(:35)\n$line20.$read$$iwC$$iwC$$iwC$$iwC.(:37)\n$line20.$read$$iwC$$iwC$$iwC.(:39)\n$line20.$read$$iwC$$iwC.(:41)\n$line20.$read$$iwC.(:43)\n$line20.$read.(:45)\n$line20.$read$.(:49)\n$line20.$read$.()\n$line20.$eval$.(:7)\n$line20.$eval$.()\n$line20.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)","Submission Time":1430917386485,"Completion Time":1430917387451,"Accumulables":[]}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"collect at :26","Number of Tasks":10,"RDD Info":[{"RDD ID":4,"Name":"4","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":10,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk Size":0}],"Details":"org.apache.spark.rdd.RDD.collect(RDD.scala:819)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:26)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:31)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:33)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC.(:35)\n$line20.$read$$iwC$$iwC$$iwC$$iwC.(:37)\n$line20.$read$$iwC$$iwC$$iwC.(:39)\n$line20.$read$$iwC$$iwC.(:41)\n$line20.$read$$iwC.(:43)\n$line20.$read.(:45)\n$line20.$read$.(:49)\n$line20.$read$.()\n$line20.$eval$.(:7)\n$line20.$eval$.()\n$line20.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)","Accumulables":[]},"Properties":{}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":100,"Index":0,"Attempt":0,"Launch Time":1430917387462,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":101,"Index":1,"Attempt":0,"Launch Time":1430917387464,"Executor 
ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":102,"Index":2,"Attempt":0,"Launch Time":1430917387465,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":103,"Index":3,"Attempt":0,"Launch Time":1430917387465,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":104,"Index":4,"Attempt":0,"Launch Time":1430917387465,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":105,"Index":5,"Attempt":0,"Launch Time":1430917387466,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":106,"Index":6,"Attempt":0,"Launch Time":1430917387466,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":107,"Index":7,"Attempt":0,"Launch Time":1430917387466,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":108,"Index":8,"Attempt":0,"Launch Time":1430917387546,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":103,"Index":3,"Attempt":0,"Launch Time":1430917387465,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387546,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":75,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":109,"Index":9,"Attempt":0,"Launch Time":1430917387546,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task 
ID":105,"Index":5,"Attempt":0,"Launch Time":1430917387466,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387547,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":76,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":100,"Index":0,"Attempt":0,"Launch Time":1430917387462,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387548,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":74,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":107,"Index":7,"Attempt":0,"Launch Time":1430917387466,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387548,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":76,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":101,"Index":1,"Attempt":0,"Launch Time":1430917387464,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387548,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":1,"Executor Run Time":76,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":1,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":106,"Index":6,"Attempt":0,"Launch Time":1430917387466,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387549,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":75,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks 
Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":104,"Index":4,"Attempt":0,"Launch Time":1430917387465,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387549,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":3,"Executor Run Time":74,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":1,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":102,"Index":2,"Attempt":0,"Launch Time":1430917387465,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387550,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":79,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":108,"Index":8,"Attempt":0,"Launch Time":1430917387546,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387583,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":31,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":0,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":109,"Index":9,"Attempt":0,"Launch Time":1430917387546,"Executor ID":"driver","Host":"localhost","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1430917387585,"Failed":false,"Accumulables":[]},"Task Metrics":{"Host Name":"localhost","Executor Deserialize Time":2,"Executor Run Time":30,"Result Size":1034,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":100,"Fetch Wait Time":1,"Remote Bytes Read":0,"Local Bytes Read":17100,"Total Records Read":100}}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"collect at :26","Number of Tasks":10,"RDD Info":[{"RDD ID":4,"Name":"4","Storage Level":{"Use Disk":false,"Use Memory":false,"Use Tachyon":false,"Deserialized":false,"Replication":1},"Number of Partitions":10,"Number of Cached Partitions":0,"Memory Size":0,"Tachyon Size":0,"Disk 
Size":0}],"Details":"org.apache.spark.rdd.RDD.collect(RDD.scala:819)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:26)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:31)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:33)\n$line20.$read$$iwC$$iwC$$iwC$$iwC$$iwC.(:35)\n$line20.$read$$iwC$$iwC$$iwC$$iwC.(:37)\n$line20.$read$$iwC$$iwC$$iwC.(:39)\n$line20.$read$$iwC$$iwC.(:41)\n$line20.$read$$iwC.(:43)\n$line20.$read.(:45)\n$line20.$read$.(:49)\n$line20.$read$.()\n$line20.$eval$.(:7)\n$line20.$eval$.()\n$line20.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:483)\norg.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)","Submission Time":1430917387462,"Completion Time":1430917387585,"Accumulables":[]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Completion Time":1430917387589,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"SparkListenerApplicationEnd","Timestamp":1430917391398} diff --git a/core/src/test/scala/org/apache/spark/JsonTestUtils.scala b/core/src/test/scala/org/apache/spark/JsonTestUtils.scala new file mode 100644 index 0000000000000..ba367cd476146 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/JsonTestUtils.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark + +import org.json4s._ +import org.json4s.jackson.JsonMethods + +trait JsonTestUtils { + def assertValidDataInJson(validateJson: JValue, expectedJson: JValue) { + val Diff(c, a, d) = validateJson.diff(expectedJson) + val validatePretty = JsonMethods.pretty(validateJson) + val expectedPretty = JsonMethods.pretty(expectedJson) + val errorMessage = s"Expected:\n$expectedPretty\nFound:\n$validatePretty" + import org.scalactic.TripleEquals._ + assert(c === JNothing, s"$errorMessage\nChanged:\n${JsonMethods.pretty(c)}") + assert(a === JNothing, s"$errorMessage\nAdded:\n${JsonMethods.pretty(a)}") + assert(d === JNothing, s"$errorMessage\nDeleted:\n${JsonMethods.pretty(d)}") + } + +} diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala index b58d62567afe1..e04a79284175c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala @@ -28,9 +28,9 @@ import org.scalatest.FunSuite import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, WorkerStateResponse} import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, RecoveryState, WorkerInfo} import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner} -import org.apache.spark.{SecurityManager, SparkConf} +import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf} -class JsonProtocolSuite extends FunSuite { +class JsonProtocolSuite extends FunSuite with JsonTestUtils { test("writeApplicationInfo") { val output = JsonProtocol.writeApplicationInfo(createAppInfo()) @@ -136,16 +136,6 @@ class JsonProtocolSuite extends FunSuite { case e: JsonParseException => fail("Invalid Json detected", e) } } - - def assertValidDataInJson(validateJson: JValue, expectedJson: JValue) { - val Diff(c, a, d) = validateJson diff expectedJson - val validatePretty = JsonMethods.pretty(validateJson) - val expectedPretty = JsonMethods.pretty(expectedJson) - val errorMessage = s"Expected:\n$expectedPretty\nFound:\n$validatePretty" - assert(c === JNothing, s"$errorMessage\nChanged:\n${JsonMethods.pretty(c)}") - assert(a === JNothing, s"$errorMessage\nAdded:\n${JsonMethods.pretty(a)}") - assert(d === JNothing, s"$errorMessage\nDelected:\n${JsonMethods.pretty(d)}") - } } object JsonConstants { diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 71ba9c18257b8..318ab5dbc4804 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -14,22 +14,164 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.spark.deploy.history -import javax.servlet.http.HttpServletRequest - -import scala.collection.mutable +import java.io.{File, FileInputStream, FileWriter, IOException} +import java.net.{HttpURLConnection, URL} +import javax.servlet.http.{HttpServletRequest, HttpServletResponse} -import org.apache.hadoop.fs.Path -import org.mockito.Mockito.{when} -import org.scalatest.FunSuite -import org.scalatest.Matchers +import org.apache.commons.io.{FileUtils, IOUtils} +import org.mockito.Mockito.when +import org.scalatest.{BeforeAndAfter, FunSuite, Matchers} import org.scalatest.mock.MockitoSugar +import org.apache.spark.{JsonTestUtils, SecurityManager, SparkConf} import org.apache.spark.ui.SparkUI -class HistoryServerSuite extends FunSuite with Matchers with MockitoSugar { +/** + * A collection of tests against the history server, including comparing responses from the JSON + * metrics API to a set of known "golden files". If new endpoints / parameters are added, + * cases should be added to this test suite. The expected outcomes can be generated by running + * the HistoryServerSuite.main. Note that this will blindly generate new expectation files matching + * the current behavior -- the developer must verify that behavior is correct. + * + * Similarly, if the behavior is changed, HistoryServerSuite.main can be run to update the + * expectations. However, in general this should be done with extreme caution, as the metrics + * are considered part of Spark's public API. + */ +class HistoryServerSuite extends FunSuite with BeforeAndAfter with Matchers with MockitoSugar + with JsonTestUtils { + + private val logDir = new File("src/test/resources/spark-events") + private val expRoot = new File("src/test/resources/HistoryServerExpectations/") + + private var provider: FsHistoryProvider = null + private var server: HistoryServer = null + private var port: Int = -1 + + def init(): Unit = { + val conf = new SparkConf() + .set("spark.history.fs.logDirectory", logDir.getAbsolutePath) + .set("spark.history.fs.updateInterval", "0") + .set("spark.testing", "true") + provider = new FsHistoryProvider(conf) + provider.checkForLogs() + val securityManager = new SecurityManager(conf) + + server = new HistoryServer(conf, provider, securityManager, 18080) + server.initialize() + server.bind() + port = server.boundPort + } + + def stop(): Unit = { + server.stop() + } + + before { + init() + } + + after { + stop() + } + + val cases = Seq( + "application list json" -> "applications", + "completed app list json" -> "applications?status=completed", + "running app list json" -> "applications?status=running", + "minDate app list json" -> "applications?minDate=2015-02-10", + "maxDate app list json" -> "applications?maxDate=2015-02-10", + "maxDate2 app list json" -> "applications?maxDate=2015-02-03T10:42:40.000CST", + "one app json" -> "applications/local-1422981780767", + "one app multi-attempt json" -> "applications/local-1426533911241", + "job list json" -> "applications/local-1422981780767/jobs", + "job list from multi-attempt app json(1)" -> "applications/local-1426533911241/1/jobs", + "job list from multi-attempt app json(2)" -> "applications/local-1426533911241/2/jobs", + "one job json" -> "applications/local-1422981780767/jobs/0", + "succeeded job list json" -> "applications/local-1422981780767/jobs?status=succeeded", + "succeeded&failed job list json" -> + "applications/local-1422981780767/jobs?status=succeeded&status=failed", + "executor list json" -> "applications/local-1422981780767/executors", +
"stage list json" -> "applications/local-1422981780767/stages", + "complete stage list json" -> "applications/local-1422981780767/stages?status=complete", + "failed stage list json" -> "applications/local-1422981780767/stages?status=failed", + "one stage json" -> "applications/local-1422981780767/stages/1", + "one stage attempt json" -> "applications/local-1422981780767/stages/1/0", + + "stage task summary w shuffle write" + -> "applications/local-1430917381534/stages/0/0/taskSummary", + "stage task summary w shuffle read" + -> "applications/local-1430917381534/stages/1/0/taskSummary", + "stage task summary w/ custom quantiles" -> + "applications/local-1430917381534/stages/0/0/taskSummary?quantiles=0.01,0.5,0.99", + + "stage task list" -> "applications/local-1430917381534/stages/0/0/taskList", + "stage task list w/ offset & length" -> + "applications/local-1430917381534/stages/0/0/taskList?offset=10&length=50", + "stage task list w/ sortBy" -> + "applications/local-1430917381534/stages/0/0/taskList?sortBy=DECREASING_RUNTIME", + "stage task list w/ sortBy short names: -runtime" -> + "applications/local-1430917381534/stages/0/0/taskList?sortBy=-runtime", + "stage task list w/ sortBy short names: runtime" -> + "applications/local-1430917381534/stages/0/0/taskList?sortBy=runtime", + + "stage list with accumulable json" -> "applications/local-1426533911241/1/stages", + "stage with accumulable json" -> "applications/local-1426533911241/1/stages/0/0", + "stage task list from multi-attempt app json(1)" -> + "applications/local-1426533911241/1/stages/0/0/taskList", + "stage task list from multi-attempt app json(2)" -> + "applications/local-1426533911241/2/stages/0/0/taskList", + + "rdd list storage json" -> "applications/local-1422981780767/storage/rdd", + "one rdd storage json" -> "applications/local-1422981780767/storage/rdd/0" + ) + + // run a bunch of characterization tests -- just verify the behavior is the same as what is saved + // in the test resource folder + cases.foreach { case (name, path) => + test(name) { + val (code, jsonOpt, errOpt) = getContentAndCode(path) + code should be (HttpServletResponse.SC_OK) + jsonOpt should be ('defined) + errOpt should be (None) + val json = jsonOpt.get + val exp = IOUtils.toString(new FileInputStream( + new File(expRoot, HistoryServerSuite.sanitizePath(name) + "_expectation.json"))) + // compare the ASTs so formatting differences don't cause failures + import org.json4s._ + import org.json4s.jackson.JsonMethods._ + val jsonAst = parse(json) + val expAst = parse(exp) + assertValidDataInJson(jsonAst, expAst) + } + } + + test("response codes on bad paths") { + val badAppId = getContentAndCode("applications/foobar") + badAppId._1 should be (HttpServletResponse.SC_NOT_FOUND) + badAppId._3 should be (Some("unknown app: foobar")) + + val badStageId = getContentAndCode("applications/local-1422981780767/stages/12345") + badStageId._1 should be (HttpServletResponse.SC_NOT_FOUND) + badStageId._3 should be (Some("unknown stage: 12345")) + + val badStageAttemptId = getContentAndCode("applications/local-1422981780767/stages/1/1") + badStageAttemptId._1 should be (HttpServletResponse.SC_NOT_FOUND) + badStageAttemptId._3 should be (Some("unknown attempt for stage 1. 
Found attempts: [0]")) + + val badStageId2 = getContentAndCode("applications/local-1422981780767/stages/flimflam") + badStageId2._1 should be (HttpServletResponse.SC_NOT_FOUND) + // will take some mucking w/ jersey to get a better error msg in this case + + val badQuantiles = getContentAndCode( + "applications/local-1430917381534/stages/0/0/taskSummary?quantiles=foo,0.1") + badQuantiles._1 should be (HttpServletResponse.SC_BAD_REQUEST) + badQuantiles._3 should be (Some("Bad value for parameter \"quantiles\". Expected a double, " + + "got \"foo\"")) + + getContentAndCode("foobar")._1 should be (HttpServletResponse.SC_NOT_FOUND) + } test("generate history page with relative links") { val historyServer = mock[HistoryServer] @@ -54,4 +196,75 @@ class HistoryServerSuite extends FunSuite with Matchers with MockitoSugar { } yield (attrs.toString) justHrefs should contain(link) } + + def getContentAndCode(path: String, port: Int = port): (Int, Option[String], Option[String]) = { + HistoryServerSuite.getContentAndCode(new URL(s"http://localhost:$port/json/v1/$path")) + } + + def getUrl(path: String): String = { + HistoryServerSuite.getUrl(new URL(s"http://localhost:$port/json/v1/$path")) + } + + def generateExpectation(name: String, path: String): Unit = { + val json = getUrl(path) + val file = new File(expRoot, HistoryServerSuite.sanitizePath(name) + "_expectation.json") + val out = new FileWriter(file) + out.write(json) + out.close() + } +} + +object HistoryServerSuite { + def main(args: Array[String]): Unit = { + // generate the "expected" results for the characterization tests. Just blindly assume the + // current behavior is correct, and write out the returned json to the test/resource files + + val suite = new HistoryServerSuite + FileUtils.deleteDirectory(suite.expRoot) + suite.expRoot.mkdirs() + try { + suite.init() + suite.cases.foreach { case (name, path) => + suite.generateExpectation(name, path) + } + } finally { + suite.stop() + } + } + + def getContentAndCode(url: URL): (Int, Option[String], Option[String]) = { + val connection = url.openConnection().asInstanceOf[HttpURLConnection] + connection.setRequestMethod("GET") + connection.connect() + val code = connection.getResponseCode() + val inString = try { + val in = Option(connection.getInputStream()) + in.map(IOUtils.toString) + } catch { + case io: IOException => None + } + val errString = try { + val err = Option(connection.getErrorStream()) + err.map(IOUtils.toString) + } catch { + case io: IOException => None + } + (code, inString, errString) + } + + + def sanitizePath(path: String): String = { + // this doesn't need to be perfect, just good enough to avoid collisions + path.replaceAll("\\W", "_") + } + + def getUrl(path: URL): String = { + val (code, resultOpt, error) = getContentAndCode(path) + if (code == 200) { + resultOpt.get + } else { + throw new RuntimeException( + "got code: " + code + " when getting " + path + " w/ error: " + error) + } + } } diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala new file mode 100644 index 0000000000000..5274df904d395 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamTest.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import org.scalatest.{Matchers, FunSuite} + +class SimpleDateParamTest extends FunSuite with Matchers { + + test("date parsing") { + new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L) + new SimpleDateParam("2015-02-20T17:21:17.190CST").timestamp should be (1424474477190L) + new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L) // GMT + } + +} diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index d53d7f3ba5ae7..117b2c3960820 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -18,11 +18,13 @@ package org.apache.spark.ui import java.net.{HttpURLConnection, URL} -import javax.servlet.http.HttpServletRequest +import javax.servlet.http.{HttpServletResponse, HttpServletRequest} import scala.collection.JavaConversions._ import scala.xml.Node +import org.json4s._ +import org.json4s.jackson.JsonMethods import org.openqa.selenium.htmlunit.HtmlUnitDriver import org.openqa.selenium.{By, WebDriver} import org.scalatest._ @@ -33,8 +35,9 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.LocalSparkContext._ import org.apache.spark._ import org.apache.spark.api.java.StorageLevels +import org.apache.spark.deploy.history.HistoryServerSuite import org.apache.spark.shuffle.FetchFailedException - +import org.apache.spark.status.api.v1.{JacksonMessageWriter, StageStatus} /** * Selenium tests for the Spark Web UI. 
@@ -42,6 +45,8 @@ import org.apache.spark.shuffle.FetchFailedException class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with BeforeAndAfterAll { implicit var webDriver: WebDriver = _ + implicit val formats = DefaultFormats + override def beforeAll(): Unit = { webDriver = new HtmlUnitDriver @@ -76,28 +81,42 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before val rdd = sc.parallelize(Seq(1, 2, 3)) rdd.persist(StorageLevels.DISK_ONLY).count() eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (ui.appUIAddress.stripSuffix("/") + "/storage") + goToUi(ui, "/storage") val tableRowText = findAll(cssSelector("#storage-by-rdd-table td")).map(_.text).toSeq tableRowText should contain (StorageLevels.DISK_ONLY.description) } eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (ui.appUIAddress.stripSuffix("/") + "/storage/rdd/?id=0") + goToUi(ui, "/storage/rdd/?id=0") val tableRowText = findAll(cssSelector("#rdd-storage-by-block-table td")).map(_.text).toSeq tableRowText should contain (StorageLevels.DISK_ONLY.description) } + val storageJson = getJson(ui, "storage/rdd") + storageJson.children.length should be (1) + (storageJson \ "storageLevel").extract[String] should be (StorageLevels.DISK_ONLY.description) + val rddJson = getJson(ui, "storage/rdd/0") + (rddJson \ "storageLevel").extract[String] should be (StorageLevels.DISK_ONLY.description) + rdd.unpersist() rdd.persist(StorageLevels.MEMORY_ONLY).count() eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (ui.appUIAddress.stripSuffix("/") + "/storage") + goToUi(ui, "/storage") val tableRowText = findAll(cssSelector("#storage-by-rdd-table td")).map(_.text).toSeq tableRowText should contain (StorageLevels.MEMORY_ONLY.description) } eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (ui.appUIAddress.stripSuffix("/") + "/storage/rdd/?id=0") + goToUi(ui, "/storage/rdd/?id=0") val tableRowText = findAll(cssSelector("#rdd-storage-by-block-table td")).map(_.text).toSeq tableRowText should contain (StorageLevels.MEMORY_ONLY.description) } + + val updatedStorageJson = getJson(ui, "storage/rdd") + updatedStorageJson.children.length should be (1) + (updatedStorageJson \ "storageLevel").extract[String] should be ( + StorageLevels.MEMORY_ONLY.description) + val updatedRddJson = getJson(ui, "storage/rdd/0") + (updatedRddJson \ "storageLevel").extract[String] should be ( + StorageLevels.MEMORY_ONLY.description) } } @@ -108,10 +127,13 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before sc.parallelize(1 to 10).map { x => throw new Exception()}.collect() } eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/stages") + goToUi(sc, "/stages") find(id("active")) should be(None) // Since we hide empty tables find(id("failed")).get.text should be("Failed Stages (1)") } + val stageJson = getJson(sc.ui.get, "stages") + stageJson.children.length should be (1) + (stageJson \ "status").extract[String] should be (StageStatus.FAILED.name()) // Regression test for SPARK-2105 class NotSerializable @@ -120,12 +142,15 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before sc.parallelize(1 to 10).map { x => unserializableObject}.collect() } eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/stages") + goToUi(sc, "/stages") find(id("active")) should be(None) // Since we hide empty tables // 
The failure occurs before the stage becomes active, hence we should still show only one // failed stage, not two: find(id("failed")).get.text should be("Failed Stages (1)") } + + val updatedStageJson = getJson(sc.ui.get, "stages") + updatedStageJson should be (stageJson) } } @@ -138,7 +163,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before withSpark(newSparkContext(killEnabled = true)) { sc => runSlowJob(sc) eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/stages") + goToUi(sc, "/stages") assert(hasKillLink) } } @@ -146,7 +171,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before withSpark(newSparkContext(killEnabled = false)) { sc => runSlowJob(sc) eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/stages") + goToUi(sc, "/stages") assert(!hasKillLink) } } @@ -157,7 +182,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before // If no job has been run in a job group, then "(Job Group)" should not appear in the header sc.parallelize(Seq(1, 2, 3)).count() eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs") + goToUi(sc, "/jobs") val tableHeaders = findAll(cssSelector("th")).map(_.text).toSeq tableHeaders should not contain "Job Id (Job Group)" } @@ -165,10 +190,22 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before sc.setJobGroup("my-job-group", "my-job-group-description") sc.parallelize(Seq(1, 2, 3)).count() eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs") + goToUi(sc, "/jobs") val tableHeaders = findAll(cssSelector("th")).map(_.text).toSeq tableHeaders should contain ("Job Id (Job Group)") } + + val jobJson = getJson(sc.ui.get, "jobs") + for { + job @ JObject(_) <- jobJson + JInt(jobId) <- job \ "jobId" + jobGroup = job \ "jobGroup" + } { + jobId.toInt match { + case 0 => jobGroup should be (JNothing) + case 1 => jobGroup should be (JString("my-job-group")) + } + } } } @@ -195,7 +232,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before } mappedData.count() eventually(timeout(5 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs") + goToUi(sc, "/jobs") find(cssSelector(".stage-progress-cell")).get.text should be ("2/2 (1 failed)") // Ideally, the following test would pass, but currently we overcount completed tasks // if task recomputations occur: @@ -204,6 +241,32 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before // of completed tasks may be higher: find(cssSelector(".progress-cell .progress")).get.text should be ("3/2 (1 failed)") } + val jobJson = getJson(sc.ui.get, "jobs") + (jobJson \ "numTasks").extract[Int]should be (2) + (jobJson \ "numCompletedTasks").extract[Int] should be (3) + (jobJson \ "numFailedTasks").extract[Int] should be (1) + (jobJson \ "numCompletedStages").extract[Int] should be (2) + (jobJson \ "numFailedStages").extract[Int] should be (1) + val stageJson = getJson(sc.ui.get, "stages") + + for { + stage @ JObject(_) <- stageJson + JString(status) <- stage \ "status" + JInt(stageId) <- stage \ "stageId" + JInt(attemptId) <- stage \ "attemptId" + } { + val exp = if (attemptId == 0 && stageId == 1) StageStatus.FAILED else StageStatus.COMPLETE + status should be (exp.name()) + 
} + + for { + stageId <- 0 to 1 + attemptId <- 0 to 1 + } { + val exp = if (attemptId == 0 && stageId == 1) StageStatus.FAILED else StageStatus.COMPLETE + val stageJson = getJson(sc.ui.get, s"stages/$stageId/$attemptId") + (stageJson \ "status").extract[String] should be (exp.name()) + } } } @@ -218,7 +281,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before // Start the job: rdd.countAsync() eventually(timeout(10 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs/job/?id=0") + goToUi(sc, "/jobs/job/?id=0") find(id("active")).get.text should be ("Active Stages (1)") find(id("pending")).get.text should be ("Pending Stages (2)") // Essentially, we want to check that none of the stage rows show @@ -244,7 +307,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before rdd.count() rdd.count() eventually(timeout(10 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs") + goToUi(sc, "/jobs") // The completed jobs table should have two rows. The first row will be the most recent job: val firstRow = find(cssSelector("tbody tr")).get.underlying val firstRowColumns = firstRow.findElements(By.tagName("td")) @@ -271,7 +334,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before rdd.count() rdd.count() eventually(timeout(10 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs/job/?id=1") + goToUi(sc, "/jobs/job/?id=1") find(id("pending")) should be (None) find(id("active")) should be (None) find(id("failed")) should be (None) @@ -299,7 +362,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before rdd.count() rdd.count() eventually(timeout(10 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/jobs") + goToUi(sc, "/jobs") findAll(cssSelector("tbody tr a")).foreach { link => link.text.toLowerCase should include ("count") link.text.toLowerCase should not include "unknown" @@ -321,7 +384,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before } sparkUI.attachTab(newTab) eventually(timeout(10 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/")) + goToUi(sc, "") find(cssSelector("""ul li a[href*="jobs"]""")) should not be(None) find(cssSelector("""ul li a[href*="stages"]""")) should not be(None) find(cssSelector("""ul li a[href*="storage"]""")) should not be(None) @@ -330,12 +393,12 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before } eventually(timeout(10 seconds), interval(50 milliseconds)) { // check whether new page exists - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/foo") + goToUi(sc, "/foo") find(cssSelector("b")).get.text should include ("html magic") } sparkUI.detachTab(newTab) eventually(timeout(10 seconds), interval(50 milliseconds)) { - go to (sc.ui.get.appUIAddress.stripSuffix("/")) + goToUi(sc, "") find(cssSelector("""ul li a[href*="jobs"]""")) should not be(None) find(cssSelector("""ul li a[href*="stages"]""")) should not be(None) find(cssSelector("""ul li a[href*="storage"]""")) should not be(None) @@ -344,7 +407,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before } eventually(timeout(10 seconds), interval(50 milliseconds)) { // check new page not exist - go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/foo") + goToUi(sc, "/foo") find(cssSelector("b")) should 
be(None) } } @@ -371,4 +434,163 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with Before } } } + + test("stage & job retention") { + val conf = new SparkConf() + .setMaster("local") + .setAppName("test") + .set("spark.ui.enabled", "true") + .set("spark.ui.port", "0") + .set("spark.ui.retainedStages", "3") + .set("spark.ui.retainedJobs", "2") + val sc = new SparkContext(conf) + assert(sc.ui.isDefined) + + withSpark(sc) { sc => + // run a few jobs & stages ... + (0 until 5).foreach { idx => + // NOTE: if we reverse the order, things don't really behave nicely + // we lose the stage for a job we keep, and then the job doesn't know + // about its last stage + sc.parallelize(idx to (idx + 3)).map(identity).groupBy(identity).map(identity) + .groupBy(identity).count() + sc.parallelize(idx to (idx + 3)).collect() + } + + val expJobInfo = Seq( + ("9", "collect"), + ("8", "count") + ) + + eventually(timeout(1 second), interval(50 milliseconds)) { + goToUi(sc, "/jobs") + // The completed jobs table should have two rows. The first row will be the most recent job: + find("completed-summary").get.text should be ("Completed Jobs: 10, only showing 2") + find("completed").get.text should be ("Completed Jobs (10, only showing 2)") + val rows = findAll(cssSelector("tbody tr")).toIndexedSeq.map{_.underlying} + rows.size should be (expJobInfo.size) + for { + (row, idx) <- rows.zipWithIndex + columns = row.findElements(By.tagName("td")) + id = columns(0).getText() + description = columns(1).getText() + } { + id should be (expJobInfo(idx)._1) + description should include (expJobInfo(idx)._2) + } + } + + val jobsJson = getJson(sc.ui.get, "jobs") + jobsJson.children.size should be (expJobInfo.size) + for { + (job @ JObject(_),idx) <- jobsJson.children.zipWithIndex + id = (job \ "jobId").extract[String] + name = (job \ "name").extract[String] + } { + withClue(s"idx = $idx; id = $id; name = ${name.substring(0,20)}") { + id should be (expJobInfo(idx)._1) + name should include (expJobInfo(idx)._2) + } + } + + // what about when we query for a job that did exist, but has been cleared? 
+ goToUi(sc, "/jobs/job/?id=7") + find("no-info").get.text should be ("No information to display for job 7") + + val badJob = HistoryServerSuite.getContentAndCode(jsonUrl(sc.ui.get, "jobs/7")) + badJob._1 should be (HttpServletResponse.SC_NOT_FOUND) + badJob._2 should be (None) + badJob._3 should be (Some("unknown job: 7")) + + val expStageInfo = Seq( + ("19", "collect"), + ("18", "count"), + ("17", "groupBy") + ) + + eventually(timeout(1 second), interval(50 milliseconds)) { + goToUi(sc, "/stages") + find("completed-summary").get.text should be ("Completed Stages: 20, only showing 3") + find("completed").get.text should be ("Completed Stages (20, only showing 3)") + val rows = findAll(cssSelector("tbody tr")).toIndexedSeq.map{_.underlying} + rows.size should be (3) + for { + (row, idx) <- rows.zipWithIndex + columns = row.findElements(By.tagName("td")) + id = columns(0).getText() + description = columns(1).getText() + } { + id should be (expStageInfo(idx)._1) + description should include (expStageInfo(idx)._2) + } + } + + val stagesJson = getJson(sc.ui.get, "stages") + stagesJson.children.size should be (3) + for { + (stage @ JObject(_), idx) <- stagesJson.children.zipWithIndex + id = (stage \ "stageId").extract[String] + name = (stage \ "name").extract[String] + } { + id should be (expStageInfo(idx)._1) + name should include (expStageInfo(idx)._2) + } + + // nonexistent stage + + goToUi(sc, "/stages/stage/?id=12&attempt=0") + find("no-info").get.text should be ("No information to display for Stage 12 (Attempt 0)") + val badStage = HistoryServerSuite.getContentAndCode(jsonUrl(sc.ui.get,"stages/12/0")) + badStage._1 should be (HttpServletResponse.SC_NOT_FOUND) + badStage._2 should be (None) + badStage._3 should be (Some("unknown stage: 12")) + + val badAttempt = HistoryServerSuite.getContentAndCode(jsonUrl(sc.ui.get,"stages/19/15")) + badAttempt._1 should be (HttpServletResponse.SC_NOT_FOUND) + badAttempt._2 should be (None) + badAttempt._3 should be (Some("unknown attempt for stage 19. 
Found attempts: [0]"))
+
+      val badStageAttemptList = HistoryServerSuite.getContentAndCode(
+        jsonUrl(sc.ui.get, "stages/12"))
+      badStageAttemptList._1 should be (HttpServletResponse.SC_NOT_FOUND)
+      badStageAttemptList._2 should be (None)
+      badStageAttemptList._3 should be (Some("unknown stage: 12"))
+    }
+  }
+
+  test("live UI json application list") {
+    withSpark(newSparkContext()) { sc =>
+      val appListRawJson = HistoryServerSuite.getUrl(new URL(
+        sc.ui.get.appUIAddress + "/json/v1/applications"))
+      val appListJsonAst = JsonMethods.parse(appListRawJson)
+      appListJsonAst.children.length should be (1)
+      val attempts = (appListJsonAst \ "attempts").children
+      attempts.size should be (1)
+      (attempts(0) \ "completed").extract[Boolean] should be (false)
+      parseDate(attempts(0) \ "startTime") should be (sc.startTime)
+      parseDate(attempts(0) \ "endTime") should be (-1)
+      val oneAppJsonAst = getJson(sc.ui.get, "")
+      oneAppJsonAst should be (appListJsonAst.children(0))
+    }
+  }
+
+  def goToUi(sc: SparkContext, path: String): Unit = {
+    goToUi(sc.ui.get, path)
+  }
+
+  def goToUi(ui: SparkUI, path: String): Unit = {
+    go to (ui.appUIAddress.stripSuffix("/") + path)
+  }
+
+  def parseDate(json: JValue): Long = {
+    JacksonMessageWriter.makeISODateFormat.parse(json.extract[String]).getTime
+  }
+
+  def getJson(ui: SparkUI, path: String): JValue = {
+    JsonMethods.parse(HistoryServerSuite.getUrl(jsonUrl(ui, path)))
+  }
+
+  def jsonUrl(ui: SparkUI, path: String): URL = {
+    new URL(ui.appUIAddress + "/json/v1/applications/test/" + path)
+  }
 }
diff --git a/docs/monitoring.md b/docs/monitoring.md
index 8a85928d6d44d..1e0fc150862fb 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -174,6 +174,80 @@ making it easy to identify slow tasks, data skew, etc.
 Note that the history server only displays completed Spark jobs. One way to signal the completion of a Spark job is to stop the Spark Context explicitly (`sc.stop()`), or in Python using the `with SparkContext() as sc:` to handle the Spark Context setup and tear down, and still show the job history on the UI.
 
+## REST API
+
+In addition to viewing the metrics in the UI, they are also available as JSON. This gives developers
+an easy way to create new visualizations and monitoring tools for Spark. The JSON is available for
+both running applications and the history server. The endpoints are mounted at `/json/v1`. E.g.,
+for the history server, they would typically be accessible at `http://<server-url>:18080/json/v1`, and
+for a running application, at `http://localhost:4040/json/v1`.
+
-        <td>{status.blockManagerId.host + ":" + status.blockManagerId.port}</td>
+        <td>{worker.address}</td>
         <td>
-          {Utils.bytesToString(status.memUsedByRdd(rddId))}
-          ({Utils.bytesToString(status.memRemaining)} Remaining)
+          {Utils.bytesToString(worker.memoryUsed)}
+          ({Utils.bytesToString(worker.memoryRemaining)} Remaining)
         </td>
-        <td>{Utils.bytesToString(status.diskUsedByRdd(rddId))}</td>
+        <td>{Utils.bytesToString(worker.diskUsed)}</td>
 
-        <td>{id}</td>
+        <td>{row.blockName}</td>
         <td>
-          {block.storageLevel.description}
+          {row.storageLevel}
         </td>
         <td>
-          {Utils.bytesToString(block.memSize)}
+          {Utils.bytesToString(row.memoryUsed)}
         </td>
         <td>
-          {Utils.bytesToString(block.diskSize)}
+          {Utils.bytesToString(row.diskUsed)}
         </td>
-        <td>{locations.map(l => <span>{l}<br/></span>)}</td>
+        <td>{row.executors.map(l => <span>{l}<br/></span>)}</td>
+<table class="table">
+  <tr><th>Endpoint</th><th>Meaning</th></tr>
+  <tr>
+    <td><code>/applications</code></td>
+    <td>A list of all applications</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/jobs</code></td>
+    <td>A list of all jobs for a given application</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/jobs/[job-id]</code></td>
+    <td>Details for the given job</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/stages</code></td>
+    <td>A list of all stages for a given application</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/stages/[stage-id]</code></td>
+    <td>A list of all attempts for the given stage</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/stages/[stage-id]/[stage-attempt-id]</code></td>
+    <td>Details for the given stage attempt</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/stages/[stage-id]/[stage-attempt-id]/taskSummary</code></td>
+    <td>Summary metrics of all tasks in the given stage attempt</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/stages/[stage-id]/[stage-attempt-id]/taskList</code></td>
+    <td>A list of all tasks for the given stage attempt</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/executors</code></td>
+    <td>A list of all executors for the given application</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/storage/rdd</code></td>
+    <td>A list of stored RDDs for the given application</td>
+  </tr>
+  <tr>
+    <td><code>/applications/[app-id]/storage/rdd/[rdd-id]</code></td>
+    <td>Details for the storage status of a given RDD</td>
+  </tr>
+</table>
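As a rough illustration of consuming these endpoints programmatically, here is a minimal sketch (not part of the patch) that fetches and parses the application list with the same java.net/json4s plumbing the test suites above use; the port and the `id`/`name` fields are assumptions based on the examples in this document:

    import java.net.URL
    import org.json4s._
    import org.json4s.jackson.JsonMethods

    object JsonApiExample {
      // json4s needs an implicit Formats in scope for extract[...]
      // (assumption: DefaultFormats suffices for these flat fields).
      implicit val formats = DefaultFormats

      def main(args: Array[String]): Unit = {
        // Fetch the application list from a running app's UI
        // (the default port 4040 is an assumption for a local run).
        val raw = scala.io.Source.fromURL(
          new URL("http://localhost:4040/json/v1/applications")).mkString
        // Parse into a json4s AST and pull out individual fields,
        // the same way UISeleniumSuite above inspects the responses.
        val apps = JsonMethods.parse(raw)
        for (app <- apps.children) {
          val id = (app \ "id").extract[String]
          val name = (app \ "name").extract[String]
          println(s"$id -> $name")
        }
      }
    }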
+
+When running on Yarn, each application has multiple attempts, so `[app-id]` is actually
+`[app-id]/[attempt-id]` in all cases.
+
+These endpoints have been strongly versioned to make it easier to develop applications on top.
+In particular, Spark guarantees:
+
+* Endpoints will never be removed within a given version
+* Individual fields will never be removed for any given endpoint
+* New endpoints may be added
+* New fields may be added to existing endpoints
+* New versions of the API may be added in the future at a separate endpoint (e.g., `json/v2`). New versions are *not* required to be backwards compatible.
+* API versions may be dropped, but only after co-existing with a new API version for at least one minor release
+
+Note that even when examining the UI of a running application, the `applications/[app-id]` portion is
+still required, though there is only one application available. E.g., to see the list of jobs for the
+running app, you would go to `http://localhost:4040/json/v1/applications/[app-id]/jobs`. This is to
+keep the paths consistent in both modes.
+
 # Metrics
 
 Spark has a configurable metrics system based on the
diff --git a/pom.xml b/pom.xml
index 12ad7c5be4e40..94cf28332cf47 100644
--- a/pom.xml
+++ b/pom.xml
@@ -623,6 +623,18 @@
       </dependency>
+      <dependency>
+        <groupId>com.sun.jersey</groupId>
+        <artifactId>jersey-server</artifactId>
+        <version>1.9</version>
+        <scope>${hadoop.deps.scope}</scope>
+      </dependency>
+      <dependency>
+        <groupId>com.sun.jersey</groupId>
+        <artifactId>jersey-core</artifactId>
+        <version>1.9</version>
+        <scope>${hadoop.deps.scope}</scope>
+      </dependency>
       <dependency>
         <groupId>org.scala-lang</groupId>
         <artifactId>scala-compiler</artifactId>

From f5ff4a84c4c75143086aae7d38730156bee35933 Mon Sep 17 00:00:00 2001
From: Burak Yavuz
Date: Fri, 8 May 2015 11:14:39 -0700
Subject: [PATCH 46/48] [SPARK-7383] [ML] Feature Parity in PySpark for
 ml.features

Implemented python wrappers for Scala functions that don't exist in `ml.features`

Author: Burak Yavuz

Closes #5991 from brkyvz/ml-feat-PR and squashes the following commits:

adcca55 [Burak Yavuz] add regex tokenizer to __all__
b91cb44 [Burak Yavuz] addressed comments
bd39fd2 [Burak Yavuz] remove addition
b82bd7c [Burak Yavuz] Parity in PySpark for ml.features
---
 .../ml/feature/PolynomialExpansion.scala      |   2 +-
 .../apache/spark/ml/feature/Tokenizer.scala   |   2 +-
 python/pyspark/ml/feature.py                  | 797 +++++++++++++++++-
 .../ml/param/_shared_params_code_gen.py       |   4 +-
 python/pyspark/ml/param/shared.py             |  89 ++
 5 files changed, 851 insertions(+), 43 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 63e190c8aae53..9e6177ca27e4a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.DataType
  * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an
  * expansion of a product of sums expresses it as a sum of products by using the fact that
  * multiplication distributes over addition". Take a 2-variable feature vector as an example:
- * `(x, y)`, if we want to expand it with degree 2, then we get `(x, y, x * x, x * y, y * y)`.
+ * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
*/ @AlphaComponent class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 2863b7621526e..649c217b16590 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -42,7 +42,7 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { /** * :: AlphaComponent :: - * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default) + * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default) * or using it to split the text (set matching to false). Optional parameters also allow filtering * tokens using a minimal length. * It returns an array of strings that can be empty. diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 705a368192c24..f35bc1463d51b 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -16,61 +16,72 @@ # from pyspark.rdd import ignore_unicode_prefix -from pyspark.ml.param.shared import HasInputCol, HasInputCols, HasOutputCol, HasNumFeatures +from pyspark.ml.param.shared import * from pyspark.ml.util import keyword_only -from pyspark.ml.wrapper import JavaTransformer +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer from pyspark.mllib.common import inherit_doc -__all__ = ['Tokenizer', 'HashingTF', 'VectorAssembler'] +__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'Normalizer', 'OneHotEncoder', + 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', + 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', + 'Word2Vec', 'Word2VecModel'] @inherit_doc -@ignore_unicode_prefix -class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): +class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ - A tokenizer that converts the input string to lowercase and then - splits it by white spaces. + Binarize a column of continuous features given a threshold. - >>> from pyspark.sql import Row - >>> df = sc.parallelize([Row(text="a b c")]).toDF() - >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") - >>> tokenizer.transform(df).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) - >>> # Change a parameter. - >>> tokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) - >>> # Temporarily modify a parameter. - >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) - >>> tokenizer.transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) - >>> # Must use keyword arguments to specify params. - >>> tokenizer.setParams("text") - Traceback (most recent call last): - ... - TypeError: Method setParams forces keyword arguments. 
+    >>> df = sqlContext.createDataFrame([(0.5,)], ["values"])
+    >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
+    >>> binarizer.transform(df).head().features
+    0.0
+    >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs
+    0.0
+    >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"}
+    >>> binarizer.transform(df, params).head().vector
+    1.0
     """
 
-    _java_class = "org.apache.spark.ml.feature.Tokenizer"
+    _java_class = "org.apache.spark.ml.feature.Binarizer"
+    # a placeholder to make it appear in the generated doc
+    threshold = Param(Params._dummy(), "threshold",
+                      "threshold used to binarize continuous features")
 
     @keyword_only
-    def __init__(self, inputCol=None, outputCol=None):
+    def __init__(self, threshold=0.0, inputCol=None, outputCol=None):
         """
-        __init__(self, inputCol=None, outputCol=None)
+        __init__(self, threshold=0.0, inputCol=None, outputCol=None)
         """
-        super(Tokenizer, self).__init__()
+        super(Binarizer, self).__init__()
+        self.threshold = Param(self, "threshold",
+                               "threshold used to binarize continuous features")
+        self._setDefault(threshold=0.0)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, inputCol=None, outputCol=None):
+    def setParams(self, threshold=0.0, inputCol=None, outputCol=None):
         """
-        setParams(self, inputCol="input", outputCol="output")
-        Sets params for this Tokenizer.
+        setParams(self, threshold=0.0, inputCol=None, outputCol=None)
+        Sets params for this Binarizer.
         """
         kwargs = self.setParams._input_kwargs
         return self._set(**kwargs)
 
+    def setThreshold(self, value):
+        """
+        Sets the value of :py:attr:`threshold`.
+        """
+        self.paramMap[self.threshold] = value
+        return self
+
+    def getThreshold(self):
+        """
+        Gets the value of threshold or its default value.
+        """
+        return self.getOrDefault(self.threshold)
+
 
 @inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
@@ -78,8 +89,7 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
     Maps a sequence of terms to their term frequencies using the
     hashing trick.
 
-    >>> from pyspark.sql import Row
-    >>> df = sc.parallelize([Row(words=["a", "b", "c"])]).toDF()
+    >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["words"])
     >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
     >>> hashingTF.transform(df).head().features
     SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})
@@ -112,13 +122,520 @@ def setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
         return self._set(**kwargs)
 
 
+@inherit_doc
+class IDF(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    Compute the Inverse Document Frequency (IDF) given a collection of documents.
+
+    >>> from pyspark.mllib.linalg import DenseVector
+    >>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),
+    ...
(DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"]) + >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf") + >>> idf.fit(df).transform(df).head().idf + DenseVector([0.0, 0.0]) + >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs + DenseVector([0.0, 0.0]) + >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"} + >>> idf.fit(df, params).transform(df).head().vector + DenseVector([0.2877, 0.0]) + """ + + _java_class = "org.apache.spark.ml.feature.IDF" + + # a placeholder to make it appear in the generated doc + minDocFreq = Param(Params._dummy(), "minDocFreq", + "minimum of documents in which a term should appear for filtering") + + @keyword_only + def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): + """ + __init__(self, minDocFreq=0, inputCol=None, outputCol=None) + """ + super(IDF, self).__init__() + self.minDocFreq = Param(self, "minDocFreq", + "minimum of documents in which a term should appear for filtering") + self._setDefault(minDocFreq=0) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, minDocFreq=0, inputCol=None, outputCol=None): + """ + setParams(self, minDocFreq=0, inputCol=None, outputCol=None) + Sets params for this IDF. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setMinDocFreq(self, value): + """ + Sets the value of :py:attr:`minDocFreq`. + """ + self.paramMap[self.minDocFreq] = value + return self + + def getMinDocFreq(self): + """ + Gets the value of minDocFreq or its default value. + """ + return self.getOrDefault(self.minDocFreq) + + +class IDFModel(JavaModel): + """ + Model fitted by IDF. + """ + + +@inherit_doc +class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): + """ + Normalize a vector to have unit norm using the given p-norm. + + >>> from pyspark.mllib.linalg import Vectors + >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0}) + >>> df = sqlContext.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"]) + >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features") + >>> normalizer.transform(df).head().features + DenseVector([0.6, -0.8]) + >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs + SparseVector(4, {1: 0.8, 3: 0.6}) + >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"} + >>> normalizer.transform(df, params).head().vector + DenseVector([0.4286, -0.5714]) + """ + + # a placeholder to make it appear in the generated doc + p = Param(Params._dummy(), "p", "the p norm value.") + + _java_class = "org.apache.spark.ml.feature.Normalizer" + + @keyword_only + def __init__(self, p=2.0, inputCol=None, outputCol=None): + """ + __init__(self, p=2.0, inputCol=None, outputCol=None) + """ + super(Normalizer, self).__init__() + self.p = Param(self, "p", "the p norm value.") + self._setDefault(p=2.0) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, p=2.0, inputCol=None, outputCol=None): + """ + setParams(self, p=2.0, inputCol=None, outputCol=None) + Sets params for this Normalizer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setP(self, value): + """ + Sets the value of :py:attr:`p`. + """ + self.paramMap[self.p] = value + return self + + def getP(self): + """ + Gets the value of p or its default value. 
+ """ + return self.getOrDefault(self.p) + + +@inherit_doc +class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): + """ + A one-hot encoder that maps a column of label indices to a column of binary vectors, with + at most a single one-value. By default, the binary vector has an element for each category, so + with 5 categories, an input value of 2.0 would map to an output vector of + (0.0, 0.0, 1.0, 0.0, 0.0). If includeFirst is set to false, the first category is omitted, so + the output vector for the previous example would be (0.0, 1.0, 0.0, 0.0) and an input value + of 0.0 would map to a vector of all zeros. Including the first category makes the vector columns + linearly dependent because they sum up to one. + + TODO: This method requires the use of StringIndexer first. Decouple them. + + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") + >>> model = stringIndexer.fit(stringIndDf) + >>> td = model.transform(stringIndDf) + >>> encoder = OneHotEncoder(includeFirst=False, inputCol="indexed", outputCol="features") + >>> encoder.transform(td).head().features + SparseVector(2, {}) + >>> encoder.setParams(outputCol="freqs").transform(td).head().freqs + SparseVector(2, {}) + >>> params = {encoder.includeFirst: True, encoder.outputCol: "test"} + >>> encoder.transform(td, params).head().test + SparseVector(3, {0: 1.0}) + """ + + _java_class = "org.apache.spark.ml.feature.OneHotEncoder" + + # a placeholder to make it appear in the generated doc + includeFirst = Param(Params._dummy(), "includeFirst", "include first category") + + @keyword_only + def __init__(self, includeFirst=True, inputCol=None, outputCol=None): + """ + __init__(self, includeFirst=True, inputCol=None, outputCol=None) + """ + super(OneHotEncoder, self).__init__() + self.includeFirst = Param(self, "includeFirst", "include first category") + self._setDefault(includeFirst=True) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, includeFirst=True, inputCol=None, outputCol=None): + """ + setParams(self, includeFirst=True, inputCol=None, outputCol=None) + Sets params for this OneHotEncoder. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setIncludeFirst(self, value): + """ + Sets the value of :py:attr:`includeFirst`. + """ + self.paramMap[self.includeFirst] = value + return self + + def getIncludeFirst(self): + """ + Gets the value of includeFirst or its default value. + """ + return self.getOrDefault(self.includeFirst) + + +@inherit_doc +class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): + """ + Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion, + which is available at `http://en.wikipedia.org/wiki/Polynomial_expansion`, "In mathematics, an + expansion of a product of sums expresses it as a sum of products by using the fact that + multiplication distributes over addition". Take a 2-variable feature vector as an example: + `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`. 
+ + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"]) + >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded") + >>> px.transform(df).head().expanded + DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) + >>> px.setParams(outputCol="test").transform(df).head().test + DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) + """ + + _java_class = "org.apache.spark.ml.feature.PolynomialExpansion" + + # a placeholder to make it appear in the generated doc + degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)") + + @keyword_only + def __init__(self, degree=2, inputCol=None, outputCol=None): + """ + __init__(self, degree=2, inputCol=None, outputCol=None) + """ + super(PolynomialExpansion, self).__init__() + self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)") + self._setDefault(degree=2) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, degree=2, inputCol=None, outputCol=None): + """ + setParams(self, degree=2, inputCol=None, outputCol=None) + Sets params for this PolynomialExpansion. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setDegree(self, value): + """ + Sets the value of :py:attr:`degree`. + """ + self.paramMap[self.degree] = value + return self + + def getDegree(self): + """ + Gets the value of degree or its default value. + """ + return self.getOrDefault(self.degree) + + +@inherit_doc +@ignore_unicode_prefix +class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): + """ + A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default) + or using it to split the text (set matching to false). Optional parameters also allow filtering + tokens using a minimal length. + It returns an array of strings that can be empty. + + >>> df = sqlContext.createDataFrame([("a b c",)], ["text"]) + >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words") + >>> reTokenizer.transform(df).head() + Row(text=u'a b c', words=[u'a', u'b', u'c']) + >>> # Change a parameter. + >>> reTokenizer.setParams(outputCol="tokens").transform(df).head() + Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + >>> # Temporarily modify a parameter. + >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head() + Row(text=u'a b c', words=[u'a', u'b', u'c']) + >>> reTokenizer.transform(df).head() + Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + >>> # Must use keyword arguments to specify params. + >>> reTokenizer.setParams("text") + Traceback (most recent call last): + ... + TypeError: Method setParams forces keyword arguments. 
+ """ + + _java_class = "org.apache.spark.ml.feature.RegexTokenizer" + # a placeholder to make it appear in the generated doc + minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)") + gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens") + pattern = Param(Params._dummy(), "pattern", "regex pattern used for tokenizing") + + @keyword_only + def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", + inputCol=None, outputCol=None): + """ + __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", + inputCol=None, outputCol=None) + """ + super(RegexTokenizer, self).__init__() + self.minTokenLength = Param(self, "minLength", "minimum token length (>= 0)") + self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens") + self.pattern = Param(self, "pattern", "regex pattern used for tokenizing") + self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", + inputCol=None, outputCol=None): + """ + setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", + inputCol="input", outputCol="output") + Sets params for this RegexTokenizer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setMinTokenLength(self, value): + """ + Sets the value of :py:attr:`minTokenLength`. + """ + self.paramMap[self.minTokenLength] = value + return self + + def getMinTokenLength(self): + """ + Gets the value of minTokenLength or its default value. + """ + return self.getOrDefault(self.minTokenLength) + + def setGaps(self, value): + """ + Sets the value of :py:attr:`gaps`. + """ + self.paramMap[self.gaps] = value + return self + + def getGaps(self): + """ + Gets the value of gaps or its default value. + """ + return self.getOrDefault(self.gaps) + + def setPattern(self, value): + """ + Sets the value of :py:attr:`pattern`. + """ + self.paramMap[self.pattern] = value + return self + + def getPattern(self): + """ + Gets the value of pattern or its default value. + """ + return self.getOrDefault(self.pattern) + + +@inherit_doc +class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): + """ + Standardizes features by removing the mean and scaling to unit variance using column summary + statistics on the samples in the training set. 
+ + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) + >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled") + >>> model = standardScaler.fit(df) + >>> model.transform(df).collect()[1].scaled + DenseVector([1.4142]) + """ + + _java_class = "org.apache.spark.ml.feature.StandardScaler" + + # a placeholder to make it appear in the generated doc + withMean = Param(Params._dummy(), "withMean", "Center data with mean") + withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation") + + @keyword_only + def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): + """ + __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None) + """ + super(StandardScaler, self).__init__() + self.withMean = Param(self, "withMean", "Center data with mean") + self.withStd = Param(self, "withStd", "Scale to unit standard deviation") + self._setDefault(withMean=False, withStd=True) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None): + """ + setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None) + Sets params for this StandardScaler. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setWithMean(self, value): + """ + Sets the value of :py:attr:`withMean`. + """ + self.paramMap[self.withMean] = value + return self + + def getWithMean(self): + """ + Gets the value of withMean or its default value. + """ + return self.getOrDefault(self.withMean) + + def setWithStd(self, value): + """ + Sets the value of :py:attr:`withStd`. + """ + self.paramMap[self.withStd] = value + return self + + def getWithStd(self): + """ + Gets the value of withStd or its default value. + """ + return self.getOrDefault(self.withStd) + + +class StandardScalerModel(JavaModel): + """ + Model fitted by StandardScaler. + """ + + +@inherit_doc +class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): + """ + A label indexer that maps a string column of labels to an ML column of label indices. + If the input column is numeric, we cast it to string and index the string values. + The indices are in [0, numLabels), ordered by label frequencies. + So the most frequent label gets index 0. + + >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") + >>> model = stringIndexer.fit(stringIndDf) + >>> td = model.transform(stringIndDf) + >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), + ... key=lambda x: x[0]) + [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] + """ + + _java_class = "org.apache.spark.ml.feature.StringIndexer" + + @keyword_only + def __init__(self, inputCol=None, outputCol=None): + """ + __init__(self, inputCol=None, outputCol=None) + """ + super(StringIndexer, self).__init__() + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None): + """ + setParams(self, inputCol=None, outputCol=None) + Sets params for this StringIndexer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + +class StringIndexerModel(JavaModel): + """ + Model fitted by StringIndexer. 
+ """ + + +@inherit_doc +@ignore_unicode_prefix +class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): + """ + A tokenizer that converts the input string to lowercase and then + splits it by white spaces. + + >>> df = sqlContext.createDataFrame([("a b c",)], ["text"]) + >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") + >>> tokenizer.transform(df).head() + Row(text=u'a b c', words=[u'a', u'b', u'c']) + >>> # Change a parameter. + >>> tokenizer.setParams(outputCol="tokens").transform(df).head() + Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + >>> # Temporarily modify a parameter. + >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head() + Row(text=u'a b c', words=[u'a', u'b', u'c']) + >>> tokenizer.transform(df).head() + Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + >>> # Must use keyword arguments to specify params. + >>> tokenizer.setParams("text") + Traceback (most recent call last): + ... + TypeError: Method setParams forces keyword arguments. + """ + + _java_class = "org.apache.spark.ml.feature.Tokenizer" + + @keyword_only + def __init__(self, inputCol=None, outputCol=None): + """ + __init__(self, inputCol=None, outputCol=None) + """ + super(Tokenizer, self).__init__() + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None): + """ + setParams(self, inputCol="input", outputCol="output") + Sets params for this Tokenizer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + @inherit_doc class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol): """ A feature transformer that merges multiple columns into a vector column. - >>> from pyspark.sql import Row - >>> df = sc.parallelize([Row(a=1, b=0, c=3)]).toDF() + >>> df = sqlContext.createDataFrame([(1, 0, 3)], ["a", "b", "c"]) >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features") >>> vecAssembler.transform(df).head().features DenseVector([1.0, 0.0, 3.0]) @@ -137,7 +654,6 @@ def __init__(self, inputCols=None, outputCol=None): __init__(self, inputCols=None, outputCol=None) """ super(VectorAssembler, self).__init__() - self._setDefault() kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -151,10 +667,208 @@ def setParams(self, inputCols=None, outputCol=None): return self._set(**kwargs) +@inherit_doc +class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): + """ + Class for indexing categorical feature columns in a dataset of [[Vector]]. + + This has 2 usage modes: + - Automatically identify categorical features (default behavior) + - This helps process a dataset of unknown vectors into a dataset with some continuous + features and some categorical features. The choice between continuous and categorical + is based upon a maxCategories parameter. + - Set maxCategories to the maximum number of categorical any categorical feature should + have. + - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}. + If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1}, + and feature 1 will be declared continuous. + - Index all features, if all features are categorical + - If maxCategories is set to be very large, then this will build an index of unique + values for all features. + - Warning: This can cause problems if features are continuous since this will collect ALL + unique values to the driver. + - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}. 
+ If maxCategories >= 3, then both features will be declared categorical. + + This returns a model which can transform categorical features to use 0-based indices. + + Index stability: + - This is not guaranteed to choose the same category index across multiple runs. + - If a categorical feature includes value 0, then this is guaranteed to map value 0 to + index 0. This maintains vector sparsity. + - More stability may be added in the future. + + TODO: Future extensions: The following functionality is planned for the future: + - Preserve metadata in transform; if a feature's metadata is already present, + do not recompute. + - Specify certain features to not index, either via a parameter or via existing metadata. + - Add warning if a categorical feature has only 1 category. + - Add option for allowing unknown categories. + + >>> from pyspark.mllib.linalg import Vectors + >>> df = sqlContext.createDataFrame([(Vectors.dense([-1.0, 0.0]),), + ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"]) + >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed") + >>> model = indexer.fit(df) + >>> model.transform(df).head().indexed + DenseVector([1.0, 0.0]) + >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test + DenseVector([0.0, 1.0]) + >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"} + >>> model2 = indexer.fit(df, params) + >>> model2.transform(df).head().vector + DenseVector([1.0, 0.0]) + """ + + _java_class = "org.apache.spark.ml.feature.VectorIndexer" + # a placeholder to make it appear in the generated doc + maxCategories = Param(Params._dummy(), "maxCategories", + "Threshold for the number of values a categorical feature can take " + + "(>= 2). If a feature is found to have > maxCategories values, then " + + "it is declared continuous.") + + @keyword_only + def __init__(self, maxCategories=20, inputCol=None, outputCol=None): + """ + __init__(self, maxCategories=20, inputCol=None, outputCol=None) + """ + super(VectorIndexer, self).__init__() + self.maxCategories = Param(self, "maxCategories", + "Threshold for the number of values a categorical feature " + + "can take (>= 2). If a feature is found to have " + + "> maxCategories values, then it is declared continuous.") + self._setDefault(maxCategories=20) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, maxCategories=20, inputCol=None, outputCol=None): + """ + setParams(self, maxCategories=20, inputCol=None, outputCol=None) + Sets params for this VectorIndexer. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setMaxCategories(self, value): + """ + Sets the value of :py:attr:`maxCategories`. + """ + self.paramMap[self.maxCategories] = value + return self + + def getMaxCategories(self): + """ + Gets the value of maxCategories or its default value. + """ + return self.getOrDefault(self.maxCategories) + + +@inherit_doc +@ignore_unicode_prefix +class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol): + """ + Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further + natural language processing or machine learning process. 
+ + >>> sent = ("a b " * 100 + "a c " * 10).split(" ") + >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"]) + >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc) + >>> model.transform(doc).head().model + DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276]) + """ + + _java_class = "org.apache.spark.ml.feature.Word2Vec" + # a placeholder to make it appear in the generated doc + vectorSize = Param(Params._dummy(), "vectorSize", + "the dimension of codes after transforming from words") + numPartitions = Param(Params._dummy(), "numPartitions", + "number of partitions for sentences of words") + minCount = Param(Params._dummy(), "minCount", + "the minimum number of times a token must appear to be included in the " + + "word2vec model's vocabulary") + + @keyword_only + def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, + seed=42, inputCol=None, outputCol=None): + """ + __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, + seed=42, inputCol=None, outputCol=None) + """ + super(Word2Vec, self).__init__() + self.vectorSize = Param(self, "vectorSize", + "the dimension of codes after transforming from words") + self.numPartitions = Param(self, "numPartitions", + "number of partitions for sentences of words") + self.minCount = Param(self, "minCount", + "the minimum number of times a token must appear to be included " + + "in the word2vec model's vocabulary") + self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, + seed=42) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, + seed=42, inputCol=None, outputCol=None): + """ + setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=42, + inputCol=None, outputCol=None) + Sets params for this Word2Vec. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setVectorSize(self, value): + """ + Sets the value of :py:attr:`vectorSize`. + """ + self.paramMap[self.vectorSize] = value + return self + + def getVectorSize(self): + """ + Gets the value of vectorSize or its default value. + """ + return self.getOrDefault(self.vectorSize) + + def setNumPartitions(self, value): + """ + Sets the value of :py:attr:`numPartitions`. + """ + self.paramMap[self.numPartitions] = value + return self + + def getNumPartitions(self): + """ + Gets the value of numPartitions or its default value. + """ + return self.getOrDefault(self.numPartitions) + + def setMinCount(self, value): + """ + Sets the value of :py:attr:`minCount`. + """ + self.paramMap[self.minCount] = value + return self + + def getMinCount(self): + """ + Gets the value of minCount or its default value. + """ + return self.getOrDefault(self.minCount) + + +class Word2VecModel(JavaModel): + """ + Model fitted by Word2Vec. 
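The setter/getter pairs above all follow the same pattern: setters write into the instance's paramMap and return self for chaining, while getters fall back to the declared defaults. A hedged usage sketch, again assuming the doctests' `sqlContext`:

    from pyspark.ml.feature import Word2Vec

    doc = sqlContext.createDataFrame([(("a b " * 10).split(" "),)], ["sentence"])
    w2v = Word2Vec(inputCol="sentence", outputCol="vec")
    # Chained setters; each records the value in the paramMap.
    w2v.setVectorSize(10).setNumPartitions(2).setMinCount(1)
    w2v.getVectorSize()   # 10, explicitly set
    w2v.getMaxIter()      # 1, the default inherited from HasMaxIter
    model = w2v.fit(doc)
    model.transform(doc).head().vec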
+ """ + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext - from pyspark.sql import SQLContext + from pyspark.sql import Row, SQLContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: @@ -162,8 +876,11 @@ def setParams(self, inputCols=None, outputCol=None): sqlContext = SQLContext(sc) globs['sc'] = sc globs['sqlContext'] = sqlContext - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) + testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"), + Row(id=2, label="c"), Row(id=3, label="a"), + Row(id=4, label="a"), Row(id=5, label="c")], 2) + globs['stringIndDf'] = sqlContext.createDataFrame(testData) + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) sc.stop() if failure_count: exit(-1) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index c1c8e921dda87..ee901f25847bc 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -97,7 +97,9 @@ def get$Name(self): ("inputCol", "input column name", None), ("inputCols", "input column names", None), ("outputCol", "output column name", None), - ("numFeatures", "number of features", None)] + ("seed", "random seed", None), + ("tol", "the convergence tolerance for iterative algorithms", None), + ("stepSize", "Step size to be used for each iteration of optimization.", None)] code = [] for name, doc, defaultValueStr in shared: code.append(_gen_param_code(name, doc, defaultValueStr)) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index aaf80f00085bf..5e7529c1dcf1e 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -308,3 +308,92 @@ def getNumFeatures(self): Gets the value of numFeatures or its default value. """ return self.getOrDefault(self.numFeatures) + + +class HasSeed(Params): + """ + Mixin for param seed: random seed. + """ + + # a placeholder to make it appear in the generated doc + seed = Param(Params._dummy(), "seed", "random seed") + + def __init__(self): + super(HasSeed, self).__init__() + #: param for random seed + self.seed = Param(self, "seed", "random seed") + if None is not None: + self._setDefault(seed=None) + + def setSeed(self, value): + """ + Sets the value of :py:attr:`seed`. + """ + self.paramMap[self.seed] = value + return self + + def getSeed(self): + """ + Gets the value of seed or its default value. + """ + return self.getOrDefault(self.seed) + + +class HasTol(Params): + """ + Mixin for param tol: the convergence tolerance for iterative algorithms. + """ + + # a placeholder to make it appear in the generated doc + tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms") + + def __init__(self): + super(HasTol, self).__init__() + #: param for the convergence tolerance for iterative algorithms + self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms") + if None is not None: + self._setDefault(tol=None) + + def setTol(self, value): + """ + Sets the value of :py:attr:`tol`. + """ + self.paramMap[self.tol] = value + return self + + def getTol(self): + """ + Gets the value of tol or its default value. + """ + return self.getOrDefault(self.tol) + + +class HasStepSize(Params): + """ + Mixin for param stepSize: Step size to be used for each iteration of optimization.. 
+ """ + + # a placeholder to make it appear in the generated doc + stepSize = Param(Params._dummy(), "stepSize", + "Step size to be used for each iteration of optimization.") + + def __init__(self): + super(HasStepSize, self).__init__() + #: param for Step size to be used for each iteration of optimization. + self.stepSize = Param(self, "stepSize", + "Step size to be used for each iteration of optimization.") + if None is not None: + self._setDefault(stepSize=None) + + def setStepSize(self, value): + """ + Sets the value of :py:attr:`stepSize`. + """ + self.paramMap[self.stepSize] = value + return self + + def getStepSize(self): + """ + Gets the value of stepSize or its default value. + """ + return self.getOrDefault(self.stepSize) From 65afd3ce8b8a0b00f4ea8294eac14b72e964872d Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 8 May 2015 11:16:04 -0700 Subject: [PATCH 47/48] [SPARK-7474] [MLLIB] update ParamGridBuilder doctest Multiline commands are properly handled in this PR. oefirouz ![screen shot 2015-05-07 at 10 53 25 pm](https://cloud.githubusercontent.com/assets/829644/7531290/02ad2fd4-f50c-11e4-8c04-e58d1a61ad69.png) Author: Xiangrui Meng Closes #6001 from mengxr/SPARK-7474 and squashes the following commits: b94b11d [Xiangrui Meng] update ParamGridBuilder doctest --- python/pyspark/ml/tuning.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 1e04c37fca53e..28e3727f2c064 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -27,24 +27,22 @@ class ParamGridBuilder(object): - """ + r""" Builder for a param grid used in grid search-based model selection. - >>> from classification import LogisticRegression + >>> from pyspark.ml.classification import LogisticRegression >>> lr = LogisticRegression() - >>> output = ParamGridBuilder().baseOn({lr.labelCol: 'l'}) \ - .baseOn([lr.predictionCol, 'p']) \ - .addGrid(lr.regParam, [1.0, 2.0, 3.0]) \ - .addGrid(lr.maxIter, [1, 5]) \ - .addGrid(lr.featuresCol, ['f']) \ - .build() - >>> expected = [ \ -{lr.regParam: 1.0, lr.featuresCol: 'f', lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, \ -{lr.regParam: 2.0, lr.featuresCol: 'f', lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, \ -{lr.regParam: 3.0, lr.featuresCol: 'f', lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, \ -{lr.regParam: 1.0, lr.featuresCol: 'f', lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}, \ -{lr.regParam: 2.0, lr.featuresCol: 'f', lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}, \ -{lr.regParam: 3.0, lr.featuresCol: 'f', lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}] + >>> output = ParamGridBuilder() \ + ... .baseOn({lr.labelCol: 'l'}) \ + ... .baseOn([lr.predictionCol, 'p']) \ + ... .addGrid(lr.regParam, [1.0, 2.0]) \ + ... .addGrid(lr.maxIter, [1, 5]) \ + ... .build() + >>> expected = [ + ... {lr.regParam: 1.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, + ... {lr.regParam: 2.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, + ... {lr.regParam: 1.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}, + ... 
{lr.regParam: 2.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}] >>> len(output) == len(expected) True >>> all([m in expected for m in output]) From 008a60dd371e76819d8e08ab638cac7b3a48c9fc Mon Sep 17 00:00:00 2001 From: hqzizania Date: Fri, 8 May 2015 11:25:04 -0700 Subject: [PATCH 48/48] [SPARK-6824] Fill the docs for DataFrame API in SparkR This patch also removes the RDD docs from being built as a part of roxygen just by the method to delete " ' '" of " \#' ". Author: hqzizania Author: qhuang Closes #5969 from hqzizania/R1 and squashes the following commits: 6d27696 [qhuang] fixes in NAMESPACE eb4b095 [qhuang] remove more docs 6394579 [qhuang] remove RDD docs in generics.R 6813860 [hqzizania] Fill the docs for DataFrame API in SparkR 857220f [hqzizania] remove the pairRDD docs from being built as a part of roxygen c045d64 [hqzizania] remove the RDD docs from being built as a part of roxygen --- R/pkg/DESCRIPTION | 2 +- R/pkg/NAMESPACE | 4 - R/pkg/R/DataFrame.R | 95 +-- R/pkg/R/RDD.R | 1546 +++++++++++++++++++++--------------------- R/pkg/R/SQLContext.R | 64 +- R/pkg/R/broadcast.R | 64 +- R/pkg/R/context.R | 240 +++---- R/pkg/R/generics.R | 318 ++++----- R/pkg/R/pairRDD.R | 886 ++++++++++++------------ 9 files changed, 1610 insertions(+), 1609 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 1c1779a763c7e..efc85bbc4b316 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -15,11 +15,11 @@ Suggests: Description: R frontend for Spark License: Apache License (== 2.0) Collate: + 'schema.R' 'generics.R' 'jobj.R' 'RDD.R' 'pairRDD.R' - 'schema.R' 'column.R' 'group.R' 'DataFrame.R' diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 3fb92be0940b7..7611f479a628b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -26,7 +26,6 @@ exportMethods("cache", "intersect", "isLocal", "join", - "length", "limit", "orderBy", "names", @@ -101,9 +100,6 @@ export("cacheTable", "tables", "uncacheTable") -export("sparkRSQL.init", - "sparkRHive.init") - export("structField", "structField.jobj", "structField.character", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 47d92f141cc7d..354642e7bc307 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -45,6 +45,9 @@ setMethod("initialize", "DataFrame", function(.Object, sdf, isCached) { #' @rdname DataFrame #' @export +#' +#' @param sdf A Java object reference to the backing Scala DataFrame +#' @param isCached TRUE if the dataFrame is cached dataFrame <- function(sdf, isCached = FALSE) { new("DataFrame", sdf, isCached) } @@ -244,7 +247,7 @@ setMethod("columns", }) #' @rdname columns -#' @export +#' @aliases names,DataFrame,function-method setMethod("names", signature(x = "DataFrame"), function(x) { @@ -399,23 +402,23 @@ setMethod("repartition", dataFrame(sdf) }) -#' toJSON -#' -#' Convert the rows of a DataFrame into JSON objects and return an RDD where -#' each element contains a JSON string. -#' -#' @param x A SparkSQL DataFrame -#' @return A StringRRDD of JSON objects -#' @rdname tojson -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' sqlCtx <- sparkRSQL.init(sc) -#' path <- "path/to/file.json" -#' df <- jsonFile(sqlCtx, path) -#' newRDD <- toJSON(df) -#'} +# toJSON +# +# Convert the rows of a DataFrame into JSON objects and return an RDD where +# each element contains a JSON string. 
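For comparison, the SparkR DataFrame methods documented in this patch map directly onto the PySpark DataFrame API; a brief sketch, assuming a live `sqlContext` (the data is illustrative):

    df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["age", "name"])
    df.columns                        # ['age', 'name']
    df.limit(1).collect()             # at most one row
    df.take(1)                        # first row(s) as a local list
    df.filter(df.age > 1).collect()   # where() is an alias for filter()
    df.toJSON().collect()             # RDD of JSON strings, one per row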
+# +#@param x A SparkSQL DataFrame +# @return A StringRRDD of JSON objects +# @rdname tojson +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# sqlCtx <- sparkRSQL.init(sc) +# path <- "path/to/file.json" +# df <- jsonFile(sqlCtx, path) +# newRDD <- toJSON(df) +#} setMethod("toJSON", signature(x = "DataFrame"), function(x) { @@ -578,8 +581,8 @@ setMethod("limit", dataFrame(res) }) -# Take the first NUM rows of a DataFrame and return a the results as a data.frame - +#' Take the first NUM rows of a DataFrame and return a the results as a data.frame +#' #' @rdname take #' @export #' @examples @@ -644,22 +647,22 @@ setMethod("first", take(x, 1) }) -#' toRDD() -#' -#' Converts a Spark DataFrame to an RDD while preserving column names. -#' -#' @param x A Spark DataFrame -#' -#' @rdname DataFrame -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' sqlCtx <- sparkRSQL.init(sc) -#' path <- "path/to/file.json" -#' df <- jsonFile(sqlCtx, path) -#' rdd <- toRDD(df) -#' } +# toRDD() +# +# Converts a Spark DataFrame to an RDD while preserving column names. +# +# @param x A Spark DataFrame +# +# @rdname DataFrame +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# sqlCtx <- sparkRSQL.init(sc) +# path <- "path/to/file.json" +# df <- jsonFile(sqlCtx, path) +# rdd <- toRDD(df) +# } setMethod("toRDD", signature(x = "DataFrame"), function(x) { @@ -706,6 +709,7 @@ setMethod("groupBy", #' #' Compute aggregates by specifying a list of columns #' +#' @param x a DataFrame #' @rdname DataFrame #' @export setMethod("agg", @@ -721,7 +725,7 @@ setMethod("agg", # the requested map function. # ################################################################################### -#' @rdname lapply +# @rdname lapply setMethod("lapply", signature(X = "DataFrame", FUN = "function"), function(X, FUN) { @@ -729,14 +733,14 @@ setMethod("lapply", lapply(rdd, FUN) }) -#' @rdname lapply +# @rdname lapply setMethod("map", signature(X = "DataFrame", FUN = "function"), function(X, FUN) { lapply(X, FUN) }) -#' @rdname flatMap +# @rdname flatMap setMethod("flatMap", signature(X = "DataFrame", FUN = "function"), function(X, FUN) { @@ -744,7 +748,7 @@ setMethod("flatMap", flatMap(rdd, FUN) }) -#' @rdname lapplyPartition +# @rdname lapplyPartition setMethod("lapplyPartition", signature(X = "DataFrame", FUN = "function"), function(X, FUN) { @@ -752,14 +756,14 @@ setMethod("lapplyPartition", lapplyPartition(rdd, FUN) }) -#' @rdname lapplyPartition +# @rdname lapplyPartition setMethod("mapPartitions", signature(X = "DataFrame", FUN = "function"), function(X, FUN) { lapplyPartition(X, FUN) }) -#' @rdname foreach +# @rdname foreach setMethod("foreach", signature(x = "DataFrame", func = "function"), function(x, func) { @@ -767,7 +771,7 @@ setMethod("foreach", foreach(rdd, func) }) -#' @rdname foreach +# @rdname foreach setMethod("foreachPartition", signature(x = "DataFrame", func = "function"), function(x, func) { @@ -788,6 +792,7 @@ setMethod("$", signature(x = "DataFrame"), getColumn(x, name) }) +#' @rdname select setMethod("$<-", signature(x = "DataFrame"), function(x, name, value) { stopifnot(class(value) == "Column" || is.null(value)) @@ -1009,7 +1014,7 @@ setMethod("sortDF", }) #' @rdname sortDF -#' @export +#' @aliases orderBy,DataFrame,function-method setMethod("orderBy", signature(x = "DataFrame", col = "characterOrColumn"), function(x, col) { @@ -1046,7 +1051,7 @@ setMethod("filter", }) #' @rdname filter -#' @export +#' @aliases where,DataFrame,function-method setMethod("where", signature(x = 
"DataFrame", condition = "characterOrColumn"), function(x, condition) { diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index d1018c2361670..73999a6737032 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -19,16 +19,16 @@ setOldClass("jobj") -#' @title S4 class that represents an RDD -#' @description RDD can be created using functions like -#' \code{parallelize}, \code{textFile} etc. -#' @rdname RDD -#' @seealso parallelize, textFile -#' -#' @slot env An R environment that stores bookkeeping states of the RDD -#' @slot jrdd Java object reference to the backing JavaRDD -#' to an RDD -#' @export +# @title S4 class that represents an RDD +# @description RDD can be created using functions like +# \code{parallelize}, \code{textFile} etc. +# @rdname RDD +# @seealso parallelize, textFile +# +# @slot env An R environment that stores bookkeeping states of the RDD +# @slot jrdd Java object reference to the backing JavaRDD +# to an RDD +# @export setClass("RDD", slots = list(env = "environment", jrdd = "jobj")) @@ -108,14 +108,14 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) .Object }) -#' @rdname RDD -#' @export -#' -#' @param jrdd Java object reference to the backing JavaRDD -#' @param serializedMode Use "byte" if the RDD stores data serialized in R, "string" if the RDD -#' stores strings, and "row" if the RDD stores the rows of a DataFrame -#' @param isCached TRUE if the RDD is cached -#' @param isCheckpointed TRUE if the RDD has been checkpointed +# @rdname RDD +# @export +# +# @param jrdd Java object reference to the backing JavaRDD +# @param serializedMode Use "byte" if the RDD stores data serialized in R, "string" if the RDD +# stores strings, and "row" if the RDD stores the rows of a DataFrame +# @param isCached TRUE if the RDD is cached +# @param isCheckpointed TRUE if the RDD has been checkpointed RDD <- function(jrdd, serializedMode = "byte", isCached = FALSE, isCheckpointed = FALSE) { new("RDD", jrdd, serializedMode, isCached, isCheckpointed) @@ -200,19 +200,19 @@ setValidity("RDD", ############ Actions and Transformations ############ -#' Persist an RDD -#' -#' Persist this RDD with the default storage level (MEMORY_ONLY). -#' -#' @param x The RDD to cache -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 2L) -#' cache(rdd) -#'} -#' @rdname cache-methods -#' @aliases cache,RDD-method +# Persist an RDD +# +# Persist this RDD with the default storage level (MEMORY_ONLY). +# +# @param x The RDD to cache +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 2L) +# cache(rdd) +#} +# @rdname cache-methods +# @aliases cache,RDD-method setMethod("cache", signature(x = "RDD"), function(x) { @@ -221,22 +221,22 @@ setMethod("cache", x }) -#' Persist an RDD -#' -#' Persist this RDD with the specified storage level. For details of the -#' supported storage levels, refer to -#' http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence. -#' -#' @param x The RDD to persist -#' @param newLevel The new storage level to be assigned -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 2L) -#' persist(rdd, "MEMORY_AND_DISK") -#'} -#' @rdname persist -#' @aliases persist,RDD-method +# Persist an RDD +# +# Persist this RDD with the specified storage level. For details of the +# supported storage levels, refer to +# http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence. 
+# +# @param x The RDD to persist +# @param newLevel The new storage level to be assigned +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 2L) +# persist(rdd, "MEMORY_AND_DISK") +#} +# @rdname persist +# @aliases persist,RDD-method setMethod("persist", signature(x = "RDD", newLevel = "character"), function(x, newLevel) { @@ -245,21 +245,21 @@ setMethod("persist", x }) -#' Unpersist an RDD -#' -#' Mark the RDD as non-persistent, and remove all blocks for it from memory and -#' disk. -#' -#' @param x The RDD to unpersist -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 2L) -#' cache(rdd) # rdd@@env$isCached == TRUE -#' unpersist(rdd) # rdd@@env$isCached == FALSE -#'} -#' @rdname unpersist-methods -#' @aliases unpersist,RDD-method +# Unpersist an RDD +# +# Mark the RDD as non-persistent, and remove all blocks for it from memory and +# disk. +# +# @param x The RDD to unpersist +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 2L) +# cache(rdd) # rdd@@env$isCached == TRUE +# unpersist(rdd) # rdd@@env$isCached == FALSE +#} +# @rdname unpersist-methods +# @aliases unpersist,RDD-method setMethod("unpersist", signature(x = "RDD"), function(x) { @@ -268,24 +268,24 @@ setMethod("unpersist", x }) -#' Checkpoint an RDD -#' -#' Mark this RDD for checkpointing. It will be saved to a file inside the -#' checkpoint directory set with setCheckpointDir() and all references to its -#' parent RDDs will be removed. This function must be called before any job has -#' been executed on this RDD. It is strongly recommended that this RDD is -#' persisted in memory, otherwise saving it on a file will require recomputation. -#' -#' @param x The RDD to checkpoint -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' setCheckpointDir(sc, "checkpoint") -#' rdd <- parallelize(sc, 1:10, 2L) -#' checkpoint(rdd) -#'} -#' @rdname checkpoint-methods -#' @aliases checkpoint,RDD-method +# Checkpoint an RDD +# +# Mark this RDD for checkpointing. It will be saved to a file inside the +# checkpoint directory set with setCheckpointDir() and all references to its +# parent RDDs will be removed. This function must be called before any job has +# been executed on this RDD. It is strongly recommended that this RDD is +# persisted in memory, otherwise saving it on a file will require recomputation. +# +# @param x The RDD to checkpoint +# @examples +#\dontrun{ +# sc <- sparkR.init() +# setCheckpointDir(sc, "checkpoint") +# rdd <- parallelize(sc, 1:10, 2L) +# checkpoint(rdd) +#} +# @rdname checkpoint-methods +# @aliases checkpoint,RDD-method setMethod("checkpoint", signature(x = "RDD"), function(x) { @@ -295,18 +295,18 @@ setMethod("checkpoint", x }) -#' Gets the number of partitions of an RDD -#' -#' @param x A RDD. -#' @return the number of partitions of rdd as an integer. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 2L) -#' numPartitions(rdd) # 2L -#'} -#' @rdname numPartitions -#' @aliases numPartitions,RDD-method +# Gets the number of partitions of an RDD +# +# @param x A RDD. +# @return the number of partitions of rdd as an integer. 
+# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 2L) +# numPartitions(rdd) # 2L +#} +# @rdname numPartitions +# @aliases numPartitions,RDD-method setMethod("numPartitions", signature(x = "RDD"), function(x) { @@ -315,24 +315,24 @@ setMethod("numPartitions", callJMethod(partitions, "size") }) -#' Collect elements of an RDD -#' -#' @description -#' \code{collect} returns a list that contains all of the elements in this RDD. -#' -#' @param x The RDD to collect -#' @param ... Other optional arguments to collect -#' @param flatten FALSE if the list should not flattened -#' @return a list containing elements in the RDD -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 2L) -#' collect(rdd) # list from 1 to 10 -#' collectPartition(rdd, 0L) # list from 1 to 5 -#'} -#' @rdname collect-methods -#' @aliases collect,RDD-method +# Collect elements of an RDD +# +# @description +# \code{collect} returns a list that contains all of the elements in this RDD. +# +# @param x The RDD to collect +# @param ... Other optional arguments to collect +# @param flatten FALSE if the list should not flattened +# @return a list containing elements in the RDD +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 2L) +# collect(rdd) # list from 1 to 10 +# collectPartition(rdd, 0L) # list from 1 to 5 +#} +# @rdname collect-methods +# @aliases collect,RDD-method setMethod("collect", signature(x = "RDD"), function(x, flatten = TRUE) { @@ -343,12 +343,12 @@ setMethod("collect", }) -#' @description -#' \code{collectPartition} returns a list that contains all of the elements -#' in the specified partition of the RDD. -#' @param partitionId the partition to collect (starts from 0) -#' @rdname collect-methods -#' @aliases collectPartition,integer,RDD-method +# @description +# \code{collectPartition} returns a list that contains all of the elements +# in the specified partition of the RDD. +# @param partitionId the partition to collect (starts from 0) +# @rdname collect-methods +# @aliases collectPartition,integer,RDD-method setMethod("collectPartition", signature(x = "RDD", partitionId = "integer"), function(x, partitionId) { @@ -361,17 +361,17 @@ setMethod("collectPartition", serializedMode = getSerializedMode(x)) }) -#' @description -#' \code{collectAsMap} returns a named list as a map that contains all of the elements -#' in a key-value pair RDD. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)), 2L) -#' collectAsMap(rdd) # list(`1` = 2, `3` = 4) -#'} -#' @rdname collect-methods -#' @aliases collectAsMap,RDD-method +# @description +# \code{collectAsMap} returns a named list as a map that contains all of the elements +# in a key-value pair RDD. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(list(1, 2), list(3, 4)), 2L) +# collectAsMap(rdd) # list(`1` = 2, `3` = 4) +#} +# @rdname collect-methods +# @aliases collectAsMap,RDD-method setMethod("collectAsMap", signature(x = "RDD"), function(x) { @@ -381,19 +381,19 @@ setMethod("collectAsMap", as.list(map) }) -#' Return the number of elements in the RDD. -#' -#' @param x The RDD to count -#' @return number of elements in the RDD. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' count(rdd) # 10 -#' length(rdd) # Same as count -#'} -#' @rdname count -#' @aliases count,RDD-method +# Return the number of elements in the RDD. 
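The collection and counting actions documented here behave like their PySpark namesakes; a compact PySpark sketch, assuming a live `sc`:

    rdd = sc.parallelize([1, 2, 3, 2, 1], 2)
    rdd.getNumPartitions()    # 2
    rdd.collect()             # [1, 2, 3, 2, 1]
    rdd.count()               # 5
    dict(rdd.countByValue())  # {1: 2, 2: 2, 3: 1}
    pairs = sc.parallelize([(1, 2), (3, 4)])
    pairs.collectAsMap()      # {1: 2, 3: 4}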
+# +# @param x The RDD to count +# @return number of elements in the RDD. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# count(rdd) # 10 +# length(rdd) # Same as count +#} +# @rdname count +# @aliases count,RDD-method setMethod("count", signature(x = "RDD"), function(x) { @@ -405,31 +405,31 @@ setMethod("count", sum(as.integer(vals)) }) -#' Return the number of elements in the RDD -#' @export -#' @rdname count +# Return the number of elements in the RDD +# @export +# @rdname count setMethod("length", signature(x = "RDD"), function(x) { count(x) }) -#' Return the count of each unique value in this RDD as a list of -#' (value, count) pairs. -#' -#' Same as countByValue in Spark. -#' -#' @param x The RDD to count -#' @return list of (value, count) pairs, where count is number of each unique -#' value in rdd. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, c(1,2,3,2,1)) -#' countByValue(rdd) # (1,2L), (2,2L), (3,1L) -#'} -#' @rdname countByValue -#' @aliases countByValue,RDD-method +# Return the count of each unique value in this RDD as a list of +# (value, count) pairs. +# +# Same as countByValue in Spark. +# +# @param x The RDD to count +# @return list of (value, count) pairs, where count is number of each unique +# value in rdd. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, c(1,2,3,2,1)) +# countByValue(rdd) # (1,2L), (2,2L), (3,1L) +#} +# @rdname countByValue +# @aliases countByValue,RDD-method setMethod("countByValue", signature(x = "RDD"), function(x) { @@ -437,23 +437,23 @@ setMethod("countByValue", collect(reduceByKey(ones, `+`, numPartitions(x))) }) -#' Apply a function to all elements -#' -#' This function creates a new RDD by applying the given transformation to all -#' elements of the given RDD -#' -#' @param X The RDD to apply the transformation. -#' @param FUN the transformation to apply on each element -#' @return a new RDD created by the transformation. -#' @rdname lapply -#' @aliases lapply -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' multiplyByTwo <- lapply(rdd, function(x) { x * 2 }) -#' collect(multiplyByTwo) # 2,4,6... -#'} +# Apply a function to all elements +# +# This function creates a new RDD by applying the given transformation to all +# elements of the given RDD +# +# @param X The RDD to apply the transformation. +# @param FUN the transformation to apply on each element +# @return a new RDD created by the transformation. +# @rdname lapply +# @aliases lapply +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# multiplyByTwo <- lapply(rdd, function(x) { x * 2 }) +# collect(multiplyByTwo) # 2,4,6... +#} setMethod("lapply", signature(X = "RDD", FUN = "function"), function(X, FUN) { @@ -463,31 +463,31 @@ setMethod("lapply", lapplyPartitionsWithIndex(X, func) }) -#' @rdname lapply -#' @aliases map,RDD,function-method +# @rdname lapply +# @aliases map,RDD,function-method setMethod("map", signature(X = "RDD", FUN = "function"), function(X, FUN) { lapply(X, FUN) }) -#' Flatten results after apply a function to all elements -#' -#' This function return a new RDD by first applying a function to all -#' elements of this RDD, and then flattening the results. -#' -#' @param X The RDD to apply the transformation. -#' @param FUN the transformation to apply on each element -#' @return a new RDD created by the transformation. 
-#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' multiplyByTwo <- flatMap(rdd, function(x) { list(x*2, x*10) }) -#' collect(multiplyByTwo) # 2,20,4,40,6,60... -#'} -#' @rdname flatMap -#' @aliases flatMap,RDD,function-method +# Flatten results after apply a function to all elements +# +# This function return a new RDD by first applying a function to all +# elements of this RDD, and then flattening the results. +# +# @param X The RDD to apply the transformation. +# @param FUN the transformation to apply on each element +# @return a new RDD created by the transformation. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# multiplyByTwo <- flatMap(rdd, function(x) { list(x*2, x*10) }) +# collect(multiplyByTwo) # 2,20,4,40,6,60... +#} +# @rdname flatMap +# @aliases flatMap,RDD,function-method setMethod("flatMap", signature(X = "RDD", FUN = "function"), function(X, FUN) { @@ -500,83 +500,83 @@ setMethod("flatMap", lapplyPartition(X, partitionFunc) }) -#' Apply a function to each partition of an RDD -#' -#' Return a new RDD by applying a function to each partition of this RDD. -#' -#' @param X The RDD to apply the transformation. -#' @param FUN the transformation to apply on each partition. -#' @return a new RDD created by the transformation. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' partitionSum <- lapplyPartition(rdd, function(part) { Reduce("+", part) }) -#' collect(partitionSum) # 15, 40 -#'} -#' @rdname lapplyPartition -#' @aliases lapplyPartition,RDD,function-method +# Apply a function to each partition of an RDD +# +# Return a new RDD by applying a function to each partition of this RDD. +# +# @param X The RDD to apply the transformation. +# @param FUN the transformation to apply on each partition. +# @return a new RDD created by the transformation. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# partitionSum <- lapplyPartition(rdd, function(part) { Reduce("+", part) }) +# collect(partitionSum) # 15, 40 +#} +# @rdname lapplyPartition +# @aliases lapplyPartition,RDD,function-method setMethod("lapplyPartition", signature(X = "RDD", FUN = "function"), function(X, FUN) { lapplyPartitionsWithIndex(X, function(s, part) { FUN(part) }) }) -#' mapPartitions is the same as lapplyPartition. -#' -#' @rdname lapplyPartition -#' @aliases mapPartitions,RDD,function-method +# mapPartitions is the same as lapplyPartition. +# +# @rdname lapplyPartition +# @aliases mapPartitions,RDD,function-method setMethod("mapPartitions", signature(X = "RDD", FUN = "function"), function(X, FUN) { lapplyPartition(X, FUN) }) -#' Return a new RDD by applying a function to each partition of this RDD, while -#' tracking the index of the original partition. -#' -#' @param X The RDD to apply the transformation. -#' @param FUN the transformation to apply on each partition; takes the partition -#' index and a list of elements in the particular partition. -#' @return a new RDD created by the transformation. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 5L) -#' prod <- lapplyPartitionsWithIndex(rdd, function(partIndex, part) { -#' partIndex * Reduce("+", part) }) -#' collect(prod, flatten = FALSE) # 0, 7, 22, 45, 76 -#'} -#' @rdname lapplyPartitionsWithIndex -#' @aliases lapplyPartitionsWithIndex,RDD,function-method +# Return a new RDD by applying a function to each partition of this RDD, while +# tracking the index of the original partition. 
+# +# @param X The RDD to apply the transformation. +# @param FUN the transformation to apply on each partition; takes the partition +# index and a list of elements in the particular partition. +# @return a new RDD created by the transformation. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 5L) +# prod <- lapplyPartitionsWithIndex(rdd, function(partIndex, part) { +# partIndex * Reduce("+", part) }) +# collect(prod, flatten = FALSE) # 0, 7, 22, 45, 76 +#} +# @rdname lapplyPartitionsWithIndex +# @aliases lapplyPartitionsWithIndex,RDD,function-method setMethod("lapplyPartitionsWithIndex", signature(X = "RDD", FUN = "function"), function(X, FUN) { PipelinedRDD(X, FUN) }) -#' @rdname lapplyPartitionsWithIndex -#' @aliases mapPartitionsWithIndex,RDD,function-method +# @rdname lapplyPartitionsWithIndex +# @aliases mapPartitionsWithIndex,RDD,function-method setMethod("mapPartitionsWithIndex", signature(X = "RDD", FUN = "function"), function(X, FUN) { lapplyPartitionsWithIndex(X, FUN) }) -#' This function returns a new RDD containing only the elements that satisfy -#' a predicate (i.e. returning TRUE in a given logical function). -#' The same as `filter()' in Spark. -#' -#' @param x The RDD to be filtered. -#' @param f A unary predicate function. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' unlist(collect(filterRDD(rdd, function (x) { x < 3 }))) # c(1, 2) -#'} -#' @rdname filterRDD -#' @aliases filterRDD,RDD,function-method +# This function returns a new RDD containing only the elements that satisfy +# a predicate (i.e. returning TRUE in a given logical function). +# The same as `filter()' in Spark. +# +# @param x The RDD to be filtered. +# @param f A unary predicate function. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# unlist(collect(filterRDD(rdd, function (x) { x < 3 }))) # c(1, 2) +#} +# @rdname filterRDD +# @aliases filterRDD,RDD,function-method setMethod("filterRDD", signature(x = "RDD", f = "function"), function(x, f) { @@ -586,30 +586,30 @@ setMethod("filterRDD", lapplyPartition(x, filter.func) }) -#' @rdname filterRDD -#' @aliases Filter +# @rdname filterRDD +# @aliases Filter setMethod("Filter", signature(f = "function", x = "RDD"), function(f, x) { filterRDD(x, f) }) -#' Reduce across elements of an RDD. -#' -#' This function reduces the elements of this RDD using the -#' specified commutative and associative binary operator. -#' -#' @param x The RDD to reduce -#' @param func Commutative and associative function to apply on elements -#' of the RDD. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' reduce(rdd, "+") # 55 -#'} -#' @rdname reduce -#' @aliases reduce,RDD,ANY-method +# Reduce across elements of an RDD. +# +# This function reduces the elements of this RDD using the +# specified commutative and associative binary operator. +# +# @param x The RDD to reduce +# @param func Commutative and associative function to apply on elements +# of the RDD. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# reduce(rdd, "+") # 55 +#} +# @rdname reduce +# @aliases reduce,RDD,ANY-method setMethod("reduce", signature(x = "RDD", func = "ANY"), function(x, func) { @@ -623,70 +623,70 @@ setMethod("reduce", Reduce(func, partitionList) }) -#' Get the maximum element of an RDD. 
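For comparison, the PySpark equivalents of the element-level and partition-level transformations and reductions documented in this stretch, assuming a live `sc`:

    rdd = sc.parallelize(range(1, 11), 2)   # partitions: [1..5], [6..10]
    rdd.map(lambda x: x * 2).collect()      # [2, 4, 6, ..., 20]
    rdd.filter(lambda x: x < 3).collect()   # [1, 2]
    # The function receives the partition index and an iterator over it.
    rdd.mapPartitionsWithIndex(
        lambda i, part: [i * sum(part)]).collect()   # [0, 40]
    rdd.reduce(lambda a, b: a + b)          # 55
    rdd.max(), rdd.min()                    # (10, 1)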
-#' -#' @param x The RDD to get the maximum element from -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' maximum(rdd) # 10 -#'} -#' @rdname maximum -#' @aliases maximum,RDD +# Get the maximum element of an RDD. +# +# @param x The RDD to get the maximum element from +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# maximum(rdd) # 10 +#} +# @rdname maximum +# @aliases maximum,RDD setMethod("maximum", signature(x = "RDD"), function(x) { reduce(x, max) }) -#' Get the minimum element of an RDD. -#' -#' @param x The RDD to get the minimum element from -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' minimum(rdd) # 1 -#'} -#' @rdname minimum -#' @aliases minimum,RDD +# Get the minimum element of an RDD. +# +# @param x The RDD to get the minimum element from +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# minimum(rdd) # 1 +#} +# @rdname minimum +# @aliases minimum,RDD setMethod("minimum", signature(x = "RDD"), function(x) { reduce(x, min) }) -#' Add up the elements in an RDD. -#' -#' @param x The RDD to add up the elements in -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' sumRDD(rdd) # 55 -#'} -#' @rdname sumRDD -#' @aliases sumRDD,RDD +# Add up the elements in an RDD. +# +# @param x The RDD to add up the elements in +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# sumRDD(rdd) # 55 +#} +# @rdname sumRDD +# @aliases sumRDD,RDD setMethod("sumRDD", signature(x = "RDD"), function(x) { reduce(x, "+") }) -#' Applies a function to all elements in an RDD, and force evaluation. -#' -#' @param x The RDD to apply the function -#' @param func The function to be applied. -#' @return invisible NULL. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' foreach(rdd, function(x) { save(x, file=...) }) -#'} -#' @rdname foreach -#' @aliases foreach,RDD,function-method +# Applies a function to all elements in an RDD, and force evaluation. +# +# @param x The RDD to apply the function +# @param func The function to be applied. +# @return invisible NULL. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# foreach(rdd, function(x) { save(x, file=...) }) +#} +# @rdname foreach +# @aliases foreach,RDD,function-method setMethod("foreach", signature(x = "RDD", func = "function"), function(x, func) { @@ -697,37 +697,37 @@ setMethod("foreach", invisible(collect(mapPartitions(x, partition.func))) }) -#' Applies a function to each partition in an RDD, and force evaluation. -#' -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' foreachPartition(rdd, function(part) { save(part, file=...); NULL }) -#'} -#' @rdname foreach -#' @aliases foreachPartition,RDD,function-method +# Applies a function to each partition in an RDD, and force evaluation. +# +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# foreachPartition(rdd, function(part) { save(part, file=...); NULL }) +#} +# @rdname foreach +# @aliases foreachPartition,RDD,function-method setMethod("foreachPartition", signature(x = "RDD", func = "function"), function(x, func) { invisible(collect(mapPartitions(x, func))) }) -#' Take elements from an RDD. -#' -#' This function takes the first NUM elements in the RDD and -#' returns them in a list. 
-#' -#' @param x The RDD to take elements from -#' @param num Number of elements to take -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' take(rdd, 2L) # list(1, 2) -#'} -#' @rdname take -#' @aliases take,RDD,numeric-method +# Take elements from an RDD. +# +# This function takes the first NUM elements in the RDD and +# returns them in a list. +# +# @param x The RDD to take elements from +# @param num Number of elements to take +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# take(rdd, 2L) # list(1, 2) +#} +# @rdname take +# @aliases take,RDD,numeric-method setMethod("take", signature(x = "RDD", num = "numeric"), function(x, num) { @@ -762,39 +762,39 @@ setMethod("take", }) -#' First -#' -#' Return the first element of an RDD -#' -#' @rdname first -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' first(rdd) -#' } +# First +# +# Return the first element of an RDD +# +# @rdname first +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# first(rdd) +# } setMethod("first", signature(x = "RDD"), function(x) { take(x, 1)[[1]] }) -#' Removes the duplicates from RDD. -#' -#' This function returns a new RDD containing the distinct elements in the -#' given RDD. The same as `distinct()' in Spark. -#' -#' @param x The RDD to remove duplicates from. -#' @param numPartitions Number of partitions to create. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, c(1,2,2,3,3,3)) -#' sort(unlist(collect(distinct(rdd)))) # c(1, 2, 3) -#'} -#' @rdname distinct -#' @aliases distinct,RDD-method +# Removes the duplicates from RDD. +# +# This function returns a new RDD containing the distinct elements in the +# given RDD. The same as `distinct()' in Spark. +# +# @param x The RDD to remove duplicates from. +# @param numPartitions Number of partitions to create. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, c(1,2,2,3,3,3)) +# sort(unlist(collect(distinct(rdd)))) # c(1, 2, 3) +#} +# @rdname distinct +# @aliases distinct,RDD-method setMethod("distinct", signature(x = "RDD"), function(x, numPartitions = SparkR:::numPartitions(x)) { @@ -806,24 +806,24 @@ setMethod("distinct", resRDD }) -#' Return an RDD that is a sampled subset of the given RDD. -#' -#' The same as `sample()' in Spark. (We rename it due to signature -#' inconsistencies with the `sample()' function in R's base package.) -#' -#' @param x The RDD to sample elements from -#' @param withReplacement Sampling with replacement or not -#' @param fraction The (rough) sample target fraction -#' @param seed Randomness seed value -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10) -#' collect(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements -#' collect(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates -#'} -#' @rdname sampleRDD -#' @aliases sampleRDD,RDD +# Return an RDD that is a sampled subset of the given RDD. +# +# The same as `sample()' in Spark. (We rename it due to signature +# inconsistencies with the `sample()' function in R's base package.) 
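The sampling and dedup actions here also track the PySpark API; a sketch, assuming a live `sc` (seeds are arbitrary):

    rdd = sc.parallelize([1, 2, 2, 3, 3, 3])
    rdd.take(2)                        # [1, 2]
    rdd.first()                        # 1
    sorted(rdd.distinct().collect())   # [1, 2, 3]
    # sample() targets a rough fraction, so its size varies run to run;
    # takeSample() returns exactly `num` elements.
    rdd.sample(False, 0.5, 42).collect()
    rdd.takeSample(False, 3, 42)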
+# +# @param x The RDD to sample elements from +# @param withReplacement Sampling with replacement or not +# @param fraction The (rough) sample target fraction +# @param seed Randomness seed value +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10) +# collect(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements +# collect(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates +#} +# @rdname sampleRDD +# @aliases sampleRDD,RDD setMethod("sampleRDD", signature(x = "RDD", withReplacement = "logical", fraction = "numeric", seed = "integer"), @@ -867,23 +867,23 @@ setMethod("sampleRDD", lapplyPartitionsWithIndex(x, samplingFunc) }) -#' Return a list of the elements that are a sampled subset of the given RDD. -#' -#' @param x The RDD to sample elements from -#' @param withReplacement Sampling with replacement or not -#' @param num Number of elements to return -#' @param seed Randomness seed value -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:100) -#' # exactly 5 elements sampled, which may not be distinct -#' takeSample(rdd, TRUE, 5L, 1618L) -#' # exactly 5 distinct elements sampled -#' takeSample(rdd, FALSE, 5L, 16181618L) -#'} -#' @rdname takeSample -#' @aliases takeSample,RDD +# Return a list of the elements that are a sampled subset of the given RDD. +# +# @param x The RDD to sample elements from +# @param withReplacement Sampling with replacement or not +# @param num Number of elements to return +# @param seed Randomness seed value +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:100) +# # exactly 5 elements sampled, which may not be distinct +# takeSample(rdd, TRUE, 5L, 1618L) +# # exactly 5 distinct elements sampled +# takeSample(rdd, FALSE, 5L, 16181618L) +#} +# @rdname takeSample +# @aliases takeSample,RDD setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", num = "integer", seed = "integer"), function(x, withReplacement, num, seed) { @@ -930,18 +930,18 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", sample(samples)[1:total] }) -#' Creates tuples of the elements in this RDD by applying a function. -#' -#' @param x The RDD. -#' @param func The function to be applied. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(1, 2, 3)) -#' collect(keyBy(rdd, function(x) { x*x })) # list(list(1, 1), list(4, 2), list(9, 3)) -#'} -#' @rdname keyBy -#' @aliases keyBy,RDD +# Creates tuples of the elements in this RDD by applying a function. +# +# @param x The RDD. +# @param func The function to be applied. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(1, 2, 3)) +# collect(keyBy(rdd, function(x) { x*x })) # list(list(1, 1), list(4, 2), list(9, 3)) +#} +# @rdname keyBy +# @aliases keyBy,RDD setMethod("keyBy", signature(x = "RDD", func = "function"), function(x, func) { @@ -951,44 +951,44 @@ setMethod("keyBy", lapply(x, apply.func) }) -#' Return a new RDD that has exactly numPartitions partitions. -#' Can increase or decrease the level of parallelism in this RDD. Internally, -#' this uses a shuffle to redistribute data. -#' If you are decreasing the number of partitions in this RDD, consider using -#' coalesce, which can avoid performing a shuffle. -#' -#' @param x The RDD. -#' @param numPartitions Number of partitions to create. 
-#' @seealso coalesce -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(1, 2, 3, 4, 5, 6, 7), 4L) -#' numPartitions(rdd) # 4 -#' numPartitions(repartition(rdd, 2L)) # 2 -#'} -#' @rdname repartition -#' @aliases repartition,RDD +# Return a new RDD that has exactly numPartitions partitions. +# Can increase or decrease the level of parallelism in this RDD. Internally, +# this uses a shuffle to redistribute data. +# If you are decreasing the number of partitions in this RDD, consider using +# coalesce, which can avoid performing a shuffle. +# +# @param x The RDD. +# @param numPartitions Number of partitions to create. +# @seealso coalesce +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(1, 2, 3, 4, 5, 6, 7), 4L) +# numPartitions(rdd) # 4 +# numPartitions(repartition(rdd, 2L)) # 2 +#} +# @rdname repartition +# @aliases repartition,RDD setMethod("repartition", signature(x = "RDD", numPartitions = "numeric"), function(x, numPartitions) { coalesce(x, numPartitions, TRUE) }) -#' Return a new RDD that is reduced into numPartitions partitions. -#' -#' @param x The RDD. -#' @param numPartitions Number of partitions to create. -#' @seealso repartition -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(1, 2, 3, 4, 5), 3L) -#' numPartitions(rdd) # 3 -#' numPartitions(coalesce(rdd, 1L)) # 1 -#'} -#' @rdname coalesce -#' @aliases coalesce,RDD +# Return a new RDD that is reduced into numPartitions partitions. +# +# @param x The RDD. +# @param numPartitions Number of partitions to create. +# @seealso repartition +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(1, 2, 3, 4, 5), 3L) +# numPartitions(rdd) # 3 +# numPartitions(coalesce(rdd, 1L)) # 1 +#} +# @rdname coalesce +# @aliases coalesce,RDD setMethod("coalesce", signature(x = "RDD", numPartitions = "numeric"), function(x, numPartitions, shuffle = FALSE) { @@ -1012,19 +1012,19 @@ setMethod("coalesce", } }) -#' Save this RDD as a SequenceFile of serialized objects. -#' -#' @param x The RDD to save -#' @param path The directory where the file is saved -#' @seealso objectFile -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:3) -#' saveAsObjectFile(rdd, "/tmp/sparkR-tmp") -#'} -#' @rdname saveAsObjectFile -#' @aliases saveAsObjectFile,RDD +# Save this RDD as a SequenceFile of serialized objects. +# +# @param x The RDD to save +# @param path The directory where the file is saved +# @seealso objectFile +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:3) +# saveAsObjectFile(rdd, "/tmp/sparkR-tmp") +#} +# @rdname saveAsObjectFile +# @aliases saveAsObjectFile,RDD setMethod("saveAsObjectFile", signature(x = "RDD", path = "character"), function(x, path) { @@ -1037,18 +1037,18 @@ setMethod("saveAsObjectFile", invisible(callJMethod(getJRDD(x), "saveAsObjectFile", path)) }) -#' Save this RDD as a text file, using string representations of elements. -#' -#' @param x The RDD to save -#' @param path The directory where the partitions of the text file are saved -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:3) -#' saveAsTextFile(rdd, "/tmp/sparkR-tmp") -#'} -#' @rdname saveAsTextFile -#' @aliases saveAsTextFile,RDD +# Save this RDD as a text file, using string representations of elements. 
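As with the SparkR docs above, repartition shuffles while coalesce by default does not; the PySpark counterpart, assuming a live `sc` (the output path is illustrative):

    rdd = sc.parallelize(range(7), 4)
    rdd.getNumPartitions()                  # 4
    rdd.repartition(2).getNumPartitions()   # 2, via a full shuffle
    rdd.coalesce(1).getNumPartitions()      # 1, avoiding a shuffle
    rdd.saveAsTextFile("/tmp/sparkR-tmp")   # one part file per partition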
+# +# @param x The RDD to save +# @param path The directory where the partitions of the text file are saved +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:3) +# saveAsTextFile(rdd, "/tmp/sparkR-tmp") +#} +# @rdname saveAsTextFile +# @aliases saveAsTextFile,RDD setMethod("saveAsTextFile", signature(x = "RDD", path = "character"), function(x, path) { @@ -1061,21 +1061,21 @@ setMethod("saveAsTextFile", callJMethod(getJRDD(stringRdd, serializedMode = "string"), "saveAsTextFile", path)) }) -#' Sort an RDD by the given key function. -#' -#' @param x An RDD to be sorted. -#' @param func A function used to compute the sort key for each element. -#' @param ascending A flag to indicate whether the sorting is ascending or descending. -#' @param numPartitions Number of partitions to create. -#' @return An RDD where all elements are sorted. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(3, 2, 1)) -#' collect(sortBy(rdd, function(x) { x })) # list (1, 2, 3) -#'} -#' @rdname sortBy -#' @aliases sortBy,RDD,RDD-method +# Sort an RDD by the given key function. +# +# @param x An RDD to be sorted. +# @param func A function used to compute the sort key for each element. +# @param ascending A flag to indicate whether the sorting is ascending or descending. +# @param numPartitions Number of partitions to create. +# @return An RDD where all elements are sorted. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(3, 2, 1)) +# collect(sortBy(rdd, function(x) { x })) # list (1, 2, 3) +#} +# @rdname sortBy +# @aliases sortBy,RDD,RDD-method setMethod("sortBy", signature(x = "RDD", func = "function"), function(x, func, ascending = TRUE, numPartitions = SparkR:::numPartitions(x)) { @@ -1137,97 +1137,97 @@ takeOrderedElem <- function(x, num, ascending = TRUE) { resList } -#' Returns the first N elements from an RDD in ascending order. -#' -#' @param x An RDD. -#' @param num Number of elements to return. -#' @return The first N elements from the RDD in ascending order. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7)) -#' takeOrdered(rdd, 6L) # list(1, 2, 3, 4, 5, 6) -#'} -#' @rdname takeOrdered -#' @aliases takeOrdered,RDD,RDD-method +# Returns the first N elements from an RDD in ascending order. +# +# @param x An RDD. +# @param num Number of elements to return. +# @return The first N elements from the RDD in ascending order. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7)) +# takeOrdered(rdd, 6L) # list(1, 2, 3, 4, 5, 6) +#} +# @rdname takeOrdered +# @aliases takeOrdered,RDD,RDD-method setMethod("takeOrdered", signature(x = "RDD", num = "integer"), function(x, num) { takeOrderedElem(x, num) }) -#' Returns the top N elements from an RDD. -#' -#' @param x An RDD. -#' @param num Number of elements to return. -#' @return The top N elements from the RDD. -#' @rdname top -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7)) -#' top(rdd, 6L) # list(10, 9, 7, 6, 5, 4) -#'} -#' @rdname top -#' @aliases top,RDD,RDD-method +# Returns the top N elements from an RDD. +# +# @param x An RDD. +# @param num Number of elements to return. +# @return The top N elements from the RDD. 
+# @rdname top +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7)) +# top(rdd, 6L) # list(10, 9, 7, 6, 5, 4) +#} +# @rdname top +# @aliases top,RDD,RDD-method setMethod("top", signature(x = "RDD", num = "integer"), function(x, num) { takeOrderedElem(x, num, FALSE) }) -#' Fold an RDD using a given associative function and a neutral "zero value". -#' -#' Aggregate the elements of each partition, and then the results for all the -#' partitions, using a given associative function and a neutral "zero value". -#' -#' @param x An RDD. -#' @param zeroValue A neutral "zero value". -#' @param op An associative function for the folding operation. -#' @return The folding result. -#' @rdname fold -#' @seealso reduce -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(1, 2, 3, 4, 5)) -#' fold(rdd, 0, "+") # 15 -#'} -#' @rdname fold -#' @aliases fold,RDD,RDD-method +# Fold an RDD using a given associative function and a neutral "zero value". +# +# Aggregate the elements of each partition, and then the results for all the +# partitions, using a given associative function and a neutral "zero value". +# +# @param x An RDD. +# @param zeroValue A neutral "zero value". +# @param op An associative function for the folding operation. +# @return The folding result. +# @rdname fold +# @seealso reduce +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(1, 2, 3, 4, 5)) +# fold(rdd, 0, "+") # 15 +#} +# @rdname fold +# @aliases fold,RDD,RDD-method setMethod("fold", signature(x = "RDD", zeroValue = "ANY", op = "ANY"), function(x, zeroValue, op) { aggregateRDD(x, zeroValue, op, op) }) -#' Aggregate an RDD using the given combine functions and a neutral "zero value". -#' -#' Aggregate the elements of each partition, and then the results for all the -#' partitions, using given combine functions and a neutral "zero value". -#' -#' @param x An RDD. -#' @param zeroValue A neutral "zero value". -#' @param seqOp A function to aggregate the RDD elements. It may return a different -#' result type from the type of the RDD elements. -#' @param combOp A function to aggregate results of seqOp. -#' @return The aggregation result. -#' @rdname aggregateRDD -#' @seealso reduce -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list(1, 2, 3, 4)) -#' zeroValue <- list(0, 0) -#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) } -#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) } -#' aggregateRDD(rdd, zeroValue, seqOp, combOp) # list(10, 4) -#'} -#' @rdname aggregateRDD -#' @aliases aggregateRDD,RDD,RDD-method +# Aggregate an RDD using the given combine functions and a neutral "zero value". +# +# Aggregate the elements of each partition, and then the results for all the +# partitions, using given combine functions and a neutral "zero value". +# +# @param x An RDD. +# @param zeroValue A neutral "zero value". +# @param seqOp A function to aggregate the RDD elements. It may return a different +# result type from the type of the RDD elements. +# @param combOp A function to aggregate results of seqOp. +# @return The aggregation result. 
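The ordering and aggregation actions documented here follow the same contracts in PySpark; a sketch, assuming a live `sc`:

    rdd = sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7])
    rdd.takeOrdered(6)   # [1, 2, 3, 4, 5, 6]
    rdd.top(6)           # [10, 9, 7, 6, 5, 4]
    nums = sc.parallelize([1, 2, 3, 4])
    nums.fold(0, lambda a, b: a + b)   # 10
    # aggregate() threads a (sum, count) pair through both phases, so the
    # result type can differ from the element type.
    nums.aggregate((0, 0),
                   lambda acc, x: (acc[0] + x, acc[1] + 1),
                   lambda a, b: (a[0] + b[0], a[1] + b[1]))   # (10, 4)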
+# @rdname aggregateRDD
+# @seealso reduce
+# @export
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(1, 2, 3, 4))
+# zeroValue <- list(0, 0)
+# seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+# combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+# aggregateRDD(rdd, zeroValue, seqOp, combOp) # list(10, 4)
+#}
+# @rdname aggregateRDD
+# @aliases aggregateRDD,RDD,RDD-method
 setMethod("aggregateRDD",
           signature(x = "RDD", zeroValue = "ANY", seqOp = "ANY", combOp = "ANY"),
           function(x, zeroValue, seqOp, combOp) {
@@ -1240,25 +1240,25 @@ setMethod("aggregateRDD",
             Reduce(combOp, partitionList, zeroValue)
           })

-#' Pipes elements to a forked external process.
-#'
-#' The same as 'pipe()' in Spark.
-#'
-#' @param x The RDD whose elements are piped to the forked external process.
-#' @param command The command to fork an external process.
-#' @param env A named list to set environment variables of the external process.
-#' @return A new RDD created by piping all elements to a forked external process.
-#' @rdname pipeRDD
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:10)
-#' collect(pipeRDD(rdd, "more")
-#' Output: c("1", "2", ..., "10")
-#'}
-#' @rdname pipeRDD
-#' @aliases pipeRDD,RDD,character-method
+# Pipes elements to a forked external process.
+#
+# The same as 'pipe()' in Spark.
+#
+# @param x The RDD whose elements are piped to the forked external process.
+# @param command The command to fork an external process.
+# @param env A named list to set environment variables of the external process.
+# @return A new RDD created by piping all elements to a forked external process.
+# @rdname pipeRDD
+# @export
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, 1:10)
+# collect(pipeRDD(rdd, "more"))
+# Output: c("1", "2", ..., "10")
+#}
+# @rdname pipeRDD
+# @aliases pipeRDD,RDD,character-method
 setMethod("pipeRDD",
           signature(x = "RDD", command = "character"),
           function(x, command, env = list()) {
@@ -1274,41 +1274,41 @@ setMethod("pipeRDD",
           })

 # TODO: Consider caching the name in the RDD's environment
-#' Return an RDD's name.
-#'
-#' @param x The RDD whose name is returned.
-#' @rdname name
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(1,2,3))
-#' name(rdd) # NULL (if not set before)
-#'}
-#' @rdname name
-#' @aliases name,RDD
+# Return an RDD's name.
+#
+# @param x The RDD whose name is returned.
+# @rdname name
+# @export
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(1,2,3))
+# name(rdd) # NULL (if not set before)
+#}
+# @rdname name
+# @aliases name,RDD
 setMethod("name",
           signature(x = "RDD"),
           function(x) {
            callJMethod(getJRDD(x), "name")
           })

-#' Set an RDD's name.
-#'
-#' @param x The RDD whose name is to be set.
-#' @param name The RDD name to be set.
-#' @return a new RDD renamed.
-#' @rdname setName
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(1,2,3))
-#' setName(rdd, "myRDD")
-#' name(rdd) # "myRDD"
-#'}
-#' @rdname setName
-#' @aliases setName,RDD
+# Set an RDD's name.
+#
+# @param x The RDD whose name is to be set.
+# @param name The RDD name to be set.
+# @return A new RDD with the name set.
+# @rdname setName +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list(1,2,3)) +# setName(rdd, "myRDD") +# name(rdd) # "myRDD" +#} +# @rdname setName +# @aliases setName,RDD setMethod("setName", signature(x = "RDD", name = "character"), function(x, name) { @@ -1316,25 +1316,25 @@ setMethod("setName", x }) -#' Zip an RDD with generated unique Long IDs. -#' -#' Items in the kth partition will get ids k, n+k, 2*n+k, ..., where -#' n is the number of partitions. So there may exist gaps, but this -#' method won't trigger a spark job, which is different from -#' zipWithIndex. -#' -#' @param x An RDD to be zipped. -#' @return An RDD with zipped items. -#' @seealso zipWithIndex -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) -#' collect(zipWithUniqueId(rdd)) -#' # list(list("a", 0), list("b", 3), list("c", 1), list("d", 4), list("e", 2)) -#'} -#' @rdname zipWithUniqueId -#' @aliases zipWithUniqueId,RDD +# Zip an RDD with generated unique Long IDs. +# +# Items in the kth partition will get ids k, n+k, 2*n+k, ..., where +# n is the number of partitions. So there may exist gaps, but this +# method won't trigger a spark job, which is different from +# zipWithIndex. +# +# @param x An RDD to be zipped. +# @return An RDD with zipped items. +# @seealso zipWithIndex +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) +# collect(zipWithUniqueId(rdd)) +# # list(list("a", 0), list("b", 3), list("c", 1), list("d", 4), list("e", 2)) +#} +# @rdname zipWithUniqueId +# @aliases zipWithUniqueId,RDD setMethod("zipWithUniqueId", signature(x = "RDD"), function(x) { @@ -1353,28 +1353,28 @@ setMethod("zipWithUniqueId", lapplyPartitionsWithIndex(x, partitionFunc) }) -#' Zip an RDD with its element indices. -#' -#' The ordering is first based on the partition index and then the -#' ordering of items within each partition. So the first item in -#' the first partition gets index 0, and the last item in the last -#' partition receives the largest index. -#' -#' This method needs to trigger a Spark job when this RDD contains -#' more than one partition. -#' -#' @param x An RDD to be zipped. -#' @return An RDD with zipped items. -#' @seealso zipWithUniqueId -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) -#' collect(zipWithIndex(rdd)) -#' # list(list("a", 0), list("b", 1), list("c", 2), list("d", 3), list("e", 4)) -#'} -#' @rdname zipWithIndex -#' @aliases zipWithIndex,RDD +# Zip an RDD with its element indices. +# +# The ordering is first based on the partition index and then the +# ordering of items within each partition. So the first item in +# the first partition gets index 0, and the last item in the last +# partition receives the largest index. +# +# This method needs to trigger a Spark job when this RDD contains +# more than one partition. +# +# @param x An RDD to be zipped. +# @return An RDD with zipped items. 
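The id arithmetic in zipWithUniqueId() above can be checked by hand: with n partitions, the jth item of partition k receives id j * n + k. A worked sketch matching the documented example:

sc <- sparkR.init()
rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)  # n = 3
# partition 0: "a", "b" -> ids 0, 3; partition 1: "c", "d" -> ids 1, 4;
# partition 2: "e" -> id 2; id 5 is never used, hence the possible gaps
collect(zipWithUniqueId(rdd))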
+# @seealso zipWithUniqueId
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
+# collect(zipWithIndex(rdd))
+# # list(list("a", 0), list("b", 1), list("c", 2), list("d", 3), list("e", 4))
+#}
+# @rdname zipWithIndex
+# @aliases zipWithIndex,RDD
 setMethod("zipWithIndex",
           signature(x = "RDD"),
           function(x) {
@@ -1406,20 +1406,20 @@ setMethod("zipWithIndex",
             lapplyPartitionsWithIndex(x, partitionFunc)
           })

-#' Coalesce all elements within each partition of an RDD into a list.
-#'
-#' @param x An RDD.
-#' @return An RDD created by coalescing all elements within
-#' each partition into a list.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, as.list(1:4), 2L)
-#' collect(glom(rdd))
-#' # list(list(1, 2), list(3, 4))
-#'}
-#' @rdname glom
-#' @aliases glom,RDD
+# Coalesce all elements within each partition of an RDD into a list.
+#
+# @param x An RDD.
+# @return An RDD created by coalescing all elements within
+# each partition into a list.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, as.list(1:4), 2L)
+# collect(glom(rdd))
+# # list(list(1, 2), list(3, 4))
+#}
+# @rdname glom
+# @aliases glom,RDD
 setMethod("glom",
           signature(x = "RDD"),
           function(x) {
@@ -1432,21 +1432,21 @@ setMethod("glom",

 ############ Binary Functions #############

-#' Return the union RDD of two RDDs.
-#' The same as union() in Spark.
-#'
-#' @param x An RDD.
-#' @param y An RDD.
-#' @return a new RDD created by performing the simple union (witout removing
-#' duplicates) of two input RDDs.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:3)
-#' unionRDD(rdd, rdd) # 1, 2, 3, 1, 2, 3
-#'}
-#' @rdname unionRDD
-#' @aliases unionRDD,RDD,RDD-method
+# Return the union RDD of two RDDs.
+# The same as union() in Spark.
+#
+# @param x An RDD.
+# @param y An RDD.
+# @return a new RDD created by performing the simple union (without removing
+# duplicates) of two input RDDs.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, 1:3)
+# unionRDD(rdd, rdd) # 1, 2, 3, 1, 2, 3
+#}
+# @rdname unionRDD
+# @aliases unionRDD,RDD,RDD-method
 setMethod("unionRDD",
           signature(x = "RDD", y = "RDD"),
           function(x, y) {
@@ -1463,27 +1463,27 @@ setMethod("unionRDD",
             union.rdd
           })

-#' Zip an RDD with another RDD.
-#'
-#' Zips this RDD with another one, returning key-value pairs with the
-#' first element in each RDD second element in each RDD, etc. Assumes
-#' that the two RDDs have the same number of partitions and the same
-#' number of elements in each partition (e.g. one was made through
-#' a map on the other).
-#'
-#' @param x An RDD to be zipped.
-#' @param other Another RDD to be zipped.
-#' @return An RDD zipped from the two RDDs.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, 0:4)
-#' rdd2 <- parallelize(sc, 1000:1004)
-#' collect(zipRDD(rdd1, rdd2))
-#' # list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004))
-#'}
-#' @rdname zipRDD
-#' @aliases zipRDD,RDD
+# Zip an RDD with another RDD.
+#
+# Zips this RDD with another one, returning key-value pairs with the
+# first element in each RDD, second element in each RDD, etc. Assumes
+# that the two RDDs have the same number of partitions and the same
+# number of elements in each partition (e.g. one was made through
+# a map on the other).
+#
+# @param x An RDD to be zipped.
+# @param other Another RDD to be zipped.
+# @return An RDD zipped from the two RDDs.
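Because zipRDD() requires the same number of partitions and the same number of elements per partition, the safest way to build a zippable pair is to derive one RDD from the other, e.g. via lapply. A minimal sketch assuming the usual session:

sc <- sparkR.init()
rdd <- parallelize(sc, 1:5, 2L)
squares <- lapply(rdd, function(x) { x * x })  # inherits rdd's partitioning
collect(zipRDD(rdd, squares))
# list(list(1, 1), list(2, 4), list(3, 9), list(4, 16), list(5, 25))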
+# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd1 <- parallelize(sc, 0:4) +# rdd2 <- parallelize(sc, 1000:1004) +# collect(zipRDD(rdd1, rdd2)) +# # list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)) +#} +# @rdname zipRDD +# @aliases zipRDD,RDD setMethod("zipRDD", signature(x = "RDD", other = "RDD"), function(x, other) { @@ -1502,24 +1502,24 @@ setMethod("zipRDD", mergePartitions(rdd, TRUE) }) -#' Cartesian product of this RDD and another one. -#' -#' Return the Cartesian product of this RDD and another one, -#' that is, the RDD of all pairs of elements (a, b) where a -#' is in this and b is in other. -#' -#' @param x An RDD. -#' @param other An RDD. -#' @return A new RDD which is the Cartesian product of these two RDDs. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:2) -#' sortByKey(cartesian(rdd, rdd)) -#' # list(list(1, 1), list(1, 2), list(2, 1), list(2, 2)) -#'} -#' @rdname cartesian -#' @aliases cartesian,RDD,RDD-method +# Cartesian product of this RDD and another one. +# +# Return the Cartesian product of this RDD and another one, +# that is, the RDD of all pairs of elements (a, b) where a +# is in this and b is in other. +# +# @param x An RDD. +# @param other An RDD. +# @return A new RDD which is the Cartesian product of these two RDDs. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:2) +# sortByKey(cartesian(rdd, rdd)) +# # list(list(1, 1), list(1, 2), list(2, 1), list(2, 2)) +#} +# @rdname cartesian +# @aliases cartesian,RDD,RDD-method setMethod("cartesian", signature(x = "RDD", other = "RDD"), function(x, other) { @@ -1532,24 +1532,24 @@ setMethod("cartesian", mergePartitions(rdd, FALSE) }) -#' Subtract an RDD with another RDD. -#' -#' Return an RDD with the elements from this that are not in other. -#' -#' @param x An RDD. -#' @param other An RDD. -#' @param numPartitions Number of the partitions in the result RDD. -#' @return An RDD with the elements from this that are not in other. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd1 <- parallelize(sc, list(1, 1, 2, 2, 3, 4)) -#' rdd2 <- parallelize(sc, list(2, 4)) -#' collect(subtract(rdd1, rdd2)) -#' # list(1, 1, 3) -#'} -#' @rdname subtract -#' @aliases subtract,RDD +# Subtract an RDD with another RDD. +# +# Return an RDD with the elements from this that are not in other. +# +# @param x An RDD. +# @param other An RDD. +# @param numPartitions Number of the partitions in the result RDD. +# @return An RDD with the elements from this that are not in other. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd1 <- parallelize(sc, list(1, 1, 2, 2, 3, 4)) +# rdd2 <- parallelize(sc, list(2, 4)) +# collect(subtract(rdd1, rdd2)) +# # list(1, 1, 3) +#} +# @rdname subtract +# @aliases subtract,RDD setMethod("subtract", signature(x = "RDD", other = "RDD"), function(x, other, numPartitions = SparkR:::numPartitions(x)) { @@ -1559,28 +1559,28 @@ setMethod("subtract", keys(subtractByKey(rdd1, rdd2, numPartitions)) }) -#' Intersection of this RDD and another one. -#' -#' Return the intersection of this RDD and another one. -#' The output will not contain any duplicate elements, -#' even if the input RDDs did. Performs a hash partition -#' across the cluster. -#' Note that this method performs a shuffle internally. -#' -#' @param x An RDD. -#' @param other An RDD. -#' @param numPartitions The number of partitions in the result RDD. -#' @return An RDD which is the intersection of these two RDDs. 
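A detail of subtract() that is easy to miss: duplicates on the left survive unless matched in `other`, whereas intersection() deduplicates its output. A small sketch assuming the usual session:

sc <- sparkR.init()
rdd1 <- parallelize(sc, list(1, 1, 2, 2, 3, 4))
rdd2 <- parallelize(sc, list(2, 4))
collect(subtract(rdd1, rdd2))      # list(1, 1, 3): both copies of 1 remain
collect(intersection(rdd1, rdd2))  # 2 and 4, duplicates removed, order not guaranteed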
-#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5)) -#' rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8)) -#' collect(sortBy(intersection(rdd1, rdd2), function(x) { x })) -#' # list(1, 2, 3) -#'} -#' @rdname intersection -#' @aliases intersection,RDD +# Intersection of this RDD and another one. +# +# Return the intersection of this RDD and another one. +# The output will not contain any duplicate elements, +# even if the input RDDs did. Performs a hash partition +# across the cluster. +# Note that this method performs a shuffle internally. +# +# @param x An RDD. +# @param other An RDD. +# @param numPartitions The number of partitions in the result RDD. +# @return An RDD which is the intersection of these two RDDs. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5)) +# rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8)) +# collect(sortBy(intersection(rdd1, rdd2), function(x) { x })) +# # list(1, 2, 3) +#} +# @rdname intersection +# @aliases intersection,RDD setMethod("intersection", signature(x = "RDD", other = "RDD"), function(x, other, numPartitions = SparkR:::numPartitions(x)) { @@ -1596,26 +1596,26 @@ setMethod("intersection", keys(filterRDD(cogroup(rdd1, rdd2, numPartitions = numPartitions), filterFunction)) }) -#' Zips an RDD's partitions with one (or more) RDD(s). -#' Same as zipPartitions in Spark. -#' -#' @param ... RDDs to be zipped. -#' @param func A function to transform zipped partitions. -#' @return A new RDD by applying a function to the zipped partitions. -#' Assumes that all the RDDs have the *same number of partitions*, but -#' does *not* require them to have the same number of elements in each partition. -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2 -#' rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4 -#' rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6 -#' collect(zipPartitions(rdd1, rdd2, rdd3, -#' func = function(x, y, z) { list(list(x, y, z))} )) -#' # list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6))) -#'} -#' @rdname zipRDD -#' @aliases zipPartitions,RDD +# Zips an RDD's partitions with one (or more) RDD(s). +# Same as zipPartitions in Spark. +# +# @param ... RDDs to be zipped. +# @param func A function to transform zipped partitions. +# @return A new RDD by applying a function to the zipped partitions. +# Assumes that all the RDDs have the *same number of partitions*, but +# does *not* require them to have the same number of elements in each partition. +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2 +# rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4 +# rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6 +# collect(zipPartitions(rdd1, rdd2, rdd3, +# func = function(x, y, z) { list(list(x, y, z))} )) +# # list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6))) +#} +# @rdname zipRDD +# @aliases zipPartitions,RDD setMethod("zipPartitions", "RDD", function(..., func) { diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 4f05ba524a01a..cae06e6af2bff 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -150,21 +150,21 @@ createDataFrame <- function(sqlCtx, data, schema = NULL, samplingRatio = 1.0) { dataFrame(sdf) } -#' toDF -#' -#' Converts an RDD to a DataFrame by infer the types. 
-#'
-#' @param x An RDD
-#'
-#' @rdname DataFrame
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
-#' df <- toDF(rdd)
-#' }
+# toDF
+#
+# Converts an RDD to a DataFrame by inferring the types.
+#
+# @param x An RDD
+#
+# @rdname DataFrame
+# @export
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# sqlCtx <- sparkRSQL.init(sc)
+# rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
+# df <- toDF(rdd)
+# }
 setGeneric("toDF", function(x, ...) { standardGeneric("toDF") })
@@ -207,23 +207,23 @@ jsonFile <- function(sqlCtx, path) {
 }

-#' JSON RDD
-#'
-#' Loads an RDD storing one JSON object per string as a DataFrame.
-#'
-#' @param sqlCtx SQLContext to use
-#' @param rdd An RDD of JSON string
-#' @param schema A StructType object to use as schema
-#' @param samplingRatio The ratio of simpling used to infer the schema
-#' @return A DataFrame
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' rdd <- texFile(sc, "path/to/json")
-#' df <- jsonRDD(sqlCtx, rdd)
-#' }
+# JSON RDD
+#
+# Loads an RDD storing one JSON object per string as a DataFrame.
+#
+# @param sqlCtx SQLContext to use
+# @param rdd An RDD of JSON strings
+# @param schema A StructType object to use as schema
+# @param samplingRatio The sampling ratio used to infer the schema
+# @return A DataFrame
+# @export
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# sqlCtx <- sparkRSQL.init(sc)
+# rdd <- textFile(sc, "path/to/json")
+# df <- jsonRDD(sqlCtx, rdd)
+# }
 # TODO: support schema
 jsonRDD <- function(sqlCtx, rdd, schema = NULL, samplingRatio = 1.0) {
diff --git a/R/pkg/R/broadcast.R b/R/pkg/R/broadcast.R
index 583fa2e7fdcfd..23dc38780716e 100644
--- a/R/pkg/R/broadcast.R
+++ b/R/pkg/R/broadcast.R
@@ -23,21 +23,21 @@
 .broadcastValues <- new.env()
 .broadcastIdToName <- new.env()

-#' @title S4 class that represents a Broadcast variable
-#' @description Broadcast variables can be created using the broadcast
-#' function from a \code{SparkContext}.
-#' @rdname broadcast-class
-#' @seealso broadcast
-#'
-#' @param id Id of the backing Spark broadcast variable
-#' @export
+# @title S4 class that represents a Broadcast variable
+# @description Broadcast variables can be created using the broadcast
+# function from a \code{SparkContext}.
+# @rdname broadcast-class
+# @seealso broadcast
+#
+# @param id Id of the backing Spark broadcast variable
+# @export
 setClass("Broadcast", slots = list(id = "character"))

-#' @rdname broadcast-class
-#' @param value Value of the broadcast variable
-#' @param jBroadcastRef reference to the backing Java broadcast object
-#' @param objName name of broadcasted object
-#' @export
+# @rdname broadcast-class
+# @param value Value of the broadcast variable
+# @param jBroadcastRef reference to the backing Java broadcast object
+# @param objName name of broadcasted object
+# @export
 Broadcast <- function(id, value, jBroadcastRef, objName) {
   .broadcastValues[[id]] <- value
   .broadcastNames[[as.character(objName)]] <- jBroadcastRef
@@ -45,13 +45,13 @@ Broadcast <- function(id, value, jBroadcastRef, objName) {
   new("Broadcast", id = id)
 }

-#' @description
-#' \code{value} can be used to get the value of a broadcast variable inside
-#' a distributed function.
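Since jsonRDD() accepts any RDD of JSON strings, the input need not come from a file at all; parallelize() works just as well for a quick test. A minimal sketch, assuming a SQLContext initialized as in the examples:

sc <- sparkR.init()
sqlCtx <- sparkRSQL.init(sc)
jsons <- parallelize(sc, list('{"name":"a","age":1}', '{"name":"b","age":2}'))
df <- jsonRDD(sqlCtx, jsons)  # schema inferred by sampling the strings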
-#' -#' @param bcast The broadcast variable to get -#' @rdname broadcast -#' @aliases value,Broadcast-method +# @description +# \code{value} can be used to get the value of a broadcast variable inside +# a distributed function. +# +# @param bcast The broadcast variable to get +# @rdname broadcast +# @aliases value,Broadcast-method setMethod("value", signature(bcast = "Broadcast"), function(bcast) { @@ -62,24 +62,24 @@ setMethod("value", } }) -#' Internal function to set values of a broadcast variable. -#' -#' This function is used internally by Spark to set the value of a broadcast -#' variable on workers. Not intended for use outside the package. -#' -#' @rdname broadcast-internal -#' @seealso broadcast, value +# Internal function to set values of a broadcast variable. +# +# This function is used internally by Spark to set the value of a broadcast +# variable on workers. Not intended for use outside the package. +# +# @rdname broadcast-internal +# @seealso broadcast, value -#' @param bcastId The id of broadcast variable to set -#' @param value The value to be set -#' @export +# @param bcastId The id of broadcast variable to set +# @param value The value to be set +# @export setBroadcastValue <- function(bcastId, value) { bcastIdStr <- as.character(bcastId) .broadcastValues[[bcastIdStr]] <- value } -#' Helper function to clear the list of broadcast variables we know about -#' Should be called when the SparkR JVM backend is shutdown +# Helper function to clear the list of broadcast variables we know about +# Should be called when the SparkR JVM backend is shutdown clearBroadcastVariables <- function() { bcasts <- ls(.broadcastNames) rm(list = bcasts, envir = .broadcastNames) diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index b4845b6948997..43be9c904fdf6 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -25,27 +25,27 @@ getMinPartitions <- function(sc, minPartitions) { as.integer(minPartitions) } -#' Create an RDD from a text file. -#' -#' This function reads a text file from HDFS, a local file system (available on all -#' nodes), or any Hadoop-supported file system URI, and creates an -#' RDD of strings from it. -#' -#' @param sc SparkContext to use -#' @param path Path of file to read. A vector of multiple paths is allowed. -#' @param minPartitions Minimum number of partitions to be created. If NULL, the default -#' value is chosen based on available parallelism. -#' @return RDD where each item is of type \code{character} -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' lines <- textFile(sc, "myfile.txt") -#'} +# Create an RDD from a text file. +# +# This function reads a text file from HDFS, a local file system (available on all +# nodes), or any Hadoop-supported file system URI, and creates an +# RDD of strings from it. +# +# @param sc SparkContext to use +# @param path Path of file to read. A vector of multiple paths is allowed. +# @param minPartitions Minimum number of partitions to be created. If NULL, the default +# value is chosen based on available parallelism. 
+# @return RDD where each item is of type \code{character} +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# lines <- textFile(sc, "myfile.txt") +#} textFile <- function(sc, path, minPartitions = NULL) { # Allow the user to have a more flexible definiton of the text file path path <- suppressWarnings(normalizePath(path)) - #' Convert a string vector of paths to a string containing comma separated paths + # Convert a string vector of paths to a string containing comma separated paths path <- paste(path, collapse = ",") jrdd <- callJMethod(sc, "textFile", path, getMinPartitions(sc, minPartitions)) @@ -53,27 +53,27 @@ textFile <- function(sc, path, minPartitions = NULL) { RDD(jrdd, "string") } -#' Load an RDD saved as a SequenceFile containing serialized objects. -#' -#' The file to be loaded should be one that was previously generated by calling -#' saveAsObjectFile() of the RDD class. -#' -#' @param sc SparkContext to use -#' @param path Path of file to read. A vector of multiple paths is allowed. -#' @param minPartitions Minimum number of partitions to be created. If NULL, the default -#' value is chosen based on available parallelism. -#' @return RDD containing serialized R objects. -#' @seealso saveAsObjectFile -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- objectFile(sc, "myfile") -#'} +# Load an RDD saved as a SequenceFile containing serialized objects. +# +# The file to be loaded should be one that was previously generated by calling +# saveAsObjectFile() of the RDD class. +# +# @param sc SparkContext to use +# @param path Path of file to read. A vector of multiple paths is allowed. +# @param minPartitions Minimum number of partitions to be created. If NULL, the default +# value is chosen based on available parallelism. +# @return RDD containing serialized R objects. +# @seealso saveAsObjectFile +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- objectFile(sc, "myfile") +#} objectFile <- function(sc, path, minPartitions = NULL) { # Allow the user to have a more flexible definiton of the text file path path <- suppressWarnings(normalizePath(path)) - #' Convert a string vector of paths to a string containing comma separated paths + # Convert a string vector of paths to a string containing comma separated paths path <- paste(path, collapse = ",") jrdd <- callJMethod(sc, "objectFile", path, getMinPartitions(sc, minPartitions)) @@ -81,24 +81,24 @@ objectFile <- function(sc, path, minPartitions = NULL) { RDD(jrdd, "byte") } -#' Create an RDD from a homogeneous list or vector. -#' -#' This function creates an RDD from a local homogeneous list in R. The elements -#' in the list are split into \code{numSlices} slices and distributed to nodes -#' in the cluster. -#' -#' @param sc SparkContext to use -#' @param coll collection to parallelize -#' @param numSlices number of partitions to create in the RDD -#' @return an RDD created from this collection -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:10, 2) -#' # The RDD should contain 10 elements -#' length(rdd) -#'} +# Create an RDD from a homogeneous list or vector. +# +# This function creates an RDD from a local homogeneous list in R. The elements +# in the list are split into \code{numSlices} slices and distributed to nodes +# in the cluster. 
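objectFile() pairs with saveAsObjectFile() for a serialize/deserialize roundtrip of arbitrary R objects; a minimal sketch (the /tmp path is illustrative):

sc <- sparkR.init()
rdd <- parallelize(sc, list(1, 2, 3))
saveAsObjectFile(rdd, "/tmp/sparkR-objects")
restored <- objectFile(sc, "/tmp/sparkR-objects")
collect(restored)  # list(1, 2, 3)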
+# +# @param sc SparkContext to use +# @param coll collection to parallelize +# @param numSlices number of partitions to create in the RDD +# @return an RDD created from this collection +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:10, 2) +# # The RDD should contain 10 elements +# length(rdd) +#} parallelize <- function(sc, coll, numSlices = 1) { # TODO: bound/safeguard numSlices # TODO: unit tests for if the split works for all primitives @@ -133,33 +133,33 @@ parallelize <- function(sc, coll, numSlices = 1) { RDD(jrdd, "byte") } -#' Include this specified package on all workers -#' -#' This function can be used to include a package on all workers before the -#' user's code is executed. This is useful in scenarios where other R package -#' functions are used in a function passed to functions like \code{lapply}. -#' NOTE: The package is assumed to be installed on every node in the Spark -#' cluster. -#' -#' @param sc SparkContext to use -#' @param pkg Package name -#' -#' @export -#' @examples -#'\dontrun{ -#' library(Matrix) -#' -#' sc <- sparkR.init() -#' # Include the matrix library we will be using -#' includePackage(sc, Matrix) -#' -#' generateSparse <- function(x) { -#' sparseMatrix(i=c(1, 2, 3), j=c(1, 2, 3), x=c(1, 2, 3)) -#' } -#' -#' rdd <- lapplyPartition(parallelize(sc, 1:2, 2L), generateSparse) -#' collect(rdd) -#'} +# Include this specified package on all workers +# +# This function can be used to include a package on all workers before the +# user's code is executed. This is useful in scenarios where other R package +# functions are used in a function passed to functions like \code{lapply}. +# NOTE: The package is assumed to be installed on every node in the Spark +# cluster. +# +# @param sc SparkContext to use +# @param pkg Package name +# +# @export +# @examples +#\dontrun{ +# library(Matrix) +# +# sc <- sparkR.init() +# # Include the matrix library we will be using +# includePackage(sc, Matrix) +# +# generateSparse <- function(x) { +# sparseMatrix(i=c(1, 2, 3), j=c(1, 2, 3), x=c(1, 2, 3)) +# } +# +# rdd <- lapplyPartition(parallelize(sc, 1:2, 2L), generateSparse) +# collect(rdd) +#} includePackage <- function(sc, pkg) { pkg <- as.character(substitute(pkg)) if (exists(".packages", .sparkREnv)) { @@ -171,30 +171,30 @@ includePackage <- function(sc, pkg) { .sparkREnv$.packages <- packages } -#' @title Broadcast a variable to all workers -#' -#' @description -#' Broadcast a read-only variable to the cluster, returning a \code{Broadcast} -#' object for reading it in distributed functions. -#' -#' @param sc Spark Context to use -#' @param object Object to be broadcast -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' rdd <- parallelize(sc, 1:2, 2L) -#' -#' # Large Matrix object that we want to broadcast -#' randomMat <- matrix(nrow=100, ncol=10, data=rnorm(1000)) -#' randomMatBr <- broadcast(sc, randomMat) -#' -#' # Use the broadcast variable inside the function -#' useBroadcast <- function(x) { -#' sum(value(randomMatBr) * x) -#' } -#' sumRDD <- lapply(rdd, useBroadcast) -#'} +# @title Broadcast a variable to all workers +# +# @description +# Broadcast a read-only variable to the cluster, returning a \code{Broadcast} +# object for reading it in distributed functions. 
+# +# @param sc Spark Context to use +# @param object Object to be broadcast +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# rdd <- parallelize(sc, 1:2, 2L) +# +# # Large Matrix object that we want to broadcast +# randomMat <- matrix(nrow=100, ncol=10, data=rnorm(1000)) +# randomMatBr <- broadcast(sc, randomMat) +# +# # Use the broadcast variable inside the function +# useBroadcast <- function(x) { +# sum(value(randomMatBr) * x) +# } +# sumRDD <- lapply(rdd, useBroadcast) +#} broadcast <- function(sc, object) { objName <- as.character(substitute(object)) serializedObj <- serialize(object, connection = NULL) @@ -205,21 +205,21 @@ broadcast <- function(sc, object) { Broadcast(id, object, jBroadcast, objName) } -#' @title Set the checkpoint directory -#' -#' Set the directory under which RDDs are going to be checkpointed. The -#' directory must be a HDFS path if running on a cluster. -#' -#' @param sc Spark Context to use -#' @param dirName Directory path -#' @export -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' setCheckpointDir(sc, "~/checkpoint") -#' rdd <- parallelize(sc, 1:2, 2L) -#' checkpoint(rdd) -#'} +# @title Set the checkpoint directory +# +# Set the directory under which RDDs are going to be checkpointed. The +# directory must be a HDFS path if running on a cluster. +# +# @param sc Spark Context to use +# @param dirName Directory path +# @export +# @examples +#\dontrun{ +# sc <- sparkR.init() +# setCheckpointDir(sc, "~/checkpoint") +# rdd <- parallelize(sc, 1:2, 2L) +# checkpoint(rdd) +#} setCheckpointDir <- function(sc, dirName) { invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName)))) } diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5838955f74dad..380e8ebe8c8f4 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -17,353 +17,353 @@ ############ RDD Actions and Transformations ############ -#' @rdname aggregateRDD -#' @seealso reduce -#' @export +# @rdname aggregateRDD +# @seealso reduce +# @export setGeneric("aggregateRDD", function(x, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") }) -#' @rdname cache-methods -#' @export +# @rdname cache-methods +# @export setGeneric("cache", function(x) { standardGeneric("cache") }) -#' @rdname coalesce -#' @seealso repartition -#' @export +# @rdname coalesce +# @seealso repartition +# @export setGeneric("coalesce", function(x, numPartitions, ...) { standardGeneric("coalesce") }) -#' @rdname checkpoint-methods -#' @export +# @rdname checkpoint-methods +# @export setGeneric("checkpoint", function(x) { standardGeneric("checkpoint") }) -#' @rdname collect-methods -#' @export +# @rdname collect-methods +# @export setGeneric("collect", function(x, ...) 
{ standardGeneric("collect") }) -#' @rdname collect-methods -#' @export +# @rdname collect-methods +# @export setGeneric("collectAsMap", function(x) { standardGeneric("collectAsMap") }) -#' @rdname collect-methods -#' @export +# @rdname collect-methods +# @export setGeneric("collectPartition", function(x, partitionId) { standardGeneric("collectPartition") }) -#' @rdname count -#' @export +# @rdname count +# @export setGeneric("count", function(x) { standardGeneric("count") }) -#' @rdname countByValue -#' @export +# @rdname countByValue +# @export setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) -#' @rdname distinct -#' @export +# @rdname distinct +# @export setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) -#' @rdname filterRDD -#' @export +# @rdname filterRDD +# @export setGeneric("filterRDD", function(x, f) { standardGeneric("filterRDD") }) -#' @rdname first -#' @export +# @rdname first +# @export setGeneric("first", function(x) { standardGeneric("first") }) -#' @rdname flatMap -#' @export +# @rdname flatMap +# @export setGeneric("flatMap", function(X, FUN) { standardGeneric("flatMap") }) -#' @rdname fold -#' @seealso reduce -#' @export +# @rdname fold +# @seealso reduce +# @export setGeneric("fold", function(x, zeroValue, op) { standardGeneric("fold") }) -#' @rdname foreach -#' @export +# @rdname foreach +# @export setGeneric("foreach", function(x, func) { standardGeneric("foreach") }) -#' @rdname foreach -#' @export +# @rdname foreach +# @export setGeneric("foreachPartition", function(x, func) { standardGeneric("foreachPartition") }) # The jrdd accessor function. setGeneric("getJRDD", function(rdd, ...) { standardGeneric("getJRDD") }) -#' @rdname glom -#' @export +# @rdname glom +# @export setGeneric("glom", function(x) { standardGeneric("glom") }) -#' @rdname keyBy -#' @export +# @rdname keyBy +# @export setGeneric("keyBy", function(x, func) { standardGeneric("keyBy") }) -#' @rdname lapplyPartition -#' @export +# @rdname lapplyPartition +# @export setGeneric("lapplyPartition", function(X, FUN) { standardGeneric("lapplyPartition") }) -#' @rdname lapplyPartitionsWithIndex -#' @export +# @rdname lapplyPartitionsWithIndex +# @export setGeneric("lapplyPartitionsWithIndex", function(X, FUN) { standardGeneric("lapplyPartitionsWithIndex") }) -#' @rdname lapply -#' @export +# @rdname lapply +# @export setGeneric("map", function(X, FUN) { standardGeneric("map") }) -#' @rdname lapplyPartition -#' @export +# @rdname lapplyPartition +# @export setGeneric("mapPartitions", function(X, FUN) { standardGeneric("mapPartitions") }) -#' @rdname lapplyPartitionsWithIndex -#' @export +# @rdname lapplyPartitionsWithIndex +# @export setGeneric("mapPartitionsWithIndex", function(X, FUN) { standardGeneric("mapPartitionsWithIndex") }) -#' @rdname maximum -#' @export +# @rdname maximum +# @export setGeneric("maximum", function(x) { standardGeneric("maximum") }) -#' @rdname minimum -#' @export +# @rdname minimum +# @export setGeneric("minimum", function(x) { standardGeneric("minimum") }) -#' @rdname sumRDD -#' @export +# @rdname sumRDD +# @export setGeneric("sumRDD", function(x) { standardGeneric("sumRDD") }) -#' @rdname name -#' @export +# @rdname name +# @export setGeneric("name", function(x) { standardGeneric("name") }) -#' @rdname numPartitions -#' @export +# @rdname numPartitions +# @export setGeneric("numPartitions", function(x) { standardGeneric("numPartitions") }) -#' @rdname persist -#' @export +# @rdname persist +# @export 
setGeneric("persist", function(x, newLevel) { standardGeneric("persist") }) -#' @rdname pipeRDD -#' @export +# @rdname pipeRDD +# @export setGeneric("pipeRDD", function(x, command, env = list()) { standardGeneric("pipeRDD")}) -#' @rdname reduce -#' @export +# @rdname reduce +# @export setGeneric("reduce", function(x, func) { standardGeneric("reduce") }) -#' @rdname repartition -#' @seealso coalesce -#' @export +# @rdname repartition +# @seealso coalesce +# @export setGeneric("repartition", function(x, numPartitions) { standardGeneric("repartition") }) -#' @rdname sampleRDD -#' @export +# @rdname sampleRDD +# @export setGeneric("sampleRDD", function(x, withReplacement, fraction, seed) { standardGeneric("sampleRDD") }) -#' @rdname saveAsObjectFile -#' @seealso objectFile -#' @export +# @rdname saveAsObjectFile +# @seealso objectFile +# @export setGeneric("saveAsObjectFile", function(x, path) { standardGeneric("saveAsObjectFile") }) -#' @rdname saveAsTextFile -#' @export +# @rdname saveAsTextFile +# @export setGeneric("saveAsTextFile", function(x, path) { standardGeneric("saveAsTextFile") }) -#' @rdname setName -#' @export +# @rdname setName +# @export setGeneric("setName", function(x, name) { standardGeneric("setName") }) -#' @rdname sortBy -#' @export +# @rdname sortBy +# @export setGeneric("sortBy", function(x, func, ascending = TRUE, numPartitions = 1) { standardGeneric("sortBy") }) -#' @rdname take -#' @export +# @rdname take +# @export setGeneric("take", function(x, num) { standardGeneric("take") }) -#' @rdname takeOrdered -#' @export +# @rdname takeOrdered +# @export setGeneric("takeOrdered", function(x, num) { standardGeneric("takeOrdered") }) -#' @rdname takeSample -#' @export +# @rdname takeSample +# @export setGeneric("takeSample", function(x, withReplacement, num, seed) { standardGeneric("takeSample") }) -#' @rdname top -#' @export +# @rdname top +# @export setGeneric("top", function(x, num) { standardGeneric("top") }) -#' @rdname unionRDD -#' @export +# @rdname unionRDD +# @export setGeneric("unionRDD", function(x, y) { standardGeneric("unionRDD") }) -#' @rdname unpersist-methods -#' @export +# @rdname unpersist-methods +# @export setGeneric("unpersist", function(x, ...) 
{ standardGeneric("unpersist") }) -#' @rdname zipRDD -#' @export +# @rdname zipRDD +# @export setGeneric("zipRDD", function(x, other) { standardGeneric("zipRDD") }) -#' @rdname zipRDD -#' @export +# @rdname zipRDD +# @export setGeneric("zipPartitions", function(..., func) { standardGeneric("zipPartitions") }, signature = "...") -#' @rdname zipWithIndex -#' @seealso zipWithUniqueId -#' @export +# @rdname zipWithIndex +# @seealso zipWithUniqueId +# @export setGeneric("zipWithIndex", function(x) { standardGeneric("zipWithIndex") }) -#' @rdname zipWithUniqueId -#' @seealso zipWithIndex -#' @export +# @rdname zipWithUniqueId +# @seealso zipWithIndex +# @export setGeneric("zipWithUniqueId", function(x) { standardGeneric("zipWithUniqueId") }) ############ Binary Functions ############# -#' @rdname cartesian -#' @export +# @rdname cartesian +# @export setGeneric("cartesian", function(x, other) { standardGeneric("cartesian") }) -#' @rdname countByKey -#' @export +# @rdname countByKey +# @export setGeneric("countByKey", function(x) { standardGeneric("countByKey") }) -#' @rdname flatMapValues -#' @export +# @rdname flatMapValues +# @export setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues") }) -#' @rdname intersection -#' @export +# @rdname intersection +# @export setGeneric("intersection", function(x, other, numPartitions = 1) { standardGeneric("intersection") }) -#' @rdname keys -#' @export +# @rdname keys +# @export setGeneric("keys", function(x) { standardGeneric("keys") }) -#' @rdname lookup -#' @export +# @rdname lookup +# @export setGeneric("lookup", function(x, key) { standardGeneric("lookup") }) -#' @rdname mapValues -#' @export +# @rdname mapValues +# @export setGeneric("mapValues", function(X, FUN) { standardGeneric("mapValues") }) -#' @rdname sampleByKey -#' @export +# @rdname sampleByKey +# @export setGeneric("sampleByKey", function(x, withReplacement, fractions, seed) { standardGeneric("sampleByKey") }) -#' @rdname values -#' @export +# @rdname values +# @export setGeneric("values", function(x) { standardGeneric("values") }) ############ Shuffle Functions ############ -#' @rdname aggregateByKey -#' @seealso foldByKey, combineByKey -#' @export +# @rdname aggregateByKey +# @seealso foldByKey, combineByKey +# @export setGeneric("aggregateByKey", function(x, zeroValue, seqOp, combOp, numPartitions) { standardGeneric("aggregateByKey") }) -#' @rdname cogroup -#' @export +# @rdname cogroup +# @export setGeneric("cogroup", function(..., numPartitions) { standardGeneric("cogroup") }, signature = "...") -#' @rdname combineByKey -#' @seealso groupByKey, reduceByKey -#' @export +# @rdname combineByKey +# @seealso groupByKey, reduceByKey +# @export setGeneric("combineByKey", function(x, createCombiner, mergeValue, mergeCombiners, numPartitions) { standardGeneric("combineByKey") }) -#' @rdname foldByKey -#' @seealso aggregateByKey, combineByKey -#' @export +# @rdname foldByKey +# @seealso aggregateByKey, combineByKey +# @export setGeneric("foldByKey", function(x, zeroValue, func, numPartitions) { standardGeneric("foldByKey") }) -#' @rdname join-methods -#' @export +# @rdname join-methods +# @export setGeneric("fullOuterJoin", function(x, y, numPartitions) { standardGeneric("fullOuterJoin") }) -#' @rdname groupByKey -#' @seealso reduceByKey -#' @export +# @rdname groupByKey +# @seealso reduceByKey +# @export setGeneric("groupByKey", function(x, numPartitions) { standardGeneric("groupByKey") }) -#' @rdname join-methods -#' @export +# @rdname join-methods +# @export 
setGeneric("join", function(x, y, ...) { standardGeneric("join") }) -#' @rdname join-methods -#' @export +# @rdname join-methods +# @export setGeneric("leftOuterJoin", function(x, y, numPartitions) { standardGeneric("leftOuterJoin") }) -#' @rdname partitionBy -#' @export +# @rdname partitionBy +# @export setGeneric("partitionBy", function(x, numPartitions, ...) { standardGeneric("partitionBy") }) -#' @rdname reduceByKey -#' @seealso groupByKey -#' @export +# @rdname reduceByKey +# @seealso groupByKey +# @export setGeneric("reduceByKey", function(x, combineFunc, numPartitions) { standardGeneric("reduceByKey")}) -#' @rdname reduceByKeyLocally -#' @seealso reduceByKey -#' @export +# @rdname reduceByKeyLocally +# @seealso reduceByKey +# @export setGeneric("reduceByKeyLocally", function(x, combineFunc) { standardGeneric("reduceByKeyLocally") }) -#' @rdname join-methods -#' @export +# @rdname join-methods +# @export setGeneric("rightOuterJoin", function(x, y, numPartitions) { standardGeneric("rightOuterJoin") }) -#' @rdname sortByKey -#' @export +# @rdname sortByKey +# @export setGeneric("sortByKey", function(x, ascending = TRUE, numPartitions = 1) { standardGeneric("sortByKey") }) -#' @rdname subtract -#' @export +# @rdname subtract +# @export setGeneric("subtract", function(x, other, numPartitions = 1) { standardGeneric("subtract") }) -#' @rdname subtractByKey -#' @export +# @rdname subtractByKey +# @export setGeneric("subtractByKey", function(x, other, numPartitions = 1) { standardGeneric("subtractByKey") @@ -372,8 +372,8 @@ setGeneric("subtractByKey", ################### Broadcast Variable Methods ################# -#' @rdname broadcast -#' @export +# @rdname broadcast +# @export setGeneric("value", function(bcast) { standardGeneric("value") }) @@ -477,8 +477,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) #' @export setGeneric("sortDF", function(x, col, ...) { standardGeneric("sortDF") }) -#' @rdname tojson -#' @export +# @rdname tojson +# @export setGeneric("toJSON", function(x) { standardGeneric("toJSON") }) #' @rdname DataFrame diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index edeb8d9f75f7b..7694652856da5 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -21,23 +21,23 @@ NULL ############ Actions and Transformations ############ -#' Look up elements of a key in an RDD -#' -#' @description -#' \code{lookup} returns a list of values in this RDD for key key. -#' -#' @param x The RDD to collect -#' @param key The key to look up for -#' @return a list of values in this RDD for key key -#' @examples -#'\dontrun{ -#' sc <- sparkR.init() -#' pairs <- list(c(1, 1), c(2, 2), c(1, 3)) -#' rdd <- parallelize(sc, pairs) -#' lookup(rdd, 1) # list(1, 3) -#'} -#' @rdname lookup -#' @aliases lookup,RDD-method +# Look up elements of a key in an RDD +# +# @description +# \code{lookup} returns a list of values in this RDD for key key. +# +# @param x The RDD to collect +# @param key The key to look up for +# @return a list of values in this RDD for key key +# @examples +#\dontrun{ +# sc <- sparkR.init() +# pairs <- list(c(1, 1), c(2, 2), c(1, 3)) +# rdd <- parallelize(sc, pairs) +# lookup(rdd, 1) # list(1, 3) +#} +# @rdname lookup +# @aliases lookup,RDD-method setMethod("lookup", signature(x = "RDD", key = "ANY"), function(x, key) { @@ -49,21 +49,21 @@ setMethod("lookup", collect(valsRDD) }) -#' Count the number of elements for each key, and return the result to the -#' master as lists of (key, count) pairs. -#' -#' Same as countByKey in Spark. 
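lookup() and countByKey() answer complementary questions about a pair RDD: the values behind one key versus the cardinality of every key. A combined sketch assuming the usual session:

sc <- sparkR.init()
rdd <- parallelize(sc, list(list("a", 1), list("b", 1), list("a", 3)))
lookup(rdd, "a")  # list(1, 3): all values stored under key "a"
countByKey(rdd)   # ("a", 2L), ("b", 1L), returned to the driver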
-#'
-#' @param x The RDD to count keys.
-#' @return list of (key, count) pairs, where count is number of each key in rdd.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
-#' countByKey(rdd) # ("a", 2L), ("b", 1L)
-#'}
-#' @rdname countByKey
-#' @aliases countByKey,RDD-method
+# Count the number of elements for each key, and return the result to the
+# master as lists of (key, count) pairs.
+#
+# Same as countByKey in Spark.
+#
+# @param x The RDD to count keys.
+# @return A list of (key, count) pairs, where count is the number of occurrences of each key in the RDD.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(c("a", 1), c("b", 1), c("a", 1)))
+# countByKey(rdd) # ("a", 2L), ("b", 1L)
+#}
+# @rdname countByKey
+# @aliases countByKey,RDD-method
 setMethod("countByKey",
           signature(x = "RDD"),
           function(x) {
@@ -71,17 +71,17 @@ setMethod("countByKey",
             countByValue(keys)
           })

-#' Return an RDD with the keys of each tuple.
-#'
-#' @param x The RDD from which the keys of each tuple is returned.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
-#' collect(keys(rdd)) # list(1, 3)
-#'}
-#' @rdname keys
-#' @aliases keys,RDD
+# Return an RDD with the keys of each tuple.
+#
+# @param x The RDD from which the keys of each tuple is returned.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
+# collect(keys(rdd)) # list(1, 3)
+#}
+# @rdname keys
+# @aliases keys,RDD
 setMethod("keys",
           signature(x = "RDD"),
           function(x) {
@@ -91,17 +91,17 @@ setMethod("keys",
             lapply(x, func)
           })

-#' Return an RDD with the values of each tuple.
-#'
-#' @param x The RDD from which the values of each tuple is returned.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
-#' collect(values(rdd)) # list(2, 4)
-#'}
-#' @rdname values
-#' @aliases values,RDD
+# Return an RDD with the values of each tuple.
+#
+# @param x The RDD from which the values of each tuple is returned.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
+# collect(values(rdd)) # list(2, 4)
+#}
+# @rdname values
+# @aliases values,RDD
 setMethod("values",
           signature(x = "RDD"),
           function(x) {
@@ -111,23 +111,23 @@ setMethod("values",
             lapply(x, func)
           })

-#' Applies a function to all values of the elements, without modifying the keys.
-#'
-#' The same as `mapValues()' in Spark.
-#'
-#' @param X The RDD to apply the transformation.
-#' @param FUN the transformation to apply on the value of each element.
-#' @return a new RDD created by the transformation.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:10)
-#' makePairs <- lapply(rdd, function(x) { list(x, x) })
-#' collect(mapValues(makePairs, function(x) { x * 2) })
-#' Output: list(list(1,2), list(2,4), list(3,6), ...)
-#'}
-#' @rdname mapValues
-#' @aliases mapValues,RDD,function-method
+# Applies a function to all values of the elements, without modifying the keys.
+#
+# The same as 'mapValues()' in Spark.
+#
+# @param X The RDD to apply the transformation.
+# @param FUN the transformation to apply on the value of each element.
+# @return a new RDD created by the transformation.
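mapValues() rewrites only the value slot of each pair and leaves keys (and partitioning) untouched; a minimal runnable sketch assuming the usual session:

sc <- sparkR.init()
rdd <- parallelize(sc, 1:3)
pairs <- lapply(rdd, function(x) { list(x, x) })
collect(mapValues(pairs, function(v) { v * 10 }))
# list(list(1, 10), list(2, 20), list(3, 30)): keys unchanged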
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, 1:10)
+# makePairs <- lapply(rdd, function(x) { list(x, x) })
+# collect(mapValues(makePairs, function(x) { x * 2 }))
+# Output: list(list(1,2), list(2,4), list(3,6), ...)
+#}
+# @rdname mapValues
+# @aliases mapValues,RDD,function-method
 setMethod("mapValues",
           signature(X = "RDD", FUN = "function"),
           function(X, FUN) {
@@ -137,23 +137,23 @@ setMethod("mapValues",
             lapply(X, func)
           })

-#' Pass each value in the key-value pair RDD through a flatMap function without
-#' changing the keys; this also retains the original RDD's partitioning.
-#'
-#' The same as 'flatMapValues()' in Spark.
-#'
-#' @param X The RDD to apply the transformation.
-#' @param FUN the transformation to apply on the value of each element.
-#' @return a new RDD created by the transformation.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(list(1, c(1,2)), list(2, c(3,4))))
-#' collect(flatMapValues(rdd, function(x) { x }))
-#' Output: list(list(1,1), list(1,2), list(2,3), list(2,4))
-#'}
-#' @rdname flatMapValues
-#' @aliases flatMapValues,RDD,function-method
+# Pass each value in the key-value pair RDD through a flatMap function without
+# changing the keys; this also retains the original RDD's partitioning.
+#
+# The same as 'flatMapValues()' in Spark.
+#
+# @param X The RDD to apply the transformation.
+# @param FUN the transformation to apply on the value of each element.
+# @return a new RDD created by the transformation.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(list(1, c(1,2)), list(2, c(3,4))))
+# collect(flatMapValues(rdd, function(x) { x }))
+# Output: list(list(1,1), list(1,2), list(2,3), list(2,4))
+#}
+# @rdname flatMapValues
+# @aliases flatMapValues,RDD,function-method
 setMethod("flatMapValues",
           signature(X = "RDD", FUN = "function"),
           function(X, FUN) {
@@ -165,30 +165,30 @@ setMethod("flatMapValues",

 ############ Shuffle Functions ############

-#' Partition an RDD by key
-#'
-#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
-#' For each element of this RDD, the partitioner is used to compute a hash
-#' function and the RDD is partitioned using this hash value.
-#'
-#' @param x The RDD to partition. Should be an RDD where each element is
-#' list(K, V) or c(K, V).
-#' @param numPartitions Number of partitions to create.
-#' @param ... Other optional arguments to partitionBy.
-#'
-#' @param partitionFunc The partition function to use. Uses a default hashCode
-#' function if not provided
-#' @return An RDD partitioned using the specified partitioner.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
-#' rdd <- parallelize(sc, pairs)
-#' parts <- partitionBy(rdd, 2L)
-#' collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4)
-#'}
-#' @rdname partitionBy
-#' @aliases partitionBy,RDD,integer-method
+# Partition an RDD by key
+#
+# This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
+# For each element of this RDD, the partitioner is used to compute a hash
+# value, and the RDD is partitioned using this hash value.
+#
+# @param x The RDD to partition. Should be an RDD where each element is
+# list(K, V) or c(K, V).
+# @param numPartitions Number of partitions to create.
+# @param ... Other optional arguments to partitionBy.
+#
+# @param partitionFunc The partition function to use. Uses a default hashCode
+# function if not provided
+# @return An RDD partitioned using the specified partitioner.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+# rdd <- parallelize(sc, pairs)
+# parts <- partitionBy(rdd, 2L)
+# collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4)
+#}
+# @rdname partitionBy
+# @aliases partitionBy,RDD,integer-method
 setMethod("partitionBy",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, partitionFunc = hashCode) {
@@ -234,27 +234,27 @@ setMethod("partitionBy",
             RDD(r, serializedMode = "byte")
           })

-#' Group values by key
-#'
-#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
-#' and group values for each key in the RDD into a single sequence.
-#'
-#' @param x The RDD to group. Should be an RDD where each element is
-#' list(K, V) or c(K, V).
-#' @param numPartitions Number of partitions to create.
-#' @return An RDD where each element is list(K, list(V))
-#' @seealso reduceByKey
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
-#' rdd <- parallelize(sc, pairs)
-#' parts <- groupByKey(rdd, 2L)
-#' grouped <- collect(parts)
-#' grouped[[1]] # Should be a list(1, list(2, 4))
-#'}
-#' @rdname groupByKey
-#' @aliases groupByKey,RDD,integer-method
+# Group values by key
+#
+# This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
+# and groups values for each key in the RDD into a single sequence.
+#
+# @param x The RDD to group. Should be an RDD where each element is
+# list(K, V) or c(K, V).
+# @param numPartitions Number of partitions to create.
+# @return An RDD where each element is list(K, list(V))
+# @seealso reduceByKey
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+# rdd <- parallelize(sc, pairs)
+# parts <- groupByKey(rdd, 2L)
+# grouped <- collect(parts)
+# grouped[[1]] # Should be a list(1, list(2, 4))
+#}
+# @rdname groupByKey
+# @aliases groupByKey,RDD,integer-method
 setMethod("groupByKey",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions) {
@@ -292,28 +292,28 @@ setMethod("groupByKey",
             lapplyPartition(shuffled, groupVals)
           })

-#' Merge values by key
-#'
-#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
-#' and merges the values for each key using an associative reduce function.
-#'
-#' @param x The RDD to reduce by key. Should be an RDD where each element is
-#' list(K, V) or c(K, V).
-#' @param combineFunc The associative reduce function to use.
-#' @param numPartitions Number of partitions to create.
-#' @return An RDD where each element is list(K, V') where V' is the merged
-#' value
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
-#' rdd <- parallelize(sc, pairs)
-#' parts <- reduceByKey(rdd, "+", 2L)
-#' reduced <- collect(parts)
-#' reduced[[1]] # Should be a list(1, 6)
-#'}
-#' @rdname reduceByKey
-#' @aliases reduceByKey,RDD,integer-method
+# Merge values by key
+#
+# This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
+# and merges the values for each key using an associative reduce function.
+#
+# @param x The RDD to reduce by key. Should be an RDD where each element is
+# list(K, V) or c(K, V).
+# @param combineFunc The associative reduce function to use.
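reduceByKey() is the one-step spelling of groupByKey() followed by a per-key reduce; both produce the same totals, as this sketch under the usual session assumptions shows:

sc <- sparkR.init()
pairs <- parallelize(sc, list(list(1, 2), list(1, 4), list(2, 3)))
collect(reduceByKey(pairs, "+", 2L))  # list(list(1, 6), list(2, 3))
grouped <- groupByKey(pairs, 2L)
collect(mapValues(grouped, function(vs) { Reduce("+", vs) }))  # same totals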
+# @param numPartitions Number of partitions to create.
+# @return An RDD where each element is list(K, V') where V' is the merged
+# value
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+# rdd <- parallelize(sc, pairs)
+# parts <- reduceByKey(rdd, "+", 2L)
+# reduced <- collect(parts)
+# reduced[[1]] # Should be a list(1, 6)
+#}
+# @rdname reduceByKey
+# @aliases reduceByKey,RDD,integer-method
 setMethod("reduceByKey",
           signature(x = "RDD", combineFunc = "ANY", numPartitions = "numeric"),
           function(x, combineFunc, numPartitions) {
@@ -333,27 +333,27 @@ setMethod("reduceByKey",
             lapplyPartition(shuffled, reduceVals)
           })

-#' Merge values by key locally
-#'
-#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
-#' and merges the values for each key using an associative reduce function, but return the
-#' results immediately to the driver as an R list.
-#'
-#' @param x The RDD to reduce by key. Should be an RDD where each element is
-#' list(K, V) or c(K, V).
-#' @param combineFunc The associative reduce function to use.
-#' @return A list of elements of type list(K, V') where V' is the merged value for each key
-#' @seealso reduceByKey
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
-#' rdd <- parallelize(sc, pairs)
-#' reduced <- reduceByKeyLocally(rdd, "+")
-#' reduced # list(list(1, 6), list(1.1, 3))
-#'}
-#' @rdname reduceByKeyLocally
-#' @aliases reduceByKeyLocally,RDD,integer-method
+# Merge values by key locally
+#
+# This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
+# and merges the values for each key using an associative reduce function, but returns the
+# results immediately to the driver as an R list.
+#
+# @param x The RDD to reduce by key. Should be an RDD where each element is
+# list(K, V) or c(K, V).
+# @param combineFunc The associative reduce function to use.
+# @return A list of elements of type list(K, V') where V' is the merged value for each key
+# @seealso reduceByKey
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+# rdd <- parallelize(sc, pairs)
+# reduced <- reduceByKeyLocally(rdd, "+")
+# reduced # list(list(1, 6), list(1.1, 3))
+#}
+# @rdname reduceByKeyLocally
+# @aliases reduceByKeyLocally,RDD,integer-method
 setMethod("reduceByKeyLocally",
           signature(x = "RDD", combineFunc = "ANY"),
           function(x, combineFunc) {
@@ -385,41 +385,41 @@ setMethod("reduceByKeyLocally",
             convertEnvsToList(merged[[1]], merged[[2]])
           })

-#' Combine values by key
-#'
-#' Generic function to combine the elements for each key using a custom set of
-#' aggregation functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)],
-#' for a "combined type" C. Note that V and C can be different -- for example, one
-#' might group an RDD of type (Int, Int) into an RDD of type (Int, Seq[Int]).
-
-#' Users provide three functions:
-#' \itemize{
-#'   \item createCombiner, which turns a V into a C (e.g., creates a one-element list)
-#'   \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list) -
-#'   \item mergeCombiners, to combine two C's into a single one (e.g., concatentates
-#'    two lists).
-#' }
-#'
-#' @param x The RDD to combine. Should be an RDD where each element is
-#' list(K, V) or c(K, V).
-#' Combine values by key
-#'
-#' Generic function to combine the elements for each key using a custom set of
-#' aggregation functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)],
-#' for a "combined type" C. Note that V and C can be different -- for example, one
-#' might group an RDD of type (Int, Int) into an RDD of type (Int, Seq[Int]).
-
-#' Users provide three functions:
-#' \itemize{
-#' \item createCombiner, which turns a V into a C (e.g., creates a one-element list)
-#' \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list) -
-#' \item mergeCombiners, to combine two C's into a single one (e.g., concatentates
-#' two lists).
-#' }
-#'
-#' @param x The RDD to combine. Should be an RDD where each element is
-#' list(K, V) or c(K, V).
-#' @param createCombiner Create a combiner (C) given a value (V)
-#' @param mergeValue Merge the given value (V) with an existing combiner (C)
-#' @param mergeCombiners Merge two combiners and return a new combiner
-#' @param numPartitions Number of partitions to create.
-#' @return An RDD where each element is list(K, C) where C is the combined type
-#'
-#' @seealso groupByKey, reduceByKey
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
-#' rdd <- parallelize(sc, pairs)
-#' parts <- combineByKey(rdd, function(x) { x }, "+", "+", 2L)
-#' combined <- collect(parts)
-#' combined[[1]] # Should be a list(1, 6)
-#'}
-#' @rdname combineByKey
-#' @aliases combineByKey,RDD,ANY,ANY,ANY,integer-method
+# Combine values by key
+#
+# Generic function to combine the elements for each key using a custom set of
+# aggregation functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)],
+# for a "combined type" C. Note that V and C can be different -- for example, one
+# might group an RDD of type (Int, Int) into an RDD of type (Int, Seq[Int]).
+
+# Users provide three functions:
+# \itemize{
+# \item createCombiner, which turns a V into a C (e.g., creates a one-element list)
+# \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list)
+# \item mergeCombiners, to combine two C's into a single one (e.g., concatenates
+# two lists).
+# }
+#
+# @param x The RDD to combine. Should be an RDD where each element is
+# list(K, V) or c(K, V).
+# @param createCombiner Create a combiner (C) given a value (V)
+# @param mergeValue Merge the given value (V) with an existing combiner (C)
+# @param mergeCombiners Merge two combiners and return a new combiner
+# @param numPartitions Number of partitions to create.
+# @return An RDD where each element is list(K, C) where C is the combined type
+#
+# @seealso groupByKey, reduceByKey
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+# rdd <- parallelize(sc, pairs)
+# parts <- combineByKey(rdd, function(x) { x }, "+", "+", 2L)
+# combined <- collect(parts)
+# combined[[1]] # Should be a list(1, 6)
+#}
+# @rdname combineByKey
+# @aliases combineByKey,RDD,ANY,ANY,ANY,integer-method
 setMethod("combineByKey",
           signature(x = "RDD", createCombiner = "ANY", mergeValue = "ANY",
                     mergeCombiners = "ANY", numPartitions = "numeric"),
@@ -451,36 +451,36 @@ setMethod("combineByKey",
   lapplyPartition(shuffled, mergeAfterShuffle)
 })
 
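To make the three functions concrete, here is a hedged plain-R sketch of how they compose for a single key whose values 2 and 4 arrive in different partitions. The function bodies below are illustrative choices that match the numeric example above, not SparkR internals:

createCombiner <- function(v) v             # V -> C: start a combiner from one value
mergeValue     <- function(c, v) c + v      # (C, V) -> C: fold a value in, within a partition
mergeCombiners <- function(c1, c2) c1 + c2  # (C, C) -> C: merge partials across partitions
c1 <- createCombiner(2)                     # partition 1 saw the value 2 under key 1
c2 <- createCombiner(4)                     # partition 2 saw the value 4 under key 1
mergeCombiners(c1, c2)                      # 6, matching combined[[1]] == list(1, 6)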
-#' Aggregate a pair RDD by each key.
-#'
-#' Aggregate the values of each key in an RDD, using given combine functions
-#' and a neutral "zero value". This function can return a different result type,
-#' U, than the type of the values in this RDD, V. Thus, we need one operation
-#' for merging a V into a U and one operation for merging two U's, The former
-#' operation is used for merging values within a partition, and the latter is
-#' used for merging values between partitions. To avoid memory allocation, both
-#' of these functions are allowed to modify and return their first argument
-#' instead of creating a new U.
-#'
-#' @param x An RDD.
-#' @param zeroValue A neutral "zero value".
-#' @param seqOp A function to aggregate the values of each key. It may return
-#' a different result type from the type of the values.
-#' @param combOp A function to aggregate results of seqOp.
-#' @return An RDD containing the aggregation result.
-#' @seealso foldByKey, combineByKey
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
-#' zeroValue <- list(0, 0)
-#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
-#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
-#' aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
-#' # list(list(1, list(3, 2)), list(2, list(7, 2)))
-#'}
-#' @rdname aggregateByKey
-#' @aliases aggregateByKey,RDD,ANY,ANY,ANY,integer-method
+# Aggregate a pair RDD by each key.
+#
+# Aggregate the values of each key in an RDD, using given combine functions
+# and a neutral "zero value". This function can return a different result type,
+# U, than the type of the values in this RDD, V. Thus, we need one operation
+# for merging a V into a U and one operation for merging two U's. The former
+# operation is used for merging values within a partition, and the latter is
+# used for merging values between partitions. To avoid memory allocation, both
+# of these functions are allowed to modify and return their first argument
+# instead of creating a new U.
+#
+# @param x An RDD.
+# @param zeroValue A neutral "zero value".
+# @param seqOp A function to aggregate the values of each key. It may return
+# a different result type from the type of the values.
+# @param combOp A function to aggregate results of seqOp.
+# @return An RDD containing the aggregation result.
+# @seealso foldByKey, combineByKey
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+# zeroValue <- list(0, 0)
+# seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+# combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+# aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
+# # list(list(1, list(3, 2)), list(2, list(7, 2)))
+#}
+# @rdname aggregateByKey
+# @aliases aggregateByKey,RDD,ANY,ANY,ANY,integer-method
 setMethod("aggregateByKey",
           signature(x = "RDD", zeroValue = "ANY", seqOp = "ANY",
                     combOp = "ANY", numPartitions = "numeric"),
@@ -492,26 +492,26 @@ setMethod("aggregateByKey",
   combineByKey(x, createCombiner, seqOp, combOp, numPartitions)
 })
 
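The (sum, count) zero value in the example above is worth tracing once by hand. A hedged plain-R sketch (part1 and part2 are made-up names) of what seqOp and combOp accumulate for key 1 when its two values land in different partitions:

zeroValue <- list(0, 0)
seqOp  <- function(acc, v) list(acc[[1]] + v, acc[[2]] + 1)
combOp <- function(a, b) list(a[[1]] + b[[1]], a[[2]] + b[[2]])
part1 <- Reduce(seqOp, list(1), zeroValue)  # key 1's value in partition 1
part2 <- Reduce(seqOp, list(2), zeroValue)  # key 1's value in partition 2
combOp(part1, part2)                        # list(3, 2): sum 3 over 2 values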
-#' Fold a pair RDD by each key.
-#'
-#' Aggregate the values of each key in an RDD, using an associative function "func"
-#' and a neutral "zero value" which may be added to the result an arbitrary
-#' number of times, and must not change the result (e.g., 0 for addition, or
-#' 1 for multiplication.).
-#'
-#' @param x An RDD.
-#' @param zeroValue A neutral "zero value".
-#' @param func An associative function for folding values of each key.
-#' @return An RDD containing the aggregation result.
-#' @seealso aggregateByKey, combineByKey
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
-#' foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
-#'}
-#' @rdname foldByKey
-#' @aliases foldByKey,RDD,ANY,ANY,integer-method
+# Fold a pair RDD by each key.
+#
+# Aggregate the values of each key in an RDD, using an associative function "func"
+# and a neutral "zero value" which may be added to the result an arbitrary
+# number of times, and must not change the result (e.g., 0 for addition, or
+# 1 for multiplication).
+#
+# @param x An RDD.
+# @param zeroValue A neutral "zero value".
+# @param func An associative function for folding values of each key.
+# @return An RDD containing the aggregation result.
+# @seealso aggregateByKey, combineByKey
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+# foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
+#}
+# @rdname foldByKey
+# @aliases foldByKey,RDD,ANY,ANY,integer-method
 setMethod("foldByKey",
           signature(x = "RDD", zeroValue = "ANY",
                     func = "ANY", numPartitions = "numeric"),
@@ -521,28 +521,28 @@ setMethod("foldByKey",
 
 ############ Binary Functions #############
 
-#' Join two RDDs
-#'
-#' @description
-#' \code{join} This function joins two RDDs where every element is of the form list(K, V).
-#' The key types of the two RDDs should be the same.
-#'
-#' @param x An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param y An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param numPartitions Number of partitions to create.
-#' @return a new RDD containing all pairs of elements with matching keys in
-#' two input RDDs.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
-#' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
-#' join(rdd1, rdd2, 2L) # list(list(1, list(1, 2)), list(1, list(1, 3))
-#'}
-#' @rdname join-methods
-#' @aliases join,RDD,RDD-method
+# Join two RDDs
+#
+# @description
+# \code{join}: This function joins two RDDs where every element is of the form list(K, V).
+# The key types of the two RDDs should be the same.
+#
+# @param x An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param y An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param numPartitions Number of partitions to create.
+# @return A new RDD containing all pairs of elements with matching keys in
+# two input RDDs.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+# rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+# join(rdd1, rdd2, 2L) # list(list(1, list(1, 2)), list(1, list(1, 3)))
+#}
+# @rdname join-methods
+# @aliases join,RDD,RDD-method
 setMethod("join",
           signature(x = "RDD", y = "RDD"),
           function(x, y, numPartitions) {
@@ -557,30 +557,30 @@ setMethod("join",
                              doJoin)
 })
 
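The inner-join semantics can be checked against a plain-R nested loop over local pair lists. This is a hedged sketch mirroring the rdd1/rdd2 example above (joined is an illustrative name), not how SparkR implements it:

rdd1 <- list(list(1, 1), list(2, 4))
rdd2 <- list(list(1, 2), list(1, 3))
joined <- list()
for (p in rdd1) {
  for (q in rdd2) {
    if (identical(p[[1]], q[[1]])) {
      joined[[length(joined) + 1]] <- list(p[[1]], list(p[[2]], q[[2]]))
    }
  }
}
joined  # list(list(1, list(1, 2)), list(1, list(1, 3)))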
-#' Left outer join two RDDs
-#'
-#' @description
-#' \code{leftouterjoin} This function left-outer-joins two RDDs where every element is of the form list(K, V).
-#' The key types of the two RDDs should be the same.
-#'
-#' @param x An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param y An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param numPartitions Number of partitions to create.
-#' @return For each element (k, v) in x, the resulting RDD will either contain
-#' all pairs (k, (v, w)) for (k, w) in rdd2, or the pair (k, (v, NULL))
-#' if no elements in rdd2 have key k.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
-#' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
-#' leftOuterJoin(rdd1, rdd2, 2L)
-#' # list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
-#'}
-#' @rdname join-methods
-#' @aliases leftOuterJoin,RDD,RDD-method
+# Left outer join two RDDs
+#
+# @description
+# \code{leftOuterJoin}: This function left-outer-joins two RDDs where every element is of the form list(K, V).
+# The key types of the two RDDs should be the same.
+#
+# @param x An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param y An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param numPartitions Number of partitions to create.
+# @return For each element (k, v) in x, the resulting RDD will either contain
+# all pairs (k, (v, w)) for (k, w) in y, or the pair (k, (v, NULL))
+# if no elements in y have key k.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+# rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+# leftOuterJoin(rdd1, rdd2, 2L)
+# # list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
+#}
+# @rdname join-methods
+# @aliases leftOuterJoin,RDD,RDD-method
 setMethod("leftOuterJoin",
           signature(x = "RDD", y = "RDD", numPartitions = "numeric"),
           function(x, y, numPartitions) {
@@ -594,30 +594,30 @@ setMethod("leftOuterJoin",
   joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions),
                           doJoin)
 })
 
-#' Right outer join two RDDs
-#'
-#' @description
-#' \code{rightouterjoin} This function right-outer-joins two RDDs where every element is of the form list(K, V).
-#' The key types of the two RDDs should be the same.
-#'
-#' @param x An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param y An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param numPartitions Number of partitions to create.
-#' @return For each element (k, w) in y, the resulting RDD will either contain
-#' all pairs (k, (v, w)) for (k, v) in x, or the pair (k, (NULL, w))
-#' if no elements in x have key k.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
-#' rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
-#' rightOuterJoin(rdd1, rdd2, 2L)
-#' # list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
-#'}
-#' @rdname join-methods
-#' @aliases rightOuterJoin,RDD,RDD-method
+# Right outer join two RDDs
+#
+# @description
+# \code{rightOuterJoin}: This function right-outer-joins two RDDs where every element is of the form list(K, V).
+# The key types of the two RDDs should be the same.
+#
+# @param x An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param y An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param numPartitions Number of partitions to create.
+# @return For each element (k, w) in y, the resulting RDD will either contain
+# all pairs (k, (v, w)) for (k, v) in x, or the pair (k, (NULL, w))
+# if no elements in x have key k.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+# rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+# rightOuterJoin(rdd1, rdd2, 2L)
+# # list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
+#}
+# @rdname join-methods
+# @aliases rightOuterJoin,RDD,RDD-method
 setMethod("rightOuterJoin",
           signature(x = "RDD", y = "RDD", numPartitions = "numeric"),
           function(x, y, numPartitions) {
@@ -631,33 +631,33 @@ setMethod("rightOuterJoin",
   joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions),
                           doJoin)
 })
 
-#' Full outer join two RDDs
-#'
-#' @description
-#' \code{fullouterjoin} This function full-outer-joins two RDDs where every element is of the form list(K, V).
-#' The key types of the two RDDs should be the same.
-#'
-#' @param x An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param y An RDD to be joined. Should be an RDD where each element is
-#' list(K, V).
-#' @param numPartitions Number of partitions to create.
-#' @return For each element (k, v) in x and (k, w) in y, the resulting RDD
-#' will contain all pairs (k, (v, w)) for both (k, v) in x and
-#' (k, w) in y, or the pair (k, (NULL, w))/(k, (v, NULL)) if no elements
-#' in x/y have key k.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
-#' rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
-#' fullOuterJoin(rdd1, rdd2, 2L) # list(list(1, list(2, 1)),
-#'                               #      list(1, list(3, 1)),
-#'                               #      list(2, list(NULL, 4)))
-#'                               #      list(3, list(3, NULL)),
-#'}
-#' @rdname join-methods
-#' @aliases fullOuterJoin,RDD,RDD-method
+# Full outer join two RDDs
+#
+# @description
+# \code{fullOuterJoin}: This function full-outer-joins two RDDs where every element is of the form list(K, V).
+# The key types of the two RDDs should be the same.
+#
+# @param x An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param y An RDD to be joined. Should be an RDD where each element is
+# list(K, V).
+# @param numPartitions Number of partitions to create.
+# @return For each element (k, v) in x and (k, w) in y, the resulting RDD
+# will contain all pairs (k, (v, w)) for both (k, v) in x and
+# (k, w) in y, or the pair (k, (NULL, w))/(k, (v, NULL)) if no elements
+# in x/y have key k.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
+# rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+# fullOuterJoin(rdd1, rdd2, 2L) # list(list(1, list(2, 1)),
+#                               #      list(1, list(3, 1)),
+#                               #      list(2, list(NULL, 4)),
+#                               #      list(3, list(3, NULL)))
+#}
+# @rdname join-methods
+# @aliases fullOuterJoin,RDD,RDD-method
 setMethod("fullOuterJoin",
           signature(x = "RDD", y = "RDD", numPartitions = "numeric"),
           function(x, y, numPartitions) {
@@ -671,23 +671,23 @@ setMethod("fullOuterJoin",
   joined <- flatMapValues(groupByKey(unionRDD(xTagged, yTagged), numPartitions),
                           doJoin)
 })
 
-#' For each key k in several RDDs, return a resulting RDD that
-#' whose values are a list of values for the key in all RDDs.
-#'
-#' @param ... Several RDDs.
-#' @param numPartitions Number of partitions to create.
-#' @return a new RDD containing all pairs of elements with values in a list
-#' in all RDDs.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
-#' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
-#' cogroup(rdd1, rdd2, numPartitions = 2L)
-#' # list(list(1, list(1, list(2, 3))), list(2, list(list(4), list()))
-#'}
-#' @rdname cogroup
-#' @aliases cogroup,RDD-method
+# For each key k in several RDDs, return a resulting RDD whose
+# values are a list of values for the key in all RDDs.
+#
+# @param ... Several RDDs.
+# @param numPartitions Number of partitions to create.
+# @return A new RDD containing all pairs of elements with values in a list
+# in all RDDs.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+# rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+# cogroup(rdd1, rdd2, numPartitions = 2L)
+# # list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list())))
+#}
+# @rdname cogroup
+# @aliases cogroup,RDD-method
 setMethod("cogroup",
           "RDD",
           function(..., numPartitions) {
@@ -723,20 +723,20 @@ setMethod("cogroup",
                          group.func)
 })
 
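cogroup returns, for every key present in either input, one list of values per input RDD (an empty list when a key is absent from one side). A hedged plain-R sketch of the two-RDD case (allKeys, v1, v2, and cogrouped are made-up names):

rdd1 <- list(list(1, 1), list(2, 4))
rdd2 <- list(list(1, 2), list(1, 3))
allKeys <- unique(c(sapply(rdd1, `[[`, 1), sapply(rdd2, `[[`, 1)))
cogrouped <- lapply(allKeys, function(k) {
  v1 <- lapply(Filter(function(p) identical(p[[1]], k), rdd1), `[[`, 2)
  v2 <- lapply(Filter(function(p) identical(p[[1]], k), rdd2), `[[`, 2)
  list(k, list(v1, v2))        # one value list per input RDD
})
cogrouped[[2]]                 # list(2, list(list(4), list()))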
-#' Sort a (k, v) pair RDD by k.
-#'
-#' @param x A (k, v) pair RDD to be sorted.
-#' @param ascending A flag to indicate whether the sorting is ascending or descending.
-#' @param numPartitions Number of partitions to create.
-#' @return An RDD where all (k, v) pair elements are sorted.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, list(list(3, 1), list(2, 2), list(1, 3)))
-#' collect(sortByKey(rdd)) # list (list(1, 3), list(2, 2), list(3, 1))
-#'}
-#' @rdname sortByKey
-#' @aliases sortByKey,RDD,RDD-method
+# Sort a (k, v) pair RDD by k.
+#
+# @param x A (k, v) pair RDD to be sorted.
+# @param ascending A flag to indicate whether the sorting is ascending or descending.
+# @param numPartitions Number of partitions to create.
+# @return An RDD where all (k, v) pair elements are sorted.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, list(list(3, 1), list(2, 2), list(1, 3)))
+# collect(sortByKey(rdd)) # list(list(1, 3), list(2, 2), list(3, 1))
+#}
+# @rdname sortByKey
+# @aliases sortByKey,RDD,RDD-method
 setMethod("sortByKey",
           signature(x = "RDD"),
           function(x, ascending = TRUE, numPartitions = SparkR:::numPartitions(x)) {
@@ -785,25 +785,25 @@ setMethod("sortByKey",
   lapplyPartition(newRDD, partitionFunc)
 })
 
-#' Subtract a pair RDD with another pair RDD.
-#'
-#' Return an RDD with the pairs from x whose keys are not in other.
-#'
-#' @param x An RDD.
-#' @param other An RDD.
-#' @param numPartitions Number of the partitions in the result RDD.
-#' @return An RDD with the pairs from x whose keys are not in other.
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4),
-#' list("b", 5), list("a", 2)))
-#' rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
-#' collect(subtractByKey(rdd1, rdd2))
-#' # list(list("b", 4), list("b", 5))
-#'}
-#' @rdname subtractByKey
-#' @aliases subtractByKey,RDD
+# Subtract a pair RDD with another pair RDD.
+#
+# Return an RDD with the pairs from x whose keys are not in other.
+#
+# @param x An RDD.
+# @param other An RDD.
+# @param numPartitions Number of the partitions in the result RDD.
+# @return An RDD with the pairs from x whose keys are not in other.
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4),
+# list("b", 5), list("a", 2)))
+# rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
+# collect(subtractByKey(rdd1, rdd2))
+# # list(list("b", 4), list("b", 5))
+#}
+# @rdname subtractByKey
+# @aliases subtractByKey,RDD
 setMethod("subtractByKey",
           signature(x = "RDD", other = "RDD"),
           function(x, other, numPartitions = SparkR:::numPartitions(x)) {
@@ -819,41 +819,41 @@ setMethod("subtractByKey",
   function (v) { v[[1]] })
 })
 
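The contract of subtractByKey fits in a few lines of base R: keep exactly the pairs of x whose key does not occur in other. A hedged sketch (otherKeys is a made-up name), matching the example above:

x <- list(list("a", 1), list("b", 4), list("b", 5), list("a", 2))
other <- list(list("a", 3), list("c", 1))
otherKeys <- sapply(other, `[[`, 1)
Filter(function(p) !(p[[1]] %in% otherKeys), x)  # list(list("b", 4), list("b", 5))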
-#' Return a subset of this RDD sampled by key.
-#'
-#' @description
-#' \code{sampleByKey} Create a sample of this RDD using variable sampling rates
-#' for different keys as specified by fractions, a key to sampling rate map.
-#'
-#' @param x The RDD to sample elements by key, where each element is
-#' list(K, V) or c(K, V).
-#' @param withReplacement Sampling with replacement or not
-#' @param fraction The (rough) sample target fraction
-#' @param seed Randomness seed value
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:3000)
-#' pairs <- lapply(rdd, function(x) { if (x %% 3 == 0) list("a", x)
-#' else { if (x %% 3 == 1) list("b", x) else list("c", x) }})
-#' fractions <- list(a = 0.2, b = 0.1, c = 0.3)
-#' sample <- sampleByKey(pairs, FALSE, fractions, 1618L)
-#' 100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")) # TRUE
-#' 50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")) # TRUE
-#' 200 < length(lookup(sample, "c")) && 400 > length(lookup(sample, "c")) # TRUE
-#' lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0 # TRUE
-#' lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000 # TRUE
-#' lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0 # TRUE
-#' lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000 # TRUE
-#' lookup(sample, "c")[which.min(lookup(sample, "c"))] >= 0 # TRUE
-#' lookup(sample, "c")[which.max(lookup(sample, "c"))] <= 2000 # TRUE
-#' fractions <- list(a = 0.2, b = 0.1, c = 0.3, d = 0.4)
-#' sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # Key "d" will be ignored
-#' fractions <- list(a = 0.2, b = 0.1)
-#' sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # KeyError: "c"
-#'}
-#' @rdname sampleByKey
-#' @aliases sampleByKey,RDD-method
+# Return a subset of this RDD sampled by key.
+#
+# @description
+# \code{sampleByKey}: Create a sample of this RDD using variable sampling rates
+# for different keys as specified by fractions, a key to sampling rate map.
+#
+# @param x The RDD to sample elements by key, where each element is
+# list(K, V) or c(K, V).
+# @param withReplacement Sampling with replacement or not
+# @param fractions The (rough) target sampling rate for each key, as a key to rate map
+# @param seed Randomness seed value
+# @examples
+#\dontrun{
+# sc <- sparkR.init()
+# rdd <- parallelize(sc, 1:3000)
+# pairs <- lapply(rdd, function(x) { if (x %% 3 == 0) list("a", x)
+# else { if (x %% 3 == 1) list("b", x) else list("c", x) }})
+# fractions <- list(a = 0.2, b = 0.1, c = 0.3)
+# sample <- sampleByKey(pairs, FALSE, fractions, 1618L)
+# 100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")) # TRUE
+# 50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")) # TRUE
+# 200 < length(lookup(sample, "c")) && 400 > length(lookup(sample, "c")) # TRUE
+# lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0 # TRUE
+# lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000 # TRUE
+# lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0 # TRUE
+# lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000 # TRUE
+# lookup(sample, "c")[which.min(lookup(sample, "c"))] >= 0 # TRUE
+# lookup(sample, "c")[which.max(lookup(sample, "c"))] <= 2000 # TRUE
+# fractions <- list(a = 0.2, b = 0.1, c = 0.3, d = 0.4)
+# sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # Key "d" will be ignored
+# fractions <- list(a = 0.2, b = 0.1)
+# sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # KeyError: "c"
+#}
+# @rdname sampleByKey
+# @aliases sampleByKey,RDD-method
 setMethod("sampleByKey",
           signature(x = "RDD", withReplacement = "logical",
                     fractions = "vector", seed = "integer"),