From 3a03728e784f65f5e7ed01feb4a6c594b9728bab Mon Sep 17 00:00:00 2001 From: merzouk Date: Wed, 26 May 2021 14:39:42 +0200 Subject: [PATCH 1/3] first commit branch --- src/main/scala/io/github/jsarni/CaraModel.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/scala/io/github/jsarni/CaraModel.scala b/src/main/scala/io/github/jsarni/CaraModel.scala index 6b678d7..1adb2f4 100644 --- a/src/main/scala/io/github/jsarni/CaraModel.scala +++ b/src/main/scala/io/github/jsarni/CaraModel.scala @@ -25,7 +25,9 @@ final class CaraModel(yamlPath: String, datasetPath: String, format: String, sav def generateReport(model: PipelineModel) : Try[Unit] = ??? - private def generateModel(caraPipeline: CaraPipeline): Try[Pipeline] = ??? + private def generateModel(caraPipeline: CaraPipeline): Try[Pipeline] = Try{ + ??? + } private def train(pipeline: Pipeline, dataset: Dataset[_]): Try[PipelineModel] = Try { pipeline.fit(dataset) From 76672d3ed118aee5d3df709b09c1c4ecaddd3a48 Mon Sep 17 00:00:00 2001 From: merzouk Date: Wed, 26 May 2021 19:18:18 +0200 Subject: [PATCH 2/3] finish generateModel method and add CaraModelTest class --- .../scala/io/github/jsarni/CaraModel.scala | 38 ++++++++++++++-- .../io/github/jsarni/CaraModelTest.scala | 43 +++++++++++++++++++ 2 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 src/test/scala/io/github/jsarni/CaraModelTest.scala diff --git a/src/main/scala/io/github/jsarni/CaraModel.scala b/src/main/scala/io/github/jsarni/CaraModel.scala index 1adb2f4..806773a 100644 --- a/src/main/scala/io/github/jsarni/CaraModel.scala +++ b/src/main/scala/io/github/jsarni/CaraModel.scala @@ -5,6 +5,7 @@ import io.github.jsarni.DatasetLoader.CaraLoader import io.github.jsarni.PipelineParser.{CaraParser, CaraPipeline} import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.ml.tuning.{CrossValidator, TrainValidationSplit} import scala.util.Try @@ -25,11 +26,42 @@ final class CaraModel(yamlPath: String, datasetPath: String, format: String, sav def generateReport(model: PipelineModel) : Try[Unit] = ??? - private def generateModel(caraPipeline: CaraPipeline): Try[Pipeline] = Try{ - ??? + + private def generateModel(caraPipeline: CaraPipeline) : Try[Pipeline] = Try { + val pipeline = caraPipeline.pipeline + val evaluator = caraPipeline.evaluator + val tuningStage = caraPipeline.tuner.tuningStage + val methodeName = "set" + caraPipeline.tuner.paramName + val model = tuningStage match { + case "CrossValidator" => { + val paramValue = caraPipeline.tuner.paramValue + val crossValidatorModel = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(evaluator) + .setParallelism(2) + + crossValidatorModel.getClass.getMethod(methodeName, Int.getClass ) + .invoke(crossValidatorModel,paramValue.asInstanceOf[java.lang.Integer]) + + new Pipeline().setStages(Array(crossValidatorModel)) + } + case "TrainValidationSplit" => { + val paramValue = caraPipeline.tuner.paramValue + val validationSplitModel = new TrainValidationSplit() + .setEstimator(pipeline) + .setEvaluator(evaluator) + .setParallelism(2) + + validationSplitModel.getClass.getMethod(methodeName, Double.getClass ) + .invoke(validationSplitModel,paramValue.asInstanceOf[java.lang.Double]) + + new Pipeline().setStages(Array(validationSplitModel)) + } + } + model } - private def train(pipeline: Pipeline, dataset: Dataset[_]): Try[PipelineModel] = Try { + private def train(pipeline: Pipeline , dataset: Dataset[_]): Try[PipelineModel] = Try { pipeline.fit(dataset) } diff --git a/src/test/scala/io/github/jsarni/CaraModelTest.scala b/src/test/scala/io/github/jsarni/CaraModelTest.scala new file mode 100644 index 0000000..ba328c0 --- /dev/null +++ b/src/test/scala/io/github/jsarni/CaraModelTest.scala @@ -0,0 +1,43 @@ +package io.github.jsarni +import io.github.jsarni.CaraStage.ModelStage.LogisticRegression +import io.github.jsarni.CaraStage.TuningStage.TuningStageDescription +import io.github.jsarni.PipelineParser.CaraPipeline +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator} + + +class CaraModelTest extends TestBase { + "generateModel" should "Return validation model with the right method and params" in { + val params = Map( + "MaxIter" -> "10", + "RegParam" -> "0.3", + "ElasticNetParam" -> "0.1", + "Family" -> "multinomial", + "FeaturesCol" -> "FeatureColname", + "FitIntercept" -> "True", + "PredictionCol" -> "Age", + "ProbabilityCol" -> "ProbaColname", + "RawPredictionCol"-> "RawPredictColname", + "Standardization" -> "True", + "Tol" -> "0.13", + "WeightCol" -> "WeightColname" + ) + val lr = LogisticRegression(params) + val crossEvaluator = new BinaryClassificationEvaluator + val crossTuner = TuningStageDescription("CrossValidator", "NumFolds", "2" ) + val splitEvaluator = new RegressionEvaluator + val splitTuner = TuningStageDescription("TrainValidationSplit", "TrainRatio", "0.6" ) + + lr.build().isSuccess shouldBe true + + val pipeline = new Pipeline() + .setStages(Array(lr.build().get)) + + val crossCaraPipeline = CaraPipeline(pipeline, crossEvaluator, crossTuner) + PrivateMethod('generateModel) (crossCaraPipeline) + + val splitCaraPipeline = CaraPipeline(pipeline, splitEvaluator, splitTuner) + PrivateMethod('generateModel) (splitCaraPipeline) + } + +} From 8f830ae6dbf80835bf8749c8eb14d5a54f26109a Mon Sep 17 00:00:00 2001 From: merzouk Date: Thu, 27 May 2021 15:13:18 +0200 Subject: [PATCH 3/3] review cara_pipine_model test --- .../scala/io/github/jsarni/CaraModel.scala | 8 +-- .../io/github/jsarni/CaraModelTest.scala | 57 +++++++++++-------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/src/main/scala/io/github/jsarni/CaraModel.scala b/src/main/scala/io/github/jsarni/CaraModel.scala index 806773a..4e9847e 100644 --- a/src/main/scala/io/github/jsarni/CaraModel.scala +++ b/src/main/scala/io/github/jsarni/CaraModel.scala @@ -34,25 +34,25 @@ final class CaraModel(yamlPath: String, datasetPath: String, format: String, sav val methodeName = "set" + caraPipeline.tuner.paramName val model = tuningStage match { case "CrossValidator" => { - val paramValue = caraPipeline.tuner.paramValue + val paramValue = caraPipeline.tuner.paramValue.toInt val crossValidatorModel = new CrossValidator() .setEstimator(pipeline) .setEvaluator(evaluator) .setParallelism(2) - crossValidatorModel.getClass.getMethod(methodeName, Int.getClass ) + crossValidatorModel.getClass.getMethod(methodeName, paramValue.getClass ) .invoke(crossValidatorModel,paramValue.asInstanceOf[java.lang.Integer]) new Pipeline().setStages(Array(crossValidatorModel)) } case "TrainValidationSplit" => { - val paramValue = caraPipeline.tuner.paramValue + val paramValue = caraPipeline.tuner.paramValue.toDouble val validationSplitModel = new TrainValidationSplit() .setEstimator(pipeline) .setEvaluator(evaluator) .setParallelism(2) - validationSplitModel.getClass.getMethod(methodeName, Double.getClass ) + validationSplitModel.getClass.getMethod(methodeName, paramValue.getClass ) .invoke(validationSplitModel,paramValue.asInstanceOf[java.lang.Double]) new Pipeline().setStages(Array(validationSplitModel)) diff --git a/src/test/scala/io/github/jsarni/CaraModelTest.scala b/src/test/scala/io/github/jsarni/CaraModelTest.scala index ba328c0..e65b79d 100644 --- a/src/test/scala/io/github/jsarni/CaraModelTest.scala +++ b/src/test/scala/io/github/jsarni/CaraModelTest.scala @@ -1,43 +1,50 @@ package io.github.jsarni -import io.github.jsarni.CaraStage.ModelStage.LogisticRegression import io.github.jsarni.CaraStage.TuningStage.TuningStageDescription import io.github.jsarni.PipelineParser.CaraPipeline import org.apache.spark.ml.Pipeline import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator} +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.tuning.{CrossValidator, TrainValidationSplit} +import org.apache.spark.sql.SparkSession + +import scala.util.Try class CaraModelTest extends TestBase { "generateModel" should "Return validation model with the right method and params" in { - val params = Map( - "MaxIter" -> "10", - "RegParam" -> "0.3", - "ElasticNetParam" -> "0.1", - "Family" -> "multinomial", - "FeaturesCol" -> "FeatureColname", - "FitIntercept" -> "True", - "PredictionCol" -> "Age", - "ProbabilityCol" -> "ProbaColname", - "RawPredictionCol"-> "RawPredictColname", - "Standardization" -> "True", - "Tol" -> "0.13", - "WeightCol" -> "WeightColname" - ) - val lr = LogisticRegression(params) + val lr = new LinearRegression() + .setMaxIter(10) + val crossEvaluator = new BinaryClassificationEvaluator - val crossTuner = TuningStageDescription("CrossValidator", "NumFolds", "2" ) + val crossTuner = TuningStageDescription("CrossValidator", "NumFolds", "2") val splitEvaluator = new RegressionEvaluator - val splitTuner = TuningStageDescription("TrainValidationSplit", "TrainRatio", "0.6" ) + val splitTuner = TuningStageDescription("TrainValidationSplit", "TrainRatio", "0.6") - lr.build().isSuccess shouldBe true + implicit val spark: SparkSession = + SparkSession.builder() + .appName("CaraML") + .master("local[1]") + .getOrCreate() + val caraModel = new CaraModel("YamlPath", "datasetPath", "format", "savePath")(spark) val pipeline = new Pipeline() - .setStages(Array(lr.build().get)) - + .setStages(Array(lr)) val crossCaraPipeline = CaraPipeline(pipeline, crossEvaluator, crossTuner) - PrivateMethod('generateModel) (crossCaraPipeline) - val splitCaraPipeline = CaraPipeline(pipeline, splitEvaluator, splitTuner) - PrivateMethod('generateModel) (splitCaraPipeline) - } + val method = PrivateMethod[Try[Pipeline]]('generateModel) + val crossModel = caraModel.invokePrivate(method(crossCaraPipeline)) + val splitModel = caraModel.invokePrivate(method(splitCaraPipeline)) + + crossModel.isSuccess shouldBe true + crossModel.get.getStages.length shouldBe 1 + crossModel.get.getStages.head.isInstanceOf[CrossValidator] shouldBe true + crossModel.get.getStages.head.asInstanceOf[CrossValidator].getNumFolds shouldBe 2 + + splitModel.isSuccess shouldBe true + splitModel.get.getStages.length shouldBe 1 + splitModel.get.getStages.head.isInstanceOf[TrainValidationSplit] shouldBe true + splitModel.get.getStages.head.asInstanceOf[TrainValidationSplit].getTrainRatio shouldBe 0.6 + + } }