high_level: ml: Add predict_features parameter to 'accuracy()'
Signed-off-by: mHash1m <hashimchaudry23@gmail.com>
mhash1m authored and pdxjohnny committed Aug 17, 2021
1 parent 1b0327c commit a520483
Showing 97 changed files with 371 additions and 213 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -25,6 +25,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Usecase example notebook for "Saving and loading models"
- Usecase example notebook for "Transfer Learning"
- Usecase example notebook for "Ensemble by stacking"
- Support for Multi-Output models.
- Usecase example notebook for "Working with Multi-Output models"
### Changed
- Calls to hashlib now go through helper functions
- Build docs using `dffml service dev docs`
@@ -33,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `high_level` split into `ml`, `dataflow` & `source` submodules
- Config objects now support mutability/immutability at the property scope.
See `docs/arch/0003-Config-Property-Mutable-vs-Immutable` for details.
- high_level `accuracy()` now takes the predict feature(s) as a parameter.
### Fixed
- Record object key properties are now always strings

3 changes: 2 additions & 1 deletion dffml/accuracy/accuracy.py
@@ -1,5 +1,6 @@
import abc

from ..feature import Feature
from ..model import ModelContext
from ..source.source import SourcesContext
from ..util.entrypoint import base_entry_point
@@ -25,7 +26,7 @@ def __init__(self, parent: "Accuracy") -> None:

@abc.abstractmethod
async def score(
self, mctx: ModelContext, sources: SourcesContext,
self, mctx: ModelContext, sources: SourcesContext, *args: Feature
) -> float:
"""
Abstract method to get the score
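Note for scorer implementors: the feature(s) to score now arrive as positional arguments rather than being read from the model config. A minimal sketch of a context written against the new signature (the CountMatchesAccuracyContext name and its exact-match logic are hypothetical, and the accompanying config and scorer classes are omitted):

    from dffml.feature import Feature
    from dffml.model import ModelContext
    from dffml.source.source import SourcesContext
    from dffml.accuracy.accuracy import AccuracyContext


    class CountMatchesAccuracyContext(AccuracyContext):
        """
        Hypothetical scorer context: fraction of records whose prediction
        exactly matches the first feature passed in.
        """

        async def score(
            self, mctx: ModelContext, sources: SourcesContext, *features: Feature
        ) -> float:
            feature = features[0]
            total = 0
            matches = 0
            async for record in mctx.predict(sources):
                if str(record.feature(feature.name)) == str(
                    record.prediction(feature.name).value
                ):
                    matches += 1
                total += 1
            return matches / total if total else 0.0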
14 changes: 6 additions & 8 deletions dffml/accuracy/clf.py
@@ -1,6 +1,6 @@
from ..base import config
from ..record import Record
from ..feature import Feature
from ..feature import Feature, Features
from ..model import ModelContext
from ..util.entrypoint import entrypoint
from ..source.source import SourcesContext
@@ -21,16 +21,14 @@ class ClassificationAccuracyContext(AccuracyContext):
Classification Accuracy
"""

async def score(self, mctx: ModelContext, sources: SourcesContext):
if len([mctx.parent.config.predict]) != 1:
raise InvalidNumberOfFeaturesError(
f"{self.__class__.__qualname__} can only assess accuracy of one feature. features: {features}"
)
async def score(
self, mctx: ModelContext, sources: SourcesContext, feature: Feature,
):
total = 0
right_predictions = 0
async for record in mctx.predict(sources):
if str(record.feature(mctx.parent.config.predict.name)) == str(
record.prediction(mctx.parent.config.predict.name).value
if str(record.feature(feature.name)) == str(
record.prediction(feature.name).value
):
right_predictions += 1
total += 1
14 changes: 5 additions & 9 deletions dffml/accuracy/mse.py
@@ -21,18 +21,14 @@ class MeanSquaredErrorAccuracyContext(AccuracyContext):
Mean Squared Error
"""

async def score(self, mctx: ModelContext, sources: SourcesContext):
if len([mctx.parent.config.predict]) != 1:
raise InvalidNumberOfFeaturesError(
f"{self.__class__.__qualname__} can only assess accuracy of one feature. features: {features}"
)
async def score(
self, mctx: ModelContext, sources: SourcesContext, feature: Feature,
):
y = []
y_predict = []
async for record in mctx.predict(sources):
y.append(record.feature(mctx.parent.config.predict.name))
y_predict.append(
record.prediction(mctx.parent.config.predict.name).value
)
y.append(record.feature(feature.name))
y_predict.append(record.prediction(feature.name).value)
accuracy = sum(
list(map(lambda x, y: abs(x - y) ** 2, y, y_predict))
) / len(y)
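For reference, the scoring logic above boils down to a plain mean-squared-error computation; a standalone illustration with made-up values:

    # Made-up actual and predicted values for a single feature
    y = [50, 60, 70]
    y_predict = [52, 59, 71]
    # Same computation as in score() above: mean of squared differences
    mse = sum(abs(a - b) ** 2 for a, b in zip(y, y_predict)) / len(y)
    print(mse)  # (4 + 1 + 1) / 3 = 2.0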
7 changes: 6 additions & 1 deletion dffml/cli/ml.py
@@ -1,4 +1,5 @@
import inspect
from typing import Union

from ..model.model import Model
from ..source.source import Sources, SubsetSources
@@ -15,6 +16,7 @@
)
from ..base import config, field
from ..accuracy import AccuracyScorer
from ..feature import Feature, Features


@config
@@ -28,6 +30,7 @@ class AccuracyCMDConfig:
scorer: AccuracyScorer = field(
"Method to use to score accuracy", required=True
)
features: Features = field("Predict Feature(s)", default=Features())
sources: Sources = FIELD_SOURCES


@@ -64,7 +67,9 @@ async def run(self):
# at this point rather than an instance.
if inspect.isclass(self.scorer):
self.scorer = self.scorer.withconfig(self.extra_config)
return await accuracy(self.model, self.scorer, self.sources)
return await accuracy(
self.model, self.scorer, self.features, self.sources
)


@config
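The new `-features` flag takes the same `name:dtype:length` specs used by the other feature flags. As a rough sketch (assuming the usual top-level `dffml` exports), the CLI value maps to the config like this:

    from dffml import Feature, Features

    # CLI:  -features ans:int:1
    # corresponds roughly to this config value, which run() then forwards
    # to the high-level accuracy() call.
    features = Features(Feature("ans", int, 1))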
17 changes: 15 additions & 2 deletions dffml/high_level/ml.py
@@ -3,6 +3,7 @@

from ..model import Model
from ..record import Record
from ..feature import Feature, Features
from ..source.source import BaseSource
from ..accuracy.accuracy import AccuracyScorer, AccuracyContext
from ..util.internal import records_to_sources
@@ -64,6 +65,7 @@ async def train(model, *args: Union[BaseSource, Record, Dict[str, Any]]):
async def accuracy(
model,
accuracy_scorer: Union[AccuracyScorer, AccuracyContext],
features: Union[Feature, Features],
*args: Union[BaseSource, Record, Dict[str, Any]],
) -> float:
"""
@@ -116,6 +118,7 @@ async def accuracy(
... await accuracy(
... model,
... MeanSquaredErrorAccuracy(),
... Feature("Salary", int, 1),
... {"Years": 4, "Salary": 50},
... {"Years": 5, "Salary": 60},
... ),
@@ -124,6 +127,16 @@ async def accuracy(
>>> asyncio.run(main())
Accuracy: 0.0
"""
# TODO Use this to ensure that we're always passing features before records
# We can remove it eventually once we know we've updated everywhere
# appropriately
if not isinstance(features, (Feature, Features)):
raise TypeError(
f"features was {type(features)}: {features!r}. Should have been Feature or Features"
)
if isinstance(features, Feature):
features = Features(features)

async with contextlib.AsyncExitStack() as astack:
# Open sources
sctx = await astack.enter_async_context(records_to_sources(*args))
@@ -139,8 +152,8 @@
# TODO Replace this with static type checking and maybe dynamic
# through something like pydantic. See issue #36
raise TypeError(f"{accuracy_scorer} is not an AccuracyScorer")
# Run accuracy method
return float(await actx.score(mctx, sctx))

return float(await actx.score(mctx, sctx, *features))


async def predict(
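Callers of the high-level API now pass the feature(s) to score between the scorer and the data sources. A minimal usage sketch, assuming a trained `model` instance, a `test.csv` file, and the usual top-level `dffml` exports (including `MeanSquaredErrorAccuracy`):

    from dffml import Feature, Features, accuracy
    from dffml.accuracy import MeanSquaredErrorAccuracy


    async def main(model):
        # Single predict feature: a bare Feature is wrapped into Features internally
        score = await accuracy(
            model, MeanSquaredErrorAccuracy(), Feature("Salary", int, 1), "test.csv"
        )
        print("Accuracy:", score)
        # For multi-output models a Features collection can be passed instead,
        # provided the chosen scorer accepts more than one feature:
        #   await accuracy(model, scorer, Features(Feature("a", int, 1),
        #                  Feature("b", int, 1)), "test.csv")

    # Run with asyncio.run(main(model)) once a trained model is available.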
1 change: 1 addition & 0 deletions dffml/model/slr.py
@@ -96,6 +96,7 @@ class SLRModel(SimpleModel):
-model-features f1:float:1 \
-model-predict ans:int:1 \
-model-location tempdir \
-features ans:int:1 \
-sources f=csv \
-source-filename dataset.csv \
-scorer mse \
1 change: 1 addition & 0 deletions dffml/skel/model/README.md
@@ -34,6 +34,7 @@ dffml accuracy \
-model-location tempdir \
-sources csv=iris_training.csv \
-classifications 0 1 2 \
-features label:int:1 \
-model-features \
SepalLength:float:1 \
SepalWidth:float:1 \
1 change: 1 addition & 0 deletions dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
@@ -120,6 +120,7 @@ class MySLRModel(SimpleModel):
-model-features x:float:1 \
-model-predict y:int:1 \
-model-location tempdir \
-features y:int:1 \
-sources f=csv \
-source-filename test.csv
1.0
2 changes: 1 addition & 1 deletion dffml/skel/model/examples/example_myslr.py
@@ -15,7 +15,7 @@

# Assess accuracy (alternate way of specifying data source)
scorer = MeanSquaredErrorAccuracy()
print("Accuracy:", accuracy(model, scorer, "test.csv"))
print("Accuracy:", accuracy(model, scorer, Feature("y", int, 1), "test.csv"))

# Make prediction
for i, features, prediction in predict(model, "predict.csv"):
5 changes: 4 additions & 1 deletion dffml/skel/model/tests/test_model.py
@@ -60,7 +60,10 @@ async def test_00_train(self):
async def test_01_accuracy(self):
# Use the test data to assess the model's accuracy
res = await accuracy(
self.model, self.scorer, *[{"X": x, "Y": y} for x, y in TEST_DATA]
self.model,
self.scorer,
Feature("Y", float, 1),
*[{"X": x, "Y": y} for x, y in TEST_DATA],
)
# Ensure the mean squared error is low (below 0.1)
self.assertTrue(0.0 <= res < 0.1)
24 changes: 0 additions & 24 deletions dffml/util/python.py
@@ -175,27 +175,3 @@ def within_method(obj: object, method_name: str, max_depth: int = -1) -> bool:
):
return True
return False


def no_inplace_append(list_a: list, arg_b: Union[str, list]) -> list:
"""
Append method that acts as inplace = False. Takes a list and another argument
list or string. Creates a duplicate of the first list and appends the second argument
to the duplicate. Returns the duplicate list.
Parameters
----------
list_a : list
The list to append the argument to.
arg_b : str, list
The argument to append to the list.
Returns
------
list_dup : list
Duplicate list of list_a with arg_b appended.
"""

lista_dup = list_a.copy()
lista_dup.append(arg_b)
return lista_dup
4 changes: 2 additions & 2 deletions docs/tutorials/accuracy/mse.rst
@@ -49,7 +49,7 @@ have access to the model's config and the sources records.

.. literalinclude:: /../dffml/accuracy/mse.py
:test:
:lines: 19-39
:lines: 19-35

Scorer
------
@@ -61,4 +61,4 @@ scorer in the cli.

.. literalinclude:: /../dffml/accuracy/mse.py
:test:
:lines: 42-45
:lines: 38-41
10 changes: 5 additions & 5 deletions docs/tutorials/models/docs.rst
@@ -137,7 +137,7 @@ in the console should be highlighted. There is no proper way to highlight a

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:language: rst
:lines: 99-163
:lines: 99-164
:linenos:
:lineno-start: 99

@@ -154,9 +154,9 @@ format Python files. It can't format examples within rST within a docstring.

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:language: rst
:lines: 164-171
:lines: 165-172
:linenos:
:lineno-start: 164
:lineno-start: 165

By specifying the ``:filepath:`` we copied the contents of the Python example
to the test environment's directory. The last thing we need to do is run the
@@ -167,9 +167,9 @@ another ``"""``.

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:language: rst
:lines: 173-178
:lines: 174-179
:linenos:
:lineno-start: 172
:lineno-start: 173

.. _model_tutorial_docs_testing_examples:

8 changes: 4 additions & 4 deletions docs/tutorials/models/slr.rst
@@ -101,15 +101,15 @@ We must set the ``CONFIG`` attribute to the respective ``Config`` class.

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:test:
:lines: 180-181
:lines: 181-182

We can override the ``__init__()`` method to do validation on the ``features``
config property. Simple linear regression only supports one input feature, so we
will raise a ``ValueError`` if the user supplies more than one feature.

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:test:
:lines: 183-187
:lines: 184-188

Train
-----
@@ -125,7 +125,7 @@ is saved and loaded from a JSON file on disk.

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:test:
:lines: 188-202
:lines: 190-204

Predict
-------
@@ -140,7 +140,7 @@ confidence in our prediction.

.. literalinclude:: /../dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/myslr.py
:test:
:lines: 205-224
:lines: 206-225

Python Usage
------------
1 change: 1 addition & 0 deletions examples/MNIST/accuracy.sh
@@ -8,6 +8,7 @@ dffml accuracy \
-model-location tempdir \
-model-classifications $(seq 0 9) \
-model-features image:int:$((28 * 28)) \
-features label:int:1 \
-sources images=df label=idx1 \
-source-images-dataflow normalize.yaml \
-source-images-features image:int:$((28 * 28)) \
5 changes: 4 additions & 1 deletion examples/accuracy/mse/mse.py
@@ -15,4 +15,7 @@
mse_accuracy = MeanSquaredErrorAccuracy()

# Assess accuracy (alternate way of specifying data source)
print("Accuracy:", accuracy(model, mse_accuracy, "dataset.csv"))
print(
"Accuracy:",
accuracy(model, mse_accuracy, Feature("ans", int, 1), "dataset.csv"),
)
1 change: 1 addition & 0 deletions examples/flower17/pytorch-alexnet/accuracy.sh
@@ -14,6 +14,7 @@ dffml accuracy \
-model-normalize_std 0.229 0.224 0.225 \
-model-features image:int:$((500*500)) \
-model-predict label:str:1 \
-features label:str:1 \
-sources f=dir \
-source-foldername flower_dataset/test \
-source-feature image \
1 change: 1 addition & 0 deletions examples/flower17/sklearn-opencv/accuracy.sh
@@ -7,6 +7,7 @@ dffml accuracy \
Haralick:int:13 \
-model-predict label:str:1 \
-model-location tempdir \
-features label:str:1 \
-sources images=df \
-source-images-source dir \
-source-images-source-foldername flower_dataset/test \
1 change: 1 addition & 0 deletions examples/model/slr/accuracy.sh
@@ -3,6 +3,7 @@ dffml accuracy \
-model-features f1:float:1 \
-model-predict ans:int:1 \
-model-location tempdir \
-features ans:int:1 \
-sources f=csv \
-source-filename dataset.csv \
-scorer mse \
4 changes: 3 additions & 1 deletion examples/model/slr/slr.py
@@ -13,7 +13,9 @@

# Assess accuracy (alternate way of specifying data source)
scorer = MeanSquaredErrorAccuracy()
print("Accuracy:", accuracy(model, scorer, "dataset.csv"))
print(
"Accuracy:", accuracy(model, scorer, Feature("ans", int, 1), "dataset.csv")
)

# Make prediction
for i, features, prediction in predict(model, {"f1": 0.8, "ans": 0}):
1 change: 1 addition & 0 deletions examples/nlp/accuracy.sh
@@ -7,6 +7,7 @@ dffml accuracy \
-model-classifications 0 1 \
-model-location tempdir \
-model-features embedding:float:[1,10,96] \
-features sentiment:int:1 \
-sources text=df \
-source-text-dataflow nlp_ops_dataflow.json \
-source-text-features sentence:str:1 \
1 change: 1 addition & 0 deletions examples/nlp/sklearn/accuracy.sh
@@ -3,6 +3,7 @@ dffml accuracy \
-model-features extract_array_from_matrix.outputs.result:float:1 \
-model-predict sentiment:int:1 \
-model-location tempdir \
-features sentiment:int:1 \
-sources text=df \
-source-text-dataflow nlp_ops_dataflow.json \
-source-text-features sentence:str:1 \
