From 62324bee018058e827dd03d90ecd22ad271dd468 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 11:16:44 +0530 Subject: [PATCH 01/30] Add rough class for RougeL --- keras_nlp/metrics/rouge_l.py | 173 +++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 keras_nlp/metrics/rouge_l.py diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py new file mode 100644 index 0000000000..5a56a782a8 --- /dev/null +++ b/keras_nlp/metrics/rouge_l.py @@ -0,0 +1,173 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ROUGE-L metric implementation based on `keras.metrics.Metric`.""" + +import tensorflow as tf +import tensorflow_text as tf_text +from tensorflow import keras + + +class RougeL(keras.metrics.Metric): + """ROUGE-L metric. + + This class implements the ROUGE-L metric. + + Args: + alpha: float. `alpha` is used as the weight for the + harmonic mean of precision and recall. A value of 0 means recall is + more important and a value of 1 means precision is more important + (same behaviour as + https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). + metric_type: string. One of "precision", "recall", "f1_score". Defaults + to "f1_score". + mask_token_id: int. ID of the token to be masked. If provided, the mask + is computed for this class. Note that if this field is provided, and + if the `sample_weight` field in `update_state()` is also provided, + we will compute the final `sample_weight` as the element-wise + product of the mask and the `sample_weight`. In the product, any + value >= 1 will be treated as True, and False, otherwise, for + masking. + dtype: string or tf.dtypes.Dtype. Precision of metric computation. If + not specified, it defaults to tf.float32. + name: string. Name of the metric instance. + **kwargs: Other keyword arguments. + + Examples: + + """ + + def __init__( + self, + alpha=0.5, + metric_type="f1_score", + mask_token_id=None, + dtype=None, + name="rouge_l", + **kwargs, + ): + super().__init__(name=name, dtype=dtype, **kwargs) + + if not tf.as_dtype(self.dtype).is_floating: + raise ValueError( + "`dtype` must be a floating point type. " + f"Received: dtype={dtype}" + ) + + if metric_type not in ["precision", "recall", "f1_score"]: + raise ValueError( + "`metric_type` must be one of 'precision', 'recall', " + "'f1_score'. Received: metric_type={metric_type}" + ) + + self.alpha = alpha + self.metric_type = metric_type + self.mask_token_id = mask_token_id + + self._rouge_l_score = self.add_weight( + name="rouge_l_score", + initializer="zeros", + dtype=self.dtype, + ) + self._number_of_samples = self.add_weight( + name="number_of_samples", initializer="zeros", dtype=self.dtype + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + # Both y_true and y_pred have shape: [batch_size, seq_len]. Note that + # they can also be ragged tensors with shape [num_samples, (seq_len)]. + + # If the input tensors are not ragged tensors, convert them to ragged + # tensors. `tf_text.metrics.rouge_l` expects ragged tensors. + if not isinstance(y_true, tf.RaggedTensor): + y_true = tf.RaggedTensor.from_tensor(y_true) + if not isinstance(y_pred, tf.RaggedTensor): + y_pred = tf.RaggedTensor.from_tensor(y_pred) + + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, self.dtype) + + batch_size = tf.cast(y_true.nrows(), self.dtype) + + if self.mask_token_id is not None: + mask = tf.cast( + tf.math.logical_not(tf.equal(y_true, self.mask_token_id)), + self.dtype, + ) + if sample_weight is None: + sample_weight = mask + else: + sample_weight = tf.multiply(mask, sample_weight) + + if sample_weight is not None: + sample_weight = tf.cast(sample_weight, tf.bool) + + # Apply mask to both tensors. + y_true = tf.ragged.boolean_mask(y_true, sample_weight) + y_pred = tf.ragged.boolean_mask(y_pred, sample_weight) + + f1_scores, precisions, recalls = rouge_l( + y_true, y_pred, alpha=self.alpha + ) + if self.metric_type == "precision": + scores = precisions + elif self.metric_type == "recall": + scores = recalls + else: + scores = f1_scores + self._rouge_l_score.assign_add(tf.reduce_sum(scores)) + self._number_of_samples.assign_add(batch_size) + + def result(self): + if self._number_of_samples == 0: + return 0.0 + rouge_l_score = self._rouge_l_score / self._number_of_samples + return rouge_l_score + + def reset_state(self): + self._rouge_l_score.assign(0.0) + self._number_of_samples.assign(0.0) + + def get_config(self): + config = super().get_config() + config.update( + { + "alpha": 0.5, + "metric_type": "f1_score", + "mask_token_id": self.mask_token_id, + } + ) + return config + + +def rouge_l(y_true, y_pred, alpha=0.5): + """ + Computes the ROUGE-L score. + Args: + y_true (_type_): tf.RaggedTensor. The reference summaries. + y_pred (_type_): tf.RaggedTensor. The generated summaries. + alpha (float, optional): float. Defaults to 0.5. `alpha` is used as the + weight for the harmonic mean of precision and recall. A value of 0 + means recall is more important and a value of 1 means precision is + more important (same behaviour as + https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). + + Returns: + (f1_scores, precisions, recalls): Tuple of tf.Tensor. The f1_scores, + precisions and recalls are returned for every sample. + """ + f1_scores, precisions, recalls = tf_text.metrics.rouge_l( + y_true, y_pred, alpha=alpha + ) + return f1_scores, precisions, recalls From 3bc476a7f5520368a241d48b84be5452ccfbbf24 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 11:20:17 +0530 Subject: [PATCH 02/30] Fix typos --- keras_nlp/metrics/rouge_l.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 5a56a782a8..3f3e09c381 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -143,8 +143,8 @@ def get_config(self): config = super().get_config() config.update( { - "alpha": 0.5, - "metric_type": "f1_score", + "alpha": self.alpha, + "metric_type": self.metric_type, "mask_token_id": self.mask_token_id, } ) @@ -155,12 +155,12 @@ def rouge_l(y_true, y_pred, alpha=0.5): """ Computes the ROUGE-L score. Args: - y_true (_type_): tf.RaggedTensor. The reference summaries. - y_pred (_type_): tf.RaggedTensor. The generated summaries. - alpha (float, optional): float. Defaults to 0.5. `alpha` is used as the - weight for the harmonic mean of precision and recall. A value of 0 - means recall is more important and a value of 1 means precision is - more important (same behaviour as + y_true: tf.RaggedTensor. The reference summaries. + y_pred: tf.RaggedTensor. The generated summaries. + alpha: float. Defaults to 0.5. `alpha` is used as the weight for the + harmonic mean of precision and recall. A value of 0 means recall is + more important and a value of 1 means precision is more important + (same behaviour as https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). Returns: From b09302d0f35fc3aea45a27f735b8717e930ce956 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 14:55:32 +0530 Subject: [PATCH 03/30] Correct logic --- keras_nlp/metrics/__init__.py | 1 + keras_nlp/metrics/rouge_l.py | 46 +++---- keras_nlp/metrics/rouge_l_test.py | 208 ++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 27 deletions(-) create mode 100644 keras_nlp/metrics/rouge_l_test.py diff --git a/keras_nlp/metrics/__init__.py b/keras_nlp/metrics/__init__.py index 7152a97032..71509009a3 100644 --- a/keras_nlp/metrics/__init__.py +++ b/keras_nlp/metrics/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from keras_nlp.metrics.perplexity import Perplexity +from keras_nlp.metrics.rouge_l import RougeL diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 3f3e09c381..09219e9f00 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -32,13 +32,7 @@ class RougeL(keras.metrics.Metric): https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). metric_type: string. One of "precision", "recall", "f1_score". Defaults to "f1_score". - mask_token_id: int. ID of the token to be masked. If provided, the mask - is computed for this class. Note that if this field is provided, and - if the `sample_weight` field in `update_state()` is also provided, - we will compute the final `sample_weight` as the element-wise - product of the mask and the `sample_weight`. In the product, any - value >= 1 will be treated as True, and False, otherwise, for - masking. + mask_token_ids: list of integers. IDs of the tokens to be masked. dtype: string or tf.dtypes.Dtype. Precision of metric computation. If not specified, it defaults to tf.float32. name: string. Name of the metric instance. @@ -52,7 +46,7 @@ def __init__( self, alpha=0.5, metric_type="f1_score", - mask_token_id=None, + mask_token_ids=None, dtype=None, name="rouge_l", **kwargs, @@ -65,7 +59,7 @@ def __init__( f"Received: dtype={dtype}" ) - if metric_type not in ["precision", "recall", "f1_score"]: + if metric_type not in ("precision", "recall", "f1_score"): raise ValueError( "`metric_type` must be one of 'precision', 'recall', " "'f1_score'. Received: metric_type={metric_type}" @@ -73,7 +67,7 @@ def __init__( self.alpha = alpha self.metric_type = metric_type - self.mask_token_id = mask_token_id + self.mask_token_ids = mask_token_ids self._rouge_l_score = self.add_weight( name="rouge_l_score", @@ -95,27 +89,25 @@ def update_state(self, y_true, y_pred, sample_weight=None): if not isinstance(y_pred, tf.RaggedTensor): y_pred = tf.RaggedTensor.from_tensor(y_pred) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, self.dtype) - batch_size = tf.cast(y_true.nrows(), self.dtype) - if self.mask_token_id is not None: - mask = tf.cast( - tf.math.logical_not(tf.equal(y_true, self.mask_token_id)), - self.dtype, - ) - if sample_weight is None: - sample_weight = mask - else: - sample_weight = tf.multiply(mask, sample_weight) + y_true_mask = tf.cast(tf.ones_like(y_true), tf.bool) + y_pred_mask = tf.cast(tf.ones_like(y_pred), tf.bool) - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, tf.bool) + if self.mask_token_ids is not None: + for mask_token_id in self.mask_token_ids: + y_true_mask = tf.logical_and( + y_true_mask, + tf.math.logical_not(tf.equal(y_true, mask_token_id)), + ) + y_pred_mask = tf.logical_and( + y_pred_mask, + tf.math.logical_not(tf.equal(y_pred, mask_token_id)), + ) # Apply mask to both tensors. - y_true = tf.ragged.boolean_mask(y_true, sample_weight) - y_pred = tf.ragged.boolean_mask(y_pred, sample_weight) + y_true = tf.ragged.boolean_mask(y_true, y_true_mask) + y_pred = tf.ragged.boolean_mask(y_pred, y_pred_mask) f1_scores, precisions, recalls = rouge_l( y_true, y_pred, alpha=self.alpha @@ -145,7 +137,7 @@ def get_config(self): { "alpha": self.alpha, "metric_type": self.metric_type, - "mask_token_id": self.mask_token_id, + "mask_token_ids": self.mask_token_ids, } ) return config diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py new file mode 100644 index 0000000000..7b2926941d --- /dev/null +++ b/keras_nlp/metrics/rouge_l_test.py @@ -0,0 +1,208 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for RougeL.""" + +import tensorflow as tf + +from keras_nlp.metrics import RougeL + + +class RougeLTest(tf.test.TestCase): + def test_vars_after_initializing_class(self): + rouge_l = RougeL() + self.assertEqual(rouge_l.result().numpy(), 0.0) + + def test_without_mask_token_ids(self): + rouge_l = RougeL() + y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) + y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3) + + def test_with_mask_token_ids(self): + rouge_l = RougeL(mask_token_ids=[0, 1]) + y_true = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3) + + def test_ragged_input_without_mask_token_ids(self): + rouge_l = RougeL() + y_true = tf.ragged.constant( + [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32 + ) + y_pred = tf.ragged.constant([[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.5357, delta=1e-3) + + def test_ragged_input_with_mask_token_ids(self): + rouge_l = RougeL(mask_token_ids=[0, 1]) + y_true = tf.ragged.constant( + [[1, 2, 3, 4], [1, 5, 6, 0, 0]], dtype=tf.int32 + ) + y_pred = tf.ragged.constant( + [[1, 3, 2, 4, 4, 4], [5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.583, delta=1e-3) + + def test_precision(self): + rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="precision") + y_true = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3) + + def test_recall(self): + rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="recall") + y_true = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3) + + def test_two_inputs_from_logits(self): + rouge_l = RougeL(mask_token_ids=[0, 1]) + y_true_1 = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred_1 = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l_val = rouge_l(y_true_1, y_pred_1) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3) + + y_true_2 = tf.ragged.constant( + [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32 + ) + y_pred_2 = tf.ragged.constant( + [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32 + ) + + rouge_l_val = rouge_l(y_true_2, y_pred_2) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3) + + def test_reset_state(self): + rouge_l = RougeL(mask_token_ids=[0, 1]) + y_true = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l.update_state(y_true, y_pred) + self.assertNotEqual(rouge_l.result(), 0.0) + + rouge_l.reset_state() + self.assertEqual(rouge_l.result(), 0.0) + + def test_update_state(self): + rouge_l = RougeL(mask_token_ids=[0, 1]) + y_true_1 = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred_1 = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + rouge_l.update_state(y_true_1, y_pred_1) + rouge_l_val = rouge_l.result() + self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3) + + y_true_2 = tf.ragged.constant( + [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32 + ) + y_pred_2 = tf.ragged.constant( + [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32 + ) + + rouge_l.update_state(y_true_2, y_pred_2) + rouge_l_val = rouge_l.result() + self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3) + + def test_merge_state(self): + rouge_l_1 = RougeL(mask_token_ids=[0, 1]) + rouge_l_2 = RougeL(mask_token_ids=[0, 1]) + + y_true_1 = tf.constant( + [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 + ) + y_pred_1 = tf.constant( + [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 + ) + + y_true_2 = tf.ragged.constant( + [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32 + ) + y_pred_2 = tf.ragged.constant( + [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32 + ) + + y_true_3 = tf.ragged.constant( + [[9, 8, 7, 1], [10, 5, 1, 2, 3]], dtype=tf.int32 + ) + y_pred_3 = tf.ragged.constant( + [[1, 2, 7, 9, 8, 0], [10, 1, 2]], dtype=tf.int32 + ) + + rouge_l_1.update_state(y_true_1, y_pred_1) + rouge_l_1.update_state(y_true_2, y_pred_2) + self.assertAlmostEqual(rouge_l_1.result().numpy(), 0.7014, delta=1e-3) + + rouge_l_2.update_state(y_true_3, y_pred_3) + self.assertAlmostEqual(rouge_l_2.result().numpy(), 0.6190, delta=1e-3) + + merged_rouge_l = RougeL(mask_token_ids=[0, 1]) + merged_rouge_l.merge_state([rouge_l_1, rouge_l_2]) + self.assertAlmostEqual( + merged_rouge_l.result().numpy(), 0.6739, delta=1e-3 + ) + + def test_get_config(self): + rouge_l = RougeL( + alpha=0.7, + metric_type="precision", + mask_token_ids=[0], + dtype=tf.float32, + name="rouge_l_test", + ) + config = rouge_l.get_config() + expected_config = { + "alpha": 0.7, + "metric_type": "precision", + "mask_token_ids": [0], + "dtype": tf.float32, + "name": "rouge_l_test", + } + self.assertEqual(config, expected_config) From cadbd01056661a9ed5308b91d69c94e52f1f1213 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 15:17:19 +0530 Subject: [PATCH 04/30] Add examples --- keras_nlp/metrics/rouge_l.py | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 09219e9f00..987fa6a15a 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -40,6 +40,73 @@ class RougeL(keras.metrics.Metric): Examples: + 1. Calculate RougeL (F1 Score) by calling `update_state()` and `result()`. + 1.1. `mask_token_ids` not provided. + >>> tf.random.set_seed(42) + >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l") + >>> references = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> hypotheses = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> rouge_l.update_state(references, hypotheses) + >>> rouge_l.result() + >> tf.random.set_seed(42) + >>> rouge_l = keras_nlp.metrics.RougeL( + ... name="rouge_l", mask_token_ids=[0, 1]) + >>> references = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> hypotheses = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> rouge_l.update_state(references, hypotheses) + >>> rouge_l.result() + + + 1.3. tf.RaggedTensor as input, and `mask_token_ids` not provided. + >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l") + >>> references = tf.ragged.constant( + ... [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) + >>> hypotheses = tf.ragged.constant( + ... [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) + >>> rouge_l.update_state(references, hypotheses) + >>> rouge_l.result() + + + 1.4. tf.RaggedTensor as input, and `mask_token_ids` provided. + >>> rouge_l = keras_nlp.metrics.RougeL( + ... name="rouge_l", mask_token_ids=[1, 5]) + >>> references = tf.ragged.constant( + ... [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) + >>> hypotheses = tf.ragged.constant( + ... [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) + >>> rouge_l.update_state(references, hypotheses) + >>> rouge_l.result() + + + 2. Calculate ROUGE-L directly. This has the same functionality as above. + >>> tf.random.set_seed(42) + >>> rouge_l = keras_nlp.metrics.RougeL( + ... name="rouge_l", mask_token_ids=[0, 1]) + >>> references = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> hypotheses = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> rouge_l(references, hypotheses) + + + 3. Traditionally, the ROUGE-L metric calculates the F1-score. However, if + the user wants the precision, this is how it can be done: + >>> tf.random.set_seed(42) + >>> rouge_l = keras_nlp.metrics.RougeL( + ... name="rouge_l", metric_type="precision") + >>> references = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> hypotheses = tf.random.uniform( + ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> rouge_l(references, hypotheses) + """ def __init__( From 3e767ff467f0c3b6587c8fc61bffcf72acbac636 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 16:23:06 +0530 Subject: [PATCH 05/30] Small doc-string changes --- keras_nlp/metrics/rouge_l.py | 4 ++-- keras_nlp/metrics/rouge_l_test.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 987fa6a15a..70d76f32de 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -83,7 +83,7 @@ class RougeL(keras.metrics.Metric): ... [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) >>> rouge_l.update_state(references, hypotheses) >>> rouge_l.result() - + 2. Calculate ROUGE-L directly. This has the same functionality as above. >>> tf.random.set_seed(42) @@ -227,6 +227,6 @@ def rouge_l(y_true, y_pred, alpha=0.5): precisions and recalls are returned for every sample. """ f1_scores, precisions, recalls = tf_text.metrics.rouge_l( - y_true, y_pred, alpha=alpha + y_pred, y_true, alpha=alpha ) return f1_scores, precisions, recalls diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py index 7b2926941d..892f5b8df7 100644 --- a/keras_nlp/metrics/rouge_l_test.py +++ b/keras_nlp/metrics/rouge_l_test.py @@ -76,7 +76,7 @@ def test_precision(self): ) rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3) def test_recall(self): rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="recall") @@ -88,7 +88,15 @@ def test_recall(self): ) rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3) + + def test_output_with_alpha(self): + rouge_l = RougeL(alpha=0.7) + y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) + y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) + + rouge_l_val = rouge_l(y_true, y_pred) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3) def test_two_inputs_from_logits(self): rouge_l = RougeL(mask_token_ids=[0, 1]) From b622cfe885f601d21a186efc02bf5b11b35f7e4b Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 16:40:00 +0530 Subject: [PATCH 06/30] Add alpha example --- keras_nlp/metrics/rouge_l.py | 8 ++++++++ keras_nlp/metrics/rouge_l_test.py | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 70d76f32de..7e2eb7e30a 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -107,6 +107,14 @@ class RougeL(keras.metrics.Metric): ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) >>> rouge_l(references, hypotheses) + + 4. Modify the precision vs recall importance by specifying the `alpha` + parameter. + >>> rouge_l = RougeL(name="rouge_l", alpha=0.7) + >>> y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) + >>> y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32) + >>> rouge_l(references, hypotheses) + """ def __init__( diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py index 892f5b8df7..593a5d1d2b 100644 --- a/keras_nlp/metrics/rouge_l_test.py +++ b/keras_nlp/metrics/rouge_l_test.py @@ -92,11 +92,11 @@ def test_recall(self): def test_output_with_alpha(self): rouge_l = RougeL(alpha=0.7) - y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) - y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) + y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) + y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32) rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3) + self.assertAlmostEqual(rouge_l_val.numpy(), 0.5253, delta=1e-3) def test_two_inputs_from_logits(self): rouge_l = RougeL(mask_token_ids=[0, 1]) From 38a809fbb5173a4986d96ca9ec46ad147c1e2a01 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 16:40:49 +0530 Subject: [PATCH 07/30] Small doc-string change --- keras_nlp/metrics/rouge_l.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 7e2eb7e30a..b1990e6ca8 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -108,8 +108,8 @@ class RougeL(keras.metrics.Metric): >>> rouge_l(references, hypotheses) - 4. Modify the precision vs recall importance by specifying the `alpha` - parameter. + 4. Modify the precision vs recall importance (for calculating F1-score) by + specifying the `alpha` parameter. >>> rouge_l = RougeL(name="rouge_l", alpha=0.7) >>> y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) >>> y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32) From e3bf5030d6aef1a01eba14c820a7352c7cd1b585 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 16 Apr 2022 16:48:02 +0530 Subject: [PATCH 08/30] Fix doc-string --- keras_nlp/metrics/rouge_l.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index b1990e6ca8..4a1265f2d8 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -110,11 +110,13 @@ class RougeL(keras.metrics.Metric): 4. Modify the precision vs recall importance (for calculating F1-score) by specifying the `alpha` parameter. - >>> rouge_l = RougeL(name="rouge_l", alpha=0.7) - >>> y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) - >>> y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32) + >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l", alpha=0.7) + >>> references = tf.ragged.constant( + ... [[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) + >>> hypotheses = tf.ragged.constant( + ... [[1], [5, 6, 10, 10, 10]], dtype=tf.int32) >>> rouge_l(references, hypotheses) - + """ def __init__( From d25403bc07b1ba9019ae12006de5c25efc0c971c Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sun, 17 Apr 2022 22:14:05 +0530 Subject: [PATCH 09/30] Fix f-string --- keras_nlp/metrics/rouge_l.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 4a1265f2d8..94d56cf7d9 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -138,8 +138,8 @@ def __init__( if metric_type not in ("precision", "recall", "f1_score"): raise ValueError( - "`metric_type` must be one of 'precision', 'recall', " - "'f1_score'. Received: metric_type={metric_type}" + '`metric_type` must be one of "precision", "recall", ' + f'"f1_score". Received: metric_type={metric_type}' ) self.alpha = alpha From 2f9a35cbbb124daa312ce64b6655eebb047637df Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sun, 17 Apr 2022 22:25:26 +0530 Subject: [PATCH 10/30] Minor doc-string edit --- keras_nlp/metrics/rouge_l.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 94d56cf7d9..8f7958e994 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -26,9 +26,9 @@ class RougeL(keras.metrics.Metric): Args: alpha: float. `alpha` is used as the weight for the - harmonic mean of precision and recall. A value of 0 means recall is - more important and a value of 1 means precision is more important - (same behaviour as + harmonic mean of precision and recall (for calculating F1-score). A + value of 0 means recall is more important and a value of 1 means + precision is more important (same behaviour as https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). metric_type: string. One of "precision", "recall", "f1_score". Defaults to "f1_score". @@ -227,9 +227,9 @@ def rouge_l(y_true, y_pred, alpha=0.5): y_true: tf.RaggedTensor. The reference summaries. y_pred: tf.RaggedTensor. The generated summaries. alpha: float. Defaults to 0.5. `alpha` is used as the weight for the - harmonic mean of precision and recall. A value of 0 means recall is - more important and a value of 1 means precision is more important - (same behaviour as + harmonic mean of precision and recall (for calculating F1-score). A + value of 0 means recall is more important and a value of 1 means + precision is more important (same behaviour as https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). Returns: From 9b4c1f165230f8d9fc761ebb4e35a00bb384afc6 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sun, 17 Apr 2022 22:26:01 +0530 Subject: [PATCH 11/30] Minor doc-string edit - 2 --- keras_nlp/metrics/rouge_l.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 8f7958e994..0cabbb13a0 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -34,7 +34,7 @@ class RougeL(keras.metrics.Metric): to "f1_score". mask_token_ids: list of integers. IDs of the tokens to be masked. dtype: string or tf.dtypes.Dtype. Precision of metric computation. If - not specified, it defaults to tf.float32. + not specified, it defaults to tf.float32. name: string. Name of the metric instance. **kwargs: Other keyword arguments. From c59aa74b49d0d12da11bbd683468a75abca8a934 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 20 Apr 2022 19:48:24 +0530 Subject: [PATCH 12/30] Address review comments - I --- keras_nlp/metrics/rouge_l.py | 73 ++++++++++--------------------- keras_nlp/metrics/rouge_l_test.py | 2 +- 2 files changed, 25 insertions(+), 50 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 0cabbb13a0..947040fcde 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -42,27 +42,25 @@ class RougeL(keras.metrics.Metric): 1. Calculate RougeL (F1 Score) by calling `update_state()` and `result()`. 1.1. `mask_token_ids` not provided. - >>> tf.random.set_seed(42) >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l") - >>> references = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) - >>> hypotheses = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> references = tf.constant( + ... [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) + >>> hypotheses = tf.constant( + ... [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) >>> rouge_l.update_state(references, hypotheses) >>> rouge_l.result() - >> tf.random.set_seed(42) >>> rouge_l = keras_nlp.metrics.RougeL( ... name="rouge_l", mask_token_ids=[0, 1]) - >>> references = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) - >>> hypotheses = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> references = tf.constant( + ... [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32) + >>> hypotheses = tf.constant( + ... [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32) >>> rouge_l.update_state(references, hypotheses) >>> rouge_l.result() - + 1.3. tf.RaggedTensor as input, and `mask_token_ids` not provided. >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l") @@ -89,24 +87,23 @@ class RougeL(keras.metrics.Metric): >>> tf.random.set_seed(42) >>> rouge_l = keras_nlp.metrics.RougeL( ... name="rouge_l", mask_token_ids=[0, 1]) - >>> references = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) - >>> hypotheses = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> references = tf.constant( + ... [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32) + >>> hypotheses = tf.constant( + ... [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32) >>> rouge_l(references, hypotheses) - + 3. Traditionally, the ROUGE-L metric calculates the F1-score. However, if the user wants the precision, this is how it can be done: - >>> tf.random.set_seed(42) >>> rouge_l = keras_nlp.metrics.RougeL( ... name="rouge_l", metric_type="precision") - >>> references = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) - >>> hypotheses = tf.random.uniform( - ... shape=[2,5], maxval=10, dtype=tf.int32, seed=42) + >>> references = tf.constant( + ... [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) + >>> hypotheses = tf.constant( + ... [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) >>> rouge_l(references, hypotheses) - + 4. Modify the precision vs recall importance (for calculating F1-score) by specifying the `alpha` parameter. @@ -124,7 +121,7 @@ def __init__( alpha=0.5, metric_type="f1_score", mask_token_ids=None, - dtype=None, + dtype=tf.float32, name="rouge_l", **kwargs, ): @@ -157,7 +154,7 @@ def __init__( def update_state(self, y_true, y_pred, sample_weight=None): # Both y_true and y_pred have shape: [batch_size, seq_len]. Note that - # they can also be ragged tensors with shape [num_samples, (seq_len)]. + # they can also be ragged tensors with shape [batch_size, (seq_len)]. # If the input tensors are not ragged tensors, convert them to ragged # tensors. `tf_text.metrics.rouge_l` expects ragged tensors. @@ -186,8 +183,8 @@ def update_state(self, y_true, y_pred, sample_weight=None): y_true = tf.ragged.boolean_mask(y_true, y_true_mask) y_pred = tf.ragged.boolean_mask(y_pred, y_pred_mask) - f1_scores, precisions, recalls = rouge_l( - y_true, y_pred, alpha=self.alpha + f1_scores, precisions, recalls = tf_text.metrics.rouge_l( + y_pred, y_true, alpha=self.alpha ) if self.metric_type == "precision": scores = precisions @@ -218,25 +215,3 @@ def get_config(self): } ) return config - - -def rouge_l(y_true, y_pred, alpha=0.5): - """ - Computes the ROUGE-L score. - Args: - y_true: tf.RaggedTensor. The reference summaries. - y_pred: tf.RaggedTensor. The generated summaries. - alpha: float. Defaults to 0.5. `alpha` is used as the weight for the - harmonic mean of precision and recall (for calculating F1-score). A - value of 0 means recall is more important and a value of 1 means - precision is more important (same behaviour as - https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). - - Returns: - (f1_scores, precisions, recalls): Tuple of tf.Tensor. The f1_scores, - precisions and recalls are returned for every sample. - """ - f1_scores, precisions, recalls = tf_text.metrics.rouge_l( - y_pred, y_true, alpha=alpha - ) - return f1_scores, precisions, recalls diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py index 593a5d1d2b..0fb1409499 100644 --- a/keras_nlp/metrics/rouge_l_test.py +++ b/keras_nlp/metrics/rouge_l_test.py @@ -20,7 +20,7 @@ class RougeLTest(tf.test.TestCase): - def test_vars_after_initializing_class(self): + def test_initialization(self): rouge_l = RougeL() self.assertEqual(rouge_l.result().numpy(), 0.0) From d166ab7a238c270fdb4fa55c891df4b85c14781a Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Wed, 20 Apr 2022 19:53:40 +0530 Subject: [PATCH 13/30] Minor change in examples --- keras_nlp/metrics/rouge_l.py | 1 - 1 file changed, 1 deletion(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 947040fcde..ff31bba773 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -84,7 +84,6 @@ class RougeL(keras.metrics.Metric): 2. Calculate ROUGE-L directly. This has the same functionality as above. - >>> tf.random.set_seed(42) >>> rouge_l = keras_nlp.metrics.RougeL( ... name="rouge_l", mask_token_ids=[0, 1]) >>> references = tf.constant( From 632df5d9500cc6775a931be79fd400a2adbf44b0 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Mon, 23 May 2022 21:10:46 +0530 Subject: [PATCH 14/30] Use the rouge_score package --- keras_nlp/metrics/__init__.py | 2 +- keras_nlp/metrics/rouge.py | 169 +++++++++++++++++++++++ keras_nlp/metrics/rouge_l.py | 216 ------------------------------ keras_nlp/metrics/rouge_l_test.py | 216 ------------------------------ keras_nlp/metrics/rouge_test.py | 216 ++++++++++++++++++++++++++++++ 5 files changed, 386 insertions(+), 433 deletions(-) create mode 100644 keras_nlp/metrics/rouge.py delete mode 100644 keras_nlp/metrics/rouge_l.py delete mode 100644 keras_nlp/metrics/rouge_l_test.py create mode 100644 keras_nlp/metrics/rouge_test.py diff --git a/keras_nlp/metrics/__init__.py b/keras_nlp/metrics/__init__.py index 71509009a3..2a0682138e 100644 --- a/keras_nlp/metrics/__init__.py +++ b/keras_nlp/metrics/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. from keras_nlp.metrics.perplexity import Perplexity -from keras_nlp.metrics.rouge_l import RougeL +from keras_nlp.metrics.rouge import Rouge diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py new file mode 100644 index 0000000000..089157c139 --- /dev/null +++ b/keras_nlp/metrics/rouge.py @@ -0,0 +1,169 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ROUGE metric implementation based on `keras.metrics.Metric`.""" + +import tensorflow as tf +from tensorflow import keras + +from keras_nlp.utils.tensor_utils import tensor_to_string_list + +try: + from rouge_score import rouge_scorer +except: + pass + + +class Rouge(keras.metrics.Metric): + """ROUGE metric. + + This class implements all the variants of the ROUGE metric - ROUGE-N, + ROUGE-L and ROUGE-LSum. + + Args: + variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to + "rouge2". For "rougeN", N lies in the range [1, 9]. + metric_type: string. One of "precision", "recall", "f1_score". Defaults + to "f1_score". + use_stemmer: bool. Whether Porter Stemmer should be used to strip word + suffixes to improve matching. Defaults to False. + dtype: string or tf.dtypes.Dtype. Precision of metric computation. If + not specified, it defaults to tf.float32. + name: string. Name of the metric instance. + **kwargs: Other keyword arguments. + """ + + def __init__( + self, + variant="rouge2", + metric_type="f1_score", + use_stemmer=False, + dtype=None, + name="rouge", + **kwargs, + ): + super().__init__(name=name, dtype=dtype, **kwargs) + + if rouge_scorer is None: + raise ImportError( + "ROUGE metric requires the `rouge_score` package." + "Please install it with `pip install rouge_score`." + ) + + if not tf.as_dtype(self.dtype).is_floating: + raise ValueError( + "`dtype` must be a floating point type. " + f"Received: dtype={dtype}" + ) + + if variant not in tuple( + ("rouge" + str(order) for order in range(1, 10)) + ) + ( + "rougeL", + "rougeLsum", + ): + raise ValueError( + "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, " + "rougeLsum, with N ranging from 1 to 9. Received: " + f"variant={variant}" + ) + if metric_type not in ("precision", "recall", "f1_score"): + raise ValueError( + '`metric_type` must be one of "precision", "recall", ' + f'"f1_score". Received: metric_type={metric_type}' + ) + + self.variant = variant + self.metric_type = metric_type + self.use_stemmer = use_stemmer + + # To-do: Add split_summaries and tokenizer options after the maintainers + # of rouge_scorer have released a new version. + self._rouge_scorer = rouge_scorer.RougeScorer( + rouge_types=[self.variant], + use_stemmer=use_stemmer, + ) + + self._rouge_score = self.add_weight( + name="rouge_score", + initializer="zeros", + dtype=self.dtype, + ) + self._number_of_samples = self.add_weight( + name="number_of_samples", initializer="zeros", dtype=self.dtype + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + # Both y_true and y_pred have shape: [batch_size]. Each element is a + # string. + + # Check if input is a raw string/list. + if isinstance(y_true, str): + y_true = tf.constant([y_true]) + elif isinstance(y_true, list): + y_true = tf.constant(y_true) + if isinstance(y_pred, str): + y_pred = tf.constant([y_pred]) + elif isinstance(y_pred, list): + y_pred = tf.constant(y_pred) + + batch_size = tf.shape(y_true)[0] + + def _calculate_rouge_score(reference, hypothesis): + reference = tensor_to_string_list(reference) + hypothesis = tensor_to_string_list(hypothesis) + score = self._rouge_scorer.score(reference, hypothesis)[ + self.variant + ] + + if self.metric_type == "precision": + score = score.precision + elif self.metric_type == "recall": + score = score.recall + else: + score = score.fmeasure + return score + + for batch_idx in range(batch_size): + score = tf.py_function( + func=_calculate_rouge_score, + inp=[y_true[batch_idx], y_pred[batch_idx]], + Tout=self.dtype, + ) + self._rouge_score.assign_add(score) + + self._number_of_samples.assign_add( + tf.cast(batch_size, dtype=self.dtype) + ) + + def result(self): + if self._number_of_samples == 0: + return 0.0 + rouge_l_score = self._rouge_score / self._number_of_samples + return rouge_l_score + + def reset_state(self): + self._rouge_score.assign(0.0) + self._number_of_samples.assign(0.0) + + def get_config(self): + config = super().get_config() + config.update( + { + "variant": self.variant, + "metric_type": self.metric_type, + "use_stemmer": self.use_stemmer, + } + ) + return config diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py deleted file mode 100644 index ff31bba773..0000000000 --- a/keras_nlp/metrics/rouge_l.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2022 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""ROUGE-L metric implementation based on `keras.metrics.Metric`.""" - -import tensorflow as tf -import tensorflow_text as tf_text -from tensorflow import keras - - -class RougeL(keras.metrics.Metric): - """ROUGE-L metric. - - This class implements the ROUGE-L metric. - - Args: - alpha: float. `alpha` is used as the weight for the - harmonic mean of precision and recall (for calculating F1-score). A - value of 0 means recall is more important and a value of 1 means - precision is more important (same behaviour as - https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l). - metric_type: string. One of "precision", "recall", "f1_score". Defaults - to "f1_score". - mask_token_ids: list of integers. IDs of the tokens to be masked. - dtype: string or tf.dtypes.Dtype. Precision of metric computation. If - not specified, it defaults to tf.float32. - name: string. Name of the metric instance. - **kwargs: Other keyword arguments. - - Examples: - - 1. Calculate RougeL (F1 Score) by calling `update_state()` and `result()`. - 1.1. `mask_token_ids` not provided. - >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l") - >>> references = tf.constant( - ... [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) - >>> hypotheses = tf.constant( - ... [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) - >>> rouge_l.update_state(references, hypotheses) - >>> rouge_l.result() - >> rouge_l = keras_nlp.metrics.RougeL( - ... name="rouge_l", mask_token_ids=[0, 1]) - >>> references = tf.constant( - ... [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32) - >>> hypotheses = tf.constant( - ... [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32) - >>> rouge_l.update_state(references, hypotheses) - >>> rouge_l.result() - - - 1.3. tf.RaggedTensor as input, and `mask_token_ids` not provided. - >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l") - >>> references = tf.ragged.constant( - ... [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) - >>> hypotheses = tf.ragged.constant( - ... [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) - >>> rouge_l.update_state(references, hypotheses) - >>> rouge_l.result() - - - 1.4. tf.RaggedTensor as input, and `mask_token_ids` provided. - >>> rouge_l = keras_nlp.metrics.RougeL( - ... name="rouge_l", mask_token_ids=[1, 5]) - >>> references = tf.ragged.constant( - ... [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) - >>> hypotheses = tf.ragged.constant( - ... [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) - >>> rouge_l.update_state(references, hypotheses) - >>> rouge_l.result() - - - 2. Calculate ROUGE-L directly. This has the same functionality as above. - >>> rouge_l = keras_nlp.metrics.RougeL( - ... name="rouge_l", mask_token_ids=[0, 1]) - >>> references = tf.constant( - ... [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32) - >>> hypotheses = tf.constant( - ... [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32) - >>> rouge_l(references, hypotheses) - - - 3. Traditionally, the ROUGE-L metric calculates the F1-score. However, if - the user wants the precision, this is how it can be done: - >>> rouge_l = keras_nlp.metrics.RougeL( - ... name="rouge_l", metric_type="precision") - >>> references = tf.constant( - ... [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) - >>> hypotheses = tf.constant( - ... [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) - >>> rouge_l(references, hypotheses) - - - 4. Modify the precision vs recall importance (for calculating F1-score) by - specifying the `alpha` parameter. - >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l", alpha=0.7) - >>> references = tf.ragged.constant( - ... [[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) - >>> hypotheses = tf.ragged.constant( - ... [[1], [5, 6, 10, 10, 10]], dtype=tf.int32) - >>> rouge_l(references, hypotheses) - - """ - - def __init__( - self, - alpha=0.5, - metric_type="f1_score", - mask_token_ids=None, - dtype=tf.float32, - name="rouge_l", - **kwargs, - ): - super().__init__(name=name, dtype=dtype, **kwargs) - - if not tf.as_dtype(self.dtype).is_floating: - raise ValueError( - "`dtype` must be a floating point type. " - f"Received: dtype={dtype}" - ) - - if metric_type not in ("precision", "recall", "f1_score"): - raise ValueError( - '`metric_type` must be one of "precision", "recall", ' - f'"f1_score". Received: metric_type={metric_type}' - ) - - self.alpha = alpha - self.metric_type = metric_type - self.mask_token_ids = mask_token_ids - - self._rouge_l_score = self.add_weight( - name="rouge_l_score", - initializer="zeros", - dtype=self.dtype, - ) - self._number_of_samples = self.add_weight( - name="number_of_samples", initializer="zeros", dtype=self.dtype - ) - - def update_state(self, y_true, y_pred, sample_weight=None): - # Both y_true and y_pred have shape: [batch_size, seq_len]. Note that - # they can also be ragged tensors with shape [batch_size, (seq_len)]. - - # If the input tensors are not ragged tensors, convert them to ragged - # tensors. `tf_text.metrics.rouge_l` expects ragged tensors. - if not isinstance(y_true, tf.RaggedTensor): - y_true = tf.RaggedTensor.from_tensor(y_true) - if not isinstance(y_pred, tf.RaggedTensor): - y_pred = tf.RaggedTensor.from_tensor(y_pred) - - batch_size = tf.cast(y_true.nrows(), self.dtype) - - y_true_mask = tf.cast(tf.ones_like(y_true), tf.bool) - y_pred_mask = tf.cast(tf.ones_like(y_pred), tf.bool) - - if self.mask_token_ids is not None: - for mask_token_id in self.mask_token_ids: - y_true_mask = tf.logical_and( - y_true_mask, - tf.math.logical_not(tf.equal(y_true, mask_token_id)), - ) - y_pred_mask = tf.logical_and( - y_pred_mask, - tf.math.logical_not(tf.equal(y_pred, mask_token_id)), - ) - - # Apply mask to both tensors. - y_true = tf.ragged.boolean_mask(y_true, y_true_mask) - y_pred = tf.ragged.boolean_mask(y_pred, y_pred_mask) - - f1_scores, precisions, recalls = tf_text.metrics.rouge_l( - y_pred, y_true, alpha=self.alpha - ) - if self.metric_type == "precision": - scores = precisions - elif self.metric_type == "recall": - scores = recalls - else: - scores = f1_scores - self._rouge_l_score.assign_add(tf.reduce_sum(scores)) - self._number_of_samples.assign_add(batch_size) - - def result(self): - if self._number_of_samples == 0: - return 0.0 - rouge_l_score = self._rouge_l_score / self._number_of_samples - return rouge_l_score - - def reset_state(self): - self._rouge_l_score.assign(0.0) - self._number_of_samples.assign(0.0) - - def get_config(self): - config = super().get_config() - config.update( - { - "alpha": self.alpha, - "metric_type": self.metric_type, - "mask_token_ids": self.mask_token_ids, - } - ) - return config diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py deleted file mode 100644 index 0fb1409499..0000000000 --- a/keras_nlp/metrics/rouge_l_test.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2022 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for RougeL.""" - -import tensorflow as tf - -from keras_nlp.metrics import RougeL - - -class RougeLTest(tf.test.TestCase): - def test_initialization(self): - rouge_l = RougeL() - self.assertEqual(rouge_l.result().numpy(), 0.0) - - def test_without_mask_token_ids(self): - rouge_l = RougeL() - y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32) - y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3) - - def test_with_mask_token_ids(self): - rouge_l = RougeL(mask_token_ids=[0, 1]) - y_true = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3) - - def test_ragged_input_without_mask_token_ids(self): - rouge_l = RougeL() - y_true = tf.ragged.constant( - [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32 - ) - y_pred = tf.ragged.constant([[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.5357, delta=1e-3) - - def test_ragged_input_with_mask_token_ids(self): - rouge_l = RougeL(mask_token_ids=[0, 1]) - y_true = tf.ragged.constant( - [[1, 2, 3, 4], [1, 5, 6, 0, 0]], dtype=tf.int32 - ) - y_pred = tf.ragged.constant( - [[1, 3, 2, 4, 4, 4], [5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.583, delta=1e-3) - - def test_precision(self): - rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="precision") - y_true = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3) - - def test_recall(self): - rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="recall") - y_true = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3) - - def test_output_with_alpha(self): - rouge_l = RougeL(alpha=0.7) - y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32) - y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32) - - rouge_l_val = rouge_l(y_true, y_pred) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.5253, delta=1e-3) - - def test_two_inputs_from_logits(self): - rouge_l = RougeL(mask_token_ids=[0, 1]) - y_true_1 = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred_1 = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l_val = rouge_l(y_true_1, y_pred_1) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3) - - y_true_2 = tf.ragged.constant( - [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32 - ) - y_pred_2 = tf.ragged.constant( - [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32 - ) - - rouge_l_val = rouge_l(y_true_2, y_pred_2) - self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3) - - def test_reset_state(self): - rouge_l = RougeL(mask_token_ids=[0, 1]) - y_true = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l.update_state(y_true, y_pred) - self.assertNotEqual(rouge_l.result(), 0.0) - - rouge_l.reset_state() - self.assertEqual(rouge_l.result(), 0.0) - - def test_update_state(self): - rouge_l = RougeL(mask_token_ids=[0, 1]) - y_true_1 = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred_1 = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - rouge_l.update_state(y_true_1, y_pred_1) - rouge_l_val = rouge_l.result() - self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3) - - y_true_2 = tf.ragged.constant( - [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32 - ) - y_pred_2 = tf.ragged.constant( - [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32 - ) - - rouge_l.update_state(y_true_2, y_pred_2) - rouge_l_val = rouge_l.result() - self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3) - - def test_merge_state(self): - rouge_l_1 = RougeL(mask_token_ids=[0, 1]) - rouge_l_2 = RougeL(mask_token_ids=[0, 1]) - - y_true_1 = tf.constant( - [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32 - ) - y_pred_1 = tf.constant( - [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32 - ) - - y_true_2 = tf.ragged.constant( - [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32 - ) - y_pred_2 = tf.ragged.constant( - [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32 - ) - - y_true_3 = tf.ragged.constant( - [[9, 8, 7, 1], [10, 5, 1, 2, 3]], dtype=tf.int32 - ) - y_pred_3 = tf.ragged.constant( - [[1, 2, 7, 9, 8, 0], [10, 1, 2]], dtype=tf.int32 - ) - - rouge_l_1.update_state(y_true_1, y_pred_1) - rouge_l_1.update_state(y_true_2, y_pred_2) - self.assertAlmostEqual(rouge_l_1.result().numpy(), 0.7014, delta=1e-3) - - rouge_l_2.update_state(y_true_3, y_pred_3) - self.assertAlmostEqual(rouge_l_2.result().numpy(), 0.6190, delta=1e-3) - - merged_rouge_l = RougeL(mask_token_ids=[0, 1]) - merged_rouge_l.merge_state([rouge_l_1, rouge_l_2]) - self.assertAlmostEqual( - merged_rouge_l.result().numpy(), 0.6739, delta=1e-3 - ) - - def test_get_config(self): - rouge_l = RougeL( - alpha=0.7, - metric_type="precision", - mask_token_ids=[0], - dtype=tf.float32, - name="rouge_l_test", - ) - config = rouge_l.get_config() - expected_config = { - "alpha": 0.7, - "metric_type": "precision", - "mask_token_ids": [0], - "dtype": tf.float32, - "name": "rouge_l_test", - } - self.assertEqual(config, expected_config) diff --git a/keras_nlp/metrics/rouge_test.py b/keras_nlp/metrics/rouge_test.py new file mode 100644 index 0000000000..05ca3b65cc --- /dev/null +++ b/keras_nlp/metrics/rouge_test.py @@ -0,0 +1,216 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Rouge.""" + +import tensorflow as tf + +from keras_nlp.metrics import Rouge + + +class RougeTest(tf.test.TestCase): + def test_initialization(self): + rouge = Rouge() + self.assertEqual(rouge.result().numpy(), 0.0) + + def test_string_input(self): + rouge = Rouge( + variant="rouge2", metric_type="f1_score", use_stemmer=False + ) + y_true = "hey, this is great fun" + y_pred = "great fun indeed" + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3) + + def test_string_list_input(self): + rouge = Rouge( + variant="rouge2", metric_type="f1_score", use_stemmer=False + ) + y_true = ["hey, this is great fun", "i love contributing to KerasNLP"] + y_pred = ["great fun indeed", "contributing to KerasNLP is delightful"] + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3) + + def test_tensor_input(self): + rouge = Rouge( + variant="rouge2", metric_type="f1_score", use_stemmer=False + ) + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + ["great fun indeed", "contributing to KerasNLP is delightful"] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3) + + def test_rouge_l(self): + rouge = Rouge( + variant="rougeL", metric_type="f1_score", use_stemmer=False + ) + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + ["great fun indeed", "contributing to KerasNLP is delightful"] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3) + + def test_rouge_l_sum(self): + rouge = Rouge( + variant="rougeLsum", metric_type="f1_score", use_stemmer=False + ) + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + ["great fun indeed", "contributing to KerasNLP is delightful"] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3) + + def test_incorrect_variant(self): + with self.assertRaises(ValueError): + _ = Rouge( + variant="rouge10", metric_type="f1_score", use_stemmer=False + ) + + def test_precision(self): + rouge = Rouge( + variant="rouge3", metric_type="precision", use_stemmer=False + ) + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + [ + "great fun indeed", + "KerasNLP is awesome, i love contributing to it", + ] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.167, delta=1e-3) + + def test_recall(self): + rouge = Rouge(variant="rouge3", metric_type="recall", use_stemmer=False) + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + [ + "great fun indeed", + "KerasNLP is awesome, i love contributing to it", + ] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3) + + def test_reset_state(self): + rouge = Rouge() + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + [ + "great fun indeed", + "KerasNLP is awesome, i love contributing to it", + ] + ) + + rouge.update_state(y_true, y_pred) + self.assertNotEqual(rouge.result(), 0.0) + + rouge.reset_state() + self.assertEqual(rouge.result(), 0.0) + + def test_update_state(self): + rouge = Rouge() + y_true_1 = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred_1 = tf.constant( + [ + "great fun indeed", + "KerasNLP is awesome, i love contributing to it", + ] + ) + + rouge.update_state(y_true_1, y_pred_1) + rouge_val = rouge.result() + self.assertAlmostEqual(rouge_val.numpy(), 0.439, delta=1e-3) + + y_true_2 = tf.constant(["what is your favourite show"]) + y_pred_2 = tf.constant(["my favourite show is silicon valley"]) + + rouge.update_state(y_true_2, y_pred_2) + rouge_val = rouge.result() + self.assertAlmostEqual(rouge_val.numpy(), 0.367, delta=1e-3) + + def test_merge_state(self): + rouge_1 = Rouge() + rouge_2 = Rouge() + + y_true_1 = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred_1 = tf.constant( + [ + "great fun indeed", + "KerasNLP is awesome, i love contributing to it", + ] + ) + + y_true_2 = tf.constant(["what is your favourite show"]) + y_pred_2 = tf.constant(["my favourite show is silicon valley"]) + + y_true_3 = tf.constant(["lorem ipsum dolor sit amet"]) + y_pred_3 = tf.constant(["lorem ipsum is simply dummy text"]) + + rouge_1.update_state(y_true_1, y_pred_1) + rouge_1.update_state(y_true_2, y_pred_2) + self.assertAlmostEqual(rouge_1.result().numpy(), 0.367, delta=1e-3) + + rouge_2.update_state(y_true_3, y_pred_3) + self.assertAlmostEqual(rouge_2.result().numpy(), 0.222, delta=1e-3) + + merged_rouge = Rouge() + merged_rouge.merge_state([rouge_1, rouge_2]) + self.assertAlmostEqual(merged_rouge.result().numpy(), 0.331, delta=1e-3) + + def test_get_config(self): + rouge = Rouge( + variant="rouge5", + metric_type="precision", + use_stemmer=True, + dtype=tf.float32, + name="rouge_test", + ) + + config = rouge.get_config() + expected_config = { + "variant": "rouge5", + "metric_type": "precision", + "use_stemmer": True, + "dtype": tf.float32, + "name": "rouge_test", + } + self.assertEqual(config, expected_config) From 7586e0002acbd5866c29aba71850abf13412bfc2 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Mon, 23 May 2022 21:21:40 +0530 Subject: [PATCH 15/30] Fix rouge_score import --- keras_nlp/metrics/rouge.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py index 089157c139..459de736ca 100644 --- a/keras_nlp/metrics/rouge.py +++ b/keras_nlp/metrics/rouge.py @@ -14,12 +14,14 @@ """ROUGE metric implementation based on `keras.metrics.Metric`.""" +import sys import tensorflow as tf from tensorflow import keras from keras_nlp.utils.tensor_utils import tensor_to_string_list try: + import rouge_score from rouge_score import rouge_scorer except: pass @@ -55,7 +57,7 @@ def __init__( ): super().__init__(name=name, dtype=dtype, **kwargs) - if rouge_scorer is None: + if "rouge_score" not in sys.modules: raise ImportError( "ROUGE metric requires the `rouge_score` package." "Please install it with `pip install rouge_score`." From 893aab9681b10f59dcc03795b28369e706c0ee54 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 24 May 2022 18:05:14 +0530 Subject: [PATCH 16/30] Add rouge-score to test deps list --- keras_nlp/metrics/rouge.py | 6 +++--- setup.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py index 459de736ca..e23b6a3587 100644 --- a/keras_nlp/metrics/rouge.py +++ b/keras_nlp/metrics/rouge.py @@ -23,8 +23,8 @@ try: import rouge_score from rouge_score import rouge_scorer -except: - pass +except ImportError: + rouge_score = None class Rouge(keras.metrics.Metric): @@ -57,7 +57,7 @@ def __init__( ): super().__init__(name=name, dtype=dtype, **kwargs) - if "rouge_score" not in sys.modules: + if rouge_score is None: raise ImportError( "ROUGE metric requires the `rouge_score` package." "Please install it with `pip install rouge_score`." diff --git a/setup.py b/setup.py index 371a3e08f5..fac287ae7c 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ "isort", "pytest", "pytest-cov", + "rouge-score", ], "examples": [ "datasets", # For GLUE in BERT example. From ccf33d4ac891dab50381cde4396c020f8f06b1e7 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sat, 28 May 2022 16:19:19 +0530 Subject: [PATCH 17/30] Address review comments - II --- keras_nlp/metrics/__init__.py | 3 +- keras_nlp/metrics/{rouge.py => rouge_l.py} | 77 +++---- keras_nlp/metrics/rouge_l_test.py | 196 ++++++++++++++++++ keras_nlp/metrics/rouge_n.py | 175 ++++++++++++++++ .../{rouge_test.py => rouge_n_test.py} | 165 +++++++-------- 5 files changed, 490 insertions(+), 126 deletions(-) rename keras_nlp/metrics/{rouge.py => rouge_l.py} (70%) create mode 100644 keras_nlp/metrics/rouge_l_test.py create mode 100644 keras_nlp/metrics/rouge_n.py rename keras_nlp/metrics/{rouge_test.py => rouge_n_test.py} (52%) diff --git a/keras_nlp/metrics/__init__.py b/keras_nlp/metrics/__init__.py index 2a0682138e..55ade6dc8a 100644 --- a/keras_nlp/metrics/__init__.py +++ b/keras_nlp/metrics/__init__.py @@ -13,4 +13,5 @@ # limitations under the License. from keras_nlp.metrics.perplexity import Perplexity -from keras_nlp.metrics.rouge import Rouge +from keras_nlp.metrics.rouge_l import RougeL +from keras_nlp.metrics.rouge_n import RougeN diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge_l.py similarity index 70% rename from keras_nlp/metrics/rouge.py rename to keras_nlp/metrics/rouge_l.py index e23b6a3587..79f897a2da 100644 --- a/keras_nlp/metrics/rouge.py +++ b/keras_nlp/metrics/rouge_l.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""ROUGE metric implementation based on `keras.metrics.Metric`.""" +"""ROUGE-L metric implementation based on `keras.metrics.Metric`.""" -import sys import tensorflow as tf from tensorflow import keras @@ -27,19 +26,19 @@ rouge_score = None -class Rouge(keras.metrics.Metric): - """ROUGE metric. +class RougeL(keras.metrics.Metric): + """ROUGE-L metric. - This class implements all the variants of the ROUGE metric - ROUGE-N, - ROUGE-L and ROUGE-LSum. + This class implements the ROUGE-L variant of the ROUGE metric. The ROUGE-L + metric is traditionally used for evaluating summarisation systems. + Succinctly put, ROUGE-L is a score based on the length of the longest + common subsequence present in the reference text and the hypothesis text. Args: - variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to - "rouge2". For "rougeN", N lies in the range [1, 9]. - metric_type: string. One of "precision", "recall", "f1_score". Defaults - to "f1_score". use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. + metric_type: string. One of "precision", "recall", "f1_score". Defaults + to "f1_score". dtype: string or tf.dtypes.Dtype. Precision of metric computation. If not specified, it defaults to tf.float32. name: string. Name of the metric instance. @@ -48,19 +47,18 @@ class Rouge(keras.metrics.Metric): def __init__( self, - variant="rouge2", metric_type="f1_score", use_stemmer=False, dtype=None, - name="rouge", + name="rouge-l", **kwargs, ): super().__init__(name=name, dtype=dtype, **kwargs) if rouge_score is None: raise ImportError( - "ROUGE metric requires the `rouge_score` package." - "Please install it with `pip install rouge_score`." + "ROUGE metric requires the `rouge_score` package. " + "Please install it with `pip install rouge-score`." ) if not tf.as_dtype(self.dtype).is_floating: @@ -69,36 +67,24 @@ def __init__( f"Received: dtype={dtype}" ) - if variant not in tuple( - ("rouge" + str(order) for order in range(1, 10)) - ) + ( - "rougeL", - "rougeLsum", - ): - raise ValueError( - "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, " - "rougeLsum, with N ranging from 1 to 9. Received: " - f"variant={variant}" - ) if metric_type not in ("precision", "recall", "f1_score"): raise ValueError( '`metric_type` must be one of "precision", "recall", ' f'"f1_score". Received: metric_type={metric_type}' ) - self.variant = variant self.metric_type = metric_type self.use_stemmer = use_stemmer - # To-do: Add split_summaries and tokenizer options after the maintainers - # of rouge_scorer have released a new version. - self._rouge_scorer = rouge_scorer.RougeScorer( - rouge_types=[self.variant], + # To-do: Add an option for adding custom tokenizer after the maintainers + # of rouge-score have released a new version. + self._rouge_l_scorer = rouge_scorer.RougeScorer( + rouge_types=["rougeL"], use_stemmer=use_stemmer, ) - self._rouge_score = self.add_weight( - name="rouge_score", + self._rouge_l_score = self.add_weight( + name="rouge_l_score", initializer="zeros", dtype=self.dtype, ) @@ -107,8 +93,9 @@ def __init__( ) def update_state(self, y_true, y_pred, sample_weight=None): - # Both y_true and y_pred have shape: [batch_size]. Each element is a - # string. + # Three possible shapes for y_true and y_pred: Python string, + # [batch_size] and [batch_size, 1]. In the latter two cases, we have + # strings in the tensor/list. # Check if input is a raw string/list. if isinstance(y_true, str): @@ -120,14 +107,19 @@ def update_state(self, y_true, y_pred, sample_weight=None): elif isinstance(y_pred, list): y_pred = tf.constant(y_pred) + # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to + # [batch_size]. + if y_true.shape.rank == 2: + y_true = tf.squeeze(y_true, axis=1) + if y_pred.shape.rank == 2: + y_pred = tf.squeeze(y_pred, axis=1) + batch_size = tf.shape(y_true)[0] - def _calculate_rouge_score(reference, hypothesis): + def _calculate_rouge_l_score(reference, hypothesis): reference = tensor_to_string_list(reference) hypothesis = tensor_to_string_list(hypothesis) - score = self._rouge_scorer.score(reference, hypothesis)[ - self.variant - ] + score = self._rouge_l_scorer.score(reference, hypothesis)["rougeL"] if self.metric_type == "precision": score = score.precision @@ -139,11 +131,11 @@ def _calculate_rouge_score(reference, hypothesis): for batch_idx in range(batch_size): score = tf.py_function( - func=_calculate_rouge_score, + func=_calculate_rouge_l_score, inp=[y_true[batch_idx], y_pred[batch_idx]], Tout=self.dtype, ) - self._rouge_score.assign_add(score) + self._rouge_l_score.assign_add(score) self._number_of_samples.assign_add( tf.cast(batch_size, dtype=self.dtype) @@ -152,18 +144,17 @@ def _calculate_rouge_score(reference, hypothesis): def result(self): if self._number_of_samples == 0: return 0.0 - rouge_l_score = self._rouge_score / self._number_of_samples + rouge_l_score = self._rouge_l_score / self._number_of_samples return rouge_l_score def reset_state(self): - self._rouge_score.assign(0.0) + self._rouge_l_score.assign(0.0) self._number_of_samples.assign(0.0) def get_config(self): config = super().get_config() config.update( { - "variant": self.variant, "metric_type": self.metric_type, "use_stemmer": self.use_stemmer, } diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py new file mode 100644 index 0000000000..a3e4250dae --- /dev/null +++ b/keras_nlp/metrics/rouge_l_test.py @@ -0,0 +1,196 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for RougeL.""" + +import tensorflow as tf + +from keras_nlp.metrics import RougeL + + +class RougeLTest(tf.test.TestCase): + def test_initialization(self): + rouge = RougeL() + self.assertEqual(rouge.result().numpy(), 0.0) + + def test_string_input(self): + rouge = RougeL(use_stemmer=False) + y_true = "the tiny little cat was found under the big funny bed" + y_pred = "the cat was under the bed" + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.706, delta=1e-3) + + def test_string_list_input(self): + rouge = RougeL(use_stemmer=False) + y_true = [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + y_pred = [ + "the cat was under the bed", + "i love contributing to KerasNLP", + ] + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + + def test_tensor_input(self): + rouge = RougeL(use_stemmer=False) + y_true = tf.constant( + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + ) + y_pred = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + + def test_rank_2_input(self): + rouge = RougeL(use_stemmer=False) + y_true = tf.constant( + [ + ["the tiny little cat was found under the big funny bed"], + ["i really love contributing to KerasNLP"], + ] + ) + y_pred = tf.constant( + [["the cat was under the bed"], ["i love contributing to KerasNLP"]] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + + def test_precision(self): + rouge = RougeL(metric_type="precision", use_stemmer=False) + y_true = tf.constant( + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + ) + y_pred = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 1, delta=1e-3) + + def test_recall(self): + rouge = RougeL(metric_type="recall", use_stemmer=False) + y_true = tf.constant( + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + ) + y_pred = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) + + rouge_val = rouge(y_true, y_pred) + self.assertAlmostEqual(rouge_val.numpy(), 0.689, delta=1e-3) + + def test_reset_state(self): + rouge = RougeL() + y_true = tf.constant( + ["hey, this is great fun", "i love contributing to KerasNLP"] + ) + y_pred = tf.constant( + [ + "great fun indeed", + "KerasNLP is awesome, i love contributing to it", + ] + ) + + rouge.update_state(y_true, y_pred) + self.assertNotEqual(rouge.result(), 0.0) + + rouge.reset_state() + self.assertEqual(rouge.result(), 0.0) + + def test_update_state(self): + rouge = RougeL() + y_true_1 = tf.constant( + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + ) + y_pred_1 = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) + + rouge.update_state(y_true_1, y_pred_1) + rouge_val = rouge.result() + self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + + y_true_2 = tf.constant(["what is your favourite show"]) + y_pred_2 = tf.constant(["my favourite show is silicon valley"]) + + rouge.update_state(y_true_2, y_pred_2) + rouge_val = rouge.result() + self.assertAlmostEqual(rouge_val.numpy(), 0.659, delta=1e-3) + + def test_merge_state(self): + rouge_1 = RougeL() + rouge_2 = RougeL() + + y_true_1 = tf.constant( + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + ) + y_pred_1 = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) + + y_true_2 = tf.constant(["what is your favourite show"]) + y_pred_2 = tf.constant(["my favourite show is silicon valley"]) + + y_true_3 = tf.constant(["lorem ipsum dolor sit amet"]) + y_pred_3 = tf.constant(["lorem ipsum is simply dummy text"]) + + rouge_1.update_state(y_true_1, y_pred_1) + rouge_1.update_state(y_true_2, y_pred_2) + self.assertAlmostEqual(rouge_1.result().numpy(), 0.659, delta=1e-3) + + rouge_2.update_state(y_true_3, y_pred_3) + self.assertAlmostEqual(rouge_2.result().numpy(), 0.364, delta=1e-3) + + merged_rouge = RougeL() + merged_rouge.merge_state([rouge_1, rouge_2]) + self.assertAlmostEqual(merged_rouge.result().numpy(), 0.586, delta=1e-3) + + def test_get_config(self): + rouge = RougeL( + metric_type="precision", + use_stemmer=True, + dtype=tf.float32, + name="rouge_l_test", + ) + + config = rouge.get_config() + expected_config = { + "metric_type": "precision", + "use_stemmer": True, + "dtype": tf.float32, + "name": "rouge_l_test", + } + self.assertEqual(config, expected_config) diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py new file mode 100644 index 0000000000..ad8e2288d9 --- /dev/null +++ b/keras_nlp/metrics/rouge_n.py @@ -0,0 +1,175 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ROUGE-N metric implementation based on `keras.metrics.Metric`.""" + +import tensorflow as tf +from tensorflow import keras + +from keras_nlp.utils.tensor_utils import tensor_to_string_list + +try: + import rouge_score + from rouge_score import rouge_scorer +except ImportError: + rouge_score = None + + +class RougeN(keras.metrics.Metric): + """ROUGE-N metric. + + This class implements the ROUGE-N variant of the ROUGE metric. The ROUGE-N + metric is traditionally used for evaluating summarisation systems. + Succinctly put, ROUGE-N is a score based on the number of matching n-grams + between the reference text and the hypothesis text. + + Args: + order: The order of n-grams which are to be matched. It should lie in + range [1, 9]. Defaults to 2. + metric_type: string. One of "precision", "recall", "f1_score". Defaults + to "f1_score". + use_stemmer: bool. Whether Porter Stemmer should be used to strip word + suffixes to improve matching. Defaults to False. + dtype: string or tf.dtypes.Dtype. Precision of metric computation. If + not specified, it defaults to tf.float32. + name: string. Name of the metric instance. + **kwargs: Other keyword arguments. + """ + + def __init__( + self, + order=2, + metric_type="f1_score", + use_stemmer=False, + dtype=None, + name="rouge-n", + **kwargs, + ): + super().__init__(name=name, dtype=dtype, **kwargs) + + if rouge_score is None: + raise ImportError( + "ROUGE metric requires the `rouge_score` package. " + "Please install it with `pip install rouge-score`." + ) + + if not tf.as_dtype(self.dtype).is_floating: + raise ValueError( + "`dtype` must be a floating point type. " + f"Received: dtype={dtype}" + ) + + if order not in range(1, 10): + raise ValueError( + "Invalid `order` value. Should lie in the range [1, 9]." + f"Received order={order}" + ) + + if metric_type not in ("precision", "recall", "f1_score"): + raise ValueError( + '`metric_type` must be one of "precision", "recall", ' + f'"f1_score". Received: metric_type={metric_type}' + ) + + self.order = order + self.metric_type = metric_type + self.use_stemmer = use_stemmer + + # To-do: Add an option for adding custom tokenizer after the maintainers + # of rouge-score have released a new version. + self._rouge_n_scorer = rouge_scorer.RougeScorer( + rouge_types=["rouge" + str(order)], + use_stemmer=use_stemmer, + ) + + self._rouge_n_score = self.add_weight( + name="rouge_n_score", + initializer="zeros", + dtype=self.dtype, + ) + self._number_of_samples = self.add_weight( + name="number_of_samples", initializer="zeros", dtype=self.dtype + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + # Three possible shapes for y_true and y_pred: Python string, + # [batch_size] and [batch_size, 1]. In the latter two cases, we have + # strings in the tensor/list. + + # Check if input is a raw string/list. + if isinstance(y_true, str): + y_true = tf.constant([y_true]) + elif isinstance(y_true, list): + y_true = tf.constant(y_true) + if isinstance(y_pred, str): + y_pred = tf.constant([y_pred]) + elif isinstance(y_pred, list): + y_pred = tf.constant(y_pred) + + # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to + # [batch_size]. + if y_true.shape.rank == 2: + y_true = tf.squeeze(y_true, axis=1) + if y_pred.shape.rank == 2: + y_pred = tf.squeeze(y_pred, axis=1) + + batch_size = tf.shape(y_true)[0] + + def _calculate_rouge_n_score(reference, hypothesis): + reference = tensor_to_string_list(reference) + hypothesis = tensor_to_string_list(hypothesis) + score = self._rouge_n_scorer.score(reference, hypothesis)[ + "rouge" + str(self.order) + ] + + if self.metric_type == "precision": + score = score.precision + elif self.metric_type == "recall": + score = score.recall + else: + score = score.fmeasure + return score + + for batch_idx in range(batch_size): + score = tf.py_function( + func=_calculate_rouge_n_score, + inp=[y_true[batch_idx], y_pred[batch_idx]], + Tout=self.dtype, + ) + self._rouge_n_score.assign_add(score) + + self._number_of_samples.assign_add( + tf.cast(batch_size, dtype=self.dtype) + ) + + def result(self): + if self._number_of_samples == 0: + return 0.0 + rouge_n_score = self._rouge_n_score / self._number_of_samples + return rouge_n_score + + def reset_state(self): + self._rouge_n_score.assign(0.0) + self._number_of_samples.assign(0.0) + + def get_config(self): + config = super().get_config() + config.update( + { + "order": self.order, + "metric_type": self.metric_type, + "use_stemmer": self.use_stemmer, + } + ) + return config diff --git a/keras_nlp/metrics/rouge_test.py b/keras_nlp/metrics/rouge_n_test.py similarity index 52% rename from keras_nlp/metrics/rouge_test.py rename to keras_nlp/metrics/rouge_n_test.py index 05ca3b65cc..983902f1e5 100644 --- a/keras_nlp/metrics/rouge_test.py +++ b/keras_nlp/metrics/rouge_n_test.py @@ -12,120 +12,121 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for Rouge.""" +"""Tests for RougeN.""" import tensorflow as tf -from keras_nlp.metrics import Rouge +from keras_nlp.metrics import RougeN -class RougeTest(tf.test.TestCase): +class RougeNTest(tf.test.TestCase): def test_initialization(self): - rouge = Rouge() + rouge = RougeN() self.assertEqual(rouge.result().numpy(), 0.0) def test_string_input(self): - rouge = Rouge( - variant="rouge2", metric_type="f1_score", use_stemmer=False - ) - y_true = "hey, this is great fun" - y_pred = "great fun indeed" + rouge = RougeN(order=2, use_stemmer=False) + y_true = "the tiny little cat was found under the big funny bed" + y_pred = "the cat was under the bed" rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.267, delta=1e-3) def test_string_list_input(self): - rouge = Rouge( - variant="rouge2", metric_type="f1_score", use_stemmer=False - ) - y_true = ["hey, this is great fun", "i love contributing to KerasNLP"] - y_pred = ["great fun indeed", "contributing to KerasNLP is delightful"] + rouge = RougeN(order=2, use_stemmer=False) + y_true = [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] + y_pred = [ + "the cat was under the bed", + "i love contributing to KerasNLP", + ] rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) def test_tensor_input(self): - rouge = Rouge( - variant="rouge2", metric_type="f1_score", use_stemmer=False - ) + rouge = RougeN(order=2, use_stemmer=False) y_true = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] ) y_pred = tf.constant( - ["great fun indeed", "contributing to KerasNLP is delightful"] + ["the cat was under the bed", "i love contributing to KerasNLP"] ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) - def test_rouge_l(self): - rouge = Rouge( - variant="rougeL", metric_type="f1_score", use_stemmer=False - ) + def test_rank_2_input(self): + rouge = RougeN(order=2, use_stemmer=False) y_true = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] + [ + ["the tiny little cat was found under the big funny bed"], + ["i really love contributing to KerasNLP"], + ] ) y_pred = tf.constant( - ["great fun indeed", "contributing to KerasNLP is delightful"] + [["the cat was under the bed"], ["i love contributing to KerasNLP"]] ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) - def test_rouge_l_sum(self): - rouge = Rouge( - variant="rougeLsum", metric_type="f1_score", use_stemmer=False - ) + def test_incorrect_order(self): + with self.assertRaises(ValueError): + _ = RougeN(order=10) + + def test_different_order(self): + rouge = RougeN(order=3, use_stemmer=False) y_true = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] + [ + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", + ] ) y_pred = tf.constant( - ["great fun indeed", "contributing to KerasNLP is delightful"] + ["the cat was under the bed", "i love contributing to KerasNLP"] ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3) - - def test_incorrect_variant(self): - with self.assertRaises(ValueError): - _ = Rouge( - variant="rouge10", metric_type="f1_score", use_stemmer=False - ) + self.assertAlmostEqual(rouge_val.numpy(), 0.286, delta=1e-3) def test_precision(self): - rouge = Rouge( - variant="rouge3", metric_type="precision", use_stemmer=False - ) + rouge = RougeN(order=3, metric_type="precision", use_stemmer=False) y_true = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] - ) - y_pred = tf.constant( [ - "great fun indeed", - "KerasNLP is awesome, i love contributing to it", + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", ] ) + y_pred = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.167, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3) def test_recall(self): - rouge = Rouge(variant="rouge3", metric_type="recall", use_stemmer=False) + rouge = RougeN(order=3, metric_type="recall", use_stemmer=False) y_true = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] - ) - y_pred = tf.constant( [ - "great fun indeed", - "KerasNLP is awesome, i love contributing to it", + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", ] ) + y_pred = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.25, delta=1e-3) def test_reset_state(self): - rouge = Rouge() + rouge = RougeN() y_true = tf.constant( ["hey, this is great fun", "i love contributing to KerasNLP"] ) @@ -143,41 +144,41 @@ def test_reset_state(self): self.assertEqual(rouge.result(), 0.0) def test_update_state(self): - rouge = Rouge() + rouge = RougeN() y_true_1 = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] - ) - y_pred_1 = tf.constant( [ - "great fun indeed", - "KerasNLP is awesome, i love contributing to it", + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", ] ) + y_pred_1 = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) rouge.update_state(y_true_1, y_pred_1) rouge_val = rouge.result() - self.assertAlmostEqual(rouge_val.numpy(), 0.439, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) y_true_2 = tf.constant(["what is your favourite show"]) y_pred_2 = tf.constant(["my favourite show is silicon valley"]) rouge.update_state(y_true_2, y_pred_2) rouge_val = rouge.result() - self.assertAlmostEqual(rouge_val.numpy(), 0.367, delta=1e-3) + self.assertAlmostEqual(rouge_val.numpy(), 0.385, delta=1e-3) def test_merge_state(self): - rouge_1 = Rouge() - rouge_2 = Rouge() + rouge_1 = RougeN() + rouge_2 = RougeN() y_true_1 = tf.constant( - ["hey, this is great fun", "i love contributing to KerasNLP"] - ) - y_pred_1 = tf.constant( [ - "great fun indeed", - "KerasNLP is awesome, i love contributing to it", + "the tiny little cat was found under the big funny bed", + "i really love contributing to KerasNLP", ] ) + y_pred_1 = tf.constant( + ["the cat was under the bed", "i love contributing to KerasNLP"] + ) y_true_2 = tf.constant(["what is your favourite show"]) y_pred_2 = tf.constant(["my favourite show is silicon valley"]) @@ -187,30 +188,30 @@ def test_merge_state(self): rouge_1.update_state(y_true_1, y_pred_1) rouge_1.update_state(y_true_2, y_pred_2) - self.assertAlmostEqual(rouge_1.result().numpy(), 0.367, delta=1e-3) + self.assertAlmostEqual(rouge_1.result().numpy(), 0.385, delta=1e-3) rouge_2.update_state(y_true_3, y_pred_3) self.assertAlmostEqual(rouge_2.result().numpy(), 0.222, delta=1e-3) - merged_rouge = Rouge() + merged_rouge = RougeN() merged_rouge.merge_state([rouge_1, rouge_2]) - self.assertAlmostEqual(merged_rouge.result().numpy(), 0.331, delta=1e-3) + self.assertAlmostEqual(merged_rouge.result().numpy(), 0.344, delta=1e-3) def test_get_config(self): - rouge = Rouge( - variant="rouge5", + rouge = RougeN( + order=5, metric_type="precision", use_stemmer=True, dtype=tf.float32, - name="rouge_test", + name="rouge_n_test", ) config = rouge.get_config() expected_config = { - "variant": "rouge5", + "order": 5, "metric_type": "precision", "use_stemmer": True, "dtype": tf.float32, - "name": "rouge_test", + "name": "rouge_n_test", } self.assertEqual(config, expected_config) From 748df818b6a3e5530b2110b1eb7ec483a9f547a3 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 3 Jun 2022 18:42:30 +0530 Subject: [PATCH 18/30] Address review comments - III --- keras_nlp/metrics/rouge.py | 186 ++++++++++++++++++++++++ keras_nlp/metrics/rouge_l.py | 214 +++++++++++++-------------- keras_nlp/metrics/rouge_l_test.py | 20 ++- keras_nlp/metrics/rouge_n.py | 230 ++++++++++++++++-------------- keras_nlp/metrics/rouge_n_test.py | 21 ++- 5 files changed, 443 insertions(+), 228 deletions(-) create mode 100644 keras_nlp/metrics/rouge.py diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py new file mode 100644 index 0000000000..13467d1cac --- /dev/null +++ b/keras_nlp/metrics/rouge.py @@ -0,0 +1,186 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ROUGE metric implementation based on `keras.metrics.Metric`.""" + + +import tensorflow as tf +from tensorflow import keras + +from keras_nlp.utils.tensor_utils import tensor_to_string_list + +try: + import rouge_score + from rouge_score import rouge_scorer +except ImportError: + rouge_score = None + + +class RougeBase(keras.metrics.Metric): + """ROUGE metric. + This class implements all the variants of the ROUGE metric - ROUGE-N, + ROUGE-L and ROUGE-LSum. + Args: + variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to + "rouge2". For "rougeN", N lies in the range [1, 9]. + metric_type: string. One of "precision", "recall", "f1_score". Defaults + to "f1_score". + use_stemmer: bool. Whether Porter Stemmer should be used to strip word + suffixes to improve matching. Defaults to False. + dtype: string or tf.dtypes.Dtype. Precision of metric computation. If + not specified, it defaults to tf.float32. + name: string. Name of the metric instance. + **kwargs: Other keyword arguments. + """ + + def __init__( + self, + variant="rouge2", + metric_type="f1_score", + use_stemmer=False, + dtype=None, + name="rouge", + **kwargs, + ): + super().__init__(name=name, dtype=dtype, **kwargs) + + if rouge_score is None: + raise ImportError( + "ROUGE metric requires the `rouge_score` package. " + "Please install it with `pip install rouge-score`." + ) + + if not tf.as_dtype(self.dtype).is_floating: + raise ValueError( + "`dtype` must be a floating point type. " + f"Received: dtype={dtype}" + ) + + if metric_type not in ("precision", "recall", "f1_score"): + raise ValueError( + '`metric_type` must be one of "precision", "recall", ' + f'"f1_score". Received: metric_type={metric_type}' + ) + + if variant not in tuple( + ("rouge" + str(order) for order in range(1, 10)) + ) + ( + "rougeL", + "rougeLsum", + ): + raise ValueError( + "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, " + "rougeLsum, with N ranging from 1 to 9. Received: " + f"variant={variant}" + ) + + self.variant = variant + self.metric_type = metric_type + self.use_stemmer = use_stemmer + + # To-do: Add split_summaries and tokenizer options after the maintainers + # of rouge_scorer have released a new version. + self._rouge_scorer = rouge_scorer.RougeScorer( + rouge_types=[self.variant], + use_stemmer=use_stemmer, + ) + + self._rouge_score = self.add_weight( + name="rouge_score", + initializer="zeros", + dtype=self.dtype, + ) + self._number_of_samples = self.add_weight( + name="number_of_samples", initializer="zeros", dtype=self.dtype + ) + + def update_state(self, y_true, y_pred, sample_weight=None): + # Three possible shapes for y_true and y_pred: Python string, + # [batch_size] and [batch_size, 1]. In the latter two cases, we have + # strings in the tensor/list. + + def validate_and_fix_rank(input_, tensor_name): + if not isinstance(input_, tf.Tensor): + input_ = tf.convert_to_tensor(input_) + + if input_.shape.rank == 0: + return input_[tf.newaxis] + elif input_.shape.rank == 1: + return input_ + elif input_.shape.rank == 2: + if input_.shape[1] != 1: + raise ValueError( + f"{tensor_name} must be of shape `[batch_size, 1]`. " + f"Found shape: {input_.shape}" + ) + else: + return tf.squeeze(input_, axis=1) + else: + raise ValueError( + f"{tensor_name} must be of rank 0 (scalar input), 1 or 2. " + f"Found rank: {input_.shape.rank}" + ) + + y_true = validate_and_fix_rank(y_true, "y_true") + y_pred = validate_and_fix_rank(y_pred, "y_pred") + + batch_size = tf.shape(y_true)[0] + + def calculate_rouge_score(reference, hypothesis): + reference = tensor_to_string_list(reference) + hypothesis = tensor_to_string_list(hypothesis) + score = self._rouge_scorer.score(reference, hypothesis)[ + self.variant + ] + + if self.metric_type == "precision": + score = score.precision + elif self.metric_type == "recall": + score = score.recall + else: + score = score.fmeasure + return score + + for batch_idx in range(batch_size): + score = tf.py_function( + func=calculate_rouge_score, + inp=[y_true[batch_idx], y_pred[batch_idx]], + Tout=self.dtype, + ) + self._rouge_score.assign_add(score) + + self._number_of_samples.assign_add( + tf.cast(batch_size, dtype=self.dtype) + ) + + def result(self): + if self._number_of_samples == 0: + return 0.0 + rouge_score = self._rouge_score / self._number_of_samples + return rouge_score + + def reset_state(self): + self._rouge_score.assign(0.0) + self._number_of_samples.assign(0.0) + + def get_config(self): + config = super().get_config() + config.update( + { + "variant": self.variant, + "metric_type": self.metric_type, + "use_stemmer": self.use_stemmer, + } + ) + return config diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 79f897a2da..af888f7795 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -14,19 +14,11 @@ """ROUGE-L metric implementation based on `keras.metrics.Metric`.""" -import tensorflow as tf -from tensorflow import keras -from keras_nlp.utils.tensor_utils import tensor_to_string_list +from keras_nlp.metrics.rouge import RougeBase -try: - import rouge_score - from rouge_score import rouge_scorer -except ImportError: - rouge_score = None - -class RougeL(keras.metrics.Metric): +class RougeL(RougeBase): """ROUGE-L metric. This class implements the ROUGE-L variant of the ROUGE metric. The ROUGE-L @@ -34,6 +26,14 @@ class RougeL(keras.metrics.Metric): Succinctly put, ROUGE-L is a score based on the length of the longest common subsequence present in the reference text and the hypothesis text. + Note on input shapes: + `y_true` and `y_pred` can be of the following types/shapes: + 1. Python string/scalar input + 2. Tensor/Python list + a. rank 0 + b. rank 1 (every element in the tensor is a string) + c. rank 2 (shape: `(batch_size, 1)`) + Args: use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. @@ -43,6 +43,92 @@ class RougeL(keras.metrics.Metric): not specified, it defaults to tf.float32. name: string. Name of the metric instance. **kwargs: Other keyword arguments. + + Examples: + + 1. Various Input Types. + 1.1. Python string. + >>> rouge_l = keras_nlp.metrics.RougeL() + >>> y_true = "the tiny little cat was found under the big funny bed" + >>> y_pred = "the cat was under the bed" + >>> rouge_l(y_true, y_pred) + + + 1.2. rank 1 inputs. + a. Python list. + >>> rouge_l = keras_nlp.metrics.RougeL() + >>> y_true = [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + >>> y_pred = [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + >>> + >>> rouge_l(y_true, y_pred) + + + b. Tensor + >>> rouge_l = keras_nlp.metrics.RougeL() + >>> y_true = tf.constant( + ... [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + ... ) + >>> rouge_l(y_true, y_pred) + + + 1.3. rank 2 inputs. + >>> rouge_l = keras_nlp.metrics.RougeL() + >>> y_true = tf.constant( + ... [ + ... ["the tiny little cat was found under the big funny bed"], + ... ["i really love contributing to KerasNLP"], + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... ["the cat was under the bed"], + ... ["i love contributing to KerasNLP"], + ... ] + ... ) + >>> rouge_l(y_true, y_pred) + + + 3. Output the precision instead of the F1 Score. + >>> rouge_l = keras_nlp.metrics.RougeL(metric_type="precision") + >>> y_true = tf.constant( + ... [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + ... ) + >>> rouge_l(y_true, y_pred) + + + 4. Pass the metric to `model.compile()`. + >>> inputs = keras.Input(shape=(), dtype='string') + >>> outputs = tf.strings.lower(inputs) + >>> model = keras.Model(inputs, outputs) + >>> model.compile(metrics=[keras_nlp.metrics.RougeL()]) + >>> x = tf.constant(["HELLO THIS IS FUN"]) + >>> y = tf.constant(["hello this is awesome"]) + >>> model.evaluate(x, y, return_dict=True) + {'loss': 0.0, 'rouge-l': 0.75} """ def __init__( @@ -53,110 +139,16 @@ def __init__( name="rouge-l", **kwargs, ): - super().__init__(name=name, dtype=dtype, **kwargs) - - if rouge_score is None: - raise ImportError( - "ROUGE metric requires the `rouge_score` package. " - "Please install it with `pip install rouge-score`." - ) - - if not tf.as_dtype(self.dtype).is_floating: - raise ValueError( - "`dtype` must be a floating point type. " - f"Received: dtype={dtype}" - ) - - if metric_type not in ("precision", "recall", "f1_score"): - raise ValueError( - '`metric_type` must be one of "precision", "recall", ' - f'"f1_score". Received: metric_type={metric_type}' - ) - - self.metric_type = metric_type - self.use_stemmer = use_stemmer - - # To-do: Add an option for adding custom tokenizer after the maintainers - # of rouge-score have released a new version. - self._rouge_l_scorer = rouge_scorer.RougeScorer( - rouge_types=["rougeL"], + super().__init__( + variant="rougeL", + metric_type=metric_type, use_stemmer=use_stemmer, + dtype=dtype, + name=name, + **kwargs, ) - self._rouge_l_score = self.add_weight( - name="rouge_l_score", - initializer="zeros", - dtype=self.dtype, - ) - self._number_of_samples = self.add_weight( - name="number_of_samples", initializer="zeros", dtype=self.dtype - ) - - def update_state(self, y_true, y_pred, sample_weight=None): - # Three possible shapes for y_true and y_pred: Python string, - # [batch_size] and [batch_size, 1]. In the latter two cases, we have - # strings in the tensor/list. - - # Check if input is a raw string/list. - if isinstance(y_true, str): - y_true = tf.constant([y_true]) - elif isinstance(y_true, list): - y_true = tf.constant(y_true) - if isinstance(y_pred, str): - y_pred = tf.constant([y_pred]) - elif isinstance(y_pred, list): - y_pred = tf.constant(y_pred) - - # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to - # [batch_size]. - if y_true.shape.rank == 2: - y_true = tf.squeeze(y_true, axis=1) - if y_pred.shape.rank == 2: - y_pred = tf.squeeze(y_pred, axis=1) - - batch_size = tf.shape(y_true)[0] - - def _calculate_rouge_l_score(reference, hypothesis): - reference = tensor_to_string_list(reference) - hypothesis = tensor_to_string_list(hypothesis) - score = self._rouge_l_scorer.score(reference, hypothesis)["rougeL"] - - if self.metric_type == "precision": - score = score.precision - elif self.metric_type == "recall": - score = score.recall - else: - score = score.fmeasure - return score - - for batch_idx in range(batch_size): - score = tf.py_function( - func=_calculate_rouge_l_score, - inp=[y_true[batch_idx], y_pred[batch_idx]], - Tout=self.dtype, - ) - self._rouge_l_score.assign_add(score) - - self._number_of_samples.assign_add( - tf.cast(batch_size, dtype=self.dtype) - ) - - def result(self): - if self._number_of_samples == 0: - return 0.0 - rouge_l_score = self._rouge_l_score / self._number_of_samples - return rouge_l_score - - def reset_state(self): - self._rouge_l_score.assign(0.0) - self._number_of_samples.assign(0.0) - def get_config(self): config = super().get_config() - config.update( - { - "metric_type": self.metric_type, - "use_stemmer": self.use_stemmer, - } - ) + del config["variant"] return config diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py index a3e4250dae..216686273d 100644 --- a/keras_nlp/metrics/rouge_l_test.py +++ b/keras_nlp/metrics/rouge_l_test.py @@ -15,6 +15,7 @@ """Tests for RougeL.""" import tensorflow as tf +from tensorflow import keras from keras_nlp.metrics import RougeL @@ -76,6 +77,19 @@ def test_rank_2_input(self): rouge_val = rouge(y_true, y_pred) self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + def model_compile(self): + inputs = keras.Input(shape=(), dtype="string") + outputs = tf.strings.lower(inputs) + model = keras.Model(inputs, outputs) + + model.compile(metrics=[RougeL()]) + + x = tf.constant(["HELLO THIS IS FUN"]) + y = tf.constant(["hello this is awesome"]) + + output = model.evaluate(x, y, return_dict=True) + self.assertAlmostEqual(output["rouge-l"], 0.75, delta=1e-3) + def test_precision(self): rouge = RougeL(metric_type="precision", use_stemmer=False) y_true = tf.constant( @@ -187,10 +201,8 @@ def test_get_config(self): ) config = rouge.get_config() - expected_config = { + expected_config_subset = { "metric_type": "precision", "use_stemmer": True, - "dtype": tf.float32, - "name": "rouge_l_test", } - self.assertEqual(config, expected_config) + self.assertEqual(config, {**config, **expected_config_subset}) diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index ad8e2288d9..d243a39a44 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -14,19 +14,11 @@ """ROUGE-N metric implementation based on `keras.metrics.Metric`.""" -import tensorflow as tf -from tensorflow import keras -from keras_nlp.utils.tensor_utils import tensor_to_string_list +from keras_nlp.metrics.rouge import RougeBase -try: - import rouge_score - from rouge_score import rouge_scorer -except ImportError: - rouge_score = None - -class RougeN(keras.metrics.Metric): +class RougeN(RougeBase): """ROUGE-N metric. This class implements the ROUGE-N variant of the ROUGE metric. The ROUGE-N @@ -34,6 +26,14 @@ class RougeN(keras.metrics.Metric): Succinctly put, ROUGE-N is a score based on the number of matching n-grams between the reference text and the hypothesis text. + Note on input shapes: + `y_true` and `y_pred` can be of the following types/shapes: + 1. Python string/scalar input + 2. Tensor/Python list + a. rank 0 + b. rank 1 (every element in the tensor is a string) + c. rank 2 (shape: `(batch_size, 1)`) + Args: order: The order of n-grams which are to be matched. It should lie in range [1, 9]. Defaults to 2. @@ -45,6 +45,108 @@ class RougeN(keras.metrics.Metric): not specified, it defaults to tf.float32. name: string. Name of the metric instance. **kwargs: Other keyword arguments. + + Examples: + + 1. Various Input Types. + 1.1. Python string. + >>> rouge_n = keras_nlp.metrics.RougeN(order=2) + >>> y_true = "the tiny little cat was found under the big funny bed" + >>> y_pred = "the cat was under the bed" + >>> rouge_n(y_true, y_pred) + + + 1.2. rank 1 inputs. + a. Python list. + >>> rouge_n = keras_nlp.metrics.RougeN(order=2) + >>> y_true = [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + >>> y_pred = [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + >>> rouge_n(y_true, y_pred) + + + b. Tensor. + >>> rouge_n = keras_nlp.metrics.RougeN(order=2) + >>> y_true = tf.constant( + ... [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + ... ) + >>> rouge_n(y_true, y_pred) + + + 1.3. rank 2 inputs. + >>> rouge_n = keras_nlp.metrics.RougeN(order=2) + >>> y_true = tf.constant( + ... [ + ... ["the tiny little cat was found under the big funny bed"], + ... ["i really love contributing to KerasNLP"], + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... ["the cat was under the bed"], + ... ["i love contributing to KerasNLP"], + ... ] + ... ) + >>> rouge_n(y_true, y_pred) + + + 2. Consider trigrams for calculating ROUGE-N. + >>> rouge_n = keras_nlp.metrics.RougeN(order=3) + >>> y_true = tf.constant( + ... [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + ... ) + >>> rouge_n(y_true, y_pred) + + + 3. Output the precision instead of the F1 Score. + >>> rouge_n = keras_nlp.metrics.RougeN(order=3, metric_type="precision") + >>> y_true = tf.constant( + ... [ + ... "the tiny little cat was found under the big funny bed", + ... "i really love contributing to KerasNLP", + ... ] + ... ) + >>> y_pred = tf.constant( + ... [ + ... "the cat was under the bed", + ... "i love contributing to KerasNLP", + ... ] + ... ) + >>> rouge_n(y_true, y_pred) + + + 4. Pass the metric to `model.compile()`. + >>> inputs = keras.Input(shape=(), dtype='string') + >>> outputs = tf.strings.lower(inputs) + >>> model = keras.Model(inputs, outputs) + >>> model.compile(metrics=[keras_nlp.metrics.RougeN()]) + >>> x = tf.constant(["HELLO THIS IS FUN"]) + >>> y = tf.constant(["hello this is awesome"]) + >>> model.evaluate(x, y, return_dict=True) + {'loss': 0.0, 'rouge-n': 0.6666666865348816} """ def __init__( @@ -56,120 +158,30 @@ def __init__( name="rouge-n", **kwargs, ): - super().__init__(name=name, dtype=dtype, **kwargs) - - if rouge_score is None: - raise ImportError( - "ROUGE metric requires the `rouge_score` package. " - "Please install it with `pip install rouge-score`." - ) - - if not tf.as_dtype(self.dtype).is_floating: - raise ValueError( - "`dtype` must be a floating point type. " - f"Received: dtype={dtype}" - ) - if order not in range(1, 10): raise ValueError( "Invalid `order` value. Should lie in the range [1, 9]." f"Received order={order}" ) - if metric_type not in ("precision", "recall", "f1_score"): - raise ValueError( - '`metric_type` must be one of "precision", "recall", ' - f'"f1_score". Received: metric_type={metric_type}' - ) - - self.order = order - self.metric_type = metric_type - self.use_stemmer = use_stemmer - - # To-do: Add an option for adding custom tokenizer after the maintainers - # of rouge-score have released a new version. - self._rouge_n_scorer = rouge_scorer.RougeScorer( - rouge_types=["rouge" + str(order)], + super().__init__( + variant=f"rouge{order}", + metric_type=metric_type, use_stemmer=use_stemmer, + dtype=dtype, + name=name, + **kwargs, ) - self._rouge_n_score = self.add_weight( - name="rouge_n_score", - initializer="zeros", - dtype=self.dtype, - ) - self._number_of_samples = self.add_weight( - name="number_of_samples", initializer="zeros", dtype=self.dtype - ) - - def update_state(self, y_true, y_pred, sample_weight=None): - # Three possible shapes for y_true and y_pred: Python string, - # [batch_size] and [batch_size, 1]. In the latter two cases, we have - # strings in the tensor/list. - - # Check if input is a raw string/list. - if isinstance(y_true, str): - y_true = tf.constant([y_true]) - elif isinstance(y_true, list): - y_true = tf.constant(y_true) - if isinstance(y_pred, str): - y_pred = tf.constant([y_pred]) - elif isinstance(y_pred, list): - y_pred = tf.constant(y_pred) - - # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to - # [batch_size]. - if y_true.shape.rank == 2: - y_true = tf.squeeze(y_true, axis=1) - if y_pred.shape.rank == 2: - y_pred = tf.squeeze(y_pred, axis=1) - - batch_size = tf.shape(y_true)[0] - - def _calculate_rouge_n_score(reference, hypothesis): - reference = tensor_to_string_list(reference) - hypothesis = tensor_to_string_list(hypothesis) - score = self._rouge_n_scorer.score(reference, hypothesis)[ - "rouge" + str(self.order) - ] - - if self.metric_type == "precision": - score = score.precision - elif self.metric_type == "recall": - score = score.recall - else: - score = score.fmeasure - return score - - for batch_idx in range(batch_size): - score = tf.py_function( - func=_calculate_rouge_n_score, - inp=[y_true[batch_idx], y_pred[batch_idx]], - Tout=self.dtype, - ) - self._rouge_n_score.assign_add(score) - - self._number_of_samples.assign_add( - tf.cast(batch_size, dtype=self.dtype) - ) - - def result(self): - if self._number_of_samples == 0: - return 0.0 - rouge_n_score = self._rouge_n_score / self._number_of_samples - return rouge_n_score - - def reset_state(self): - self._rouge_n_score.assign(0.0) - self._number_of_samples.assign(0.0) + self.order = order def get_config(self): config = super().get_config() + del config["variant"] + config.update( { "order": self.order, - "metric_type": self.metric_type, - "use_stemmer": self.use_stemmer, } ) return config diff --git a/keras_nlp/metrics/rouge_n_test.py b/keras_nlp/metrics/rouge_n_test.py index 983902f1e5..be008e3800 100644 --- a/keras_nlp/metrics/rouge_n_test.py +++ b/keras_nlp/metrics/rouge_n_test.py @@ -15,6 +15,7 @@ """Tests for RougeN.""" import tensorflow as tf +from tensorflow import keras from keras_nlp.metrics import RougeN @@ -76,6 +77,19 @@ def test_rank_2_input(self): rouge_val = rouge(y_true, y_pred) self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) + def model_compile(self): + inputs = keras.Input(shape=(), dtype="string") + outputs = tf.strings.lower(inputs) + model = keras.Model(inputs, outputs) + + model.compile(metrics=[RougeN()]) + + x = tf.constant(["HELLO THIS IS FUN"]) + y = tf.constant(["hello this is awesome"]) + + output = model.evaluate(x, y, return_dict=True) + self.assertAlmostEqual(output["rouge-n"], 0.667, delta=1e-3) + def test_incorrect_order(self): with self.assertRaises(ValueError): _ = RougeN(order=10) @@ -207,11 +221,10 @@ def test_get_config(self): ) config = rouge.get_config() - expected_config = { + expected_config_subset = { "order": 5, "metric_type": "precision", "use_stemmer": True, - "dtype": tf.float32, - "name": "rouge_n_test", } - self.assertEqual(config, expected_config) + + self.assertEqual(config, {**config, **expected_config_subset}) From a793d3d97d24f08735fdbc93439a353059ddb72b Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 3 Jun 2022 18:50:42 +0530 Subject: [PATCH 19/30] Fix model.compile error in doc-string --- keras_nlp/metrics/rouge_l.py | 5 +++-- keras_nlp/metrics/rouge_n.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index af888f7795..1fa0ad09fb 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -127,8 +127,9 @@ class RougeL(RougeBase): >>> model.compile(metrics=[keras_nlp.metrics.RougeL()]) >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) - >>> model.evaluate(x, y, return_dict=True) - {'loss': 0.0, 'rouge-l': 0.75} + >>> metric_dict = model.evaluate(x, y, return_dict=True) + >>> metric_dict["rouge-l"] + 0.75 """ def __init__( diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index d243a39a44..37a86f5207 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -145,8 +145,9 @@ class RougeN(RougeBase): >>> model.compile(metrics=[keras_nlp.metrics.RougeN()]) >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) - >>> model.evaluate(x, y, return_dict=True) - {'loss': 0.0, 'rouge-n': 0.6666666865348816} + >>> metric_dict = model.evaluate(x, y, return_dict=True) + >>> metric_dict["rouge-n"] + 0.6666666865348816 """ def __init__( From b8dae75b58cee6cc280319d2c9f4eb4f1d962843 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Sun, 5 Jun 2022 13:26:01 +0530 Subject: [PATCH 20/30] Rename rouge.py to rouge_base.py --- keras_nlp/metrics/{rouge.py => rouge_base.py} | 0 keras_nlp/metrics/rouge_l.py | 2 +- keras_nlp/metrics/rouge_n.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename keras_nlp/metrics/{rouge.py => rouge_base.py} (100%) diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge_base.py similarity index 100% rename from keras_nlp/metrics/rouge.py rename to keras_nlp/metrics/rouge_base.py diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 1fa0ad09fb..3d18b5e1e0 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -15,7 +15,7 @@ """ROUGE-L metric implementation based on `keras.metrics.Metric`.""" -from keras_nlp.metrics.rouge import RougeBase +from keras_nlp.metrics.rouge_base import RougeBase class RougeL(RougeBase): diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index 37a86f5207..8ab24b76e8 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -15,7 +15,7 @@ """ROUGE-N metric implementation based on `keras.metrics.Metric`.""" -from keras_nlp.metrics.rouge import RougeBase +from keras_nlp.metrics.rouge_base import RougeBase class RougeN(RougeBase): From 80500863b58e0a03b01cebe3007882bf20567f3a Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 7 Jun 2022 14:18:19 +0530 Subject: [PATCH 21/30] Address review comments - IV --- keras_nlp/metrics/rouge_base.py | 38 ++++++++++++++++++--------------- keras_nlp/metrics/rouge_l.py | 8 ++----- keras_nlp/metrics/rouge_n.py | 8 ++----- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 13467d1cac..1340474aef 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -29,8 +29,14 @@ class RougeBase(keras.metrics.Metric): """ROUGE metric. + This class implements all the variants of the ROUGE metric - ROUGE-N, ROUGE-L and ROUGE-LSum. + + Note on input shapes: + For `y_true` and `y_pred`, this class supports scalar values and batch + inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`. + Args: variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to "rouge2". For "rougeN", N lies in the range [1, 9]. @@ -39,7 +45,7 @@ class RougeBase(keras.metrics.Metric): use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. dtype: string or tf.dtypes.Dtype. Precision of metric computation. If - not specified, it defaults to tf.float32. + not specified, it defaults to tf.float32. name: string. Name of the metric instance. **kwargs: Other keyword arguments. """ @@ -76,12 +82,10 @@ def __init__( if variant not in tuple( ("rouge" + str(order) for order in range(1, 10)) ) + ( - "rougeL", - "rougeLsum", - ): + "rougeL",): raise ValueError( "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, " - "rougeLsum, with N ranging from 1 to 9. Received: " + "with N ranging from 1 to 9. Received: " f"variant={variant}" ) @@ -110,26 +114,26 @@ def update_state(self, y_true, y_pred, sample_weight=None): # [batch_size] and [batch_size, 1]. In the latter two cases, we have # strings in the tensor/list. - def validate_and_fix_rank(input_, tensor_name): - if not isinstance(input_, tf.Tensor): - input_ = tf.convert_to_tensor(input_) + def validate_and_fix_rank(inputs, tensor_name): + if not isinstance(inputs, tf.Tensor): + inputs = tf.convert_to_tensor(inputs) - if input_.shape.rank == 0: - return input_[tf.newaxis] - elif input_.shape.rank == 1: - return input_ - elif input_.shape.rank == 2: - if input_.shape[1] != 1: + if inputs.shape.rank == 0: + return inputs[tf.newaxis] + elif inputs.shape.rank == 1: + return inputs + elif inputs.shape.rank == 2: + if inputs.shape[1] != 1: raise ValueError( f"{tensor_name} must be of shape `[batch_size, 1]`. " - f"Found shape: {input_.shape}" + f"Found shape: {inputs.shape}" ) else: - return tf.squeeze(input_, axis=1) + return tf.squeeze(inputs, axis=1) else: raise ValueError( f"{tensor_name} must be of rank 0 (scalar input), 1 or 2. " - f"Found rank: {input_.shape.rank}" + f"Found rank: {inputs.shape.rank}" ) y_true = validate_and_fix_rank(y_true, "y_true") diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 3d18b5e1e0..3bfb77caa7 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -27,12 +27,8 @@ class RougeL(RougeBase): common subsequence present in the reference text and the hypothesis text. Note on input shapes: - `y_true` and `y_pred` can be of the following types/shapes: - 1. Python string/scalar input - 2. Tensor/Python list - a. rank 0 - b. rank 1 (every element in the tensor is a string) - c. rank 2 (shape: `(batch_size, 1)`) + For `y_true` and `y_pred`, this class supports scalar values and batch + inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`. Args: use_stemmer: bool. Whether Porter Stemmer should be used to strip word diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index 8ab24b76e8..57d611d928 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -27,12 +27,8 @@ class RougeN(RougeBase): between the reference text and the hypothesis text. Note on input shapes: - `y_true` and `y_pred` can be of the following types/shapes: - 1. Python string/scalar input - 2. Tensor/Python list - a. rank 0 - b. rank 1 (every element in the tensor is a string) - c. rank 2 (shape: `(batch_size, 1)`) + For `y_true` and `y_pred`, this class supports scalar values and batch + inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`. Args: order: The order of n-grams which are to be matched. It should lie in From da44d22335d927d68922ad9df6f43d30dfe56f6c Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Tue, 7 Jun 2022 14:19:04 +0530 Subject: [PATCH 22/30] Address review comments - IV --- keras_nlp/metrics/rouge_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 1340474aef..41bfa722cf 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -81,8 +81,7 @@ def __init__( if variant not in tuple( ("rouge" + str(order) for order in range(1, 10)) - ) + ( - "rougeL",): + ) + ("rougeL",): raise ValueError( "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, " "with N ranging from 1 to 9. Received: " From f8c05aacc34cfa111321173bf5298bd184347a9f Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 10 Jun 2022 11:24:46 +0530 Subject: [PATCH 23/30] Return dict from ROUGE --- keras_nlp/metrics/rouge_base.py | 85 ++++++++++++------ keras_nlp/metrics/rouge_l.py | 67 +++++++------- keras_nlp/metrics/rouge_l_test.py | 134 ++++++++++++++++++---------- keras_nlp/metrics/rouge_n.py | 70 ++++++++------- keras_nlp/metrics/rouge_n_test.py | 142 ++++++++++++++++++++---------- 5 files changed, 317 insertions(+), 181 deletions(-) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 41bfa722cf..069ef14226 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -15,6 +15,8 @@ """ROUGE metric implementation based on `keras.metrics.Metric`.""" +import types + import tensorflow as tf from tensorflow import keras @@ -40,8 +42,6 @@ class RougeBase(keras.metrics.Metric): Args: variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to "rouge2". For "rougeN", N lies in the range [1, 9]. - metric_type: string. One of "precision", "recall", "f1_score". Defaults - to "f1_score". use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. dtype: string or tf.dtypes.Dtype. Precision of metric computation. If @@ -53,7 +53,6 @@ class RougeBase(keras.metrics.Metric): def __init__( self, variant="rouge2", - metric_type="f1_score", use_stemmer=False, dtype=None, name="rouge", @@ -73,12 +72,6 @@ def __init__( f"Received: dtype={dtype}" ) - if metric_type not in ("precision", "recall", "f1_score"): - raise ValueError( - '`metric_type` must be one of "precision", "recall", ' - f'"f1_score". Received: metric_type={metric_type}' - ) - if variant not in tuple( ("rouge" + str(order) for order in range(1, 10)) ) + ("rougeL",): @@ -89,7 +82,6 @@ def __init__( ) self.variant = variant - self.metric_type = metric_type self.use_stemmer = use_stemmer # To-do: Add split_summaries and tokenizer options after the maintainers @@ -99,15 +91,46 @@ def __init__( use_stemmer=use_stemmer, ) - self._rouge_score = self.add_weight( - name="rouge_score", + self._rouge_precision = self.add_weight( + name="rouge_precision", initializer="zeros", dtype=self.dtype, ) + self._rouge_recall = self.add_weight( + name="rouge_recall", + initializer="zeros", + dtype=self.dtype, + ) + self._rouge_f1_score = self.add_weight( + name="rouge_f1_score", + initializer="zeros", + dtype=self.dtype, + ) + self._number_of_samples = self.add_weight( name="number_of_samples", initializer="zeros", dtype=self.dtype ) + def __new__(cls, *args, **kwargs): + # Temporary workaround for Keras bug with dictionary return types. + # Wraps `result()` with a python dictionary that also supports variable + # assignment. We have to do this with __new__ because the base metric + # class wraps the `results()` method. + obj = super().__new__(cls) + + class MetricDict(dict): + """A dictionary that supports variable assignment.""" + + pass + + def wrap_result(result_fn): + return tf.__internal__.decorator.make_decorator( + result_fn, lambda obj, *args: MetricDict(result_fn(*args)) + ) + + obj.result = types.MethodType(wrap_result(obj.result), obj) + return obj + def update_state(self, y_true, y_pred, sample_weight=None): # Three possible shapes for y_true and y_pred: Python string, # [batch_size] and [batch_size, 1]. In the latter two cases, we have @@ -146,14 +169,10 @@ def calculate_rouge_score(reference, hypothesis): score = self._rouge_scorer.score(reference, hypothesis)[ self.variant ] - - if self.metric_type == "precision": - score = score.precision - elif self.metric_type == "recall": - score = score.recall - else: - score = score.fmeasure - return score + return tf.cast( + tf.constant([score.precision, score.recall, score.fmeasure]), + dtype=self.dtype, + ) for batch_idx in range(batch_size): score = tf.py_function( @@ -161,7 +180,9 @@ def calculate_rouge_score(reference, hypothesis): inp=[y_true[batch_idx], y_pred[batch_idx]], Tout=self.dtype, ) - self._rouge_score.assign_add(score) + self._rouge_precision.assign_add(score[0]) + self._rouge_recall.assign_add(score[1]) + self._rouge_f1_score.assign_add(score[2]) self._number_of_samples.assign_add( tf.cast(batch_size, dtype=self.dtype) @@ -169,12 +190,25 @@ def calculate_rouge_score(reference, hypothesis): def result(self): if self._number_of_samples == 0: - return 0.0 - rouge_score = self._rouge_score / self._number_of_samples - return rouge_score + return { + f"{self.name}_precision": 0.0, + f"{self.name}_recall": 0.0, + f"{self.name}_f1_score": 0.0, + } + + rouge_precision = self._rouge_precision / self._number_of_samples + rouge_recall = self._rouge_recall / self._number_of_samples + rouge_f1_score = self._rouge_f1_score / self._number_of_samples + return { + f"{self.name}_precision": rouge_precision, + f"{self.name}_recall": rouge_recall, + f"{self.name}_f1_score": rouge_f1_score, + } def reset_state(self): - self._rouge_score.assign(0.0) + self._rouge_precision.assign(0.0) + self._rouge_recall.assign(0.0) + self._rouge_f1_score.assign(0.0) self._number_of_samples.assign(0.0) def get_config(self): @@ -182,7 +216,6 @@ def get_config(self): config.update( { "variant": self.variant, - "metric_type": self.metric_type, "use_stemmer": self.use_stemmer, } ) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 3bfb77caa7..0c1a907d7c 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -33,8 +33,6 @@ class RougeL(RougeBase): Args: use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. - metric_type: string. One of "precision", "recall", "f1_score". Defaults - to "f1_score". dtype: string or tf.dtypes.Dtype. Precision of metric computation. If not specified, it defaults to tf.float32. name: string. Name of the metric instance. @@ -48,7 +46,13 @@ class RougeL(RougeBase): >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" >>> rouge_l(y_true, y_pred) - + { + 'rouge-l_precision': , + 'rouge-l_recall': + , + 'rouge-l_f1_score': + + } 1.2. rank 1 inputs. a. Python list. @@ -61,9 +65,14 @@ class RougeL(RougeBase): ... "the cat was under the bed", ... "i love contributing to KerasNLP", ... ] - >>> >>> rouge_l(y_true, y_pred) - + { + 'rouge-l_precision': , + 'rouge-l_recall': + , + 'rouge-l_f1_score': + + } b. Tensor >>> rouge_l = keras_nlp.metrics.RougeL() @@ -80,7 +89,13 @@ class RougeL(RougeBase): ... ] ... ) >>> rouge_l(y_true, y_pred) - + { + 'rouge-l_precision': , + 'rouge-l_recall': + , + 'rouge-l_f1_score': + + } 1.3. rank 2 inputs. >>> rouge_l = keras_nlp.metrics.RougeL() @@ -97,26 +112,15 @@ class RougeL(RougeBase): ... ] ... ) >>> rouge_l(y_true, y_pred) - - - 3. Output the precision instead of the F1 Score. - >>> rouge_l = keras_nlp.metrics.RougeL(metric_type="precision") - >>> y_true = tf.constant( - ... [ - ... "the tiny little cat was found under the big funny bed", - ... "i really love contributing to KerasNLP", - ... ] - ... ) - >>> y_pred = tf.constant( - ... [ - ... "the cat was under the bed", - ... "i love contributing to KerasNLP", - ... ] - ... ) - >>> rouge_l(y_true, y_pred) - - - 4. Pass the metric to `model.compile()`. + { + 'rouge-l_precision': , + 'rouge-l_recall': + , + 'rouge-l_f1_score': + + } + + 3. Pass the metric to `model.compile()`. >>> inputs = keras.Input(shape=(), dtype='string') >>> outputs = tf.strings.lower(inputs) >>> model = keras.Model(inputs, outputs) @@ -124,13 +128,17 @@ class RougeL(RougeBase): >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) - >>> metric_dict["rouge-l"] - 0.75 + >>> metric_dict + { + 'loss': 0.0, + 'rouge-l_precision': 0.75, + 'rouge-l_recall': 0.75, + 'rouge-l_f1_score': 0.75 + } """ def __init__( self, - metric_type="f1_score", use_stemmer=False, dtype=None, name="rouge-l", @@ -138,7 +146,6 @@ def __init__( ): super().__init__( variant="rougeL", - metric_type=metric_type, use_stemmer=use_stemmer, dtype=dtype, name=name, diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py index 216686273d..af3f306990 100644 --- a/keras_nlp/metrics/rouge_l_test.py +++ b/keras_nlp/metrics/rouge_l_test.py @@ -21,9 +21,20 @@ class RougeLTest(tf.test.TestCase): + def setUp(self): + super().setUp() + self.metric_types = ( + "rouge-l_precision", + "rouge-l_recall", + "rouge-l_f1_score", + ) + def test_initialization(self): rouge = RougeL() - self.assertEqual(rouge.result().numpy(), 0.0) + result = rouge.result() + + for metric_type in self.metric_types: + self.assertEqual(result[metric_type].numpy(), 0.0) def test_string_input(self): rouge = RougeL(use_stemmer=False) @@ -31,7 +42,12 @@ def test_string_input(self): y_pred = "the cat was under the bed" rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.706, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [1, 0.545, 0.706] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_string_list_input(self): rouge = RougeL(use_stemmer=False) @@ -45,7 +61,12 @@ def test_string_list_input(self): ] rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [1, 0.689, 0.807] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_tensor_input(self): rouge = RougeL(use_stemmer=False) @@ -60,7 +81,12 @@ def test_tensor_input(self): ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [1, 0.689, 0.807] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_rank_2_input(self): rouge = RougeL(use_stemmer=False) @@ -75,9 +101,14 @@ def test_rank_2_input(self): ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) - - def model_compile(self): + for metric_type, expected_val in zip( + self.metric_types, [1, 0.689, 0.807] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) + + def test_model_compile(self): inputs = keras.Input(shape=(), dtype="string") outputs = tf.strings.lower(inputs) model = keras.Model(inputs, outputs) @@ -88,37 +119,12 @@ def model_compile(self): y = tf.constant(["hello this is awesome"]) output = model.evaluate(x, y, return_dict=True) - self.assertAlmostEqual(output["rouge-l"], 0.75, delta=1e-3) - - def test_precision(self): - rouge = RougeL(metric_type="precision", use_stemmer=False) - y_true = tf.constant( - [ - "the tiny little cat was found under the big funny bed", - "i really love contributing to KerasNLP", - ] - ) - y_pred = tf.constant( - ["the cat was under the bed", "i love contributing to KerasNLP"] - ) - - rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 1, delta=1e-3) - - def test_recall(self): - rouge = RougeL(metric_type="recall", use_stemmer=False) - y_true = tf.constant( - [ - "the tiny little cat was found under the big funny bed", - "i really love contributing to KerasNLP", - ] - ) - y_pred = tf.constant( - ["the cat was under the bed", "i love contributing to KerasNLP"] - ) - - rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.689, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.75, 0.75, 0.75] + ): + self.assertAlmostEqual( + output[metric_type], expected_val, delta=1e-3 + ) def test_reset_state(self): rouge = RougeL() @@ -133,10 +139,18 @@ def test_reset_state(self): ) rouge.update_state(y_true, y_pred) - self.assertNotEqual(rouge.result(), 0.0) + rouge_val = rouge.result() + for metric_type, unexpected_val in zip( + self.metric_types, [0.0, 0.0, 0.0] + ): + self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val) rouge.reset_state() - self.assertEqual(rouge.result(), 0.0) + rouge_val = rouge.result() + for metric_type, unexpected_val in zip( + self.metric_types, [0.0, 0.0, 0.0] + ): + self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val) def test_update_state(self): rouge = RougeL() @@ -152,14 +166,24 @@ def test_update_state(self): rouge.update_state(y_true_1, y_pred_1) rouge_val = rouge.result() - self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [1, 0.689, 0.807] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) y_true_2 = tf.constant(["what is your favourite show"]) y_pred_2 = tf.constant(["my favourite show is silicon valley"]) rouge.update_state(y_true_2, y_pred_2) rouge_val = rouge.result() - self.assertAlmostEqual(rouge_val.numpy(), 0.659, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.778, 0.593, 0.66] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_merge_state(self): rouge_1 = RougeL() @@ -183,18 +207,35 @@ def test_merge_state(self): rouge_1.update_state(y_true_1, y_pred_1) rouge_1.update_state(y_true_2, y_pred_2) - self.assertAlmostEqual(rouge_1.result().numpy(), 0.659, delta=1e-3) + rouge_val = rouge_1.result() + for metric_type, expected_val in zip( + self.metric_types, [0.778, 0.593, 0.66] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) rouge_2.update_state(y_true_3, y_pred_3) - self.assertAlmostEqual(rouge_2.result().numpy(), 0.364, delta=1e-3) + rouge_val = rouge_2.result() + for metric_type, expected_val in zip( + self.metric_types, [0.333, 0.4, 0.364] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) merged_rouge = RougeL() merged_rouge.merge_state([rouge_1, rouge_2]) - self.assertAlmostEqual(merged_rouge.result().numpy(), 0.586, delta=1e-3) + rouge_val = merged_rouge.result() + for metric_type, expected_val in zip( + self.metric_types, [0.667, 0.545, 0.586] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_get_config(self): rouge = RougeL( - metric_type="precision", use_stemmer=True, dtype=tf.float32, name="rouge_l_test", @@ -202,7 +243,6 @@ def test_get_config(self): config = rouge.get_config() expected_config_subset = { - "metric_type": "precision", "use_stemmer": True, } self.assertEqual(config, {**config, **expected_config_subset}) diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index 57d611d928..4030066ecd 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -33,8 +33,6 @@ class RougeN(RougeBase): Args: order: The order of n-grams which are to be matched. It should lie in range [1, 9]. Defaults to 2. - metric_type: string. One of "precision", "recall", "f1_score". Defaults - to "f1_score". use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. dtype: string or tf.dtypes.Dtype. Precision of metric computation. If @@ -50,7 +48,12 @@ class RougeN(RougeBase): >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" >>> rouge_n(y_true, y_pred) - + { + 'rouge-n_precision': , + 'rouge-n_recall': , + 'rouge-n_f1_score': + + } 1.2. rank 1 inputs. a. Python list. @@ -64,7 +67,12 @@ class RougeN(RougeBase): ... "i love contributing to KerasNLP", ... ] >>> rouge_n(y_true, y_pred) - + { + 'rouge-n_precision': , + 'rouge-n_recall': , + 'rouge-n_f1_score': + + } b. Tensor. >>> rouge_n = keras_nlp.metrics.RougeN(order=2) @@ -81,7 +89,12 @@ class RougeN(RougeBase): ... ] ... ) >>> rouge_n(y_true, y_pred) - + { + 'rouge-n_precision': , + 'rouge-n_recall': , + 'rouge-n_f1_score': + + } 1.3. rank 2 inputs. >>> rouge_n = keras_nlp.metrics.RougeN(order=2) @@ -98,7 +111,12 @@ class RougeN(RougeBase): ... ] ... ) >>> rouge_n(y_true, y_pred) - + { + 'rouge-n_precision': , + 'rouge-n_recall': , + 'rouge-n_f1_score': + + } 2. Consider trigrams for calculating ROUGE-N. >>> rouge_n = keras_nlp.metrics.RougeN(order=3) @@ -115,26 +133,15 @@ class RougeN(RougeBase): ... ] ... ) >>> rouge_n(y_true, y_pred) - - - 3. Output the precision instead of the F1 Score. - >>> rouge_n = keras_nlp.metrics.RougeN(order=3, metric_type="precision") - >>> y_true = tf.constant( - ... [ - ... "the tiny little cat was found under the big funny bed", - ... "i really love contributing to KerasNLP", - ... ] - ... ) - >>> y_pred = tf.constant( - ... [ - ... "the cat was under the bed", - ... "i love contributing to KerasNLP", - ... ] - ... ) - >>> rouge_n(y_true, y_pred) - - - 4. Pass the metric to `model.compile()`. + { + 'rouge-n_precision': + , + 'rouge-n_recall': , + 'rouge-n_f1_score': + + } + + 3. Pass the metric to `model.compile()`. >>> inputs = keras.Input(shape=(), dtype='string') >>> outputs = tf.strings.lower(inputs) >>> model = keras.Model(inputs, outputs) @@ -142,14 +149,18 @@ class RougeN(RougeBase): >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) - >>> metric_dict["rouge-n"] - 0.6666666865348816 + >>> metric_dict + { + 'loss': 0.0, + 'rouge-n_precision': 0.6666666865348816, + 'rouge-n_recall': 0.6666666865348816, + 'rouge-n_f1_score': 0.6666666865348816 + } """ def __init__( self, order=2, - metric_type="f1_score", use_stemmer=False, dtype=None, name="rouge-n", @@ -163,7 +174,6 @@ def __init__( super().__init__( variant=f"rouge{order}", - metric_type=metric_type, use_stemmer=use_stemmer, dtype=dtype, name=name, diff --git a/keras_nlp/metrics/rouge_n_test.py b/keras_nlp/metrics/rouge_n_test.py index be008e3800..8537876a66 100644 --- a/keras_nlp/metrics/rouge_n_test.py +++ b/keras_nlp/metrics/rouge_n_test.py @@ -21,9 +21,20 @@ class RougeNTest(tf.test.TestCase): + def setUp(self): + super().setUp() + self.metric_types = ( + "rouge-n_precision", + "rouge-n_recall", + "rouge-n_f1_score", + ) + def test_initialization(self): rouge = RougeN() - self.assertEqual(rouge.result().numpy(), 0.0) + result = rouge.result() + + for metric_type in self.metric_types: + self.assertEqual(result[metric_type].numpy(), 0.0) def test_string_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -31,7 +42,12 @@ def test_string_input(self): y_pred = "the cat was under the bed" rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.267, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.4, 0.2, 0.267] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_string_list_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -45,7 +61,12 @@ def test_string_list_input(self): ] rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.575, 0.4, 0.467] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_tensor_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -60,7 +81,12 @@ def test_tensor_input(self): ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.575, 0.4, 0.467] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_rank_2_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -75,9 +101,14 @@ def test_rank_2_input(self): ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) - - def model_compile(self): + for metric_type, expected_val in zip( + self.metric_types, [0.575, 0.4, 0.467] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) + + def test_model_compile(self): inputs = keras.Input(shape=(), dtype="string") outputs = tf.strings.lower(inputs) model = keras.Model(inputs, outputs) @@ -88,7 +119,13 @@ def model_compile(self): y = tf.constant(["hello this is awesome"]) output = model.evaluate(x, y, return_dict=True) - self.assertAlmostEqual(output["rouge-n"], 0.667, delta=1e-3) + + for metric_type, expected_val in zip( + self.metric_types, [0.667, 0.667, 0.667] + ): + self.assertAlmostEqual( + output[metric_type], expected_val, delta=1e-3 + ) def test_incorrect_order(self): with self.assertRaises(ValueError): @@ -107,37 +144,12 @@ def test_different_order(self): ) rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.286, delta=1e-3) - - def test_precision(self): - rouge = RougeN(order=3, metric_type="precision", use_stemmer=False) - y_true = tf.constant( - [ - "the tiny little cat was found under the big funny bed", - "i really love contributing to KerasNLP", - ] - ) - y_pred = tf.constant( - ["the cat was under the bed", "i love contributing to KerasNLP"] - ) - - rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3) - - def test_recall(self): - rouge = RougeN(order=3, metric_type="recall", use_stemmer=False) - y_true = tf.constant( - [ - "the tiny little cat was found under the big funny bed", - "i really love contributing to KerasNLP", - ] - ) - y_pred = tf.constant( - ["the cat was under the bed", "i love contributing to KerasNLP"] - ) - - rouge_val = rouge(y_true, y_pred) - self.assertAlmostEqual(rouge_val.numpy(), 0.25, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.333, 0.25, 0.286] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_reset_state(self): rouge = RougeN() @@ -152,10 +164,18 @@ def test_reset_state(self): ) rouge.update_state(y_true, y_pred) - self.assertNotEqual(rouge.result(), 0.0) + rouge_val = rouge.result() + for metric_type, unexpected_val in zip( + self.metric_types, [0.0, 0.0, 0.0] + ): + self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val) rouge.reset_state() - self.assertEqual(rouge.result(), 0.0) + rouge_val = rouge.result() + for metric_type, unexpected_val in zip( + self.metric_types, [0.0, 0.0, 0.0] + ): + self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val) def test_update_state(self): rouge = RougeN() @@ -171,14 +191,24 @@ def test_update_state(self): rouge.update_state(y_true_1, y_pred_1) rouge_val = rouge.result() - self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.575, 0.4, 0.467] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) y_true_2 = tf.constant(["what is your favourite show"]) y_pred_2 = tf.constant(["my favourite show is silicon valley"]) rouge.update_state(y_true_2, y_pred_2) rouge_val = rouge.result() - self.assertAlmostEqual(rouge_val.numpy(), 0.385, delta=1e-3) + for metric_type, expected_val in zip( + self.metric_types, [0.45, 0.35, 0.385] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_merge_state(self): rouge_1 = RougeN() @@ -202,19 +232,36 @@ def test_merge_state(self): rouge_1.update_state(y_true_1, y_pred_1) rouge_1.update_state(y_true_2, y_pred_2) - self.assertAlmostEqual(rouge_1.result().numpy(), 0.385, delta=1e-3) + rouge_val = rouge_1.result() + for metric_type, expected_val in zip( + self.metric_types, [0.45, 0.35, 0.385] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) rouge_2.update_state(y_true_3, y_pred_3) - self.assertAlmostEqual(rouge_2.result().numpy(), 0.222, delta=1e-3) + rouge_val = rouge_2.result() + for metric_type, expected_val in zip( + self.metric_types, [0.2, 0.25, 0.222] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) merged_rouge = RougeN() merged_rouge.merge_state([rouge_1, rouge_2]) - self.assertAlmostEqual(merged_rouge.result().numpy(), 0.344, delta=1e-3) + rouge_val = merged_rouge.result() + for metric_type, expected_val in zip( + self.metric_types, [0.388, 0.325, 0.344] + ): + self.assertAlmostEqual( + rouge_val[metric_type].numpy(), expected_val, delta=1e-3 + ) def test_get_config(self): rouge = RougeN( order=5, - metric_type="precision", use_stemmer=True, dtype=tf.float32, name="rouge_n_test", @@ -223,7 +270,6 @@ def test_get_config(self): config = rouge.get_config() expected_config_subset = { "order": 5, - "metric_type": "precision", "use_stemmer": True, } From f4df42b7b6111cbde95f77722b2020b9d3512a67 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 10 Jun 2022 11:57:41 +0530 Subject: [PATCH 24/30] Fix doc-strings --- keras_nlp/metrics/rouge_l.py | 39 +++++--------------------------- keras_nlp/metrics/rouge_n.py | 43 +++++------------------------------- 2 files changed, 11 insertions(+), 71 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 0c1a907d7c..bcbbaec0b3 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -46,13 +46,7 @@ class RougeL(RougeBase): >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" >>> rouge_l(y_true, y_pred) - { - 'rouge-l_precision': , - 'rouge-l_recall': - , - 'rouge-l_f1_score': - - } + {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } 1.2. rank 1 inputs. a. Python list. @@ -66,13 +60,7 @@ class RougeL(RougeBase): ... "i love contributing to KerasNLP", ... ] >>> rouge_l(y_true, y_pred) - { - 'rouge-l_precision': , - 'rouge-l_recall': - , - 'rouge-l_f1_score': - - } + {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } b. Tensor >>> rouge_l = keras_nlp.metrics.RougeL() @@ -89,13 +77,7 @@ class RougeL(RougeBase): ... ] ... ) >>> rouge_l(y_true, y_pred) - { - 'rouge-l_precision': , - 'rouge-l_recall': - , - 'rouge-l_f1_score': - - } + {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } 1.3. rank 2 inputs. >>> rouge_l = keras_nlp.metrics.RougeL() @@ -112,13 +94,7 @@ class RougeL(RougeBase): ... ] ... ) >>> rouge_l(y_true, y_pred) - { - 'rouge-l_precision': , - 'rouge-l_recall': - , - 'rouge-l_f1_score': - - } + {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } 3. Pass the metric to `model.compile()`. >>> inputs = keras.Input(shape=(), dtype='string') @@ -129,12 +105,7 @@ class RougeL(RougeBase): >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) >>> metric_dict - { - 'loss': 0.0, - 'rouge-l_precision': 0.75, - 'rouge-l_recall': 0.75, - 'rouge-l_f1_score': 0.75 - } + {'loss': 0.0, 'rouge-l_precision': 0.75, 'rouge-l_recall': 0.75, 'rouge-l_f1_score': 0.75} """ def __init__( diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index 4030066ecd..e7ab222467 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -48,12 +48,7 @@ class RougeN(RougeBase): >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" >>> rouge_n(y_true, y_pred) - { - 'rouge-n_precision': , - 'rouge-n_recall': , - 'rouge-n_f1_score': - - } + {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } 1.2. rank 1 inputs. a. Python list. @@ -67,12 +62,7 @@ class RougeN(RougeBase): ... "i love contributing to KerasNLP", ... ] >>> rouge_n(y_true, y_pred) - { - 'rouge-n_precision': , - 'rouge-n_recall': , - 'rouge-n_f1_score': - - } + {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } b. Tensor. >>> rouge_n = keras_nlp.metrics.RougeN(order=2) @@ -89,12 +79,7 @@ class RougeN(RougeBase): ... ] ... ) >>> rouge_n(y_true, y_pred) - { - 'rouge-n_precision': , - 'rouge-n_recall': , - 'rouge-n_f1_score': - - } + {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } 1.3. rank 2 inputs. >>> rouge_n = keras_nlp.metrics.RougeN(order=2) @@ -111,12 +96,7 @@ class RougeN(RougeBase): ... ] ... ) >>> rouge_n(y_true, y_pred) - { - 'rouge-n_precision': , - 'rouge-n_recall': , - 'rouge-n_f1_score': - - } + {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } 2. Consider trigrams for calculating ROUGE-N. >>> rouge_n = keras_nlp.metrics.RougeN(order=3) @@ -133,13 +113,7 @@ class RougeN(RougeBase): ... ] ... ) >>> rouge_n(y_true, y_pred) - { - 'rouge-n_precision': - , - 'rouge-n_recall': , - 'rouge-n_f1_score': - - } + {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } 3. Pass the metric to `model.compile()`. >>> inputs = keras.Input(shape=(), dtype='string') @@ -150,12 +124,7 @@ class RougeN(RougeBase): >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) >>> metric_dict - { - 'loss': 0.0, - 'rouge-n_precision': 0.6666666865348816, - 'rouge-n_recall': 0.6666666865348816, - 'rouge-n_f1_score': 0.6666666865348816 - } + {'loss': 0.0, 'rouge-n_precision': 0.6666666865348816, 'rouge-n_recall': 0.6666666865348816, 'rouge-n_f1_score': 0.6666666865348816} """ def __init__( From 723d8e7e7067dd803a005f15891e5cdd1ca0d78b Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 10 Jun 2022 23:47:06 +0530 Subject: [PATCH 25/30] Truncate doc-string example output --- keras_nlp/metrics/rouge_l.py | 20 ++++++++++---------- keras_nlp/metrics/rouge_n.py | 24 ++++++++++++------------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index bcbbaec0b3..8e54b9a8d5 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -45,8 +45,8 @@ class RougeL(RougeBase): >>> rouge_l = keras_nlp.metrics.RougeL() >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" - >>> rouge_l(y_true, y_pred) - {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } + >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + 1.2. rank 1 inputs. a. Python list. @@ -59,8 +59,8 @@ class RougeL(RougeBase): ... "the cat was under the bed", ... "i love contributing to KerasNLP", ... ] - >>> rouge_l(y_true, y_pred) - {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } + >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + b. Tensor >>> rouge_l = keras_nlp.metrics.RougeL() @@ -76,8 +76,8 @@ class RougeL(RougeBase): ... "i love contributing to KerasNLP", ... ] ... ) - >>> rouge_l(y_true, y_pred) - {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } + >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + 1.3. rank 2 inputs. >>> rouge_l = keras_nlp.metrics.RougeL() @@ -93,8 +93,8 @@ class RougeL(RougeBase): ... ["i love contributing to KerasNLP"], ... ] ... ) - >>> rouge_l(y_true, y_pred) - {'rouge-l_precision': , 'rouge-l_recall': , 'rouge-l_f1_score': } + >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + 3. Pass the metric to `model.compile()`. >>> inputs = keras.Input(shape=(), dtype='string') @@ -104,8 +104,8 @@ class RougeL(RougeBase): >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) - >>> metric_dict - {'loss': 0.0, 'rouge-l_precision': 0.75, 'rouge-l_recall': 0.75, 'rouge-l_f1_score': 0.75} + >>> metric_dict["rouge-l_f1_score"] + 0.75 """ def __init__( diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index e7ab222467..b1eb059ec2 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -47,8 +47,8 @@ class RougeN(RougeBase): >>> rouge_n = keras_nlp.metrics.RougeN(order=2) >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" - >>> rouge_n(y_true, y_pred) - {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } + >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + 1.2. rank 1 inputs. a. Python list. @@ -61,8 +61,8 @@ class RougeN(RougeBase): ... "the cat was under the bed", ... "i love contributing to KerasNLP", ... ] - >>> rouge_n(y_true, y_pred) - {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } + >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + b. Tensor. >>> rouge_n = keras_nlp.metrics.RougeN(order=2) @@ -78,8 +78,8 @@ class RougeN(RougeBase): ... "i love contributing to KerasNLP", ... ] ... ) - >>> rouge_n(y_true, y_pred) - {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } + >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + 1.3. rank 2 inputs. >>> rouge_n = keras_nlp.metrics.RougeN(order=2) @@ -95,8 +95,8 @@ class RougeN(RougeBase): ... ["i love contributing to KerasNLP"], ... ] ... ) - >>> rouge_n(y_true, y_pred) - {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } + >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + 2. Consider trigrams for calculating ROUGE-N. >>> rouge_n = keras_nlp.metrics.RougeN(order=3) @@ -112,8 +112,8 @@ class RougeN(RougeBase): ... "i love contributing to KerasNLP", ... ] ... ) - >>> rouge_n(y_true, y_pred) - {'rouge-n_precision': , 'rouge-n_recall': , 'rouge-n_f1_score': } + >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + 3. Pass the metric to `model.compile()`. >>> inputs = keras.Input(shape=(), dtype='string') @@ -123,8 +123,8 @@ class RougeN(RougeBase): >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) - >>> metric_dict - {'loss': 0.0, 'rouge-n_precision': 0.6666666865348816, 'rouge-n_recall': 0.6666666865348816, 'rouge-n_f1_score': 0.6666666865348816} + >>> metric_dict["rouge-n_f1_score"] + 0.6666666865348816 """ def __init__( From b0fe8bc3c972bb9bbb350a4de67052b3ee711ff1 Mon Sep 17 00:00:00 2001 From: Abheesht Date: Thu, 16 Jun 2022 17:03:32 +0530 Subject: [PATCH 26/30] Remove ROUGE-LSum from doc-string --- keras_nlp/metrics/rouge_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 069ef14226..59f4d4aba2 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -32,8 +32,8 @@ class RougeBase(keras.metrics.Metric): """ROUGE metric. - This class implements all the variants of the ROUGE metric - ROUGE-N, - ROUGE-L and ROUGE-LSum. + This class implements two variants of the ROUGE metric - ROUGE-N, + and ROUGE-L. Note on input shapes: For `y_true` and `y_pred`, this class supports scalar values and batch From 7250617b2c8934f1b454cd92245c11ad0f1680a0 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 17 Jun 2022 00:36:15 +0530 Subject: [PATCH 27/30] Small doc-string changes --- keras_nlp/metrics/rouge_base.py | 4 ++-- keras_nlp/metrics/rouge_l.py | 2 +- keras_nlp/metrics/rouge_n.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 59f4d4aba2..8e5158b79c 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -37,10 +37,10 @@ class RougeBase(keras.metrics.Metric): Note on input shapes: For `y_true` and `y_pred`, this class supports scalar values and batch - inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`. + inputs of shapes `()`, `(batch_size,)` and `(batch_size, 1)`. Args: - variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to + variant: string. One of "rougeN", "rougeL". Defaults to "rouge2". For "rougeN", N lies in the range [1, 9]. use_stemmer: bool. Whether Porter Stemmer should be used to strip word suffixes to improve matching. Defaults to False. diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index 8e54b9a8d5..aff177f93d 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -28,7 +28,7 @@ class RougeL(RougeBase): Note on input shapes: For `y_true` and `y_pred`, this class supports scalar values and batch - inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`. + inputs of shapes `()`, `(batch_size,)` and `(batch_size, 1)`. Args: use_stemmer: bool. Whether Porter Stemmer should be used to strip word diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index b1eb059ec2..dc31630ff2 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -28,7 +28,7 @@ class RougeN(RougeBase): Note on input shapes: For `y_true` and `y_pred`, this class supports scalar values and batch - inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`. + inputs of shapes `()`, `(batch_size,)` and `(batch_size, 1)`. Args: order: The order of n-grams which are to be matched. It should lie in From 3c5b3dc8ff7b150b6aa9dbd6caeace40ae7f9c9d Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 17 Jun 2022 09:04:30 +0530 Subject: [PATCH 28/30] Add TODO comment for dict return bug --- keras_nlp/metrics/rouge_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 8e5158b79c..92294b0d10 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -116,6 +116,7 @@ def __new__(cls, *args, **kwargs): # Wraps `result()` with a python dictionary that also supports variable # assignment. We have to do this with __new__ because the base metric # class wraps the `results()` method. + # TODO: Remove this snippet of code once the Keras bug is fixed. obj = super().__new__(cls) class MetricDict(dict): From 4fa518ab13802a12dab7110ee751ebb1909de5d1 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 17 Jun 2022 13:11:06 +0530 Subject: [PATCH 29/30] Address review comments - V --- keras_nlp/metrics/rouge_base.py | 12 +-- keras_nlp/metrics/rouge_l_test.py | 130 ++++++++++++--------------- keras_nlp/metrics/rouge_n_test.py | 142 +++++++++++++----------------- 3 files changed, 121 insertions(+), 163 deletions(-) diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index 92294b0d10..22d4adf3b8 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -192,18 +192,18 @@ def calculate_rouge_score(reference, hypothesis): def result(self): if self._number_of_samples == 0: return { - f"{self.name}_precision": 0.0, - f"{self.name}_recall": 0.0, - f"{self.name}_f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + "f1_score": 0.0, } rouge_precision = self._rouge_precision / self._number_of_samples rouge_recall = self._rouge_recall / self._number_of_samples rouge_f1_score = self._rouge_f1_score / self._number_of_samples return { - f"{self.name}_precision": rouge_precision, - f"{self.name}_recall": rouge_recall, - f"{self.name}_f1_score": rouge_f1_score, + "precision": rouge_precision, + "recall": rouge_recall, + "f1_score": rouge_f1_score, } def reset_state(self): diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py index af3f306990..d130e12190 100644 --- a/keras_nlp/metrics/rouge_l_test.py +++ b/keras_nlp/metrics/rouge_l_test.py @@ -23,18 +23,27 @@ class RougeLTest(tf.test.TestCase): def setUp(self): super().setUp() - self.metric_types = ( - "rouge-l_precision", - "rouge-l_recall", - "rouge-l_f1_score", - ) + + def assertDictAlmostEqual(d1, d2, delta=1e-3, typecast_to_numpy=True): + for key, val in d1.items(): + if typecast_to_numpy: + val = val.numpy() + self.assertAlmostEqual(val, d2[key], delta=delta) + + def assertDictAllValuesNotEqual(d1, d2): + for key, val in d1.items(): + self.assertNotEqual(val, d2[key]) + + self.assertDictAlmostEqual = assertDictAlmostEqual + self.assertDictAllValuesNotEqual = assertDictAllValuesNotEqual def test_initialization(self): rouge = RougeL() result = rouge.result() - for metric_type in self.metric_types: - self.assertEqual(result[metric_type].numpy(), 0.0) + self.assertDictEqual( + result, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0} + ) def test_string_input(self): rouge = RougeL(use_stemmer=False) @@ -42,12 +51,9 @@ def test_string_input(self): y_pred = "the cat was under the bed" rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [1, 0.545, 0.706] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 1.0, "recall": 0.545, "f1_score": 0.706} + ) def test_string_list_input(self): rouge = RougeL(use_stemmer=False) @@ -61,12 +67,9 @@ def test_string_list_input(self): ] rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [1, 0.689, 0.807] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807} + ) def test_tensor_input(self): rouge = RougeL(use_stemmer=False) @@ -81,12 +84,9 @@ def test_tensor_input(self): ) rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [1, 0.689, 0.807] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807} + ) def test_rank_2_input(self): rouge = RougeL(use_stemmer=False) @@ -101,12 +101,9 @@ def test_rank_2_input(self): ) rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [1, 0.689, 0.807] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807} + ) def test_model_compile(self): inputs = keras.Input(shape=(), dtype="string") @@ -119,12 +116,12 @@ def test_model_compile(self): y = tf.constant(["hello this is awesome"]) output = model.evaluate(x, y, return_dict=True) - for metric_type, expected_val in zip( - self.metric_types, [0.75, 0.75, 0.75] - ): - self.assertAlmostEqual( - output[metric_type], expected_val, delta=1e-3 - ) + del output["loss"] + self.assertDictAlmostEqual( + output, + {"precision": 0.75, "recall": 0.75, "f1_score": 0.75}, + typecast_to_numpy=False, + ) def test_reset_state(self): rouge = RougeL() @@ -140,17 +137,15 @@ def test_reset_state(self): rouge.update_state(y_true, y_pred) rouge_val = rouge.result() - for metric_type, unexpected_val in zip( - self.metric_types, [0.0, 0.0, 0.0] - ): - self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val) + self.assertDictAllValuesNotEqual( + rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0} + ) rouge.reset_state() rouge_val = rouge.result() - for metric_type, unexpected_val in zip( - self.metric_types, [0.0, 0.0, 0.0] - ): - self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val) + self.assertDictEqual( + rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0} + ) def test_update_state(self): rouge = RougeL() @@ -166,24 +161,18 @@ def test_update_state(self): rouge.update_state(y_true_1, y_pred_1) rouge_val = rouge.result() - for metric_type, expected_val in zip( - self.metric_types, [1, 0.689, 0.807] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807} + ) y_true_2 = tf.constant(["what is your favourite show"]) y_pred_2 = tf.constant(["my favourite show is silicon valley"]) rouge.update_state(y_true_2, y_pred_2) rouge_val = rouge.result() - for metric_type, expected_val in zip( - self.metric_types, [0.778, 0.593, 0.66] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.778, "recall": 0.593, "f1_score": 0.66} + ) def test_merge_state(self): rouge_1 = RougeL() @@ -208,31 +197,22 @@ def test_merge_state(self): rouge_1.update_state(y_true_1, y_pred_1) rouge_1.update_state(y_true_2, y_pred_2) rouge_val = rouge_1.result() - for metric_type, expected_val in zip( - self.metric_types, [0.778, 0.593, 0.66] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.778, "recall": 0.593, "f1_score": 0.66} + ) rouge_2.update_state(y_true_3, y_pred_3) rouge_val = rouge_2.result() - for metric_type, expected_val in zip( - self.metric_types, [0.333, 0.4, 0.364] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.333, "recall": 0.4, "f1_score": 0.364} + ) merged_rouge = RougeL() merged_rouge.merge_state([rouge_1, rouge_2]) rouge_val = merged_rouge.result() - for metric_type, expected_val in zip( - self.metric_types, [0.667, 0.545, 0.586] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.667, "recall": 0.545, "f1_score": 0.586} + ) def test_get_config(self): rouge = RougeL( diff --git a/keras_nlp/metrics/rouge_n_test.py b/keras_nlp/metrics/rouge_n_test.py index 8537876a66..2183afe3fe 100644 --- a/keras_nlp/metrics/rouge_n_test.py +++ b/keras_nlp/metrics/rouge_n_test.py @@ -23,18 +23,27 @@ class RougeNTest(tf.test.TestCase): def setUp(self): super().setUp() - self.metric_types = ( - "rouge-n_precision", - "rouge-n_recall", - "rouge-n_f1_score", - ) + + def assertDictAlmostEqual(d1, d2, delta=1e-3, typecast_to_numpy=True): + for key, val in d1.items(): + if typecast_to_numpy: + val = val.numpy() + self.assertAlmostEqual(val, d2[key], delta=delta) + + def assertDictAllValuesNotEqual(d1, d2): + for key, val in d1.items(): + self.assertNotEqual(val, d2[key]) + + self.assertDictAlmostEqual = assertDictAlmostEqual + self.assertDictAllValuesNotEqual = assertDictAllValuesNotEqual def test_initialization(self): rouge = RougeN() result = rouge.result() - for metric_type in self.metric_types: - self.assertEqual(result[metric_type].numpy(), 0.0) + self.assertDictEqual( + result, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0} + ) def test_string_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -42,12 +51,9 @@ def test_string_input(self): y_pred = "the cat was under the bed" rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [0.4, 0.2, 0.267] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.4, "recall": 0.2, "f1_score": 0.267} + ) def test_string_list_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -61,12 +67,9 @@ def test_string_list_input(self): ] rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [0.575, 0.4, 0.467] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467} + ) def test_tensor_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -81,12 +84,9 @@ def test_tensor_input(self): ) rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [0.575, 0.4, 0.467] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467} + ) def test_rank_2_input(self): rouge = RougeN(order=2, use_stemmer=False) @@ -101,12 +101,9 @@ def test_rank_2_input(self): ) rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [0.575, 0.4, 0.467] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467} + ) def test_model_compile(self): inputs = keras.Input(shape=(), dtype="string") @@ -119,13 +116,12 @@ def test_model_compile(self): y = tf.constant(["hello this is awesome"]) output = model.evaluate(x, y, return_dict=True) - - for metric_type, expected_val in zip( - self.metric_types, [0.667, 0.667, 0.667] - ): - self.assertAlmostEqual( - output[metric_type], expected_val, delta=1e-3 - ) + del output["loss"] + self.assertDictAlmostEqual( + output, + {"precision": 0.667, "recall": 0.667, "f1_score": 0.667}, + typecast_to_numpy=False, + ) def test_incorrect_order(self): with self.assertRaises(ValueError): @@ -144,12 +140,11 @@ def test_different_order(self): ) rouge_val = rouge(y_true, y_pred) - for metric_type, expected_val in zip( - self.metric_types, [0.333, 0.25, 0.286] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, + {"precision": 0.333, "recall": 0.25, "f1_score": 0.286}, + typecast_to_numpy=False, + ) def test_reset_state(self): rouge = RougeN() @@ -165,17 +160,15 @@ def test_reset_state(self): rouge.update_state(y_true, y_pred) rouge_val = rouge.result() - for metric_type, unexpected_val in zip( - self.metric_types, [0.0, 0.0, 0.0] - ): - self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val) + self.assertDictAllValuesNotEqual( + rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0} + ) rouge.reset_state() rouge_val = rouge.result() - for metric_type, unexpected_val in zip( - self.metric_types, [0.0, 0.0, 0.0] - ): - self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val) + self.assertDictEqual( + rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0} + ) def test_update_state(self): rouge = RougeN() @@ -191,24 +184,18 @@ def test_update_state(self): rouge.update_state(y_true_1, y_pred_1) rouge_val = rouge.result() - for metric_type, expected_val in zip( - self.metric_types, [0.575, 0.4, 0.467] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467} + ) y_true_2 = tf.constant(["what is your favourite show"]) y_pred_2 = tf.constant(["my favourite show is silicon valley"]) rouge.update_state(y_true_2, y_pred_2) rouge_val = rouge.result() - for metric_type, expected_val in zip( - self.metric_types, [0.45, 0.35, 0.385] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.45, "recall": 0.35, "f1_score": 0.385} + ) def test_merge_state(self): rouge_1 = RougeN() @@ -233,31 +220,22 @@ def test_merge_state(self): rouge_1.update_state(y_true_1, y_pred_1) rouge_1.update_state(y_true_2, y_pred_2) rouge_val = rouge_1.result() - for metric_type, expected_val in zip( - self.metric_types, [0.45, 0.35, 0.385] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.45, "recall": 0.35, "f1_score": 0.385} + ) rouge_2.update_state(y_true_3, y_pred_3) rouge_val = rouge_2.result() - for metric_type, expected_val in zip( - self.metric_types, [0.2, 0.25, 0.222] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.2, "recall": 0.25, "f1_score": 0.222} + ) merged_rouge = RougeN() merged_rouge.merge_state([rouge_1, rouge_2]) rouge_val = merged_rouge.result() - for metric_type, expected_val in zip( - self.metric_types, [0.388, 0.325, 0.344] - ): - self.assertAlmostEqual( - rouge_val[metric_type].numpy(), expected_val, delta=1e-3 - ) + self.assertDictAlmostEqual( + rouge_val, {"precision": 0.388, "recall": 0.325, "f1_score": 0.344} + ) def test_get_config(self): rouge = RougeN( From 14e851fa62fe8eb9519148fb73e1b93c219f9175 Mon Sep 17 00:00:00 2001 From: abheesht17 Date: Fri, 17 Jun 2022 13:17:33 +0530 Subject: [PATCH 30/30] Fix doc-string --- keras_nlp/metrics/rouge_l.py | 10 +++++----- keras_nlp/metrics/rouge_n.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py index aff177f93d..f6969a85f6 100644 --- a/keras_nlp/metrics/rouge_l.py +++ b/keras_nlp/metrics/rouge_l.py @@ -45,7 +45,7 @@ class RougeL(RougeBase): >>> rouge_l = keras_nlp.metrics.RougeL() >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" - >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + >>> rouge_l(y_true, y_pred)["f1_score"] 1.2. rank 1 inputs. @@ -59,7 +59,7 @@ class RougeL(RougeBase): ... "the cat was under the bed", ... "i love contributing to KerasNLP", ... ] - >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + >>> rouge_l(y_true, y_pred)["f1_score"] b. Tensor @@ -76,7 +76,7 @@ class RougeL(RougeBase): ... "i love contributing to KerasNLP", ... ] ... ) - >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + >>> rouge_l(y_true, y_pred)["f1_score"] 1.3. rank 2 inputs. @@ -93,7 +93,7 @@ class RougeL(RougeBase): ... ["i love contributing to KerasNLP"], ... ] ... ) - >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"] + >>> rouge_l(y_true, y_pred)["f1_score"] 3. Pass the metric to `model.compile()`. @@ -104,7 +104,7 @@ class RougeL(RougeBase): >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) - >>> metric_dict["rouge-l_f1_score"] + >>> metric_dict["f1_score"] 0.75 """ diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py index dc31630ff2..4bfe532ee2 100644 --- a/keras_nlp/metrics/rouge_n.py +++ b/keras_nlp/metrics/rouge_n.py @@ -47,7 +47,7 @@ class RougeN(RougeBase): >>> rouge_n = keras_nlp.metrics.RougeN(order=2) >>> y_true = "the tiny little cat was found under the big funny bed" >>> y_pred = "the cat was under the bed" - >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + >>> rouge_n(y_true, y_pred)["f1_score"] 1.2. rank 1 inputs. @@ -61,7 +61,7 @@ class RougeN(RougeBase): ... "the cat was under the bed", ... "i love contributing to KerasNLP", ... ] - >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + >>> rouge_n(y_true, y_pred)["f1_score"] b. Tensor. @@ -78,7 +78,7 @@ class RougeN(RougeBase): ... "i love contributing to KerasNLP", ... ] ... ) - >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + >>> rouge_n(y_true, y_pred)["f1_score"] 1.3. rank 2 inputs. @@ -95,7 +95,7 @@ class RougeN(RougeBase): ... ["i love contributing to KerasNLP"], ... ] ... ) - >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + >>> rouge_n(y_true, y_pred)["f1_score"] 2. Consider trigrams for calculating ROUGE-N. @@ -112,7 +112,7 @@ class RougeN(RougeBase): ... "i love contributing to KerasNLP", ... ] ... ) - >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"] + >>> rouge_n(y_true, y_pred)["f1_score"] 3. Pass the metric to `model.compile()`. @@ -123,7 +123,7 @@ class RougeN(RougeBase): >>> x = tf.constant(["HELLO THIS IS FUN"]) >>> y = tf.constant(["hello this is awesome"]) >>> metric_dict = model.evaluate(x, y, return_dict=True) - >>> metric_dict["rouge-n_f1_score"] + >>> metric_dict["f1_score"] 0.6666666865348816 """