From 62324bee018058e827dd03d90ecd22ad271dd468 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 11:16:44 +0530
Subject: [PATCH 01/30] Add rough class for RougeL

---
 keras_nlp/metrics/rouge_l.py | 173 +++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 keras_nlp/metrics/rouge_l.py

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
new file mode 100644
index 0000000000..5a56a782a8
--- /dev/null
+++ b/keras_nlp/metrics/rouge_l.py
@@ -0,0 +1,173 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ROUGE-L metric implementation based on `keras.metrics.Metric`."""
+
+import tensorflow as tf
+import tensorflow_text as tf_text
+from tensorflow import keras
+
+
+class RougeL(keras.metrics.Metric):
+    """ROUGE-L metric.
+
+    This class implements the ROUGE-L metric.
+
+    Args:
+        alpha: float. `alpha` is used as the weight for the
+            harmonic mean of precision and recall. A value of 0 means recall is
+            more important and a value of 1 means precision is more important
+            (same behaviour as
+            https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
+        metric_type: string. One of "precision", "recall", "f1_score". Defaults
+            to "f1_score".
+        mask_token_id: int. ID of the token to be masked. If provided, the mask
+            is computed for this class. Note that if this field is provided, and
+            if the `sample_weight` field in `update_state()` is also provided,
+            we will compute the final `sample_weight` as the element-wise
+            product of the mask and the `sample_weight`. In the product, any
+            value >= 1 will be treated as True, and False, otherwise, for
+            masking.
+        dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
+               not specified, it defaults to tf.float32.
+        name: string. Name of the metric instance.
+        **kwargs: Other keyword arguments.
+
+    Examples:
+
+    """
+
+    def __init__(
+        self,
+        alpha=0.5,
+        metric_type="f1_score",
+        mask_token_id=None,
+        dtype=None,
+        name="rouge_l",
+        **kwargs,
+    ):
+        super().__init__(name=name, dtype=dtype, **kwargs)
+
+        if not tf.as_dtype(self.dtype).is_floating:
+            raise ValueError(
+                "`dtype` must be a floating point type. "
+                f"Received: dtype={dtype}"
+            )
+
+        if metric_type not in ["precision", "recall", "f1_score"]:
+            raise ValueError(
+                "`metric_type` must be one of 'precision', 'recall', "
+                "'f1_score'. Received: metric_type={metric_type}"
+            )
+
+        self.alpha = alpha
+        self.metric_type = metric_type
+        self.mask_token_id = mask_token_id
+
+        self._rouge_l_score = self.add_weight(
+            name="rouge_l_score",
+            initializer="zeros",
+            dtype=self.dtype,
+        )
+        self._number_of_samples = self.add_weight(
+            name="number_of_samples", initializer="zeros", dtype=self.dtype
+        )
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+        # Both y_true and y_pred have shape: [batch_size, seq_len]. Note that
+        # they can also be ragged tensors with shape [num_samples, (seq_len)].
+
+        # If the input tensors are not ragged tensors, convert them to ragged
+        # tensors. `tf_text.metrics.rouge_l` expects ragged tensors.
+        if not isinstance(y_true, tf.RaggedTensor):
+            y_true = tf.RaggedTensor.from_tensor(y_true)
+        if not isinstance(y_pred, tf.RaggedTensor):
+            y_pred = tf.RaggedTensor.from_tensor(y_pred)
+
+        if sample_weight is not None:
+            sample_weight = tf.cast(sample_weight, self.dtype)
+
+        batch_size = tf.cast(y_true.nrows(), self.dtype)
+
+        if self.mask_token_id is not None:
+            mask = tf.cast(
+                tf.math.logical_not(tf.equal(y_true, self.mask_token_id)),
+                self.dtype,
+            )
+            if sample_weight is None:
+                sample_weight = mask
+            else:
+                sample_weight = tf.multiply(mask, sample_weight)
+
+        if sample_weight is not None:
+            sample_weight = tf.cast(sample_weight, tf.bool)
+
+            # Apply mask to both tensors.
+            y_true = tf.ragged.boolean_mask(y_true, sample_weight)
+            y_pred = tf.ragged.boolean_mask(y_pred, sample_weight)
+
+        f1_scores, precisions, recalls = rouge_l(
+            y_true, y_pred, alpha=self.alpha
+        )
+        if self.metric_type == "precision":
+            scores = precisions
+        elif self.metric_type == "recall":
+            scores = recalls
+        else:
+            scores = f1_scores
+        self._rouge_l_score.assign_add(tf.reduce_sum(scores))
+        self._number_of_samples.assign_add(batch_size)
+
+    def result(self):
+        if self._number_of_samples == 0:
+            return 0.0
+        rouge_l_score = self._rouge_l_score / self._number_of_samples
+        return rouge_l_score
+
+    def reset_state(self):
+        self._rouge_l_score.assign(0.0)
+        self._number_of_samples.assign(0.0)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "alpha": 0.5,
+                "metric_type": "f1_score",
+                "mask_token_id": self.mask_token_id,
+            }
+        )
+        return config
+
+
+def rouge_l(y_true, y_pred, alpha=0.5):
+    """
+    Computes the ROUGE-L score.
+    Args:
+        y_true (_type_): tf.RaggedTensor. The reference summaries.
+        y_pred (_type_): tf.RaggedTensor. The generated summaries.
+        alpha (float, optional): float. Defaults to 0.5. `alpha` is used as the
+            weight for the harmonic mean of precision and recall. A value of 0
+            means recall is more important and a value of 1 means precision is
+            more important (same behaviour as
+            https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
+
+    Returns:
+        (f1_scores, precisions, recalls): Tuple of tf.Tensor. The f1_scores,
+            precisions and recalls are returned for every sample.
+    """
+    f1_scores, precisions, recalls = tf_text.metrics.rouge_l(
+        y_true, y_pred, alpha=alpha
+    )
+    return f1_scores, precisions, recalls

From 3bc476a7f5520368a241d48b84be5452ccfbbf24 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 11:20:17 +0530
Subject: [PATCH 02/30] Fix typos

---
 keras_nlp/metrics/rouge_l.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 5a56a782a8..3f3e09c381 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -143,8 +143,8 @@ def get_config(self):
         config = super().get_config()
         config.update(
             {
-                "alpha": 0.5,
-                "metric_type": "f1_score",
+                "alpha": self.alpha,
+                "metric_type": self.metric_type,
                 "mask_token_id": self.mask_token_id,
             }
         )
@@ -155,12 +155,12 @@ def rouge_l(y_true, y_pred, alpha=0.5):
     """
     Computes the ROUGE-L score.
     Args:
-        y_true (_type_): tf.RaggedTensor. The reference summaries.
-        y_pred (_type_): tf.RaggedTensor. The generated summaries.
-        alpha (float, optional): float. Defaults to 0.5. `alpha` is used as the
-            weight for the harmonic mean of precision and recall. A value of 0
-            means recall is more important and a value of 1 means precision is
-            more important (same behaviour as
+        y_true: tf.RaggedTensor. The reference summaries.
+        y_pred: tf.RaggedTensor. The generated summaries.
+        alpha: float. Defaults to 0.5. `alpha` is used as the weight for the
+            harmonic mean of precision and recall. A value of 0 means recall is
+            more important and a value of 1 means precision is more important
+            (same behaviour as
             https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
 
     Returns:

From b09302d0f35fc3aea45a27f735b8717e930ce956 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 14:55:32 +0530
Subject: [PATCH 03/30] Correct logic

---
 keras_nlp/metrics/__init__.py     |   1 +
 keras_nlp/metrics/rouge_l.py      |  46 +++----
 keras_nlp/metrics/rouge_l_test.py | 208 ++++++++++++++++++++++++++++++
 3 files changed, 228 insertions(+), 27 deletions(-)
 create mode 100644 keras_nlp/metrics/rouge_l_test.py

diff --git a/keras_nlp/metrics/__init__.py b/keras_nlp/metrics/__init__.py
index 7152a97032..71509009a3 100644
--- a/keras_nlp/metrics/__init__.py
+++ b/keras_nlp/metrics/__init__.py
@@ -13,3 +13,4 @@
 # limitations under the License.
 
 from keras_nlp.metrics.perplexity import Perplexity
+from keras_nlp.metrics.rouge_l import RougeL
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 3f3e09c381..09219e9f00 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -32,13 +32,7 @@ class RougeL(keras.metrics.Metric):
             https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
         metric_type: string. One of "precision", "recall", "f1_score". Defaults
             to "f1_score".
-        mask_token_id: int. ID of the token to be masked. If provided, the mask
-            is computed for this class. Note that if this field is provided, and
-            if the `sample_weight` field in `update_state()` is also provided,
-            we will compute the final `sample_weight` as the element-wise
-            product of the mask and the `sample_weight`. In the product, any
-            value >= 1 will be treated as True, and False, otherwise, for
-            masking.
+        mask_token_ids: list of integers. IDs of the tokens to be masked.
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
                not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
@@ -52,7 +46,7 @@ def __init__(
         self,
         alpha=0.5,
         metric_type="f1_score",
-        mask_token_id=None,
+        mask_token_ids=None,
         dtype=None,
         name="rouge_l",
         **kwargs,
@@ -65,7 +59,7 @@ def __init__(
                 f"Received: dtype={dtype}"
             )
 
-        if metric_type not in ["precision", "recall", "f1_score"]:
+        if metric_type not in ("precision", "recall", "f1_score"):
             raise ValueError(
                 "`metric_type` must be one of 'precision', 'recall', "
                 "'f1_score'. Received: metric_type={metric_type}"
@@ -73,7 +67,7 @@ def __init__(
 
         self.alpha = alpha
         self.metric_type = metric_type
-        self.mask_token_id = mask_token_id
+        self.mask_token_ids = mask_token_ids
 
         self._rouge_l_score = self.add_weight(
             name="rouge_l_score",
@@ -95,27 +89,25 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         if not isinstance(y_pred, tf.RaggedTensor):
             y_pred = tf.RaggedTensor.from_tensor(y_pred)
 
-        if sample_weight is not None:
-            sample_weight = tf.cast(sample_weight, self.dtype)
-
         batch_size = tf.cast(y_true.nrows(), self.dtype)
 
-        if self.mask_token_id is not None:
-            mask = tf.cast(
-                tf.math.logical_not(tf.equal(y_true, self.mask_token_id)),
-                self.dtype,
-            )
-            if sample_weight is None:
-                sample_weight = mask
-            else:
-                sample_weight = tf.multiply(mask, sample_weight)
+        y_true_mask = tf.cast(tf.ones_like(y_true), tf.bool)
+        y_pred_mask = tf.cast(tf.ones_like(y_pred), tf.bool)
 
-        if sample_weight is not None:
-            sample_weight = tf.cast(sample_weight, tf.bool)
+        if self.mask_token_ids is not None:
+            for mask_token_id in self.mask_token_ids:
+                y_true_mask = tf.logical_and(
+                    y_true_mask,
+                    tf.math.logical_not(tf.equal(y_true, mask_token_id)),
+                )
+                y_pred_mask = tf.logical_and(
+                    y_pred_mask,
+                    tf.math.logical_not(tf.equal(y_pred, mask_token_id)),
+                )
 
             # Apply mask to both tensors.
-            y_true = tf.ragged.boolean_mask(y_true, sample_weight)
-            y_pred = tf.ragged.boolean_mask(y_pred, sample_weight)
+            y_true = tf.ragged.boolean_mask(y_true, y_true_mask)
+            y_pred = tf.ragged.boolean_mask(y_pred, y_pred_mask)
 
         f1_scores, precisions, recalls = rouge_l(
             y_true, y_pred, alpha=self.alpha
@@ -145,7 +137,7 @@ def get_config(self):
             {
                 "alpha": self.alpha,
                 "metric_type": self.metric_type,
-                "mask_token_id": self.mask_token_id,
+                "mask_token_ids": self.mask_token_ids,
             }
         )
         return config
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
new file mode 100644
index 0000000000..7b2926941d
--- /dev/null
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -0,0 +1,208 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for RougeL."""
+
+import tensorflow as tf
+
+from keras_nlp.metrics import RougeL
+
+
+class RougeLTest(tf.test.TestCase):
+    def test_vars_after_initializing_class(self):
+        rouge_l = RougeL()
+        self.assertEqual(rouge_l.result().numpy(), 0.0)
+
+    def test_without_mask_token_ids(self):
+        rouge_l = RougeL()
+        y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
+        y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3)
+
+    def test_with_mask_token_ids(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1])
+        y_true = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3)
+
+    def test_ragged_input_without_mask_token_ids(self):
+        rouge_l = RougeL()
+        y_true = tf.ragged.constant(
+            [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32
+        )
+        y_pred = tf.ragged.constant([[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5357, delta=1e-3)
+
+    def test_ragged_input_with_mask_token_ids(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1])
+        y_true = tf.ragged.constant(
+            [[1, 2, 3, 4], [1, 5, 6, 0, 0]], dtype=tf.int32
+        )
+        y_pred = tf.ragged.constant(
+            [[1, 3, 2, 4, 4, 4], [5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.583, delta=1e-3)
+
+    def test_precision(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="precision")
+        y_true = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3)
+
+    def test_recall(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="recall")
+        y_true = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3)
+
+    def test_two_inputs_from_logits(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1])
+        y_true_1 = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred_1 = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l_val = rouge_l(y_true_1, y_pred_1)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3)
+
+        y_true_2 = tf.ragged.constant(
+            [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32
+        )
+        y_pred_2 = tf.ragged.constant(
+            [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32
+        )
+
+        rouge_l_val = rouge_l(y_true_2, y_pred_2)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3)
+
+    def test_reset_state(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1])
+        y_true = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l.update_state(y_true, y_pred)
+        self.assertNotEqual(rouge_l.result(), 0.0)
+
+        rouge_l.reset_state()
+        self.assertEqual(rouge_l.result(), 0.0)
+
+    def test_update_state(self):
+        rouge_l = RougeL(mask_token_ids=[0, 1])
+        y_true_1 = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred_1 = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        rouge_l.update_state(y_true_1, y_pred_1)
+        rouge_l_val = rouge_l.result()
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3)
+
+        y_true_2 = tf.ragged.constant(
+            [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32
+        )
+        y_pred_2 = tf.ragged.constant(
+            [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32
+        )
+
+        rouge_l.update_state(y_true_2, y_pred_2)
+        rouge_l_val = rouge_l.result()
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3)
+
+    def test_merge_state(self):
+        rouge_l_1 = RougeL(mask_token_ids=[0, 1])
+        rouge_l_2 = RougeL(mask_token_ids=[0, 1])
+
+        y_true_1 = tf.constant(
+            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
+        )
+        y_pred_1 = tf.constant(
+            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
+        )
+
+        y_true_2 = tf.ragged.constant(
+            [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32
+        )
+        y_pred_2 = tf.ragged.constant(
+            [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32
+        )
+
+        y_true_3 = tf.ragged.constant(
+            [[9, 8, 7, 1], [10, 5, 1, 2, 3]], dtype=tf.int32
+        )
+        y_pred_3 = tf.ragged.constant(
+            [[1, 2, 7, 9, 8, 0], [10, 1, 2]], dtype=tf.int32
+        )
+
+        rouge_l_1.update_state(y_true_1, y_pred_1)
+        rouge_l_1.update_state(y_true_2, y_pred_2)
+        self.assertAlmostEqual(rouge_l_1.result().numpy(), 0.7014, delta=1e-3)
+
+        rouge_l_2.update_state(y_true_3, y_pred_3)
+        self.assertAlmostEqual(rouge_l_2.result().numpy(), 0.6190, delta=1e-3)
+
+        merged_rouge_l = RougeL(mask_token_ids=[0, 1])
+        merged_rouge_l.merge_state([rouge_l_1, rouge_l_2])
+        self.assertAlmostEqual(
+            merged_rouge_l.result().numpy(), 0.6739, delta=1e-3
+        )
+
+    def test_get_config(self):
+        rouge_l = RougeL(
+            alpha=0.7,
+            metric_type="precision",
+            mask_token_ids=[0],
+            dtype=tf.float32,
+            name="rouge_l_test",
+        )
+        config = rouge_l.get_config()
+        expected_config = {
+            "alpha": 0.7,
+            "metric_type": "precision",
+            "mask_token_ids": [0],
+            "dtype": tf.float32,
+            "name": "rouge_l_test",
+        }
+        self.assertEqual(config, expected_config)

From cadbd01056661a9ed5308b91d69c94e52f1f1213 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 15:17:19 +0530
Subject: [PATCH 04/30] Add examples

---
 keras_nlp/metrics/rouge_l.py | 67 ++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 09219e9f00..987fa6a15a 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -40,6 +40,73 @@ class RougeL(keras.metrics.Metric):
 
     Examples:
 
+    1. Calculate RougeL (F1 Score) by calling `update_state()` and `result()`.
+    1.1. `mask_token_ids` not provided.
+    >>> tf.random.set_seed(42)
+    >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l")
+    >>> references = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> hypotheses = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> rouge_l.update_state(references, hypotheses)
+    >>> rouge_l.result()
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.40000004
+
+    1.2. `mask_token_ids` provided.
+    >>> tf.random.set_seed(42)
+    >>> rouge_l = keras_nlp.metrics.RougeL(
+    ...     name="rouge_l", mask_token_ids=[0, 1])
+    >>> references = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> hypotheses = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> rouge_l.update_state(references, hypotheses)
+    >>> rouge_l.result()
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.47619048>
+
+    1.3. tf.RaggedTensor as input, and `mask_token_ids` not provided.
+    >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l")
+    >>> references = tf.ragged.constant(
+    ...     [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
+    >>> hypotheses = tf.ragged.constant(
+    ...     [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
+    >>> rouge_l.update_state(references, hypotheses)
+    >>> rouge_l.result()
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.53571427>
+
+    1.4. tf.RaggedTensor as input, and `mask_token_ids` provided.
+    >>> rouge_l = keras_nlp.metrics.RougeL(
+    ...     name="rouge_l", mask_token_ids=[1, 5])
+    >>> references = tf.ragged.constant(
+    ...     [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
+    >>> hypotheses = tf.ragged.constant(
+    ...     [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
+    >>> rouge_l.update_state(references, hypotheses)
+    >>> rouge_l.result()
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.57142854>
+
+    2. Calculate ROUGE-L directly. This has the same functionality as above.
+    >>> tf.random.set_seed(42)
+    >>> rouge_l = keras_nlp.metrics.RougeL(
+    ...     name="rouge_l", mask_token_ids=[0, 1])
+    >>> references = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> hypotheses = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> rouge_l(references, hypotheses)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.47619048>
+
+    3. Traditionally, the ROUGE-L metric calculates the F1-score. However, if
+    the user wants the precision, this is how it can be done:
+    >>> tf.random.set_seed(42)
+    >>> rouge_l = keras_nlp.metrics.RougeL(
+    ...     name="rouge_l", metric_type="precision")
+    >>> references = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> hypotheses = tf.random.uniform(
+    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> rouge_l(references, hypotheses)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
     """
 
     def __init__(

From 3e767ff467f0c3b6587c8fc61bffcf72acbac636 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 16:23:06 +0530
Subject: [PATCH 05/30] Small doc-string changes

---
 keras_nlp/metrics/rouge_l.py      |  4 ++--
 keras_nlp/metrics/rouge_l_test.py | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 987fa6a15a..70d76f32de 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -83,7 +83,7 @@ class RougeL(keras.metrics.Metric):
     ...     [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
     >>> rouge_l.update_state(references, hypotheses)
     >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.57142854>
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
 
     2. Calculate ROUGE-L directly. This has the same functionality as above.
     >>> tf.random.set_seed(42)
@@ -227,6 +227,6 @@ def rouge_l(y_true, y_pred, alpha=0.5):
             precisions and recalls are returned for every sample.
     """
     f1_scores, precisions, recalls = tf_text.metrics.rouge_l(
-        y_true, y_pred, alpha=alpha
+        y_pred, y_true, alpha=alpha
     )
     return f1_scores, precisions, recalls
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
index 7b2926941d..892f5b8df7 100644
--- a/keras_nlp/metrics/rouge_l_test.py
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -76,7 +76,7 @@ def test_precision(self):
         )
 
         rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3)
 
     def test_recall(self):
         rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="recall")
@@ -88,7 +88,15 @@ def test_recall(self):
         )
 
         rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3)
+
+    def test_output_with_alpha(self):
+        rouge_l = RougeL(alpha=0.7)
+        y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
+        y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
+
+        rouge_l_val = rouge_l(y_true, y_pred)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3)
 
     def test_two_inputs_from_logits(self):
         rouge_l = RougeL(mask_token_ids=[0, 1])

From b622cfe885f601d21a186efc02bf5b11b35f7e4b Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 16:40:00 +0530
Subject: [PATCH 06/30] Add alpha example

---
 keras_nlp/metrics/rouge_l.py      | 8 ++++++++
 keras_nlp/metrics/rouge_l_test.py | 6 +++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 70d76f32de..7e2eb7e30a 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -107,6 +107,14 @@ class RougeL(keras.metrics.Metric):
     ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
     >>> rouge_l(references, hypotheses)
     <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
+
+    4. Modify the precision vs recall importance by specifying the `alpha`
+    parameter.
+    >>> rouge_l = RougeL(name="rouge_l", alpha=0.7)
+    >>> y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
+    >>> y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32)
+    >>> rouge_l(references, hypotheses)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.5253>
     """
 
     def __init__(
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
index 892f5b8df7..593a5d1d2b 100644
--- a/keras_nlp/metrics/rouge_l_test.py
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -92,11 +92,11 @@ def test_recall(self):
 
     def test_output_with_alpha(self):
         rouge_l = RougeL(alpha=0.7)
-        y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
-        y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
+        y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
+        y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32)
 
         rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3)
+        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5253, delta=1e-3)
 
     def test_two_inputs_from_logits(self):
         rouge_l = RougeL(mask_token_ids=[0, 1])

From 38a809fbb5173a4986d96ca9ec46ad147c1e2a01 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 16:40:49 +0530
Subject: [PATCH 07/30] Small doc-string change

---
 keras_nlp/metrics/rouge_l.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 7e2eb7e30a..b1990e6ca8 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -108,8 +108,8 @@ class RougeL(keras.metrics.Metric):
     >>> rouge_l(references, hypotheses)
     <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
 
-    4. Modify the precision vs recall importance by specifying the `alpha`
-    parameter.
+    4. Modify the precision vs recall importance (for calculating F1-score) by
+    specifying the `alpha` parameter.
     >>> rouge_l = RougeL(name="rouge_l", alpha=0.7)
     >>> y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
     >>> y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32)

From e3bf5030d6aef1a01eba14c820a7352c7cd1b585 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 16 Apr 2022 16:48:02 +0530
Subject: [PATCH 08/30] Fix doc-string

---
 keras_nlp/metrics/rouge_l.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index b1990e6ca8..4a1265f2d8 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -110,11 +110,13 @@ class RougeL(keras.metrics.Metric):
 
     4. Modify the precision vs recall importance (for calculating F1-score) by
     specifying the `alpha` parameter.
-    >>> rouge_l = RougeL(name="rouge_l", alpha=0.7)
-    >>> y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
-    >>> y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32)
+    >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l", alpha=0.7)
+    >>> references = tf.ragged.constant(
+    ...     [[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
+    >>> hypotheses = tf.ragged.constant(
+    ...     [[1], [5, 6, 10, 10, 10]], dtype=tf.int32)
     >>> rouge_l(references, hypotheses)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.5253>
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.52526593>
     """
 
     def __init__(

From d25403bc07b1ba9019ae12006de5c25efc0c971c Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sun, 17 Apr 2022 22:14:05 +0530
Subject: [PATCH 09/30] Fix f-string

---
 keras_nlp/metrics/rouge_l.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 4a1265f2d8..94d56cf7d9 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -138,8 +138,8 @@ def __init__(
 
         if metric_type not in ("precision", "recall", "f1_score"):
             raise ValueError(
-                "`metric_type` must be one of 'precision', 'recall', "
-                "'f1_score'. Received: metric_type={metric_type}"
+                '`metric_type` must be one of "precision", "recall", '
+                f'"f1_score". Received: metric_type={metric_type}'
             )
 
         self.alpha = alpha

From 2f9a35cbbb124daa312ce64b6655eebb047637df Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sun, 17 Apr 2022 22:25:26 +0530
Subject: [PATCH 10/30] Minor doc-string edit

---
 keras_nlp/metrics/rouge_l.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 94d56cf7d9..8f7958e994 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -26,9 +26,9 @@ class RougeL(keras.metrics.Metric):
 
     Args:
         alpha: float. `alpha` is used as the weight for the
-            harmonic mean of precision and recall. A value of 0 means recall is
-            more important and a value of 1 means precision is more important
-            (same behaviour as
+            harmonic mean of precision and recall  (for calculating F1-score). A
+            value of 0 means recall is more important and a value of 1 means
+            precision is more important (same behaviour as
             https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
         metric_type: string. One of "precision", "recall", "f1_score". Defaults
             to "f1_score".
@@ -227,9 +227,9 @@ def rouge_l(y_true, y_pred, alpha=0.5):
         y_true: tf.RaggedTensor. The reference summaries.
         y_pred: tf.RaggedTensor. The generated summaries.
         alpha: float. Defaults to 0.5. `alpha` is used as the weight for the
-            harmonic mean of precision and recall. A value of 0 means recall is
-            more important and a value of 1 means precision is more important
-            (same behaviour as
+            harmonic mean of precision and recall (for calculating F1-score). A
+            value of 0 means recall is more important and a value of 1 means
+            precision is more important (same behaviour as
             https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
 
     Returns:

From 9b4c1f165230f8d9fc761ebb4e35a00bb384afc6 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sun, 17 Apr 2022 22:26:01 +0530
Subject: [PATCH 11/30] Minor doc-string edit - 2

---
 keras_nlp/metrics/rouge_l.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 8f7958e994..0cabbb13a0 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -34,7 +34,7 @@ class RougeL(keras.metrics.Metric):
             to "f1_score".
         mask_token_ids: list of integers. IDs of the tokens to be masked.
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
-               not specified, it defaults to tf.float32.
+            not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
         **kwargs: Other keyword arguments.
 

From c59aa74b49d0d12da11bbd683468a75abca8a934 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Wed, 20 Apr 2022 19:48:24 +0530
Subject: [PATCH 12/30] Address review comments - I

---
 keras_nlp/metrics/rouge_l.py      | 73 ++++++++++---------------------
 keras_nlp/metrics/rouge_l_test.py |  2 +-
 2 files changed, 25 insertions(+), 50 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 0cabbb13a0..947040fcde 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -42,27 +42,25 @@ class RougeL(keras.metrics.Metric):
 
     1. Calculate RougeL (F1 Score) by calling `update_state()` and `result()`.
     1.1. `mask_token_ids` not provided.
-    >>> tf.random.set_seed(42)
     >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l")
-    >>> references = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
-    >>> hypotheses = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> references = tf.constant(
+    ...     [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
+    >>> hypotheses = tf.constant(
+    ...     [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
     >>> rouge_l.update_state(references, hypotheses)
     >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.40000004
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.70000005
 
     1.2. `mask_token_ids` provided.
-    >>> tf.random.set_seed(42)
     >>> rouge_l = keras_nlp.metrics.RougeL(
     ...     name="rouge_l", mask_token_ids=[0, 1])
-    >>> references = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
-    >>> hypotheses = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> references = tf.constant(
+    ...     [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32)
+    >>> hypotheses = tf.constant(
+    ...     [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32)
     >>> rouge_l.update_state(references, hypotheses)
     >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.47619048>
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.5833334>
 
     1.3. tf.RaggedTensor as input, and `mask_token_ids` not provided.
     >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l")
@@ -89,24 +87,23 @@ class RougeL(keras.metrics.Metric):
     >>> tf.random.set_seed(42)
     >>> rouge_l = keras_nlp.metrics.RougeL(
     ...     name="rouge_l", mask_token_ids=[0, 1])
-    >>> references = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
-    >>> hypotheses = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> references = tf.constant(
+    ...     [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32)
+    >>> hypotheses = tf.constant(
+    ...     [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32)
     >>> rouge_l(references, hypotheses)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.47619048>
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.5833334>
 
     3. Traditionally, the ROUGE-L metric calculates the F1-score. However, if
     the user wants the precision, this is how it can be done:
-    >>> tf.random.set_seed(42)
     >>> rouge_l = keras_nlp.metrics.RougeL(
     ...     name="rouge_l", metric_type="precision")
-    >>> references = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
-    >>> hypotheses = tf.random.uniform(
-    ...     shape=[2,5], maxval=10, dtype=tf.int32, seed=42)
+    >>> references = tf.constant(
+    ...     [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
+    >>> hypotheses = tf.constant(
+    ...     [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
     >>> rouge_l(references, hypotheses)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.70000005>
 
     4. Modify the precision vs recall importance (for calculating F1-score) by
     specifying the `alpha` parameter.
@@ -124,7 +121,7 @@ def __init__(
         alpha=0.5,
         metric_type="f1_score",
         mask_token_ids=None,
-        dtype=None,
+        dtype=tf.float32,
         name="rouge_l",
         **kwargs,
     ):
@@ -157,7 +154,7 @@ def __init__(
 
     def update_state(self, y_true, y_pred, sample_weight=None):
         # Both y_true and y_pred have shape: [batch_size, seq_len]. Note that
-        # they can also be ragged tensors with shape [num_samples, (seq_len)].
+        # they can also be ragged tensors with shape [batch_size, (seq_len)].
 
         # If the input tensors are not ragged tensors, convert them to ragged
         # tensors. `tf_text.metrics.rouge_l` expects ragged tensors.
@@ -186,8 +183,8 @@ def update_state(self, y_true, y_pred, sample_weight=None):
             y_true = tf.ragged.boolean_mask(y_true, y_true_mask)
             y_pred = tf.ragged.boolean_mask(y_pred, y_pred_mask)
 
-        f1_scores, precisions, recalls = rouge_l(
-            y_true, y_pred, alpha=self.alpha
+        f1_scores, precisions, recalls = tf_text.metrics.rouge_l(
+            y_pred, y_true, alpha=self.alpha
         )
         if self.metric_type == "precision":
             scores = precisions
@@ -218,25 +215,3 @@ def get_config(self):
             }
         )
         return config
-
-
-def rouge_l(y_true, y_pred, alpha=0.5):
-    """
-    Computes the ROUGE-L score.
-    Args:
-        y_true: tf.RaggedTensor. The reference summaries.
-        y_pred: tf.RaggedTensor. The generated summaries.
-        alpha: float. Defaults to 0.5. `alpha` is used as the weight for the
-            harmonic mean of precision and recall (for calculating F1-score). A
-            value of 0 means recall is more important and a value of 1 means
-            precision is more important (same behaviour as
-            https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
-
-    Returns:
-        (f1_scores, precisions, recalls): Tuple of tf.Tensor. The f1_scores,
-            precisions and recalls are returned for every sample.
-    """
-    f1_scores, precisions, recalls = tf_text.metrics.rouge_l(
-        y_pred, y_true, alpha=alpha
-    )
-    return f1_scores, precisions, recalls
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
index 593a5d1d2b..0fb1409499 100644
--- a/keras_nlp/metrics/rouge_l_test.py
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -20,7 +20,7 @@
 
 
 class RougeLTest(tf.test.TestCase):
-    def test_vars_after_initializing_class(self):
+    def test_initialization(self):
         rouge_l = RougeL()
         self.assertEqual(rouge_l.result().numpy(), 0.0)
 

From d166ab7a238c270fdb4fa55c891df4b85c14781a Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Wed, 20 Apr 2022 19:53:40 +0530
Subject: [PATCH 13/30] Minor change in examples

---
 keras_nlp/metrics/rouge_l.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 947040fcde..ff31bba773 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -84,7 +84,6 @@ class RougeL(keras.metrics.Metric):
     <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
 
     2. Calculate ROUGE-L directly. This has the same functionality as above.
-    >>> tf.random.set_seed(42)
     >>> rouge_l = keras_nlp.metrics.RougeL(
     ...     name="rouge_l", mask_token_ids=[0, 1])
     >>> references = tf.constant(

From 632df5d9500cc6775a931be79fd400a2adbf44b0 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Mon, 23 May 2022 21:10:46 +0530
Subject: [PATCH 14/30] Use the rouge_score package

---
 keras_nlp/metrics/__init__.py     |   2 +-
 keras_nlp/metrics/rouge.py        | 169 +++++++++++++++++++++++
 keras_nlp/metrics/rouge_l.py      | 216 ------------------------------
 keras_nlp/metrics/rouge_l_test.py | 216 ------------------------------
 keras_nlp/metrics/rouge_test.py   | 216 ++++++++++++++++++++++++++++++
 5 files changed, 386 insertions(+), 433 deletions(-)
 create mode 100644 keras_nlp/metrics/rouge.py
 delete mode 100644 keras_nlp/metrics/rouge_l.py
 delete mode 100644 keras_nlp/metrics/rouge_l_test.py
 create mode 100644 keras_nlp/metrics/rouge_test.py

diff --git a/keras_nlp/metrics/__init__.py b/keras_nlp/metrics/__init__.py
index 71509009a3..2a0682138e 100644
--- a/keras_nlp/metrics/__init__.py
+++ b/keras_nlp/metrics/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 
 from keras_nlp.metrics.perplexity import Perplexity
-from keras_nlp.metrics.rouge_l import RougeL
+from keras_nlp.metrics.rouge import Rouge
diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py
new file mode 100644
index 0000000000..089157c139
--- /dev/null
+++ b/keras_nlp/metrics/rouge.py
@@ -0,0 +1,169 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ROUGE metric implementation based on `keras.metrics.Metric`."""
+
+import tensorflow as tf
+from tensorflow import keras
+
+from keras_nlp.utils.tensor_utils import tensor_to_string_list
+
+try:
+    from rouge_score import rouge_scorer
+except ImportError:
+    rouge_scorer = None  # `Rouge.__init__` checks for this and raises.
+
+
+class Rouge(keras.metrics.Metric):
+    """ROUGE metric.
+
+    This class implements all the variants of the ROUGE metric - ROUGE-N,
+    ROUGE-L and ROUGE-LSum.
+
+    Args:
+        variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to
+            "rouge2". For "rougeN", N lies in the range [1, 9].
+        metric_type: string. One of "precision", "recall", "f1_score". Defaults
+            to "f1_score".
+        use_stemmer: bool. Whether Porter Stemmer should be used to strip word
+            suffixes to improve matching. Defaults to False.
+        dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
+            not specified, it defaults to tf.float32.
+        name: string. Name of the metric instance.
+        **kwargs: Other keyword arguments.
+    """
+
+    def __init__(
+        self,
+        variant="rouge2",
+        metric_type="f1_score",
+        use_stemmer=False,
+        dtype=None,
+        name="rouge",
+        **kwargs,
+    ):
+        super().__init__(name=name, dtype=dtype, **kwargs)
+
+        # `rouge_score` is imported optionally at the top of this module.
+        if rouge_scorer is None:
+            raise ImportError(
+                "ROUGE metric requires the `rouge_score` package. "
+                "Please install it with `pip install rouge_score`."
+            )
+
+        if not tf.as_dtype(self.dtype).is_floating:
+            raise ValueError(
+                "`dtype` must be a floating point type. "
+                f"Received: dtype={dtype}"
+            )
+
+        # Supported variants: rouge1 ... rouge9, rougeL and rougeLsum.
+        valid_variants = [f"rouge{order}" for order in range(1, 10)]
+        valid_variants += ["rougeL", "rougeLsum"]
+        if variant not in valid_variants:
+            raise ValueError(
+                "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, "
+                "rougeLsum, with N ranging from 1 to 9. Received: "
+                f"variant={variant}"
+            )
+        if metric_type not in ("precision", "recall", "f1_score"):
+            raise ValueError(
+                '`metric_type` must be one of "precision", "recall", '
+                f'"f1_score". Received: metric_type={metric_type}'
+            )
+
+        self.variant = variant
+        self.metric_type = metric_type
+        self.use_stemmer = use_stemmer
+
+        # To-do: Add split_summaries and tokenizer options after the maintainers
+        # of rouge_scorer have released a new version.
+        self._rouge_scorer = rouge_scorer.RougeScorer(
+            rouge_types=[self.variant],
+            use_stemmer=use_stemmer,
+        )
+
+        # Running sum of per-sample scores, and the number of samples seen.
+        self._rouge_score = self.add_weight(
+            name="rouge_score",
+            initializer="zeros",
+            dtype=self.dtype,
+        )
+        self._number_of_samples = self.add_weight(
+            name="number_of_samples", initializer="zeros", dtype=self.dtype
+        )
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+        """Accumulate the scores of a batch of reference/hypothesis pairs.
+
+        `y_true` and `y_pred` may be a string, a list of strings or a string
+        tensor of shape `[batch_size]`. `sample_weight` is not used.
+        """
+
+        def _as_tensor(inputs):
+            # Raw Python strings/lists are converted to a tensor first.
+            if isinstance(inputs, str):
+                return tf.constant([inputs])
+            if isinstance(inputs, list):
+                return tf.constant(inputs)
+            return inputs
+
+        def _score_pair(reference, hypothesis):
+            # Called through `tf.py_function` below.
+            result = self._rouge_scorer.score(
+                tensor_to_string_list(reference),
+                tensor_to_string_list(hypothesis),
+            )[self.variant]
+            if self.metric_type == "precision":
+                return result.precision
+            if self.metric_type == "recall":
+                return result.recall
+            return result.fmeasure
+
+        y_true = _as_tensor(y_true)
+        y_pred = _as_tensor(y_pred)
+        num_samples = tf.shape(y_true)[0]
+
+        # Score each (reference, hypothesis) pair and add it to the sum.
+        for index in range(num_samples):
+            pair_score = tf.py_function(
+                func=_score_pair,
+                inp=[y_true[index], y_pred[index]],
+                Tout=self.dtype,
+            )
+            self._rouge_score.assign_add(pair_score)
+
+        sample_count = tf.cast(num_samples, dtype=self.dtype)
+        self._number_of_samples.assign_add(sample_count)
+
+    def result(self):
+        # No sample recorded yet: report a score of zero.
+        if self._number_of_samples == 0:
+            return 0.0
+        return self._rouge_score / self._number_of_samples
+
+    def reset_state(self):
+        for accumulator in (self._rouge_score, self._number_of_samples):
+            accumulator.assign(0.0)
+
+    def get_config(self):
+        """Return the configuration of the metric.
+
+        Extends the parent class config with the constructor arguments.
+        """
+        config = super().get_config()
+        config["variant"] = self.variant
+        config["metric_type"] = self.metric_type
+        config["use_stemmer"] = self.use_stemmer
+        return config
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
deleted file mode 100644
index ff31bba773..0000000000
--- a/keras_nlp/metrics/rouge_l.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Copyright 2022 The KerasNLP Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""ROUGE-L metric implementation based on `keras.metrics.Metric`."""
-
-import tensorflow as tf
-import tensorflow_text as tf_text
-from tensorflow import keras
-
-
-class RougeL(keras.metrics.Metric):
-    """ROUGE-L metric.
-
-    This class implements the ROUGE-L metric.
-
-    Args:
-        alpha: float. `alpha` is used as the weight for the
-            harmonic mean of precision and recall  (for calculating F1-score). A
-            value of 0 means recall is more important and a value of 1 means
-            precision is more important (same behaviour as
-            https://www.tensorflow.org/text/api_docs/python/text/metrics/rouge_l).
-        metric_type: string. One of "precision", "recall", "f1_score". Defaults
-            to "f1_score".
-        mask_token_ids: list of integers. IDs of the tokens to be masked.
-        dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
-            not specified, it defaults to tf.float32.
-        name: string. Name of the metric instance.
-        **kwargs: Other keyword arguments.
-
-    Examples:
-
-    1. Calculate RougeL (F1 Score) by calling `update_state()` and `result()`.
-    1.1. `mask_token_ids` not provided.
-    >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l")
-    >>> references = tf.constant(
-    ...     [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
-    >>> hypotheses = tf.constant(
-    ...     [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
-    >>> rouge_l.update_state(references, hypotheses)
-    >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.70000005
-
-    1.2. `mask_token_ids` provided.
-    >>> rouge_l = keras_nlp.metrics.RougeL(
-    ...     name="rouge_l", mask_token_ids=[0, 1])
-    >>> references = tf.constant(
-    ...     [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32)
-    >>> hypotheses = tf.constant(
-    ...     [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32)
-    >>> rouge_l.update_state(references, hypotheses)
-    >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.5833334>
-
-    1.3. tf.RaggedTensor as input, and `mask_token_ids` not provided.
-    >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l")
-    >>> references = tf.ragged.constant(
-    ...     [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
-    >>> hypotheses = tf.ragged.constant(
-    ...     [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
-    >>> rouge_l.update_state(references, hypotheses)
-    >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.53571427>
-
-    1.4. tf.RaggedTensor as input, and `mask_token_ids` provided.
-    >>> rouge_l = keras_nlp.metrics.RougeL(
-    ...     name="rouge_l", mask_token_ids=[1, 5])
-    >>> references = tf.ragged.constant(
-    ...     [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
-    >>> hypotheses = tf.ragged.constant(
-    ...     [[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
-    >>> rouge_l.update_state(references, hypotheses)
-    >>> rouge_l.result()
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.4>
-
-    2. Calculate ROUGE-L directly. This has the same functionality as above.
-    >>> rouge_l = keras_nlp.metrics.RougeL(
-    ...     name="rouge_l", mask_token_ids=[0, 1])
-    >>> references = tf.constant(
-    ...     [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32)
-    >>> hypotheses = tf.constant(
-    ...     [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32)
-    >>> rouge_l(references, hypotheses)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.5833334>
-
-    3. Traditionally, the ROUGE-L metric calculates the F1-score. However, if
-    the user wants the precision, this is how it can be done:
-    >>> rouge_l = keras_nlp.metrics.RougeL(
-    ...     name="rouge_l", metric_type="precision")
-    >>> references = tf.constant(
-    ...     [[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
-    >>> hypotheses = tf.constant(
-    ...     [[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
-    >>> rouge_l(references, hypotheses)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.70000005>
-
-    4. Modify the precision vs recall importance (for calculating F1-score) by
-    specifying the `alpha` parameter.
-    >>> rouge_l = keras_nlp.metrics.RougeL(name="rouge_l", alpha=0.7)
-    >>> references = tf.ragged.constant(
-    ...     [[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
-    >>> hypotheses = tf.ragged.constant(
-    ...     [[1], [5, 6, 10, 10, 10]], dtype=tf.int32)
-    >>> rouge_l(references, hypotheses)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.52526593>
-    """
-
-    def __init__(
-        self,
-        alpha=0.5,
-        metric_type="f1_score",
-        mask_token_ids=None,
-        dtype=tf.float32,
-        name="rouge_l",
-        **kwargs,
-    ):
-        super().__init__(name=name, dtype=dtype, **kwargs)
-
-        if not tf.as_dtype(self.dtype).is_floating:
-            raise ValueError(
-                "`dtype` must be a floating point type. "
-                f"Received: dtype={dtype}"
-            )
-
-        if metric_type not in ("precision", "recall", "f1_score"):
-            raise ValueError(
-                '`metric_type` must be one of "precision", "recall", '
-                f'"f1_score". Received: metric_type={metric_type}'
-            )
-
-        self.alpha = alpha
-        self.metric_type = metric_type
-        self.mask_token_ids = mask_token_ids
-
-        self._rouge_l_score = self.add_weight(
-            name="rouge_l_score",
-            initializer="zeros",
-            dtype=self.dtype,
-        )
-        self._number_of_samples = self.add_weight(
-            name="number_of_samples", initializer="zeros", dtype=self.dtype
-        )
-
-    def update_state(self, y_true, y_pred, sample_weight=None):
-        # Both y_true and y_pred have shape: [batch_size, seq_len]. Note that
-        # they can also be ragged tensors with shape [batch_size, (seq_len)].
-
-        # If the input tensors are not ragged tensors, convert them to ragged
-        # tensors. `tf_text.metrics.rouge_l` expects ragged tensors.
-        if not isinstance(y_true, tf.RaggedTensor):
-            y_true = tf.RaggedTensor.from_tensor(y_true)
-        if not isinstance(y_pred, tf.RaggedTensor):
-            y_pred = tf.RaggedTensor.from_tensor(y_pred)
-
-        batch_size = tf.cast(y_true.nrows(), self.dtype)
-
-        y_true_mask = tf.cast(tf.ones_like(y_true), tf.bool)
-        y_pred_mask = tf.cast(tf.ones_like(y_pred), tf.bool)
-
-        if self.mask_token_ids is not None:
-            for mask_token_id in self.mask_token_ids:
-                y_true_mask = tf.logical_and(
-                    y_true_mask,
-                    tf.math.logical_not(tf.equal(y_true, mask_token_id)),
-                )
-                y_pred_mask = tf.logical_and(
-                    y_pred_mask,
-                    tf.math.logical_not(tf.equal(y_pred, mask_token_id)),
-                )
-
-            # Apply mask to both tensors.
-            y_true = tf.ragged.boolean_mask(y_true, y_true_mask)
-            y_pred = tf.ragged.boolean_mask(y_pred, y_pred_mask)
-
-        f1_scores, precisions, recalls = tf_text.metrics.rouge_l(
-            y_pred, y_true, alpha=self.alpha
-        )
-        if self.metric_type == "precision":
-            scores = precisions
-        elif self.metric_type == "recall":
-            scores = recalls
-        else:
-            scores = f1_scores
-        self._rouge_l_score.assign_add(tf.reduce_sum(scores))
-        self._number_of_samples.assign_add(batch_size)
-
-    def result(self):
-        if self._number_of_samples == 0:
-            return 0.0
-        rouge_l_score = self._rouge_l_score / self._number_of_samples
-        return rouge_l_score
-
-    def reset_state(self):
-        self._rouge_l_score.assign(0.0)
-        self._number_of_samples.assign(0.0)
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "alpha": self.alpha,
-                "metric_type": self.metric_type,
-                "mask_token_ids": self.mask_token_ids,
-            }
-        )
-        return config
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
deleted file mode 100644
index 0fb1409499..0000000000
--- a/keras_nlp/metrics/rouge_l_test.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Copyright 2022 The KerasNLP Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for RougeL."""
-
-import tensorflow as tf
-
-from keras_nlp.metrics import RougeL
-
-
-class RougeLTest(tf.test.TestCase):
-    def test_initialization(self):
-        rouge_l = RougeL()
-        self.assertEqual(rouge_l.result().numpy(), 0.0)
-
-    def test_without_mask_token_ids(self):
-        rouge_l = RougeL()
-        y_true = tf.constant([[1, 2, 3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32)
-        y_pred = tf.constant([[1, 2, 3, 2, 5], [5, 6, 8, 8, 8]], dtype=tf.int32)
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.70, delta=1e-3)
-
-    def test_with_mask_token_ids(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1])
-        y_true = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3)
-
-    def test_ragged_input_without_mask_token_ids(self):
-        rouge_l = RougeL()
-        y_true = tf.ragged.constant(
-            [[3, 4, 5], [5, 6, 7, 8, 9]], dtype=tf.int32
-        )
-        y_pred = tf.ragged.constant([[1, 4, 3, 2, 5], [5, 6]], dtype=tf.int32)
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5357, delta=1e-3)
-
-    def test_ragged_input_with_mask_token_ids(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1])
-        y_true = tf.ragged.constant(
-            [[1, 2, 3, 4], [1, 5, 6, 0, 0]], dtype=tf.int32
-        )
-        y_pred = tf.ragged.constant(
-            [[1, 3, 2, 4, 4, 4], [5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.583, delta=1e-3)
-
-    def test_precision(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="precision")
-        y_true = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.45, delta=1e-3)
-
-    def test_recall(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1], metric_type="recall")
-        y_true = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.8333, delta=1e-3)
-
-    def test_output_with_alpha(self):
-        rouge_l = RougeL(alpha=0.7)
-        y_true = tf.ragged.constant([[1, 2, 3], [5, 6, 7, 8]], dtype=tf.int32)
-        y_pred = tf.ragged.constant([[1], [5, 6, 10, 10, 10]], dtype=tf.int32)
-
-        rouge_l_val = rouge_l(y_true, y_pred)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5253, delta=1e-3)
-
-    def test_two_inputs_from_logits(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1])
-        y_true_1 = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred_1 = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l_val = rouge_l(y_true_1, y_pred_1)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3)
-
-        y_true_2 = tf.ragged.constant(
-            [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32
-        )
-        y_pred_2 = tf.ragged.constant(
-            [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32
-        )
-
-        rouge_l_val = rouge_l(y_true_2, y_pred_2)
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3)
-
-    def test_reset_state(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1])
-        y_true = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l.update_state(y_true, y_pred)
-        self.assertNotEqual(rouge_l.result(), 0.0)
-
-        rouge_l.reset_state()
-        self.assertEqual(rouge_l.result(), 0.0)
-
-    def test_update_state(self):
-        rouge_l = RougeL(mask_token_ids=[0, 1])
-        y_true_1 = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred_1 = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        rouge_l.update_state(y_true_1, y_pred_1)
-        rouge_l_val = rouge_l.result()
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.5833, delta=1e-3)
-
-        y_true_2 = tf.ragged.constant(
-            [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32
-        )
-        y_pred_2 = tf.ragged.constant(
-            [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32
-        )
-
-        rouge_l.update_state(y_true_2, y_pred_2)
-        rouge_l_val = rouge_l.result()
-        self.assertAlmostEqual(rouge_l_val.numpy(), 0.7014, delta=1e-3)
-
-    def test_merge_state(self):
-        rouge_l_1 = RougeL(mask_token_ids=[0, 1])
-        rouge_l_2 = RougeL(mask_token_ids=[0, 1])
-
-        y_true_1 = tf.constant(
-            [[1, 2, 3, 4, 0, 0], [1, 5, 6, 0, 0, 0]], dtype=tf.int32
-        )
-        y_pred_1 = tf.constant(
-            [[1, 3, 2, 4, 4, 4], [1, 5, 6, 0, 2, 2]], dtype=tf.int32
-        )
-
-        y_true_2 = tf.ragged.constant(
-            [[1, 2, 3, 4], [1, 5, 6, 7, 8]], dtype=tf.int32
-        )
-        y_pred_2 = tf.ragged.constant(
-            [[1, 3, 2, 2, 3, 4], [5, 6, 7, 8, 2]], dtype=tf.int32
-        )
-
-        y_true_3 = tf.ragged.constant(
-            [[9, 8, 7, 1], [10, 5, 1, 2, 3]], dtype=tf.int32
-        )
-        y_pred_3 = tf.ragged.constant(
-            [[1, 2, 7, 9, 8, 0], [10, 1, 2]], dtype=tf.int32
-        )
-
-        rouge_l_1.update_state(y_true_1, y_pred_1)
-        rouge_l_1.update_state(y_true_2, y_pred_2)
-        self.assertAlmostEqual(rouge_l_1.result().numpy(), 0.7014, delta=1e-3)
-
-        rouge_l_2.update_state(y_true_3, y_pred_3)
-        self.assertAlmostEqual(rouge_l_2.result().numpy(), 0.6190, delta=1e-3)
-
-        merged_rouge_l = RougeL(mask_token_ids=[0, 1])
-        merged_rouge_l.merge_state([rouge_l_1, rouge_l_2])
-        self.assertAlmostEqual(
-            merged_rouge_l.result().numpy(), 0.6739, delta=1e-3
-        )
-
-    def test_get_config(self):
-        rouge_l = RougeL(
-            alpha=0.7,
-            metric_type="precision",
-            mask_token_ids=[0],
-            dtype=tf.float32,
-            name="rouge_l_test",
-        )
-        config = rouge_l.get_config()
-        expected_config = {
-            "alpha": 0.7,
-            "metric_type": "precision",
-            "mask_token_ids": [0],
-            "dtype": tf.float32,
-            "name": "rouge_l_test",
-        }
-        self.assertEqual(config, expected_config)
diff --git a/keras_nlp/metrics/rouge_test.py b/keras_nlp/metrics/rouge_test.py
new file mode 100644
index 0000000000..05ca3b65cc
--- /dev/null
+++ b/keras_nlp/metrics/rouge_test.py
@@ -0,0 +1,216 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for Rouge."""
+
+import tensorflow as tf
+
+from keras_nlp.metrics import Rouge
+
+
+class RougeTest(tf.test.TestCase):
+    def test_initialization(self):
+        rouge = Rouge()
+        self.assertEqual(rouge.result().numpy(), 0.0)
+
+    def test_string_input(self):
+        rouge = Rouge(
+            variant="rouge2", metric_type="f1_score", use_stemmer=False
+        )
+        y_true = "hey, this is great fun"
+        y_pred = "great fun indeed"
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3)
+
+    def test_string_list_input(self):
+        rouge = Rouge(
+            variant="rouge2", metric_type="f1_score", use_stemmer=False
+        )
+        y_true = ["hey, this is great fun", "i love contributing to KerasNLP"]
+        y_pred = ["great fun indeed", "contributing to KerasNLP is delightful"]
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3)
+
+    def test_tensor_input(self):
+        rouge = Rouge(
+            variant="rouge2", metric_type="f1_score", use_stemmer=False
+        )
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            ["great fun indeed", "contributing to KerasNLP is delightful"]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3)
+
+    def test_rouge_l(self):
+        rouge = Rouge(
+            variant="rougeL", metric_type="f1_score", use_stemmer=False
+        )
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            ["great fun indeed", "contributing to KerasNLP is delightful"]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3)
+
+    def test_rouge_l_sum(self):
+        rouge = Rouge(
+            variant="rougeLsum", metric_type="f1_score", use_stemmer=False
+        )
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            ["great fun indeed", "contributing to KerasNLP is delightful"]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3)
+
+    def test_incorrect_variant(self):
+        with self.assertRaises(ValueError):
+            _ = Rouge(
+                variant="rouge10", metric_type="f1_score", use_stemmer=False
+            )
+
+    def test_precision(self):
+        rouge = Rouge(
+            variant="rouge3", metric_type="precision", use_stemmer=False
+        )
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            [
+                "great fun indeed",
+                "KerasNLP is awesome, i love contributing to it",
+            ]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.167, delta=1e-3)
+
+    def test_recall(self):
+        rouge = Rouge(variant="rouge3", metric_type="recall", use_stemmer=False)
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            [
+                "great fun indeed",
+                "KerasNLP is awesome, i love contributing to it",
+            ]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3)
+
+    def test_reset_state(self):
+        rouge = Rouge()
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            [
+                "great fun indeed",
+                "KerasNLP is awesome, i love contributing to it",
+            ]
+        )
+
+        rouge.update_state(y_true, y_pred)
+        self.assertNotEqual(rouge.result(), 0.0)
+
+        rouge.reset_state()
+        self.assertEqual(rouge.result(), 0.0)
+
+    def test_update_state(self):
+        rouge = Rouge()
+        y_true_1 = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred_1 = tf.constant(
+            [
+                "great fun indeed",
+                "KerasNLP is awesome, i love contributing to it",
+            ]
+        )
+
+        rouge.update_state(y_true_1, y_pred_1)
+        rouge_val = rouge.result()
+        self.assertAlmostEqual(rouge_val.numpy(), 0.439, delta=1e-3)
+
+        y_true_2 = tf.constant(["what is your favourite show"])
+        y_pred_2 = tf.constant(["my favourite show is silicon valley"])
+
+        rouge.update_state(y_true_2, y_pred_2)
+        rouge_val = rouge.result()
+        self.assertAlmostEqual(rouge_val.numpy(), 0.367, delta=1e-3)
+
+    def test_merge_state(self):
+        rouge_1 = Rouge()
+        rouge_2 = Rouge()
+
+        y_true_1 = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred_1 = tf.constant(
+            [
+                "great fun indeed",
+                "KerasNLP is awesome, i love contributing to it",
+            ]
+        )
+
+        y_true_2 = tf.constant(["what is your favourite show"])
+        y_pred_2 = tf.constant(["my favourite show is silicon valley"])
+
+        y_true_3 = tf.constant(["lorem ipsum dolor sit amet"])
+        y_pred_3 = tf.constant(["lorem ipsum is simply dummy text"])
+
+        rouge_1.update_state(y_true_1, y_pred_1)
+        rouge_1.update_state(y_true_2, y_pred_2)
+        self.assertAlmostEqual(rouge_1.result().numpy(), 0.367, delta=1e-3)
+
+        rouge_2.update_state(y_true_3, y_pred_3)
+        self.assertAlmostEqual(rouge_2.result().numpy(), 0.222, delta=1e-3)
+
+        merged_rouge = Rouge()
+        merged_rouge.merge_state([rouge_1, rouge_2])
+        self.assertAlmostEqual(merged_rouge.result().numpy(), 0.331, delta=1e-3)
+
+    def test_get_config(self):
+        rouge = Rouge(
+            variant="rouge5",
+            metric_type="precision",
+            use_stemmer=True,
+            dtype=tf.float32,
+            name="rouge_test",
+        )
+
+        config = rouge.get_config()
+        expected_config = {
+            "variant": "rouge5",
+            "metric_type": "precision",
+            "use_stemmer": True,
+            "dtype": tf.float32,
+            "name": "rouge_test",
+        }
+        self.assertEqual(config, expected_config)

From 7586e0002acbd5866c29aba71850abf13412bfc2 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Mon, 23 May 2022 21:21:40 +0530
Subject: [PATCH 15/30] Fix rouge_score import

---
 keras_nlp/metrics/rouge.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py
index 089157c139..459de736ca 100644
--- a/keras_nlp/metrics/rouge.py
+++ b/keras_nlp/metrics/rouge.py
@@ -14,12 +14,14 @@
 
 """ROUGE metric implementation based on `keras.metrics.Metric`."""
 
+import sys
 import tensorflow as tf
 from tensorflow import keras
 
 from keras_nlp.utils.tensor_utils import tensor_to_string_list
 
 try:
+    import rouge_score
     from rouge_score import rouge_scorer
 except:
     pass
@@ -55,7 +57,7 @@ def __init__(
     ):
         super().__init__(name=name, dtype=dtype, **kwargs)
 
-        if rouge_scorer is None:
+        if "rouge_score" not in sys.modules:
             raise ImportError(
                 "ROUGE metric requires the `rouge_score` package."
                 "Please install it with `pip install rouge_score`."

From 893aab9681b10f59dcc03795b28369e706c0ee54 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Tue, 24 May 2022 18:05:14 +0530
Subject: [PATCH 16/30] Add rouge-score to test deps list

---
 keras_nlp/metrics/rouge.py | 6 +++---
 setup.py                   | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py
index 459de736ca..e23b6a3587 100644
--- a/keras_nlp/metrics/rouge.py
+++ b/keras_nlp/metrics/rouge.py
@@ -23,8 +23,8 @@
 try:
     import rouge_score
     from rouge_score import rouge_scorer
-except:
-    pass
+except ImportError:
+    rouge_score = None
 
 
 class Rouge(keras.metrics.Metric):
@@ -57,7 +57,7 @@ def __init__(
     ):
         super().__init__(name=name, dtype=dtype, **kwargs)
 
-        if "rouge_score" not in sys.modules:
+        if rouge_score is None:
             raise ImportError(
                 "ROUGE metric requires the `rouge_score` package."
                 "Please install it with `pip install rouge_score`."
diff --git a/setup.py b/setup.py
index 371a3e08f5..fac287ae7c 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,7 @@
             "isort",
             "pytest",
             "pytest-cov",
+            "rouge-score",
         ],
         "examples": [
             "datasets",  # For GLUE in BERT example.

From ccf33d4ac891dab50381cde4396c020f8f06b1e7 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sat, 28 May 2022 16:19:19 +0530
Subject: [PATCH 17/30] Address review comments - II

---
 keras_nlp/metrics/__init__.py                 |   3 +-
 keras_nlp/metrics/{rouge.py => rouge_l.py}    |  77 +++----
 keras_nlp/metrics/rouge_l_test.py             | 196 ++++++++++++++++++
 keras_nlp/metrics/rouge_n.py                  | 175 ++++++++++++++++
 .../{rouge_test.py => rouge_n_test.py}        | 165 +++++++--------
 5 files changed, 490 insertions(+), 126 deletions(-)
 rename keras_nlp/metrics/{rouge.py => rouge_l.py} (70%)
 create mode 100644 keras_nlp/metrics/rouge_l_test.py
 create mode 100644 keras_nlp/metrics/rouge_n.py
 rename keras_nlp/metrics/{rouge_test.py => rouge_n_test.py} (52%)

diff --git a/keras_nlp/metrics/__init__.py b/keras_nlp/metrics/__init__.py
index 2a0682138e..55ade6dc8a 100644
--- a/keras_nlp/metrics/__init__.py
+++ b/keras_nlp/metrics/__init__.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 
 from keras_nlp.metrics.perplexity import Perplexity
-from keras_nlp.metrics.rouge import Rouge
+from keras_nlp.metrics.rouge_l import RougeL
+from keras_nlp.metrics.rouge_n import RougeN
diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge_l.py
similarity index 70%
rename from keras_nlp/metrics/rouge.py
rename to keras_nlp/metrics/rouge_l.py
index e23b6a3587..79f897a2da 100644
--- a/keras_nlp/metrics/rouge.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""ROUGE metric implementation based on `keras.metrics.Metric`."""
+"""ROUGE-L metric implementation based on `keras.metrics.Metric`."""
 
-import sys
 import tensorflow as tf
 from tensorflow import keras
 
@@ -27,19 +26,19 @@
     rouge_score = None
 
 
-class Rouge(keras.metrics.Metric):
-    """ROUGE metric.
+class RougeL(keras.metrics.Metric):
+    """ROUGE-L metric.
 
-    This class implements all the variants of the ROUGE metric - ROUGE-N,
-    ROUGE-L and ROUGE-LSum.
+    This class implements the ROUGE-L variant of the ROUGE metric. The ROUGE-L
+    metric is traditionally used for evaluating summarisation systems.
+    Succinctly put, ROUGE-L is a score based on the length of the longest
+    common subsequence present in the reference text and the hypothesis text.
 
     Args:
-        variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to
-            "rouge2". For "rougeN", N lies in the range [1, 9].
-        metric_type: string. One of "precision", "recall", "f1_score". Defaults
-            to "f1_score".
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
+        metric_type: string. One of "precision", "recall", "f1_score". Defaults
+            to "f1_score".
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
                not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
@@ -48,19 +47,18 @@ class Rouge(keras.metrics.Metric):
 
     def __init__(
         self,
-        variant="rouge2",
         metric_type="f1_score",
         use_stemmer=False,
         dtype=None,
-        name="rouge",
+        name="rouge-l",
         **kwargs,
     ):
         super().__init__(name=name, dtype=dtype, **kwargs)
 
         if rouge_score is None:
             raise ImportError(
-                "ROUGE metric requires the `rouge_score` package."
-                "Please install it with `pip install rouge_score`."
+                "ROUGE metric requires the `rouge_score` package. "
+                "Please install it with `pip install rouge-score`."
             )
 
         if not tf.as_dtype(self.dtype).is_floating:
@@ -69,36 +67,24 @@ def __init__(
                 f"Received: dtype={dtype}"
             )
 
-        if variant not in tuple(
-            ("rouge" + str(order) for order in range(1, 10))
-        ) + (
-            "rougeL",
-            "rougeLsum",
-        ):
-            raise ValueError(
-                "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, "
-                "rougeLsum, with N ranging from 1 to 9. Received: "
-                f"variant={variant}"
-            )
         if metric_type not in ("precision", "recall", "f1_score"):
             raise ValueError(
                 '`metric_type` must be one of "precision", "recall", '
                 f'"f1_score". Received: metric_type={metric_type}'
             )
 
-        self.variant = variant
         self.metric_type = metric_type
         self.use_stemmer = use_stemmer
 
-        # To-do: Add split_summaries and tokenizer options after the maintainers
-        # of rouge_scorer have released a new version.
-        self._rouge_scorer = rouge_scorer.RougeScorer(
-            rouge_types=[self.variant],
+        # To-do: Add an option for adding custom tokenizer after the maintainers
+        # of rouge-score have released a new version.
+        self._rouge_l_scorer = rouge_scorer.RougeScorer(
+            rouge_types=["rougeL"],
             use_stemmer=use_stemmer,
         )
 
-        self._rouge_score = self.add_weight(
-            name="rouge_score",
+        self._rouge_l_score = self.add_weight(
+            name="rouge_l_score",
             initializer="zeros",
             dtype=self.dtype,
         )
@@ -107,8 +93,9 @@ def __init__(
         )
 
     def update_state(self, y_true, y_pred, sample_weight=None):
-        # Both y_true and y_pred have shape: [batch_size]. Each element is a
-        # string.
+        # Three possible shapes for y_true and y_pred: Python string,
+        # [batch_size] and [batch_size, 1]. In the latter two cases, we have
+        # strings in the tensor/list.
 
         # Check if input is a raw string/list.
         if isinstance(y_true, str):
@@ -120,14 +107,19 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         elif isinstance(y_pred, list):
             y_pred = tf.constant(y_pred)
 
+        # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to
+        # [batch_size].
+        if y_true.shape.rank == 2:
+            y_true = tf.squeeze(y_true, axis=1)
+        if y_pred.shape.rank == 2:
+            y_pred = tf.squeeze(y_pred, axis=1)
+
         batch_size = tf.shape(y_true)[0]
 
-        def _calculate_rouge_score(reference, hypothesis):
+        def _calculate_rouge_l_score(reference, hypothesis):
             reference = tensor_to_string_list(reference)
             hypothesis = tensor_to_string_list(hypothesis)
-            score = self._rouge_scorer.score(reference, hypothesis)[
-                self.variant
-            ]
+            score = self._rouge_l_scorer.score(reference, hypothesis)["rougeL"]
 
             if self.metric_type == "precision":
                 score = score.precision
@@ -139,11 +131,11 @@ def _calculate_rouge_score(reference, hypothesis):
 
         for batch_idx in range(batch_size):
             score = tf.py_function(
-                func=_calculate_rouge_score,
+                func=_calculate_rouge_l_score,
                 inp=[y_true[batch_idx], y_pred[batch_idx]],
                 Tout=self.dtype,
             )
-            self._rouge_score.assign_add(score)
+            self._rouge_l_score.assign_add(score)
 
         self._number_of_samples.assign_add(
             tf.cast(batch_size, dtype=self.dtype)
@@ -152,18 +144,17 @@ def _calculate_rouge_score(reference, hypothesis):
     def result(self):
         if self._number_of_samples == 0:
             return 0.0
-        rouge_l_score = self._rouge_score / self._number_of_samples
+        rouge_l_score = self._rouge_l_score / self._number_of_samples
         return rouge_l_score
 
     def reset_state(self):
-        self._rouge_score.assign(0.0)
+        self._rouge_l_score.assign(0.0)
         self._number_of_samples.assign(0.0)
 
     def get_config(self):
         config = super().get_config()
         config.update(
             {
-                "variant": self.variant,
                 "metric_type": self.metric_type,
                 "use_stemmer": self.use_stemmer,
             }
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
new file mode 100644
index 0000000000..a3e4250dae
--- /dev/null
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -0,0 +1,196 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for RougeL."""
+
+import tensorflow as tf
+
+from keras_nlp.metrics import RougeL
+
+
+class RougeLTest(tf.test.TestCase):
+    def test_initialization(self):
+        rouge = RougeL()
+        self.assertEqual(rouge.result().numpy(), 0.0)
+
+    def test_string_input(self):
+        rouge = RougeL(use_stemmer=False)
+        y_true = "the tiny little cat was found under the big funny bed"
+        y_pred = "the cat was under the bed"
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.706, delta=1e-3)
+
+    def test_string_list_input(self):
+        rouge = RougeL(use_stemmer=False)
+        y_true = [
+            "the tiny little cat was found under the big funny bed",
+            "i really love contributing to KerasNLP",
+        ]
+        y_pred = [
+            "the cat was under the bed",
+            "i love contributing to KerasNLP",
+        ]
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+
+    def test_tensor_input(self):
+        rouge = RougeL(use_stemmer=False)
+        y_true = tf.constant(
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
+        )
+        y_pred = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+
+    def test_rank_2_input(self):
+        rouge = RougeL(use_stemmer=False)
+        y_true = tf.constant(
+            [
+                ["the tiny little cat was found under the big funny bed"],
+                ["i really love contributing to KerasNLP"],
+            ]
+        )
+        y_pred = tf.constant(
+            [["the cat was under the bed"], ["i love contributing to KerasNLP"]]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+
+    def test_precision(self):
+        rouge = RougeL(metric_type="precision", use_stemmer=False)
+        y_true = tf.constant(
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
+        )
+        y_pred = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 1, delta=1e-3)
+
+    def test_recall(self):
+        rouge = RougeL(metric_type="recall", use_stemmer=False)
+        y_true = tf.constant(
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
+        )
+        y_pred = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
+
+        rouge_val = rouge(y_true, y_pred)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.689, delta=1e-3)
+
+    def test_reset_state(self):
+        rouge = RougeL()
+        y_true = tf.constant(
+            ["hey, this is great fun", "i love contributing to KerasNLP"]
+        )
+        y_pred = tf.constant(
+            [
+                "great fun indeed",
+                "KerasNLP is awesome, i love contributing to it",
+            ]
+        )
+
+        rouge.update_state(y_true, y_pred)
+        self.assertNotEqual(rouge.result(), 0.0)
+
+        rouge.reset_state()
+        self.assertEqual(rouge.result(), 0.0)
+
+    def test_update_state(self):
+        rouge = RougeL()
+        y_true_1 = tf.constant(
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
+        )
+        y_pred_1 = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
+
+        rouge.update_state(y_true_1, y_pred_1)
+        rouge_val = rouge.result()
+        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+
+        y_true_2 = tf.constant(["what is your favourite show"])
+        y_pred_2 = tf.constant(["my favourite show is silicon valley"])
+
+        rouge.update_state(y_true_2, y_pred_2)
+        rouge_val = rouge.result()
+        self.assertAlmostEqual(rouge_val.numpy(), 0.659, delta=1e-3)
+
+    def test_merge_state(self):
+        rouge_1 = RougeL()
+        rouge_2 = RougeL()
+
+        y_true_1 = tf.constant(
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
+        )
+        y_pred_1 = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
+
+        y_true_2 = tf.constant(["what is your favourite show"])
+        y_pred_2 = tf.constant(["my favourite show is silicon valley"])
+
+        y_true_3 = tf.constant(["lorem ipsum dolor sit amet"])
+        y_pred_3 = tf.constant(["lorem ipsum is simply dummy text"])
+
+        rouge_1.update_state(y_true_1, y_pred_1)
+        rouge_1.update_state(y_true_2, y_pred_2)
+        self.assertAlmostEqual(rouge_1.result().numpy(), 0.659, delta=1e-3)
+
+        rouge_2.update_state(y_true_3, y_pred_3)
+        self.assertAlmostEqual(rouge_2.result().numpy(), 0.364, delta=1e-3)
+
+        merged_rouge = RougeL()
+        merged_rouge.merge_state([rouge_1, rouge_2])
+        self.assertAlmostEqual(merged_rouge.result().numpy(), 0.586, delta=1e-3)
+
+    def test_get_config(self):
+        rouge = RougeL(
+            metric_type="precision",
+            use_stemmer=True,
+            dtype=tf.float32,
+            name="rouge_l_test",
+        )
+
+        config = rouge.get_config()
+        expected_config = {
+            "metric_type": "precision",
+            "use_stemmer": True,
+            "dtype": tf.float32,
+            "name": "rouge_l_test",
+        }
+        self.assertEqual(config, expected_config)
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
new file mode 100644
index 0000000000..ad8e2288d9
--- /dev/null
+++ b/keras_nlp/metrics/rouge_n.py
@@ -0,0 +1,175 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ROUGE-N metric implementation based on `keras.metrics.Metric`."""
+
+import tensorflow as tf
+from tensorflow import keras
+
+from keras_nlp.utils.tensor_utils import tensor_to_string_list
+
+try:
+    import rouge_score
+    from rouge_score import rouge_scorer
+except ImportError:
+    rouge_score = None
+
+
+class RougeN(keras.metrics.Metric):
+    """ROUGE-N metric.
+
+    This class implements the ROUGE-N variant of the ROUGE metric. The ROUGE-N
+    metric is traditionally used for evaluating summarisation systems.
+    Succinctly put, ROUGE-N is a score based on the number of matching n-grams
+    between the reference text and the hypothesis text.
+
+    Args:
+        order: The order of n-grams which are to be matched. It should lie in
+            range [1, 9]. Defaults to 2.
+        metric_type: string. One of "precision", "recall", "f1_score". Defaults
+            to "f1_score".
+        use_stemmer: bool. Whether Porter Stemmer should be used to strip word
+            suffixes to improve matching. Defaults to False.
+        dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
+               not specified, it defaults to tf.float32.
+        name: string. Name of the metric instance.
+        **kwargs: Other keyword arguments.
+    """
+
+    def __init__(
+        self,
+        order=2,
+        metric_type="f1_score",
+        use_stemmer=False,
+        dtype=None,
+        name="rouge-n",
+        **kwargs,
+    ):
+        super().__init__(name=name, dtype=dtype, **kwargs)
+
+        if rouge_score is None:
+            raise ImportError(
+                "ROUGE metric requires the `rouge_score` package. "
+                "Please install it with `pip install rouge-score`."
+            )
+
+        if not tf.as_dtype(self.dtype).is_floating:
+            raise ValueError(
+                "`dtype` must be a floating point type. "
+                f"Received: dtype={dtype}"
+            )
+
+        if order not in range(1, 10):
+            raise ValueError(
+                "Invalid `order` value. Should lie in the range [1, 9]. "
+                f"Received order={order}"
+            )
+
+        if metric_type not in ("precision", "recall", "f1_score"):
+            raise ValueError(
+                '`metric_type` must be one of "precision", "recall", '
+                f'"f1_score". Received: metric_type={metric_type}'
+            )
+
+        self.order = order
+        self.metric_type = metric_type
+        self.use_stemmer = use_stemmer
+
+        # To-do: Add an option for adding custom tokenizer after the maintainers
+        # of rouge-score have released a new version.
+        self._rouge_n_scorer = rouge_scorer.RougeScorer(
+            rouge_types=["rouge" + str(order)],
+            use_stemmer=use_stemmer,
+        )
+
+        self._rouge_n_score = self.add_weight(
+            name="rouge_n_score",
+            initializer="zeros",
+            dtype=self.dtype,
+        )
+        self._number_of_samples = self.add_weight(
+            name="number_of_samples", initializer="zeros", dtype=self.dtype
+        )
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+        # Three possible shapes for y_true and y_pred: Python string,
+        # [batch_size] and [batch_size, 1]. In the latter two cases, we have
+        # strings in the tensor/list.
+
+        # Check if input is a raw string/list.
+        if isinstance(y_true, str):
+            y_true = tf.constant([y_true])
+        elif isinstance(y_true, list):
+            y_true = tf.constant(y_true)
+        if isinstance(y_pred, str):
+            y_pred = tf.constant([y_pred])
+        elif isinstance(y_pred, list):
+            y_pred = tf.constant(y_pred)
+
+        # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to
+        # [batch_size].
+        if y_true.shape.rank == 2:
+            y_true = tf.squeeze(y_true, axis=1)
+        if y_pred.shape.rank == 2:
+            y_pred = tf.squeeze(y_pred, axis=1)
+
+        batch_size = tf.shape(y_true)[0]
+
+        def _calculate_rouge_n_score(reference, hypothesis):
+            reference = tensor_to_string_list(reference)
+            hypothesis = tensor_to_string_list(hypothesis)
+            score = self._rouge_n_scorer.score(reference, hypothesis)[
+                "rouge" + str(self.order)
+            ]
+
+            if self.metric_type == "precision":
+                score = score.precision
+            elif self.metric_type == "recall":
+                score = score.recall
+            else:
+                score = score.fmeasure
+            return score
+
+        for batch_idx in range(batch_size):
+            score = tf.py_function(
+                func=_calculate_rouge_n_score,
+                inp=[y_true[batch_idx], y_pred[batch_idx]],
+                Tout=self.dtype,
+            )
+            self._rouge_n_score.assign_add(score)
+
+        self._number_of_samples.assign_add(
+            tf.cast(batch_size, dtype=self.dtype)
+        )
+
+    def result(self):
+        if self._number_of_samples == 0:
+            return 0.0
+        rouge_n_score = self._rouge_n_score / self._number_of_samples
+        return rouge_n_score
+
+    def reset_state(self):
+        self._rouge_n_score.assign(0.0)
+        self._number_of_samples.assign(0.0)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "order": self.order,
+                "metric_type": self.metric_type,
+                "use_stemmer": self.use_stemmer,
+            }
+        )
+        return config
diff --git a/keras_nlp/metrics/rouge_test.py b/keras_nlp/metrics/rouge_n_test.py
similarity index 52%
rename from keras_nlp/metrics/rouge_test.py
rename to keras_nlp/metrics/rouge_n_test.py
index 05ca3b65cc..983902f1e5 100644
--- a/keras_nlp/metrics/rouge_test.py
+++ b/keras_nlp/metrics/rouge_n_test.py
@@ -12,120 +12,121 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for Rouge."""
+"""Tests for RougeN."""
 
 import tensorflow as tf
 
-from keras_nlp.metrics import Rouge
+from keras_nlp.metrics import RougeN
 
 
-class RougeTest(tf.test.TestCase):
+class RougeNTest(tf.test.TestCase):
     def test_initialization(self):
-        rouge = Rouge()
+        rouge = RougeN()
         self.assertEqual(rouge.result().numpy(), 0.0)
 
     def test_string_input(self):
-        rouge = Rouge(
-            variant="rouge2", metric_type="f1_score", use_stemmer=False
-        )
-        y_true = "hey, this is great fun"
-        y_pred = "great fun indeed"
+        rouge = RougeN(order=2, use_stemmer=False)
+        y_true = "the tiny little cat was found under the big funny bed"
+        y_pred = "the cat was under the bed"
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.267, delta=1e-3)
 
     def test_string_list_input(self):
-        rouge = Rouge(
-            variant="rouge2", metric_type="f1_score", use_stemmer=False
-        )
-        y_true = ["hey, this is great fun", "i love contributing to KerasNLP"]
-        y_pred = ["great fun indeed", "contributing to KerasNLP is delightful"]
+        rouge = RougeN(order=2, use_stemmer=False)
+        y_true = [
+            "the tiny little cat was found under the big funny bed",
+            "i really love contributing to KerasNLP",
+        ]
+        y_pred = [
+            "the cat was under the bed",
+            "i love contributing to KerasNLP",
+        ]
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
 
     def test_tensor_input(self):
-        rouge = Rouge(
-            variant="rouge2", metric_type="f1_score", use_stemmer=False
-        )
+        rouge = RougeN(order=2, use_stemmer=False)
         y_true = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
         )
         y_pred = tf.constant(
-            ["great fun indeed", "contributing to KerasNLP is delightful"]
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.417, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
 
-    def test_rouge_l(self):
-        rouge = Rouge(
-            variant="rougeL", metric_type="f1_score", use_stemmer=False
-        )
+    def test_rank_2_input(self):
+        rouge = RougeN(order=2, use_stemmer=False)
         y_true = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
+            [
+                ["the tiny little cat was found under the big funny bed"],
+                ["i really love contributing to KerasNLP"],
+            ]
         )
         y_pred = tf.constant(
-            ["great fun indeed", "contributing to KerasNLP is delightful"]
+            [["the cat was under the bed"], ["i love contributing to KerasNLP"]]
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
 
-    def test_rouge_l_sum(self):
-        rouge = Rouge(
-            variant="rougeLsum", metric_type="f1_score", use_stemmer=False
-        )
+    def test_incorrect_order(self):
+        with self.assertRaises(ValueError):
+            _ = RougeN(order=10)
+
+    def test_different_order(self):
+        rouge = RougeN(order=3, use_stemmer=False)
         y_true = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
+            [
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
+            ]
         )
         y_pred = tf.constant(
-            ["great fun indeed", "contributing to KerasNLP is delightful"]
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.55, delta=1e-3)
-
-    def test_incorrect_variant(self):
-        with self.assertRaises(ValueError):
-            _ = Rouge(
-                variant="rouge10", metric_type="f1_score", use_stemmer=False
-            )
+        self.assertAlmostEqual(rouge_val.numpy(), 0.286, delta=1e-3)
 
     def test_precision(self):
-        rouge = Rouge(
-            variant="rouge3", metric_type="precision", use_stemmer=False
-        )
+        rouge = RougeN(order=3, metric_type="precision", use_stemmer=False)
         y_true = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
-        )
-        y_pred = tf.constant(
             [
-                "great fun indeed",
-                "KerasNLP is awesome, i love contributing to it",
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
             ]
         )
+        y_pred = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.167, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3)
 
     def test_recall(self):
-        rouge = Rouge(variant="rouge3", metric_type="recall", use_stemmer=False)
+        rouge = RougeN(order=3, metric_type="recall", use_stemmer=False)
         y_true = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
-        )
-        y_pred = tf.constant(
             [
-                "great fun indeed",
-                "KerasNLP is awesome, i love contributing to it",
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
             ]
         )
+        y_pred = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.25, delta=1e-3)
 
     def test_reset_state(self):
-        rouge = Rouge()
+        rouge = RougeN()
         y_true = tf.constant(
             ["hey, this is great fun", "i love contributing to KerasNLP"]
         )
@@ -143,41 +144,41 @@ def test_reset_state(self):
         self.assertEqual(rouge.result(), 0.0)
 
     def test_update_state(self):
-        rouge = Rouge()
+        rouge = RougeN()
         y_true_1 = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
-        )
-        y_pred_1 = tf.constant(
             [
-                "great fun indeed",
-                "KerasNLP is awesome, i love contributing to it",
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
             ]
         )
+        y_pred_1 = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
 
         rouge.update_state(y_true_1, y_pred_1)
         rouge_val = rouge.result()
-        self.assertAlmostEqual(rouge_val.numpy(), 0.439, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
 
         y_true_2 = tf.constant(["what is your favourite show"])
         y_pred_2 = tf.constant(["my favourite show is silicon valley"])
 
         rouge.update_state(y_true_2, y_pred_2)
         rouge_val = rouge.result()
-        self.assertAlmostEqual(rouge_val.numpy(), 0.367, delta=1e-3)
+        self.assertAlmostEqual(rouge_val.numpy(), 0.385, delta=1e-3)
 
     def test_merge_state(self):
-        rouge_1 = Rouge()
-        rouge_2 = Rouge()
+        rouge_1 = RougeN()
+        rouge_2 = RougeN()
 
         y_true_1 = tf.constant(
-            ["hey, this is great fun", "i love contributing to KerasNLP"]
-        )
-        y_pred_1 = tf.constant(
             [
-                "great fun indeed",
-                "KerasNLP is awesome, i love contributing to it",
+                "the tiny little cat was found under the big funny bed",
+                "i really love contributing to KerasNLP",
             ]
         )
+        y_pred_1 = tf.constant(
+            ["the cat was under the bed", "i love contributing to KerasNLP"]
+        )
 
         y_true_2 = tf.constant(["what is your favourite show"])
         y_pred_2 = tf.constant(["my favourite show is silicon valley"])
@@ -187,30 +188,30 @@ def test_merge_state(self):
 
         rouge_1.update_state(y_true_1, y_pred_1)
         rouge_1.update_state(y_true_2, y_pred_2)
-        self.assertAlmostEqual(rouge_1.result().numpy(), 0.367, delta=1e-3)
+        self.assertAlmostEqual(rouge_1.result().numpy(), 0.385, delta=1e-3)
 
         rouge_2.update_state(y_true_3, y_pred_3)
         self.assertAlmostEqual(rouge_2.result().numpy(), 0.222, delta=1e-3)
 
-        merged_rouge = Rouge()
+        merged_rouge = RougeN()
         merged_rouge.merge_state([rouge_1, rouge_2])
-        self.assertAlmostEqual(merged_rouge.result().numpy(), 0.331, delta=1e-3)
+        self.assertAlmostEqual(merged_rouge.result().numpy(), 0.344, delta=1e-3)
 
     def test_get_config(self):
-        rouge = Rouge(
-            variant="rouge5",
+        rouge = RougeN(
+            order=5,
             metric_type="precision",
             use_stemmer=True,
             dtype=tf.float32,
-            name="rouge_test",
+            name="rouge_n_test",
         )
 
         config = rouge.get_config()
         expected_config = {
-            "variant": "rouge5",
+            "order": 5,
             "metric_type": "precision",
             "use_stemmer": True,
             "dtype": tf.float32,
-            "name": "rouge_test",
+            "name": "rouge_n_test",
         }
         self.assertEqual(config, expected_config)

From 748df818b6a3e5530b2110b1eb7ec483a9f547a3 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 3 Jun 2022 18:42:30 +0530
Subject: [PATCH 18/30] Address review comments - III

---
 keras_nlp/metrics/rouge.py        | 186 ++++++++++++++++++++++++
 keras_nlp/metrics/rouge_l.py      | 214 +++++++++++++--------------
 keras_nlp/metrics/rouge_l_test.py |  20 ++-
 keras_nlp/metrics/rouge_n.py      | 230 ++++++++++++++++--------------
 keras_nlp/metrics/rouge_n_test.py |  21 ++-
 5 files changed, 443 insertions(+), 228 deletions(-)
 create mode 100644 keras_nlp/metrics/rouge.py

diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge.py
new file mode 100644
index 0000000000..13467d1cac
--- /dev/null
+++ b/keras_nlp/metrics/rouge.py
@@ -0,0 +1,186 @@
+# Copyright 2022 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ROUGE metric implementation based on `keras.metrics.Metric`."""
+
+
+import tensorflow as tf
+from tensorflow import keras
+
+from keras_nlp.utils.tensor_utils import tensor_to_string_list
+
+try:
+    import rouge_score
+    from rouge_score import rouge_scorer
+except ImportError:
+    rouge_score = None
+
+
+class RougeBase(keras.metrics.Metric):
+    """ROUGE metric.
+    This class implements all the variants of the ROUGE metric - ROUGE-N,
+    ROUGE-L and ROUGE-LSum.
+    Args:
+        variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to
+            "rouge2". For "rougeN", N lies in the range [1, 9].
+        metric_type: string. One of "precision", "recall", "f1_score". Defaults
+            to "f1_score".
+        use_stemmer: bool. Whether Porter Stemmer should be used to strip word
+            suffixes to improve matching. Defaults to False.
+        dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
+               not specified, it defaults to tf.float32.
+        name: string. Name of the metric instance.
+        **kwargs: Other keyword arguments.
+    """
+
+    def __init__(
+        self,
+        variant="rouge2",
+        metric_type="f1_score",
+        use_stemmer=False,
+        dtype=None,
+        name="rouge",
+        **kwargs,
+    ):
+        super().__init__(name=name, dtype=dtype, **kwargs)
+
+        if rouge_score is None:
+            raise ImportError(
+                "ROUGE metric requires the `rouge_score` package. "
+                "Please install it with `pip install rouge-score`."
+            )
+
+        if not tf.as_dtype(self.dtype).is_floating:
+            raise ValueError(
+                "`dtype` must be a floating point type. "
+                f"Received: dtype={dtype}"
+            )
+
+        if metric_type not in ("precision", "recall", "f1_score"):
+            raise ValueError(
+                '`metric_type` must be one of "precision", "recall", '
+                f'"f1_score". Received: metric_type={metric_type}'
+            )
+
+        if variant not in tuple(
+            ("rouge" + str(order) for order in range(1, 10))
+        ) + (
+            "rougeL",
+            "rougeLsum",
+        ):
+            raise ValueError(
+                "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, "
+                "rougeLsum, with N ranging from 1 to 9. Received: "
+                f"variant={variant}"
+            )
+
+        self.variant = variant
+        self.metric_type = metric_type
+        self.use_stemmer = use_stemmer
+
+        # To-do: Add split_summaries and tokenizer options after the maintainers
+        # of rouge_scorer have released a new version.
+        self._rouge_scorer = rouge_scorer.RougeScorer(
+            rouge_types=[self.variant],
+            use_stemmer=use_stemmer,
+        )
+
+        self._rouge_score = self.add_weight(
+            name="rouge_score",
+            initializer="zeros",
+            dtype=self.dtype,
+        )
+        self._number_of_samples = self.add_weight(
+            name="number_of_samples", initializer="zeros", dtype=self.dtype
+        )
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+        # Three possible shapes for y_true and y_pred: Python string,
+        # [batch_size] and [batch_size, 1]. In the latter two cases, we have
+        # strings in the tensor/list.
+
+        def validate_and_fix_rank(input_, tensor_name):
+            if not isinstance(input_, tf.Tensor):
+                input_ = tf.convert_to_tensor(input_)
+
+            if input_.shape.rank == 0:
+                return input_[tf.newaxis]
+            elif input_.shape.rank == 1:
+                return input_
+            elif input_.shape.rank == 2:
+                if input_.shape[1] != 1:
+                    raise ValueError(
+                        f"{tensor_name} must be of shape `[batch_size, 1]`. "
+                        f"Found shape: {input_.shape}"
+                    )
+                else:
+                    return tf.squeeze(input_, axis=1)
+            else:
+                raise ValueError(
+                    f"{tensor_name} must be of rank 0 (scalar input), 1 or 2. "
+                    f"Found rank: {input_.shape.rank}"
+                )
+
+        y_true = validate_and_fix_rank(y_true, "y_true")
+        y_pred = validate_and_fix_rank(y_pred, "y_pred")
+
+        batch_size = tf.shape(y_true)[0]
+
+        def calculate_rouge_score(reference, hypothesis):
+            reference = tensor_to_string_list(reference)
+            hypothesis = tensor_to_string_list(hypothesis)
+            score = self._rouge_scorer.score(reference, hypothesis)[
+                self.variant
+            ]
+
+            if self.metric_type == "precision":
+                score = score.precision
+            elif self.metric_type == "recall":
+                score = score.recall
+            else:
+                score = score.fmeasure
+            return score
+
+        for batch_idx in range(batch_size):
+            score = tf.py_function(
+                func=calculate_rouge_score,
+                inp=[y_true[batch_idx], y_pred[batch_idx]],
+                Tout=self.dtype,
+            )
+            self._rouge_score.assign_add(score)
+
+        self._number_of_samples.assign_add(
+            tf.cast(batch_size, dtype=self.dtype)
+        )
+
+    def result(self):
+        # `divide_no_nan` returns 0 while no samples have been seen, without a
+        # Python-level branch on a variable, so the result is always a tensor.
+        total, count = self._rouge_score, self._number_of_samples
+        return tf.math.divide_no_nan(total, count)
+
+    def reset_state(self):
+        self._rouge_score.assign(0.0)
+        self._number_of_samples.assign(0.0)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "variant": self.variant,
+                "metric_type": self.metric_type,
+                "use_stemmer": self.use_stemmer,
+            }
+        )
+        return config
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 79f897a2da..af888f7795 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -14,19 +14,11 @@
 
 """ROUGE-L metric implementation based on `keras.metrics.Metric`."""
 
-import tensorflow as tf
-from tensorflow import keras
 
-from keras_nlp.utils.tensor_utils import tensor_to_string_list
+from keras_nlp.metrics.rouge import RougeBase
 
-try:
-    import rouge_score
-    from rouge_score import rouge_scorer
-except ImportError:
-    rouge_score = None
 
-
-class RougeL(keras.metrics.Metric):
+class RougeL(RougeBase):
     """ROUGE-L metric.
 
     This class implements the ROUGE-L variant of the ROUGE metric. The ROUGE-L
@@ -34,6 +26,14 @@ class RougeL(keras.metrics.Metric):
     Succinctly put, ROUGE-L is a score based on the length of the longest
     common subsequence present in the reference text and the hypothesis text.
 
+    Note on input shapes:
+    `y_true` and `y_pred` can be of the following types/shapes:
+    1. Python string/scalar input
+    2. Tensor/Python list
+        a. rank 0
+        b. rank 1 (every element in the tensor is a string)
+        c. rank 2 (shape: `(batch_size, 1)`)
+
     Args:
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
@@ -43,6 +43,92 @@ class RougeL(keras.metrics.Metric):
                not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
         **kwargs: Other keyword arguments.
+
+    Examples:
+
+    1. Various Input Types.
+    1.1. Python string.
+    >>> rouge_l = keras_nlp.metrics.RougeL()
+    >>> y_true = "the tiny little cat was found under the big funny bed"
+    >>> y_pred = "the cat was under the bed"
+    >>> rouge_l(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>
+
+    1.2. rank 1 inputs.
+    a. Python list.
+    >>> rouge_l = keras_nlp.metrics.RougeL()
+    >>> y_true = [
+    ...     "the tiny little cat was found under the big funny bed",
+    ...     "i really love contributing to KerasNLP",
+    ... ]
+    >>> y_pred = [
+    ...     "the cat was under the bed",
+    ...     "i love contributing to KerasNLP",
+    ... ]
+    >>>
+    >>> rouge_l(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+
+    b. Tensor
+    >>> rouge_l = keras_nlp.metrics.RougeL()
+    >>> y_true = tf.constant(
+    ...     [
+    ...         "the tiny little cat was found under the big funny bed",
+    ...         "i really love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         "the cat was under the bed",
+    ...         "i love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> rouge_l(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+
+    1.3. rank 2 inputs.
+    >>> rouge_l = keras_nlp.metrics.RougeL()
+    >>> y_true = tf.constant(
+    ...     [
+    ...         ["the tiny little cat was found under the big funny bed"],
+    ...         ["i really love contributing to KerasNLP"],
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         ["the cat was under the bed"],
+    ...         ["i love contributing to KerasNLP"],
+    ...     ]
+    ... )
+    >>> rouge_l(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+
+    2. Output the precision instead of the F1 Score.
+    >>> rouge_l = keras_nlp.metrics.RougeL(metric_type="precision")
+    >>> y_true = tf.constant(
+    ...     [
+    ...         "the tiny little cat was found under the big funny bed",
+    ...         "i really love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         "the cat was under the bed",
+    ...         "i love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> rouge_l(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=1.0>
+
+    3. Pass the metric to `model.compile()`.
+    >>> inputs = keras.Input(shape=(), dtype='string')
+    >>> outputs = tf.strings.lower(inputs)
+    >>> model = keras.Model(inputs, outputs)
+    >>> model.compile(metrics=[keras_nlp.metrics.RougeL()])
+    >>> x = tf.constant(["HELLO THIS IS FUN"])
+    >>> y = tf.constant(["hello this is awesome"])
+    >>> model.evaluate(x, y, return_dict=True)
+    {'loss': 0.0, 'rouge-l': 0.75}
     """
 
     def __init__(
@@ -53,110 +139,16 @@ def __init__(
         name="rouge-l",
         **kwargs,
     ):
-        super().__init__(name=name, dtype=dtype, **kwargs)
-
-        if rouge_score is None:
-            raise ImportError(
-                "ROUGE metric requires the `rouge_score` package. "
-                "Please install it with `pip install rouge-score`."
-            )
-
-        if not tf.as_dtype(self.dtype).is_floating:
-            raise ValueError(
-                "`dtype` must be a floating point type. "
-                f"Received: dtype={dtype}"
-            )
-
-        if metric_type not in ("precision", "recall", "f1_score"):
-            raise ValueError(
-                '`metric_type` must be one of "precision", "recall", '
-                f'"f1_score". Received: metric_type={metric_type}'
-            )
-
-        self.metric_type = metric_type
-        self.use_stemmer = use_stemmer
-
-        # To-do: Add an option for adding custom tokenizer after the maintainers
-        # of rouge-score have released a new version.
-        self._rouge_l_scorer = rouge_scorer.RougeScorer(
-            rouge_types=["rougeL"],
+        super().__init__(
+            variant="rougeL",
+            metric_type=metric_type,
             use_stemmer=use_stemmer,
+            dtype=dtype,
+            name=name,
+            **kwargs,
         )
 
-        self._rouge_l_score = self.add_weight(
-            name="rouge_l_score",
-            initializer="zeros",
-            dtype=self.dtype,
-        )
-        self._number_of_samples = self.add_weight(
-            name="number_of_samples", initializer="zeros", dtype=self.dtype
-        )
-
-    def update_state(self, y_true, y_pred, sample_weight=None):
-        # Three possible shapes for y_true and y_pred: Python string,
-        # [batch_size] and [batch_size, 1]. In the latter two cases, we have
-        # strings in the tensor/list.
-
-        # Check if input is a raw string/list.
-        if isinstance(y_true, str):
-            y_true = tf.constant([y_true])
-        elif isinstance(y_true, list):
-            y_true = tf.constant(y_true)
-        if isinstance(y_pred, str):
-            y_pred = tf.constant([y_pred])
-        elif isinstance(y_pred, list):
-            y_pred = tf.constant(y_pred)
-
-        # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to
-        # [batch_size].
-        if y_true.shape.rank == 2:
-            y_true = tf.squeeze(y_true, axis=1)
-        if y_pred.shape.rank == 2:
-            y_pred = tf.squeeze(y_pred, axis=1)
-
-        batch_size = tf.shape(y_true)[0]
-
-        def _calculate_rouge_l_score(reference, hypothesis):
-            reference = tensor_to_string_list(reference)
-            hypothesis = tensor_to_string_list(hypothesis)
-            score = self._rouge_l_scorer.score(reference, hypothesis)["rougeL"]
-
-            if self.metric_type == "precision":
-                score = score.precision
-            elif self.metric_type == "recall":
-                score = score.recall
-            else:
-                score = score.fmeasure
-            return score
-
-        for batch_idx in range(batch_size):
-            score = tf.py_function(
-                func=_calculate_rouge_l_score,
-                inp=[y_true[batch_idx], y_pred[batch_idx]],
-                Tout=self.dtype,
-            )
-            self._rouge_l_score.assign_add(score)
-
-        self._number_of_samples.assign_add(
-            tf.cast(batch_size, dtype=self.dtype)
-        )
-
-    def result(self):
-        if self._number_of_samples == 0:
-            return 0.0
-        rouge_l_score = self._rouge_l_score / self._number_of_samples
-        return rouge_l_score
-
-    def reset_state(self):
-        self._rouge_l_score.assign(0.0)
-        self._number_of_samples.assign(0.0)
-
     def get_config(self):
         config = super().get_config()
-        config.update(
-            {
-                "metric_type": self.metric_type,
-                "use_stemmer": self.use_stemmer,
-            }
-        )
+        del config["variant"]
         return config
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
index a3e4250dae..216686273d 100644
--- a/keras_nlp/metrics/rouge_l_test.py
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -15,6 +15,7 @@
 """Tests for RougeL."""
 
 import tensorflow as tf
+from tensorflow import keras
 
 from keras_nlp.metrics import RougeL
 
@@ -76,6 +77,19 @@ def test_rank_2_input(self):
         rouge_val = rouge(y_true, y_pred)
         self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
 
+    def model_compile(self):
+        inputs = keras.Input(shape=(), dtype="string")
+        outputs = tf.strings.lower(inputs)
+        model = keras.Model(inputs, outputs)
+
+        model.compile(metrics=[RougeL()])
+
+        x = tf.constant(["HELLO THIS IS FUN"])
+        y = tf.constant(["hello this is awesome"])
+
+        output = model.evaluate(x, y, return_dict=True)
+        self.assertAlmostEqual(output["rouge-l"], 0.75, delta=1e-3)
+
     def test_precision(self):
         rouge = RougeL(metric_type="precision", use_stemmer=False)
         y_true = tf.constant(
@@ -187,10 +201,8 @@ def test_get_config(self):
         )
 
         config = rouge.get_config()
-        expected_config = {
+        expected_config_subset = {
             "metric_type": "precision",
             "use_stemmer": True,
-            "dtype": tf.float32,
-            "name": "rouge_l_test",
         }
-        self.assertEqual(config, expected_config)
+        self.assertEqual(config, {**config, **expected_config_subset})
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index ad8e2288d9..d243a39a44 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -14,19 +14,11 @@
 
 """ROUGE-N metric implementation based on `keras.metrics.Metric`."""
 
-import tensorflow as tf
-from tensorflow import keras
 
-from keras_nlp.utils.tensor_utils import tensor_to_string_list
+from keras_nlp.metrics.rouge import RougeBase
 
-try:
-    import rouge_score
-    from rouge_score import rouge_scorer
-except ImportError:
-    rouge_score = None
 
-
-class RougeN(keras.metrics.Metric):
+class RougeN(RougeBase):
     """ROUGE-N metric.
 
     This class implements the ROUGE-N variant of the ROUGE metric. The ROUGE-N
@@ -34,6 +26,14 @@ class RougeN(keras.metrics.Metric):
     Succinctly put, ROUGE-N is a score based on the number of matching n-grams
     between the reference text and the hypothesis text.
 
+    Note on input shapes:
+    `y_true` and `y_pred` can be of the following types/shapes:
+    1. Python string/scalar input
+    2. Tensor/Python list
+        a. rank 0
+        b. rank 1 (every element in the tensor is a string)
+        c. rank 2 (shape: `(batch_size, 1)`)
+
     Args:
         order: The order of n-grams which are to be matched. It should lie in
             range [1, 9]. Defaults to 2.
@@ -45,6 +45,108 @@ class RougeN(keras.metrics.Metric):
                not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
         **kwargs: Other keyword arguments.
+
+    Examples:
+
+    1. Various Input Types.
+    1.1. Python string.
+    >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
+    >>> y_true = "the tiny little cat was found under the big funny bed"
+    >>> y_pred = "the cat was under the bed"
+    >>> rouge_n(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>
+
+    1.2. rank 1 inputs.
+    a. Python list.
+    >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
+    >>> y_true = [
+    ...     "the tiny little cat was found under the big funny bed",
+    ...     "i really love contributing to KerasNLP",
+    ... ]
+    >>> y_pred = [
+    ...     "the cat was under the bed",
+    ...     "i love contributing to KerasNLP",
+    ... ]
+    >>> rouge_n(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+
+    b. Tensor.
+    >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
+    >>> y_true = tf.constant(
+    ...     [
+    ...         "the tiny little cat was found under the big funny bed",
+    ...         "i really love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         "the cat was under the bed",
+    ...         "i love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> rouge_n(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+
+    1.3. rank 2 inputs.
+    >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
+    >>> y_true = tf.constant(
+    ...     [
+    ...         ["the tiny little cat was found under the big funny bed"],
+    ...         ["i really love contributing to KerasNLP"],
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         ["the cat was under the bed"],
+    ...         ["i love contributing to KerasNLP"],
+    ...     ]
+    ... )
+    >>> rouge_n(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+
+    2. Consider trigrams for calculating ROUGE-N.
+    >>> rouge_n = keras_nlp.metrics.RougeN(order=3)
+    >>> y_true = tf.constant(
+    ...     [
+    ...         "the tiny little cat was found under the big funny bed",
+    ...         "i really love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         "the cat was under the bed",
+    ...         "i love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> rouge_n(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>
+
+    3. Output the precision instead of the F1 Score.
+    >>> rouge_n = keras_nlp.metrics.RougeN(order=3, metric_type="precision")
+    >>> y_true = tf.constant(
+    ...     [
+    ...         "the tiny little cat was found under the big funny bed",
+    ...         "i really love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> y_pred = tf.constant(
+    ...     [
+    ...         "the cat was under the bed",
+    ...         "i love contributing to KerasNLP",
+    ...     ]
+    ... )
+    >>> rouge_n(y_true, y_pred)
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>
+
+    4. Pass the metric to `model.compile()`.
+    >>> inputs = keras.Input(shape=(), dtype='string')
+    >>> outputs = tf.strings.lower(inputs)
+    >>> model = keras.Model(inputs, outputs)
+    >>> model.compile(metrics=[keras_nlp.metrics.RougeN()])
+    >>> x = tf.constant(["HELLO THIS IS FUN"])
+    >>> y = tf.constant(["hello this is awesome"])
+    >>> model.evaluate(x, y, return_dict=True)
+    {'loss': 0.0, 'rouge-n': 0.6666666865348816}
     """
 
     def __init__(
@@ -56,120 +158,30 @@ def __init__(
         name="rouge-n",
         **kwargs,
     ):
-        super().__init__(name=name, dtype=dtype, **kwargs)
-
-        if rouge_score is None:
-            raise ImportError(
-                "ROUGE metric requires the `rouge_score` package. "
-                "Please install it with `pip install rouge-score`."
-            )
-
-        if not tf.as_dtype(self.dtype).is_floating:
-            raise ValueError(
-                "`dtype` must be a floating point type. "
-                f"Received: dtype={dtype}"
-            )
-
         if order not in range(1, 10):
             raise ValueError(
                 "Invalid `order` value. Should lie in the range [1, 9]."
                 f"Received order={order}"
             )
 
-        if metric_type not in ("precision", "recall", "f1_score"):
-            raise ValueError(
-                '`metric_type` must be one of "precision", "recall", '
-                f'"f1_score". Received: metric_type={metric_type}'
-            )
-
-        self.order = order
-        self.metric_type = metric_type
-        self.use_stemmer = use_stemmer
-
-        # To-do: Add an option for adding custom tokenizer after the maintainers
-        # of rouge-score have released a new version.
-        self._rouge_n_scorer = rouge_scorer.RougeScorer(
-            rouge_types=["rouge" + str(order)],
+        super().__init__(
+            variant=f"rouge{order}",
+            metric_type=metric_type,
             use_stemmer=use_stemmer,
+            dtype=dtype,
+            name=name,
+            **kwargs,
         )
 
-        self._rouge_n_score = self.add_weight(
-            name="rouge_n_score",
-            initializer="zeros",
-            dtype=self.dtype,
-        )
-        self._number_of_samples = self.add_weight(
-            name="number_of_samples", initializer="zeros", dtype=self.dtype
-        )
-
-    def update_state(self, y_true, y_pred, sample_weight=None):
-        # Three possible shapes for y_true and y_pred: Python string,
-        # [batch_size] and [batch_size, 1]. In the latter two cases, we have
-        # strings in the tensor/list.
-
-        # Check if input is a raw string/list.
-        if isinstance(y_true, str):
-            y_true = tf.constant([y_true])
-        elif isinstance(y_true, list):
-            y_true = tf.constant(y_true)
-        if isinstance(y_pred, str):
-            y_pred = tf.constant([y_pred])
-        elif isinstance(y_pred, list):
-            y_pred = tf.constant(y_pred)
-
-        # If the shape of y_true and y_pred is [batch_size, 1], squeeze it to
-        # [batch_size].
-        if y_true.shape.rank == 2:
-            y_true = tf.squeeze(y_true, axis=1)
-        if y_pred.shape.rank == 2:
-            y_pred = tf.squeeze(y_pred, axis=1)
-
-        batch_size = tf.shape(y_true)[0]
-
-        def _calculate_rouge_n_score(reference, hypothesis):
-            reference = tensor_to_string_list(reference)
-            hypothesis = tensor_to_string_list(hypothesis)
-            score = self._rouge_n_scorer.score(reference, hypothesis)[
-                "rouge" + str(self.order)
-            ]
-
-            if self.metric_type == "precision":
-                score = score.precision
-            elif self.metric_type == "recall":
-                score = score.recall
-            else:
-                score = score.fmeasure
-            return score
-
-        for batch_idx in range(batch_size):
-            score = tf.py_function(
-                func=_calculate_rouge_n_score,
-                inp=[y_true[batch_idx], y_pred[batch_idx]],
-                Tout=self.dtype,
-            )
-            self._rouge_n_score.assign_add(score)
-
-        self._number_of_samples.assign_add(
-            tf.cast(batch_size, dtype=self.dtype)
-        )
-
-    def result(self):
-        if self._number_of_samples == 0:
-            return 0.0
-        rouge_n_score = self._rouge_n_score / self._number_of_samples
-        return rouge_n_score
-
-    def reset_state(self):
-        self._rouge_n_score.assign(0.0)
-        self._number_of_samples.assign(0.0)
+        self.order = order
 
     def get_config(self):
         config = super().get_config()
+        del config["variant"]
+
         config.update(
             {
                 "order": self.order,
-                "metric_type": self.metric_type,
-                "use_stemmer": self.use_stemmer,
             }
         )
         return config
diff --git a/keras_nlp/metrics/rouge_n_test.py b/keras_nlp/metrics/rouge_n_test.py
index 983902f1e5..be008e3800 100644
--- a/keras_nlp/metrics/rouge_n_test.py
+++ b/keras_nlp/metrics/rouge_n_test.py
@@ -15,6 +15,7 @@
 """Tests for RougeN."""
 
 import tensorflow as tf
+from tensorflow import keras
 
 from keras_nlp.metrics import RougeN
 
@@ -76,6 +77,19 @@ def test_rank_2_input(self):
         rouge_val = rouge(y_true, y_pred)
         self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
 
+    def model_compile(self):
+        inputs = keras.Input(shape=(), dtype="string")
+        outputs = tf.strings.lower(inputs)
+        model = keras.Model(inputs, outputs)
+
+        model.compile(metrics=[RougeN()])
+
+        x = tf.constant(["HELLO THIS IS FUN"])
+        y = tf.constant(["hello this is awesome"])
+
+        output = model.evaluate(x, y, return_dict=True)
+        self.assertAlmostEqual(output["rouge-n"], 0.667, delta=1e-3)
+
     def test_incorrect_order(self):
         with self.assertRaises(ValueError):
             _ = RougeN(order=10)
@@ -207,11 +221,10 @@ def test_get_config(self):
         )
 
         config = rouge.get_config()
-        expected_config = {
+        expected_config_subset = {
             "order": 5,
             "metric_type": "precision",
             "use_stemmer": True,
-            "dtype": tf.float32,
-            "name": "rouge_n_test",
         }
-        self.assertEqual(config, expected_config)
+
+        self.assertEqual(config, {**config, **expected_config_subset})

From a793d3d97d24f08735fdbc93439a353059ddb72b Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 3 Jun 2022 18:50:42 +0530
Subject: [PATCH 19/30] Fix model.compile error in doc-string

---
 keras_nlp/metrics/rouge_l.py | 5 +++--
 keras_nlp/metrics/rouge_n.py | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index af888f7795..1fa0ad09fb 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -127,8 +127,9 @@ class RougeL(RougeBase):
     >>> model.compile(metrics=[keras_nlp.metrics.RougeL()])
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
-    >>> model.evaluate(x, y, return_dict=True)
-    {'loss': 0.0, 'rouge-l': 0.75}
+    >>> metric_dict = model.evaluate(x, y, return_dict=True)
+    >>> metric_dict["rouge-l"]
+    0.75
     """
 
     def __init__(
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index d243a39a44..37a86f5207 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -145,8 +145,9 @@ class RougeN(RougeBase):
     >>> model.compile(metrics=[keras_nlp.metrics.RougeN()])
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
-    >>> model.evaluate(x, y, return_dict=True)
-    {'loss': 0.0, 'rouge-n': 0.6666666865348816}
+    >>> metric_dict = model.evaluate(x, y, return_dict=True)
+    >>> metric_dict["rouge-n"]
+    0.6666666865348816
     """
 
     def __init__(

From b8dae75b58cee6cc280319d2c9f4eb4f1d962843 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Sun, 5 Jun 2022 13:26:01 +0530
Subject: [PATCH 20/30] Rename rouge.py to rouge_base.py

---
 keras_nlp/metrics/{rouge.py => rouge_base.py} | 0
 keras_nlp/metrics/rouge_l.py                  | 2 +-
 keras_nlp/metrics/rouge_n.py                  | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename keras_nlp/metrics/{rouge.py => rouge_base.py} (100%)

diff --git a/keras_nlp/metrics/rouge.py b/keras_nlp/metrics/rouge_base.py
similarity index 100%
rename from keras_nlp/metrics/rouge.py
rename to keras_nlp/metrics/rouge_base.py
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 1fa0ad09fb..3d18b5e1e0 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -15,7 +15,7 @@
 """ROUGE-L metric implementation based on `keras.metrics.Metric`."""
 
 
-from keras_nlp.metrics.rouge import RougeBase
+from keras_nlp.metrics.rouge_base import RougeBase
 
 
 class RougeL(RougeBase):
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index 37a86f5207..8ab24b76e8 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -15,7 +15,7 @@
 """ROUGE-N metric implementation based on `keras.metrics.Metric`."""
 
 
-from keras_nlp.metrics.rouge import RougeBase
+from keras_nlp.metrics.rouge_base import RougeBase
 
 
 class RougeN(RougeBase):

From 80500863b58e0a03b01cebe3007882bf20567f3a Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Tue, 7 Jun 2022 14:18:19 +0530
Subject: [PATCH 21/30] Address review comments - IV

---
 keras_nlp/metrics/rouge_base.py | 38 ++++++++++++++++++---------------
 keras_nlp/metrics/rouge_l.py    |  8 ++-----
 keras_nlp/metrics/rouge_n.py    |  8 ++-----
 3 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 13467d1cac..1340474aef 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -29,8 +29,14 @@
 
 class RougeBase(keras.metrics.Metric):
     """ROUGE metric.
+
     This class implements all the variants of the ROUGE metric - ROUGE-N,
     ROUGE-L and ROUGE-LSum.
+
+    Note on input shapes:
+    For `y_true` and `y_pred`, this class supports scalar values and batch
+    inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`.
+
     Args:
         variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to
             "rouge2". For "rougeN", N lies in the range [1, 9].
@@ -39,7 +45,7 @@ class RougeBase(keras.metrics.Metric):
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
-               not specified, it defaults to tf.float32.
+            not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
         **kwargs: Other keyword arguments.
     """
@@ -76,12 +82,10 @@ def __init__(
         if variant not in tuple(
             ("rouge" + str(order) for order in range(1, 10))
         ) + (
-            "rougeL",
-            "rougeLsum",
-        ):
+            "rougeL",):
             raise ValueError(
                 "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, "
-                "rougeLsum, with N ranging from 1 to 9. Received: "
+                "with N ranging from 1 to 9. Received: "
                 f"variant={variant}"
             )
 
@@ -110,26 +114,26 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         # [batch_size] and [batch_size, 1]. In the latter two cases, we have
         # strings in the tensor/list.
 
-        def validate_and_fix_rank(input_, tensor_name):
-            if not isinstance(input_, tf.Tensor):
-                input_ = tf.convert_to_tensor(input_)
+        def validate_and_fix_rank(inputs, tensor_name):
+            if not isinstance(inputs, tf.Tensor):
+                inputs = tf.convert_to_tensor(inputs)
 
-            if input_.shape.rank == 0:
-                return input_[tf.newaxis]
-            elif input_.shape.rank == 1:
-                return input_
-            elif input_.shape.rank == 2:
-                if input_.shape[1] != 1:
+            if inputs.shape.rank == 0:
+                return inputs[tf.newaxis]
+            elif inputs.shape.rank == 1:
+                return inputs
+            elif inputs.shape.rank == 2:
+                if inputs.shape[1] != 1:
                     raise ValueError(
                         f"{tensor_name} must be of shape `[batch_size, 1]`. "
-                        f"Found shape: {input_.shape}"
+                        f"Found shape: {inputs.shape}"
                     )
                 else:
-                    return tf.squeeze(input_, axis=1)
+                    return tf.squeeze(inputs, axis=1)
             else:
                 raise ValueError(
                     f"{tensor_name} must be of rank 0 (scalar input), 1 or 2. "
-                    f"Found rank: {input_.shape.rank}"
+                    f"Found rank: {inputs.shape.rank}"
                 )
 
         y_true = validate_and_fix_rank(y_true, "y_true")
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 3d18b5e1e0..3bfb77caa7 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -27,12 +27,8 @@ class RougeL(RougeBase):
     common subsequence present in the reference text and the hypothesis text.
 
     Note on input shapes:
-    `y_true` and `y_pred` can be of the following types/shapes:
-    1. Python string/scalar input
-    2. Tensor/Python list
-        a. rank 0
-        b. rank 1 (every element in the tensor is a string)
-        c. rank 2 (shape: `(batch_size, 1)`)
+    For `y_true` and `y_pred`, this class supports scalar values and batch
+    inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`.
 
     Args:
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index 8ab24b76e8..57d611d928 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -27,12 +27,8 @@ class RougeN(RougeBase):
     between the reference text and the hypothesis text.
 
     Note on input shapes:
-    `y_true` and `y_pred` can be of the following types/shapes:
-    1. Python string/scalar input
-    2. Tensor/Python list
-        a. rank 0
-        b. rank 1 (every element in the tensor is a string)
-        c. rank 2 (shape: `(batch_size, 1)`)
+    For `y_true` and `y_pred`, this class supports scalar values and batch
+    inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`.
 
     Args:
         order: The order of n-grams which are to be matched. It should lie in

From da44d22335d927d68922ad9df6f43d30dfe56f6c Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Tue, 7 Jun 2022 14:19:04 +0530
Subject: [PATCH 22/30] Address review comments - IV (reformat variant check)

---
 keras_nlp/metrics/rouge_base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 1340474aef..41bfa722cf 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -81,8 +81,7 @@ def __init__(
 
         if variant not in tuple(
             ("rouge" + str(order) for order in range(1, 10))
-        ) + (
-            "rougeL",):
+        ) + ("rougeL",):
             raise ValueError(
                 "Invalid variant of ROUGE. Should be one of: rougeN, rougeL, "
                 "with N ranging from 1 to 9. Received: "

From f8c05aacc34cfa111321173bf5298bd184347a9f Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 10 Jun 2022 11:24:46 +0530
Subject: [PATCH 23/30] Return dict from ROUGE

---
 keras_nlp/metrics/rouge_base.py   |  85 ++++++++++++------
 keras_nlp/metrics/rouge_l.py      |  67 +++++++-------
 keras_nlp/metrics/rouge_l_test.py | 134 ++++++++++++++++++----------
 keras_nlp/metrics/rouge_n.py      |  70 ++++++++-------
 keras_nlp/metrics/rouge_n_test.py | 142 ++++++++++++++++++++----------
 5 files changed, 317 insertions(+), 181 deletions(-)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 41bfa722cf..069ef14226 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -15,6 +15,8 @@
 """ROUGE metric implementation based on `keras.metrics.Metric`."""
 
 
+import types
+
 import tensorflow as tf
 from tensorflow import keras
 
@@ -40,8 +42,6 @@ class RougeBase(keras.metrics.Metric):
     Args:
         variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to
             "rouge2". For "rougeN", N lies in the range [1, 9].
-        metric_type: string. One of "precision", "recall", "f1_score". Defaults
-            to "f1_score".
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
@@ -53,7 +53,6 @@ class RougeBase(keras.metrics.Metric):
     def __init__(
         self,
         variant="rouge2",
-        metric_type="f1_score",
         use_stemmer=False,
         dtype=None,
         name="rouge",
@@ -73,12 +72,6 @@ def __init__(
                 f"Received: dtype={dtype}"
             )
 
-        if metric_type not in ("precision", "recall", "f1_score"):
-            raise ValueError(
-                '`metric_type` must be one of "precision", "recall", '
-                f'"f1_score". Received: metric_type={metric_type}'
-            )
-
         if variant not in tuple(
             ("rouge" + str(order) for order in range(1, 10))
         ) + ("rougeL",):
@@ -89,7 +82,6 @@ def __init__(
             )
 
         self.variant = variant
-        self.metric_type = metric_type
         self.use_stemmer = use_stemmer
 
         # To-do: Add split_summaries and tokenizer options after the maintainers
@@ -99,15 +91,46 @@ def __init__(
             use_stemmer=use_stemmer,
         )
 
-        self._rouge_score = self.add_weight(
-            name="rouge_score",
+        self._rouge_precision = self.add_weight(
+            name="rouge_precision",
             initializer="zeros",
             dtype=self.dtype,
         )
+        self._rouge_recall = self.add_weight(
+            name="rouge_recall",
+            initializer="zeros",
+            dtype=self.dtype,
+        )
+        self._rouge_f1_score = self.add_weight(
+            name="rouge_f1_score",
+            initializer="zeros",
+            dtype=self.dtype,
+        )
+
         self._number_of_samples = self.add_weight(
             name="number_of_samples", initializer="zeros", dtype=self.dtype
         )
 
+    def __new__(cls, *args, **kwargs):
+        # Temporary workaround for Keras bug with dictionary return types.
+        # Wraps `result()` with a python dictionary that also supports variable
+        # assignment. We have to do this with __new__ because the base metric
+        # class wraps the `results()` method.
+        obj = super().__new__(cls)
+
+        class MetricDict(dict):
+            """A dictionary that supports variable assignment."""
+
+            pass
+
+        def wrap_result(result_fn):
+            return tf.__internal__.decorator.make_decorator(
+                result_fn, lambda obj, *args: MetricDict(result_fn(*args))
+            )
+
+        obj.result = types.MethodType(wrap_result(obj.result), obj)
+        return obj
+
     def update_state(self, y_true, y_pred, sample_weight=None):
         # Three possible shapes for y_true and y_pred: Python string,
         # [batch_size] and [batch_size, 1]. In the latter two cases, we have
@@ -146,14 +169,10 @@ def calculate_rouge_score(reference, hypothesis):
             score = self._rouge_scorer.score(reference, hypothesis)[
                 self.variant
             ]
-
-            if self.metric_type == "precision":
-                score = score.precision
-            elif self.metric_type == "recall":
-                score = score.recall
-            else:
-                score = score.fmeasure
-            return score
+            return tf.cast(
+                tf.constant([score.precision, score.recall, score.fmeasure]),
+                dtype=self.dtype,
+            )
 
         for batch_idx in range(batch_size):
             score = tf.py_function(
@@ -161,7 +180,9 @@ def calculate_rouge_score(reference, hypothesis):
                 inp=[y_true[batch_idx], y_pred[batch_idx]],
                 Tout=self.dtype,
             )
-            self._rouge_score.assign_add(score)
+            self._rouge_precision.assign_add(score[0])
+            self._rouge_recall.assign_add(score[1])
+            self._rouge_f1_score.assign_add(score[2])
 
         self._number_of_samples.assign_add(
             tf.cast(batch_size, dtype=self.dtype)
@@ -169,12 +190,25 @@ def calculate_rouge_score(reference, hypothesis):
 
     def result(self):
         if self._number_of_samples == 0:
-            return 0.0
-        rouge_score = self._rouge_score / self._number_of_samples
-        return rouge_score
+            return {
+                f"{self.name}_precision": 0.0,
+                f"{self.name}_recall": 0.0,
+                f"{self.name}_f1_score": 0.0,
+            }
+
+        rouge_precision = self._rouge_precision / self._number_of_samples
+        rouge_recall = self._rouge_recall / self._number_of_samples
+        rouge_f1_score = self._rouge_f1_score / self._number_of_samples
+        return {
+            f"{self.name}_precision": rouge_precision,
+            f"{self.name}_recall": rouge_recall,
+            f"{self.name}_f1_score": rouge_f1_score,
+        }
 
     def reset_state(self):
-        self._rouge_score.assign(0.0)
+        self._rouge_precision.assign(0.0)
+        self._rouge_recall.assign(0.0)
+        self._rouge_f1_score.assign(0.0)
         self._number_of_samples.assign(0.0)
 
     def get_config(self):
@@ -182,7 +216,6 @@ def get_config(self):
         config.update(
             {
                 "variant": self.variant,
-                "metric_type": self.metric_type,
                 "use_stemmer": self.use_stemmer,
             }
         )
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 3bfb77caa7..0c1a907d7c 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -33,8 +33,6 @@ class RougeL(RougeBase):
     Args:
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
-        metric_type: string. One of "precision", "recall", "f1_score". Defaults
-            to "f1_score".
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
                not specified, it defaults to tf.float32.
         name: string. Name of the metric instance.
@@ -48,7 +46,13 @@ class RougeL(RougeBase):
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
     >>> rouge_l(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>
+    {
+        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
+        'rouge-l_recall':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.54545456>,
+        'rouge-l_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>
+    }
 
     1.2. rank 1 inputs.
     a. Python list.
@@ -61,9 +65,14 @@ class RougeL(RougeBase):
     ...     "the cat was under the bed",
     ...     "i love contributing to KerasNLP",
     ... ]
-    >>>
     >>> rouge_l(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+    {
+        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
+        'rouge-l_recall':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>,
+        'rouge-l_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+    }
 
     b. Tensor
     >>> rouge_l = keras_nlp.metrics.RougeL()
@@ -80,7 +89,13 @@ class RougeL(RougeBase):
     ...     ]
     ... )
     >>> rouge_l(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+    {
+        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
+        'rouge-l_recall':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>,
+        'rouge-l_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+    }
 
     1.3. rank 2 inputs.
     >>> rouge_l = keras_nlp.metrics.RougeL()
@@ -97,26 +112,15 @@ class RougeL(RougeBase):
     ...     ]
     ... )
     >>> rouge_l(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
-
-    3. Output the precision instead of the F1 Score.
-    >>> rouge_l = keras_nlp.metrics.RougeL(metric_type="precision")
-    >>> y_true = tf.constant(
-    ...     [
-    ...         "the tiny little cat was found under the big funny bed",
-    ...         "i really love contributing to KerasNLP",
-    ...     ]
-    ... )
-    >>> y_pred = tf.constant(
-    ...     [
-    ...         "the cat was under the bed",
-    ...         "i love contributing to KerasNLP",
-    ...     ]
-    ... )
-    >>> rouge_l(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=1.0>
-
-    4. Pass the metric to `model.compile()`.
+    {
+        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
+        'rouge-l_recall':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>,
+        'rouge-l_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
+    }
+
+    3. Pass the metric to `model.compile()`.
     >>> inputs = keras.Input(shape=(), dtype='string')
     >>> outputs = tf.strings.lower(inputs)
     >>> model = keras.Model(inputs, outputs)
@@ -124,13 +128,17 @@ class RougeL(RougeBase):
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
-    >>> metric_dict["rouge-l"]
-    0.75
+    >>> metric_dict
+    {
+        'loss': 0.0,
+        'rouge-l_precision': 0.75,
+        'rouge-l_recall': 0.75,
+        'rouge-l_f1_score': 0.75
+    }
     """
 
     def __init__(
         self,
-        metric_type="f1_score",
         use_stemmer=False,
         dtype=None,
         name="rouge-l",
@@ -138,7 +146,6 @@ def __init__(
     ):
         super().__init__(
             variant="rougeL",
-            metric_type=metric_type,
             use_stemmer=use_stemmer,
             dtype=dtype,
             name=name,
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
index 216686273d..af3f306990 100644
--- a/keras_nlp/metrics/rouge_l_test.py
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -21,9 +21,20 @@
 
 
 class RougeLTest(tf.test.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.metric_types = (
+            "rouge-l_precision",
+            "rouge-l_recall",
+            "rouge-l_f1_score",
+        )
+
     def test_initialization(self):
         rouge = RougeL()
-        self.assertEqual(rouge.result().numpy(), 0.0)
+        result = rouge.result()
+
+        for metric_type in self.metric_types:
+            self.assertEqual(result[metric_type].numpy(), 0.0)
 
     def test_string_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -31,7 +42,12 @@ def test_string_input(self):
         y_pred = "the cat was under the bed"
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.706, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [1, 0.545, 0.706]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_string_list_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -45,7 +61,12 @@ def test_string_list_input(self):
         ]
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [1, 0.689, 0.807]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_tensor_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -60,7 +81,12 @@ def test_tensor_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [1, 0.689, 0.807]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_rank_2_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -75,9 +101,14 @@ def test_rank_2_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
-
-    def model_compile(self):
+        for metric_type, expected_val in zip(
+            self.metric_types, [1, 0.689, 0.807]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
+
+    def test_model_compile(self):
         inputs = keras.Input(shape=(), dtype="string")
         outputs = tf.strings.lower(inputs)
         model = keras.Model(inputs, outputs)
@@ -88,37 +119,12 @@ def model_compile(self):
         y = tf.constant(["hello this is awesome"])
 
         output = model.evaluate(x, y, return_dict=True)
-        self.assertAlmostEqual(output["rouge-l"], 0.75, delta=1e-3)
-
-    def test_precision(self):
-        rouge = RougeL(metric_type="precision", use_stemmer=False)
-        y_true = tf.constant(
-            [
-                "the tiny little cat was found under the big funny bed",
-                "i really love contributing to KerasNLP",
-            ]
-        )
-        y_pred = tf.constant(
-            ["the cat was under the bed", "i love contributing to KerasNLP"]
-        )
-
-        rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 1, delta=1e-3)
-
-    def test_recall(self):
-        rouge = RougeL(metric_type="recall", use_stemmer=False)
-        y_true = tf.constant(
-            [
-                "the tiny little cat was found under the big funny bed",
-                "i really love contributing to KerasNLP",
-            ]
-        )
-        y_pred = tf.constant(
-            ["the cat was under the bed", "i love contributing to KerasNLP"]
-        )
-
-        rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.689, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.75, 0.75, 0.75]
+        ):
+            self.assertAlmostEqual(
+                output[metric_type], expected_val, delta=1e-3
+            )
 
     def test_reset_state(self):
         rouge = RougeL()
@@ -133,10 +139,18 @@ def test_reset_state(self):
         )
 
         rouge.update_state(y_true, y_pred)
-        self.assertNotEqual(rouge.result(), 0.0)
+        rouge_val = rouge.result()
+        for metric_type, unexpected_val in zip(
+            self.metric_types, [0.0, 0.0, 0.0]
+        ):
+            self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val)
 
         rouge.reset_state()
-        self.assertEqual(rouge.result(), 0.0)
+        rouge_val = rouge.result()
+        for metric_type, unexpected_val in zip(
+            self.metric_types, [0.0, 0.0, 0.0]
+        ):
+            self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val)
 
     def test_update_state(self):
         rouge = RougeL()
@@ -152,14 +166,24 @@ def test_update_state(self):
 
         rouge.update_state(y_true_1, y_pred_1)
         rouge_val = rouge.result()
-        self.assertAlmostEqual(rouge_val.numpy(), 0.807, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [1, 0.689, 0.807]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
         y_true_2 = tf.constant(["what is your favourite show"])
         y_pred_2 = tf.constant(["my favourite show is silicon valley"])
 
         rouge.update_state(y_true_2, y_pred_2)
         rouge_val = rouge.result()
-        self.assertAlmostEqual(rouge_val.numpy(), 0.659, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.778, 0.593, 0.66]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_merge_state(self):
         rouge_1 = RougeL()
@@ -183,18 +207,35 @@ def test_merge_state(self):
 
         rouge_1.update_state(y_true_1, y_pred_1)
         rouge_1.update_state(y_true_2, y_pred_2)
-        self.assertAlmostEqual(rouge_1.result().numpy(), 0.659, delta=1e-3)
+        rouge_val = rouge_1.result()
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.778, 0.593, 0.66]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
         rouge_2.update_state(y_true_3, y_pred_3)
-        self.assertAlmostEqual(rouge_2.result().numpy(), 0.364, delta=1e-3)
+        rouge_val = rouge_2.result()
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.333, 0.4, 0.364]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
         merged_rouge = RougeL()
         merged_rouge.merge_state([rouge_1, rouge_2])
-        self.assertAlmostEqual(merged_rouge.result().numpy(), 0.586, delta=1e-3)
+        rouge_val = merged_rouge.result()
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.667, 0.545, 0.586]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_get_config(self):
         rouge = RougeL(
-            metric_type="precision",
             use_stemmer=True,
             dtype=tf.float32,
             name="rouge_l_test",
@@ -202,7 +243,6 @@ def test_get_config(self):
 
         config = rouge.get_config()
         expected_config_subset = {
-            "metric_type": "precision",
             "use_stemmer": True,
         }
         self.assertEqual(config, {**config, **expected_config_subset})
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index 57d611d928..4030066ecd 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -33,8 +33,6 @@ class RougeN(RougeBase):
     Args:
         order: The order of n-grams which are to be matched. It should lie in
             range [1, 9]. Defaults to 2.
-        metric_type: string. One of "precision", "recall", "f1_score". Defaults
-            to "f1_score".
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
         dtype: string or tf.dtypes.Dtype. Precision of metric computation. If
@@ -50,7 +48,12 @@ class RougeN(RougeBase):
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
     >>> rouge_n(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>
+    {
+        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
+        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>,
+        'rouge-n_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>
+    }
 
     1.2. rank 1 inputs.
     a. Python list.
@@ -64,7 +67,12 @@ class RougeN(RougeBase):
     ...     "i love contributing to KerasNLP",
     ... ]
     >>> rouge_n(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+    {
+        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>,
+        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
+        'rouge-n_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+    }
 
     b. Tensor.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
@@ -81,7 +89,12 @@ class RougeN(RougeBase):
     ...     ]
     ... )
     >>> rouge_n(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+    {
+        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>,
+        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
+        'rouge-n_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+    }
 
     1.3. rank 2 inputs.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
@@ -98,7 +111,12 @@ class RougeN(RougeBase):
     ...     ]
     ... )
     >>> rouge_n(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+    {
+        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>,
+        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
+        'rouge-n_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
+    }
 
     2. Consider trigrams for calculating ROUGE-N.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=3)
@@ -115,26 +133,15 @@ class RougeN(RougeBase):
     ...     ]
     ... )
     >>> rouge_n(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>
-
-    3. Output the precision instead of the F1 Score.
-    >>> rouge_n = keras_nlp.metrics.RougeN(order=3, metric_type="precision")
-    >>> y_true = tf.constant(
-    ...     [
-    ...         "the tiny little cat was found under the big funny bed",
-    ...         "i really love contributing to KerasNLP",
-    ...     ]
-    ... )
-    >>> y_pred = tf.constant(
-    ...     [
-    ...         "the cat was under the bed",
-    ...         "i love contributing to KerasNLP",
-    ...     ]
-    ... )
-    >>> rouge_n(y_true, y_pred)
-    <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>
-
-    4. Pass the metric to `model.compile()`.
+    {
+        'rouge-n_precision':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>,
+        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.25>,
+        'rouge-n_f1_score':
+            <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>
+    }
+
+    3. Pass the metric to `model.compile()`.
     >>> inputs = keras.Input(shape=(), dtype='string')
     >>> outputs = tf.strings.lower(inputs)
     >>> model = keras.Model(inputs, outputs)
@@ -142,14 +149,18 @@ class RougeN(RougeBase):
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
-    >>> metric_dict["rouge-n"]
-    0.6666666865348816
+    >>> metric_dict
+    {
+        'loss': 0.0,
+        'rouge-n_precision': 0.6666666865348816,
+        'rouge-n_recall': 0.6666666865348816,
+        'rouge-n_f1_score': 0.6666666865348816
+    }
     """
 
     def __init__(
         self,
         order=2,
-        metric_type="f1_score",
         use_stemmer=False,
         dtype=None,
         name="rouge-n",
@@ -163,7 +174,6 @@ def __init__(
 
         super().__init__(
             variant=f"rouge{order}",
-            metric_type=metric_type,
             use_stemmer=use_stemmer,
             dtype=dtype,
             name=name,
diff --git a/keras_nlp/metrics/rouge_n_test.py b/keras_nlp/metrics/rouge_n_test.py
index be008e3800..8537876a66 100644
--- a/keras_nlp/metrics/rouge_n_test.py
+++ b/keras_nlp/metrics/rouge_n_test.py
@@ -21,9 +21,20 @@
 
 
 class RougeNTest(tf.test.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.metric_types = (
+            "rouge-n_precision",
+            "rouge-n_recall",
+            "rouge-n_f1_score",
+        )
+
     def test_initialization(self):
         rouge = RougeN()
-        self.assertEqual(rouge.result().numpy(), 0.0)
+        result = rouge.result()
+
+        for metric_type in self.metric_types:
+            self.assertEqual(result[metric_type].numpy(), 0.0)
 
     def test_string_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -31,7 +42,12 @@ def test_string_input(self):
         y_pred = "the cat was under the bed"
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.267, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.4, 0.2, 0.267]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_string_list_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -45,7 +61,12 @@ def test_string_list_input(self):
         ]
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.575, 0.4, 0.467]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_tensor_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -60,7 +81,12 @@ def test_tensor_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.575, 0.4, 0.467]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_rank_2_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -75,9 +101,14 @@ def test_rank_2_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
-
-    def model_compile(self):
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.575, 0.4, 0.467]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
+
+    def test_model_compile(self):
         inputs = keras.Input(shape=(), dtype="string")
         outputs = tf.strings.lower(inputs)
         model = keras.Model(inputs, outputs)
@@ -88,7 +119,13 @@ def model_compile(self):
         y = tf.constant(["hello this is awesome"])
 
         output = model.evaluate(x, y, return_dict=True)
-        self.assertAlmostEqual(output["rouge-n"], 0.667, delta=1e-3)
+
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.667, 0.667, 0.667]
+        ):
+            self.assertAlmostEqual(
+                output[metric_type], expected_val, delta=1e-3
+            )
 
     def test_incorrect_order(self):
         with self.assertRaises(ValueError):
@@ -107,37 +144,12 @@ def test_different_order(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.286, delta=1e-3)
-
-    def test_precision(self):
-        rouge = RougeN(order=3, metric_type="precision", use_stemmer=False)
-        y_true = tf.constant(
-            [
-                "the tiny little cat was found under the big funny bed",
-                "i really love contributing to KerasNLP",
-            ]
-        )
-        y_pred = tf.constant(
-            ["the cat was under the bed", "i love contributing to KerasNLP"]
-        )
-
-        rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.333, delta=1e-3)
-
-    def test_recall(self):
-        rouge = RougeN(order=3, metric_type="recall", use_stemmer=False)
-        y_true = tf.constant(
-            [
-                "the tiny little cat was found under the big funny bed",
-                "i really love contributing to KerasNLP",
-            ]
-        )
-        y_pred = tf.constant(
-            ["the cat was under the bed", "i love contributing to KerasNLP"]
-        )
-
-        rouge_val = rouge(y_true, y_pred)
-        self.assertAlmostEqual(rouge_val.numpy(), 0.25, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.333, 0.25, 0.286]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_reset_state(self):
         rouge = RougeN()
@@ -152,10 +164,18 @@ def test_reset_state(self):
         )
 
         rouge.update_state(y_true, y_pred)
-        self.assertNotEqual(rouge.result(), 0.0)
+        rouge_val = rouge.result()
+        for metric_type, unexpected_val in zip(
+            self.metric_types, [0.0, 0.0, 0.0]
+        ):
+            self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val)
 
         rouge.reset_state()
-        self.assertEqual(rouge.result(), 0.0)
+        rouge_val = rouge.result()
+        for metric_type, unexpected_val in zip(
+            self.metric_types, [0.0, 0.0, 0.0]
+        ):
+            self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val)
 
     def test_update_state(self):
         rouge = RougeN()
@@ -171,14 +191,24 @@ def test_update_state(self):
 
         rouge.update_state(y_true_1, y_pred_1)
         rouge_val = rouge.result()
-        self.assertAlmostEqual(rouge_val.numpy(), 0.467, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.575, 0.4, 0.467]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
         y_true_2 = tf.constant(["what is your favourite show"])
         y_pred_2 = tf.constant(["my favourite show is silicon valley"])
 
         rouge.update_state(y_true_2, y_pred_2)
         rouge_val = rouge.result()
-        self.assertAlmostEqual(rouge_val.numpy(), 0.385, delta=1e-3)
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.45, 0.35, 0.385]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_merge_state(self):
         rouge_1 = RougeN()
@@ -202,19 +232,36 @@ def test_merge_state(self):
 
         rouge_1.update_state(y_true_1, y_pred_1)
         rouge_1.update_state(y_true_2, y_pred_2)
-        self.assertAlmostEqual(rouge_1.result().numpy(), 0.385, delta=1e-3)
+        rouge_val = rouge_1.result()
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.45, 0.35, 0.385]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
         rouge_2.update_state(y_true_3, y_pred_3)
-        self.assertAlmostEqual(rouge_2.result().numpy(), 0.222, delta=1e-3)
+        rouge_val = rouge_2.result()
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.2, 0.25, 0.222]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
         merged_rouge = RougeN()
         merged_rouge.merge_state([rouge_1, rouge_2])
-        self.assertAlmostEqual(merged_rouge.result().numpy(), 0.344, delta=1e-3)
+        rouge_val = merged_rouge.result()
+        for metric_type, expected_val in zip(
+            self.metric_types, [0.388, 0.325, 0.344]
+        ):
+            self.assertAlmostEqual(
+                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
+            )
 
     def test_get_config(self):
         rouge = RougeN(
             order=5,
-            metric_type="precision",
             use_stemmer=True,
             dtype=tf.float32,
             name="rouge_n_test",
@@ -223,7 +270,6 @@ def test_get_config(self):
         config = rouge.get_config()
         expected_config_subset = {
             "order": 5,
-            "metric_type": "precision",
             "use_stemmer": True,
         }
 

From f4df42b7b6111cbde95f77722b2020b9d3512a67 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 10 Jun 2022 11:57:41 +0530
Subject: [PATCH 24/30] Fix doc-strings

---
 keras_nlp/metrics/rouge_l.py | 39 +++++---------------------------
 keras_nlp/metrics/rouge_n.py | 43 +++++-------------------------------
 2 files changed, 11 insertions(+), 71 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 0c1a907d7c..bcbbaec0b3 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -46,13 +46,7 @@ class RougeL(RougeBase):
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
     >>> rouge_l(y_true, y_pred)
-    {
-        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
-        'rouge-l_recall':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.54545456>,
-        'rouge-l_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>
-    }
+    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.54545456>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>}
 
     1.2. rank 1 inputs.
     a. Python list.
@@ -66,13 +60,7 @@ class RougeL(RougeBase):
     ...     "i love contributing to KerasNLP",
     ... ]
     >>> rouge_l(y_true, y_pred)
-    {
-        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
-        'rouge-l_recall':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>,
-        'rouge-l_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
-    }
+    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>}
 
     b. Tensor
     >>> rouge_l = keras_nlp.metrics.RougeL()
@@ -89,13 +77,7 @@ class RougeL(RougeBase):
     ...     ]
     ... )
     >>> rouge_l(y_true, y_pred)
-    {
-        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
-        'rouge-l_recall':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>,
-        'rouge-l_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
-    }
+    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>}
 
     1.3. rank 2 inputs.
     >>> rouge_l = keras_nlp.metrics.RougeL()
@@ -112,13 +94,7 @@ class RougeL(RougeBase):
     ...     ]
     ... )
     >>> rouge_l(y_true, y_pred)
-    {
-        'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
-        'rouge-l_recall':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>,
-        'rouge-l_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
-    }
+    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>}
 
     3. Pass the metric to `model.compile()`.
     >>> inputs = keras.Input(shape=(), dtype='string')
@@ -129,12 +105,7 @@ class RougeL(RougeBase):
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
     >>> metric_dict
-    {
-        'loss': 0.0,
-        'rouge-l_precision': 0.75,
-        'rouge-l_recall': 0.75,
-        'rouge-l_f1_score': 0.75
-    }
+    {'loss': 0.0, 'rouge-l_precision': 0.75, 'rouge-l_recall': 0.75, 'rouge-l_f1_score': 0.75}
     """
 
     def __init__(
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index 4030066ecd..e7ab222467 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -48,12 +48,7 @@ class RougeN(RougeBase):
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
     >>> rouge_n(y_true, y_pred)
-    {
-        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
-        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>,
-        'rouge-n_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>
-    }
+    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>}
 
     1.2. rank 1 inputs.
     a. Python list.
@@ -67,12 +62,7 @@ class RougeN(RougeBase):
     ...     "i love contributing to KerasNLP",
     ... ]
     >>> rouge_n(y_true, y_pred)
-    {
-        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>,
-        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
-        'rouge-n_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
-    }
+    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>}
 
     b. Tensor.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
@@ -89,12 +79,7 @@ class RougeN(RougeBase):
     ...     ]
     ... )
     >>> rouge_n(y_true, y_pred)
-    {
-        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>,
-        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
-        'rouge-n_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
-    }
+    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>}
 
     1.3. rank 2 inputs.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
@@ -111,12 +96,7 @@ class RougeN(RougeBase):
     ...     ]
     ... )
     >>> rouge_n(y_true, y_pred)
-    {
-        'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>,
-        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>,
-        'rouge-n_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
-    }
+    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>}
 
     2. Consider trigrams for calculating ROUGE-N.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=3)
@@ -133,13 +113,7 @@ class RougeN(RougeBase):
     ...     ]
     ... )
     >>> rouge_n(y_true, y_pred)
-    {
-        'rouge-n_precision':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>,
-        'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.25>,
-        'rouge-n_f1_score':
-            <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>
-    }
+    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.25>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>}
 
     3. Pass the metric to `model.compile()`.
     >>> inputs = keras.Input(shape=(), dtype='string')
@@ -150,12 +124,7 @@ class RougeN(RougeBase):
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
     >>> metric_dict
-    {
-        'loss': 0.0,
-        'rouge-n_precision': 0.6666666865348816,
-        'rouge-n_recall': 0.6666666865348816,
-        'rouge-n_f1_score': 0.6666666865348816
-    }
+    {'loss': 0.0, 'rouge-n_precision': 0.6666666865348816, 'rouge-n_recall': 0.6666666865348816, 'rouge-n_f1_score': 0.6666666865348816}
     """
 
     def __init__(

From 723d8e7e7067dd803a005f15891e5cdd1ca0d78b Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 10 Jun 2022 23:47:06 +0530
Subject: [PATCH 25/30] Truncate doc-string example output

---
 keras_nlp/metrics/rouge_l.py | 20 ++++++++++----------
 keras_nlp/metrics/rouge_n.py | 24 ++++++++++++------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index bcbbaec0b3..8e54b9a8d5 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -45,8 +45,8 @@ class RougeL(RougeBase):
     >>> rouge_l = keras_nlp.metrics.RougeL()
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
-    >>> rouge_l(y_true, y_pred)
-    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.54545456>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>}
+    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>
 
     1.2. rank 1 inputs.
     a. Python list.
@@ -59,8 +59,8 @@ class RougeL(RougeBase):
     ...     "the cat was under the bed",
     ...     "i love contributing to KerasNLP",
     ... ]
-    >>> rouge_l(y_true, y_pred)
-    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>}
+    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
 
     b. Tensor
     >>> rouge_l = keras_nlp.metrics.RougeL()
@@ -76,8 +76,8 @@ class RougeL(RougeBase):
     ...         "i love contributing to KerasNLP",
     ...     ]
     ... )
-    >>> rouge_l(y_true, y_pred)
-    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>}
+    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
 
     1.3. rank 2 inputs.
     >>> rouge_l = keras_nlp.metrics.RougeL()
@@ -93,8 +93,8 @@ class RougeL(RougeBase):
     ...         ["i love contributing to KerasNLP"],
     ...     ]
     ... )
-    >>> rouge_l(y_true, y_pred)
-    {'rouge-l_precision': <tf.Tensor: shape=(), dtype=float32, numpy=1.0>, 'rouge-l_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.68939394>, 'rouge-l_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>}
+    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
 
     3. Pass the metric to `model.compile()`.
     >>> inputs = keras.Input(shape=(), dtype='string')
@@ -104,8 +104,8 @@ class RougeL(RougeBase):
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
-    >>> metric_dict
-    {'loss': 0.0, 'rouge-l_precision': 0.75, 'rouge-l_recall': 0.75, 'rouge-l_f1_score': 0.75}
+    >>> metric_dict["rouge-l_f1_score"]
+     0.75
     """
 
     def __init__(
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index e7ab222467..b1eb059ec2 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -47,8 +47,8 @@ class RougeN(RougeBase):
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
-    >>> rouge_n(y_true, y_pred)
-    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>}
+    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>
 
     1.2. rank 1 inputs.
     a. Python list.
@@ -61,8 +61,8 @@ class RougeN(RougeBase):
     ...     "the cat was under the bed",
     ...     "i love contributing to KerasNLP",
     ... ]
-    >>> rouge_n(y_true, y_pred)
-    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>}
+    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
 
     b. Tensor.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
@@ -78,8 +78,8 @@ class RougeN(RougeBase):
     ...         "i love contributing to KerasNLP",
     ...     ]
     ... )
-    >>> rouge_n(y_true, y_pred)
-    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>}
+    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
 
     1.3. rank 2 inputs.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
@@ -95,8 +95,8 @@ class RougeN(RougeBase):
     ...         ["i love contributing to KerasNLP"],
     ...     ]
     ... )
-    >>> rouge_n(y_true, y_pred)
-    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.575>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.4>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>}
+    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
 
     2. Consider trigrams for calculating ROUGE-N.
     >>> rouge_n = keras_nlp.metrics.RougeN(order=3)
@@ -112,8 +112,8 @@ class RougeN(RougeBase):
     ...         "i love contributing to KerasNLP",
     ...     ]
     ... )
-    >>> rouge_n(y_true, y_pred)
-    {'rouge-n_precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.33333334>, 'rouge-n_recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.25>, 'rouge-n_f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>}
+    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>
 
     3. Pass the metric to `model.compile()`.
     >>> inputs = keras.Input(shape=(), dtype='string')
@@ -123,8 +123,8 @@ class RougeN(RougeBase):
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
-    >>> metric_dict
-    {'loss': 0.0, 'rouge-n_precision': 0.6666666865348816, 'rouge-n_recall': 0.6666666865348816, 'rouge-n_f1_score': 0.6666666865348816}
+    >>> metric_dict["rouge-n_f1_score"]
+    0.6666666865348816
     """
 
     def __init__(

From b0fe8bc3c972bb9bbb350a4de67052b3ee711ff1 Mon Sep 17 00:00:00 2001
From: Abheesht <sharmabhee@gmail.com>
Date: Thu, 16 Jun 2022 17:03:32 +0530
Subject: [PATCH 26/30] Remove ROUGE-LSum from doc-string

---
 keras_nlp/metrics/rouge_base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 069ef14226..59f4d4aba2 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -32,8 +32,8 @@
 class RougeBase(keras.metrics.Metric):
     """ROUGE metric.
 
-    This class implements all the variants of the ROUGE metric - ROUGE-N,
-    ROUGE-L and ROUGE-LSum.
+    This class implements two variants of the ROUGE metric - ROUGE-N
+    and ROUGE-L.
 
     Note on input shapes:
     For `y_true` and `y_pred`, this class supports scalar values and batch

From 7250617b2c8934f1b454cd92245c11ad0f1680a0 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 17 Jun 2022 00:36:15 +0530
Subject: [PATCH 27/30] Small doc-string changes

---
 keras_nlp/metrics/rouge_base.py | 4 ++--
 keras_nlp/metrics/rouge_l.py    | 2 +-
 keras_nlp/metrics/rouge_n.py    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 59f4d4aba2..8e5158b79c 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -37,10 +37,10 @@ class RougeBase(keras.metrics.Metric):
 
     Note on input shapes:
     For `y_true` and `y_pred`, this class supports scalar values and batch
-    inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`.
+    inputs of shapes `()`, `(batch_size,)` and `(batch_size, 1)`.
 
     Args:
-        variant: string. One of "rougeN", "rougeL", "rougeLsum". Defaults to
+        variant: string. One of "rougeN" or "rougeL". Defaults to
             "rouge2". For "rougeN", N lies in the range [1, 9].
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
             suffixes to improve matching. Defaults to False.
diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index 8e54b9a8d5..aff177f93d 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -28,7 +28,7 @@ class RougeL(RougeBase):
 
     Note on input shapes:
     For `y_true` and `y_pred`, this class supports scalar values and batch
-    inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`.
+    inputs of shapes `()`, `(batch_size,)` and `(batch_size, 1)`.
 
     Args:
         use_stemmer: bool. Whether Porter Stemmer should be used to strip word
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index b1eb059ec2..dc31630ff2 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -28,7 +28,7 @@ class RougeN(RougeBase):
 
     Note on input shapes:
     For `y_true` and `y_pred`, this class supports scalar values and batch
-    inputs of shapes `()`, `(batch_size, )` and `(batch_size, 1)`.
+    inputs of shapes `()`, `(batch_size,)` and `(batch_size, 1)`.
 
     Args:
         order: The order of n-grams which are to be matched. It should lie in

From 3c5b3dc8ff7b150b6aa9dbd6caeace40ae7f9c9d Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 17 Jun 2022 09:04:30 +0530
Subject: [PATCH 28/30] Add TODO comment for dict return bug

---
 keras_nlp/metrics/rouge_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 8e5158b79c..92294b0d10 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -116,6 +116,7 @@ def __new__(cls, *args, **kwargs):
         # Wraps `result()` with a python dictionary that also supports variable
         # assignment. We have to do this with __new__ because the base metric
         # class wraps the `results()` method.
+        # TODO: Remove this snippet of code once the Keras bug is fixed.
         obj = super().__new__(cls)
 
         class MetricDict(dict):

From 4fa518ab13802a12dab7110ee751ebb1909de5d1 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 17 Jun 2022 13:11:06 +0530
Subject: [PATCH 29/30] Address review comments - V

---
 keras_nlp/metrics/rouge_base.py   |  12 +--
 keras_nlp/metrics/rouge_l_test.py | 130 ++++++++++++---------------
 keras_nlp/metrics/rouge_n_test.py | 142 +++++++++++++-----------------
 3 files changed, 121 insertions(+), 163 deletions(-)

diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py
index 92294b0d10..22d4adf3b8 100644
--- a/keras_nlp/metrics/rouge_base.py
+++ b/keras_nlp/metrics/rouge_base.py
@@ -192,18 +192,18 @@ def calculate_rouge_score(reference, hypothesis):
     def result(self):
         if self._number_of_samples == 0:
             return {
-                f"{self.name}_precision": 0.0,
-                f"{self.name}_recall": 0.0,
-                f"{self.name}_f1_score": 0.0,
+                "precision": 0.0,
+                "recall": 0.0,
+                "f1_score": 0.0,
             }
 
         rouge_precision = self._rouge_precision / self._number_of_samples
         rouge_recall = self._rouge_recall / self._number_of_samples
         rouge_f1_score = self._rouge_f1_score / self._number_of_samples
         return {
-            f"{self.name}_precision": rouge_precision,
-            f"{self.name}_recall": rouge_recall,
-            f"{self.name}_f1_score": rouge_f1_score,
+            "precision": rouge_precision,
+            "recall": rouge_recall,
+            "f1_score": rouge_f1_score,
         }
 
     def reset_state(self):
diff --git a/keras_nlp/metrics/rouge_l_test.py b/keras_nlp/metrics/rouge_l_test.py
index af3f306990..d130e12190 100644
--- a/keras_nlp/metrics/rouge_l_test.py
+++ b/keras_nlp/metrics/rouge_l_test.py
@@ -23,18 +23,27 @@
 class RougeLTest(tf.test.TestCase):
     def setUp(self):
         super().setUp()
-        self.metric_types = (
-            "rouge-l_precision",
-            "rouge-l_recall",
-            "rouge-l_f1_score",
-        )
+
+        def assertDictAlmostEqual(d1, d2, delta=1e-3, typecast_to_numpy=True):
+            for key, val in d1.items():
+                if typecast_to_numpy:
+                    val = val.numpy()
+                self.assertAlmostEqual(val, d2[key], delta=delta)
+
+        def assertDictAllValuesNotEqual(d1, d2):
+            for key, val in d1.items():
+                self.assertNotEqual(val, d2[key])
+
+        self.assertDictAlmostEqual = assertDictAlmostEqual
+        self.assertDictAllValuesNotEqual = assertDictAllValuesNotEqual
 
     def test_initialization(self):
         rouge = RougeL()
         result = rouge.result()
 
-        for metric_type in self.metric_types:
-            self.assertEqual(result[metric_type].numpy(), 0.0)
+        self.assertDictEqual(
+            result, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
+        )
 
     def test_string_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -42,12 +51,9 @@ def test_string_input(self):
         y_pred = "the cat was under the bed"
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [1, 0.545, 0.706]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 1.0, "recall": 0.545, "f1_score": 0.706}
+        )
 
     def test_string_list_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -61,12 +67,9 @@ def test_string_list_input(self):
         ]
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [1, 0.689, 0.807]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807}
+        )
 
     def test_tensor_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -81,12 +84,9 @@ def test_tensor_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [1, 0.689, 0.807]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807}
+        )
 
     def test_rank_2_input(self):
         rouge = RougeL(use_stemmer=False)
@@ -101,12 +101,9 @@ def test_rank_2_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [1, 0.689, 0.807]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807}
+        )
 
     def test_model_compile(self):
         inputs = keras.Input(shape=(), dtype="string")
@@ -119,12 +116,12 @@ def test_model_compile(self):
         y = tf.constant(["hello this is awesome"])
 
         output = model.evaluate(x, y, return_dict=True)
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.75, 0.75, 0.75]
-        ):
-            self.assertAlmostEqual(
-                output[metric_type], expected_val, delta=1e-3
-            )
+        del output["loss"]
+        self.assertDictAlmostEqual(
+            output,
+            {"precision": 0.75, "recall": 0.75, "f1_score": 0.75},
+            typecast_to_numpy=False,
+        )
 
     def test_reset_state(self):
         rouge = RougeL()
@@ -140,17 +137,15 @@ def test_reset_state(self):
 
         rouge.update_state(y_true, y_pred)
         rouge_val = rouge.result()
-        for metric_type, unexpected_val in zip(
-            self.metric_types, [0.0, 0.0, 0.0]
-        ):
-            self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val)
+        self.assertDictAllValuesNotEqual(
+            rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
+        )
 
         rouge.reset_state()
         rouge_val = rouge.result()
-        for metric_type, unexpected_val in zip(
-            self.metric_types, [0.0, 0.0, 0.0]
-        ):
-            self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val)
+        self.assertDictEqual(
+            rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
+        )
 
     def test_update_state(self):
         rouge = RougeL()
@@ -166,24 +161,18 @@ def test_update_state(self):
 
         rouge.update_state(y_true_1, y_pred_1)
         rouge_val = rouge.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [1, 0.689, 0.807]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 1.0, "recall": 0.689, "f1_score": 0.807}
+        )
 
         y_true_2 = tf.constant(["what is your favourite show"])
         y_pred_2 = tf.constant(["my favourite show is silicon valley"])
 
         rouge.update_state(y_true_2, y_pred_2)
         rouge_val = rouge.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.778, 0.593, 0.66]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.778, "recall": 0.593, "f1_score": 0.66}
+        )
 
     def test_merge_state(self):
         rouge_1 = RougeL()
@@ -208,31 +197,22 @@ def test_merge_state(self):
         rouge_1.update_state(y_true_1, y_pred_1)
         rouge_1.update_state(y_true_2, y_pred_2)
         rouge_val = rouge_1.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.778, 0.593, 0.66]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.778, "recall": 0.593, "f1_score": 0.66}
+        )
 
         rouge_2.update_state(y_true_3, y_pred_3)
         rouge_val = rouge_2.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.333, 0.4, 0.364]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.333, "recall": 0.4, "f1_score": 0.364}
+        )
 
         merged_rouge = RougeL()
         merged_rouge.merge_state([rouge_1, rouge_2])
         rouge_val = merged_rouge.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.667, 0.545, 0.586]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.667, "recall": 0.545, "f1_score": 0.586}
+        )
 
     def test_get_config(self):
         rouge = RougeL(
diff --git a/keras_nlp/metrics/rouge_n_test.py b/keras_nlp/metrics/rouge_n_test.py
index 8537876a66..2183afe3fe 100644
--- a/keras_nlp/metrics/rouge_n_test.py
+++ b/keras_nlp/metrics/rouge_n_test.py
@@ -23,18 +23,27 @@
 class RougeNTest(tf.test.TestCase):
     def setUp(self):
         super().setUp()
-        self.metric_types = (
-            "rouge-n_precision",
-            "rouge-n_recall",
-            "rouge-n_f1_score",
-        )
+
+        def assertDictAlmostEqual(d1, d2, delta=1e-3, typecast_to_numpy=True):
+            for key, val in d1.items():
+                if typecast_to_numpy:
+                    val = val.numpy()
+                self.assertAlmostEqual(val, d2[key], delta=delta)
+
+        def assertDictAllValuesNotEqual(d1, d2):
+            for key, val in d1.items():
+                self.assertNotEqual(val, d2[key])
+
+        self.assertDictAlmostEqual = assertDictAlmostEqual
+        self.assertDictAllValuesNotEqual = assertDictAllValuesNotEqual
 
     def test_initialization(self):
         rouge = RougeN()
         result = rouge.result()
 
-        for metric_type in self.metric_types:
-            self.assertEqual(result[metric_type].numpy(), 0.0)
+        self.assertDictEqual(
+            result, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
+        )
 
     def test_string_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -42,12 +51,9 @@ def test_string_input(self):
         y_pred = "the cat was under the bed"
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.4, 0.2, 0.267]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.4, "recall": 0.2, "f1_score": 0.267}
+        )
 
     def test_string_list_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -61,12 +67,9 @@ def test_string_list_input(self):
         ]
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.575, 0.4, 0.467]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467}
+        )
 
     def test_tensor_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -81,12 +84,9 @@ def test_tensor_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.575, 0.4, 0.467]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467}
+        )
 
     def test_rank_2_input(self):
         rouge = RougeN(order=2, use_stemmer=False)
@@ -101,12 +101,9 @@ def test_rank_2_input(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.575, 0.4, 0.467]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467}
+        )
 
     def test_model_compile(self):
         inputs = keras.Input(shape=(), dtype="string")
@@ -119,13 +116,12 @@ def test_model_compile(self):
         y = tf.constant(["hello this is awesome"])
 
         output = model.evaluate(x, y, return_dict=True)
-
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.667, 0.667, 0.667]
-        ):
-            self.assertAlmostEqual(
-                output[metric_type], expected_val, delta=1e-3
-            )
+        del output["loss"]
+        self.assertDictAlmostEqual(
+            output,
+            {"precision": 0.667, "recall": 0.667, "f1_score": 0.667},
+            typecast_to_numpy=False,
+        )
 
     def test_incorrect_order(self):
         with self.assertRaises(ValueError):
@@ -144,12 +140,11 @@ def test_different_order(self):
         )
 
         rouge_val = rouge(y_true, y_pred)
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.333, 0.25, 0.286]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val,
+            {"precision": 0.333, "recall": 0.25, "f1_score": 0.286},
+            typecast_to_numpy=False,
+        )
 
     def test_reset_state(self):
         rouge = RougeN()
@@ -165,17 +160,15 @@ def test_reset_state(self):
 
         rouge.update_state(y_true, y_pred)
         rouge_val = rouge.result()
-        for metric_type, unexpected_val in zip(
-            self.metric_types, [0.0, 0.0, 0.0]
-        ):
-            self.assertNotEqual(rouge_val[metric_type].numpy(), unexpected_val)
+        self.assertDictAllValuesNotEqual(
+            rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
+        )
 
         rouge.reset_state()
         rouge_val = rouge.result()
-        for metric_type, unexpected_val in zip(
-            self.metric_types, [0.0, 0.0, 0.0]
-        ):
-            self.assertEqual(rouge_val[metric_type].numpy(), unexpected_val)
+        self.assertDictEqual(
+            rouge_val, {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}
+        )
 
     def test_update_state(self):
         rouge = RougeN()
@@ -191,24 +184,18 @@ def test_update_state(self):
 
         rouge.update_state(y_true_1, y_pred_1)
         rouge_val = rouge.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.575, 0.4, 0.467]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.575, "recall": 0.4, "f1_score": 0.467}
+        )
 
         y_true_2 = tf.constant(["what is your favourite show"])
         y_pred_2 = tf.constant(["my favourite show is silicon valley"])
 
         rouge.update_state(y_true_2, y_pred_2)
         rouge_val = rouge.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.45, 0.35, 0.385]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.45, "recall": 0.35, "f1_score": 0.385}
+        )
 
     def test_merge_state(self):
         rouge_1 = RougeN()
@@ -233,31 +220,22 @@ def test_merge_state(self):
         rouge_1.update_state(y_true_1, y_pred_1)
         rouge_1.update_state(y_true_2, y_pred_2)
         rouge_val = rouge_1.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.45, 0.35, 0.385]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.45, "recall": 0.35, "f1_score": 0.385}
+        )
 
         rouge_2.update_state(y_true_3, y_pred_3)
         rouge_val = rouge_2.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.2, 0.25, 0.222]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.2, "recall": 0.25, "f1_score": 0.222}
+        )
 
         merged_rouge = RougeN()
         merged_rouge.merge_state([rouge_1, rouge_2])
         rouge_val = merged_rouge.result()
-        for metric_type, expected_val in zip(
-            self.metric_types, [0.388, 0.325, 0.344]
-        ):
-            self.assertAlmostEqual(
-                rouge_val[metric_type].numpy(), expected_val, delta=1e-3
-            )
+        self.assertDictAlmostEqual(
+            rouge_val, {"precision": 0.388, "recall": 0.325, "f1_score": 0.344}
+        )
 
     def test_get_config(self):
         rouge = RougeN(

From 14e851fa62fe8eb9519148fb73e1b93c219f9175 Mon Sep 17 00:00:00 2001
From: abheesht17 <sharmabhee@gmail.com>
Date: Fri, 17 Jun 2022 13:17:33 +0530
Subject: [PATCH 30/30] Fix doc-string

---
 keras_nlp/metrics/rouge_l.py | 10 +++++-----
 keras_nlp/metrics/rouge_n.py | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/keras_nlp/metrics/rouge_l.py b/keras_nlp/metrics/rouge_l.py
index aff177f93d..f6969a85f6 100644
--- a/keras_nlp/metrics/rouge_l.py
+++ b/keras_nlp/metrics/rouge_l.py
@@ -45,7 +45,7 @@ class RougeL(RougeBase):
     >>> rouge_l = keras_nlp.metrics.RougeL()
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
-    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    >>> rouge_l(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.7058824>
 
     1.2. rank 1 inputs.
@@ -59,7 +59,7 @@ class RougeL(RougeBase):
     ...     "the cat was under the bed",
     ...     "i love contributing to KerasNLP",
     ... ]
-    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    >>> rouge_l(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
 
     b. Tensor
@@ -76,7 +76,7 @@ class RougeL(RougeBase):
     ...         "i love contributing to KerasNLP",
     ...     ]
     ... )
-    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    >>> rouge_l(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
 
     1.3. rank 2 inputs.
@@ -93,7 +93,7 @@ class RougeL(RougeBase):
     ...         ["i love contributing to KerasNLP"],
     ...     ]
     ... )
-    >>> rouge_l(y_true, y_pred)["rouge-l_f1_score"]
+    >>> rouge_l(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.80748665>
 
     3. Pass the metric to `model.compile()`.
@@ -104,7 +104,7 @@ class RougeL(RougeBase):
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
-    >>> metric_dict["rouge-l_f1_score"]
+    >>> metric_dict["f1_score"]
      0.75
     """
 
diff --git a/keras_nlp/metrics/rouge_n.py b/keras_nlp/metrics/rouge_n.py
index dc31630ff2..4bfe532ee2 100644
--- a/keras_nlp/metrics/rouge_n.py
+++ b/keras_nlp/metrics/rouge_n.py
@@ -47,7 +47,7 @@ class RougeN(RougeBase):
     >>> rouge_n = keras_nlp.metrics.RougeN(order=2)
     >>> y_true = "the tiny little cat was found under the big funny bed"
     >>> y_pred = "the cat was under the bed"
-    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    >>> rouge_n(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.26666668>
 
     1.2. rank 1 inputs.
@@ -61,7 +61,7 @@ class RougeN(RougeBase):
     ...     "the cat was under the bed",
     ...     "i love contributing to KerasNLP",
     ... ]
-    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    >>> rouge_n(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
 
     b. Tensor.
@@ -78,7 +78,7 @@ class RougeN(RougeBase):
     ...         "i love contributing to KerasNLP",
     ...     ]
     ... )
-    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    >>> rouge_n(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
 
     1.3. rank 2 inputs.
@@ -95,7 +95,7 @@ class RougeN(RougeBase):
     ...         ["i love contributing to KerasNLP"],
     ...     ]
     ... )
-    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    >>> rouge_n(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.4666667>
 
     2. Consider trigrams for calculating ROUGE-N.
@@ -112,7 +112,7 @@ class RougeN(RougeBase):
     ...         "i love contributing to KerasNLP",
     ...     ]
     ... )
-    >>> rouge_n(y_true, y_pred)["rouge-n_f1_score"]
+    >>> rouge_n(y_true, y_pred)["f1_score"]
     <tf.Tensor: shape=(), dtype=float32, numpy=0.2857143>
 
     3. Pass the metric to `model.compile()`.
@@ -123,7 +123,7 @@ class RougeN(RougeBase):
     >>> x = tf.constant(["HELLO THIS IS FUN"])
     >>> y = tf.constant(["hello this is awesome"])
     >>> metric_dict = model.evaluate(x, y, return_dict=True)
-    >>> metric_dict["rouge-n_f1_score"]
+    >>> metric_dict["f1_score"]
     0.6666666865348816
     """