Fix Html2TextTransformer for shallow copy (#14197)

Hi, there is some unintended behavior in `Html2TextTransformer`. The current code **directly modifies the original documents that are passed as arguments to the function.** As a result, not only the function's return value but also the caller's input documents are changed. **To resolve this, `transform_documents` now builds and returns new `Document` objects (each with a shallow copy of the original metadata) instead of mutating the inputs, and I added unit tests as well.** Reference link: [Shallow vs Deep Copying of Python Objects](https://realpython.com/copying-python-objects/) Thanks! ☺️
langchain-ai · Dec 3, 2023 · 9938086 · 9938086
1 parent 818252b
commit 9938086
Show file tree

Hide file tree

Showing 2 changed files with 118 additions and 2 deletions.
diff --git a/libs/langchain/langchain/document_transformers/html2text.py b/libs/langchain/langchain/document_transformers/html2text.py
@@ -39,9 +39,14 @@ def transform_documents(
         h.ignore_links = self.ignore_links
         h.ignore_images = self.ignore_images
 
+        new_documents = []
+
         for d in documents:
-            d.page_content = h.handle(d.page_content)
-        return documents
+            new_document = Document(
+                page_content=h.handle(d.page_content), metadata={**d.metadata}
+            )
+            new_documents.append(new_document)
+        return new_documents
 
     async def atransform_documents(
         self,

diff --git a/libs/langchain/tests/unit_tests/document_transformers/test_html2text_transformer.py b/libs/langchain/tests/unit_tests/document_transformers/test_html2text_transformer.py
@@ -0,0 +1,111 @@
+"""Unit tests for html2text document transformer."""
+import pytest
+from langchain_core.documents import Document
+
+from langchain.document_transformers import Html2TextTransformer
+
+
+@pytest.mark.requires("html2text")
+def test_transform_empty_html() -> None:
+    html2text_transformer = Html2TextTransformer()
+    empty_html = "<html></html>"
+    documents = [Document(page_content=empty_html)]
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == "\n\n"
+
+
+@pytest.mark.requires("html2text")
+def test_extract_paragraphs() -> None:
+    html2text_transformer = Html2TextTransformer()
+    paragraphs_html = (
+        "<html><h1>Header</h1><p>First paragraph.</p>"
+        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
+    )
+    documents = [Document(page_content=paragraphs_html)]
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == (
+        "# Header\n\n"
+        "First paragraph.\n\n"
+        "Second paragraph.\n\n"
+        "# Ignore at end\n\n"
+    )
+
+
+@pytest.mark.requires("html2text")
+def test_extract_html() -> None:
+    html2text_transformer = Html2TextTransformer()
+    paragraphs_html = (
+        "<html>Begin of html tag"
+        "<h1>Header</h1>"
+        "<p>First paragraph.</p>"
+        "Middle of html tag"
+        "<p>Second paragraph.</p>"
+        "End of html tag"
+        "</html>"
+    )
+    documents = [Document(page_content=paragraphs_html)]
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == (
+        "Begin of html tag\n\n"
+        "# Header\n\n"
+        "First paragraph.\n\n"
+        "Middle of html tag\n\n"
+        "Second paragraph.\n\n"
+        "End of html tag\n\n"
+    )
+
+
+@pytest.mark.requires("html2text")
+def test_remove_style() -> None:
+    html2text_transformer = Html2TextTransformer()
+    with_style_html = (
+        "<html><style>my_funky_style</style><p>First paragraph.</p></html>"
+    )
+    documents = [Document(page_content=with_style_html)]
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == "First paragraph.\n\n"
+
+
+@pytest.mark.requires("html2text")
+def test_ignore_links() -> None:
+    html2text_transformer = Html2TextTransformer(ignore_links=False)
+    multiple_tags_html = (
+        "<h1>First heading.</h1>"
+        "<p>First paragraph with an <a href='http://example.com'>example</a></p>"
+    )
+    documents = [Document(page_content=multiple_tags_html)]
+
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == (
+        "# First heading.\n\n"
+        "First paragraph with an [example](http://example.com)\n\n"
+    )
+
+    html2text_transformer = Html2TextTransformer(ignore_links=True)
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == (
+        "# First heading.\n\n" "First paragraph with an example\n\n"
+    )
+
+
+@pytest.mark.requires("html2text")
+def test_ignore_images() -> None:
+    html2text_transformer = Html2TextTransformer(ignore_images=False)
+    multiple_tags_html = (
+        "<h1>First heading.</h1>"
+        "<p>First paragraph with an "
+        "<img src='example.jpg' alt='Example image' width='500' height='600'></p>"
+    )
+    documents = [Document(page_content=multiple_tags_html)]
+
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == (
+        "# First heading.\n\n"
+        "First paragraph with an ![Example image](example.jpg)\n\n"
+    )
+
+    html2text_transformer = Html2TextTransformer(ignore_images=True)
+    docs_transformed = html2text_transformer.transform_documents(documents)
+    assert docs_transformed[0].page_content == (
+        "# First heading.\n\n" "First paragraph with an\n\n"
+    )