Skip to content

Commit

Permalink
Fix Html2TextTransformer for shallow copy (#14197)
Browse files Browse the repository at this point in the history
<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:

https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
Hi,
Html2TextTransformer has an unintended side effect.
The current code **directly modifies the original documents that are
passed as arguments to the function.**
As a result, the caller's input documents are changed along with the
value the function returns.
**To resolve this, the transformer now returns new documents, and I added unit tests as well.**

reference link: [Shallow vs Deep Copying of Python
Objects](https://realpython.com/copying-python-objects/)

Thanks! ☺️
  • Loading branch information
FacerAin committed Dec 3, 2023
1 parent 818252b commit 9938086
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 2 deletions.
9 changes: 7 additions & 2 deletions libs/langchain/langchain/document_transformers/html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,14 @@ def transform_documents(
h.ignore_links = self.ignore_links
h.ignore_images = self.ignore_images

new_documents = []

for d in documents:
d.page_content = h.handle(d.page_content)
return documents
new_document = Document(
page_content=h.handle(d.page_content), metadata={**d.metadata}
)
new_documents.append(new_document)
return new_documents

async def atransform_documents(
self,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Unit tests for html2text document transformer."""
import pytest
from langchain_core.documents import Document

from langchain.document_transformers import Html2TextTransformer


@pytest.mark.requires("html2text")
def test_transform_empty_html() -> None:
    """An ``<html>`` element with no content converts to two newlines."""
    source_docs = [Document(page_content="<html></html>")]
    converted = Html2TextTransformer().transform_documents(source_docs)
    assert converted[0].page_content == "\n\n"


@pytest.mark.requires("html2text")
def test_extract_paragraphs() -> None:
    """Headings and paragraphs come out as Markdown, in document order."""
    markup = (
        "<html><h1>Header</h1><p>First paragraph.</p>"
        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
    )
    expected_markdown = (
        "# Header\n\n"
        "First paragraph.\n\n"
        "Second paragraph.\n\n"
        "# Ignore at end\n\n"
    )

    transformer = Html2TextTransformer()
    converted = transformer.transform_documents([Document(page_content=markup)])

    assert converted[0].page_content == expected_markdown


@pytest.mark.requires("html2text")
def test_extract_html() -> None:
    """Bare text between tags is kept and separated from neighbouring blocks."""
    markup = (
        "<html>Begin of html tag"
        "<h1>Header</h1>"
        "<p>First paragraph.</p>"
        "Middle of html tag"
        "<p>Second paragraph.</p>"
        "End of html tag"
        "</html>"
    )
    expected_markdown = (
        "Begin of html tag\n\n"
        "# Header\n\n"
        "First paragraph.\n\n"
        "Middle of html tag\n\n"
        "Second paragraph.\n\n"
        "End of html tag\n\n"
    )

    transformer = Html2TextTransformer()
    converted = transformer.transform_documents([Document(page_content=markup)])

    assert converted[0].page_content == expected_markdown


@pytest.mark.requires("html2text")
def test_remove_style() -> None:
    """The contents of a ``<style>`` element do not appear in the output."""
    markup = "<html><style>my_funky_style</style><p>First paragraph.</p></html>"
    source_docs = [Document(page_content=markup)]

    converted = Html2TextTransformer().transform_documents(source_docs)

    assert converted[0].page_content == "First paragraph.\n\n"


@pytest.mark.requires("html2text")
def test_ignore_links() -> None:
    """``ignore_links`` toggles between Markdown links and plain link text.

    The same ``documents`` list is transformed twice, so this test also
    depends on ``transform_documents`` leaving its input untouched; that
    requirement is asserted explicitly after the first call.
    """
    html2text_transformer = Html2TextTransformer(ignore_links=False)
    multiple_tags_html = (
        "<h1>First heading.</h1>"
        "<p>First paragraph with an <a href='http://example.com'>example</a></p>"
    )
    documents = [Document(page_content=multiple_tags_html)]

    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an [example](http://example.com)\n\n"
    )
    # The input must still hold the raw HTML: the second transform below
    # reuses it, and in-place mutation is the regression being guarded.
    assert documents[0].page_content == multiple_tags_html

    html2text_transformer = Html2TextTransformer(ignore_links=True)
    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an example\n\n"
    )
    assert documents[0].page_content == multiple_tags_html


@pytest.mark.requires("html2text")
def test_ignore_images() -> None:
    """``ignore_images`` toggles between Markdown images and dropping them.

    The same ``documents`` list is transformed twice, so this test also
    depends on ``transform_documents`` leaving its input untouched; that
    requirement is asserted explicitly after the first call.
    """
    html2text_transformer = Html2TextTransformer(ignore_images=False)
    multiple_tags_html = (
        "<h1>First heading.</h1>"
        "<p>First paragraph with an "
        "<img src='example.jpg' alt='Example image' width='500' height='600'></p>"
    )
    documents = [Document(page_content=multiple_tags_html)]

    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an ![Example image](example.jpg)\n\n"
    )
    # The input must still hold the raw HTML: the second transform below
    # reuses it, and in-place mutation is the regression being guarded.
    assert documents[0].page_content == multiple_tags_html

    html2text_transformer = Html2TextTransformer(ignore_images=True)
    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an\n\n"
    )
    assert documents[0].page_content == multiple_tags_html

0 comments on commit 9938086

Please sign in to comment.