Skip to content

Commit

Permalink
Update WhatsAppChatLoader regex to handle multiple date-time formats (#…
Browse files Browse the repository at this point in the history
…4186)

This PR updates the `message_line_regex` used by `WhatsAppChatLoader` to
support different date-time formats used in WhatsApp chat exports;
resolves #4153.

The new regex handles the following input formats:
```terminal
[05.05.23, 15:48:11] James: Hi here
[11/8/21, 9:41:32 AM] User name: Message 123
1/23/23, 3:19 AM - User 2: Bye!
1/23/23, 3:22_AM - User 1: And let me know if anything changes
```

Tests have been added to verify that the loader works correctly with all
formats.
  • Loading branch information
hp0404 committed May 5, 2023
1 parent a57259e commit 2a3c5f8
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 9 deletions.
33 changes: 24 additions & 9 deletions langchain/document_loaders/whatsapp_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,31 @@ def load(self) -> List[Document]:
with open(p, encoding="utf8") as f:
lines = f.readlines()

message_line_regex = (
r"(\d{1,2}/\d{1,2}/\d{2,4}, "
r"\d{1,2}:\d{1,2}[ _]?(?:AM|PM)?) - "
r"(.*?): (.*)"
)
for line in lines:
result = re.match(
message_line_regex,
line.strip(),
message_line_regex = r"""
\[?
(
\d{1,2}
[\/.]
\d{1,2}
[\/.]
\d{2,4}
,\s
\d{1,2}
:\d{2}
(?:
:\d{2}
)?
(?:[ _](?:AM|PM))?
)
\]?
[\s-]*
([\w\s]+)
[:]+
\s
(.+)
"""
for line in lines:
result = re.match(message_line_regex, line.strip(), flags=re.VERBOSE)
if result:
date, sender, text = result.groups()
text_content += concatenate_rows(date, sender, text)
Expand Down
19 changes: 19 additions & 0 deletions tests/integration_tests/document_loaders/test_whatsapp_chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from pathlib import Path

from langchain.document_loaders import WhatsAppChatLoader


def test_whatsapp_chat_loader() -> None:
    """Verify WhatsAppChatLoader against the bundled multi-format chat export.

    The example file ``examples/whatsapp_chat.txt`` (resolved relative to this
    test module's parent directory) is loaded, and the test checks that a
    single document is returned whose ``source`` metadata is the file path and
    whose page content has one "<sender> on <timestamp>: <text>" entry per
    expected chat line.
    """
    chat_file = Path(__file__).parent.parent / "examples" / "whatsapp_chat.txt"
    source = str(chat_file)

    # One entry per timestamp style covered by the example export.
    expected_content = (
        "James on 05.05.23, 15:48:11: Hi here\n\n"
        "User name on 11/8/21, 9:41:32 AM: Message 123\n\n"
        "User 2 on 1/23/23, 3:19 AM: Bye!\n\n"
        "User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n"
    )

    documents = WhatsAppChatLoader(source).load()

    # The whole chat is expected to collapse into exactly one document.
    assert len(documents) == 1
    document = documents[0]
    assert document.metadata["source"] == source
    assert document.page_content == expected_content
4 changes: 4 additions & 0 deletions tests/integration_tests/examples/whatsapp_chat.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[05.05.23, 15:48:11] James: Hi here
[11/8/21, 9:41:32 AM] User name: Message 123
1/23/23, 3:19 AM - User 2: Bye!
1/23/23, 3:22_AM - User 1: And let me know if anything changes

0 comments on commit 2a3c5f8

Please sign in to comment.