From fffb99e8bc58da138d4199713d9c93b7621d7668 Mon Sep 17 00:00:00 2001 From: yin1991 <84140478+xiaokuili@users.noreply.github.com> Date: Tue, 30 Jan 2024 09:05:41 +0000 Subject: [PATCH 1/4] add proxy --- .../document_loaders/url_playwright.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index 8071d3717f7266..12c27391b815b4 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -2,7 +2,7 @@ """ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List, Optional, Dict from langchain_core.documents import Document @@ -111,6 +111,17 @@ class PlaywrightURLLoader(BaseLoader): urls (List[str]): List of URLs to load. continue_on_failure (bool): If True, continue loading other URLs on failure. headless (bool): If True, the browser will run in headless mode. + proxy (Optional[Dict[str, str]]): If set, the browser will access URLs through the specified proxy. 
+ + Examples + -------- + from langchain_community.document_loaders import PlaywrightURLLoader + + loader = UnstructuredHTMLLoader( + "example.html", mode="elements", strategy="fast", + ) + docs = loader.load() + """ def __init__( @@ -120,6 +131,7 @@ def __init__( headless: bool = True, remove_selectors: Optional[List[str]] = None, evaluator: Optional[PlaywrightEvaluator] = None, + proxy: Optional[Dict[str, str]] = None ): """Load a list of URLs using Playwright.""" try: @@ -133,6 +145,7 @@ def __init__( self.urls = urls self.continue_on_failure = continue_on_failure self.headless = headless + self.proxy= proxy if remove_selectors and evaluator: raise ValueError( @@ -153,7 +166,8 @@ def load(self) -> List[Document]: docs: List[Document] = list() with sync_playwright() as p: - browser = p.chromium.launch(headless=self.headless) + + browser = p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: page = browser.new_page() @@ -186,7 +200,7 @@ async def aload(self) -> List[Document]: docs: List[Document] = list() async with async_playwright() as p: - browser = await p.chromium.launch(headless=self.headless) + browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: page = await browser.new_page() From 6fae177e5f5136db743cb610dabcab4f9f4ccb74 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 31 Jan 2024 09:17:25 +0000 Subject: [PATCH 2/4] add Example demonstrating crawling with a proxy. 
--- .../document_loaders/url_playwright.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index 12c27391b815b4..d5e9c72b353519 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -115,13 +115,18 @@ class PlaywrightURLLoader(BaseLoader): Examples -------- + # crwal with proxy from langchain_community.document_loaders import PlaywrightURLLoader - - loader = UnstructuredHTMLLoader( - "example.html", mode="elements", strategy="fast", - ) - docs = loader.load() - + urls = [ + "https://api.ipify.org/?format=json", + ] + proxy={ + "server": "https://xx.xx.xx:15818", # https://: + "username": "username", + "password": "password" + } + loader = PlaywrightURLLoader(urls=urls, proxy=proxy) + data = loader.load() """ def __init__( @@ -220,3 +225,5 @@ async def aload(self) -> List[Document]: raise e await browser.close() return docs + + From a8e422c0c40e670bac718f89136e5140102f23c2 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 31 Jan 2024 09:48:39 +0000 Subject: [PATCH 3/4] lint and format file --- .../document_loaders/url_playwright.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index d5e9c72b353519..ce7bec24a03b16 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -2,7 +2,7 @@ """ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional, Dict +from typing import TYPE_CHECKING, Dict, List, Optional from langchain_core.documents import Document @@ -111,11 +111,12 
@@ class PlaywrightURLLoader(BaseLoader): urls (List[str]): List of URLs to load. continue_on_failure (bool): If True, continue loading other URLs on failure. headless (bool): If True, the browser will run in headless mode. - proxy (Optional[Dict[str, str]]): If set, the browser will access URLs through the specified proxy. + proxy (Optional[Dict[str, str]]): If set, the browser will access URLs + through the specified proxy. Examples -------- - # crwal with proxy + # crwal with proxy from langchain_community.document_loaders import PlaywrightURLLoader urls = [ "https://api.ipify.org/?format=json", @@ -136,7 +137,7 @@ def __init__( headless: bool = True, remove_selectors: Optional[List[str]] = None, evaluator: Optional[PlaywrightEvaluator] = None, - proxy: Optional[Dict[str, str]] = None + proxy: Optional[Dict[str, str]] = None, ): """Load a list of URLs using Playwright.""" try: @@ -150,7 +151,7 @@ def __init__( self.urls = urls self.continue_on_failure = continue_on_failure self.headless = headless - self.proxy= proxy + self.proxy = proxy if remove_selectors and evaluator: raise ValueError( @@ -171,7 +172,6 @@ def load(self) -> List[Document]: docs: List[Document] = list() with sync_playwright() as p: - browser = p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: @@ -225,5 +225,3 @@ async def aload(self) -> List[Document]: raise e await browser.close() return docs - - From 8a0783ea9f5607c59fc6b843e41de150945df7fa Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 12 Feb 2024 19:35:17 -0800 Subject: [PATCH 4/4] fmt --- .../document_loaders/url_playwright.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index ce7bec24a03b16..106f15cee6f61e 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ 
b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -114,20 +114,19 @@ class PlaywrightURLLoader(BaseLoader): proxy (Optional[Dict[str, str]]): If set, the browser will access URLs through the specified proxy. - Examples - -------- - # crwal with proxy - from langchain_community.document_loaders import PlaywrightURLLoader - urls = [ - "https://api.ipify.org/?format=json", - ] - proxy={ - "server": "https://xx.xx.xx:15818", # https://: - "username": "username", - "password": "password" - } - loader = PlaywrightURLLoader(urls=urls, proxy=proxy) - data = loader.load() + Example: + .. code-block:: python + + from langchain_community.document_loaders import PlaywrightURLLoader + + urls = ["https://api.ipify.org/?format=json",] + proxy={ + "server": "https://xx.xx.xx:15818", # https://<host>:<port> + "username": "username", + "password": "password" + } + loader = PlaywrightURLLoader(urls, proxy=proxy) + data = loader.load() """ def __init__(