In [73]:
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from newspaper import Article

class WebExtractor:
    """
    Stateless helpers for crawling web pages and separating usable pages from suspected junk.

    Every method is a ``@staticmethod`` and can be called directly on the class.
    Documents are plain dictionaries with 'text' and 'url' keys.
    """

    @staticmethod
    def suspected_junk(docs, min_length=300):
        """
        Identifies and returns a list of suspected junk documents based on specific criteria.

        A document is suspected junk when its text is shorter than ``min_length`` characters,
        or when an earlier document in ``docs`` has exactly the same text (the first
        occurrence of a text is never flagged for being a duplicate).

        :param docs: A list of documents, where each document is represented as a dictionary.
                     Each dictionary should have a "text" key containing the text content of the document.
        :param min_length: Minimum number of characters a text needs in order not to be
                           considered too short (default is 300).
        :return: A list holding the very same dictionary objects from ``docs`` that were flagged:
                 first all short documents in input order, then the remaining duplicates in input order.
                 Each flagged object appears once.
        """
        junk_docs = [doc for doc in docs if len(doc["text"]) < min_length]

        # Track flagged documents by identity rather than by dict equality: two distinct
        # pages with equal content must each be judged on their own, and a set lookup
        # avoids rescanning the junk list for every document.
        flagged_ids = {id(doc) for doc in junk_docs}

        seen_texts = set()
        for doc in docs:
            text = doc["text"]
            if text not in seen_texts:
                seen_texts.add(text)
            elif id(doc) not in flagged_ids:
                flagged_ids.add(id(doc))
                junk_docs.append(doc)
        return junk_docs

    @staticmethod
    def newspaper_extractor(html):
        """
        Extracts and cleans text content from HTML using the 'newspaper' library.

        :param html: HTML content to be processed.
        :return: Extracted text with every run of whitespace collapsed to a single space.
                 An empty string is returned for empty input.
        """
        if not html:
            # Nothing to parse; skip the library call entirely.
            return ""

        # A fresh Article per call: a shared instance is mutated by every call, so the
        # state left behind by one page could leak into the result for the next page.
        article = Article("")
        article.set_html(html)
        article.parse()
        return ' '.join(article.text.split())

    @staticmethod
    def data_structure_converter(docs):
        """
        Converts a list of loaded documents into plain dictionaries.

        :param docs: List of documents to be converted. Each one must expose a
                     ``page_content`` attribute and a ``metadata`` mapping with a 'source' key.
        :return: List of dictionaries, each representing a document with 'text' and 'url' keys.
        """
        return [
            {'text': doc.page_content, 'url': doc.metadata['source']}
            for doc in docs
        ]

    @staticmethod
    def _split_documents(documents):
        """
        Splits documents into the ones to keep and the suspected junk.

        :param documents: List of dictionaries with 'text' and 'url' keys.
        :return: Tuple (correct_documents, junk_documents); every input document ends up
                 in exactly one of the two lists.
        """
        junk_documents = WebExtractor.suspected_junk(documents)

        # Filter by identity: comparing dictionaries by value would also discard the first
        # copy of a page whenever an identical (same text and url) copy was flagged as junk.
        junk_ids = {id(doc) for doc in junk_documents}
        correct_documents = [doc for doc in documents if id(doc) not in junk_ids]
        return correct_documents, junk_documents

    @staticmethod
    def extract_web(url, depth=1):
        """
        Extracts web content from the specified URL(s) using a recursive approach.

        Prints the number of downloaded pages and the number of suspected junk pages.
        A failure while loading one address is printed and that address is skipped;
        the remaining addresses are still processed.

        :param url: URL or list of URLs to extract content from.
        :param depth: Maximum depth for recursive extraction (default is 1).
        :return: Tuple containing two lists of dictionaries - (correct_documents, junk_documents).
                 Each dictionary represents a document with 'text' and 'url' keys.
        :raises TypeError: If the 'url' parameter is not of the string type or a list of string variables.
        """
        if isinstance(url, str):
            url_list = [url]
        elif isinstance(url, list) and all(isinstance(item, str) for item in url):
            url_list = url
        else:
            raise TypeError("The 'url' parameter should be of the string type or a list with string variables.")

        all_docs = []
        for address in url_list:
            try:
                loader = RecursiveUrlLoader(url=address, max_depth=depth, extractor=WebExtractor.newspaper_extractor)
                all_docs.extend(loader.load())
            except Exception as e:
                # Best effort: report the problem and move on, so one bad address
                # does not throw away all the addresses that follow it.
                print(f"Exception: {e}")
                continue

        documents = WebExtractor.data_structure_converter(all_docs)
        correct_documents, junk_documents = WebExtractor._split_documents(documents)

        print(f"Number of pages downloaded: {len(all_docs)}")
        print(f"Number of suspected junks: {len(junk_documents)}")

        return correct_documents, junk_documents