In [8]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search
import openai
from io import BytesIO
from PyPDF2 import PdfReader
from docx import Document
#from langchain.chat_models import ChatOpenAI
#from langchain_community.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from reference import encoder, dims, getIndex, llm


In [9]:
# Function to find downloadable files from a URL
def find_downloadable_links(url):
    try:
        # Send HTTP request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful

        # Parse the page content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links in the page
        links = soup.find_all('a', href=True)

        downloadable_files = []
        
        # Check if the link points to a downloadable file (e.g., .pdf, .docx, .txt)
        for link in links:
            href = link['href']
            if href.endswith(('.pdf', '.docx', '.txt', '.xls', '.pptx', '.csv')):
                downloadable_files.append(href)

        return downloadable_files
    except Exception as e:
        print(f"Error with URL {url}: {e}")
        return []


In [10]:
# Function to download the document content
def download_document(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.content
    except Exception as e:
        print(f"Error downloading document from {url}: {e}")
        return None

In [11]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_data):
    try:
        pdf_reader = PdfReader(BytesIO(pdf_data))
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

# Function to extract text from a Word document (.docx)
def extract_text_from_docx(docx_data):
    try:
        doc = Document(BytesIO(docx_data))
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    except Exception as e:
        print(f"Error extracting text from DOCX: {e}")
        return ""

# Function to extract text from a text file
def extract_text_from_txt(txt_data):
    return txt_data.decode('utf-8')

In [12]:
# Function to summarize text using LangChain and GPT-4
def summarize_text_with_langchain(text):
    try:
        # Create a prompt template for summarization
        prompt_template = "Please summarize the following text:\n\n{text}"
        prompt = PromptTemplate(input_variables=["text"], template=prompt_template)
        
        # Create the LLM chain
        chain = LLMChain(llm=llm, prompt=prompt)
        
        # Get the summary from the model
        summary = chain.run(text)
        return summary
    except Exception as e:
        print(f"Error summarizing with LangChain: {e}")
        return "Error generating summary."


In [13]:
# Function to get a summary of the document
def get_document_summary(url):
    # Download the document based on its extension
    document_data = download_document(url)
    
    if not document_data:
        return "Unable to download the document."

    # Check the file type by extension and extract text accordingly
    if url.endswith(".pdf"):
        text = extract_text_from_pdf(document_data)
    elif url.endswith(".docx"):
        text = extract_text_from_docx(document_data)
    elif url.endswith(".txt"):
        text = extract_text_from_txt(document_data)
    else:
        return "Unsupported file type for summarization."

    # If no text extracted, return a message
    if not text.strip():
        return "No text extracted from the document."

    # Summarize the extracted text using LangChain and GPT-4
    return summarize_text_with_langchain(text)

# Function to perform the search and get the links to documents
def search_documents(query, num_results=5):
    print(f"Searching for '{query}'...")
    
    # Perform Google search and get the URLs of the top results
    search_results = search(query, num_results=num_results)

    all_downloadable_links = {}

    # Check each URL for downloadable files
    for result in search_results:
        print(f"Checking {result}...")
        downloadable_links = find_downloadable_links(result)
        if downloadable_links:
            all_downloadable_links[result] = downloadable_links

    return all_downloadable_links

In [14]:
# Main function to search and display downloadable file links
def main():
    query = input("Enter the search query: ")
    num_results = 10  #int(input("How many search results do you want to check? "))

    downloadable_files = search_documents(query, num_results)

    if downloadable_files:
        print("\nFound downloadable files at these links:")
        for url, files in downloadable_files.items():
            print(f"\nURL: {url}")
            for file in files:
                print(f"  - {file}")
                # Get a summary of each file
                summary = get_document_summary(file)
                print(f"    Summary: {summary}")
    else:
        print("No downloadable files found.")

# Run the script
if __name__ == "__main__":
    main()

Enter the search query:  machine learning


Searching for 'machine learning'...
Checking https://en.wikipedia.org/wiki/Machine_learning...
Checking https://www.ibm.com/think/topics/machine-learning...
Checking https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained...
Checking https://developers.google.com/machine-learning/crash-course...
Checking https://www.coursera.org/learn/machine-learning...
Checking https://www.google.com/search?num=12...
Checking https://www.geeksforgeeks.org/machine-learning/...
Checking https://www.databricks.com/glossary/machine-learning-models...
Checking https://azure.microsoft.com/en-us/products/machine-learning...
Checking https://www.sciencedirect.com/topics/computer-science/machine-learning...
Error with URL https://www.sciencedirect.com/topics/computer-science/machine-learning: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/topics/computer-science/machine-learning

Found downloadable files at these links:

URL: https://en.wikipedia.org/wiki/Machine_learning


  chain = LLMChain(llm=llm, prompt=prompt)
  summary = chain.run(text)
Failed to multipart ingest runs: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')


    Summary: The paper "Measuring the Efficiency of the Intraday Forex Market with a Universal Data Compression Algorithm" by Armin Shmilovici et al. explores the use of a universal Variable Order Markov (VOM) model to test the weak form of the Efficient Market Hypothesis (EMH) in the intraday Forex market. The study analyzes 12 pairs of international currency exchange rates over various time intervals. The VOM model, which detects recurring patterns in data for compression and prediction, found statistically significant compression in all time-series, indicating predictability above random. However, this predictability was insufficient to develop a profitable trading strategy, suggesting that the Forex market is efficient most of the time. The study concludes that while the VOM model can identify patterns, these do not translate into economic profits due to market efficiency and transaction costs. The research highlights the challenges of using predictive models in financial markets a

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')


    Summary: The paper "An Analysis of Single-Layer Networks in Unsupervised Feature Learning" by Adam Coates, Honglak Lee, and Andrew Y. Ng explores the effectiveness of single-layer networks in unsupervised feature learning. The authors argue that simple factors, such as the number of hidden nodes, can be more crucial for high performance than the complexity of the learning algorithm or model depth. They apply various off-the-shelf feature learning algorithms, including sparse auto-encoders, sparse RBMs, K-means clustering, and Gaussian mixtures, to datasets like CIFAR, NORB, and STL using single-layer networks. Their analysis focuses on the impact of model setup changes, such as receptive field size, number of hidden nodes, stride, and whitening. The results indicate that a large number of hidden nodes and dense feature extraction are critical for achieving high performance. Surprisingly, K-means clustering, a simple and fast algorithm, achieved state-of-the-art performance on CIFAR

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')


    Summary: The paper presents a novel method for generic visual categorization, which involves identifying object content in natural images while generalizing across variations inherent to the object class. The proposed "bag of keypoints" method uses vector quantization of affine invariant descriptors of image patches and is implemented using Naïve Bayes and SVM classifiers. The method is simple, computationally efficient, and invariant to affine transformations, occlusion, lighting, and intra-class variations. The authors demonstrate the method's robustness to background clutter and its good categorization accuracy without exploiting geometric information. The approach is inspired by text categorization methods and involves detecting and describing image patches, assigning patch descriptors to clusters, constructing a bag of keypoints, and applying a multi-class classifier. The paper compares the performance of Naïve Bayes and SVM classifiers on a seven-class dataset and a four-clas

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')


    Summary: The paper presents a novel method for generic visual categorization, which involves identifying the object content of natural images while generalizing across variations inherent to the object class. This method, called the "bag of keypoints," is based on vector quantization of affine invariant descriptors of image patches. The authors propose and compare two implementations using different classifiers: Naïve Bayes and Support Vector Machines (SVM). The method is noted for its simplicity, computational efficiency, and invariance to affine transformations, occlusion, lighting, and intra-class variations. The paper demonstrates the method's robustness to background clutter and its ability to achieve good categorization accuracy without exploiting geometric information. The approach is inspired by text categorization methods and involves detecting and describing image patches, assigning patch descriptors to clusters, constructing a bag of keypoints, and applying a multi-class

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')


KeyboardInterrupt: 

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Forbidden"}')
