In [1]:
# Search Tool

from tavily import TavilyClient
from dotenv import load_dotenv
load_dotenv()
import os
from rich.pretty import pprint



In [2]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
tavily_client = TavilyClient(api_key=os.environ['TVLY_API_KEY'])

In [3]:
# results = tavily_client.search("What is anomaly detection?")

In [4]:
# results['results'][0]['content']

In [5]:
# results

In [6]:
query_="What can LLM Agents DO?"
results = tavily_client.search(query_,
                               include_raw_content=True)

In [7]:
# pprint(results['results'][0]['raw_content'])

In [3]:
import requests
import random
import time
from requests import get
from bs4 import BeautifulSoup

# List of user-agents
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', 
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko'
]

# Proxy details
PROXIES = {
    'http': 'http://your_proxy_server:port',
    'https': 'https://your_proxy_server:port'
}

def get_text_from_url(url: str) -> str:
    """Fetches the text content from a given URL.

    Args:
        url: The URL to fetch the content from.
    Returns:
        The text content of the page, or None if an error occurs.
    """
    # Simulate human-like behavior with random sleep durations
    time.sleep(random.randint(2, 5))  # Sleep for 2-5 seconds
    try:
        # Randomly select a user-agent
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        response = get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract text from all elements, stripping whitespace
        text = ' '.join([s.get_text(strip=True) for s in soup.find_all()])
        return text
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching content from {url}: {e}")
        return None

In [9]:
# get list of urls
urls = [result['url'] for result in results['results']]

In [10]:
list_of_texts = [get_text_from_url(url) for url in urls]

In [32]:
import os
import asyncio
import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from aiohttp import ClientSession
from rank_bm25 import BM25Okapi
from typing import List
import faiss
from dotenv import load_dotenv

# Load environment variables (make sure you have a .env file with your OpenAI API key)
load_dotenv()

class RAGSystem:
    def __init__(self, texts: List[dict], chunk_size: int = 500, embedding_model: str = "text-embedding-3-small"):
        self.chunks = self.chunk_texts(texts, chunk_size)
        self.bm25 = self.create_bm25()
        self.embedding_model = embedding_model
        self.client = OpenAIEmbeddings(model=self.embedding_model)
        self.embeddings = None
        self.index = None

    def chunk_texts(self, texts: List[dict], chunk_size: int) -> List[dict]:
        chunks = []
        for item in texts:
            text = item['content']
            url = item['url']
            words = text.split()
            for i in range(0, len(words), chunk_size):
                chunk = ' '.join(words[i:i+chunk_size])
                chunks.append({'content': chunk, 'url': url})
        return chunks

    def create_bm25(self):
        tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
        return BM25Okapi(tokenized_chunks)

    def create_embeddings(self):
        if self.embeddings is None:
            texts = [chunk['content'] for chunk in self.chunks]
            self.embeddings = self.client.embed_documents(texts)
        return self.embeddings

    def create_faiss_index(self):
        if self.index is None:
            embeddings = self.create_embeddings()
            dimension = len(embeddings[0])
            embeddings = np.array(embeddings)
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings)
        return self.index

    def keyword_search(self, query: str, top_k: int = 5) -> List[dict]:
        scores = self.bm25.get_scores(query.split())
        top_indices = np.argsort(scores)[-top_k:][::-1]
        return [self.chunks[i] for i in top_indices]

    def semantic_search(self, query: str, top_k: int = 5) -> List[dict]:
        query_embedding = self.client.embed_documents([query])[0]
        index = self.create_faiss_index()
        distances, indices = index.search(np.array([query_embedding]), top_k)
        return [self.chunks[i] for i in indices[0]]

    def combined_search(self, query: str, top_k: int = 5, alpha: float = 0.5) -> List[dict]:
        keyword_scores = self.bm25.get_scores(query.split())
        query_embedding = self.client.embed_documents([query])[0]
        index = self.create_faiss_index()
        distances, indices = index.search(np.array([query_embedding]), len(self.chunks))
        semantic_scores = np.zeros(len(self.chunks))
        semantic_scores[indices[0]] = 1 / (1 + distances[0])
        combined_scores = alpha * keyword_scores + (1 - alpha) * semantic_scores
        top_indices = np.argsort(combined_scores)[-top_k:][::-1]
        return [self.chunks[i] for i in top_indices]

In [35]:
dict_of_texts = [{'content': get_text_from_url(url), 'url': url} for url in urls]

In [36]:

# Example usage
# texts = [
#     "The quick brown fox jumps over the lazy dog.",
#     "A journey of a thousand miles begins with a single step.",
#     "To be or not to be, that is the question.",
#     "All that glitters is not gold.",
#     "Where there's a will, there's a way."
# ]
texts = dict_of_texts

rag = RAGSystem(texts)

query = "LLM and its reasearch"
print("Keyword search results:")
print(rag.keyword_search(query))

print("\nSemantic search results:")
print(rag.semantic_search(query))

print("\nCombined search results:")
print(rag.combined_search(query))

Keyword search results:
[{'content': "to review and assess its effectiveness. LLM-based agents use internal feedback mechanisms, drawing on existing models to refine their strategies. They also interact with humans to adjust their plans based on human feedback and preferences. Agents can also gather insights from their environments, both real and virtual, using outcomes and observations to refine their plans further.Two effective methods for incorporating feedback in planning areReActandReflexion.ReAct, for instance, helps an LLM solve complex tasks by cycling through a sequence of thought, action, and observation, repeating these steps as needed. It takes in feedback from the environment, which can include observations as well as input from humans or other models. This method allows the LLM to adjust its approach based on real-time feedback, enhancing its ability to answer questions more effectively.Tools useTools in this term are various resources that help LLM agents connect with ex

In [37]:
final_results = rag.combined_search(query)

In [38]:
final_results[0]

{'content': "to review and assess its effectiveness. LLM-based agents use internal feedback mechanisms, drawing on existing models to refine their strategies. They also interact with humans to adjust their plans based on human feedback and preferences. Agents can also gather insights from their environments, both real and virtual, using outcomes and observations to refine their plans further.Two effective methods for incorporating feedback in planning areReActandReflexion.ReAct, for instance, helps an LLM solve complex tasks by cycling through a sequence of thought, action, and observation, repeating these steps as needed. It takes in feedback from the environment, which can include observations as well as input from humans or other models. This method allows the LLM to adjust its approach based on real-time feedback, enhancing its ability to answer questions more effectively.Tools useTools in this term are various resources that help LLM agents connect with external environments to pe

In [5]:
import os
import asyncio
import numpy as np
from rank_bm25 import BM25Okapi
from typing import List
import faiss
import fastavro
# from fastavro.schema import load_schema
from dotenv import load_dotenv
import tiktoken
import re
import litellm
from rerankers import Reranker
from functools import lru_cache  # Import lru_cache


class LiteLLMEmbeddingClient:
    def __init__(self, model: str, api_key: str):
        self.model = model
        self.api_key = api_key
    async def embed_document(self, text: str) -> List[float]:
        response = litellm.embedding(input=[text], model=self.model, api_key=self.api_key)
        return response['data'][0]['embedding']

    async def embed_documents(self, texts: List[str]) -> List[List[float]]:
        tasks = [self.embed_document(text) for text in texts]
        return await asyncio.gather(*tasks)

    async def embed_query(self, query: str) -> List[float]:
        return await self.embed_document(query)

class RetrievalEngine:
    def __init__(self, texts: List[dict],
                chunk_size: int = 500,
                overlap: int = 100,
                tokens: bool = False,
                embed_client = LiteLLMEmbeddingClient(model= "text-embedding-3-small",
                                                      api_key=os.environ['OPENAI_API_KEY']),
                reranker = Reranker('flashrank')):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.tokens = tokens
        self.chunks = self.chunk_texts(texts)
        self.bm25 = self.create_bm25()
        self.embed_client = embed_client
        self.embeddings = None
        self.index = None
        self.reranker = reranker
        

    def chunk_texts(self, texts: List[dict]) -> List[dict]:
        chunks = []
        for item in texts:
            text = item['content']
            url = item['url']
            if self.tokens:
                chunked_texts = self.chunk_text_by_tokens(text, self.chunk_size, self.overlap)
            else:
                chunked_texts = self.chunk_text(text, self.chunk_size, self.overlap)
            for chunk in chunked_texts:
                chunks.append({'content': chunk, 'url': url})
        return chunks

    def chunk_text(self, text, max_char_length=1000, overlap=0):
        chunks = []
        current_chunk = ""
        sentences = re.split(r'(\.|\?|!)', text.replace('\n', ' '))

        for sentence in sentences:
            trimmed_sentence = sentence.strip()
            if not trimmed_sentence:
                continue

            chunk_length = len(current_chunk) + len(trimmed_sentence) + 1
            lower_bound = max_char_length - max_char_length * 0.5
            upper_bound = max_char_length + max_char_length * 0.5

            if lower_bound <= chunk_length <= upper_bound and current_chunk:
                current_chunk = re.sub(r'^\.\s+', "", current_chunk).strip()
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = ""
            elif chunk_length > upper_bound:
                current_chunk = re.sub(r'^\.\s+', "", current_chunk).strip()
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = trimmed_sentence
            else:
                current_chunk += f" {trimmed_sentence}"

        if current_chunk:
            chunks.append(current_chunk)

        if overlap > 0:
            overlapped_chunks = []
            for i in range(len(chunks)):
                start = max(0, i - overlap)
                end = min(len(chunks), i + 1)
                overlapped_chunks.append(' '.join(chunks[start:end]))
            return overlapped_chunks

        return chunks

    def chunk_text_by_tokens(self, text, max_token_length=100, overlap=0):
        enc = tiktoken.get_encoding("cl100k_base")
        tokens = enc.encode(text)
        chunks = []

        for i in range(0, len(tokens), max_token_length - overlap):
            chunk_tokens = tokens[i:i + max_token_length]
            chunk_text = enc.decode(chunk_tokens)
            chunks.append(chunk_text)

        return chunks

    def create_bm25(self):
        tokenized_chunks = [chunk['content'].split() for chunk in self.chunks]
        return BM25Okapi(tokenized_chunks)

    @lru_cache(maxsize=128)
    async def create_embeddings(self):
        if self.embeddings is None:
            texts = [chunk['content'] for chunk in self.chunks]
            self.embeddings = await self.embed_client.embed_documents(texts)
        return self.embeddings
    
    @lru_cache(maxsize=128)
    async def semantic_query_run(self, query: str, top_k: int = 5) -> List[dict]:
        query_embedding = await self.embed_client.embed_documents([query])
        index = await self.create_faiss_index()  # Ensure this is awaited
        distances, indices = index.search(np.array(query_embedding).astype('float32'), top_k)
        return distances, indices
    
    @lru_cache(maxsize=128)
    async def semantic_search(self, query: str, top_k: int = 5) -> List[dict]:
        distance, indices = await self.semantic_query_run(query, top_k)
        return [self.chunks[i] for i in indices[0]]

    @lru_cache(maxsize=128)
    async def create_faiss_index(self):
        if self.index is None:
            embeddings = await self.create_embeddings()
            dimension = len(embeddings[0])
            embeddings = np.array(embeddings)
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings)
        return self.index

    @lru_cache(maxsize=128)
    async def keyword_search(self, query: str, top_k: int = 5) -> List[dict]:
        # Get BM25 scores for the query
        scores = self.bm25.get_scores(query.split())
        # Get indices of top_k scores in descending order
        top_indices = np.argsort(scores)[-top_k:][::-1]
        # Return the chunks corresponding to the top indices
        return [self.chunks[i] for i in top_indices]

    @lru_cache(maxsize=128)
    async def combined_search(self, query: str, top_k: int = 5, alpha: float = 0.5) -> List[dict]:
        keyword_scores = self.bm25.get_scores(query.split())
        keyword_scores = self.normalize_scores(keyword_scores)  # Normalize keyword scores
        distances, indices = await self.semantic_query_run(query, len(self.chunks))
        semantic_scores = np.zeros(len(self.chunks))  # Initialize semantic scores
        semantic_scores[indices[0]] = 1 / (1 + distances[0])  # Calculate semantic scores
        combined_scores = alpha * keyword_scores + (1 - alpha) * semantic_scores  # Combine scores
        top_indices = np.argsort(combined_scores)[-top_k:][::-1]  # Get top indices
        return [self.chunks[i] for i in top_indices]
    
    def normalize_scores(self, scores: np.ndarray) -> np.ndarray:
        """Normalize an array of scores to a range between 0 and 1.
        Args:
            scores (np.ndarray): The array of scores to normalize.
        Returns:
            np.ndarray: The normalized scores."""
        # Find the minimum and maximum scores
        min_score = np.min(scores)
        max_score = np.max(scores)
        # Normalize the scores to a range between 0 and 1
        return (scores - min_score) / (max_score - min_score)
    
    async def save_faiss_index(self, file_path: str):
        """Save the FAISS index to a file.
        
        Args:
            file_path (str): The path to the file where the index will be saved.
        """
        if self.index is None:
            await self.create_faiss_index()
        faiss.write_index(self.index, file_path)

    async def save_bm25_index_avro(self, file_path: str):
        """Save the BM25 index to an Avro file.
        
        Args:
            file_path (str): The path to the file where the index will be saved.
        """
        schema = {
            "type": "record",
            "name": "BM25Index",
            "fields": [
                {"name": "doc_freqs", "type": {"type": "array", "items": {"type": "map", "values": "int"}}},
                {"name": "idf", "type": {"type": "array", "items": "double"}},
                {"name": "doc_len", "type": {"type": "array", "items": "int"}},
                {"name": "avgdl", "type": "double"}
            ]
        }
        bm25_data = {
            'doc_freqs': self.bm25.doc_freqs,
            'idf': self.bm25.idf,
            'doc_len': self.bm25.doc_len,
            'avgdl': self.bm25.avgdl
        }
        with open(file_path, 'wb') as f:
            fastavro.writer(f, schema, [bm25_data])

    async def load_bm25_index_avro(self, file_path: str):
        """Load the BM25 index from an Avro file.
        
        Args:
            file_path (str): The path to the file from which the index will be loaded.
        """
        with open(file_path, 'rb') as f:
            reader = fastavro.reader(f)
            bm25_data = next(reader)
        self.bm25 = BM25Okapi([])
        self.bm25.doc_freqs = bm25_data['doc_freqs']
        self.bm25.idf = bm25_data['idf']
        self.bm25.doc_len = bm25_data['doc_len']
        self.bm25.avgdl = bm25_data['avgdl']


    @lru_cache(maxsize=128)
    async def rerank_chunks(self, query: str, chunks: List[dict], top_k: int = 5) -> List[dict]:
        """Rerank chunks of text using the Reranker library.
        
        Args:
            query (str): The query string.
            chunks (List[dict]): The list of chunks to rerank.
            top_k (int): The number of top results to return.
        
        Returns:
            List[dict]: The top-k ranked chunks.
        """
        # Extract texts from chunks
        texts = [chunk['content'] for chunk in chunks]
        # Perform reranking
        results = await self.reranker.rank_async(query=query, docs=texts)
        # Extract top-k results
        top_results = results.top_k(top_k)
        # Map results back to chunks
        ranked_chunks = [chunks[result.document.doc_id] for result in top_results]
        return ranked_chunks
    


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2
You don't have the necessary dependencies installed to use FlashRankRanker.
Please install the necessary dependencies for FlashRankRanker by running `pip install "rerankers[flashrank]"` or `pip install "rerankers[all]" to install the dependencies for all reranker types.


In [1]:
import re

In [15]:
def extract_rewritten_prompt(rendered_text):
    pattern = r'<output>\n\s*(.?)\s\n</output>'
    match = re.search(pattern,  rendered_text.replace('\n', ' '), re.DOTALL)
    if match:
        output_content = match.group(1).strip()
        # Split the content by newlines and take the first non-empty line
        lines = output_content.split('\n')
        return next((line.strip() for line in lines if line.strip()), '')
    return None

In [16]:
prompt = """
<output>
foundation time series models examples python
</output>

This rewritten search query maintains the original intent of the prompt, focusing on the key terms "foundation time series models" and adding "examples" and "python" to specify the type of information being sought. This query is concise, clear, and suitable for a search API.
"""

In [17]:
extract_rewritten_prompt(prompt)

In [1]:
import duckdb

# Create a connection to a new DuckDB database (in-memory)
conn = duckdb.connect(database='/Users/jshah/Documents/GitHub/pravah/pravah.db', read_only=True)

IOException: IO Error: Could not set lock on file "/Users/jshah/Documents/GitHub/pravah/pravah.db": Conflicting lock is held in /Users/jshah/micromamba/envs/hack/bin/python3.11 (PID 93777) by user jshah. See also https://duckdb.org/docs/connect/concurrency

In [37]:
import pandas as pd

result_df = conn.execute("SELECT * FROM retrieved_chunks").fetchdf()
# print(result_df)

In [39]:
result_df.head()

Unnamed: 0,conversation_uuid,search_type,chunk
0,4410bd1d-af48-4d80-9d30-825ee0f915f5,keyword_search,{'content': strong data validation and ML fra...
1,4410bd1d-af48-4d80-9d30-825ee0f915f5,keyword_search,{'content': Dagster vs Apache Airflow — side b...
2,4410bd1d-af48-4d80-9d30-825ee0f915f5,keyword_search,{'content': atin Solanki307 Followers·Writer f...
3,4410bd1d-af48-4d80-9d30-825ee0f915f5,keyword_search,{'content': use cases.Error handling and vali...
4,4410bd1d-af48-4d80-9d30-825ee0f915f5,keyword_search,{'content': each platform.Provide sample code...


In [40]:
result_df.shape

(870, 3)

In [44]:
# merge the chat_history table with the retrieved_chunks table on conversation_uuid using SQL query
merged_df = conn.execute("""
    SELECT *
    FROM chat_history
    FULL OUTER JOIN retrieved_chunks
    ON chat_history.conversation_uuid = retrieved_chunks.conversation_uuid
""").fetchdf()


In [43]:
merged_df.to_csv('merged_df.csv', index=False)

In [32]:
from rich.pretty import pprint
import json
pprint(json.loads(result_df.search_result[20]))

In [27]:

matches = re.findall(r"(?:^|\r?\n)(?: {4}|\t)[^\r\n]{0,200}(?:\r?\n(?: {4}|\t)[^\r\n]{0,200}){0,20}\r?\n?" , text, re.MULTILINE)

In [53]:
matches = re.findall(pattern_all, text, re.MULTILINE)
for match in matches:
    print(match)

- Item 1
    - Subitem 1.1
    - Subitem 1.2
- Item 2
    1. Subitem 2.1
        - Sub-subitem 2.1.1
    2. Subitem 2.2


In [3]:
# matches = re.findall(quoted_text_pattern, text, re.MULTILINE)
# for match in matches:
#     print(match)

In [33]:
from duckduckgo_search import DDGS

results = DDGS().text("Best way to webscrape", max_results=5)
print(results)

[{'title': 'Web Scraping Python Tutorial - How to Scrape Data From A Website', 'href': 'https://www.freecodecamp.org/news/web-scraping-python-tutorial-how-to-scrape-data-from-a-website/', 'body': 'Note that this is only one of the solutions. You can attempt this in a different way too. In this solution: First of all you select all the div.thumbnail elements which gives you a list of individual products; Then you iterate over them; Because select allows you to chain over itself, you can use select again to get the title.'}, {'title': 'How To Scrape Data from Any Website: 5 Code and No-Code Methods', 'href': 'https://www.scrapin.io/blog/web-scraping', 'body': 'To help you with this, here are some of the methods that you can use depending on your data extraction needs: \u200d. 1. Manual Scraping with Upwork and Fiverr. If you are interested in manual data scraping, you can hire a freelancer via popular freelancing platforms like Upwork and Fiverr.'}, {'title': 'My ultimate guide to web sc

In [34]:
pprint(results)

In [37]:
from tavily import TavilyClient
from dotenv import load_dotenv
import os
import requests
import random
import time
from requests import get
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import warnings
from bs4 import MarkupResemblesLocatorWarning
import PyPDF2
import io
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# load_dotenv()
# api_key=os.environ['TVLY_API_KEY']

def search_query(query:str, api_key):
    tavily_client = TavilyClient(api_key=api_key)
    results = tavily_client.search(query,
                               include_raw_content=False)
    return results


# List of user-agents
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', 
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko'
]

# Proxy details
PROXIES = {
    'http': 'http://your_proxy_server:port',
    'https': 'https://your_proxy_server:port'
}

async def fetch_content(url: str) -> str:
    """Fetches the content from a given URL asynchronously.

    Args:
        url: The URL to fetch the content from.
    Returns:
        The content of the page, or an empty string if an error occurs.
    """
    # await asyncio.sleep(random.randint(2, 5))  # Simulate human-like behavior with random sleep durations
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as response:
                response.raise_for_status()
                if url.endswith(".pdf"):
                    # Handle PDF content
                    content = await response.read()
                    pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
                    text = ""
                    for page in range(len(pdf_reader.pages)):
                        text += pdf_reader.pages[page].extract_text()
                    return text
                else:
                    # Handle other content types as before
                    return await response.text()
    except aiohttp.ClientError as e:
        print(f"An error occurred while fetching content from {url}: {e}")
        return ''
    except UnicodeDecodeError as e:
        print(f"An error occurred while decoding content from {url}: {e}")
        return ''

def parse_content(content: str) -> str:
    """Parses the HTML content to extract text.

    Args:
        content: The raw HTML content.
    Returns:
        The text content of the page.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = '\n'.join([s.get_text(strip=True) for s in soup.find_all()])
    return text

async def get_text_from_url(url: str) -> str:
    """Fetches and parses the text content from a given URL asynchronously.

    Args:
        url: The URL to fetch the content from.
    Returns:
        The text content of the page, or an empty string if an error occurs.
    """
    content = await fetch_content(url)
    if content:
        return parse_content(content)
    return ''

In [38]:
text_html= await fetch_content(results[0]['href'])
if text_html:
    text_output = parse_content(text_html)

In [39]:
pprint(text_output)

In [23]:
# save text in a text file
with open("output.txt", "w") as file:
    file.write(text_output)

In [24]:
jina_api_key ='jina_2e52c403cf164e98b4b33932e010a5b4Pu-b7mFMs178dqaBxy38TPA3f12o'
import aiohttp
import asyncio

url = 'https://r.jina.ai/https://example.com'
headers = {
    'Authorization': 'Bearer {}'.format(jina_api_key),
    'X-Return-Format': 'markdown'
}

async def fetch_jina_content(url: str, headers: dict) -> str:
    url = f'https://r.jina.ai/{url}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            response.raise_for_status()
            return await response.text()

# To call the function, you would use:
# content = await fetch_jina_content(url, headers)
# print(content)


In [25]:
url = 'https://r.jina.ai/' + results[0]['href']
content = await fetch_jina_content(url, headers)

In [29]:
print(url)

https://r.jina.ai/https://www.python.org/


In [28]:
# save text in a text file
with open("output_jina.txt", "w") as file:
    file.write(content)

In [30]:
url = 'https://r.jina.ai/' + results[0]['href']
content2 = await fetch_jina_content(url, headers)

In [32]:
# content2

In [42]:
import re
def parse_content_markdown(content: str) -> str:
    """Parses the HTML content and converts it to markdown format.

    Args:
        content: The raw HTML content.
    Returns:
        The markdown formatted text content of the page.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # Handle headings
    for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        heading.string = f"{'#' * int(heading.name[1:])} {heading.get_text(strip=True)}\n"

    # Handle paragraphs
    for p in soup.find_all('p'):
        p.string = f"{p.get_text(strip=True)}\n\n"

    # Handle links
    for a in soup.find_all('a'):
        if 'href' in a.attrs:  # Check if 'href' exists
            a.string = f"[{a.get_text(strip=True)}]({a['href']})"
        

    # Handle bold and italic text
    for strong in soup.find_all('strong'):
        strong.string = f"**{strong.get_text(strip=True)}**"
    for em in soup.find_all('em'):
        em.string = f"*{em.get_text(strip=True)}*"

    # Handle unordered lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.string = f"- {li.get_text(strip=True)}\n"

    # Handle ordered lists
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li')):
            li.string = f"{i+1}. {li.get_text(strip=True)}\n"

    # Handle code blocks
    for pre in soup.find_all('pre'):
        pre.string = f"```\n{pre.get_text()}\n```"

    # Handle images
    for img in soup.find_all('img'):
        alt_text = img.get('alt', '')
        img.replace_with(f"![{alt_text}]({img['src']})")

    # Remove empty tags
    for tag in soup.find_all():
        if not tag.get_text(strip=True):
            tag.decompose()

    # Get the final markdown text
    markdown_text = soup.get_text()

    # Clean up extra newlines
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)

    return markdown_text

In [41]:
text_html

'<!DOCTYPE html>\n<html lang="en">\n    <head>\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n        \n        \n            <title>Web Scraping Python Tutorial – How to Scrape Data From A Website</title>\n        \n        <meta name="HandheldFriendly" content="True">\n        <meta name="viewport" content="width=device-width, initial-scale=1.0">\n\n        <link rel="preconnect" href="https://fonts.googleapis.com">\n        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="">\n        \n            <link rel="preload" as="style" onload="this.onload=null;this.rel=\'stylesheet\'" href="https://fonts.googleapis.com/css2?family=Lato:ital,wght@0,300;0,400;0,700;1,400&family=Roboto+Mono:wght@400;700&display=swap">\n        \n\n        \n        \n    <link rel="preload" as="style" onload="this.onload=null;this.rel=\'stylesheet\'" href="https://cdn.freecodecamp.org/news-assets/prism/1.29.0/themes/prism.min.css">\n<noscript

In [44]:
content3 = parse_content_markdown(text_html)
# save text in a text file
with open("output_mk.txt", "w") as file:
    file.write(content3)

In [45]:
from brave import AsyncBrave

brave = AsyncBrave(api_key='BSA2ididmmVVYAHiX4CNsF598VVyXNA')

query = "Best way to webscrape in python"
num_results = 10

# Use 'await' to perform the search asynchronously
search_results = await brave.search(q=query, count=num_results)

In [None]:
def search_using_brave(query, num_results=5, api_key=api_key):
    brave = AsyncBrave(api_key=api_key)
    search_results = await brave.search(q=query, count=num_results)
    web_results = search_results.web_results
    urls = [x['url'].unicode_string() for x in web_results]
    return {'results':[{'url':url} for url in urls]}

In [64]:
[0]['url'].unicode_string()

'https://nanonets.com/blog/web-scraping-with-python-tutorial/'

In [66]:
from duckduckgo_search import AsyncDDGS

results = await AsyncDDGS().atext("Best way to webscrape", max_results=5)
print(results)

[{'title': 'Web Scraping Python Tutorial - How to Scrape Data From A Website', 'href': 'https://www.freecodecamp.org/news/web-scraping-python-tutorial-how-to-scrape-data-from-a-website/', 'body': 'Note that this is only one of the solutions. You can attempt this in a different way too. In this solution: First of all you select all the div.thumbnail elements which gives you a list of individual products; Then you iterate over them; Because select allows you to chain over itself, you can use select again to get the title.'}, {'title': 'How To Scrape Data from Any Website: 5 Code and No-Code Methods', 'href': 'https://www.scrapin.io/blog/web-scraping', 'body': 'To help you with this, here are some of the methods that you can use depending on your data extraction needs: \u200d. 1. Manual Scraping with Upwork and Fiverr. If you are interested in manual data scraping, you can hire a freelancer via popular freelancing platforms like Upwork and Fiverr.'}, {'title': 'My ultimate guide to web sc

In [69]:
formatted_results = {'results':[{'url': result['href']} for result in results]}
pprint(formatted_results)
 

In [70]:
url = 'https://r.jina.ai/' + results[0]['href']
content = await fetch_jina_content(url, headers)

In [76]:
import tiktoken
def chunk_text_by_tokens(text, max_token_length=100, overlap=0):
    enc = tiktoken.get_encoding("o200k_base")
    tokens = enc.encode(text,disallowed_special=())
    chunks = []

    for i in range(0, len(tokens), max_token_length - overlap):
        chunk_tokens = tokens[i:i + max_token_length]
        chunk_text = enc.decode(chunk_tokens)
        chunks.append(chunk_text)

    return chunks

In [86]:
chunks = chunk_text_by_tokens(content,max_token_length=500, overlap=100)

In [87]:
len(chunks)

15

In [88]:
len(chunks[0])

2098

In [112]:
import bm25s
from bm25s import BM25
import Stemmer
def create_bm25(chunks):
    tokenized_chunks = bm25s.tokenize([chunk for chunk in chunks],stopwords='en',stemmer=Stemmer.Stemmer("english"))

    bm25 = BM25()
    bm25.index(tokenized_chunks)
    return bm25

In [113]:
bm25_index = create_bm25(chunks)

                                                            

In [115]:
chunks[13]

'-code/)\n*   [Learn PHP](https://www.freecodecamp.org/news/the-php-handbook/)\n*   [Learn Java](https://www.freecodecamp.org/news/the-java-handbook/)\n*   [Learn Swift](https://www.freecodecamp.org/news/the-swift-handbook/)\n*   [Learn Golang](https://www.freecodecamp.org/news/learn-golang-handbook/)\n*   [Learn Node.js](https://www.freecodecamp.org/news/get-started-with-nodejs/)\n*   [Learn CSS Grid](https://www.freecodecamp.org/news/complete-guide-to-css-grid/)\n*   [Learn Solidity](https://www.freecodecamp.org/news/learn-solidity-handbook/)\n*   [Learn Express.js](https://www.freecodecamp.org/news/the-express-handbook/)\n*   [Learn JS Modules](https://www.freecodecamp.org/news/javascript-es-modules-and-module-bundlers/)\n*   [Learn Apache Kafka](https://www.freecodecamp.org/news/apache-kafka-handbook/)\n*   [REST API Best Practices](https://www.freecodecamp.org/news/rest-api-design-best-practices-build-a-rest-api/)\n*   [Front-End JS Development](https://www.freecodecamp.org/news/f

In [116]:
query_tokens = bm25s.tokenize('python best practice webscrape',stopwords='en',stemmer=Stemmer.Stemmer("english"))
docs, scores = bm25_index.retrieve(query_tokens, k=10)

                                                     

In [117]:
len(docs[0])

10

In [118]:
docs[0][0]

13

In [120]:
chunks[13]

'-code/)\n*   [Learn PHP](https://www.freecodecamp.org/news/the-php-handbook/)\n*   [Learn Java](https://www.freecodecamp.org/news/the-java-handbook/)\n*   [Learn Swift](https://www.freecodecamp.org/news/the-swift-handbook/)\n*   [Learn Golang](https://www.freecodecamp.org/news/learn-golang-handbook/)\n*   [Learn Node.js](https://www.freecodecamp.org/news/get-started-with-nodejs/)\n*   [Learn CSS Grid](https://www.freecodecamp.org/news/complete-guide-to-css-grid/)\n*   [Learn Solidity](https://www.freecodecamp.org/news/learn-solidity-handbook/)\n*   [Learn Express.js](https://www.freecodecamp.org/news/the-express-handbook/)\n*   [Learn JS Modules](https://www.freecodecamp.org/news/javascript-es-modules-and-module-bundlers/)\n*   [Learn Apache Kafka](https://www.freecodecamp.org/news/apache-kafka-handbook/)\n*   [REST API Best Practices](https://www.freecodecamp.org/news/rest-api-design-best-practices-build-a-rest-api/)\n*   [Front-End JS Development](https://www.freecodecamp.org/news/f

INFO:flashrank.Ranker:Downloading ms-marco-MiniLM-L-12-v2...


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


ms-marco-MiniLM-L-12-v2.zip: 100%|██████████| 21.6M/21.6M [00:03<00:00, 6.64MiB/s]


NameError: name 'content' is not defined