In [None]:
from bs4 import BeautifulSoup
import urllib, requests, random, time
from concurrent.futures import ThreadPoolExecutor, as_completed
from fp.fp import FreeProxy
from fake_useragent import UserAgent


# def extract_proxies(html):
#     soup = BeautifulSoup(html, 'html.parser')
#     text = soup.get_text()
    
#     # Regular expression to match IP addresses and ports
#     proxy_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]{1,5}\b')
#     proxies = proxy_pattern.findall(text)
    
#     return proxies

# def fetch_proxies():
#     url = "https://free-proxy-list.net/"
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     ips_ports = extract_proxies(soup.text)
#     proxies = []

#     for ip_port in ips_ports:
#         proxy = f"http://{ip_port}"
#         proxies.append(proxy)
    
#     return proxies


class GoogleSearch:
    def __init__(self, query: str) -> None:
        self.query = query
        escaped_query = urllib.parse.quote_plus(query)
        self.URL = f"https://www.google.com/search?q={escaped_query}"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36"
        }
        self.ua = UserAgent()
        self.proxy = None
        self.links = self.get_initial_links()
        self.all_page_data = self.all_pages()

    def get_random_user_agent(self):
        self.headers["User-Agent"] = self.ua.random
        return None
    
    def get_random_proxy(self):
        # return FreeProxy(https=True, google=True, timeout=1.0).get(repeat=False) country_id=['US', 'GB', 'CA', 'AU'], 
        return FreeProxy(google=True, https=True, timeout=20.0).get()

    def clean_urls(self, anchors: list[str]) -> list[str]:

        links: list[str] = []
        for a in anchors:
            links.append(
                list(filter(lambda l: l.startswith("url=http"), a["href"].split("&")))
            )

        links = [
            link.split("url=")[-1]
            for sublist in links
            for link in sublist
            if len(link) > 0
        ]

        return links

    def read_url_page(self, url: str) -> str:
        self.get_random_user_agent()
        # response = requests.get(url, headers=self.headers)
        response = requests.get(url, headers=self.headers, proxies={"https": self.proxy, "http": self.proxy})
        print("read page", response.status_code)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(strip=True)

    def get_initial_links(self) -> list[str]:
        """
        scrape google for the query with keyword based search
        """
        # print("Searching Google...")
        self.proxy = self.get_random_proxy()
        self.get_random_user_agent()
        response = requests.get(self.URL, headers=self.headers, proxies={"https": self.proxy, "http": self.proxy}) #"https": proxy, 
        print("inital links", response.status_code)
        # response = requests.get(self.URL, headers=self.headers)
        # Print the response status and content for debugging
        code429 = 0
        while True:
            if response.status_code in [429, 400]:
                print(f"Got a {response.status_code}. Retrying.")
                retry_after = int(response.headers.get("Retry-After", 4))  # Default to 60 seconds if not provided
                print(f"Rate limited. Retrying after {retry_after} seconds.")
                time.sleep(retry_after)
                proxy = self.get_random_proxy()
                print(proxy)
                self.get_random_user_agent()
                response = requests.get(self.URL, headers=self.headers, proxies={"https": proxy, "http": proxy}) #"https": proxy, 
                code429 += 1
                if code429 > 5:
                    print("Too many 429s. Exiting.", response.status_code)
                    break
            else:
                print("Finally got through", response.status_code)
                break

        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        return self.clean_urls(anchors)

    def all_pages(self) -> list[tuple[str, str]]:
        print(self.links)
        data: list[tuple[str, str]] = []
        with ThreadPoolExecutor(max_workers=4) as executor:

            future_to_url = {
                executor.submit(self.read_url_page, url): url for url in self.links[:3]
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    output = future.result()
                    data.append((url, output))

                except requests.exceptions.HTTPError as e:
                    print(e)

        return data

In [None]:
from crawl4ai import AsyncWebCrawler
import nest_asyncio
import asyncio
from fake_useragent import UserAgent
# https://pypi.org/project/googlesearch-python/
from googlesearch import search
import csv
import random

nest_asyncio.apply()

verbose=False

async def main(urls):
    content = []
    ua = UserAgent()
    # Can add proxy in AsyncWebCrawler(verbose=False, proxy="http://127.0.0.1:7890")
    async with AsyncWebCrawler(verbose=verbose) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, word_count_threshold=2000, user_agent=ua.random, verbose=verbose)
            content.append(result)
    return content

if __name__ == "__main__":
    content = asyncio.run(main())


def get_url_info(queries, num_results=3, num_words=2000, description_only=True):
    urls = []
    if isinstance(queries, str):
        queries = [queries]
    for query in queries:
        for url in search(query, 
                          sleep_interval=random.randint(2, 5), 
                          num_results=num_results,
                          timeout=1000,
                          advanced=True):
            if description_only:
                urls.append(url.description)
            else:
                urls.append(url.url)

    context = ""

    if description_only:
        for con in urls:
            if con is not None:
                context += "\n" + con
            else:
                continue
    else:
        content = asyncio.run(main(urls))
        for con in content:
            if con.markdown is not None:
                truncated_content = " ".join(con.markdown.split()[:num_words])
                context += "\n" + truncated_content
            else:
                continue

    return context

def make_csv(results, filename, header=None, verbose=True):
    # Check if the file exists
    file_exists = os.path.isfile(filename)

    # Open the file in append mode if it exists, otherwise write mode
    with open(filename, 'a' if file_exists else 'w', newline='') as file:
        writer = csv.writer(file)
        
        # Write the header only if the file does not exist
        if not file_exists and header is not None:
            writer.writerow(header)
        
        # Write each row to the CSV file
        for row in results:
            writer.writerow(row)

    if verbose: 
        print(f"Data has been {'appended to' if file_exists else 'written to'} {filename}")
    return None

In [8]:
!pip3 install scrapingbee fake_useragent

In [None]:
import time, os
import requests
import urllib.parse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import numpy as np
import random

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

scrapingbee_key = os.getenv("SCRAPINGBEE_API_KEY")

class ProxyWebScraper:
    def __init__(self, dest_url, api_key, verbose=False):
        self.verbose = verbose
        self.api_key = api_key
        self.dest_url = dest_url
        self.ua = UserAgent()
        self.session = requests.Session()
        self.headers = {"User-Agent": self.ua.random}

    # def get_random_user_agent(self):
    #     self.headers["User-Agent"] = self.ua.random
    #     return None
    
    def send_request(self):
        try:
            # self.get_random_user_agent()
            response = self.session.get(
                url="https://app.scrapingbee.com/api/v1",
                params={
                    "url": self.dest_url,
                    "api_key": self.api_key,
                    "render_js": False,
                },
                headers=self.headers
            )
            response.raise_for_status()
            if self.verbose:
                print(f'Response HTTP Status Code: {response.status_code}')
                print(f'Response HTTP Response Body: {response.content}')
            return response
        except requests.exceptions.RequestException as e:
            print(f'HTTP Request failed: {e}')
            return None

    def get_links(self, response):
        if response is None:
            return []
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.find_all('a', href=True)

    def get_sub_links(self, links):
        domain_name = urllib.parse.urlsplit(self.dest_url).netloc
        return [link['href'] for link in links if domain_name in link['href']]
    
    def update_url(self, new_url):
        self.dest_url = new_url
        return None
    

counter = 0
start_time = time.time()
hits = 200
for i in range(hits):
    scraper = ProxyWebScraper(dest_url="https://praxissolutions.com", api_key=scrapingbee_key, verbose=False)
    response = scraper.send_request()
    links = scraper.get_links(response)
    sub_links = scraper.get_sub_links(links)
    counter += 1
    time.sleep(random.randint(2, 20))
    # While loop to simulate user behavior?
    if np.random.choice([0, 1], p=[0.7, 0.3])==1:
        sub_link = random.choice(sub_links)
        full_url = urllib.parse.urljoin(response.url, sub_link)
        scraper.update_url(full_url)
        sub_response = scraper.send_request()
        counter += 1
    if i%20==0:
        end_time = time.time()
        print(f"Round {i} - {counter} credits used.\n   Time elapsed: {np.round((end_time-start_time)/60, 2)}.\n   Time remaining: {np.round((end_time-start_time)/60/counter*(hits-i), 2)}")
print(f"{counter} total credits used.")

In [None]:
import time, os, json
import aiohttp
import asyncio
import urllib.parse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import numpy as np
import random

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

scrapingbee_key = os.getenv("SCRAPINGBEE_API_KEY")

class ProxyWebScraper:
    def __init__(self, dest_url, api_key, verbose=False):
        self.verbose = verbose
        self.api_key = api_key
        self.dest_url = dest_url
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    async def send_request(self, session):
        try:
            async with session.get(
                url="https://app.scrapingbee.com/api/v1",
                params={
                    "url": self.dest_url,
                    "api_key": self.api_key,
                    "render_js": 'false',
                },
                headers=self.headers
            ) as response:
                response.raise_for_status()
                if self.verbose:
                    print(f'Response HTTP Status Code: {response.status}')
                    print(f'Response HTTP Response Body: {await response.text()}')
                return await response.text()
        except aiohttp.ClientError as e:
            print(f'HTTP Request failed: {e}')
            return None

    def get_links(self, response_text):
        if response_text is None:
            return []
        soup = BeautifulSoup(response_text, 'html.parser')
        return soup.find_all('a', href=True)

    def get_sub_links(self, links):
        domain_name = urllib.parse.urlsplit(self.dest_url).netloc
        return [link['href'] for link in links if domain_name in link['href']]
    
    def update_url(self, new_url):
        self.dest_url = new_url
        return None
    
def send_request():
    response = requests.get(
        url="https://app.scrapingbee.com/api/v1/usage",
        params={
            "api_key": "6D5M0NNTHGA1A7PU7O5T4I5N62BHYV2PH3T7NIFZD6YENV69N55KTPCCYJR5AAKDOPCIBNFIQERIZE75",
        },

    )
    # print('   Response HTTP Status Code: ', response.status_code)
    data_dict = json.loads(response.content)
    print(f'   API credits: {data_dict["used_api_credit"]}/{data_dict["max_api_credit"]}')
    print(f'   Concurrency: {data_dict["current_concurrency"]}/{data_dict["max_concurrency"]}')

async def main(concurrent_requests=4):
    counter = 0
    start_time = time.time()
    hits = 20
    connector = aiohttp.TCPConnector(limit=concurrent_requests)
    async with aiohttp.ClientSession(connector=connector) as session:
        for i in range(hits):
            scraper = ProxyWebScraper(dest_url="https://praxissolutions.com", api_key=scrapingbee_key, verbose=False)
            response_text = await scraper.send_request(session)
            links = scraper.get_links(response_text)
            sub_links = scraper.get_sub_links(links)
            counter += 1
            await asyncio.sleep(random.randint(2, 5))  # Sleep after the main request
            if np.random.choice([0, 1], p=[0.7, 0.3]) == 1:
                sub_link = random.choice(sub_links)
                full_url = urllib.parse.urljoin(scraper.dest_url, sub_link)
                scraper.update_url(full_url)
                await asyncio.sleep(random.randint(1, 3))  # Sleep before the sub-request
                sub_response_text = await scraper.send_request(session)
                counter += 1
            if i % 10 == 0:
                end_time = time.time()
                print(f"Round {i} -- {counter} credits used.\n   Time elapsed: {np.round((end_time-start_time)/60, 2)}.\n   Time remaining: {np.round((end_time-start_time)/60/counter*(hits-i), 2)}")
                send_request()

    print(f"{counter} total credits used.")
    print(f"Total Time elapsed: {np.round((time.time()-start_time)/60, 2)}")

# Run the main function
await main()


In [2]:
import time, os, json
import aiohttp
import asyncio
import urllib.parse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import numpy as np
import random
import requests

from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')

import asyncio
import nest_asyncio

nest_asyncio.apply()

scrapingbee_key = os.getenv("SCRAPINGBEE_API_KEY")

class ProxyWebScraper:
    def __init__(self, dest_url, api_key, verbose=False):
        self.verbose = verbose
        self.api_key = api_key
        self.dest_url = dest_url
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    async def send_request(self, session):
        try:
            async with session.get(
                url="https://app.scrapingbee.com/api/v1",
                params={
                    "url": self.dest_url,
                    "api_key": self.api_key,
                    "render_js": 'false',
                },
                headers=self.headers
            ) as response:
                response.raise_for_status()
                if self.verbose:
                    print(f'Response HTTP Status Code: {response.status}')
                    print(f'Response HTTP Response Body: {await response.text()}')
                return await response.text()
        except aiohttp.ClientError as e:
            print(f'HTTP Request failed: {e}')
            return None

    def get_links(self, response_text):
        if response_text is None:
            return []
        soup = BeautifulSoup(response_text, 'html.parser')
        return soup.find_all('a', href=True)

    def get_sub_links(self, links):
        domain_name = urllib.parse.urlsplit(self.dest_url).netloc
        return [link['href'] for link in links if domain_name in link['href']]
    
    def update_url(self, new_url):
        self.dest_url = new_url
        return None
    
def send_request(api_key, verbose=True):
    response = requests.get(
        url="https://app.scrapingbee.com/api/v1/usage",
        params={
            "api_key": api_key,
        },

    )
    # print('   Response HTTP Status Code: ', response.status_code)
    data_dict = json.loads(response.content)
    if verbose: 
        print(f'   API credits: {data_dict["used_api_credit"]}/{data_dict["max_api_credit"]}')
        print(f'   Concurrency: {data_dict["current_concurrency"]}/{data_dict["max_concurrency"]}')
        return None
    else:
        return data_dict

async def main(concurrent_requests=4, hits=20, print_every=10):
    counter = 0
    start_time = time.time()
    connector = aiohttp.TCPConnector(limit=concurrent_requests)
    async with aiohttp.ClientSession(connector=connector) as session:
        for i in range(hits):
            scraper = ProxyWebScraper(dest_url="https://praxissolutions.com", api_key=scrapingbee_key, verbose=False)
            response_text = await scraper.send_request(session)
            links = scraper.get_links(response_text)
            sub_links = scraper.get_sub_links(links)
            counter += 1
            del response_text
            await asyncio.sleep(random.randint(2, 5))  # Sleep after the main request
            if np.random.choice([0, 1], p=[0.7, 0.3]) == 1:
                while True:
                    sub_link = random.choice(sub_links)
                    full_url = urllib.parse.urljoin(scraper.dest_url, sub_link)
                    scraper.update_url(full_url)
                    await asyncio.sleep(random.randint(1, 3))  # Sleep before the sub-request
                    _ = await scraper.send_request(session)
                    counter += 1
                    
                    if np.random.choice([0, 1], p=[0.5, 0.5]) == 1:
                        break
            if i % print_every == 0:
                end_time = time.time()
                print(f"Round {i}\n   Time elapsed: {np.round((end_time-start_time)/60, 2)}.\n   Time remaining: {np.round((end_time-start_time)/60/counter*(hits-i), 2)}")
                send_request(scrapingbee_key)
                print()


if __name__ == "__main__":
    start_time = time.time()
    begin = send_request(scrapingbee_key, verbose=False)
    asyncio.run(main())
    fin = send_request(scrapingbee_key, verbose=False)
    print()
    print('--'*30)
    print('Ending stats:')
    print(f'  Concurrency: {fin["current_concurrency"]}/{fin["max_concurrency"]}')
    print(f'  API credits: {fin["used_api_credit"]}/{fin["max_api_credit"]}')
    print(f"  Total credits used: {fin['used_api_credit']-begin['used_api_credit']}")
    print(f"  Total Time elapsed: {np.round((time.time()-start_time)/60, 2)}")

Round 0
   Time elapsed: 0.16.
   Time remaining: 1.61
   API credits: 385/1000
   Concurrency: 0/5

Round 10
   Time elapsed: 1.23.
   Time remaining: 0.77
   API credits: 396/1000
   Concurrency: 0/5


------------------------------------------------------------
Ending stats:
  Concurrency: 0/5
  API credits: 409/1000
  Total credits used: 24


NameError: name 'start_time' is not defined