# Comparison of cost and results for scrapers using AI

- https://www.youtube.com/watch?v=QxHE4af5BQE

- https://github.com/trancethehuman/ai-workshop-code/blob/main/Web_scraping_for_LLM_in_2024.ipynb

In [1]:
# Pricing pages to scrape. Every entry is a dict with a display "name" and the
# page "url"; the list is built from (name, url) pairs to keep the data compact.
competitor_sites = [
    dict(name=site_name, url=site_url)
    for site_name, site_url in (
        ("Articulate 360 by Adobe", "https://www.articulate.com/360/pricing/freelancers"),
        ("7taps", "https://www.7taps.com/pricing"),
        ("Mindsmith AI", "https://www.mindsmith.ai/pricing"),
        ("Cards-microlearning", "https://www.cards-microlearning.com/en/tarifs"),
    )
]

!pip install tiktoken --quiet


In [2]:
import tiktoken

def count_tokens(input_string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens tiktoken produces for *input_string*.

    Args:
        input_string: Text to tokenize. An empty string yields 0.
        encoding_name: Name of the tiktoken encoding to use. Defaults to
            "cl100k_base", the encoding this function has always used.
            NOTE(review): tiktoken maps gpt-4o to "o200k_base"; pass that
            name for gpt-4o-accurate counts - confirm it exists in the
            installed tiktoken version.

    Returns:
        The length of the encoded token sequence.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)

    # Scraped pages are arbitrary text and may literally contain strings such
    # as "<|endoftext|>". With tiktoken's default settings encode() raises
    # ValueError on those; disallowed_special=() makes it tokenize them as
    # ordinary text instead, so counting never fails on page content.
    tokens = tokenizer.encode(input_string, disallowed_special=())

    return len(tokens)

def calculate_cost(input_string: str, cost_per_million_tokens: float = 5) -> float:
    """Estimate what it costs to send *input_string* to a model as input.

    Args:
        input_string: Text whose tokens are counted with count_tokens.
        cost_per_million_tokens: Price charged per one million tokens
            (defaults to 5).

    Returns:
        Token count divided by one million, multiplied by the given price.
    """
    millions_of_tokens = count_tokens(input_string) / 1_000_000
    return millions_of_tokens * cost_per_million_tokens

# Example usage: estimate the input cost of one short prompt, using the
# default price of 5 per million tokens from calculate_cost.
input_string = "What's the difference between beer nuts and deer nuts? Beer nuts are about 5 dollars. Deer nuts are just under a buck."
cost = calculate_cost(input_string)
# Print six decimal places: a single sentence costs only a tiny fraction of a dollar.
print(f"The total cost for using gpt-4o is: $US {cost:.6f}")

The total cost for using gpt-4o is: $US 0.000135


pip install prettytable tqdm --quiet


In [3]:
from typing import List, Callable, Dict
from prettytable import PrettyTable, ALL
from tqdm import tqdm

def view_scraped_content(scrape_url_functions: List[Dict[str, Callable[[str], str]]], sites_list: List[Dict[str, str]], characters_to_display: int = 500, table_max_width: int = 50) -> List[Dict[str, str]]:
    """Run every scraper against every site, print comparison tables, return the content.

    Two PrettyTable tables are printed: one with the first
    ``characters_to_display`` characters each scraper returned per site, and
    one with the estimated cost (via calculate_cost) of the full content.

    Args:
        scrape_url_functions: Scraper descriptors. Each is a dict with a
            display "name" (str) and a "function" taking a URL and returning
            the scraped text.
        sites_list: Sites to scrape. Each is a dict with "name" and "url".
        characters_to_display: Maximum number of characters of each scraped
            result shown in the content table.
        table_max_width: Value assigned to ``max_width`` on both tables.

    Returns:
        One dict per site: ``{"provider": <site name>, "sites": [...]}`` where
        "sites" holds one ``{"name": <scraper name>, "content": <text>}`` per
        scraper. When a scraper (or the cost estimate) fails, "content" is the
        string ``"Error: <exception message>"`` instead of the scraped text.
    """
    content_table_headers = ["Site Name"] + [f"{func['name']} content" for func in scrape_url_functions]
    cost_table_headers = ["Site Name"] + [f"{func['name']} cost" for func in scrape_url_functions]

    content_table = PrettyTable()
    content_table.field_names = content_table_headers

    cost_table = PrettyTable()
    cost_table.field_names = cost_table_headers

    scraped_data = []

    for site in sites_list:
        content_row = [site['name']]
        cost_row = [site['name']]
        site_data = {"provider": site['name'], "sites": []}

        for scrape_function in scrape_url_functions:
            function_name = scrape_function['name']
            # The one-item tqdm loop exists only to show a progress line per
            # (site, scraper) pair while the scrape call is running.
            for _ in tqdm([site], desc=f"Processing site {site['name']} using {function_name}"):
                try:
                    content = scrape_function['function'](site['url'])
                    content_cell = content[:characters_to_display]
                    cost_cell = f"${calculate_cost(content):.6f}"
                except Exception as e:
                    # Best effort: one failing scraper must not abort the
                    # comparison, so the failure is reported in the tables.
                    content = content_cell = f"Error: {str(e)}"
                    cost_cell = "Error"

                # Append exactly one cell per table only after both values are
                # known. Appending inside the try would leave a stray content
                # cell behind when costing fails, and add_row would then
                # reject the over-long row.
                content_row.append(content_cell)
                cost_row.append(cost_cell)
                site_data["sites"].append({"name": function_name, "content": content})

        content_table.add_row(content_row)
        cost_table.add_row(cost_row)
        scraped_data.append(site_data)

    # Apply the same width limit and horizontal-rule setting to both tables.
    for table in (content_table, cost_table):
        table.max_width = table_max_width
        table.hrules = ALL

    print("Content Table:")
    print(content_table)

    print("\nCost Table:\nThis is how much it would cost to use gpt-4o to parse this content for extraction.")
    print(cost_table)

    return scraped_data

In [4]:
# Beautiful Soup utility functions
import requests
from bs4 import BeautifulSoup

def beautiful_soup_scrape_url(url: str, timeout: float = 30) -> str:
    """Download *url* and return its HTML after a round trip through Beautiful Soup.

    The HTTP status is not checked, so an error page (for example a
    "403 Forbidden" body) is returned as content rather than raised.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may wait indefinitely on an unresponsive host.

    Returns:
        The parsed document serialized back to a string.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return str(soup)

In [5]:
import requests
def scrape_jina_ai(url: str, timeout: float = 30) -> str:
    """Fetch *url* through the Jina AI reader endpoint and return the response text.

    The target URL is appended verbatim to ``https://r.jina.ai/``. The HTTP
    status is not checked, so an error response body is returned as content.

    Args:
        url: Address of the page to read.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout, requests may wait indefinitely on an unresponsive host.

    Returns:
        The body of the reader's response as text.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    response = requests.get("https://r.jina.ai/" + url, timeout=timeout)
    return response.text

pip install firecrawl-py --quiet


In [6]:
import firecrawl
import getpass
import sys, os
import requests
sys.path.append(os.path.abspath(os.path.join('..', 'secret')))
from secret_info import fire_crawl 

# API key for Firecrawl, taken from the local secret_info module.
FIRECRAWL_API_KEY = fire_crawl


def scrape_firecrawl(url: str):
    """Scrape *url* with Firecrawl and return the "markdown" entry of the result.

    A new FirecrawlApp client is created on every call, authenticated with
    FIRECRAWL_API_KEY.
    """
    client = firecrawl.FirecrawlApp(api_key=FIRECRAWL_API_KEY)
    scrape_result = client.scrape_url(url)
    return scrape_result["markdown"]

In [7]:

# Scrapers to compare: each entry pairs a display name with the function that
# takes a URL and returns the scraped text.
list_of_scraper_functions = [
    {"name": "Beautiful Soup", "function": beautiful_soup_scrape_url},
    {"name": "Firecrawl", "function": scrape_firecrawl},
    {"name": "Jina AI", "function": scrape_jina_ai},
]

# Scrape every competitor page with every scraper, showing up to 700 characters
# per result and capping the table width setting at 20.
all_content = view_scraped_content(
    list_of_scraper_functions,
    competitor_sites,
    characters_to_display=700,
    table_max_width=20,
)

Processing site Articulate 360 by Adobe using Beautiful Soup: 100%|██████████| 1/1 [00:00<00:00,  3.01it/s]
Processing site Articulate 360 by Adobe using Firecrawl: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
Processing site Articulate 360 by Adobe using Jina AI: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
Processing site 7taps using Beautiful Soup: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]
Processing site 7taps using Firecrawl: 100%|██████████| 1/1 [00:03<00:00,  3.14s/it]
Processing site 7taps using Jina AI: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]
Processing site Mindsmith AI using Beautiful Soup: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Processing site Mindsmith AI using Firecrawl: 100%|██████████| 1/1 [00:03<00:00,  3.57s/it]
Processing site Mindsmith AI using Jina AI: 100%|██████████| 1/1 [00:01<00:00,  1.52s/it]
Processing site Cards-microlearning using Beautiful Soup: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
Processing site Cards-microlearning using Firec

Content Table:
+----------------------+------------------------+----------------------+----------------------+
|      Site Name       | Beautiful Soup content |  Firecrawl content   |   Jina AI content    |
+----------------------+------------------------+----------------------+----------------------+
|  Articulate 360 by   |         <html>         |    [Skip to main     |  Title: Freelancer   |
|        Adobe         | <head><title>403 Forbi |  content](#content)  |     Pricing for      |
|                      |  dden</title></head>   |                      |   Articulate 360 -   |
|                      |         <body>         | [![Articulate](https | Everything You Need  |
|                      | <center><h1>403 Forbid | ://www.articulate.co | to Create E‑Learning |
|                      |   den</h1></center>    | m/wp-content/uploads |                      |
|                      | <hr/><center>nginx</ce | /2023/06/articulate- | URL Source: https:// |
|                      | 


