## Import Required Packages

In [1]:
import requests, time, json
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from openai import OpenAI
from urllib.parse import urlencode
import re
from typing_extensions import override
from openai import AssistantEventHandler
import concurrent.futures
import os
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
from collections import defaultdict
import json
import csv
from urllib.parse import urljoin, urlparse


## Setting OpenAI

In [5]:
# Headers to mimic a browser visit
headers = {
    'User-Agent': ''
}
# OpenAI API
client = OpenAI(api_key="")

# First, we create an EventHandler class to define
# how we want to handle the events in the response stream.
class EventHandler(AssistantEventHandler):
  """Print assistant stream events to stdout as they are received.

  Text output is echoed incrementally; for code-interpreter tool calls the
  submitted code and any "logs" outputs are echoed as well.
  """

  @override
  def on_text_created(self, text) -> None:
    # Start of a text message: emit the speaker prefix and stay on the line.
    print(f"\nassistant > ", end="", flush=True)

  @override
  def on_text_delta(self, delta, snapshot):
    # Append the newly received text fragment without a trailing newline.
    print(delta.value, end="", flush=True)

  def on_tool_call_created(self, tool_call):
    # Announce which kind of tool the assistant started calling.
    print(f"\nassistant > {tool_call.type}\n", flush=True)

  def on_tool_call_delta(self, delta, snapshot):
    # Only code-interpreter deltas are echoed; every other tool type is ignored.
    if delta.type != 'code_interpreter':
      return

    interpreter = delta.code_interpreter
    if interpreter.input:
      print(interpreter.input, end="", flush=True)

    if not interpreter.outputs:
      return

    print(f"\n\noutput >", flush=True)
    for item in interpreter.outputs:
      # Outputs of any type other than "logs" are skipped.
      if item.type == "logs":
        print(f"\n{item.logs}", flush=True)

## Read CSV File and Get Company Name and Website

In [None]:
def read_csv_file(csv_file, start, end):
    """Read a slice of rows from a CSV file into a ``{company name: website}`` dict.

    Data rows are numbered from 0 (the header row is not counted). Rows with
    index ``start`` through ``end`` are kept, both bounds inclusive.

    Args:
        csv_file: Path to a CSV file with ``Company name`` and ``Website`` columns.
        start: Index of the first data row to include.
        end: Index of the last data row to include, or ``None`` to read to
            the end of the file.

    Returns:
        A dict mapping each company name to its ``Website`` value. When a
        company name occurs more than once, the last row in the slice wins.
        On a read error a message is printed and the rows collected so far
        (possibly none) are returned.
    """
    csv_data = {}
    try:
        # newline='' hands line-ending handling to the csv module, so line
        # breaks inside quoted fields survive unchanged. utf-8-sig drops a
        # leading byte-order mark that would otherwise become part of the
        # first header name; files without a BOM are read exactly as before.
        with open(csv_file, 'r', encoding='utf-8-sig', newline='') as file:
            reader = csv.DictReader(file)
            for idx, row in enumerate(reader):
                if idx < start:
                    continue
                if end is not None and idx > end:
                    break
                csv_data[row['Company name']] = row['Website']
    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found.")
    except KeyError as e:
        print(f"Error reading CSV file: missing column {e}")
    except Exception as e:
        # Deliberately best-effort: report the problem and return what we have.
        print(f"Error reading CSV file: {e}")
    return csv_data

In [None]:
# Path of the CSV file that holds the company names and websites.
testingdata_buyer51_100 = "testingdata_buyer51_100.csv"
# Data-row window passed to read_csv_file (0-based, header excluded);
# read_csv_file keeps rows start..end with both bounds included.
start = 44
end = 51
csv_data_website = read_csv_file(testingdata_buyer51_100, start, end)
# Bare expression: shows the resulting dict as the notebook cell output.
csv_data_website

## General Website Scraping

### Sitemap Finding

In [None]:
# get_sitemap_url is the function to get the base sitemap URL
def get_sitemap_url(base_url):
    """Locate a sitemap for ``base_url`` and download it through the ScrapeOps proxy.

    Each entry of ``common_paths`` is tried first; if none yields content, the
    site's ``robots.txt`` is fetched and every ``Sitemap:`` line in it is tried
    in order.

    Args:
        base_url: Base URL of the website; trailing slashes are ignored.

    Returns:
        ``(sitemap_url, sitemap_content)`` for the first URL that returns
        non-empty content with HTTP 200, or ``(None, None)`` when nothing is
        found.
    """
    def fetch_sitemap(scrapeops_api_key, url):
        """Return the body of ``url`` fetched via the proxy, or None on any failure."""
        # Proxy settings
        proxy_params = {
            'api_key': scrapeops_api_key,
            'url': url,
            'render_js': True,
        }

        try:
            # A timeout keeps one unresponsive site from hanging the notebook;
            # it is generous because the proxy renders JavaScript first.
            response = requests.get(url='https://proxy.scrapeops.io/v1/',
                                    params=urlencode(proxy_params),
                                    headers={'User-Agent': ''},
                                    timeout=120)
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            return None
        if response.status_code == 200:
            return response.text
        return None

    scrapeops_api_key = ""  # API Key
    # Normalize the base URL
    base_url = base_url.rstrip('/')
    # Try common sitemap URLs
    # NOTE(review): this list is empty in the source, so the loop below never
    # runs; presumably it should hold paths such as 'sitemap.xml' - confirm.
    common_paths = []
    for path in common_paths:
        url = urljoin(base_url, path)
        sitemap_content = fetch_sitemap(scrapeops_api_key, url)
        if sitemap_content:
            print('Sitemap URL found:', url)
            return url, sitemap_content

    # Try parsing robots.txt for sitemap entries. The leading slash pins the
    # request to the site root: a relative 'robots.txt' would replace only the
    # last path segment of a base URL such as https://host/a/b.
    robots_url = urljoin(base_url, '/robots.txt')
    robots_content = fetch_sitemap(scrapeops_api_key, robots_url)
    if robots_content:
        for line in robots_content.splitlines():
            # Lines that contain the 'Sitemap:' directive
            if not line.strip().lower().startswith('sitemap:'):
                continue
            # Split on the first colon only; the URL itself contains colons.
            sitemap_path = line.split(':', 1)[1].strip()
            if not sitemap_path:
                continue
            url = urljoin(base_url, sitemap_path)
            sitemap_content = fetch_sitemap(scrapeops_api_key, url)
            if sitemap_content:
                print('Sitemap URL found via robots.txt:', url)
                return url, sitemap_content

    print('No sitemap found')
    return None, None

# find_additional_sitemaps is the function to find additional sitemaps under base sitemap
def find_additional_sitemaps(sitemap_content):
    """Return the child sitemap URLs listed in a sitemap index document.

    Args:
        sitemap_content: XML text (or bytes) of a sitemap, or ``None``/empty
            when no sitemap could be retrieved.

    Returns:
        A list with the stripped ``<loc>`` text of every ``<sitemap>`` element
        directly under the root, in document order. Entries without a usable
        ``<loc>`` are skipped. The list is empty when the content is missing
        or is not well-formed XML.
    """
    additional_sitemaps = []
    # None or empty content cannot be parsed; ET.fromstring would raise a
    # TypeError on None, which the ParseError handler below does not cover.
    if not sitemap_content:
        return additional_sitemaps

    try:
        root = ET.fromstring(sitemap_content)
    except ET.ParseError:
        print('Error parsing the sitemap content')
        return additional_sitemaps

    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    for sitemap in root.findall('sitemap:sitemap', namespace):
        loc = sitemap.find('sitemap:loc', namespace)
        # loc.text is None for an empty element and may carry surrounding
        # whitespace when the XML is pretty-printed.
        sitemap_url = (loc.text or '').strip() if loc is not None else ''
        if sitemap_url:
            additional_sitemaps.append(sitemap_url)

    return additional_sitemaps


In [None]:
# base_url is the website's base URL (left blank here; fill in before running)
base_url = ''

# get_sitemap_url returns (url, content), or (None, None) when nothing is found.
sitemap_url, sitemap_content = get_sitemap_url(base_url)
print(sitemap_url)
# List the child sitemaps referenced by the sitemap that was just downloaded.
additional_sitemaps = find_additional_sitemaps(sitemap_content)
print(additional_sitemaps)

### Extract Product URLs from Sitemap

In [None]:
# extract_product_urls is the function that extracts product URLs from the sitemap
def extract_product_urls(sitemap_url, scrapeops_api_key, pattern, max_urls):
    """Download a sitemap through the ScrapeOps proxy and return its matching URLs.

    Args:
        sitemap_url: URL of the sitemap XML to download.
        scrapeops_api_key: API key sent to the proxy.
        pattern: Substring a URL must contain to be kept; an empty string
            keeps every URL.
        max_urls: Threshold above which a warning about the list size is
            printed. The returned list is never truncated.

    Returns:
        The stripped ``<loc>`` text of every ``<url>`` entry containing
        ``pattern``, in document order. An empty list is returned (after
        printing a message) when the request fails, the status is not 200,
        or the body is not well-formed XML.
    """
    # Proxy settings
    proxy_params = {
        'api_key': scrapeops_api_key,
        'url': sitemap_url,
        'render_js': True,
    }

    try:
        # A timeout keeps an unresponsive site from hanging the notebook; it
        # is generous because the proxy renders JavaScript first.
        response = requests.get(url='https://proxy.scrapeops.io/v1/',
                                params=urlencode(proxy_params),
                                headers={'User-Agent': ''},
                                timeout=120)
    except requests.RequestException as e:
        print(f"Failed to retrieve the sitemap: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the sitemap, status code: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Error parsing the sitemap content: {e}")
        return []

    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Keep URLs that contain the pattern. Entries with a missing or empty
    # <loc> are skipped instead of raising on .text / the `in` test.
    product_urls = []
    for url in root.findall('sitemap:url', namespace):
        loc = url.find('sitemap:loc', namespace)
        loc_text = (loc.text or '').strip() if loc is not None else ''
        if loc_text and pattern in loc_text:
            product_urls.append(loc_text)

    # Report the size of the list and warn when it exceeds max_urls.
    print(len(product_urls))
    if len(product_urls) > max_urls:
        print("!!!WARNING!!! There are too many links in the list and might take a long time to process. Should consider separating into smaller lists.")
    else:
        print(f"There are {len(product_urls)} links in the scraping list.")
    return product_urls


In [None]:
# Information input for the function (all values are blank placeholders to fill in)
sitemap_url = ""  # Sitemap Link
company_name = ""  # Provide Company Name
scrapeops_api_key = ""  # API Key
pattern = ''  # URL pattern
max_urls= 500  # Maximum number of URLs

# Call extract_product_urls function to extract product URLs
# The empty list is replaced by the function's return value on the next line.
product_urls = []
product_urls = extract_product_urls(sitemap_url, scrapeops_api_key, pattern, max_urls)
# print(product_urls)

### Scrape Product Information from Product URLs - Single Threading

In [None]:
# collect_product_pages_single is the function that collects product pages from the list of product URLs
def collect_product_pages_single(product_urls, scrapeops_api_key, headers, client, is_seller):
    """Fetch each product URL in turn and summarise its page with an OpenAI assistant.

    Args:
        product_urls: Iterable of page URLs to process, in order.
        scrapeops_api_key: API key sent to the ScrapeOps proxy.
        headers: HTTP headers sent with each proxy request.
        client: OpenAI client used for the threads / messages / runs calls.
        is_seller: Selects the seller assistant when true, otherwise the
            buyer assistant.

    Returns:
        ``(results, final_tokens)``, two lists with exactly one entry per URL
        in the same order as ``product_urls``. ``results[i]`` is the
        assistant's final message text, or the JSON string
        ``{"product_name": "Pass"}`` when the page was skipped (request
        failed, non-200 status, or text longer than 256000 characters).
        ``final_tokens[i]`` holds the run's token usage, all zeros for
        skipped pages.
    """
    results = []
    final_tokens = []

    # Skipped pages are recorded as a JSON *string*, like every other result,
    # so the save step can parse all entries the same way.
    skipped = json.dumps({"product_name": "Pass"})
    no_usage = {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}

    # Assistant IDs are blank in this notebook; fill in the real IDs.
    seller_assistant_id = ""
    buyer_assistant_id = ""
    assistant_id = seller_assistant_id if is_seller else buyer_assistant_id

    for link in product_urls:
        proxy_params = {
            'api_key': scrapeops_api_key,
            'url': link,
            'render_js': True,
        }
        try:
            # Send a GET request through the proxy. The timeout stops one
            # stuck page from blocking the whole run.
            response = requests.get(url='https://proxy.scrapeops.io/v1/',
                                    params=urlencode(proxy_params),
                                    headers=headers,
                                    timeout=120)
        except requests.RequestException as e:
            print(f"RequestException for URL {link}: {e}")
            results.append(skipped)
            final_tokens.append(dict(no_usage))
            continue

        # Failed to retrieve the webpage
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage {link}, status code: {response.status_code}")
            results.append(skipped)
            final_tokens.append(dict(no_usage))
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        prod = str(soup.get_text(separator=' ', strip=True))

        # Pages with more than 256000 characters of text are not sent on.
        if len(prod) > 256000:
            results.append(skipped)
            final_tokens.append(dict(no_usage))
            continue

        print(link)
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=f"Please follow the instruction to check if the following content is product information or not and summarize the information into a JSON format with attributes (product_name, product_detail, and product_application) if it is a product information page: {prod}"
        )
        # Use the `stream` SDK helper with the `EventHandler` class to create
        # the Run and stream the response.
        with client.beta.threads.runs.stream(
                thread_id=thread.id,
                assistant_id=assistant_id,
                event_handler=EventHandler(),
        ) as stream:
            stream.until_done()

        # Retrieve the final message (first entry of the thread's message list)
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        results.append(messages.data[0].content[0].text.value)

        # Retrieve the token usage of the first run in the thread's run list
        runs = client.beta.threads.runs.list(
            thread_id=thread.id
        )
        usage = runs.data[0].usage
        final_tokens.append({"completion_tokens": usage.completion_tokens,
                             "prompt_tokens": usage.prompt_tokens,
                             "total_tokens": usage.total_tokens})

    return results, final_tokens

In [None]:
# True if the company is a seller, False if the company is a buyer
is_seller = False

# Call collect_product_pages_single function to collect product pages
# (processes the URLs one after another and returns results plus token usage)
results, final_tokens = collect_product_pages_single(product_urls, scrapeops_api_key, headers, client, is_seller)
# Print every collected result, one per line.
for result in results:
    print(result)

### Scrape Product Information from Product URLs - Multi-threading

In [None]:
# process_url is the function that fetches and summarizes one product URL, retrying when the request returns status code 429
def process_url(link, queue, scrapeops_api_key, headers, client, is_seller, final_tokens):
    """Fetch one product page and summarise it, retrying on rate limits and request errors.

    Exactly one result is put on ``queue`` per call unless an exception other
    than ``requests.RequestException`` escapes (for example from the OpenAI
    client); in that case nothing is queued and the exception propagates.

    Args:
        link: URL of the product page.
        queue: Queue that receives the result: the assistant's final message
            text, or a JSON string with ``product_name`` set to ``"Pass"``
            (page text too long) or ``"failed"`` (page not retrieved).
        scrapeops_api_key: API key sent to the ScrapeOps proxy.
        headers: HTTP headers sent with the proxy request.
        client: OpenAI client used for the threads / messages / runs calls.
        is_seller: Selects the seller assistant when true, otherwise the
            buyer assistant.
        final_tokens: List that receives the run's token usage dict on success.

    Returns:
        None.
    """
    proxy_params = {
        'api_key': scrapeops_api_key,
        'url': link,
        'render_js': True,
    }

    retries = 2  # total number of attempts
    backoff_factor = 5

    # Assistant IDs are blank in this notebook; fill in the real IDs.
    seller_assistant_id = ""
    buyer_assistant_id = ""
    assistant_id = seller_assistant_id if is_seller else buyer_assistant_id

    for i in range(retries):
        # Sleeping after the last attempt would only delay the failure report.
        is_last_attempt = i == retries - 1
        try:
            response = requests.get(url='https://proxy.scrapeops.io/v1/',
                                    params=urlencode(proxy_params),
                                    headers=headers,
                                    timeout=10)
        except requests.RequestException as e:
            print(f"RequestException for URL {link}: {e}")
            if not is_last_attempt:
                time.sleep(backoff_factor * (2 ** i) + random.uniform(1, 3))
            continue

        if response.status_code == 429:
            if is_last_attempt:
                print(f"Rate limit exceeded for {link}.")
            else:
                # Exponential backoff plus jitter before the next attempt.
                wait_time = backoff_factor * (2 ** i) + random.uniform(1, 3)
                print(f"Rate limit exceeded for {link}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage {link}, status code: {response.status_code}")
            queue.put(json.dumps({"product_name": "failed"}))
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        prod = str(soup.get_text(separator=' ', strip=True))
        # Pages with more than 256000 characters of text are not sent on.
        if len(prod) > 256000:
            queue.put(json.dumps({"product_name": "Pass"}))
            return

        print(link)
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=f"Please follow the instruction to check if the following content is product information or not and summarize the information into a JSON format with attributes (product_name, product_detail, and product_application) if it is a product information page: {prod}"
        )
        with client.beta.threads.runs.stream(
            thread_id=thread.id,
            assistant_id=assistant_id,
            event_handler=EventHandler(),
        ) as stream:
            stream.until_done()

        # Retrieve the final message (first entry of the thread's message list)
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        final_message = messages.data[0].content[0].text.value

        # Retrieve the token usage of the first run in the thread's run list
        runs = client.beta.threads.runs.list(
            thread_id=thread.id
        )
        usage = runs.data[0].usage
        token_usage = {"completion_tokens": usage.completion_tokens,
                       "prompt_tokens": usage.prompt_tokens,
                       "total_tokens": usage.total_tokens}

        # Publish only after both lookups succeeded, so an exception above
        # never leaves a message on the queue without its token usage.
        queue.put(final_message)
        final_tokens.append(token_usage)
        return

    print(f"Failed to retrieve the webpage {link} after {retries} attempts (rate limiting or request errors).")
    queue.put(json.dumps({"product_name": "failed"}))

# Number of workers (concurrent threads)
# Passed as max_workers to the ThreadPoolExecutor in collect_product_pages_multithreaded.
num_workers = 5

# collect_product_pages_multithreaded is the function to collect product pages using multi-threading
def collect_product_pages_multithreaded(product_urls, scrapeops_api_key, headers, client, num_workers, is_seller):
    """Process all product URLs concurrently with ``process_url``.

    Every task gets its own result queue and token list, so the output can be
    assembled in the order of ``product_urls`` no matter which page finishes
    first. Callers pair ``results[i]`` with ``product_urls[i]``, which is only
    correct when the two lists line up.

    Args:
        product_urls: Iterable of page URLs to process.
        scrapeops_api_key: API key forwarded to ``process_url``.
        headers: HTTP headers forwarded to ``process_url``.
        client: OpenAI client forwarded to ``process_url``.
        num_workers: Maximum number of worker threads.
        is_seller: Forwarded to ``process_url``.

    Returns:
        ``(results, final_tokens)``, two lists with exactly one entry per URL
        in input order. A task that raised, or produced no result, is
        recorded as the JSON string ``{"product_name": "Pass"}``; a task that
        reported no token usage gets an all-zero usage dict.
    """
    links = list(product_urls)
    task_queues = [Queue() for _ in links]
    task_tokens = [[] for _ in links]
    total_counter = 0

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_url = {
            executor.submit(process_url, link, task_queues[idx], scrapeops_api_key,
                            headers, client, is_seller, task_tokens[idx]): link
            for idx, link in enumerate(links)
        }
        for future in as_completed(future_to_url):
            total_counter += 1
            link = future_to_url[future]
            try:
                future.result()
            except Exception as e:
                # Boundary handler: one failing page must not abort the batch.
                # The fallback entry is filled in during assembly below.
                print(f"Exception occurred for URL {link}: {e}")

    print(f"Total number of tasks completed: {total_counter}")

    # Assemble the output in input order, one entry per URL.
    results = []
    final_tokens = []
    for task_queue, tokens in zip(task_queues, task_tokens):
        if task_queue.empty():
            results.append(json.dumps({"product_name": "Pass"}))
        else:
            results.append(task_queue.get())
        if tokens:
            final_tokens.append(tokens[0])
        else:
            final_tokens.append({"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0})

    return results, final_tokens

In [None]:
# True if the company is a seller, False if the company is a buyer
is_seller = False

# Call collect_product_pages_multithreaded function to collect product pages
# using num_workers concurrent threads
results, final_tokens = collect_product_pages_multithreaded(product_urls, scrapeops_api_key, headers, client, num_workers, is_seller)
# Print every collected result, one per line.
for result in results:
    print(result)

### Save to JSON File

In [None]:
# process_and_save_results function to process and save the results to file
def process_and_save_results(results, product_urls, company_name, is_seller, final_tokens):
    """Attach metadata to each parsed result and save the list as a JSON file.

    Args:
        results: One entry per scraped page: a JSON object encoded as a
            string, or an already-decoded dict. Entries that are not valid
            JSON objects are reported and skipped.
        product_urls: URLs matched to ``results`` by index.
        company_name: Stored on every record and used as the file name.
        is_seller: Write to ``Sellers/`` when true, otherwise to ``Buyers/``.
        final_tokens: Token usage dicts matched to ``results`` by index.
            Missing entries or keys are recorded as 0.

    Returns:
        None. The cleaned records are printed and written to
        ``Sellers/<company_name>.json`` or ``Buyers/<company_name>.json``.
    """
    clean = []
    token_keys = ("completion_tokens", "prompt_tokens", "total_tokens")

    for i, raw in enumerate(results):
        if isinstance(raw, dict):
            # Already decoded; copy so the caller's object is not modified.
            data = dict(raw)
        else:
            try:
                data = json.loads(raw)
            except (ValueError, TypeError):
                # TypeError covers entries that are neither str nor bytes.
                print(f"Invalid JSON string at index {i}: {raw}")
                continue
            if not isinstance(data, dict):
                # Valid JSON that is not an object cannot take the extra fields.
                print(f"Invalid JSON string at index {i}: {raw}")
                continue

        # Keep a record even when there are more results than URLs, rather
        # than losing the whole batch to an IndexError.
        data["product_link"] = product_urls[i] if i < len(product_urls) else None
        data["company_name"] = company_name

        usage = final_tokens[i] if i < len(final_tokens) else {}
        for key in token_keys:
            data[key] = usage.get(key, 0)

        clean.append(data)
    print(clean)

    folder = "Sellers" if is_seller else "Buyers"
    # Create the output folder on first use so open() cannot fail on it.
    os.makedirs(folder, exist_ok=True)
    file_name = f"{folder}/{company_name}.json"
    with open(file_name, "w", encoding="utf-8") as d:
        json.dump(clean, d)
    print(f"Products information saved to {file_name}")

In [None]:
# Call process_and_save_results function
# (writes the cleaned records to Sellers/<company_name>.json or Buyers/<company_name>.json)
process_and_save_results(results, product_urls, company_name, is_seller, final_tokens)

### Check JSON File Items

In [None]:
# Load the JSON data
# The folder mirrors the one process_and_save_results writes to for this is_seller value.
if is_seller:
    with open(f"Sellers/{company_name}.json", 'r') as file:
        data = json.load(file)
else:    
    with open(f"Buyers/{company_name}.json", 'r') as file:
        data = json.load(file)

# Count the number of items
# (length of the top-level JSON value that was loaded)
item_count = len(data)
print(f'The JSON file contains {item_count} items.')

### Verifying Step

In [None]:
import os
import json

def count_items_with_ingredients(data):
    """Count dicts that have an 'ingredients' key anywhere inside ``data``.

    Lists and dicts are walked recursively; any other value contributes
    nothing. A dict counts as an item when it has an 'ingredients' key, and
    additionally as an item with ingredients when that value is a non-empty
    list. The values of every dict, including the 'ingredients' value itself,
    are searched as well.

    Returns a tuple (total_items, items_with_ingredients).
    """
    if isinstance(data, list):
        children = data
        found = 0
        non_empty = 0
    elif isinstance(data, dict):
        children = data.values()
        found = 1 if 'ingredients' in data else 0
        non_empty = 0
        if found:
            listed = data['ingredients']
            if isinstance(listed, list) and len(listed) > 0:
                non_empty = 1
    else:
        # Scalars, strings, None, ... hold no nested items.
        return 0, 0

    # Add up the counts of everything nested below this container.
    for child in children:
        child_found, child_non_empty = count_items_with_ingredients(child)
        found += child_found
        non_empty += child_non_empty
    return found, non_empty

In [None]:
# Directory whose JSON files are checked (blank placeholder; set before running).
folder_path = ''

# Iterate over all JSON files in the specified folder
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                total_items, items_with_ingredients = count_items_with_ingredients(data)
                # Guard against division by zero when no dict has an 'ingredients' key.
                if total_items > 0:
                    percentage = (items_with_ingredients / total_items) * 100
                    print(f"{filename}: The percentage of items with non-empty ingredients list is {percentage:.2f}%")
                else:
                    print(f"{filename}: No items found.")
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {filename}: {e}")
            except Exception as e:
                # Report the failure and carry on with the next file.
                print(f"An error occurred while processing file {filename}: {e}")