### sirenGPT - Client Scalability

In [31]:
import functions_framework
from openai import OpenAI
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen, Request
from dotenv import load_dotenv
import threading
from datetime import datetime
import json
import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urljoin, urlparse

# Load settings via python-dotenv before any client is built
# (presumably from a local .env file holding the API keys - TODO confirm).
load_dotenv()
# Shared OpenAI client used by every generate_response* helper below.
client = OpenAI()

In [32]:
def generate_response(prompt):
    """Ask gpt-4-1106-preview for a JSON-object answer to ``prompt``.

    :param prompt: The user prompt; it is converted with ``str()`` before sending.
    :return: The content of the first choice returned by the chat completion.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant, expert in Analysing Companies."},
        {"role": "user", "content": str(prompt)},
    ]
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=conversation,
        temperature=0,
    )
    return completion.choices[0].message.content

def generate_response_feedback(initial_question, system_answer, feedback):
    """Re-ask gpt-4-1106-preview after feedback on a previous answer.

    The conversation is replayed as: system persona, the original user
    question, the model's earlier answer, then the user's feedback.

    :param initial_question: The original user prompt (converted with ``str()``).
    :param system_answer: The answer the model gave to ``initial_question``.
    :param feedback: The user's follow-up/correction to that answer.
    :return: The content of the first choice returned by the chat completion.
    """
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful assistant, expert in Analysing Companies."},
            {"role": "user", "content": str(initial_question)},
            # The earlier model output is an assistant turn; sending it as a
            # second "system" message would present it as an instruction.
            {"role": "assistant", "content": str(system_answer)},
            {"role": "user", "content": str(feedback)},
        ],
        temperature=0,
    )
    return response.choices[0].message.content

def generate_response_gpt3(prompt):
    """Ask gpt-3.5-turbo for a free-text answer to ``prompt``.

    The reply is capped at 300 tokens and no JSON response format is requested.

    :param prompt: The user prompt; it is converted with ``str()`` before sending.
    :return: The content of the first choice returned by the chat completion.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant, expert in Analysing Companies."},
        {"role": "user", "content": str(prompt)},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=300,
        temperature=0,
    )
    return completion.choices[0].message.content

def generate_response_gpt3_json(prompt):
    """Ask gpt-3.5-turbo-1106 for a JSON-object answer to ``prompt``.

    The reply is capped at 300 tokens.

    :param prompt: The user prompt; it is converted with ``str()`` before sending.
    :return: The content of the first choice returned by the chat completion.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant, expert in Analysing Companies."},
        {"role": "user", "content": str(prompt)},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
        messages=conversation,
        max_tokens=300,
        temperature=0,
    )
    return completion.choices[0].message.content

def retrieve_html(url, timeout=30):
    """Download ``url`` directly and return its visible text.

    The page is requested with a browser-like User-Agent header and the raw
    markup is handed to ``get_html`` for script/style removal and whitespace
    clean-up, so both helpers produce text in the same format.

    :param url: Absolute URL of the page to download.
    :param timeout: Seconds to wait on the connection before giving up.
    :return: The cleaned page text, one non-empty chunk per line.
    :raises urllib.error.URLError: If the page cannot be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=headers)
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(req, timeout=timeout) as page:
        html = page.read()
    return get_html(html)

def get_html(html):
    """Turn HTML markup into plain text.

    ``<script>`` and ``<style>`` elements are removed, then the remaining text
    is split on newlines and on double spaces; every non-empty, stripped chunk
    ends up on its own line.

    :param html: HTML markup to parse.
    :return: The cleaned text as a ``str``.
    """
    soup = BeautifulSoup(html, features="html.parser")

    # Drop elements whose content is code or styling rather than page text.
    for unwanted in soup.find_all(["script", "style"]):
        unwanted.extract()

    fragments = []
    for raw_line in soup.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return str('\n'.join(fragments))

In [33]:
def selenium_search(query):
    """Return the first link the Google Custom Search API gives for ``query``.

    The API key and search-engine id are read from the
    ``GOOGLE_SEARCH_API_KEY`` and ``SEARCH_ENGINE_ID`` environment variables.

    :param query: The search string.
    :return: The ``link`` of the first result, ``'No results found'`` when the
        reply has no ``items`` key, or ``'Error: <status>'`` for a non-200 reply.
    :raises KeyError: If either environment variable is missing.
    """
    api_key = os.environ['GOOGLE_SEARCH_API_KEY']
    engine_id = os.environ['SEARCH_ENGINE_ID']

    response = requests.get(
        'https://www.googleapis.com/customsearch/v1',
        params={'key': api_key, 'cx': engine_id, 'q': query},
    )

    if response.status_code != 200:
        return f'Error: {response.status_code}'

    results = response.json()
    if 'items' not in results:
        return 'No results found'
    return results['items'][0]['link']


In [34]:
# The two values the classifier is asked to use for the "legal" field.
prompt_legals = ["yes", "no"]
# Prompt prefix for the URL classifier: the page URL is appended right after
# the final line. The example answer uses "yes" so that it agrees with the
# requested format and with the value sirenGPT checks for.
prompt_legal = """ 

You will be given the url handle of a site.

You will need to classify if this site will direct you to the terms & conditions or legal notice of the company where we can potentially find the identification number of the SIRET/SIREN of such company.

The url paths of these site tend to be: 
/terms
/terms-of-service
/terms-and-conditions
/tos
/terms-of-use
/legal
/legal/terms
/privacy
/legal/terms-and-conditions
/user-agreement
/conditions
/terms-of-agreement

You will need to output your response in JSON format like this:

{
    "legal": "yes" or "no",
    "relevant_link": "www.example.com/path-to-page"
}

Here an example:

Input: https://evergrowth.com/privacy 

Output: 

{
    "legal": "yes",
    "relevant_link": "https://evergrowth.com/privacy"
}

If the page is not a legal or terms page, output "legal": "no".

Here the page to classify:
"""

In [35]:
# Prompt prefix for the extraction step: header_siret and the scraped page
# text are appended after it. The worked example's input SIREN matches its
# expected output (and the SIREN embedded in the TVA number).
prompt_siret = """

You are given the scrapped text from a legal/terms and conditions page.

You'll need to identify and output the SIREN or SIRET identification number of the business:

Here a description of what these identification numbers are -

SIREN (Système d'Identification du Répertoire des Entreprises): This is a 9-digit identification number assigned to an individual or entity when they first register their business. This number is unique to each business entity and remains unchanged throughout the life of the business, serving as a way to identify the business in administrative procedures and documents.

SIRET (Système d'Identification du Répertoire des Établissements): This number expands on the SIREN number by adding a 5-digit serial number, referred to as NIC (Numéro Interne de Classement), making it a 14-digit number in total. The SIRET number uniquely identifies each physical location (establishment) of a business. If a business operates in multiple locations, each one will have its own SIRET number, but the initial 9-digit SIREN part will be the same for all locations.  

Output your answer in JSON format. Here an example:

{
    "siren/siret": Siren/Siret Number
}

Example Input:

"mail : siege@energys-sas.fr

SIREN : 479 737 926

TVA intracommunautaire : FR55 479737926

Capital : 75000"

Example Output:

{
    "siren/siret": "479737926"
}

If no siren/siret is found in the output put out -> {"siren/siret": "Not found"}
"""

In [36]:
# Separator placed between prompt_siret and the scraped page text in sirenGPT.
header_siret = "Find here the scrapped text: "

In [37]:
def format_url(url):
    """Normalise a domain or URL to an ``https://`` URL.

    Rules, applied to the start of the string only (case-insensitively):

    * ``http://www.`` -> ``https://www.``
    * ``http://``     -> ``https://www.``
    * ``www.``        -> ``https://www.``
    * ``https://``    -> left as is
    * anything else   -> prefixed with ``https://www.``

    Surrounding whitespace is removed first. Only the leading scheme is
    rewritten, so an ``http://`` that appears later in the URL (for example in
    a query string) is left untouched.

    :param url: A bare domain or a full URL.
    :return: The normalised URL.
    """
    url = url.strip()
    lowered = url.lower()
    # Order matters: the more specific "http://www." must be tried first.
    rewrites = (
        ("http://www.", "https://www."),
        ("http://", "https://www."),
        ("www.", "https://www."),
        ("https://", "https://"),
    )
    for prefix, replacement in rewrites:
        if lowered.startswith(prefix):
            return replacement + url[len(prefix):]
    return "https://www." + url

In [38]:
def retieve_url(url, html=False, timeout=60):
    """Fetch ``url`` through the ZenRows scraping API.

    A first request is sent through a session that retries on 429/5xx
    responses. If that request raises or comes back with a non-success status,
    a second request is made with the premium proxy enabled and heavy
    resources (images, media, fonts) blocked.

    :param url: The page to scrape.
    :param html: When true, return the response body text instead of the
        response object.
    :param timeout: Seconds to wait for each request.
    :return: The response (or its text when ``html`` is true) of the last
        request that produced one, or ``None`` if every attempt raised.
    """
    apikey = os.getenv("ZEN_ROWS_SCRAPER")
    zenrows_api_base = "https://api.zenrows.com/v1/"
    response = None

    try:
        # The context manager releases the session's connections when done.
        with requests.Session() as requests_session:
            retries = Retry(
                total=3,
                backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504]
            )
            requests_session.mount("https://", HTTPAdapter(max_retries=retries))
            response = requests_session.get(zenrows_api_base, params={
                "apikey": apikey,
                "url": url,
            }, timeout=timeout)
    except Exception as e:
        print(e)

    # Fall back to the premium proxy on an error status as well as on an
    # exception; otherwise an error page would be returned as the result.
    if response is None or not response.ok:
        try:
            response = requests.get(zenrows_api_base, params={
                "apikey": apikey,
                "url": url,
                "block_resources": "image,media,font",
                "premium_proxy": "true"
            }, timeout=timeout)
        except Exception as e:
            # Keep whatever the first attempt returned, if anything.
            print(e)

    if response is None:
        return None
    return response.text if html else response

In [39]:
def extract_all_links(base_url):
    """Collect the same-host links found on the page at ``base_url``.

    The page is fetched with ``retieve_url``; every ``<a href>`` is resolved
    against ``base_url`` and kept when its host equals the host of
    ``base_url``. The base URL itself (without trailing slash) is always part
    of the result.

    :param base_url: The page whose links should be collected.
    :return: A set of absolute URLs, or an empty list if anything fails.
    """
    response = retieve_url(base_url)
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        links = {base_url.rstrip('/')}
        for anchor in soup.find_all('a'):
            if 'href' not in anchor.attrs:
                continue
            candidate = urljoin(base_url, anchor['href'])
            # Keep only links that stay on the same host as the base page.
            if urlparse(candidate).netloc != urlparse(base_url).netloc:
                continue
            links.add(candidate)
        return links
    except Exception:
        print("Failed to retrieve the main webpage.")
        return []

In [40]:
import requests
import xml.etree.ElementTree as ET

def _load_sitemap_root(sitemap_url, timeout):
    """Download ``sitemap_url`` and return the parsed XML root element."""
    response = requests.get(sitemap_url, timeout=timeout)
    response.raise_for_status()
    return ET.fromstring(response.content)


def _sitemap_locs(root, entry_tag):
    """Return the non-empty ``<loc>`` values of every ``<entry_tag>`` child of ``root``."""
    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    locs = []
    for entry in root.findall(f'sitemap:{entry_tag}', namespace):
        loc = entry.find('sitemap:loc', namespace)
        # Entries without a <loc> (or with an empty one) are skipped.
        if loc is not None and loc.text and loc.text.strip():
            locs.append(loc.text.strip())
    return locs


def fetch_sitemap(url, timeout=10):
    """
    Fetches ``<url>/sitemap.xml`` and extracts all page URLs from it.

    A sitemap index is followed one level down: every sitemap it lists is
    fetched with ``fetch_sitemap_pages`` and the results are concatenated.

    :param url: The base URL of the website.
    :param timeout: Seconds to wait for each HTTP request.
    :return: A ``(page_urls, ok)`` pair. ``ok`` is False when the sitemap could
        not be fetched or parsed (for an index: when none of the listed
        sitemaps could be), so the caller can fall back to another strategy.
    """
    sitemap_url = url.rstrip('/') + '/sitemap.xml'
    try:
        root = _load_sitemap_root(sitemap_url, timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch the sitemap: {e}")
        return [], False
    except ET.ParseError as e:
        print(f"Failed to parse the sitemap XML: {e}")
        return [], False

    if root.tag != '{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex':
        # Plain sitemap: the document is already parsed, no second request.
        return _sitemap_locs(root, 'url'), True

    sitemap_urls = []
    any_ok = False
    for child_sitemap in _sitemap_locs(root, 'sitemap'):
        child_pages, child_ok = fetch_sitemap_pages(child_sitemap, timeout=timeout)
        sitemap_urls.extend(child_pages)
        any_ok = any_ok or child_ok
    return sitemap_urls, any_ok


def fetch_sitemap_pages(sitemap_url, timeout=10):
    """
    Fetches a single sitemap and extracts the page URLs it lists.

    :param sitemap_url: The URL of the sitemap to parse.
    :param timeout: Seconds to wait for the HTTP request.
    :return: A ``(page_urls, ok)`` pair; ``ok`` is False when the sitemap could
        not be fetched or parsed, in which case ``page_urls`` is empty.
    """
    try:
        root = _load_sitemap_root(sitemap_url, timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch the sitemap page: {e}")
        return [], False
    except ET.ParseError as e:
        print(f"Failed to parse the sitemap XML: {e}")
        return [], False

    return _sitemap_locs(root, 'url'), True

# url = 'https://www.apolomarketing.net'
# pages = fetch_sitemap(url)
# print(pages)


In [41]:
# Shared pools: urls_executor runs the per-page URL classification calls.
urls_executor = ThreadPoolExecutor(12)
relevant_executor = ThreadPoolExecutor(12)


def _is_legal_answer(value):
    """Return True when the classifier's "legal" field is affirmative ("yes"/"true")."""
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in {"yes", "true"}


def sirenGPT(url):
    """Find the SIREN/SIRET number published on a company's website.

    Steps:

    1. List the site's pages from its sitemap, falling back to the links on
       the home page when the sitemap is unavailable.
    2. Ask the GPT-3.5 classifier, in parallel, which URLs look like
       legal/terms pages.
    3. Scrape those pages and ask GPT-4 to extract the identifier.

    :param url: Company domain or URL (normalised with ``format_url``).
    :return: The value of the ``"siren/siret"`` field of the model's answer
        (``"Not found"`` if the field is missing), or ``"Nothing found"`` when
        no legal page could be identified or scraped.
    """
    target = format_url(url)
    try:
        # fetch_sitemap is expected to yield a (pages, ok) pair; any other
        # shape or any failure falls back to crawling the home page.
        all_pages, base_bool = fetch_sitemap(target)
        if not base_bool:
            all_pages = extract_all_links(target)
    except Exception:
        all_pages = extract_all_links(target)

    print("Enriching...", url)

    futures_all_pages = [
        (page, urls_executor.submit(generate_response_gpt3_json, (prompt_legal + str(page))))
        for page in all_pages
    ]

    relevant = set()
    for page, future in futures_all_pages:
        # One failed call or malformed reply must not abort the whole site.
        try:
            verdict = json.loads(future.result())
        except Exception as e:
            print(f"Skipping {page}: {e}")
            continue
        if not isinstance(verdict, dict) or not _is_legal_answer(verdict.get("legal")):
            continue
        link = verdict.get("relevant_link")
        if not isinstance(link, str) or not link.strip():
            # The classifier omitted the link: use the URL it was asked about.
            link = str(page)
        relevant.add(link)

    print("Found", len(relevant), "relevant pages.")
    print(relevant)
    if not relevant:
        return "Nothing found"

    texts = []
    for page in relevant:
        html = retieve_url(format_url(page), html=True)
        if html:  # retieve_url may return None when the scrape failed
            texts.append(get_html(html))
    full_text = "".join(texts)
    if not full_text:
        # Nothing could be scraped, so there is nothing to extract from.
        return "Nothing found"

    response = generate_response(prompt_siret + header_siret + full_text)
    print(response)
    result_json = json.loads(response)
    return result_json.get("siren/siret", "Not found")

In [42]:
#sirenGPT("https://evergrowth.com")

In [43]:
import signal
import time

class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM timer expires."""


def timeout_handler(signum, frame):
    """SIGALRM handler: abort the running call by raising ``TimeoutException``."""
    raise TimeoutException


def run_function_with_timeout(url, timeout=60, func=None):
    """Run ``func(url)`` and give up after ``timeout`` seconds.

    The limit is enforced with ``signal.alarm``, so this only works where
    SIGALRM exists and when called from the main thread.

    :param url: Argument passed to ``func``.
    :param timeout: Whole number of seconds allowed for the call.
    :param func: Callable to run; defaults to ``sirenGPT``.
    :return: The callable's result, or ``None`` if it timed out or raised.
    """
    if func is None:
        func = sirenGPT

    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)

    try:
        start_time = time.time()

        result = func(url)

        elapsed_time = time.time() - start_time
        print(f"sirenGPT: Completed in {elapsed_time:.2f} seconds.")
        return result

    except TimeoutException:
        print('Result took too long to output.')
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    finally:
        # Always disarm the timer: a pending alarm left behind after an error
        # would otherwise fire later inside unrelated code.
        signal.alarm(0)
        print("\n")

# Cancel any alarm still pending from an earlier run of this cell.
signal.alarm(0)

0

In [44]:
#run_function_with_timeout("https://groupe-coreal.fr/")

#### To enrich fully

In [45]:
import pandas as pd
# Companies to enrich; the displayed frame has SIREN and COMPANY_DOMAIN columns.
df = pd.read_csv('test_enrichments/Untitled spreadsheet - Sheet2 (24).csv')
df

Unnamed: 0,SIREN,COMPANY_DOMAIN
0,484299854,3cdbagencement.fr
1,528215270,abak-ingenierie.com
2,329904254,abest.fr
3,509087672,aconstruct.fr
4,315814228,peintisol.net
...,...,...
218,329152763,var-amenagement-developpement.fr
219,63802276,vivian.fr
220,324847003,vldparis.fr
221,342645504,wiame-vrd.com


In [46]:
# Placeholder column that the enrichment run fills in, one value per company.
df['result'] = None

In [47]:
# for index, row in df.iterrows():
#     company_domain = row['COMPANY_DOMAIN']
#     result = run_function_with_timeout(company_domain)
#     df.at[index, 'result'] = result

# df.to_csv('siren_siret_results.csv', index=False)

In [48]:
import concurrent.futures
import pandas as pd

# Assuming df is your DataFrame and run_function_with_timeout is defined

def worker(row):
    """Enrich one DataFrame row.

    :param row: An ``(index, Series)`` pair as produced by ``df.iterrows()``.
    :return: ``(index, result)`` where ``result`` is what ``sirenGPT`` returned
        for the row's ``COMPANY_DOMAIN``, or ``None`` if it raised.
    """
    index, data = row
    company_domain = data['COMPANY_DOMAIN']
    try:
        result = sirenGPT(company_domain)
    except Exception as e:
        # Report the failure here; one bad site must not lose the row silently.
        print(f"An error occurred for {company_domain}: {e}")
        result = None
    return index, result

def update_results(future):
    """Store the ``(index, result)`` pair of a finished future in ``df``."""
    index, result = future.result()
    df.at[index, 'result'] = result

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submitting all jobs to the executor
    futures = [executor.submit(worker, row) for row in df.iterrows()]

    # Collect results in this thread as they complete, so the DataFrame is
    # only ever written from one thread (done-callbacks would run inside the
    # worker threads).
    for future in concurrent.futures.as_completed(futures):
        update_results(future)

# Save the updated DataFrame to CSV
df.to_csv('siren_siret_results.csv', index=False)


Failed to fetch the sitemap: HTTPSConnectionPool(host='www.peintisol.net', port=443): Max retries exceeded with url: /sitemap.xml (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x12f876a50>: Failed to establish a new connection: [Errno 61] Connection refused'))
Failed to parse the sitemap XML: not well-formed (invalid token): line 94, column 18
Enriching... abest.fr
Found 1 relevant pages.
{'https://www.abest.fr/en/legal'}
Enriching... aconstruct.fr
Found 1 relevant pages.
{'https://www.aconstruct.fr/fr/mentions-legales'}
Enriching... 3cdbagencement.fr
{
    "siren/siret": "Not found"
}
Found 0 relevant pages.
set()
{
    "siren/siret": "Not found"
}
Failed to fetch the sitemap: 404 Client Error: Not Found for url: https://www.agb-gle.com/sitemap.xml
Enriching... agb-gle.com
Found 0 relevant pages.
set()
Enriching... agtherm.com
Enriching... afeo.fr
Found 1 relevant pages.
{'https://www.agtherm.com/legal'}
Found 0 relevant pages.
set()
Failed to fetch the 

  soup = BeautifulSoup(response.text, 'html.parser')


Enriching... agi2d.fr
Found 0 relevant pages.
set()
Failed to fetch the sitemap: 403 Client Error: Forbidden for url: https://alcena.fr/sitemap.xml
{
    "siren/siret": "Not found"
}
Enriching... alcena.fr
Found 0 relevant pages.
set()
Enriching... ageo-construction.com
Enriching... aen-energies.io
Found 0 relevant pages.
set()
Enriching... abak-ingenierie.com
Found 0 relevant pages.
set()
Found 1 relevant pages.
{'https://www.abak-ingenierie.com/mentions-legales/'}
Enriching... altempo.com
Enriching... alphacontrole.com
Enriching... peintisol.net
Found 5 relevant pages.
{'https://www.altempo.com/documentation/', 'https://www.altempo.com/securisation-chantier/', 'https://www.altempo.com/temposynch/', 'https://www.altempo.com/logistique-chantier/', 'https://www.altempo.com/signaletique-chantier/'}
Found 0 relevant pages.
set()
Found 0 relevant pages.
set()
Enriching... alfacoustic.com
Found 0 relevant pages.
set()
Failed to parse the sitemap XML: undefined entity: line 15, column 0
{
  

In [49]:
# Reload the file written by the enrichment cell. The relative path matches
# the one used when saving, so the notebook does not depend on one user's
# home directory.
results = pd.read_csv('siren_siret_results.csv')
results

Unnamed: 0,SIREN,COMPANY_DOMAIN,result
0,484299854,3cdbagencement.fr,Nothing found
1,528215270,abak-ingenierie.com,Not found
2,329904254,abest.fr,Not found
3,509087672,aconstruct.fr,Not found
4,315814228,peintisol.net,Nothing found
...,...,...,...
218,329152763,var-amenagement-developpement.fr,32915276300028
219,63802276,vivian.fr,Nothing found
220,324847003,vldparis.fr,Nothing found
221,342645504,wiame-vrd.com,Not found


In [50]:
# Attempt to convert the 'result' column to numeric, coerce errors to NaN
# Entries that are not numbers (e.g. "Not found") become NaN after coercion.
numeric_results = pd.to_numeric(df['result'], errors='coerce')

# Each remaining non-NaN entry is a SIREN/SIRET-like numeric result.
is_numeric = numeric_results.notna()
num_numeric_entries = is_numeric.sum()

print(num_numeric_entries)


18
