In [33]:
import ast
import json
import os
import re

import google.generativeai as genai
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [34]:
# Company landing pages to process; the commented-out entries are currently excluded.
urls = [
    "https://www.boeing.com",
    "https://www.goldmansachs.com",
    # "https://www.amazon.com",
    "https://corporate.exxonmobil.com",
    "https://www.hsbc.com",
    "https://www.sony.com",
    # "https://www.mcdonalds.com",
    "https://www.volkswagenag.com",
    "https://www.ibm.com",
    "https://www.unilever.com",
]

In [35]:
# Run python-dotenv's loader before reading the key from the environment.
load_dotenv()

# The Gemini key is read from the API_KEY environment variable (None if unset).
apikey = os.environ.get('API_KEY')
genai.configure(api_key=apikey)

# Shared model handle used by genai_model().
model = genai.GenerativeModel('gemini-1.5-flash')


In [None]:
# Generation settings passed along with every model.generate_content() call.
generation_config = {
    'temperature': 0.3,
    'max_output_tokens': 1024,
}

In [None]:
def genai_model(final_prompt):
    """Send ``final_prompt`` to the module-level ``model`` and return its response.

    The call is ``model.generate_content`` with the module-level
    ``generation_config`` applied; the response object is returned unchanged.
    """
    return model.generate_content(final_prompt, generation_config=generation_config)

In [None]:
# Questions to answer for each company. This text is interpolated verbatim into
# every prompt template below; the extraction prompts refer to the questions by
# position as 'task_0' ... 'task_5'.
tasks = ''' 
What is the company's mission statement or core values?
What products or services does the company offer?
When was the company founded, and who were the founders?
Where is the company's headquarters located?
Who are the key executives or leadership team members?
Has the company received any notable awards or recognitions?'''

# Prompt used to ask the model which links from a landing page are relevant to
# `tasks`. The crawl loop appends the list of hrefs in triple backticks.
routes_prompt =  f""" 
Assume you are a experienced data scientist and you have been given a task to filter routes of a companies website

i will give list of urls enclosed in triple backticks and you will have to filter the routes of the website based on the following tasks, don't give all urls, try to find by generalizing
{tasks}
filtered routes should only contain the routes that relevant to the specified tasks
output should be a list of filtered routes, dont give any other information
"""

# First version of the extraction prompt. It is not referenced in the visible
# cells (the crawl loop uses extraction_prompt_2).
# The doubled braces {{ }} are f-string escapes for literal { } in the output.
extraction_prompt = f"""
Assume you are an experienced data scientist tasked with extracting key information from a company's website. 
Below, I am providing the scraped data of the website along with the tasks that need to be completed. 

The tasks are:
{tasks}

Your job is to extract the required information from the scraped data and provide the output in the following format:
- A dictionary with keys 'task_0', 'task_1', ..., where each task's key corresponds to its index.
- If the information for a task is found, include the extracted information as the value.
- If the information for a task is not found, return 'not found' as the value.

Do not include any additional explanations or details.
Focus only on answering the questions specified in the tasks based on the provided scraped data.

The output should be in this format:
{{
    'task_0': 'extracted information',
    'task_1': 'extracted information',
    ...
}}
"""

# Stricter extraction prompt used by the crawl loop; the scraped page text is
# appended after it.
# NOTE(review): the example output uses single-quoted keys, which is not valid
# JSON, while clean_extracted_data() parses the reply with json.loads - confirm
# the model reliably answers with double-quoted JSON.
extraction_prompt_2 = f"""
Assume you are an experienced data scientist tasked with extracting key information from company websites. Your goal is to precisely extract information for these specific tasks:

{tasks}

Rules for Extraction:
- Use ONLY the information provided in the scraped data
- If information is not found, use the exact phrase "not found"
- Be literal and precise in extraction
- Do not add, interpret, or modify the extracted data
- Return results in a dictionary format with keys 'task_0' through 'task_5'
- Ensure each task's information is extracted verbatim from the source

Output Format:
{{
    'task_0': 'Mission statement or core values',
    'task_1': 'Products and services',
    'task_2': 'Founding year; founders',
    'task_3': 'Headquarters location',
    'task_4': 'Executives and leadership',
    'task_5': 'Awards and recognitions'
}}
"""

In [None]:
def scrape(url, timeout=30):
    """Download ``url`` and return its visible text as one whitespace-normalised string.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The page text with ``<script>`` and ``<style>`` content removed, tags
        stripped, and every run of whitespace collapsed to a single space.

    Raises:
        requests.RequestException: On connection failures, timeouts, or a
            4xx/5xx response (error pages are not returned as content).
    """
    # Browser-like headers. Accept-Encoding is intentionally not set here so
    # that requests only advertises encodings it is able to decode ('br'
    # requires an optional brotli package).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive'
    }

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        response = session.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        html = response.text

    soup = BeautifulSoup(html, 'html.parser')

    # Drop non-visible content through the parser instead of regexes, which
    # miss tags spanning several lines and leave CSS bodies behind.
    for element in soup(['script', 'style']):
        element.decompose()

    # A space separator keeps words from adjacent elements from fusing.
    page_text = soup.get_text(separator=' ')
    return re.sub(r'\s+', ' ', page_text).strip()


In [None]:
def clean_extracted_data(raw_text):
    """Parse a model reply into a dict with whitespace-normalised string values.

    The reply may be wrapped in Markdown code fences, surrounded by extra
    prose, or written as a Python-style dict with single-quoted keys (the
    format shown in the extraction prompts). Each form is tried in turn.

    Args:
        raw_text: The raw text returned by the model.

    Returns:
        A dict in which every string value has runs of whitespace collapsed
        to single spaces, or ``None`` when no dict can be recovered.
    """
    if not isinstance(raw_text, str):
        print("Failed to parse JSON")
        return None

    # Remove Markdown code-fence markers, with or without a language tag.
    cleaned_text = re.sub(r'```[A-Za-z]*', '', raw_text).strip()

    # Try the whole reply first, then the outermost {...} span inside it.
    candidates = [cleaned_text]
    brace_match = re.search(r'\{.*\}', cleaned_text, re.DOTALL)
    if brace_match and brace_match.group(0) != cleaned_text:
        candidates.append(brace_match.group(0))

    for candidate in candidates:
        # json.loads covers strict JSON; ast.literal_eval safely covers
        # single-quoted Python dict literals without executing any code.
        for parser in (json.loads, ast.literal_eval):
            try:
                data = parser(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(data, dict):
                return {
                    key: ' '.join(value.split()) if isinstance(value, str) else value
                    for key, value in data.items()
                }

    print("Failed to parse JSON")
    return None

# (CSV column, key in the parsed model reply) pairs, in output column order.
_TASK_COLUMNS = (
    ('Task_0_Mission', 'task_0'),
    ('Task_1_Products', 'task_1'),
    ('Task_2_Founding', 'task_2'),
    ('Task_3_Headquarters', 'task_3'),
    ('Task_4_Executives', 'task_4'),
    ('Task_5_Awards', 'task_5'),
)


def save_to_csv(results, output_file='company_data.csv'):
    """Parse each model reply in ``results`` and write one CSV row per company.

    Args:
        results: Iterable of dicts with a ``'url'`` entry and a ``'data'``
            entry holding the raw model reply text.
        output_file: Destination path of the CSV file.

    Entries whose reply is missing or cannot be parsed into a dict are
    skipped, with a message naming the URL. The CSV always contains the full
    header row, even when no entry could be parsed.
    """
    csv_data = []

    for result in results:
        source_url = result.get('url', 'Unknown')
        raw_reply = result.get('data')

        # Only hand text to the parser; a missing reply is treated as unparsable.
        cleaned_result = clean_extracted_data(raw_reply) if isinstance(raw_reply, str) else None

        if not isinstance(cleaned_result, dict) or not cleaned_result:
            print(f"Skipping {source_url}: no parsable data")
            continue

        row = {'URL': source_url}
        for column, task_key in _TASK_COLUMNS:
            row[column] = cleaned_result.get(task_key, 'Not Found')
        csv_data.append(row)

    # Explicit columns keep the header stable when csv_data is empty.
    columns = ['URL'] + [column for column, _ in _TASK_COLUMNS]
    df = pd.DataFrame(csv_data, columns=columns)
    df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Data saved to {output_file}")


In [None]:
LOG_PATH = "output.txt"   # run log: model replies and scraping errors are appended here
REQUEST_TIMEOUT = 30      # seconds to wait for a landing page before giving up


def _log(message):
    """Append ``message`` to the run log and echo it to the cell output."""
    # The context manager closes the file handle after every write.
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(message)
    print(message)


# One {'url': ..., 'data': <raw model reply>} entry per successfully processed site.
final_res = []

for url in urls:
    try:
        # Fetch the landing page; HTTP errors are routed to the handler below.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every non-empty href, de-duplicated in first-seen order so
        # the prompt sent to the model is the same on every run.
        hrefs = (link.get('href') for link in soup.find_all('a'))
        unique_routes = list(dict.fromkeys(href for href in hrefs if href))

        # Ask the model which routes are relevant to the tasks.
        model_input = f"{routes_prompt} ```{unique_routes}```"
        model_response = genai_model(model_input)

        # Pull single-quoted routes starting with '/' out of the reply and
        # drop repeats so no page is scraped twice.
        route_pattern = r"'/([^']+)'"
        extracted_routes = list(dict.fromkeys(re.findall(route_pattern, model_response.text)))

        page_texts = []
        for route in extracted_routes:
            try:
                page_texts.append(scrape(url + "/" + route))
            except Exception as e:
                # Best effort: one failing route must not abort the whole site.
                _log(f"Error scraping route {url}/{route}: {e}\n")

        # Join with a space so the last word of one page does not fuse with
        # the first word of the next.
        scraped_text = " ".join(page_texts)

        extraction_text = genai_model(f"{extraction_prompt_2} {scraped_text}").text
        _log(extraction_text)

        final_res.append({
            'url': url,
            'data': extraction_text
        })
    except Exception as e:
        _log(f"Error scraping URL {url}: {e}\n")


In [None]:
# Parse the model replies collected in final_res and write them to the default
# output file of save_to_csv ('company_data.csv').
save_to_csv(final_res)