# Link Parsing

## Link parsing small

In [None]:
# @title
# working with initial 10 links. successfully takes required links and save them in a file
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout  # Import the Timeout exception for links which takes too much time to load

# Function to categorize links on a webpage
def categorize_links(links):
    """Sort page links into fixed map/social slots followed by all other links.

    Args:
        links: Iterable of URL strings, or ``None`` when extraction produced
            no result.

    Returns:
        ``[]`` when ``links`` is ``None``. Otherwise a list whose first four
        entries are the Google Maps, Instagram, Facebook and Twitter links
        (``""`` for a slot with no match; the last matching link wins), then
        every remaining link in input order.
    """
    # Same guard as the later copies of this helper in this file: without it,
    # a None extraction result raises TypeError in the loop below.
    if links is None:
        return []

    # Substrings are tested in this order; the first one found decides the slot.
    markers = (
        "maps.google.com",
        "www.instagram.com",
        "www.facebook.com",
        "twitter.com",
    )
    slots = [""] * len(markers)
    other_links = []

    for link in links:
        for position, marker in enumerate(markers):
            if marker in link:
                slots[position] = link
                break
        else:
            other_links.append(link)

    return slots + other_links


def extract_links_with_error_handling(url, index):
    """Download ``url`` and return the unique absolute http(s) links on the page.

    Timeouts and connection errors are retried (three attempts in total, two
    seconds apart). Any other failure gives up immediately.

    Args:
        url: Address of the page to download.
        index: Identifier of the CSV row the URL came from; used only in the
            printed messages.

    Returns:
        A de-duplicated list (order not preserved) of ``href`` values that
        start with ``http://`` or ``https://``; ``[]`` if the page could not
        be fetched or parsed.
    """
    retries = 3  # Total number of attempts before giving up
    retry_delay = 2  # Delay between attempts in seconds
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=10)  # Set a timeout of 10 seconds
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))  # Remove duplicates via a set
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
        except requests.exceptions.ConnectionError:
            # The builtin ConnectionError is unrelated to the exception that
            # requests raises, so the requests class must be named explicitly.
            print(f"Connection error occurred for index {index} and URL {url}. Retrying...")
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

        # Only the retryable failures reach this point; skip the pause after
        # the final attempt.
        if attempt < retries:
            time.sleep(retry_delay)

    # Always return a list so callers never have to handle None.
    print(f"Giving up on index {index} and URL {url} after {retries} attempts.")
    return []

# Maps each URL from the input CSV to the list of links found on that page
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile_small.csv'  # Change this to your CSV file name
# newline='' leaves line-ending handling to the csv module, as its docs require
with open(csv_filename, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    for row in reader:
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns

        # The row identifier is in the first column and the URL in the second
        link_index, link = row[0], row[1]
        if link == '':
            continue
        print('going for ', link_index, ' : ', link)
        # The original called the undefined name extract_links (NameError).
        # "or []" turns a None result into an empty list so that
        # categorize_links below always receives something iterable.
        link_data[link] = extract_links_with_error_handling(link, link_index) or []

        # Pause one second before processing the next link
        time.sleep(1)  # You can adjust the sleep duration as needed

# Categorize links after extracting all links
for key in link_data:
    link_data[key] = categorize_links(link_data[key])

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        # One row per source URL: the URL followed by its categorized links
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


## Link Parsing large

In [None]:
# @title
# works with 50 links without issues. also handles most exceptions
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout, RequestException  # Import the Timeout exception for links which takes too much time to load

# Function to categorize links on a webpage
def categorize_links(links):
    """Sort page links into fixed map/social slots followed by all other links.

    Args:
        links: Iterable of URL strings, or ``None``.

    Returns:
        ``[]`` when ``links`` is ``None``. Otherwise a list whose first four
        entries are the Google Maps, Instagram, Facebook and Twitter links
        (``""`` for a slot with no match; the last matching link wins), then
        every remaining link in input order.
    """
    if links is None:
        return []

    # Substrings are tested in this order; the first one found decides the slot.
    markers = (
        "maps.google.com",
        "www.instagram.com",
        "www.facebook.com",
        "twitter.com",
    )
    slots = [""] * len(markers)
    uncategorized = []

    for url in links:
        for position, marker in enumerate(markers):
            if marker in url:
                slots[position] = url
                break
        else:
            uncategorized.append(url)

    return slots + uncategorized


def extract_links_with_error_handling(url, index):
    """Download ``url`` and return the unique absolute http(s) links on the page.

    A timeout is retried (three attempts in total, two seconds apart). Any
    other request or parsing failure gives up immediately.

    Args:
        url: Address of the page to download.
        index: Identifier of the CSV row the URL came from; used only in the
            printed messages.

    Returns:
        A de-duplicated list (order not preserved) of ``href`` values that
        start with ``http://`` or ``https://``; ``[]`` if the page could not
        be fetched or parsed.
    """
    retries = 3  # Total number of attempts before giving up
    retry_delay = 2  # Delay between attempts in seconds
    for attempt in range(1, retries + 1):
        try:
            # (connect timeout, read timeout) in seconds
            response = requests.get(url, timeout=(5, 5))
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))  # Remove duplicates via a set
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
        except RequestException as e:
            print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
            return []  # Not retryable: move on to the next link
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

        # Only a timeout reaches this point; skip the pause after the final attempt.
        if attempt < retries:
            time.sleep(retry_delay)

    # Always return a list, matching the other failure paths, instead of
    # falling off the end of the function with None.
    print(f"Giving up on index {index} and URL {url} after {retries} attempts.")
    return []

# Maps each URL from the input CSV to the list of links found on that page
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile.csv'  # Change this to your CSV file name
# newline='' leaves line-ending handling to the csv module, as its docs require
with open(csv_filename, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    for row in reader:
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns

        # The row identifier is in the first column and the URL in the second
        link_index, link = row[0], row[1]
        if link == '':
            continue
        print('going for ', link_index, ' : ', link)
        # "or []" turns a None result into an empty list, so every output row
        # gets the same four placeholder columns from categorize_links.
        link_data[link] = extract_links_with_error_handling(link, link_index) or []

        # Pause one second before processing the next link
        time.sleep(1)  # You can adjust the sleep duration as needed

# Categorize links after extracting all links
for key in link_data:
    link_data[key] = categorize_links(link_data[key])

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data_full.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        # One row per source URL: the URL followed by its categorized links
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


## links -> deep traversals

In [None]:
# @title
# works with 50 links without issues. also handles most exceptions
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout, RequestException  # Import the Timeout exception for links which takes too much time to load

# Function to categorize links on a webpage
def categorize_links(links):
    """Sort page links into fixed map/social slots followed by all other links.

    Args:
        links: Iterable of URL strings, or ``None``.

    Returns:
        ``[]`` when ``links`` is ``None``. Otherwise a list whose first four
        entries are the Google Maps, Instagram, Facebook and Twitter links
        (``""`` for a slot with no match; the last matching link wins), then
        every remaining link in input order.
    """
    if links is None:
        return []

    # Substrings are tested in this order; the first one found decides the slot.
    markers = (
        "maps.google.com",
        "www.instagram.com",
        "www.facebook.com",
        "twitter.com",
    )
    slots = [""] * len(markers)
    uncategorized = []

    for url in links:
        for position, marker in enumerate(markers):
            if marker in url:
                slots[position] = url
                break
        else:
            uncategorized.append(url)

    return slots + uncategorized


def extract_links_with_error_handling(url, index):
    """Download ``url`` and return the unique absolute http(s) links on the page.

    A timeout is retried (three attempts in total, two seconds apart). Any
    other request or parsing failure gives up immediately.

    Args:
        url: Address of the page to download.
        index: Identifier of the CSV row the URL came from; used only in the
            printed messages.

    Returns:
        A de-duplicated list (order not preserved) of ``href`` values that
        start with ``http://`` or ``https://``; ``[]`` if the page could not
        be fetched or parsed.
    """
    retries = 3  # Total number of attempts before giving up
    retry_delay = 2  # Delay between attempts in seconds
    for attempt in range(1, retries + 1):
        try:
            # (connect timeout, read timeout) in seconds
            response = requests.get(url, timeout=(5, 5))
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))  # Remove duplicates via a set
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
        except RequestException as e:
            print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
            return []  # Not retryable: move on to the next link
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

        # Only a timeout reaches this point; skip the pause after the final attempt.
        if attempt < retries:
            time.sleep(retry_delay)

    # Always return a list, matching the other failure paths, instead of
    # falling off the end of the function with None.
    print(f"Giving up on index {index} and URL {url} after {retries} attempts.")
    return []

# Maps each URL from the input CSV to the list of links found on that page
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile.csv'  # Change this to your CSV file name
# newline='' leaves line-ending handling to the csv module, as its docs require
with open(csv_filename, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    for row in reader:
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns

        # The row identifier is in the first column and the URL in the second
        link_index, link = row[0], row[1]
        if link == '':
            continue
        print('going for ', link_index, ' : ', link)
        # "or []" turns a None result into an empty list, so every output row
        # gets the same four placeholder columns from categorize_links.
        link_data[link] = extract_links_with_error_handling(link, link_index) or []

        # Pause one second before processing the next link
        time.sleep(1)  # You can adjust the sleep duration as needed

# Categorize links after extracting all links
for key in link_data:
    link_data[key] = categorize_links(link_data[key])

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data_full.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        # One row per source URL: the URL followed by its categorized links
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


# regex codes

## trials

In [None]:
import re
from bs4 import BeautifulSoup

# Sample HTML page used as the input for the phone-number search in this cell.
# Replace it with the HTML of a real page to run the same search on other markup.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Matches numbers such as "+1 (123) 123-1223": an optional country code, a
# parenthesised three-digit area code, then three and four digits.
phone_pattern = r'(\+\d{1,2}\s?)?\(\d{3}\)\s?\d{3}[-\s]\d{4}'

# Text nodes that contain a phone number. "string" replaces the "text"
# keyword, which Beautiful Soup renamed in 4.4.0.
phone_number_elements = soup.find_all(string=re.compile(phone_pattern))

for element in phone_number_elements:
    # Nearest ancestor that carries a class attribute (None when there is none)
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None

    # strip() always returns a string, so no None check is needed afterwards
    phone_number = element.strip()

    print(f"Phone Number: {phone_number}")
    print(f"Class Attribute: {element_class}")


In [None]:
import re
from bs4 import BeautifulSoup

# Sample HTML page (contact details, an address, social links and opening
# hours) used as the input parsed in this cell. Replace it with the HTML of a
# real page to run the same searches on other markup.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Complex Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <div class="contact-info">
            <p class="phone">Phone: +1 (123) 123-1223</p>
            <p class="email">Email: info@example.com</p>
        </div>
        <div class="address">
            <p>Main Office:</p>
            <p class="street">123 Main St</p>
            <p class="city">Cityville</p>
            <p class="country">Countryland</p>
        </div>
    </div>
    <div class="content">
        <h2>About Us</h2>
        <p class="description">We are a company specializing in...</p>
        <div class="links">
            <a href="https://www.facebook.com" class="social-link">Facebook</a>
            <a href="https://www.twitter.com" class="social-link">Twitter</a>
            <a href="https://www.instagram.com" class="social-link">Instagram</a>
        </div>
    </div>
    <footer class="footer" id="page-footer">
        <div class="opening-hours">
            <p class="day">Monday - Friday</p>
            <p class="hours">9 AM - 6 PM</p>
        </div>
        <div class="copyright">
            <p>&copy; 2023 Sample Company</p>
        </div>
    </footer>
</body>
</html>
"""

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Define regex patterns for phone numbers, email addresses, timings, and addresses
phone_pattern = r'(\+\d{1,2}\s?)?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
# The TLD class was "[A-Z|a-z]{2,7}": inside a character class "|" is a literal
# pipe, and the upper bound of 7 rejected longer top-level domains.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
timings_pattern = r'(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day)? [0-9]+(?: AM| PM)? - [0-9]+(?: AM| PM)?'
address_pattern = r'\b\d+\s[A-Za-z\s,]+\b'
# Keyword patterns; defined here but not used by the searches in this cell
catering_pattern = re.compile(r'\bcatering\b', re.IGNORECASE)
group_order_pattern = re.compile(r'\bgroup\sorder\b', re.IGNORECASE)
counter_pattern = re.compile(r'\bcounter\b', re.IGNORECASE)
delivery_pattern = re.compile(r'\bdelivery\b', re.IGNORECASE)
bar_pattern = re.compile(r'\bbar\b', re.IGNORECASE)
pickup_pattern = re.compile(r'\bpick\s*-\s*up\b', re.IGNORECASE)


# Text nodes matching each pattern. "string" replaces the "text" keyword,
# which Beautiful Soup renamed in 4.4.0.
phone_number_elements = soup.find_all(string=re.compile(phone_pattern))
email_elements = soup.find_all(string=re.compile(email_pattern))
timings_elements = soup.find_all(string=re.compile(timings_pattern))
address_elements = soup.find_all(string=re.compile(address_pattern))

# Report every match together with the class of its nearest class-bearing
# ancestor. One loop replaces four copies that differed only in the label.
for label, elements in (
    ("Phone Number", phone_number_elements),
    ("Email Address", email_elements),
    ("Timings", timings_elements),
    ("Address", address_elements),
):
    for element in elements:
        # Nearest ancestor that carries a class attribute (None when there is none)
        parent_element = element.find_parent(attrs={"class": True})
        element_class = parent_element['class'] if parent_element else None

        # strip() always returns a string, so no None check is needed
        print(f"{label}: {element.strip()}")
        print(f"Class Attribute: {element_class}")

# Link traversal and web crawling with threading

## same as below, without functions

In [None]:
import re
from bs4 import BeautifulSoup

# Sample HTML page (contact details, an address, social links and opening
# hours) used as the input parsed in this cell. Replace it with the HTML of a
# real page to run the same searches on other markup.
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Complex Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <div class="contact-info">
            <p class="phone">Phone: +1 (123) 123-1223</p>
            <p class="email">Email: info@example.com</p>
        </div>
        <div class="address">
            <p>Main Office:</p>
            <p class="street">123 Main St</p>
            <p class="city">Cityville</p>
            <p class="country">Countryland</p>
        </div>
    </div>
    <div class="content">
        <h2>About Us</h2>
        <p class="description">We are a company specializing in...</p>
        <div class="links">
            <a href="https://www.facebook.com" class="social-link">Facebook</a>
            <a href="https://www.twitter.com" class="social-link">Twitter</a>
            <a href="https://www.instagram.com" class="social-link">Instagram</a>
        </div>
    </div>
    <footer class="footer" id="page-footer">
        <div class="opening-hours">
            <p class="day">Monday - Friday</p>
            <p class="hours">9 AM - 6 PM</p>
        </div>
        <div class="copyright">
            <p>&copy; 2023 Sample Company</p>
        </div>
    </footer>
</body>
</html>
"""

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Define regex patterns for phone numbers, email addresses, timings, and addresses
phone_pattern = r'(\+\d{1,2}\s?)?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
# The TLD class was "[A-Z|a-z]{2,7}": inside a character class "|" is a literal
# pipe, and the upper bound of 7 rejected longer top-level domains.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
timings_pattern = r'(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day)? [0-9]+(?: AM| PM)? - [0-9]+(?: AM| PM)?'
address_pattern = r'\b\d+\s[A-Za-z\s,]+\b'
# Keyword patterns; defined here but not used by the searches in this cell
catering_pattern = re.compile(r'\bcatering\b', re.IGNORECASE)
group_order_pattern = re.compile(r'\bgroup\sorder\b', re.IGNORECASE)
counter_pattern = re.compile(r'\bcounter\b', re.IGNORECASE)
delivery_pattern = re.compile(r'\bdelivery\b', re.IGNORECASE)
bar_pattern = re.compile(r'\bbar\b', re.IGNORECASE)
pickup_pattern = re.compile(r'\bpick\s*-\s*up\b', re.IGNORECASE)


# Text nodes matching each pattern. "string" replaces the "text" keyword,
# which Beautiful Soup renamed in 4.4.0.
phone_number_elements = soup.find_all(string=re.compile(phone_pattern))
email_elements = soup.find_all(string=re.compile(email_pattern))
timings_elements = soup.find_all(string=re.compile(timings_pattern))
address_elements = soup.find_all(string=re.compile(address_pattern))

# Report every match together with the class of its nearest class-bearing
# ancestor. One loop replaces four copies that differed only in the label.
for label, elements in (
    ("Phone Number", phone_number_elements),
    ("Email Address", email_elements),
    ("Timings", timings_elements),
    ("Address", address_elements),
):
    for element in elements:
        # Nearest ancestor that carries a class attribute (None when there is none)
        parent_element = element.find_parent(attrs={"class": True})
        element_class = parent_element['class'] if parent_element else None

        # strip() always returns a string, so no None check is needed
        print(f"{label}: {element.strip()}")
        print(f"Class Attribute: {element_class}")

In [None]:
# @title
#  links traverse and web crawling with threading
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout, RequestException
import threading

# Function to categorize links on a webpage
def categorize_links(links):
    """Sort page links into fixed map/social slots followed by all other links.

    Args:
        links: Iterable of URL strings, or ``None``.

    Returns:
        ``[]`` when ``links`` is ``None``. Otherwise a list whose first four
        entries are the Google Maps, Instagram, Facebook and Twitter links
        (``""`` for a slot with no match; the last matching link wins), then
        every remaining link in input order.
    """
    if links is None:
        return []

    # Substrings are tested in this order; the first one found decides the slot.
    markers = (
        "maps.google.com",
        "www.instagram.com",
        "www.facebook.com",
        "twitter.com",
    )
    slots = [""] * len(markers)
    uncategorized = []

    for url in links:
        for position, marker in enumerate(markers):
            if marker in url:
                slots[position] = url
                break
        else:
            uncategorized.append(url)

    return slots + uncategorized

def extract_links_with_error_handling(url, index):
    """Download ``url`` and return the unique absolute http(s) links on the page.

    A timeout is retried (three attempts in total, two seconds apart). Any
    other request or parsing failure gives up immediately.

    Args:
        url: Address of the page to download.
        index: Identifier of the CSV row the URL came from; used only in the
            printed messages.

    Returns:
        A de-duplicated list (order not preserved) of ``href`` values that
        start with ``http://`` or ``https://``; ``[]`` if the page could not
        be fetched or parsed.
    """
    retries = 3  # Total number of attempts before giving up
    retry_delay = 2  # Delay between attempts in seconds
    for attempt in range(1, retries + 1):
        try:
            # (connect timeout, read timeout) in seconds
            response = requests.get(url, timeout=(5, 5))
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))  # Remove duplicates via a set
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
        except RequestException as e:
            print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
            return []  # Not retryable: move on to the next link
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

        # Only a timeout reaches this point; skip the pause after the final attempt.
        if attempt < retries:
            time.sleep(retry_delay)

    # Always return a list, matching the other failure paths, instead of
    # falling off the end of the function with None.
    print(f"Giving up on index {index} and URL {url} after {retries} attempts.")
    return []

# Function for multithreading
def process_url(index, link):
    """Thread worker: fetch the links on one page and record them in ``link_data``.

    Args:
        index: Row identifier of the URL; used for the progress message and
            passed on to the extractor.
        link: URL of the page to crawl; also the key written to the
            module-level ``link_data`` dictionary.
    """
    print('going for ', index, ' : ', link)
    found_links = extract_links_with_error_handling(link, index)
    link_data[link] = found_links
    # One-second pause after the download, before this worker finishes
    time.sleep(1)

from concurrent.futures import ThreadPoolExecutor

# Initialize an empty dictionary to store the data (filled by process_url)
link_data = {}

# Upper bound on simultaneous downloads. The original started one thread per
# CSV row, which is unbounded for a large input file.
max_workers = 16

# Read the CSV file containing links
csv_filename = 'outfile.csv'  # Change this to your CSV file name
jobs = []
# newline='' leaves line-ending handling to the csv module, as its docs require.
# The file is only needed while collecting jobs, so it is closed before crawling.
with open(csv_filename, 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if len(row) < 2:
            continue

        # The row identifier is in the first column and the URL in the second
        link_index, link = row[0], row[1]
        if link == '':
            continue

        jobs.append((link_index, link))

# Crawl the pages on a bounded pool of worker threads; leaving the "with"
# block waits for every job to finish.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(process_url, link_index, link)
        for link_index, link in jobs
    ]
    for future in futures:
        # exception() blocks until the job is done; report failures instead
        # of letting them disappear inside the future.
        error = future.exception()
        if error is not None:
            print(f"Worker failed: {error!r}")

# Categorize links after extracting all links
for key in link_data:
    link_data[key] = categorize_links(link_data[key])

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data_full.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        # One row per source URL: the URL followed by its categorized links
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


## dummy code for all below codes

In [None]:
from bs4 import BeautifulSoup
import re

# Sample HTML page parsed into `soup` further down in this cell. Besides the
# contact block it contains a Google Maps link whose <a> tag carries the class
# "site-location__address" and wraps the street address in two <span> elements.
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <div>
    <a href="https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&amp;ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22" class="site-location__address" target="_blank" rel="noopener" data-bb-track="button" data-bb-track-on="click" data-bb-track-category="Address" data-bb-track-action="Click" data-bb-track-label="Header">
    <span>3583 16th St,</span>
    <span> San Francisco, CA 94114</span></a>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

# Regex patterns for the data this notebook extracts from a page
phone_pattern = r'(\+\d{1,2}\s?)?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
# The TLD class was "[A-Z|a-z]{2,7}": inside a character class "|" is a literal
# pipe, and the upper bound of 7 rejected longer top-level domains.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
timings_pattern = r'(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day)? [0-9]+(?: AM| PM)? - [0-9]+(?: AM| PM)?'
# Earlier street-address pattern, replaced below by a search for the word "address":
# address_pattern = r'\b\d+\s[A-Za-z\s,]+\b'
catering_pattern = re.compile(r'\bcatering\b', re.IGNORECASE)
group_order_pattern = re.compile(r'\bgroup\sorder\b', re.IGNORECASE)
counter_pattern = re.compile(r'\bcounter\b', re.IGNORECASE)
delivery_pattern = re.compile(r'\bdelivery\b', re.IGNORECASE)
bar_pattern = re.compile(r'\bbar\b', re.IGNORECASE)
pickup_pattern = re.compile(r'\bpick\s*-\s*up\b', re.IGNORECASE)

# Matches the standalone word "address"
address_pattern = r'\baddress\b'
# Parsed sample page shared by get_data_by_regex and get_class_data
soup = BeautifulSoup(html_content, 'html.parser')

def get_data_by_regex(pattern):
    """Return data for the first text node in ``soup`` that matches ``pattern``.

    The search is case-insensitive and runs against the module-level ``soup``.

    Args:
        pattern: Regular expression, given as a string.

    Returns:
        ``None`` when no text node matches. When the first matching node has
        no ancestor with a ``class`` attribute, that node's stripped text.
        Otherwise the list returned by ``get_class_data`` for the classes of
        the nearest such ancestor.
    """
    # "string" replaces the "text" keyword, which Beautiful Soup renamed in 4.4.0.
    elements = soup.find_all(string=re.compile(pattern, re.IGNORECASE))
    if not elements:
        return None

    # The original loop returned unconditionally on its first iteration, so
    # only the first match was ever used; that is now explicit. The debug
    # prints and the unused 'data-custom' attribute lookup were dropped.
    element = elements[0]

    # Nearest ancestor that carries a class attribute (None when there is none)
    parent_element = element.find_parent(attrs={"class": True})
    if parent_element is None:
        return element.strip()
    return get_class_data(parent_element['class'])

def get_class_data(address):
    """Collect the text of every element in ``soup`` with the given class(es).

    Args:
        address: Value handed to ``find_all(class_=...)``. The caller in this
            file passes the ``class`` attribute value of a tag.

    Returns:
        A list with the ``get_text()`` result of each matching element, in
        the order ``find_all`` returned them; empty when nothing matched.
    """
    matches = soup.find_all(class_=address)
    print(matches)
    return [match.get_text() for match in matches]
# Result of the address lookup on the sample page (None when nothing matches)
ad = get_data_by_regex(address_pattern)

# Alternatively, if you want to get the HTML content:
# header_html = str(header_element)
# print("Header HTML:")
# print(header_html)

# Scratch notes from the notebook, kept as comments. As code, the indented
# find_all line is an IndentationError at top level and refers to `pattern`,
# which only exists inside get_data_by_regex.
#
# Google Maps link in the sample markup:
# "https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22"
#
# Lookup used inside get_data_by_regex:
#   elements = soup.find_all(text=re.compile(pattern, re.IGNORECASE))
#
# 'https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22'

## ADDRESS complete

In [None]:
import re
from bs4 import BeautifulSoup

# Sample HTML content (replace this with your actual HTML content)
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <div data-mesh-id="comp-l48anq6vinlineContent-gridContainer" data-testid="mesh-container-content"><div id="comp-l39r6gmj" class="comp-l39r6gmj wixui-vector-image" data-angle="0" data-angle-style-location="style" style="visibility: inherit;" data-screen-in-hide="done"><div data-testid="svgRoot-comp-l39r6gmj" class="aizuI7 TcoJIb comp-l39r6gmj"><svg preserveAspectRatio="xMidYMid meet" data-bbox="53.5 36.5 93 127" viewBox="53.5 36.5 93 127" height="200" width="200" xmlns="http://www.w3.org/2000/svg" data-type="color" role="presentation" aria-hidden="true" aria-label=""><defs><style>#comp-l39r6gmj svg [data-color="1"] {fill: #E02728;}</style></defs>
    <g>
        <path d="M99.999 163.5l-3.25-3.895C94.986 157.487 53.5 107.468 53.5 82.916 53.5 57.323 74.359 36.5 99.999 36.5c25.644 0 46.501 20.823 46.501 46.416 0 24.551-41.483 74.571-43.252 76.688l-3.249 3.896zm0-118.56c-20.978 0-38.046 17.036-38.046 37.977 0 16.359 25.019 51.015 38.046 67.305 13.029-16.29 38.048-50.946 38.048-67.305 0-20.942-17.068-37.977-38.048-37.977z" fill="#2F54DD" data-color="1"></path>
        <path d="M99.999 101.658c-10.351 0-18.775-8.407-18.775-18.741 0-10.335 8.424-18.743 18.775-18.743 10.353 0 18.777 8.408 18.777 18.743 0 10.333-8.424 18.741-18.777 18.741zm0-29.046c-5.69 0-10.32 4.621-10.32 10.304 0 5.68 4.63 10.303 10.32 10.303 5.692 0 10.324-4.622 10.324-10.303 0-5.682-4.632-10.304-10.324-10.304z" fill="#2F54DD" data-color="1"></path>
    </g>
</svg>
</div></div><div id="comp-l39r6gnw" class="KcpHeO tz5f0K comp-l39r6gnw wixui-rich-text" data-testid="richTextElement" data-angle="0" data-angle-style-location="style" style="visibility: inherit;" data-screen-in-hide="done"><p class="font_5 wixui-rich-text__text" style="font-size:20px;"><span style="font-size:20px;" class="wixui-rich-text__text">Address</span></p></div><div id="comp-l39r6go3" class="KcpHeO tz5f0K comp-l39r6go3 wixui-rich-text" data-testid="richTextElement" data-angle="0" data-angle-style-location="style" style="visibility: inherit;" data-screen-in-hide="done"><p class="font_8 wixui-rich-text__text" style="font-size:16px; line-height:1.8em;"><span style="font-size:16px;" class="wixui-rich-text__text">2135 Franklin St.</span></p>

<p class="font_8 wixui-rich-text__text" style="font-size:16px; line-height:1.8em;"><span style="font-size:16px;" class="wixui-rich-text__text">Oakland CA, 94612</span></p></div></div>
    <div>
    <a href="https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&amp;ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22" class="site-location__address" target="_blank" rel="noopener" data-bb-track="button" data-bb-track-on="click" data-bb-track-category="Address" data-bb-track-action="Click" data-bb-track-label="Header">
    <span>3583 16th St,</span>
    <span> San Francisco, CA 94114</span></a>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Heuristic for a street-address line: a number followed by letters, spaces
# and commas (e.g. "2135 Franklin St.").  It also hits unrelated text such as
# "2023 Sample Company", so matches are candidates only.
# TODO: tighten this regex (the same tag lookup can be reused for the area).
address_pattern = re.compile(r'\b\d+\s[A-Za-z\s,]+\b', re.IGNORECASE)

# Tag whose descendants should be dumped below.  Nothing assigns it yet (the
# attribute-based lookup that was meant to set it is not implemented), so the
# dump is currently skipped and ``result`` stays empty.
matching_tag = None

# ``string=`` replaces the deprecated ``text=`` keyword, and the pattern is
# already compiled, so it is passed as-is.
address_elements = soup.find_all(string=address_pattern)

for element in address_elements:
    print(element)
    # Find the nearest parent element with a class attribute
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None
    print(f"Class Attribute: {element_class}")

# Gather the text of every tag nested inside the selected tag.
all_text_content = []
if matching_tag:
    for tag in matching_tag.find_all():
        text = tag.get_text(separator='', strip=True)
        if text:
            print(text)
            all_text_content.append(text)

# Join the text content and print it
result = ', '.join(all_text_content)
print(result)


2135 Franklin St.
Class Attribute: ['wixui-rich-text__text']
© 2023 Sample Company
Class Attribute: ['footer']



  address_elements = soup.find_all(text=re.compile(address_pattern))


## phone number
Also include email extraction in the same function.

In [None]:
from bs4 import BeautifulSoup
import re

# Sample HTML content
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <div>
    <a href="https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&amp;ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22" class="site-location__address" target="_blank" rel="noopener" data-bb-track="button" data-bb-track-on="click" data-bb-track-category="Address" data-bb-track-action="Click" data-bb-track-label="Header">
    <span>3583 16th St,</span>
    <p>Phone: +9 (123) 123-1223</p>
    <span> San Francisco, CA 94114</span></a>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""
soup = BeautifulSoup(html_content, 'html.parser')
phone_pattern = r'\+\d{1,2}\s?\(\d{3}\)\s?\d{3}[-\s]\d{4}'


def findPhone(pattern):
    """Return every substring of the page text that matches *pattern*.

    Searches the text nodes of the module-level ``soup``.

    Args:
        pattern: Regular expression (string) describing a phone number.

    Returns:
        A list of matched strings in document order; empty when nothing
        matches.
    """
    # Extract with the pattern that was passed in.  The old code located
    # nodes with *pattern* but extracted with the global ``phone_pattern``,
    # which raised AttributeError whenever the two differed.
    regex = re.compile(pattern)
    phones = []
    for element in soup.find_all(string=regex):
        # finditer keeps every number in the node, not just the first one.
        phones.extend(match.group() for match in regex.finditer(element.strip()))
    return phones


print(findPhone(phone_pattern))

['+1 (123) 123-1223', '+9 (123) 123-1223']


## email

In [None]:
from bs4 import BeautifulSoup
import re

# Sample HTML content
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <div>
    <a href="https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&amp;ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22" class="site-location__address" target="_blank" rel="noopener" data-bb-track="button" data-bb-track-on="click" data-bb-track-category="Address" data-bb-track-action="Click" data-bb-track-label="Header">
    <span>3583 16th St,</span>
    <span> San Francisco, CA 94114</span></a>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

# Inside a character class "|" is a literal pipe, not alternation, so the
# former "[A-Z|a-z]" allowed "|" in the top-level domain.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

def findEmail(pattern):
    """Return the first substring of the page text that matches *pattern*.

    Searches the text nodes of the module-level ``soup`` in document order.

    Args:
        pattern: Regular expression (string) describing an e-mail address.

    Returns:
        The matched text, or ``None`` when no text node contains a match.
    """
    regex = re.compile(pattern)
    for element in soup.find_all(string=regex):
        match = regex.search(element.strip())
        # Guard against a node that only matched thanks to surrounding
        # whitespace; move on instead of failing on ``None.group()``.
        if match:
            return match.group()
    return None


print(findEmail(email_pattern))


info@example.com


## offers, boolean return
pickup, group order, catering, counter, bar, delivery, reservation

In [None]:
from bs4 import BeautifulSoup
import re

# Sample HTML content
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
        <button shape="Pill" size="12" kind="BUTTON/PRIMARY" data-testid="MenuOrderMethodTogglePickup" data-anchor-id="MenuOrderMethodTogglePickup" aria-checked="true" role="radio" class="styles__StyledButtonRoot-sc-1ldytso-0 kXBxRx"><span kind="BUTTON/PRIMARY" class="Inset__StyledInset-sc-1phi2ey-0 ipbHIR styles__ContentWrapper-sc-1ldytso-2 hasZAr"><span class="InlineChildren__StyledInlineChildren-sc-6r2tfo-0 gviwpu"><span class="styles__MainContentContainer-sc-1ldytso-3 qvNNS"><span overflow="truncate" display="block" class="styles__TextElement-sc-3qedjx-0 ciJJYj">Pickup</span></span></span></span></button>

    </div>
    <div>
    <a href="https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&amp;ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22" class="site-location__address" target="_blank" rel="noopener" data-bb-track="button" data-bb-track-on="click" data-bb-track-category="Address" data-bb-track-action="Click" data-bb-track-label="Header">
    <span>3583 16th St,</span>
<button shape="Pill" size="12" kind="BUTTON/PRIMARY" data-testid="MenuOrderMethodToggleDelivery" data-anchor-id="MenuOrderMethodToggleDelivery" aria-checked="false" role="radio" class="styles__StyledButtonRoot-sc-1ldytso-0 kJHgFL"><span kind="BUTTON/PRIMARY" class="Inset__StyledInset-sc-1phi2ey-0 ipbHIR styles__ContentWrapper-sc-1ldytso-2 hasZAr"><span class="InlineChildren__StyledInlineChildren-sc-6r2tfo-0 gviwpu"><span class="styles__MainContentContainer-sc-1ldytso-3 qvNNS"><span overflow="truncate" display="block" class="styles__TextElement-sc-3qedjx-0 ciJJYj">Delivery</span></span></span></span></button>
    <span> Group orders</span></a>
    <span> San Francisco, CA 94114</span></a>
    <span> bar</span></a><span> San Francisco, CA 94114</span></a><span> San reservations Francisco, CA 94114</span></a>
    <span> San Francisco,delivery CA 94114</span></a>
    <span> Counter</span></a>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""


soup = BeautifulSoup(html_content, 'html.parser')

keywords = ["Pickup", "Group", "Catering", "Counter", "Bar", "Delivery", "Reservations"]


def findPattern(keyword):
    """Tell whether *keyword* occurs as a whole word in the page text.

    Args:
        keyword: Regex fragment; it is wrapped in ``\\b(?:...)\\b`` and
            matched case-insensitively against the text nodes of the
            module-level ``soup``.

    Returns:
        ``True`` if at least one text node matches, ``False`` otherwise.
    """
    whole_word = re.compile(r'\b(?:' + keyword + r')\b', re.IGNORECASE)
    hits = soup.find_all(string=whole_word)
    return bool(hits)


# Report the presence of each offering on the sample page.
for keyword in keywords:
    print(keyword, findPattern(keyword))


Pickup True
Group True
Catering False
Counter True
Bar True
Delivery True
Reservations True


## year established

In [None]:
from bs4 import BeautifulSoup
import re
from datetime import datetime


# Sample HTML content
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p> 1999</p>
        <p>Email: info@example.com</p>
        <p>Phone: +1 (123) 123-1223</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
        <button shape="Pill" size="12" kind="BUTTON/PRIMARY" data-testid="MenuOrderMethodTogglePickup" data-anchor-id="MenuOrderMethodTogglePickup" aria-checked="true" role="radio" class="styles__StyledButtonRoot-sc-1ldytso-0 kXBxRx"><span kind="BUTTON/PRIMARY" class="Inset__StyledInset-sc-1phi2ey-0 ipbHIR styles__ContentWrapper-sc-1ldytso-2 hasZAr"><span class="InlineChildren__StyledInlineChildren-sc-6r2tfo-0 gviwpu"><span class="styles__MainContentContainer-sc-1ldytso-3 qvNNS"><span overflow="truncate" display="block" class="styles__TextElement-sc-3qedjx-0 ciJJYj">Pickup</span></span></span></span></button>

    </div>

    <div>
    <p style="text-align: left; color: green;">We opened as “Club Sinaloa” in 1960 as a meals.</p>
    <a href="https://maps.google.com/?q=3583+16th+St,+San+Francisco,+CA+94114,+USA&amp;ftid=0x808f7e1c89dc2e5b:0xa1d14effd3552b22" class="site-location__address" target="_blank" rel="noopener" data-bb-track="button" data-bb-track-on="click" data-bb-track-category="Address" data-bb-track-action="Click" data-bb-track-label="Header">
    <span>3583 16th St,</span>
<button shape="Pill" size="12" kind="BUTTON/PRIMARY" data-testid="MenuOrderMethodToggleDelivery" data-anchor-id="MenuOrderMethodToggleDelivery" aria-checked="false" role="radio" class="styles__StyledButtonRoot-sc-1ldytso-0 kJHgFL"><span kind="BUTTON/PRIMARY" class="Inset__StyledInset-sc-1phi2ey-0 ipbHIR styles__ContentWrapper-sc-1ldytso-2 hasZAr"><span class="InlineChildren__StyledInlineChildren-sc-6r2tfo-0 gviwpu"><span class="styles__MainContentContainer-sc-1ldytso-3 qvNNS"><span overflow="truncate" display="block" class="styles__TextElement-sc-3qedjx-0 ciJJYj">Delivery</span></span></span></span></button>
    <span> Group orders</span></a>
    <span> San Francisco, CA 94114</span></a>
    <span> bar</span></a><span> San Francisco, CA 94114</span></a><span> San reservations Francisco, CA 94114</span></a>
    <span> San Francisco,delivery CA 94114</span></a>
    <span> Counter</span></a>
    <p>Phone: +9 (123) 123-1223</p>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

soup = BeautifulSoup(html_content, 'html.parser')
year_pattern = r'\b\d{4}\b'


def findYear(year_pattern):
    """Return candidate "year established" values found in the page text.

    Searches the text nodes of the module-level ``soup``.  Text that contains
    a phone number (as reported by ``findPhone(phone_pattern)``) or a
    copyright sign is ignored, because its four-digit groups are not
    founding years.  Only the first match of each remaining text node is
    used.

    Args:
        year_pattern: Regular expression (string) for a year.

    Returns:
        A list of ints in document order, none greater than the current year.
    """
    regex = re.compile(year_pattern)
    candidates = [element.strip() for element in soup.find_all(string=regex)]

    all_phones = findPhone(phone_pattern)
    candidates = [
        text for text in candidates
        if '©' not in text and not any(phn in text for phn in all_phones)
    ]

    current_year = datetime.now().year
    years = []
    for text in candidates:
        match = regex.search(text)
        if match is None:
            continue
        year = int(match.group())
        # Filter while building the list.  The previous version removed items
        # from the list it was iterating, which skipped the element after
        # each removal and let consecutive future years through.
        if year <= current_year:
            years.append(year)
    return years


print(findYear(year_pattern))



[1999, 1960]


## time

In [None]:
from bs4 import BeautifulSoup
import re

# Sample HTML content
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <div>
    <div id="text-3" class="widget-odd widget-last widget-3 centered widget widget--menu widget_text">
      <h4 class="widget__title widget--menu__title">Business Hours</h4>
      <div class="textwidget"><div class="pixcode  pixcode--separator  separator separator--flower">✻</div>
      <p>Sunday thru Thursday<br>
      11:00am&nbsp; to 9:00pm</p>
      <p>Friday and Saturday<br>
      11:00am to 10:00pm</p>
      <h4><strong style="color: #ed1c24;">Last orders taken 30-minutes before closing</strong></h4>
    </div>
		</div>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

soup = BeautifulSoup(html_content, 'html.parser')
time_pattern = r'\b\d{1,2}(?::\d{2})?\s*[APap][Mm]\b'


def findTime(pattern):
    """Return the text block surrounding the last time-of-day on the page.

    Searches the text nodes of the module-level ``soup`` case-insensitively.

    Args:
        pattern: Regular expression (string) for a time such as "11:00am".

    Returns:
        ``get_text()`` of the nearest ancestor with a ``class`` attribute of
        the last matching text node; the node's own text when no ancestor has
        a class; ``None`` when nothing matches.  (The last two cases used to
        raise AttributeError / UnboundLocalError.)
    """
    matches = soup.find_all(string=re.compile(pattern, re.IGNORECASE))
    if not matches:
        return None
    last_match = matches[-1]
    # Find the nearest parent element with a class attribute
    container = last_match.find_parent(attrs={"class": True})
    if container is None:
        return str(last_match)
    return container.get_text()


# Pass the pattern defined in this cell (the call used the undefined t_pattern).
print(findTime(time_pattern))


✻
Sunday thru Thursday
      11:00am  to 9:00pm
Friday and Saturday
      11:00am to 10:00pm
Last orders taken 30-minutes before closing



## about, menu --> only urls

In [None]:
from bs4 import BeautifulSoup
import re

# Sample HTML content
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <div>
    <div id="text-3" class="widget-odd widget-last widget-3 centered widget widget--menu widget_text">
      <h4 class="widget__title widget--menu__title">Business Hours</h4>
      <div class="textwidget"><div class="pixcode  pixcode--separator  separator separator--flower">✻</div>
      <p>Sunday thru Thursday<br>
      11:00am&nbsp; to 9:00pm</p>
      <p>Friday and Saturday<br>
      11:00am to 10:00pm</p>
      <h4><strong style="color: #ed1c24;">Last orders taken 30-minutes before closing</strong></h4>
    </div>
    <div id="text-2" class="widget-odd widget-first widget-1 centered widget widget--menu widget_text"><h4 class="widget__title widget--menu__title">About</h4>			<div class="textwidget"><div class="pixcode  pixcode--separator  separator separator--flower">✻</div>

    <h2><strong style="color: red;">SINALOA CAFE</strong></h2>
    <p style="text-align: left; color: green;">We opened as “Club Sinaloa” in 1960 as a dance club and bar. Adolfo &amp; Mary Pena. never dreamed it would turn into a family tradition lasting over 50 years. As it was, patrons stayed late and ate tacos and burritos before leaving. which encouraged to expand the kitchen, dining room and menu.<br>
    We’ve always used the freshest produce and the highest quality of meats to prepare our famous meals.</p>
    </div>
		</div>
		</div>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

soup = BeautifulSoup(html_content, 'html.parser')
t_pattern = r'\babout\b'


# NOTE(review): despite its name this helper does not look for e-mail
# addresses; it is the "about" section probe and shadows the e-mail helper of
# the same name - consider renaming once callers are checked.
def findEmail(pattern):
    """Print each text node matching *pattern* and its class-bearing ancestor.

    The match is case-insensitive and runs against the module-level ``soup``.
    For every hit the text node is printed, followed by the nearest ancestor
    that has a ``class`` attribute (``None`` is printed if there is none).

    Returns:
        ``None``; the function only prints.
    """
    about_regex = re.compile(pattern, re.IGNORECASE)
    for text_node in soup.find_all(string=about_regex):
        print(text_node)
        print(text_node.find_parent(attrs={"class": True}))


print(findEmail(t_pattern))


About
<h4 class="widget__title widget--menu__title">About</h4>
None


In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import re  
import time  
from requests.exceptions import Timeout, RequestException  # Import the Timeout exception for links which takes too much time to load

# Regex patterns and keyword list used by the page-analysis helpers below.
time_pattern = r'\b\d{1,2}(?::\d{2})?\s*[APap][Mm]\b'
phone_pattern = r'\+\d{1,2}\s?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
year_pattern = r'\b\d{4}\b'
keywords = ["Pickup", "Group", "Catering", "Counter", "Bar", "Delivery", "Reservations"]


def findTime(pattern):
    """Return the text block surrounding the last time-of-day on the page.

    Searches the text nodes of the module-level ``soup`` case-insensitively.

    Args:
        pattern: Regular expression (string) for a time such as "11:00am".

    Returns:
        ``get_text()`` of the nearest ancestor with a ``class`` attribute of
        the last matching text node; the node's own text when no ancestor has
        a class; ``None`` when nothing matches.  (The last two cases used to
        raise AttributeError / UnboundLocalError, which aborted the crawl of
        any page without opening hours.)
    """
    matches = soup.find_all(string=re.compile(pattern, re.IGNORECASE))
    if not matches:
        return None
    last_match = matches[-1]
    # Find the nearest parent element with a class attribute
    container = last_match.find_parent(attrs={"class": True})
    if container is None:
        return str(last_match)
    return container.get_text()



#year function

# phone_pattern = r'\+\d{1,2}\s?\(\d{3}\)\s?\d{3}[-\s]\d{4}'

def findYear(year_pattern):
    """Return candidate "year established" values found in the page text.

    Searches the text nodes of the module-level ``soup``.  Text that contains
    a phone number (as reported by ``findPhone(phone_pattern)``) or a
    copyright sign is ignored, because its four-digit groups are not
    founding years.  Only the first match of each remaining text node is
    used.

    Args:
        year_pattern: Regular expression (string) for a year.

    Returns:
        A list of ints in document order, none greater than the current year.
    """
    # This cell never imports datetime, so bring it in locally.
    from datetime import datetime

    regex = re.compile(year_pattern)
    candidates = [element.strip() for element in soup.find_all(string=regex)]

    all_phones = findPhone(phone_pattern)
    # The copyright filter applies whether or not the page has phone numbers;
    # it used to be skipped when findPhone() returned nothing.
    candidates = [
        text for text in candidates
        if '©' not in text and not any(phn in text for phn in all_phones)
    ]

    current_year = datetime.now().year
    years = []
    for text in candidates:
        match = regex.search(text)
        if match is None:
            continue
        year = int(match.group())
        # Filter while building the list.  The previous version removed items
        # from the list it was iterating, which skipped the element after
        # each removal and let consecutive future years through.
        if year <= current_year:
            years.append(year)
    return years



# One helper for all boolean flags (e.g. has_delivery, has_catering, ...).
# findPattern returns a single bool per keyword, so call it once for each keyword.




def findPattern(keyword):
    """Tell whether *keyword* occurs as a whole word in the page text.

    Args:
        keyword: Regex fragment; it is wrapped in ``\\b(?:...)\\b`` and
            matched case-insensitively against the text nodes of the
            module-level ``soup``.

    Returns:
        ``True`` if at least one text node matches, ``False`` otherwise.
    """
    whole_word = re.compile(r'\b(?:' + keyword + r')\b', re.IGNORECASE)
    hits = soup.find_all(string=whole_word)
    return bool(hits)


# Runs at definition time against whatever document ``soup`` currently holds.
for keyword in keywords:
    print(keyword, findPattern(keyword))





# email 

# Inside a character class "|" is a literal pipe, not alternation, so the
# former "[A-Z|a-z]" allowed "|" in the top-level domain.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

def findEmail(pattern):
    """Return the first substring of the page text that matches *pattern*.

    Searches the text nodes of the module-level ``soup`` in document order.

    Args:
        pattern: Regular expression (string) describing an e-mail address.

    Returns:
        The matched text, or ``None`` when no text node contains a match.
    """
    regex = re.compile(pattern)
    for element in soup.find_all(string=regex):
        match = regex.search(element.strip())
        # Guard against a node that only matched thanks to surrounding
        # whitespace; move on instead of failing on ``None.group()``.
        if match:
            return match.group()
    return None



# Phone numbers: findPhone returns a list, so handle the result accordingly.





def findPhone(pattern):
    """Return every substring of the page text that matches *pattern*.

    Searches the text nodes of the module-level ``soup``.

    Args:
        pattern: Regular expression (string) describing a phone number.

    Returns:
        A list of matched strings in document order; empty when nothing
        matches.
    """
    # Extract with the pattern that was passed in.  The old code located
    # nodes with *pattern* but extracted with the global ``phone_pattern``,
    # which raised AttributeError whenever the two differed.
    regex = re.compile(pattern)
    phones = []
    for element in soup.find_all(string=regex):
        # finditer keeps every number in the node, not just the first one.
        phones.extend(match.group() for match in regex.finditer(element.strip()))
    return phones









def categorize_links(links):
    """Arrange links into four fixed slots followed by everything else.

    Args:
        links: Iterable of URL strings, or ``None``.

    Returns:
        ``[]`` when *links* is ``None``.  Otherwise a list whose first four
        items are the Google Maps, Instagram, Facebook and Twitter links
        (``""`` for a category with no link; the last link seen wins when a
        category occurs more than once), followed by the remaining links in
        iteration order.
    """
    if links is None:
        return []

    # Checked in this order; a link lands in the first category it contains.
    markers = ("maps.google.com", "www.instagram.com", "www.facebook.com", "twitter.com")
    slots = [""] * len(markers)
    uncategorized = []

    for link in links:
        for position, marker in enumerate(markers):
            if marker in link:
                slots[position] = link
                break
        else:
            # No marker matched.
            uncategorized.append(link)

    return slots + uncategorized


def extract_links_with_error_handling(url, index, visited_links=None, depth=0, max_depth=2):
    """Crawl *url*, print the details found on it and collect its links.

    The page is fetched (retrying on timeouts), parsed, scanned with the
    find* helpers (e-mail, phone numbers, opening times, founding year and
    the offering keywords, all printed), and its absolute http(s) links are
    followed recursively while ``depth <= max_depth``.

    Args:
        url: Address of the page to crawl.
        index: Identifier of the source row; only used in log messages.
        visited_links: Set of URLs already crawled, shared across the
            recursion; a fresh set is created when omitted.
        depth: Current recursion depth.
        max_depth: Deepest level that is still fetched.

    Returns:
        A set with the non-social absolute links found on this page and on
        the pages reached from it.  Empty when the URL was already visited,
        lies beyond *max_depth*, or could not be fetched.
    """
    # The find* helpers read the module-level ``soup``.  Assigning a local
    # variable of that name left them scanning whatever document had been
    # parsed earlier, so the freshly parsed page is published globally.
    global soup

    if visited_links is None:
        visited_links = set()

    retries = 3  # Number of attempts before giving up on a timing-out URL
    retry_delay = 2  # Delay between attempts in seconds
    unique_links = set()

    if url in visited_links or depth > max_depth:
        return unique_links
    visited_links.add(url)

    try:
        print(f'Crawling {url}')
        response = None
        for attempt in range(retries):
            try:
                # 5 seconds to connect and 5 seconds to read
                response = requests.get(url, timeout=(5, 5))
                break
            except Timeout:
                print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
                if attempt < retries - 1:
                    time.sleep(retry_delay)
        if response is None:
            return unique_links
        response.raise_for_status()

        page = BeautifulSoup(response.text, 'html.parser')
        soup = page

        # Detail extraction is best-effort: a failure here must not discard
        # the links of the page.
        try:
            email = findEmail(email_pattern)
            phone_number = findPhone(phone_pattern)
            timing = findTime(time_pattern)
            year_established = findYear(year_pattern)
            print(email, phone_number, timing, year_established)
            for key in keywords:
                print(key, findPattern(key))
        except Exception as e:
            print(f"Error extracting details from index {index} for URL {url}: {str(e)}")

        # NOTE(review): social links are dropped here, so the Instagram /
        # Facebook / Twitter slots of categorize_links() can never be filled
        # from this crawl - confirm whether they should be kept but not followed.
        social_markers = ("whatsapp", "insta", "facebook", "twitter")
        for anchor in page.find_all('a', href=True):
            link = anchor['href']
            if re.match(r'^https?://', link) and not any(social in link for social in social_markers):
                unique_links.add(link)

        # Iterate over a snapshot because the set grows during the recursion.
        for link in list(unique_links):
            unique_links |= extract_links_with_error_handling(link, index, visited_links, depth + 1, max_depth)

    except RequestException as e:
        print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
    except Exception as e:
        print(f"Error extracting links from index {index} for URL {url}: {str(e)}")

    return unique_links


import os

# Maps each start link to the links discovered from it.
link_data = {}

# Read the CSV file containing links (column 0: index, column 1: link)
csv_filename = 'outfile.csv'  # Change this to your CSV file name
# newline='' is what the csv module expects for files it reads or writes.
with open(csv_filename, 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns
        link_index, link = row[0], row[1]
        if link == '':
            continue
        print(' ---------------------------going for-----------------------------\n ', link_index, ' : ', link)
        # Every start link gets its own visited set and a shallow crawl.
        visited_links = set()
        max_depth = 1
        link_data[link] = extract_links_with_error_handling(link, link_index, visited_links, depth=0, max_depth=max_depth)

        time.sleep(1)  # Pause between sites

# Categorize links after extracting all links
link_data = {key: categorize_links(values) for key, values in link_data.items()}

# Print the resulting dictionary and save it to an output file
output_filename = 'deepcrawl/links_large.csv'
# Create the target folder first so a long crawl is not lost to a
# FileNotFoundError at the very end.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
# Write UTF-8 explicitly so links with non-ASCII characters can be saved on
# any platform.
with open(output_filename, 'w', newline='', encoding='utf-8') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")