In [6]:
import csv
import os
import re
import threading
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from requests.exceptions import Timeout, RequestException  # Import the Timeout exception for links which takes too much time to load

In [7]:
# phone_pattern = r'\+\d{1,2}\s?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
# email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
# # for boolean values
# keywords_pattern = ["Pickup", "Group", "Catering", "Counter", "Bar", "Delivery", "Reservations"] 
# year_pattern = r'\b\d{4}\b'
# time_pattern = r'\b\d{1,2}(?::\d{2})?\s*[APap][Mm]\b'


In [8]:
# Function to categorize links on a webpage
def categorize_links(data_dict, links):
    """Sort crawled links into the known link columns of ``data_dict``.

    Each link is tested against an ordered list of substrings; the first
    substring found in the link decides which ``data_dict`` key receives it.
    When several links fall into the same category, the last one seen wins.

    Args:
        data_dict: Mutable mapping for one output row; updated in place.
        links: Iterable of URL strings, or ``None``.

    Returns:
        list: The links that matched no category (empty when ``links`` is
        ``None``). Previously this list was built but never returned.
    """
    if links is None:
        return []

    # Order matters: a link is assigned to the first category it matches,
    # e.g. a URL containing both "menu" and "order" counts as a menu link.
    categories = (
        ("menu", "restaurant_provided_menu_link"),
        ("order", "order_online_link"),
        ("facebook", "facebook_link"),
        ("instagram", "instagram_link"),
        ("twitter", "twitter_link"),
        ("play_store", "play_store_link"),
        ("app_store", "app_store_link"),
        ("maps", "maps_link"),
    )

    other_links = []
    for link in links:
        for needle, column in categories:
            if needle in link:
                data_dict[column] = link
                break
        else:
            # No category matched this link.
            other_links.append(link)

    return other_links

def findPhone(soup):
    """Collect phone numbers shaped like ``+1 (123) 456-7890`` from a page.

    Args:
        soup: Parsed page. Only ``soup.find_all(string=<compiled regex>)`` is
            used, and each returned element is treated as a string.

    Returns:
        list[str]: Every matching phone number, in the order the text nodes
        were returned (empty when none are found). All numbers inside a text
        node are collected, not only the first one.
    """
    phone_pattern = r'\+\d{1,2}\s?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
    phone_regex = re.compile(phone_pattern)

    phones = []
    for element in soup.find_all(string=phone_regex):
        # The pattern has no capturing groups, so findall yields full matches.
        phones.extend(phone_regex.findall(element))
    return phones

def findEmail(soup):
    """Return the first e-mail address found in the page text.

    Args:
        soup: Parsed page. Only ``soup.find_all(string=<compiled regex>)`` is
            used, and each returned element is treated as a string.

    Returns:
        str | None: The first matching address, or ``None`` when the page
        contains none.
    """
    # The TLD class used to be [A-Z|a-z], which also accepted a literal '|'
    # (inside a character class '|' is not an alternation operator).
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
    email_regex = re.compile(email_pattern)

    for element in soup.find_all(string=email_regex):
        match = email_regex.search(element)
        if match:
            return match.group()
    return None
  
def findBooleanValues(soup):
    """Report which service keywords appear as whole words in the page text.

    Args:
        soup: Parsed page; queried once per keyword via
            ``soup.find_all(string=<compiled regex>)``.

    Returns:
        dict[str, bool]: One entry per keyword, ``True`` when at least one
        text node contains the keyword (case-insensitive, word-bounded).
    """
    keywords = ("pickup", "group", "catering", "counter", "bar", "delivery", "gifting", "takeout")

    return {
        word: bool(
            soup.find_all(string=re.compile(r'\b(?:' + word + r')\b', re.IGNORECASE))
        )
        for word in keywords
    }

def findYear(soup):
    """Extract plausible years (e.g. a founding year) from the page text.

    A text node is a candidate when it contains a standalone 4-digit number.
    Candidates are discarded when they contain a phone number found by
    ``findPhone`` or a copyright sign. Only the first 4-digit number of each
    remaining node is used, and years later than the current year are dropped.

    Args:
        soup: Parsed page. Only ``soup.find_all(string=<compiled regex>)`` is
            used here; the same object is passed on to ``findPhone``.

    Returns:
        list[int]: Candidate years in the order their text nodes were returned.
    """
    year_pattern = r'\b\d{4}\b'
    year_regex = re.compile(year_pattern)

    candidate_texts = [element.strip() for element in soup.find_all(string=year_regex)]
    all_phones = findPhone(soup)
    current_year = datetime.now().year

    years = []
    for text in candidate_texts:
        # The last four digits of a phone number also look like a year.
        if any(phone in text for phone in all_phones):
            continue
        # Copyright notices carry the current year, not a founding year.
        if '©' in text:
            continue
        match = year_regex.search(text)
        if match is None:
            continue
        year = int(match.group())
        # Filter while building the list; the old code removed items from the
        # list it was iterating over, which skipped the element that followed
        # every removed year.
        if year <= current_year:
            years.append(year)
    return years

def findTime(soup):
    """Return whitespace-normalised text surrounding a time such as ``9:00pm``.

    For every text node containing a time, the text of the nearest ancestor
    that has a ``class`` attribute is taken. If there is no such ancestor,
    the node's own text is used. As before, when several nodes match, the
    result for the last one is returned.

    Args:
        soup: Parsed page; queried via ``soup.find_all(string=<compiled regex>)``.
            Each returned element must offer ``find_parent(attrs=...)``.

    Returns:
        str | None: The normalised text, or ``None`` when the page contains no
        time. This case used to raise ``UnboundLocalError``.
    """
    time_pattern = r'\b\d{1,2}(?::\d{2})?\s*[APap][Mm]\b'
    formatted_data = None

    for element in soup.find_all(string=re.compile(time_pattern, re.IGNORECASE)):
        # Find the nearest parent element with a class attribute
        parent_element = element.find_parent(attrs={"class": True})
        if parent_element is not None:
            raw_text = parent_element.get_text()
        else:
            # No classed ancestor: fall back to the matching text node itself.
            raw_text = str(element)
        # Collapse runs of whitespace/newlines into single spaces.
        formatted_data = ' '.join(raw_text.split())

    return formatted_data


In [9]:
import csv
import requests
from bs4 import BeautifulSoup
import re
import time  
import threading
from requests.exceptions import Timeout, RequestException  # Import the Timeout exception for links which takes too much time to load

# CSV of sites to crawl. The first row is a header; in each data row the first
# column is used as the link index and the second column as the URL to fetch.
input_filename = 'input/input4.csv'  # Change this to your CSV file name
# CSV that receives one appended row per crawled link.
output_filename = 'output/output4.csv'

# Lock used by the crawler threads to synchronize access to the shared
# link_data dict.
link_data_lock = threading.Lock()

def write_output(data_dict, output_filename):
    """Append one row to the output CSV, writing the header for a new file.

    Args:
        data_dict: Mapping of column name to value; its keys, in order, are
            used as the CSV field names for this row.
        output_filename: Path of the CSV file to append to. Missing parent
            directories are created.

    Note:
        The function does no locking of its own. Callers running in several
        threads must serialise calls that target the same file.
    """
    header = list(data_dict.keys())

    # Appending to a file in a directory that does not exist would raise
    # FileNotFoundError, so make sure the directory is there first.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 so scraped non-ASCII text does not depend on the
    # platform's default encoding.
    with open(output_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)

        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csvfile.tell() == 0:
            writer.writeheader()

        writer.writerow(data_dict)

# Function to crawl a single link and store the result in link_data
def crawl_link(link_index, link, max_depth, data_dict):
    """Crawl one start URL, enrich its row, and record the results.

    This is the thread target. It fetches ``link``, lets
    ``extract_links_with_error_handling`` fill empty fields of ``data_dict``,
    categorises the discovered links into ``data_dict``, appends the row to
    ``output_filename`` and stores the discovered links in ``link_data``.

    Args:
        link_index: Identifier of the row, used only in log messages.
        link: URL to crawl.
        max_depth: Depth limit forwarded to the extractor.
        data_dict: Row data for this link; mutated in place.
    """
    visited_links = set()
    result = extract_links_with_error_handling(
        link,
        link_index,
        data_dict,
        visited_links,
        depth=0,
        max_depth=max_depth,
    )

    categorize_links(data_dict, result)

    # Every crawler thread appends to the same CSV file. Holding the lock
    # keeps the "is the file empty?" header check and the row write from
    # interleaving between threads. Previously only link_data was protected.
    with link_data_lock:
        write_output(data_dict, output_filename)
        link_data[link] = result

def _fetch_soup(url, index, retries, retry_delay):
    """Download and parse ``url``, retrying when the request times out.

    Returns the parsed page, or ``None`` when every attempt timed out. Any
    other exception (HTTP error status, connection failure, ...) propagates
    to the caller.
    """
    for attempt in range(1, retries + 1):
        try:
            # (connect timeout, read timeout), 5 seconds each
            response = requests.get(url, timeout=(5, 5))
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except Timeout:
            if attempt < retries:
                print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
                time.sleep(retry_delay)
            else:
                print(f"Timeout occurred for index {index} and URL {url}. Giving up after {retries} attempts.")
    return None


def _fill_missing_fields(data_dict, soup):
    """Fill the fields of ``data_dict`` that are empty strings from the page."""
    boolean_keys = (
        'offers_delivery',
        'offers_pickup',
        'offers_group_order',
        'offers_catering',
        'has_bar',
        'has_counter',
        'has_gifting',
        'has_takeout',
    )
    boolean_values = None  # computed at most once per page, on first need

    # Iterate over a snapshot because values are reassigned inside the loop.
    for key, value in list(data_dict.items()):
        # Only the empty string counts as "missing"; None is left untouched.
        if value != '':
            continue

        if key == 'telephone_number':
            data_dict[key] = findPhone(soup)
        if key == 'year_established':
            data_dict[key] = findYear(soup)
        if key == 'email_address':
            data_dict[key] = findEmail(soup)
        # operating_hours is intentionally not filled (findTime is disabled).

        if key in boolean_keys:
            if boolean_values is None:
                boolean_values = findBooleanValues(soup)
            # A keyword such as "bar" maps to the column containing it ("has_bar").
            for keyword, found in boolean_values.items():
                if keyword in key:
                    data_dict[key] = found


def extract_links_with_error_handling(url, index, data_dict, visited_links=None, depth=0, max_depth=2):
    """Fetch a page, fill missing row fields from it, and return its links.

    Args:
        url: Page to download.
        index: Row identifier, used only in log messages.
        data_dict: Row data; fields whose value is ``''`` may be filled in
            place from the page content.
        visited_links: Set of URLs already processed; ``url`` is added to it.
            A new set is created when omitted.
        depth: Current crawl depth.
        max_depth: Pages deeper than this are not fetched.

    Returns:
        set[str]: Absolute http(s) links found on the page, excluding links
        that contain "whatsapp", "insta", "facebook" or "twitter". Empty when
        the page was skipped or could not be fetched or parsed. Errors are
        logged with ``print`` and never raised.
    """
    if visited_links is None:
        visited_links = set()

    retries = 3  # Number of attempts on timeout before giving up
    retry_delay = 2  # Delay between attempts in seconds
    unique_links = set()

    if url in visited_links or depth > max_depth:
        return unique_links
    visited_links.add(url)

    try:
        soup = _fetch_soup(url, index, retries, retry_delay)
        if soup is None:
            return unique_links

        _fill_missing_fields(data_dict, soup)

        # NOTE(review): social links are dropped here, so a caller that
        # categorises the returned set never sees facebook/instagram/twitter
        # URLs. Confirm whether that is intended.
        skipped_fragments = ("whatsapp", "insta", "facebook", "twitter")
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if not re.match(r'^https?://', href):
                continue  # relative links, mailto:, tel:, ...
            if any(fragment in href for fragment in skipped_fragments):
                continue
            unique_links.add(href)

        # Following the collected links recursively is currently disabled.

    except RequestException as e:
        print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
    except Exception as e:
        print(f"Error extracting links from index {index} for URL {url}: {str(e)}")

    return unique_links

# Shared results: maps each start URL to the set of links found on its page.
# Written by the crawler threads (see crawl_link).
link_data = {}
threads = []

# Link columns that may be filled in while categorising crawled links. They
# are all declared up front so every output row has the same columns, even
# when the input CSV does not provide them. Columns already present in the
# input header keep their position.
new_rows = [
    'instagram_link',
    'twitter_link',
    'play_store_link',
    'app_store_link',
    'restaurant_provided_menu_link',
    'order_online_link',
    'facebook_link',
    'maps_link',
]

# Read the CSV file containing links. newline='' is what the csv module
# expects, so quoted fields with embedded newlines are parsed correctly.
with open(input_filename, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    # The first row holds the column names; an empty file yields no columns.
    header_row = next(reader, [])

    # Position of each column; for duplicated names the first occurrence wins.
    column_index = {}
    for position, key in enumerate(header_row):
        column_index.setdefault(key, position)

    # Template of all output columns: input columns first, then the extras.
    dummy_dict = dict.fromkeys(header_row)
    for extra_key in new_rows:
        dummy_dict[extra_key] = None

    max_depth = 1

    # Iterate over each data row in the CSV file
    for row in reader:
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns
        link_index = row[0]
        link = row[1]

        if link == '':
            continue  # Nothing to crawl for this row

        # Build this row's dictionary: CSV values for input columns, None for
        # the extra columns.
        data_dict = {}
        for key in dummy_dict:
            position = column_index.get(key)
            if position is None:
                data_dict[key] = None
            elif position < len(row):
                data_dict[key] = row[position]
            else:
                # Row is shorter than the header: treat the cell as empty
                # (so it can be filled from the page) instead of raising
                # IndexError.
                data_dict[key] = ''

        # Create and start a new thread for each link
        thread = threading.Thread(target=crawl_link, args=(link_index, link, max_depth, data_dict))
        thread.start()
        threads.append(thread)
        # Stagger thread start-up so requests are not all fired at once.
        time.sleep(1)

# Wait for all threads to finish
for thread in threads:
    thread.join()



RequestException occurred for index 22822 and URL http://www.elcafecitoespresso.com/: HTTPConnectionPool(host='www.elcafecitoespresso.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f106b560970>: Failed to establish a new connection: [Errno -2] Name or service not known'))
RequestException occurred for index 22826 and URL http://www.winebar107.com/contact-us.html: 404 Client Error: Not Found for url: http://www.winebar107.com/contact-us.html
RequestException occurred for index 22824 and URL http://www.theplantcafe.com/location/sfo: 404 Client Error: Not Found for url: https://theplantcafe.com/location/sfo
Timeout occurred for index 22821 and URL http://www.7-eleven.com/?yext=14178. Retrying...
Timeout occurred for index 22828 and URL http://www.7-eleven.com/?yext=14133. Retrying...
RequestException occurred for index 22847 and URL http://asientosf.com/asiento/Intro.html: 404 Client Error: Not Found for url:

In [None]:
# html_content = """
# <html>
# <head>
#     <title>Sample Page</title>
# </head>
# <body>
#     <div class="header" id="top-header">
#         <h1>Contact Us</h1>
#         <p>Phone: +1 (123) 123-1223</p>
#         <p>Email: info@example.com</p>
#     </div>
#     <div class="content">
#         <p class="text">Visit our site for more information.</p>
#     </div>
#     <div>
#     <div id="text-3" class="widget-odd widget-last widget-3 centered widget widget--menu widget_text">
#       <h4 class="widget__title widget--menu__title">Business Hours</h4>
#       <div class="textwidget"><div class="pixcode  pixcode--separator  separator separator--flower">✻</div>
#       <p>Sunday thru Thursday<br>
#       11:00am&nbsp; to 9:00pm</p>
#       <p>Friday and Saturday<br>
#       11:00am to 10:00pm</p>
#       <h4><strong style="color: #ed1c24;">Last orders taken 30-minutes before closing</strong></h4>
#     </div>
#     <div id="text-2" class="widget-odd widget-first widget-1 centered widget widget--menu widget_text"><h4 class="widget__title widget--menu__title">About</h4>			<div class="textwidget"><div class="pixcode  pixcode--separator  separator separator--flower">✻</div>

#     <h2><strong style="color: red;">SINALOA CAFE</strong></h2>
#     <p style="text-align: left; color: green;">We opened as “Club Sinaloa” in 1960 as a dance club and bar. Adolfo &amp; Mary Pena. never dreamed it would turn into a family tradition lasting over 50 years. As it was, patrons stayed late and ate tacos and burritos before leaving. which encouraged to expand the kitchen, dining room and menu.<br>
#     We’ve always used the freshest produce and the highest quality of meats to prepare our famous meals.</p>
#     </div>
# 		</div>
# 		</div>
#     </div>
#     <footer class="footer" id="page-footer">
#         <p>&copy; 2023 Sample Company</p>
#     </footer>
# </body>
# </html>
# """

# soup = BeautifulSoup(html_content, 'html.parser')

# my_dict = {'findPhone': None, 'findEmail': None, 'findBooleanValues': None, 'findYear': None, 'findTime': None}

# my_dict['findPhone'] = findPhone(soup)
# my_dict['findEmail'] = findEmail(soup)
# for (key,val) in findBooleanValues(soup).items():
# 	my_dict[key] = val
# my_dict['findTime'] = findTime(soup)
# # my_dict['findYear'] = findYear(soup)

# print(my_dict)
