# Utility Functions

In [72]:
def dedup(domains_from_file, domains):
  """Return the entries of ``domains`` that are not in ``domains_from_file``.

  The order of ``domains`` is preserved and repeated entries in ``domains``
  are kept; only values already present in ``domains_from_file`` are dropped.
  """
  try:
    # Build the lookup once so each membership test is O(1) instead of
    # rescanning the whole list for every candidate.
    already_known = set(domains_from_file)
  except TypeError:
    # Unhashable entries cannot live in a set; fall back to a linear scan.
    already_known = list(domains_from_file)
  return [domain for domain in domains if domain not in already_known]


In [73]:
import tldextract
def normalize_domain(domain):
  """Reduce ``domain`` to its lower-cased ``domain.suffix`` form.

  The input is split with ``tldextract.extract``; any subdomain is discarded.
  When no suffix is recognised only the domain part is returned, and an input
  with neither part yields an empty string.
  """
  extracted = tldextract.extract(domain)
  # Join only the parts that are present: formatting both unconditionally
  # leaves a dangling "." when the suffix (or everything) is empty.
  present_parts = [part for part in (extracted.domain, extracted.suffix) if part]
  return ".".join(present_parts).lower()


In [74]:
import os
import csv

def toCSV(source, rows):
    """Append ``rows`` to the CSV file named ``{source}.csv``.

    Args:
        source: Path prefix of the output file; ``.csv`` is appended to it.
        rows: Iterable of rows, each an iterable of cell values.

    The file is created when it does not exist yet; existing content is kept.
    """
    csv_filename = f'{source}.csv'

    # Append mode already creates a missing file, so a separate existence
    # check is unnecessary (and would be racy between check and open).
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)

    print(f"Data has been written to {csv_filename}")


In [75]:
def save_html(source, domain, html_content):
  """Write ``html_content`` to ``./HTML/{source}/{domain}.html``.

  The target folder is created when missing and an existing file for the
  same domain is overwritten. The destination path is printed before writing.
  """
  folder_path = f'./HTML/{source}'
  # exist_ok avoids the check-then-create race of testing for the folder first.
  os.makedirs(folder_path, exist_ok=True)

  file_path = folder_path + f'/{domain}.html'
  print(file_path)
  # Page source can contain characters the platform default encoding cannot
  # represent; writing UTF-8 explicitly keeps the save from failing on them.
  with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_content)

In [76]:
from bs4 import BeautifulSoup

def extractData(html_content):
    """Return the cell text of the first table found in ``html_content``.

    Returns:
        A list with one entry per ``tr`` row, each entry being the list of
        stripped ``td`` texts of that row (empty for rows without ``td``
        cells), or ``None`` when the document contains no table at all.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if table is None:
        return None

    # A table may have no tbody element; in that case find() returns None and
    # chaining .find_all() on it would raise, so read rows from the table itself.
    row_container = table.find('tbody')
    if row_container is None:
        row_container = table

    return [
        [cell.get_text(strip=True) for cell in row.find_all('td')]
        for row in row_container.find_all('tr')
    ]


# Data Retrieval

In [77]:
def read_domains(file_path):
    """Return the non-blank lines of ``file_path``, stripped, in file order."""
    cleaned = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Lines that are empty after stripping carry no domain.
            if entry:
                cleaned.append(entry)
    return cleaned

In [78]:
import os

def get_finished_HTML_domains(directory='./HTML/aeroleads'):
    """Return the names of the ``.html`` files in ``directory`` without the suffix.

    The directory is created when it does not exist, in which case the result
    is an empty list. Files with any other ending are ignored.
    """
    # exist_ok avoids the check-then-create race of testing for the folder first.
    os.makedirs(directory, exist_ok=True)

    suffix = '.html'
    # Slice the suffix off rather than using os.path.splitext: splitext treats
    # names that consist only of leading dots plus "html" as having no
    # extension and would hand back the full filename for them.
    return [
        filename[:-len(suffix)]
        for filename in os.listdir(directory)
        if filename.endswith(suffix)
    ]


In [79]:
import json

def get_JSON_domains(filename):
  """Collect the distinct email domains found in a JSON file of records.

  The file is expected to hold an iterable of dict-like records; each
  record's ``'Email'`` value is inspected. Records whose email is missing,
  not a string, has no ``@``, or has nothing after the ``@`` are skipped.

  Returns:
      A set of domain strings (the text after the last ``@``, stripped of
      surrounding whitespace).
  """
  with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

  domains = set()  # a set, so each domain is reported once
  for doctor in data:
    email = doctor.get('Email')
    if not isinstance(email, str):
      continue
    # The domain is whatever follows the LAST "@": a quoted local part may
    # itself contain "@", so splitting on the first one picks the wrong piece.
    _, at_sign, domain = email.rpartition('@')
    domain = domain.strip()
    if at_sign and domain:
      domains.add(domain)

  return domains

In [80]:
def get_domains(filepath):
  """Return the set of distinct, stripped, non-blank lines in ``filepath``."""
  with open(filepath, 'r') as file:
    stripped_lines = (line.strip() for line in file)
    # Drop blank lines: otherwise an empty string ends up in the set and is
    # later treated as if it were a domain.
    return {entry for entry in stripped_lines if entry}

# Download HTML

In [81]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def download_aero(domains):
  """Fetch the aeroleads result page for each domain and save it as HTML.

  Domains that already have a file under ``./HTML/aeroleads`` are skipped.
  For the rest, a Chrome WebDriver session submits each domain through the
  page's ``domain_input`` field and stores the resulting page source via
  ``save_html``. The number of domains left to fetch is printed first.

  Args:
      domains: Iterable of domain names to look up.
  """
  # Skip every domain that already has a saved HTML file from an earlier run.
  HTML_domains = get_finished_HTML_domains('./HTML/aeroleads')
  domain_list = dedup(HTML_domains, domains)

  print(len(domain_list))
  if not domain_list:
    # Nothing left to fetch, so there is no reason to start a browser.
    return

  # Set up the WebDriver (Chrome).
  driver = webdriver.Chrome()
  try:
    driver.get("https://aeroleads.com/email-finder")

    # Click the second tab of the page before searching.
    # NOTE(review): presumably this switches to the domain search form --
    # confirm against the live page.
    button = driver.find_element(By.XPATH, '//*[@id="kt_app_content_container"]/ul/li[2]/a')
    button.click()

    # Fixed wait for the page to settle after the click.
    time.sleep(5)

    for domain_name in domain_list:
      # Locate the input field by its ID and clear the previous query.
      input_field = driver.find_element(By.ID, "domain_input")
      input_field.clear()
      input_field.send_keys(domain_name)

      # Submit the query by pressing Enter.
      input_field.send_keys(Keys.RETURN)

      # Fixed wait for the results to render before capturing the page.
      time.sleep(2)

      html_content = driver.page_source
      save_html('aeroleads', domain=domain_name, html_content=html_content)
  finally:
    # Always shut the browser down, even when a lookup or save fails midway.
    driver.quit()

In [82]:
def extract_aero(html_directory, finished_file):
    """
    Extracts table data from the HTML files in a directory, skipping domains already listed in the finished file.

    Each ``*.html`` file whose name (minus the suffix) is not yet recorded in
    ``finished_file`` is parsed with ``extractData``; any rows found are
    appended to the ``aeroleads`` CSV via ``toCSV``, and the domain is then
    appended to ``finished_file`` whether or not rows were found.

    Args:
    html_directory (str): The directory containing HTML files.
    finished_file (str): Path to the file containing already processed domains.
        It is created if it does not exist.

    Returns:
    list: List of domains that have been processed during this call.
    """
    # Opening in append mode creates the file when it is missing and leaves
    # any existing content untouched.
    with open(finished_file, 'a'):
        pass

    # Read finished domains from the file
    with open(finished_file, 'r') as file:
        finished_domains = {line.strip() for line in file}

    processed = []
    for filename in os.listdir(html_directory):
        if not filename.endswith('.html'):
            continue

        # Extract domain from filename (removing the .html extension)
        domain = filename[:-5]
        if domain in finished_domains:
            continue

        # Decode as UTF-8 and replace undecodable bytes so one oddly encoded
        # page cannot abort the whole run.
        html_path = os.path.join(html_directory, filename)
        with open(html_path, 'r', encoding='utf-8', errors='replace') as html_file:
            content = html_file.read()

        data = extractData(content)
        if data is not None:
            toCSV('aeroleads', data)

        # Record progress in the caller-supplied file (not a hard-coded name),
        # so the next run reads back what this run wrote.
        with open(finished_file, 'a') as progress_log:
            progress_log.write(domain + '\n')
        processed.append(domain)

    return processed

In [83]:
def process_domains(filepath):
  """Run the full pipeline for the domains listed in ``filepath``.

  Reads the domains, normalizes them, downloads the aeroleads page for each
  one, then extracts the saved pages in ``./HTML/aeroleads`` while tracking
  progress in ``./finished.txt``.
  """
  raw_domains = get_domains(filepath)

  # Different inputs can normalize to the same domain; collapse them so each
  # one is fetched once, drop empty results, and sort so the processing order
  # is the same on every run.
  normalized = {normalize_domain(domain) for domain in raw_domains}
  domains = sorted(domain for domain in normalized if domain)

  download_aero(domains)
  extract_aero('./HTML/aeroleads', './finished.txt')