In [None]:
# Import Required Libraries
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service

In [1]:
def extract_data(browser):
    """Return the text of the page currently loaded in ``browser``.

    Parses ``browser.page_source`` with BeautifulSoup's ``html.parser`` and
    returns a dict with a single key, ``'content'``. Its value is the text of
    the ``<body>`` element (fragments stripped and joined with newlines), or
    the placeholder ``"No body content found"`` when ``find('body')`` yields
    nothing.
    """
    parsed_page = BeautifulSoup(browser.page_source, 'html.parser')
    body_tag = parsed_page.find('body')

    # Guard clause: nothing to extract, hand back the placeholder text.
    if not body_tag:
        return {'content': "No body content found"}

    return {'content': body_tag.get_text(separator='\n', strip=True)}

In [2]:
# Configuration
# All scraper settings live in this one mapping; adjust the values to your
# own environment before running the cells below.
config = dict(
    # CSV file with one ID per row (emails are used as IDs here).
    input_csv="input_emails.csv",
    # CSV file the scraped text is written to.
    output_csv="output.csv",
    # Example URL - replace with the page you need scraped; "{}" is filled
    # with the ID taken from the input file.
    start_url="https://my.xyz.in/default.aspx?UserId={}",
    # Example URL - update if required, or keep it the same as start_url.
    target_url="https://my.xyz.in/details.aspx",
    # Path to your chromedriver executable.
    chromedriver_path="/Users/chromedriver_mac_arm64/chromedriver",
    # host:port of the Tor SOCKS proxy - set if required; Tor must be installed.
    tor_proxy="127.0.0.1:9150",
)

In [None]:
# Set up Chrome options
chrome_options = Options()

# Route traffic through the Tor SOCKS5 proxy only when one is configured.
# The proxy is optional: with an empty "tor_proxy" value the flag would come
# out as the malformed "--proxy-server=socks5://", and with the key removed
# the lookup would raise KeyError, so both cases now simply skip the flag.
tor_proxy = config.get('tor_proxy')
if tor_proxy:
    chrome_options.add_argument(f"--proxy-server=socks5://{tor_proxy}")

# Create a Chrome browser instance with the configured options.
# `browser` is the shared driver used by the scraping cell further down.
s = Service(executable_path=config['chromedriver_path'])
browser = webdriver.Chrome(service=s, options=chrome_options)

In [None]:
# Scrape the target page once for every ID in the input CSV and store the
# extracted text next to that ID in the output CSV.
#
# The whole run is wrapped in try/finally so the Chrome instance is always
# shut down, even when a file cannot be opened or the run is interrupted.
try:
    # newline='' on both files is what the csv module asks for. The output is
    # written as UTF-8 so scraped text with non-ASCII characters cannot fail
    # to encode under a narrower platform-default encoding.
    with open(config['input_csv'], 'r', newline='') as input_file, \
            open(config['output_csv'], 'w', newline='', encoding='utf-8') as output_file:
        csv_reader = csv.reader(input_file)
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(['email', 'fee_structure_output'])  # Write the header row

        # The target URL is the same for every row, so look it up once.
        target_url = config['target_url']

        for row in csv_reader:
            # csv.reader yields [] for blank lines; skip those (and rows whose
            # first cell is empty) instead of crashing the run on row[0].
            if not row or not row[0].strip():
                continue

            email = row[0]
            # NOTE(review): the ID is inserted into the URL verbatim; IDs
            # containing characters such as '+', '&' or '#' may need
            # urllib.parse.quote - confirm against the target site.
            start_url = config['start_url'].format(email)

            try:
                # Navigate to the start_url first (presumably this sets up the
                # per-ID session the target page relies on - TODO confirm).
                browser.get(start_url)

                # Navigate to the target page and scrape the content
                browser.get(target_url)
                page_data = extract_data(browser)
                page_content = page_data['content']
                print(f"Page Content for {email}: {page_content}")

                csv_writer.writerow([email, page_content])
            except Exception as e:
                # Best effort: report the failure and carry on with the next ID.
                print(f"Error occurred for {email}: {e}")
finally:
    browser.quit()