In [None]:
# Import necessary libraries
import math
import os
import re
import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver

In [None]:
# 1. Data Structure Initialization
# Column-oriented store for every scraped report: each key is an output
# column and maps to its own (initially empty) list of values.
compiled_data = {
    column: []
    for column in (
        'Document Date', 'Contributor', 'Headline',
        'Language', 'Pages', 'Tickers', 'Company Names',
        'Category', 'Countries', 'Industries', 'Regions',
        'Subjects', 'Report Styles', 'Author', 'File Name',
    )
}

In [None]:
# 2. Helper Functions

# Function to extract a URL from a script tag using regex
def get_url_from_script(source_text, start_marker, end_marker):
    """Return the text enclosed by two literal markers in ``source_text``.

    Both markers are matched literally (they are regex-escaped), and the
    shortest span following the first usable ``start_marker`` is captured.
    The captured text cannot span a line break because ``.`` is used without
    ``re.DOTALL``.

    Args:
        source_text: Text to search, e.g. the string form of a script tag.
        start_marker: Literal text immediately preceding the wanted value.
        end_marker: Literal text immediately following the wanted value.

    Returns:
        The captured text, or ``None`` when the markers are not found.
    """
    opening = re.escape(start_marker)
    closing = re.escape(end_marker)
    found = re.search(opening + '(.*?)' + closing, source_text)
    return found.group(1) if found else None

# Function to download PDF files
def downloading_pdfs(p, doc_no, output_dir='output', timeout=30):
    """Download the PDF behind a report link and save it as ``<doc_no>.pdf``.

    The link's ``href`` carries the document ID inside parentheses. That ID
    is used to request a loader page; the first ``<script>`` tag of that page
    holds the real PDF address, which is then streamed to disk.

    Args:
        p: Anchor element (or any mapping) exposing the link under ``'href'``.
        doc_no: File name stem used for the saved PDF.
        output_dir: Directory the PDF is written to; created when missing.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        ValueError: If the link is missing, carries no parenthesised document
            ID, or the loader page does not contain the PDF address.
        requests.HTTPError: If either HTTP request returns an error status.
    """
    if p is None:
        raise ValueError(f'No download link available for document {doc_no!r}')

    # Extract document ID from the href attribute (the text inside the parentheses)
    po = re.match(r"^.*\((.*)\).*$", p['href'])
    if po is None:
        raise ValueError(f"No document ID found in link {p['href']!r}")

    # Construct the initial URL for document loading
    url_1 = f'https://www.example.com/investextsearchlive.php?opt=loadDocument&docid=%5B%22{po.group(1)}%22%5D&doctype=pdf'
    # Request the redirection page; fail loudly on HTTP errors instead of parsing an error page
    r = requests.get(url_1, timeout=timeout)
    r.raise_for_status()

    # Parse with the same explicit parser the rest of the notebook uses, so the
    # result does not depend on which optional parsers happen to be installed.
    soup_redirect = bs(r.text, 'html.parser')
    # Find the script tag containing the actual PDF URL
    script = soup_redirect.find('script')
    start_marker = 'href="'
    end_marker = '";</script>'
    # Extract the actual PDF URL using the helper function
    result = get_url_from_script(str(script), start_marker, end_marker)
    if result is None:
        raise ValueError(f'PDF address not found on loader page for document {doc_no!r}')

    # Make sure the target directory exists before opening the file
    os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, f'{doc_no}.pdf')

    # Stream the PDF to disk; the context manager releases the connection afterwards
    with requests.get(result, stream=True, timeout=timeout) as r2:
        # Avoid saving an HTML error page under a .pdf name
        r2.raise_for_status()
        with open(target_path, 'wb') as fd:
            for chunk in r2.iter_content(2000):
                fd.write(chunk)

In [None]:
# 3. Selenium WebDriver Initialization
# Start a Chrome session. The argument is a placeholder and must be replaced
# with the real location of the chromedriver executable.
# NOTE(review): newer Selenium 4 releases appear to expect the driver path via
# a Service object rather than a positional argument — confirm against the
# installed Selenium version.
driver = webdriver.Chrome(r'path/to/chromedriver.exe') # Placeholder path

# Navigation is intentionally left commented out and `url` is not defined in
# the cells shown here, so nothing below loads a page on its own.
# NOTE(review): presumably the search-results page is opened manually in the
# launched browser before the next lines run — confirm; otherwise define `url`
# and enable the call below.
# driver.get(url)

# Snapshot whatever page the browser currently shows and parse it; later
# cells read the report count and the first results table from `soup`.
html = driver.page_source
soup = bs(html, 'html.parser')

In [None]:
# 4. Extracting Number of Reports and Calculating Iterations

# Number of reports the results listing shows per page.
REPORTS_PER_PAGE = 25
# Value stored for any column a report does not provide.
MISSING_VALUE = 'NA'

# Locate the cell announcing how many reports were found. It may be absent
# (e.g. the results page is not loaded), which is treated as "no reports".
found_str_raw = soup.find('td', class_='found')
found_str = found_str_raw.text.strip() if found_str_raw is not None else ''
# Search for the integer number within the string using regex
search_int = re.search(r'\b\d+\b', found_str)

# Process the found number of reports and proceed or exit
if search_int:
    num_of_reports = int(search_int.group())
    # Number of result pages to walk through
    iterations = math.ceil(num_of_reports / REPORTS_PER_PAGE)

    # Print the extracted and calculated values
    print(num_of_reports)
    print(iterations)

    # 5. Main Web Scraping Loop (Iterating through pages and extracting data)
    ticker_str = 'Tickers'
    soup1 = soup  # Page currently being scraped; replaced after each navigation

    for h in range(iterations):
        print(h)  # Print current page index
        # Find the main data table on the page
        tsort_body = soup1.find('table', class_='tablesorter bodyline')
        if tsort_body is None:
            # Keep what was collected so far instead of failing on a None lookup
            print(f'Results table not found on page {h}; stopping early.')
            break

        # Report rows alternate: a base row followed by its additional-data row
        tr = tsort_body.find_all('tr', class_=['even', 'odd'])
        add_data = tr[1::2]
        base_data = tr[::2]
        # Get the number of reports displayed on the current page
        reports_per_pg = len(add_data)

        # Loop through each report on the current page
        for i in range(reports_per_pg):
            # Bold elements in the additional-data row are the field names
            data_header = add_data[i].find_all('b')

            # Only reports whose first field is 'Tickers' (case-insensitive) are kept
            if not data_header:
                continue
            if ticker_str.lower() != data_header[0].text.strip().lower():
                continue

            # Value cells matching the field names above
            data = add_data[i].select('td.leftalignment:not([width])')
            # Reports whose ticker cell contains a slash are skipped
            if not data or data[0].text.strip().count('/') != 0:
                continue

            # Build one complete record per report, pre-filled with the
            # placeholder, so every column grows by exactly one entry no
            # matter how many optional fields the report omits.
            record = {column: MISSING_VALUE for column in compiled_data}
            for j, k in zip(data_header, data):
                column = j.text.strip()
                if column in record:
                    record[column] = k.text.strip()
                else:
                    print(f'Skipping unrecognised field {column!r} on page {h}, row {i}.')

            # Extract data from the 'base data' row
            b_data = base_data[i].find_all('td')
            date = b_data[2].text.strip()
            year = date[-4:]  # Last four characters of the date are used as the year
            record['Document Date'] = date
            record['Contributor'] = b_data[3].text.strip()
            record['Headline'] = b_data[4].text.strip()
            record['Author'] = b_data[5].text.strip()
            record['Language'] = b_data[6].text.strip()
            record['Pages'] = b_data[7].text.strip()

            # Find the PDF download link
            p = base_data[i].find('a', href=True)
            # Separators keep the name unique: plain concatenation would map
            # page 1 / row 12 and page 11 / row 2 to the same file name.
            doc_no = f'{year}_{h}_{i}'
            # Download first so a failed download never leaves a partial record
            downloading_pdfs(p, doc_no)
            record['File Name'] = doc_no

            for column, value in record.items():
                compiled_data[column].append(value)

        # 6. Pagination Logic (Navigating to the next page)
        # Nothing to navigate to after the final page
        if h == iterations - 1:
            break

        # Find pagination links
        pagenav = soup1.find('span', class_='pagenav')
        nxt = pagenav.find_all('a', href=True) if pagenav is not None else []
        if not nxt:
            print(f'No pagination link found on page {h}; stopping early.')
            break
        # With two or more links the second is 'Next'; a single link is 'Next' itself
        nxt_pg_ref = nxt[1] if len(nxt) > 1 else nxt[0]
        # Construct the full URL for the next page
        url1 = 'https://www.example.com/' + nxt_pg_ref['href']
        # Navigate to the next page using Selenium
        driver.get(url1)
        # Get the HTML source of the new page
        html = driver.page_source
        # Update the soup object for the next iteration
        soup1 = bs(html, 'html.parser')

    # 7. Data Export
    # All columns have equal length, so the frame can be built directly; a
    # length mismatch now raises instead of being silently padded.
    df = pd.DataFrame(compiled_data)
    # Export the DataFrame to an Excel file
    df.to_excel("Financial_Analyst_report.xlsx", index=False)

else:
    print("No reports found. Exiting program.")
    sys.exit(1)  # Exit with an error code: no report count could be read from the page