In [65]:
import re
import time
from io import BytesIO
from typing import Dict

import requests
import pdfplumber
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [55]:
# Start Chrome through a single Service object that is actually handed to the
# driver. No executable path is given, so Selenium resolves a matching
# chromedriver itself; to pin a specific binary, pass its path to Service(...).
service = Service()
driver = webdriver.Chrome(service=service)

# Shared explicit-wait helper with a 10 second timeout.
wait = WebDriverWait(driver, 10)

In [56]:
# Page opened in the browser; later cells parse its HTML for the list of reports.
search_url = 'https://www.cse.lk/pages/cse-daily/cse-daily.component.html'

In [57]:
driver.get(search_url)

# Block until the first report entry is present in the DOM instead of sleeping
# for a fixed time: this returns as soon as the list is there and raises
# TimeoutException (after the 10 s configured on `wait`) if it never appears.
# The selector mirrors the container / entry classes the scraping cell looks up.
wait.until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.rules-flexible div.rules-block.flexible')
    )
)

In [58]:
# HTML class names used to locate elements in the parsed page source.
# Container <div> that holds the whole list of reports.
report_list_class_name = 'rules-flexible'
# One <div> per report inside the container.
report_class_name = 'rules-block flexible'
# <div> inside a report entry whose text is shown as the report's date.
date_class_name = 'date'
# <a> inside a report entry whose href points at the report PDF.
button_class_name = 'dropdown-button'

In [59]:
# Regexes applied to the whitespace-normalised PDF text by extract_pdf_data.
# Each has three groups: (1) the index label, (2) and (3) two consecutive
# decimal numbers that may contain thousands separators (e.g. 12,345.67).
# extract_pdf_data stores group 2 as 'Today' and group 3 as 'Previous'.
all_share_price_index_pattern=r'\((ASPI)\)\s+([\d,]+\.\d+)\s+([\d,]+\.\d+)'
sp_sri_lanka_20_index_pattern= r'(S&P Sri Lanka 20 Index)\s+([\d,]+\.\d+)\s+([\d,]+\.\d+)'

In [60]:
def extract_pdf_data(text: str) -> Dict[str, Dict[str, str]]:
    """
    Extracts ASPI and S&P Sri Lanka 20 Index values from the text.

    Runs of whitespace (including newlines) are collapsed to single spaces
    before the module-level index patterns are searched.

    Args:
        text (str): The input text containing financial data. ``None`` or an
            empty / whitespace-only string is tolerated and yields ``{}``.

    Returns:
        Dict[str, Dict[str, str]]: Maps an index name ('ASPI',
        'S&P Sri Lanka 20 Index') to ``{'Today': ..., 'Previous': ...}``,
        with the numbers kept as the strings found in the text. An index
        whose pattern does not match is omitted from the result.
    """
    # Text extraction can produce nothing for a page; report "no data"
    # instead of failing on None.split().
    normalized = ' '.join(text.split()) if text else ''
    if not normalized:
        return {}

    # Result key -> regex; every regex captures (label, today, previous).
    patterns = {
        'ASPI': all_share_price_index_pattern,
        'S&P Sri Lanka 20 Index': sp_sri_lanka_20_index_pattern,
    }

    extracted_data = {}
    for index_name, pattern in patterns.items():
        match = re.search(pattern, normalized)
        if match:
            extracted_data[index_name] = {
                'Today': match.group(2),
                'Previous': match.group(3),
            }

    return extracted_data


In [61]:
def extract_pdf_text_from_url(pdf_url, timeout=30):
    """
    Extract text from the first page of a PDF given its URL.

    Args:
        pdf_url (str): The URL of the PDF file.
        timeout (float | tuple): Seconds to wait for the server before
            giving up, passed straight to ``requests.get``. Defaults to 30.

    Returns:
        str: Text extracted from the first page of the PDF, an empty string
        if that page yields no text, or the literal message
        "No pages found in PDF." when the document has no pages.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Set up headers for the request
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }

    # Fetch the PDF content. Without a timeout a stalled server would block
    # the notebook indefinitely.
    response = requests.get(pdf_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Ensure the request was successful

    # Parse the downloaded bytes in memory and read only the first page.
    with pdfplumber.open(BytesIO(response.content)) as pdf:
        if not pdf.pages:
            return "No pages found in PDF."
        # NOTE(review): extract_text() may return None for a page without
        # text on some pdfplumber versions - normalise to '' so callers
        # always receive a str.
        return pdf.pages[0].extract_text() or ""


In [None]:
# Scrape every report listed on the loaded page. For each entry the PDF link
# is opened in a temporary browser tab to obtain its URL, the PDF is then
# downloaded and parsed, and one record per report is appended to `reports`.
reports = []

try:
    # Get the HTML content of the page
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find the main element containing the list of reports
    shows_element = soup.find('div', attrs={'class': report_list_class_name})
    if shows_element is None:
        raise RuntimeError(
            f"No <div class='{report_list_class_name}'> found on the page; "
            "it may not have finished loading or its layout has changed."
        )
    div_elements = shows_element.find_all('div', attrs={'class': report_class_name})
    print(f"Number of shows found: {len(div_elements)}")

    # Remember the listing tab so we can always return to it.
    main_window = driver.current_window_handle

    for div in div_elements:
        date_element = div.find('div', attrs={'class': date_class_name})
        download_button = div.find('a', attrs={'class': button_class_name})
        if date_element is None or download_button is None or not download_button.get('href'):
            # A malformed entry should not abort the whole run.
            print("Skipping report entry without a date or download link")
            continue
        print(date_element.text)

        # Pass the link as a script argument instead of interpolating it into
        # the JavaScript source, so quotes in the href cannot break (or
        # inject code into) the script.
        driver.execute_script("window.open(arguments[0], '_blank');", download_button['href'])

        # Give the new tab time to open and settle on the PDF's URL.
        time.sleep(10)

        # Pick the tab that is not the listing tab rather than relying on a
        # fixed index into window_handles.
        new_handles = [handle for handle in driver.window_handles if handle != main_window]
        if not new_handles:
            raise RuntimeError("The PDF tab did not open (pop-up blocked?)")
        driver.switch_to.window(new_handles[-1])

        try:
            pdf_url = driver.current_url
            text = extract_pdf_text_from_url(pdf_url)
            result = extract_pdf_data(text)
        finally:
            # Close the PDF tab and switch back to main tab, even when the
            # download or parsing failed, so the browser state stays sane.
            driver.close()
            driver.switch_to.window(main_window)

        reports.append({
            'date': date_element.text,
            'pdf_url': pdf_url,
            'extracted_data': result
        })
finally:
    # End the whole browser session (all windows and the driver process),
    # whether scraping succeeded or not.
    driver.quit()

print(reports)
    

In [64]:
# Column order of the exported table.
report_columns = [
    'Date', 'PDF URL',
    'ASPI Today', 'ASPI Previous',
    'S&P 20 Today', 'S&P 20 Previous',
]


def _index_value(entry, index_name, field):
    """Return one extracted index value, or None if it was not found in the PDF.

    extract_pdf_data omits an index whose pattern did not match, so a plain
    entry['extracted_data'][index_name][field] lookup would raise KeyError.
    """
    return entry['extracted_data'].get(index_name, {}).get(field)


# Prepare data for DataFrame
data_for_df = [
    {
        'Date': entry['date'].strip(),
        'PDF URL': entry['pdf_url'],
        'ASPI Today': _index_value(entry, 'ASPI', 'Today'),
        'ASPI Previous': _index_value(entry, 'ASPI', 'Previous'),
        'S&P 20 Today': _index_value(entry, 'S&P Sri Lanka 20 Index', 'Today'),
        'S&P 20 Previous': _index_value(entry, 'S&P Sri Lanka 20 Index', 'Previous'),
    }
    for entry in reports
]

# Create DataFrame; explicit columns keep the header even when no report
# was scraped.
df = pd.DataFrame(data_for_df, columns=report_columns)

# Define CSV file name
reports_data = "reports_data.csv"

# Save DataFrame to CSV (missing values are written as empty fields)
df.to_csv(reports_data, index=False)
