In [None]:
import time
import os
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

# Chrome options shared by the WebDriver that main() starts.
options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--headless",  # run without a visible browser window (for speed)
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_chrome_flag)

#Filter1
# Function to get all company codes from the dropdown list
def get_company_codes(driver):
    """Return the company codes offered by the symbol-history dropdown.

    Loads the history page of a known symbol (ADIN), waits until the
    ``#Code`` dropdown has at least one ``<option>`` in the DOM, and reads
    the option values from the page source.

    Args:
        driver: Selenium WebDriver used to load the page.

    Returns:
        list[str]: Every non-empty option value that contains no digit.
        (Presumably codes with digits are non-share instruments such as
        bonds -- confirm against the site.)

    Raises:
        TimeoutException: If no option shows up within the wait timeout.
    """
    url = "https://www.mse.mk/mk/stats/symbolhistory/ADIN"
    driver.get(url)

    # Wait for exactly what the parsing below needs (options present in the
    # DOM) rather than sleeping for a fixed time and then checking only the
    # <select> element itself.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#Code option"))
    )

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # .get() so that an <option> without a value attribute is skipped
    # instead of raising KeyError.
    values = (option.get('value') for option in soup.select('#Code option'))
    company_codes = [
        value for value in values
        if value and not re.search(r'\d', value)
    ]

    print(f"Retrieved {len(company_codes)} company codes.")
    return company_codes

#Filter2
# Function to get the latest date available in the CSV file for each company
def get_latest_date_for_company(code):
    """Return the most recent date stored in ``<code>.csv``, or None.

    The file is looked up relative to the current working directory and its
    ``Датум`` column is parsed day-first.

    Args:
        code: Company code; the data file is ``f"{code}.csv"``.

    Returns:
        pandas.Timestamp | None: The maximum date in the file, or None when
        the file does not exist, is empty (zero bytes or header only), or
        contains no usable date.
    """
    csv_file = f"{code}.csv"
    if not os.path.exists(csv_file):
        print(f"No data found for {code}")
        return None

    try:
        df = pd.read_csv(csv_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header line for pandas to parse; treat it
        # the same as a file with a header and no rows.
        df = None

    if df is None or df.empty:
        print(f"CSV EMPTY {code}")
        return None

    latest_date = pd.to_datetime(df['Датум'], dayfirst=True).max()
    if pd.isna(latest_date):
        # All date cells are blank: max() is NaT, which cannot be formatted
        # and would be useless to callers as a start point.
        print(f"No valid dates in CSV for {code}")
        return None

    print(f"{latest_date.strftime('%d.%m.%Y')} for company {code}")
    return latest_date

#Filter3
# Function to update company data
# Column labels of the per-company CSV files: the company code followed by the
# nine cells read from each row of the site's results table.
_HISTORY_COLUMNS = [
    "Код", "Датум", "Цена на последна трансакција", "Мак.", "Мин.",
    "Просечна цена", "%пром.", "Количина", "Промет во БЕСТ во денари",
    "Вкупен промет во денари",
]


def _window_end(current_date, end_date):
    """Return the last day of the request window that starts at current_date."""
    days_left = (end_date - current_date).days
    if days_left <= 7:
        return current_date + timedelta(days=days_left)  # finish in one request
    if days_left <= 30:
        return current_date + timedelta(days=7)  # weekly if under a month gap
    if days_left <= 365:
        return current_date + timedelta(days=30)  # monthly if within a year
    return current_date + timedelta(days=365)  # yearly if over a year


def _parse_result_rows(page_source, code):
    """Return the results-table rows as lists prefixed with code ('0' cells become None)."""
    soup = BeautifulSoup(page_source, 'html.parser')
    body = soup.select_one('#resultsTable tbody')
    if body is None:
        # The table exists but has no <tbody>: nothing to read, not an error.
        return []

    rows = []
    for tr in body.select('tr'):
        texts = [td.get_text(strip=True) for td in tr.select('td')]
        if texts:
            rows.append([code] + [None if text == '0' else text for text in texts])
    return rows


def _merge_rows_into_csv(code, rows):
    """Merge scraped rows into ``<code>.csv``, creating the file if needed."""
    new_data = pd.DataFrame(rows, columns=_HISTORY_COLUMNS)
    new_data['Датум'] = pd.to_datetime(new_data['Датум'], dayfirst=True)

    csv_file = f"{code}.csv"
    existing_data = None
    if os.path.exists(csv_file):
        try:
            existing_data = pd.read_csv(csv_file, parse_dates=["Датум"], dayfirst=True)
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to merge with, so it is overwritten.
            existing_data = None

    if existing_data is not None:
        # Existing rows win on duplicate dates (drop_duplicates keeps the first).
        new_data = (
            pd.concat([existing_data, new_data])
            .drop_duplicates(subset="Датум")
            .sort_values(by="Датум")
        )
    new_data.to_csv(csv_file, index=False, date_format="%d.%m.%Y")


def update_company_data(code, start_date, driver):
    """Scrape price history for one company from start_date up to today.

    The range is fetched in consecutive windows (a year, a month, a week or
    the remaining days, depending on how far the window start is from
    today). For each window the symbol-history page is loaded, the date
    inputs are filled in, the form is submitted and any result rows are
    merged into ``<code>.csv`` in the current working directory.

    Args:
        code: Company code used in the page URL and the CSV file name.
        start_date: First date to request; a value later than today is
            replaced by today.
        driver: Selenium WebDriver used for all page loads.

    Returns:
        None. Results are written to the CSV file and progress is printed.

    Raises:
        Only TimeoutException is handled here; any other Selenium, parsing
        or I/O error propagates to the caller.
    """
    end_date = datetime.today()
    if start_date > end_date:
        print(f"Start date {start_date} is greater than today's date. Adjusting start date to {end_date}.")
        start_date = end_date  # Adjust start_date to today if it's greater than end_date

    current_date = start_date
    print(f"Scraping data for company: {code}")

    url = f"https://www.mse.mk/mk/stats/symbolhistory/{code}"

    # start_date <= end_date here, so the loop body runs at least once.
    while current_date <= end_date:
        to_date = _window_end(current_date, end_date)

        # Format dates for request
        from_date_str = current_date.strftime('%d.%m.%Y')
        to_date_str = to_date.strftime('%d.%m.%Y')

        driver.get(url)

        try:
            from_date_input = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "FromDate")))
            to_date_input = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "ToDate")))

            from_date_input.clear()
            from_date_input.send_keys(from_date_str)
            to_date_input.clear()
            to_date_input.send_keys(to_date_str)

            show_button = driver.find_element(By.CSS_SELECTOR, 'input[value="Прикажи"]')
            show_button.click()

            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#resultsTable")))

            rows = _parse_result_rows(driver.page_source, code)
            if rows:
                _merge_rows_into_csv(code, rows)
                print(f"Data updated for {code} from {from_date_str} to {to_date_str}")

        except TimeoutException:
            print(f"Timeout occurred for {code} while fetching data from {from_date_str} to {to_date_str}.")

            # NOTE(review): a timeout skips ahead -- to today when the window
            # ends in the current year, otherwise 364 days past the window
            # start. Presumably this is meant to hurry past periods without
            # trades, but it also drops any not-yet-requested windows in the
            # skipped span; confirm this is intended.
            if to_date.year == datetime.today().year:
                to_date = datetime.today()
            else:
                to_date = current_date + timedelta(days=364)
            print(f"Moving forward with the next available range: {from_date_str} to {to_date.strftime('%d.%m.%Y')}")

        current_date = to_date + timedelta(days=1)

    print(f"Finished processing {code}. Reached the end date: {end_date.strftime('%d.%m.%Y')}")

# Main function to combine all operations
def main():
    """Update the CSV history of every listed company and report the run time.

    Starts one Chrome WebDriver, reads the company codes from the site and,
    for each code, resumes scraping from the day after the latest date
    already stored in its CSV file, or from ten years (3650 days) back when
    there is no stored date. The driver is always shut down, even if a step
    raises.
    """
    # Start timer
    start_time = time.time()

    driver = webdriver.Chrome(options=options)
    try:
        # Get company codes
        all_codes = get_company_codes(driver)

        for code in all_codes:
            latest_date = get_latest_date_for_company(code)
            if latest_date is not None:
                start_date = latest_date + timedelta(days=1)
            else:
                start_date = datetime.today() - timedelta(days=365 * 10)

            update_company_data(code, start_date, driver)

        # End timer (measured before the browser is shut down)
        execution_time = time.time() - start_time
        print(f"Total execution time: {execution_time:.2f} seconds")
    finally:
        # Without this an exception above would leave the Chrome process running.
        driver.quit()

# Run the full update only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
