In [None]:
!pip install pandas
!pip install selenium
!pip install webdriver_manager
!pip install lxml
!pip install yfinance

In [None]:
from io import StringIO
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Start the Chrome WebDriver session that is handed to the scraper below.
# The value returned by ChromeDriverManager().install() is passed to Service
# (presumably the path of the chromedriver executable - confirm against webdriver_manager).
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [None]:
# One page URL per SET industry group; every URL shares the same prefix and
# differs only in its final path segment.
# NOTE(review): `argo` holds the "agro" page and `resource` holds the "resourc"
# page; the variable names are kept as-is because other cells may refer to them.
argo, consump, fincial, indus, propcon, resource, service, tech = (
    f'https://www.set.or.th/th/market/index/set/{slug}'
    for slug in (
        'agro',
        'consump',
        'fincial',
        'indus',
        'propcon',
        'resourc',
        'service',
        'tech',
    )
)

In [None]:
# Scrape each industry group's table from the SET website
def scrape_industry_data(driver, industry_urls=None, retry_attempts=3, wait_seconds=5):
    """Load every SET industry page and collect one table per industry.

    Args:
        driver: Selenium-style WebDriver. Only ``get``, ``execute_script``,
            ``refresh`` and ``page_source`` are used.
        industry_urls: Optional mapping of industry key -> page URL. Defaults
            to the eight SET industry-group pages.
        retry_attempts: Maximum number of load-and-parse attempts per industry.
        wait_seconds: Pause, in seconds, before and after scrolling the page
            on every attempt.

    Returns:
        dict mapping industry key -> DataFrame holding the second table found
        on the page, with all-empty rows dropped. Industries that never
        produced a non-empty table are left out of the result.
    """
    if industry_urls is None:
        industry_urls = {
            "agro": "https://www.set.or.th/th/market/index/set/agro",
            "consump": "https://www.set.or.th/th/market/index/set/consump",
            "fincial": "https://www.set.or.th/th/market/index/set/fincial",
            "indus": "https://www.set.or.th/th/market/index/set/indus",
            "propcon": "https://www.set.or.th/th/market/index/set/propcon",
            # The key is "resource" but the URL segment is "resourc".
            "resource": "https://www.set.or.th/th/market/index/set/resourc",
            "service": "https://www.set.or.th/th/market/index/set/service",
            "tech": "https://www.set.or.th/th/market/index/set/tech",
        }

    industry_data = {}

    for industry, url in industry_urls.items():
        print(f"Scraping {industry} data from {url}...")
        driver.get(url)

        for attempt in range(1, retry_attempts + 1):
            try:
                # Every retry starts from a fresh page load, regardless of
                # whether the previous attempt raised or just found no data.
                if attempt > 1:
                    driver.refresh()

                # Give the page time to load, scroll down, then wait again
                # before parsing. This also runs after a refresh, so a retry
                # never parses a page that has not been scrolled and waited on.
                sleep(wait_seconds)
                driver.execute_script("window.scrollTo(0, 1500)")
                sleep(wait_seconds)

                # Wrap the markup in StringIO: passing literal HTML to
                # read_html is deprecated since pandas 2.1.
                tables = pd.read_html(StringIO(driver.page_source))

                # NOTE(review): the second table on the page is taken as the
                # data table - confirm against the live page layout.
                data_cleaned = None
                if len(tables) > 1:
                    data_cleaned = tables[1].dropna(how="all")

                if data_cleaned is not None and not data_cleaned.empty:
                    print(data_cleaned)
                    industry_data[industry] = data_cleaned
                    print(f"Successfully scraped {industry} data.")
                    break

                print(f"No data found for {industry}. Retrying... (Attempt {attempt})")

            except Exception as e:
                # Deliberately broad: scraping is best-effort, so one failing
                # attempt is reported and the next attempt is tried.
                print(f"Error scraping {industry} on attempt {attempt}: {e}")
        else:
            # Reached only when no attempt hit `break`.
            print(f"Failed to scrape data for {industry} after {retry_attempts} attempts.")

    return industry_data

# Get the top 5
def get_top_5_by_value(dataframe):
    """Return the five rows with the largest traded value.

    Args:
        dataframe: Table expected to hold a security-name column
            ("หลักทรัพย์") and a traded-value column ("มูลค่า ('000 บาท)").

    Returns:
        A new DataFrame with just those two columns, sorted by value in
        descending order and limited to five rows. Rows whose value cannot be
        read as a number are left out. When the input is empty or lacks either
        column, an empty DataFrame with the same two columns is returned.

    The input frame is never modified.
    """
    name_col = "หลักทรัพย์"
    value_col = "มูลค่า ('000 บาท)"
    wanted = [name_col, value_col]

    # Both columns are selected below, so both must be present.
    if dataframe.empty or any(col not in dataframe.columns for col in wanted):
        return pd.DataFrame(columns=wanted)

    raw_values = dataframe[value_col]
    if not pd.api.types.is_numeric_dtype(raw_values):
        # A text column may carry thousands separators ("1,234.5"); strip them
        # so those rows are not coerced to NaN and silently dropped.
        raw_values = raw_values.astype(str).str.replace(",", "", regex=False)

    # Work on a copy so the caller's frame keeps its original values.
    ranked = dataframe[wanted].copy()
    ranked[value_col] = pd.to_numeric(raw_values, errors="coerce")
    ranked = ranked.dropna(subset=[value_col])

    return ranked.sort_values(by=value_col, ascending=False).head(5)

# Extract the top-5 tables and ticker lists for further input
def extract_top_5_data_with_tickers(source_data):
    """Rank every industry table and build the matching ticker lists.

    Args:
        source_data: Mapping of industry key -> DataFrame, each passed to
            ``get_top_5_by_value``.

    Returns:
        A 2-tuple ``(tables, tickers)``:
        * ``tables`` maps industry key -> the frame returned by
          ``get_top_5_by_value``.
        * ``tickers`` maps industry key -> list of the values in that frame's
          "หลักทรัพย์" column, each with ".BK" appended.
    """
    ranked_by_industry = {}
    for industry, frame in source_data.items():
        ranked_by_industry[industry] = get_top_5_by_value(frame)

    tickers_by_industry = {}
    for industry, ranked in ranked_by_industry.items():
        symbols = ranked["หลักทรัพย์"].tolist()
        # NOTE(review): ".BK" is presumably the exchange suffix expected by the
        # downstream price lookup - confirm against the consuming cell.
        tickers_by_industry[industry] = [f"{symbol}.BK" for symbol in symbols]

    return ranked_by_industry, tickers_by_industry

In [None]:
# Run the scraper with the shared driver; the result maps each industry key to
# the table scraped from that industry's page.
industry_data = scrape_industry_data(driver)

In [None]:
# Rank every industry table once; the same call also yields the ".BK" ticker
# lists, so the ranking does not need to be computed a second time.
top_5_industries, top_5_names_industries = extract_top_5_data_with_tickers(industry_data)

# Print results
print("\n========= TOP 5 with Values =========")
for industry, top_5 in top_5_industries.items():
    print(f"\nTop 5 for {industry}:")
    print(top_5)

# Print the top 5 names with '.BK' appended
print("\n========= TOP 5 with .BK =========")
for industry, names in top_5_names_industries.items():
    print(f"\nTop 5 tickers for {industry}:")
    print(names)