In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime
import re

# Ticker symbols to process; the driver loop opens one page and writes one CSV per symbol.
vn30_stock_names = [
    "ACB", "BID", "BVH", "CTG", "FPT", "GAS", "HPG", "MBB", "MSN", "MWG",
    "NVL", "PDR", "PLX", "PNJ", "POW", "SAB", "SSI", "STB", "TCB", "TPB",
    "VHM", "VIB", "VCB", "VIC", "VJC", "VNM", "VPB", "VRE", "HDB", "SHB",
]

In [2]:
def scrape_current_page(is_first_page, driver):
    """Return the cell text of the trading-result table on the current page.

    Args:
        is_first_page: When true, the first table row (the column titles) is
            kept as the first item of the result; otherwise it is dropped.
        driver: Selenium WebDriver that is already showing the statistics page.

    Returns:
        A list of rows, each a list of whitespace-stripped cell strings.
        The final table row (the footer) is never included, and the last
        ``td`` of every row is discarded.
    """
    result_table = driver.find_element(
        By.ID, "ctl00_webPartManager_wp425243205_wp378545232_cbTradingResult"
    )
    table_rows = result_table.find_elements(By.TAG_NAME, "tr")

    # The title row is only wanted on page 1; the trailing row is the footer.
    first_kept = 0 if is_first_page else 1
    wanted_rows = table_rows[first_kept:len(table_rows) - 1]

    return [
        [cell.text.strip() for cell in tr.find_elements(By.TAG_NAME, "td")[:-1]]
        for tr in wanted_rows
    ]


def get_total_pages(driver):
    """Read the pager footer and return ``(total_pages, total_rows)``.

    ``total_pages`` is the number that follows "of" in the footer text
    (e.g. "Page 1 of 229 Next>"). ``total_rows`` is the fourth argument of
    the ``GoPage(...)`` call found in the onclick attribute of the first link
    whose onclick mentions ``GoPage``.

    Args:
        driver: Selenium WebDriver that is already showing the statistics page.

    Returns:
        A ``(total_pages, total_rows)`` tuple of ints.

    Raises:
        ValueError: If the footer text has no "of <number>" part, or the
            onclick attribute is not of the form
            ``GoPage('<text>', '<text>', <int>, <int>)``.
    """
    # Locate the pagination cell and take its text.
    pagination_td = driver.find_element(By.XPATH, "//td[contains(., 'Page') and contains(., 'of')]")
    footer_text = pagination_td.text.strip()

    # The total page count is the number after "of".
    total_pages_match = re.search(r"of\s+(\d+)", footer_text)
    if not total_pages_match:
        raise ValueError("Couldn't extract total pages from footer text: " + footer_text)
    total_pages = int(total_pages_match.group(1))

    # Locate the paging link; a missing attribute is treated as empty text so
    # the failure below is a ValueError rather than a TypeError.
    next_button = driver.find_element(By.XPATH, "//a[contains(@onclick, 'GoPage')]")
    onclick_value = next_button.get_attribute("onclick") or ""

    # Fourth GoPage argument. A trailing ";" after the call is not required.
    total_rows_match = re.search(
        r"GoPage\(\s*'[^']+'\s*,\s*'[^']+'\s*,\s*\d+\s*,\s*(\d+)\s*\)", onclick_value
    )
    if not total_rows_match:
        raise ValueError("Couldn't extract total rows from onclick attribute: " + onclick_value)
    total_rows = int(total_rows_match.group(1))

    return total_pages, total_rows

In [3]:
# Common prefix of the element ids used by the trading-statistics controls on the page.
_CONTROL_ID_PREFIX = "ctl00_webPartManager_wp425243205_wp378545232_"

# Selects a date on the calendar widget object stored under window[arguments[0]].
# arguments[1..3] are year, zero-based month and day; when the year is null the
# browser's current date (new Date()) is used instead.
_SELECT_CALENDAR_DATE_JS = """
var calendar = window[arguments[0]];
if (calendar && calendar.setSelectedDate) {
    var selectedDate = (arguments[1] == null)
        ? new Date()
        : new Date(arguments[1], arguments[2], arguments[3]);  // Year, Month (0-based), Day
    calendar.setSelectedDate(selectedDate);
    if (calendar.render) {
        calendar.render();
    } else {
        console.error("render method not found on calendar widget.");
    }
} else {
    console.error("Calendar widget or setSelectedDate method not found.");
}
"""


def _set_calendar_date(driver, picker_id, date_text, js_date_parts=None):
    """Write a date into one date picker and return its (hidden, visible) inputs.

    Args:
        driver: Selenium WebDriver showing the statistics page.
        picker_id: Id of the picker; also the name of the widget's window global.
            The hidden input is ``picker_id + "_selecteddates"`` and the visible
            input is ``picker_id + "_picker"``.
        date_text: Text assigned to the ``value`` of both inputs.
        js_date_parts: ``(year, zero_based_month, day)`` handed to the widget's
            ``setSelectedDate``; ``None`` selects the browser's current date.
    """
    hidden_input = driver.find_element(By.ID, picker_id + "_selecteddates")
    visible_input = driver.find_element(By.ID, picker_id + "_picker")

    # The text travels as a script argument, so it needs no JavaScript quoting.
    for field in (hidden_input, visible_input):
        driver.execute_script("arguments[0].value = arguments[1];", field, date_text)

    # Update the calendar widget's own selection as well as the input values.
    year, month_index, day = js_date_parts or (None, None, None)
    driver.execute_script(_SELECT_CALENDAR_DATE_JS, picker_id, year, month_index, day)

    # Trigger events to ensure the website recognizes the change.
    for field in (hidden_input, visible_input):
        for event_name in ("input", "change"):
            driver.execute_script(
                "arguments[0].dispatchEvent(new Event(arguments[1]));", field, event_name
            )

    return hidden_input, visible_input


def get_all_data(driver, stock, start_date="28/07/2000"):
    """Scrape every result page for ``stock`` and save the rows to a CSV file.

    Sets the start and end date pickers (end date is today), clicks "View",
    collects the rows of page 1 and then of each following page, and writes
    ``../vn_30_list/{stock}_historical_data.csv`` using the table's title row
    as the column names.

    Args:
        driver: Selenium WebDriver already showing the stock's statistics page.
            It is always closed before this function returns or raises.
        stock: Ticker symbol; only used to build the output file name.
        start_date: First day to request, as DD/MM/YYYY.

    Raises:
        ValueError: If ``start_date`` is not DD/MM/YYYY or page 1 yields no rows.
        Other errors from Selenium, ``get_total_pages`` or pandas propagate
        unchanged, after the browser has been closed.
    """
    try:
        # Both the input text and the widget date come from the same parsed value.
        start_day = datetime.strptime(start_date, "%d/%m/%Y")
        start_date_hidden_input, start_date_visible_input = _set_calendar_date(
            driver,
            _CONTROL_ID_PREFIX + "dtStartDate_picker",
            start_date,
            (start_day.year, start_day.month - 1, start_day.day),
        )

        end_date = datetime.today().strftime("%d/%m/%Y")  # Format: DD/MM/YYYY
        end_date_hidden_input, end_date_visible_input = _set_calendar_date(
            driver, _CONTROL_ID_PREFIX + "dtEndDate_picker", end_date
        )

        # Debugging: print the values the inputs hold after the updates.
        read_value_js = "return arguments[0].value;"
        start_date_hidden_value = driver.execute_script(read_value_js, start_date_hidden_input)
        start_date_visible_value = driver.execute_script(read_value_js, start_date_visible_input)
        end_date_hidden_value = driver.execute_script(read_value_js, end_date_hidden_input)
        end_date_visible_value = driver.execute_script(read_value_js, end_date_visible_input)
        print("Start Date (Hidden):", start_date_hidden_value)
        print("Start Date (Visible):", start_date_visible_value)
        print("End Date (Hidden):", end_date_hidden_value)
        print("End Date (Visible):", end_date_visible_value)

        # Click the "View" button through JavaScript to refresh the data.
        view_button = driver.find_element(By.ID, _CONTROL_ID_PREFIX + "btnView")
        driver.execute_script("arguments[0].click();", view_button)

        # NOTE(review): the waited-for element is the same one scraped below, so if it
        # is already in the DOM this wait may return before the refresh finishes — confirm.
        result_locator = (By.ID, _CONTROL_ID_PREFIX + "cbTradingResult")
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located(result_locator))
            print("Data loaded successfully.")
        except Exception as e:
            # Best effort: report and carry on; the scrape below fails if the table is absent.
            print("Data did not load:", e)

        all_data = []

        # Page 1 carries the title row, which becomes the column names.
        first_page_data = scrape_current_page(is_first_page=True, driver=driver)
        if not first_page_data:
            raise ValueError(f"No rows found on the first result page for {stock}")
        title_row = first_page_data[0]
        all_data.extend(first_page_data[1:])

        total_pages, total_rows = get_total_pages(driver)
        print(f"Total pages to scrape: {total_pages}")
        print(f"Total rows to scrape: {total_rows}")

        # GoPage's third argument is a row offset that advances by 20 per page.
        offset = 20
        start_date_formatted = start_date_hidden_value.replace('.', '-')
        end_date_formatted = end_date_hidden_value.replace('.', '-')

        for page in range(2, total_pages + 1):  # Page 1 is already scraped
            try:
                # Re-point the paging link at the wanted offset, then click it.
                next_button = driver.find_element(By.XPATH, "//a[contains(@onclick, 'GoPage')]")
                onclick_value = f"GoPage('{start_date_formatted}', '{end_date_formatted}', {offset}, {total_rows});"
                driver.execute_script(
                    "arguments[0].setAttribute('onclick', arguments[1]);", next_button, onclick_value
                )
                driver.execute_script("arguments[0].click();", next_button)

                WebDriverWait(driver, 10).until(EC.presence_of_element_located(result_locator))

                all_data.extend(scrape_current_page(is_first_page=False, driver=driver))
                print(f"Scraped data from page {page}.")

                offset += 20

            except Exception as e:
                # NOTE(review): stopping here still saves the pages collected so far,
                # so the CSV can be incomplete — check the printed message.
                print("No more pages or an error occurred:", e)
                break

        df = pd.DataFrame(all_data, columns=title_row)  # Title row as column headers

        df.to_csv(f"../vn_30_list/{stock}_historical_data.csv", index=False)
        print(f"Data saved to {stock}_historical_data.csv")
    finally:
        # Close the browser on failure too, so an error does not leave it running.
        driver.quit()

In [4]:
def get_latest_data(driver, stock):
    """Merge the rows shown on the current page into the stock's saved CSV.

    Reads ``../vn_30_list/{stock}_historical_data.csv``, scrapes the page the
    driver is currently showing, puts the scraped rows ahead of the saved
    ones, keeps the first row for each ``Date``, renumbers an ``Index`` column
    from 0 and writes the file back.

    Args:
        driver: Selenium WebDriver already showing the stock's statistics page.
            It is always closed before this function returns or raises.
        stock: Ticker symbol; only used to build the CSV file name.

    Raises:
        ValueError: If the page yields no rows at all.
        Errors from reading the CSV, Selenium or pandas propagate unchanged,
        after the browser has been closed.
    """
    csv_path = f"../vn_30_list/{stock}_historical_data.csv"
    try:
        saved_df = pd.read_csv(csv_path)

        first_page_data = scrape_current_page(is_first_page=True, driver=driver)
        if not first_page_data:
            raise ValueError(f"No rows found on the result page for {stock}")

        # The first scraped row holds the column titles.
        title_row = first_page_data[0]
        first_page_df = pd.DataFrame(first_page_data[1:], columns=title_row)

        combined_df = (
            # Scraped rows go first so keep="first" prefers them over saved rows.
            pd.concat([first_page_df, saved_df], ignore_index=True)
            .drop_duplicates(subset=["Date"], keep="first")
            # An 'Index' column exists only if the file was written by this function before.
            .drop(columns=["Index"], errors="ignore")
            .reset_index(drop=True)
        )
        combined_df.insert(0, "Index", combined_df.index)

        combined_df.to_csv(csv_path, index=False)
        print(f"Data saved to {stock}_historical_data.csv")
    finally:
        # Close the browser on failure too, so an error does not leave it running.
        driver.quit()

In [6]:
for stock in vn30_stock_names:
    # One fresh browser per ticker.
    driver = webdriver.Chrome()  # Make sure chromedriver is in your PATH
    try:
        driver.get(f"http://en.stockbiz.vn/Stocks/{stock}/TradingStatistics.aspx")

        # This does not pause here: it sets how long later element lookups
        # keep retrying before they give up.
        driver.implicitly_wait(10)

        # Wait for the start-date picker element to be present before scraping.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "ctl00_webPartManager_wp425243205_wp378545232_dtStartDate_picker"))
        )

        # For a full historical backfill call get_all_data(driver, stock) instead.
        get_latest_data(driver, stock)
    except Exception:
        # If anything fails, close the browser before re-raising so it is not
        # left running. The scraper may already have closed it; a second
        # quit() on a Chrome driver is expected to be harmless — confirm if
        # another browser driver is used.
        driver.quit()
        raise

python(77510) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(77571) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to ACB_historical_data.csv


python(77598) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(77659) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to BID_historical_data.csv


python(77685) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(77747) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to BVH_historical_data.csv


python(77773) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(77834) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to CTG_historical_data.csv


python(77855) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(77917) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to FPT_historical_data.csv


python(77958) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78026) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to GAS_historical_data.csv


python(78054) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78127) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to HPG_historical_data.csv


python(78158) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78226) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to MBB_historical_data.csv


python(78285) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78356) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to MSN_historical_data.csv


python(78426) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78494) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to MWG_historical_data.csv


python(78568) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78631) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to NVL_historical_data.csv


python(78697) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78765) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to PDR_historical_data.csv


python(78822) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78886) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to PLX_historical_data.csv


python(78909) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(78974) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to PNJ_historical_data.csv


python(79011) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79077) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to POW_historical_data.csv


python(79141) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79208) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to SAB_historical_data.csv


python(79290) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79369) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to SSI_historical_data.csv


python(79397) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79463) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to STB_historical_data.csv


python(79579) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79649) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to TCB_historical_data.csv


python(79741) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79819) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to TPB_historical_data.csv


python(79895) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79962) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VHM_historical_data.csv


python(80029) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(80094) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VIB_historical_data.csv


python(80181) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(80246) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VCB_historical_data.csv


python(80325) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(80392) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VIC_historical_data.csv


python(80443) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(80505) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VJC_historical_data.csv


python(80604) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(80675) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VNM_historical_data.csv


python(80800) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(80866) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VPB_historical_data.csv


python(80952) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(81014) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to VRE_historical_data.csv


python(81073) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(81141) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to HDB_historical_data.csv


python(81240) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(81307) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Data saved to SHB_historical_data.csv


In [9]:
# Spot-check one of the saved files: load the SSI CSV and show its first 60 rows.
df = pd.read_csv("../vn_30_list/SSI_historical_data.csv")
df.head(60)

Unnamed: 0,Index,Date,Close,Buy Count,Buy Volume,Sell Count,Sell Volume,Buy-Sell Volume,Deal Volume,Deal Value (Unit: 1000 VND)
0,0,5/9/2025,22.95,7834,24808871,7077,30748056,-5939185,15341500,353759772
1,1,5/8/2025,23.1,9563,29189179,7503,31796948,-2607769,18519000,426184196
2,2,5/7/2025,22.85,12191,26918294,5194,32641882,-5723588,18887200,433144865
3,3,5/6/2025,23.15,6222,30381664,9161,38574990,-8193326,16475000,505719402
4,4,5/5/2025,23.1,5273,14396321,6147,22029422,-7633101,9314500,216015281
5,5,4/29/2025,22.95,6280,20534819,6099,23284381,-2749562,9778300,240553030
6,6,4/28/2025,23.05,7252,27699715,6931,29308723,-1609008,12060400,288378726
7,7,4/25/2025,22.8,9768,28647873,9710,37179226,-8531353,15312300,366336823
8,8,4/24/2025,23.0,8825,33731253,10181,40503503,-6772250,17946000,422097261
9,9,4/23/2025,22.75,6823,23505303,8804,31087931,-7582628,13492900,307374461


In [8]:
# Show the dtype pandas inferred for each column of `df`.
df.dtypes

Index                            int64
Date                            object
Close                          float64
Buy Count                       object
Buy Volume                      object
Sell Count                      object
Sell Volume                     object
Buy-Sell Volume                 object
Deal Volume                     object
Deal Value (Unit: 1000 VND)     object
dtype: object