In [None]:
%pip install selenium

In [None]:
from selenium import webdriver  
from selenium.webdriver.edge.options import Options  
from selenium.webdriver.common.by import By  
from selenium.webdriver.support.ui import WebDriverWait  
from selenium.webdriver.support import expected_conditions as EC  
import pandas as pd  
from datetime import datetime  
import time  
import os

# Code to crawl data for:
- Silver
- Crude Oil
- SP500
- Russell 2000 Index
- Platinum
- Copper
- Dollar Index
- CBOE Volatility Index
- MSCI EM ETF
- EuroUSD
- NASDAQ Composite (IXIC)
- Nuclear Energy Index
(VanEck Uranium+Nuclear Energy ETF (NLR) - nuclear energy)
- Rice

In [None]:
def get_data(url, start_date_input, end_date_input, column_name='close',
             wait_timeout=30, scroll_pause=5):
    """Scrape the date and close columns of a historical-price table.

    Opens ``url`` in Microsoft Edge through Selenium, types the requested
    date range into the page's date picker, scrolls to the table footer
    until the number of table rows stops growing, and then reads the first
    and fifth cell of every row that has at least five cells.

    Parameters
    ----------
    url : str
        Page to open. The XPaths used here target the
        ``Col1-1-HistoricalDataTable-Proxy`` layout.
    start_date_input, end_date_input : datetime.datetime
        Bounds of the range, typed into the picker as ``DD-MM-YYYY``.
        NOTE(review): whether the picker expects day-first input presumably
        depends on the browser locale -- confirm.
    column_name : str, default 'close'
        Name of the output column that holds the close value.
    wait_timeout : float, default 30
        Maximum number of seconds to wait for each control to become
        clickable.
    scroll_pause : float, default 5
        Seconds to pause after applying the date range and after every
        scroll, giving the page time to append more rows.

    Returns
    -------
    pandas.DataFrame
        Column ``'date'`` (string formatted ``DD/MM/YYYY``) and column
        ``column_name`` (the close cell's text, unparsed). When no row
        qualifies, an empty frame that still carries both columns.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If a date-picker control is not clickable within ``wait_timeout``.
    ValueError
        If a date cell does not match the ``'%b %d, %Y'`` format.
    """
    start_time = time.time()  # Used for the timing report in ``finally``

    # Shared XPath prefixes for the controls and the table.
    section = '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section'
    menu = '//*[@id="dropdown-menu"]/div'
    rows_xpath = section + '/div[2]/table/tbody/tr'
    footer_xpath = section + '/div[2]/table/tfoot/tr'

    options = Options()
    options.add_experimental_option('detach', True)
    driver = webdriver.Edge(options=options)

    try:
        driver.get(url)
        wait = WebDriverWait(driver, wait_timeout)

        def wait_clickable(xpath):
            # Explicit wait: the dropdown controls are rendered only after
            # the picker is clicked, so an immediate lookup can race them.
            return wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

        def set_date_and_apply(start_date, end_date):
            wait_clickable(section + '/div[1]/div[1]/div[1]/div/div/div[1]').click()

            start_input = wait_clickable(menu + '/div[1]/input')
            start_input.clear()
            start_input.send_keys(start_date.strftime('%d-%m-%Y'))

            end_input = wait_clickable(menu + '/div[2]/input')
            end_input.clear()
            end_input.send_keys(end_date.strftime('%d-%m-%Y'))

            # First the button inside the dropdown, then the button next to
            # the picker that reloads the table.
            wait_clickable(menu + '/div[3]/button[1]').click()
            wait_clickable(section + '/div[1]/div[1]/button').click()

        set_date_and_apply(start_date_input, end_date_input)

        # Scroll to the footer repeatedly; stop once a scroll adds no rows.
        time.sleep(scroll_pause)
        previous_length = 0
        while True:
            bottom = driver.find_element(By.XPATH, footer_xpath)
            driver.execute_script("arguments[0].scrollIntoView(true);", bottom)
            time.sleep(scroll_pause)

            current_length = len(driver.find_elements(By.XPATH, rows_xpath))
            if current_length == previous_length:
                break
            previous_length = current_length

        data = []
        for row in driver.find_elements(By.XPATH, rows_xpath):
            cells = row.find_elements(By.TAG_NAME, 'td')
            if len(cells) < 5:
                # Rows with fewer cells carry no close value.
                continue
            numerical_date = datetime.strptime(cells[0].text, '%b %d, %Y').strftime('%d/%m/%Y')
            data.append({'date': numerical_date, column_name: cells[4].text})

        if not data:
            # Keep the schema stable so callers can still address the columns.
            return pd.DataFrame(columns=['date', column_name])
        return pd.DataFrame(data)
    finally:
        execution_time = time.time() - start_time
        print(f"Execution time: {execution_time} seconds")
        driver.quit()  # Always close the browser, even after a failure

# General
Set the start and end dates of the period to crawl.

In [None]:
# Inclusive bounds of the crawl window, passed to get_data for every series.
start = datetime(year=2010, month=1, day=4)
end = datetime(year=2023, month=11, day=20)

In [None]:
# Every history page shares one URL shape; only the (already URL-encoded)
# quote symbol differs, and it appears twice in the address.
HISTORY_URL_TEMPLATE = 'https://finance.yahoo.com/quote/{symbol}/history?p={symbol}'

# Output column / CSV name -> URL-encoded quote symbol ('%3D' is '=', '%5E' is '^').
# NOTE(review): 'MSCI' looks like the ticker of the MSCI Inc. stock; if an
# emerging-markets ETF was intended, the symbol probably needs changing -- confirm.
QUOTE_SYMBOLS = {
    'silver': 'SI%3DF',
    'crude_oil': 'CL%3DF',
    'SP500': '%5EGSPC',
    'RUT': '%5ERUT',
    'platinum': 'PL%3DF',
    'copper': 'HG%3DF',
    'DXY': 'DX-Y.NYB',
    'VIX': '%5EVIX',
    'MSCI': 'MSCI',
    'EURUSD': 'EURUSD%3DX',
    'NASDAQ': '%5EIXIC',
    'NLR': 'NLR',
    'rice': 'ZR%3DF',
}

# Series name -> history page URL, in the order the series are crawled.
data_urls = {}
for series_name, quote_symbol in QUOTE_SYMBOLS.items():
    data_urls[series_name] = HISTORY_URL_TEMPLATE.format(symbol=quote_symbol)

In [None]:
# Crawl each series once. An existing '<name>.csv' acts as the cache, so a
# re-run only fetches the series that are still missing.
for column_name, url in data_urls.items():
    file_path = f'{column_name}.csv'
    if os.path.exists(file_path):
        continue

    df = get_data(url, start, end, column_name=column_name)
    if df.empty:
        # Writing an empty file would satisfy the cache check above and the
        # series would never be retried; report it and leave it missing.
        print(f"No rows scraped for {column_name}; {file_path} not written")
        continue

    df.to_csv(file_path, index=False)