In [49]:
%pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [50]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
import time
from datetime import datetime
import pandas as pd
import os

# Code to crawl data for:
- Silver
- Crude Oil
- SP500
- Russell 2000 Index
- Platinum
- Copper
- Dollar Index
- CBOE Volatility Index
- MSCI EM ETF
- EuroUSD
- NASDAQ Composite (IXIC)
- Nuclear Energy Index
(VanEck Uranium+Nuclear Energy ETF (NLR) - nuclear energy)
- Rice

In [51]:
def get_data(url, start_date_input, end_date_input, column_name='close',
             wait_seconds=5, timeout=30):
    """Scrape the date and close columns of a Yahoo Finance history table.

    Opens ``url`` in Edge, fills the date-range picker with the requested
    period, scrolls to the table footer until the number of table rows stops
    growing, and then reads the first cell (date) and fifth cell (close) of
    every row that has at least five cells.

    Args:
        url: Address of the history page to open.
        start_date_input: ``datetime`` marking the start of the range.
        end_date_input: ``datetime`` marking the end of the range.
        column_name: Name of the value column in the returned frame; also
            used in the timing message.
        wait_seconds: Fixed pause, in seconds, used to let the page settle
            before interacting and after each scroll (was hard-coded to 5).
        timeout: Maximum number of seconds to wait for each page control to
            become clickable.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date`` (``dd/mm/YYYY``
        strings) and ``column_name`` (floats, NaN where the page shows
        ``-``). Both columns are present even when no row was scraped.

    Raises:
        selenium.common.exceptions.TimeoutException: If a page control does
            not become clickable within ``timeout`` seconds.
        ValueError: If a date or close cell cannot be parsed.
    """
    start_time = time.time()  # Record the start time

    # Edge-specific options: the original handed Chrome's Options class to the
    # Edge driver. The 'detach' experimental option was dropped because the
    # browser is always closed by driver.quit() in the finally block below.
    options = webdriver.EdgeOptions()
    driver = webdriver.Edge(options=options)

    # Shared XPath prefixes; the concatenated locators are identical to the
    # ones that used to be spelled out in full.
    table_xpath = '//*[@id="Col1-1-HistoricalDataTable-Proxy"]/section'
    menu_xpath = '//*[@id="dropdown-menu"]/div'
    rows_xpath = table_xpath + '/div[2]/table/tbody/tr'

    def wait_clickable(xpath):
        """Return the element at ``xpath`` once it is visible and enabled."""
        return WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )

    def set_date_and_apply(start_date, end_date):
        """Type the date range into the picker and apply the filter."""
        # Fixed pause kept from the original; presumably it gives the page
        # scripts time to finish loading before the first click.
        time.sleep(wait_seconds)
        wait_clickable(table_xpath + '/div[1]/div[1]/div[1]/div/div/div[1]').click()

        # Convert datetime objects to strings.
        # NOTE(review): day-month-year order presumably matches what the
        # date inputs expect in the browser's locale - confirm.
        start_date_str = start_date.strftime('%d-%m-%Y')
        end_date_str = end_date.strftime('%d-%m-%Y')

        start_input = wait_clickable(menu_xpath + '/div[1]/input')
        start_input.clear()
        start_input.send_keys(start_date_str)

        end_input = wait_clickable(menu_xpath + '/div[2]/input')
        end_input.clear()
        end_input.send_keys(end_date_str)

        # First button inside the dropdown, then the button next to the
        # picker in the table header (presumably "Done" and "Apply").
        wait_clickable(menu_xpath + '/div[3]/button[1]').click()
        wait_clickable(table_xpath + '/div[1]/div[1]/button').click()

    try:
        driver.get(url)
        set_date_and_apply(start_date_input, end_date_input)

        # Scroll down until no further rows are added to the table.
        time.sleep(wait_seconds)
        previous_length = 0
        while True:
            bottom = driver.find_element(By.XPATH, table_xpath + '/div[2]/table/tfoot/tr')
            driver.execute_script("arguments[0].scrollIntoView(true);", bottom)
            time.sleep(wait_seconds)  # give newly requested rows time to appear

            current_length = len(driver.find_elements(By.XPATH, rows_xpath))
            if current_length == previous_length:
                break
            previous_length = current_length

        records = []
        for row in driver.find_elements(By.XPATH, rows_xpath):
            cells = row.find_elements(By.TAG_NAME, 'td')
            if len(cells) < 5:
                continue  # rows with fewer cells carry no close value

            date_text = cells[0].text
            close_text = cells[4].text
            if close_text == '-':
                # Real NaN instead of the string 'NaN' keeps the column numeric.
                close_value = float('nan')
            else:
                close_value = float(close_text.replace(',', ''))
            numerical_date = datetime.strptime(date_text, '%b %d, %Y').strftime('%d/%m/%Y')

            records.append({'date': numerical_date, column_name: close_value})

        # Explicit columns so an empty scrape still yields the expected schema.
        return pd.DataFrame(records, columns=['date', column_name])
    finally:
        end_time = time.time()  # Record the end time
        execution_time = end_time - start_time
        print(f"Execution time for {column_name}: {execution_time} seconds")
        driver.quit()  # Close the WebDriver

# General
Set the start date and end date of the period to crawl.

In [52]:
# Date range requested from every history page (both at midnight).
start = datetime(year=2010, month=1, day=4)
end = datetime(year=2023, month=11, day=20)

In [53]:
# First batch: output column name -> Yahoo Finance history URL.
# Each URL repeats the (URL-encoded) ticker twice, so it is built from the ticker.
data_urls = {
    name: f'https://finance.yahoo.com/quote/{ticker}/history?p={ticker}'
    for name, ticker in (
        ('silver', 'SI%3DF'),
        ('crude_oil', 'CL%3DF'),
        ('SP500', '%5EGSPC'),
        ('RUT', '%5ERUT'),
    )
}

In [54]:
# Scrape every series in data_urls that has no CSV in the working directory yet.
for column_name, url in data_urls.items():
    file_path = f'{column_name}.csv'
    if os.path.exists(file_path):
        continue  # already downloaded; skip the slow browser session
    df = get_data(url, start, end, column_name=column_name)
    if df.empty:
        # An empty file would satisfy the exists() check above and the series
        # would never be retried, so nothing is written in that case.
        print(f"No rows scraped for {column_name}; {file_path} was not written")
        continue
    df.to_csv(file_path, index=False)

Execution time for silver: 305.9550213813782 seconds
Execution time for crude_oil: 302.6884808540344 seconds
Execution time for SP500: 298.57992792129517 seconds
Execution time for RUT: 308.46917033195496 seconds


In [55]:
# Marker cell placed after the first download loop.
# It only prints a message; no sleep or delay is performed here.
print("RELAXING ....")

RELAXING ....


In [56]:
# Second batch: output column name -> Yahoo Finance history URL.
# Each URL repeats the (URL-encoded) ticker twice, so it is built from the ticker.
data_urls1 = {
    name: f'https://finance.yahoo.com/quote/{ticker}/history?p={ticker}'
    for name, ticker in (
        ('platinum', 'PL%3DF'),
        ('copper', 'HG%3DF'),
        ('DXY', 'DX-Y.NYB'),
        ('VIX', '%5EVIX'),
    )
}

In [57]:
# Scrape every series in data_urls1 that has no CSV in the working directory yet.
for column_name, url in data_urls1.items():
    file_path = f'{column_name}.csv'
    if os.path.exists(file_path):
        continue  # already downloaded; skip the slow browser session
    df = get_data(url, start, end, column_name=column_name)
    if df.empty:
        # An empty file would satisfy the exists() check above and the series
        # would never be retried, so nothing is written in that case.
        print(f"No rows scraped for {column_name}; {file_path} was not written")
        continue
    df.to_csv(file_path, index=False)

Execution time for platinum: 306.0257349014282 seconds
Execution time for copper: 302.3251111507416 seconds
Execution time for DXY: 362.6718144416809 seconds
Execution time for VIX: 314.4731693267822 seconds


In [58]:
# Marker cell placed after the second download loop.
# It only prints a message; no sleep or delay is performed here.
print("RELAXING ....")

RELAXING ....


In [59]:
# Third batch: output column name -> Yahoo Finance history URL.
# Each URL repeats the (URL-encoded) ticker twice, so it is built from the ticker.
# NOTE(review): the notebook's markdown lists "MSCI EM ETF", but the ticker used
# here is plain MSCI - confirm this is the intended instrument.
data_urls2 = {
    name: f'https://finance.yahoo.com/quote/{ticker}/history?p={ticker}'
    for name, ticker in (
        ('MSCI', 'MSCI'),
        ('EURUSD', 'EURUSD%3DX'),
        ('NASDAQ', '%5EIXIC'),
        ('NLR', 'NLR'),
        ('rice', 'ZR%3DF'),
    )
}

In [60]:
# Scrape every series in data_urls2 that has no CSV in the working directory yet.
for column_name, url in data_urls2.items():
    file_path = f'{column_name}.csv'
    if os.path.exists(file_path):
        continue  # already downloaded; skip the slow browser session
    df = get_data(url, start, end, column_name=column_name)
    if df.empty:
        # An empty file would satisfy the exists() check above and the series
        # would never be retried, so nothing is written in that case.
        print(f"No rows scraped for {column_name}; {file_path} was not written")
        continue
    df.to_csv(file_path, index=False)

Execution time for MSCI: 301.133362531662 seconds
Execution time for EURUSD: 312.63931226730347 seconds
Execution time for NASDAQ: 305.1060378551483 seconds
Execution time for NLR: 298.0712478160858 seconds
Execution time for rice: 312.0010118484497 seconds


In [61]:
# Final marker cell: prints a completion message after the last download loop.
print("DONE !!!")

DONE !!!
