# 거래소에서 option data 가져와보기

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import time
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo  # Python 3.9 이상

from typing import List, Tuple 
from tqdm import tqdm

import requests

In [2]:
from pathlib import Path

# Every project directory is resolved relative to the current working directory.
CWD_PATH = Path.cwd()
DATA_PATH, BACKUP_PATH, OUTPUT_PATH = (
    CWD_PATH.joinpath(folder) for folder in ('data', 'backup', 'output')
)

In [3]:
CWD_PATH

WindowsPath('c:/Users/chlje/VSCodeProjects/KAIST_MFE/BAF645파생거래전략/final_hw')

In [4]:
## custom libs

from krx_config import API_URL, HEADERS, PAYLOAD_TEMPLATE

## API scraping code

### Prompt 1

I need to write code to scrape data from the KRX (한국거래소, Korea Exchange) website. The website uses a POST request to return a daily snapshot of all the options traded on that day. 

Below is the information about its endpoint (getJsonData.cmd): what is in the headers, what the payload is, and what the response looks like. 

Write Python code that takes a start date and an end date as input and iteratively fetches the option price data from the API. 

You should make the requests look like they come from a human by setting a web-browser User-Agent and any other headers that a site admin could use to detect a bot. 

However, you don't have to use dynamic scraping such as Selenium because, as you can see, we're communicating with the API in a RESTful way using the POST method. 

When you write the code, make it modular and keep the function docstrings as simple as possible. Never write verbose docstrings that bloat the code with unnecessary details. 

Below is the information that you need to write the correct API call scraper. 

In [None]:
def generate_date_range(start_date: str, end_date: str) -> List[str]:
    """
    Lists the weekdays between two dates, newest first.

    Both boundaries are inclusive and are interpreted in the Asia/Seoul
    time zone. Saturdays and Sundays are skipped; public holidays are not.

    Args:
        start_date (str): Earliest date, in 'YYYY-MM-DD' format.
        end_date (str): Latest date, in 'YYYY-MM-DD' format.

    Returns:
        List[str]: Weekday dates in 'YYYYMMDD' format, in descending order.
        Empty when start_date is later than end_date.
    """
    seoul = ZoneInfo('Asia/Seoul')
    first_day = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=seoul)
    last_day = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=seoul)
    span_days = (last_day - first_day).days

    # Walk backwards from the last day so the newest date comes first.
    candidates = (last_day - timedelta(days=offset) for offset in range(span_days + 1))
    # weekday() is 0-4 for Monday through Friday.
    return [day.strftime("%Y%m%d") for day in candidates if day.weekday() <= 4]

def fetch_option_data(session: requests.Session, trade_date: str, *, timeout: float = 30.0) -> pd.DataFrame:
    """
    Fetches option data for a specific trade date.

    Args:
        session (requests.Session): The requests session with headers set.
        trade_date (str): Trade date in 'YYYYMMDD' format.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection blocks forever.

    Returns:
        pd.DataFrame: DataFrame built from the response's "output" entry, or
        an empty DataFrame when the response has no "output" key.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    # Copy the template so the shared module-level dict is never mutated.
    payload = PAYLOAD_TEMPLATE.copy()
    payload["trdDd"] = trade_date

    response = session.post(API_URL, data=payload, timeout=timeout)
    response.raise_for_status()

    data = response.json()

    if "output" not in data:
        return pd.DataFrame()

    return pd.DataFrame(data["output"])

def scrape_krx_option_data(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Downloads the daily option snapshots for a date range into one DataFrame.

    Dates are visited newest first. A failure on one date is printed and the
    loop moves on to the next date.

    Args:
        start_date (str): Earliest date, in 'YYYY-MM-DD' format.
        end_date (str): Latest date, in 'YYYY-MM-DD' format.

    Returns:
        pd.DataFrame: All non-empty daily results concatenated, each row
        tagged with a 'Trade_Date' column. Empty when nothing was fetched.
    """
    trade_days = generate_date_range(start_date, end_date)
    frames = []

    with requests.Session() as session:
        session.headers.update(HEADERS)
        for trade_day in tqdm(trade_days, desc="Fetching data"):
            try:
                snapshot = fetch_option_data(session, trade_day)
                if not snapshot.empty:
                    snapshot['Trade_Date'] = datetime.strptime(trade_day, "%Y%m%d").date()
                    frames.append(snapshot)
                # Pause between successful requests to keep the request rate low.
                time.sleep(1)
            except requests.HTTPError as http_err:
                print(f"HTTP error for date {trade_day}: {http_err}")
            except Exception as err:
                print(f"Error for date {trade_day}: {err}")

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def save_to_excel(df: pd.DataFrame, filename: str):
    """
    Writes a DataFrame to an Excel workbook, leaving out the index column.

    Args:
        df (pd.DataFrame): Data to write.
        filename (str): Path of the Excel file to create.
    """
    df.to_excel(filename, index=False)
    confirmation = f"Data saved to {filename}"
    print(confirmation)

In [6]:
# Single trade date (YYYYMMDD) used for the manual test request in the next cell.
sample_date = '20241204'

In [7]:
# Manual one-off request for `sample_date`, to inspect the raw API response.
with requests.Session() as session:
    session.headers.update(HEADERS)
    # Copy the template so the shared module-level dict is never mutated.
    payload = PAYLOAD_TEMPLATE.copy()
    payload["trdDd"] = sample_date

    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = session.post(API_URL, data=payload, timeout=30)

KeyboardInterrupt: 

In [None]:
# Decode the JSON body of the sample response.
# NOTE(review): `response` only exists if the request cell above ran to completion.
dd = response.json()

In [21]:
# Scrape window in 'YYYY-MM-DD' format, the format generate_date_range expects.
START_DATE = '2021-01-01'
END_DATE = '2024-12-05'


### New Prompt

Implement retry strategy in this code. 
Retry strategy should:
- when there's a bad response such as a 400, back off, wait, and retry. 

You should also suggest additional measures to avoid getting blocked. Is there a free VPN service that I can use to hide my identity? 

Also, to avoid losing all the data when I get blocked, write the code to:

- start from end date to start date
- for each date scraped, save it by appending the data to the file (not overwriting). You should use h5py to save it in h5 format so that the data can be appended. 
- periodically (every 100 scrapes) copy that h5 file somewhere else so that I can safely use it without a race condition. (I will make a copy of that copy and use it somewhere else.)

In [None]:
# main.py

import requests
import pandas as pd
from datetime import datetime, timedelta
import time
from typing import List
from tqdm import tqdm
import h5py
import os
import shutil
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from zoneinfo import ZoneInfo  # Python 3.9 이상

# Import configurations
from krx_config import API_URL, HEADERS, PAYLOAD_TEMPLATE



# Constants for persistence and backup
BACKUP_INTERVAL = 100  # Take a backup copy after every 100 successfully saved dates
H5_FILE = DATA_PATH / "krx_option_data.h5"  # HDF5 file that each day's data is appended to

def generate_date_range(start_date: str, end_date: str) -> List[str]:
    """
    Returns the weekdays from end_date back to start_date as 'YYYYMMDD' strings.

    Dates are interpreted in the Asia/Seoul time zone and both boundaries are
    inclusive. Only Saturdays and Sundays are excluded.

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.

    Returns:
        List[str]: Weekday dates in descending order ('YYYYMMDD' format).
    """
    korea_tz = ZoneInfo('Asia/Seoul')
    oldest = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=korea_tz)
    newest = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=korea_tz)

    one_day = timedelta(days=1)
    weekdays = []
    cursor = newest
    # Step back one day at a time until the start date has been visited.
    while cursor >= oldest:
        if cursor.weekday() not in (5, 6):  # 5 and 6 are Saturday and Sunday
            weekdays.append(cursor.strftime("%Y%m%d"))
        cursor -= one_day
    return weekdays

def setup_session() -> requests.Session:
    """
    Sets up a requests session with headers and retry strategy.

    The session sends HEADERS on every request and retries POST requests up
    to 5 times, with backoff, when the server answers with status 400, 429,
    500, 502, 503 or 504.

    Returns:
        requests.Session: Session with the retrying adapter mounted for both
        http:// and https:// URLs.
    """
    session = requests.Session()
    session.headers.update(HEADERS)

    retry_options = {
        "total": 5,
        "status_forcelist": [400, 429, 500, 502, 503, 504],
        "backoff_factor": 1,
    }
    try:
        # urllib3 >= 1.26 names this option `allowed_methods`; the old
        # `method_whitelist` keyword was removed in urllib3 2.0.
        retry_strategy = Retry(allowed_methods=["POST"], **retry_options)
    except TypeError:
        # Older urllib3 releases only understand the legacy keyword.
        retry_strategy = Retry(method_whitelist=["POST"], **retry_options)

    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session

def fetch_option_data(session: requests.Session, trade_date: str, *, timeout: float = 30.0) -> pd.DataFrame:
    """
    Fetches option data for a specific trade date.

    Args:
        session (requests.Session): Session used to send the POST request.
        trade_date (str): Trade date in 'YYYYMMDD' format.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection blocks forever.

    Returns:
        pd.DataFrame: DataFrame built from the response's "output" entry, or
        an empty DataFrame when the response has no "output" key.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    # Copy the template so the shared module-level dict is never mutated.
    payload = PAYLOAD_TEMPLATE.copy()
    payload["trdDd"] = trade_date

    response = session.post(API_URL, data=payload, timeout=timeout)
    response.raise_for_status()

    data = response.json()

    if "output" not in data:
        return pd.DataFrame()

    return pd.DataFrame(data["output"])

def save_data_h5(df: pd.DataFrame, filename: str):
    """
    Appends a DataFrame to the 'option_data' table of an HDF5 file.

    The file is opened in append mode, and every column is stored as a data
    column in table format.

    NOTE(review): string column widths are presumably fixed by the first
    append; confirm whether `min_itemsize` is needed for later, longer values.
    """
    store = pd.HDFStore(filename, mode='a')
    try:
        store.append('option_data', df, format='table', data_columns=True)
    finally:
        # Always release the file handle, even when the append fails.
        store.close()

def backup_h5_file(source: str, backup_path: Path):
    """
    Copies the HDF5 file to the backup directory with a timestamp.

    Args:
        source (str): Path of the HDF5 file to copy.
        backup_path (Path): Directory that receives the copy. It is created
            if it does not exist yet.

    Raises:
        FileNotFoundError: If `source` does not exist.
    """
    # Honour the caller's directory instead of silently using a global.
    backup_dir = Path(backup_path)
    backup_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    destination = backup_dir / f"krx_option_data_backup_{timestamp}.h5"
    shutil.copy(source, destination)
    print(f"Backup created at {destination}")

def scrape_krx_option_data(start_date: str, end_date: str) -> None:
    """
    Scrapes KRX option data between start_date and end_date and saves to HDF5.

    Dates are processed newest first. Each non-empty day is appended to
    H5_FILE right away, so data already fetched survives a later failure.
    A backup copy is taken every BACKUP_INTERVAL saved days and once more at
    the end. Request-level retries are handled by the session's retry
    adapter; a date that still fails is printed and skipped.

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format.
        end_date (str): End date in 'YYYY-MM-DD' format.
    """
    dates = generate_date_range(start_date, end_date)
    success_count = 0

    # Make sure the target directory exists before the first append.
    h5_path = Path(H5_FILE)
    h5_path.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the pooled connections when the loop ends.
    with setup_session() as session:
        for date in tqdm(dates, desc="Fetching data"):
            try:
                daily_data = fetch_option_data(session, date)
                if not daily_data.empty:
                    # Store a Timestamp rather than a datetime.date: the HDF5
                    # table format cannot serialize columns of date objects.
                    daily_data['Trade_Date'] = pd.to_datetime(date, format="%Y%m%d")
                    save_data_h5(daily_data, H5_FILE)
                    success_count += 1

                    if success_count % BACKUP_INTERVAL == 0:
                        backup_h5_file(H5_FILE, BACKUP_PATH)

                time.sleep(1)  # Delay to mimic human behavior
            except requests.HTTPError as http_err:
                print(f"HTTP error for date {date}: {http_err}")
                time.sleep(5)  # Back off before moving on to the next date
            except Exception as err:
                print(f"Error for date {date}: {err}")
                time.sleep(5)  # Back off before moving on to the next date

    # Final backup after completion; skipped when no file was ever written.
    if h5_path.exists():
        backup_h5_file(H5_FILE, BACKUP_PATH)


In [None]:

def main():
    """Runs an example scrape for 2024-12-01 through 2024-12-06."""
    range_start, range_end = "2024-12-01", "2024-12-06"

    print(f"Scraping KRX option data from {range_start} to {range_end}...")
    scrape_krx_option_data(range_start, range_end)
    print("Scraping completed.")


# Run the example only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()
