### Feature Engineering

In [7]:
import yfinance as yf
import pandas as pd
import glob
import warnings
import numpy as np
warnings.filterwarnings(action='ignore')

In [2]:
# file_paths = glob.glob('./data/*.csv')

# dfs = {}

# for file_path in file_paths:
#     file_name = file_path.split('/')[-1].replace('.csv', '')
#     dfs[file_name] = pd.read_csv(file_path)


In [3]:
file_dir = './data/'

['SOXX', 'QQQ', 'XLF', 'CARZ', 'VOX', 'PBJ', 'PJP', 'XLE', 'XLU', 'ITA']

# read data
SP500 = pd.read_csv(file_dir + 'SP500.csv')
NASDAQ = pd.read_csv(file_dir + 'NASDAQ.csv')
SOXX = pd.read_csv(file_dir + 'SOXX.csv')
QQQ = pd.read_csv(file_dir + 'QQQ.csv')
XLF = pd.read_csv(file_dir + 'XLF.csv')
CARZ = pd.read_csv(file_dir + 'CARZ.csv')
VOX = pd.read_csv(file_dir + 'VOX.csv')
PBJ = pd.read_csv(file_dir + 'PBJ.csv')
PJP = pd.read_csv(file_dir + 'PJP.csv')
XLE = pd.read_csv(file_dir + 'XLE.csv')
XLU = pd.read_csv(file_dir + 'XLU.csv')
ITA = pd.read_csv(file_dir + 'ITA.csv')


In [4]:
def cal_return(buy_price, sell_price, dividend=0):
    """
    주식 수익률 계산 함수

    :param buy_price: 주식 구매가
    :param sell_price: 주식 판매가
    :param dividend: 받은 배당금 (기본값은 0)
    :return: 수익률 (%)
    """
    # 수익률 계산
    return_rate = ((sell_price + dividend - buy_price) / buy_price) * 100
    
    return return_rate


In [1]:
# feature engineering function
def feature_engineering(ticker, bm, n_days):
    # 날짜
    date = ticker['Date']

    # 수익률 계산
    price = np.log(ticker['Adj Close']).diff(n_days) - np.log(bm['Adj Close']).diff(n_days)
    volume = ticker['Volume']
    high_low_gap = ticker['High'] - ticker['Low']
    
    # EMA(이동평균선) 계산
    ema_dict = {f'price_ema_{i}': ticker['Adj Close'].ewm(span=i, adjust=False).mean() for i in [5, 10, 20, 60]}

    # 변동성 지표 계산
    vol_ema_dict = {f'vol_ema_{i}': ticker['Volume'].ewm(span=i, adjust=False).std() for i in [5, 10, 20, 60]}

    # 이동 평균선, 변동성 지표 데이터프레임화
    ema_df = pd.DataFrame(ema_dict)
    vol_ema_df = pd.DataFrame(vol_ema_dict)

    # 계산된 특성을 데이터프레임으로 모으기
    features = pd.DataFrame({
        'date': date,
        'price': price,
        'volume': volume,
        'high_low_gap': high_low_gap,
    }).join(ema_df).join(vol_ema_df)
    
    return features


In [12]:
SOXX_new = feature_engineering(SOXX, SP500, (1,5,10,20))
SOXX_new

Unnamed: 0,date,price,volume,high_low_gap,price_ema_5,price_ema_10,price_ema_20,price_ema_60,vol_ema_5,vol_ema_10,vol_ema_20,vol_ema_60
0,2004-01-02,,74000,0.980003,52.486702,52.486702,52.486702,52.486702,74000.000000,74000.000000,74000.000000,74000.000000
1,2004-01-05,0.019702,103300,1.389999,53.056002,52.797229,52.649359,52.542699,83766.666667,79327.272727,76790.476190,74960.655738
2,2004-01-06,0.009412,170100,1.279999,53.629930,53.157331,52.852067,52.615980,112544.444444,95831.404959,85677.097506,78079.978500
3,2004-01-07,0.006569,118300,1.139999,54.176399,53.541332,53.082283,52.702976,114462.962963,99916.604057,88784.040600,79398.667730
4,2004-01-08,0.024458,208600,1.769997,55.090563,54.155434,53.447674,52.841202,145841.975309,119677.221501,100195.084353,83634.776985
...,...,...,...,...,...,...,...,...,...,...,...,...
5028,2023-12-22,0.001540,785900,7.469971,564.562350,556.376848,541.055909,510.980384,897952.607280,949498.683956,940123.133996,899898.925658
5029,2023-12-26,0.013293,635500,9.760010,569.694907,560.664698,544.761062,513.242012,810468.404854,892408.014146,911111.406949,891230.108423
5030,2023-12-27,0.001136,625800,5.359985,573.613276,564.443846,548.255248,515.478340,748912.269902,843933.829756,883938.892001,882527.481918
5031,2023-12-28,-0.002247,351000,4.309998,575.862179,567.337690,551.312842,517.605607,616274.846602,754309.497073,833182.807049,865100.351363
