## TA-Lib으로 필요한 기술지표 변환
- https://github.com/TA-Lib/ta-lib-python
- conda install -c conda-forge ta-lib

### 1. Size
- **Market Capitalization**
- Price-to-Earnings (P/E) ratio
- Price-to-Book (P/B) ratio

### 2. Value
- **Book-to-market ratio (= 1/PBR)**
- Assets(Book Value) 
- Price-to-Earnings (P/E) ratio
- (Dividend, Earning yield, ROE ratio)

### 3. Volatility
- Bollinger Bands(20)
- Average True Range (ATR; 14)
- Chaikin Volatility (10)
- Relative Volatility Index (RVI)
- **Standard deviation**
- 급변 지표 : fK, fD, SD, CCI, TR


### 4. Momentum
- Momentum (MOM)
- Relative Strength Index (RSI; 10,14)
- Moving Average Convergence Divergence (MACD; 6,12)
- Stochastic Oscillator
- Average Directional Index (ADX; 7,14)
- Rate of Change (ROC; 12)
- WillR


In [1]:
import os
import talib
import numpy as np
import pandas as pd

# famafrench

In [2]:
# Example code: simple moving average over 100 random "close" prices.
# No timeperiod is passed, so TA-Lib's default window is used; in the printed
# array the first 29 entries are NaN before the first averaged value appears.
close = np.random.random(100)
output = talib.SMA(close)
print(output)

[       nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan 0.48780763
 0.48752146 0.48208065 0.47634586 0.49256211 0.4851214  0.48102321
 0.47626972 0.45635163 0.42979871 0.42696076 0.44511829 0.45532835
 0.45079845 0.45806847 0.47917411 0.46935949 0.47083871 0.49039125
 0.48602881 0.45812776 0.46499668 0.45426409 0.43661247 0.46140508
 0.47479445 0.47643974 0.47670012 0.46105665 0.44724212 0.47047112
 0.48427568 0.48433913 0.50956516 0.5064031  0.50598021 0.50305229
 0.49246274 0.48861672 0.51152008 0.52568044 0.5017774  0.48903953
 0.49605186 0.50501083 0.50287578 0.50146594 0.51797473 0.51682899
 0.5135266  0.53738215 0.53537102 0.52896059 0.5275617  0.51585303
 0.51265526 0.50723605 0.48440429 0.48947654 0.4752507  0.4637

In [2]:
# Flat list of every indicator function name exposed by TA-Lib.
print(talib.get_functions())

# Dict mapping each group name to the list of function names in that group,
# followed by just the group names.
print(talib.get_function_groups())
print(list(talib.get_function_groups().keys()))

['HT_DCPERIOD', 'HT_DCPHASE', 'HT_PHASOR', 'HT_SINE', 'HT_TRENDMODE', 'ADD', 'DIV', 'MAX', 'MAXINDEX', 'MIN', 'MININDEX', 'MINMAX', 'MINMAXINDEX', 'MULT', 'SUB', 'SUM', 'ACOS', 'ASIN', 'ATAN', 'CEIL', 'COS', 'COSH', 'EXP', 'FLOOR', 'LN', 'LOG10', 'SIN', 'SINH', 'SQRT', 'TAN', 'TANH', 'ADX', 'ADXR', 'APO', 'AROON', 'AROONOSC', 'BOP', 'CCI', 'CMO', 'DX', 'MACD', 'MACDEXT', 'MACDFIX', 'MFI', 'MINUS_DI', 'MINUS_DM', 'MOM', 'PLUS_DI', 'PLUS_DM', 'PPO', 'ROC', 'ROCP', 'ROCR', 'ROCR100', 'RSI', 'STOCH', 'STOCHF', 'STOCHRSI', 'TRIX', 'ULTOSC', 'WILLR', 'BBANDS', 'DEMA', 'EMA', 'HT_TRENDLINE', 'KAMA', 'MA', 'MAMA', 'MAVP', 'MIDPOINT', 'MIDPRICE', 'SAR', 'SAREXT', 'SMA', 'T3', 'TEMA', 'TRIMA', 'WMA', 'CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK', 'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR',

In [3]:
# Function names registered under TA-Lib's "Volatility Indicators" group.
talib.get_function_groups()["Volatility Indicators"]

['ATR', 'NATR', 'TRANGE']

## 팩터 투자를 위한 기술지표 만드는 클래스 함수

In [3]:
class FactorIndicators():
    """Build TA-Lib technical-indicator columns for factor investing.

    Every method is a static method that takes a price DataFrame and returns
    a NEW DataFrame holding the input columns plus the computed indicator
    columns; the input frame itself is never modified.

    NaN values produced by the indicators are left in place. How to remove
    them is deliberately not decided here, so callers must handle them
    (e.g. with ``dropna``).
    """

    def __init__(self) -> None:
        # Stateless: kept only so existing ``FactorIndicators()`` calls work.
        pass

    @staticmethod
    def get_volatility(stock_data: pd.DataFrame) -> pd.DataFrame:
        """Append volatility-style indicators to ``stock_data``.

        Args:
            stock_data: Frame with ``close``, ``high``, ``low`` and
                ``volume`` columns.

        Returns:
            A new frame with the added columns ``upper_bb``, ``middle_bb``,
            ``lower_bb`` (Bollinger Bands, period 20), ``std`` (standard
            deviation, period 5), ``atr`` (Average True Range, period 14),
            ``adosc`` (Chaikin A/D Oscillator, fast 3 / slow 10) and ``cci``
            (period 14).

        Raises:
            KeyError: If a required column is missing.
        """
        close = stock_data["close"]
        high = stock_data["high"]
        low = stock_data["low"]

        upper, middle, lower = talib.BBANDS(close, timeperiod=20)  # Bollinger Bands
        std = talib.STDDEV(close, timeperiod=5)
        atr = talib.ATR(high, low, close, timeperiod=14)
        adosc = talib.ADOSC(high, low, close, stock_data["volume"],
                            fastperiod=3, slowperiod=10)  # Chaikin A/D Oscillator
        cci = talib.CCI(high, low, close, timeperiod=14)

        # ``assign`` already returns a new DataFrame, so no explicit copy of
        # the input is needed beforehand.
        return stock_data.assign(upper_bb=upper, middle_bb=middle, lower_bb=lower,
                                 std=std, atr=atr, adosc=adosc, cci=cci)

    @staticmethod
    def get_momentum(stock_data: pd.DataFrame) -> pd.DataFrame:
        """Append momentum-style indicators to ``stock_data``.

        Args:
            stock_data: Frame with ``close``, ``high`` and ``low`` columns.

        Returns:
            A new frame with the added columns ``rsi`` (period 14), ``macd``
            (MACD line only, 12/26/9), ``adx`` (period 14), ``roc``
            (period 12), ``willr`` (period 14), ``slowk`` and ``slowd``
            (slow stochastic, 5/3/3).

        Raises:
            KeyError: If a required column is missing.
        """
        # NOTE: MOM (period 5), MA (period 30) and WMA (period 30) were tried
        # earlier and are currently disabled.
        close = stock_data["close"]
        high = stock_data["high"]
        low = stock_data["low"]

        rsi = talib.RSI(close, timeperiod=14)  # alternative period: 10
        # Only the MACD line is exported; the signal line and histogram are
        # intentionally discarded.
        macd, _macd_signal, _macd_hist = talib.MACD(
            close, fastperiod=12, slowperiod=26, signalperiod=9)
        adx = talib.ADX(high, low, close, timeperiod=14)  # alternative period: 7
        roc = talib.ROC(close, timeperiod=12)  # ((price / prevPrice) - 1) * 100
        willr = talib.WILLR(high, low, close, timeperiod=14)
        slowk, slowd = talib.STOCH(high, low, close,
                                   fastk_period=5, slowk_period=3, slowk_matype=0,
                                   slowd_period=3, slowd_matype=0)

        return stock_data.assign(rsi=rsi, macd=macd,
                                 adx=adx, roc=roc, willr=willr, slowk=slowk, slowd=slowd)

    @staticmethod
    def get_rep_factor(stock_data: pd.DataFrame) -> pd.DataFrame:
        """Append one representative indicator per factor family.

        Uses the standard deviation (period 14) for volatility and the MACD
        line (12/26/9) for momentum.

        Args:
            stock_data: Frame with a ``close`` column.

        Returns:
            A new frame with the added columns ``std`` and ``macd``.

        Raises:
            KeyError: If the ``close`` column is missing.
        """
        close = stock_data["close"]
        std = talib.STDDEV(close, timeperiod=14)
        # Signal line and histogram are not used for the representative set.
        macd, _macd_signal, _macd_hist = talib.MACD(
            close, fastperiod=12, slowperiod=26, signalperiod=9)
        return stock_data.assign(std=std, macd=macd)

## All Data Load 

In [4]:
# Derive the project root (everything before "/src") and the input-data
# directory (sibling of "/trading") from the current working directory.
BASE_DIR = os.path.abspath('').partition('/src')[0]
DATA_DIR = f"{os.path.abspath('').partition('/trading')[0]}/famafrench_data/stockdata/final_data"
# Last expression of the cell: the notebook displays the resolved data path.
DATA_DIR

'/home1/hwang/famafrench_data/stockdata/final_data'

In [6]:
# Input/output locations, derived from the current working directory.
BASE_DIR = os.path.abspath('').split('/src')[0]
DATA_DIR = os.path.abspath('').split('/trading')[0]+'/famafrench_data/stockdata/final_data'
SAVE_DIR = os.path.abspath('').split('/trading')[0]+"/famafrench_data/stockdata/final_with_factor"
country = "DJIA"  # alternative: "USA"

# makedirs also creates the intermediate SAVE_DIR/<country> directory.
os.makedirs(SAVE_DIR+f"/{country}/onlyfactor", exist_ok=True)
os.makedirs(SAVE_DIR+f"/{country}/rep", exist_ok=True)
os.makedirs(SAVE_DIR+f"/{country}/whole", exist_ok=True)

# Ticker symbol = file name without its ".csv" extension. splitext strips only
# the final extension, so names containing ".csv" elsewhere are not mangled.
ticker_list = [os.path.splitext(fname)[0]
               for fname in os.listdir(DATA_DIR+f"/{country}") if fname.endswith(".csv")]

# The indicator builders are static methods, so one instance serves every ticker.
factor = FactorIndicators()

for tic in ticker_list:
    df = pd.read_csv(f'{DATA_DIR}/{country}/{tic}.csv', index_col="date", parse_dates=["date"])
    # Forward-fill gaps, then back-fill whatever is still missing at the start.
    # (fillna(method=...) is deprecated in recent pandas; ffill/bfill are the
    # direct equivalents.)
    df = df.ffill().bfill()

    # make data: the full indicator set (volatility + momentum) and the
    # representative subset (std + macd only)
    data1 = factor.get_volatility(df)
    data2 = factor.get_momentum(data1)
    rep_data = factor.get_rep_factor(df)

    # Drop the rows that still contain NaN after the indicators were added.
    dataset_nan_deleted = data2.dropna()
    rep_nan_deleted = rep_data.dropna()
    # Both outputs must cover the same rows. Raise explicitly instead of using
    # assert so the check still runs under "python -O".
    if len(dataset_nan_deleted) != len(rep_nan_deleted):
        raise ValueError(
            f"{tic}: len(dataset_nan_deleted)!=len(rep_nan_deleted) "
            f"({len(dataset_nan_deleted)} vs {len(rep_nan_deleted)})"
        )
    # A ticker with no complete rows would crash on index[0] and abort the
    # whole batch; skip it instead.
    if dataset_nan_deleted.empty:
        print(f"Ticker {tic} is skipped: no rows left after dropping NaN.")
        continue
    start_date = dataset_nan_deleted.index[0]
    print("start_date: ", start_date)

    # save files that include ohlcv (NaN rows already removed)
    dataset_nan_deleted.to_csv(f'{SAVE_DIR}/{country}/whole/{tic}.csv')
    rep_nan_deleted.to_csv(f'{SAVE_DIR}/{country}/rep/{tic}.csv')

    # save file that excludes ohlcv (indicator columns only)
    data3 = dataset_nan_deleted.drop(['open', 'high', 'low', 'close', 'volume'], axis=1)
    data3.to_csv(f'{SAVE_DIR}/{country}/onlyfactor/{tic}.csv')
    print(f"Ticker {tic} is done.")

print("All Finished.")


start_date:  2004-11-16 00:00:00
Ticker AXP is done.
start_date:  2004-11-16 00:00:00
Ticker MSFT is done.
start_date:  2004-11-16 00:00:00
Ticker BA is done.
start_date:  2004-11-16 00:00:00
Ticker PG is done.
start_date:  2004-11-16 00:00:00
Ticker VZ is done.
start_date:  2004-11-16 00:00:00
Ticker CSCO is done.
start_date:  2004-11-16 00:00:00
Ticker DIS is done.
start_date:  2004-11-16 00:00:00
Ticker IBM is done.
start_date:  2004-11-16 00:00:00
Ticker HD is done.
start_date:  2004-11-16 00:00:00
Ticker TRV is done.
start_date:  2004-11-16 00:00:00
Ticker HON is done.
start_date:  2004-11-16 00:00:00
Ticker JNJ is done.
start_date:  2004-11-16 00:00:00
Ticker JPM is done.
start_date:  2004-11-16 00:00:00
Ticker CAT is done.
start_date:  2004-11-16 00:00:00
Ticker MCD is done.
start_date:  2004-11-16 00:00:00
Ticker MRK is done.
start_date:  2004-11-16 00:00:00
Ticker GS is done.
start_date:  2004-11-16 00:00:00
Ticker INTC is done.
start_date:  2004-11-16 00:00:00
Ticker AMGN is 

In [7]:
# Input/output locations, derived from the current working directory.
BASE_DIR = os.path.abspath('').split('/src')[0]
DATA_DIR = os.path.abspath('').split('/trading')[0]+'/famafrench_data/stockdata/final_data'
SAVE_DIR = os.path.abspath('').split('/trading')[0]+"/famafrench_data/stockdata/final_with_factor"
country = "NASDAQ"  # alternative: "USA"

# makedirs also creates the intermediate SAVE_DIR/<country> directory.
os.makedirs(SAVE_DIR+f"/{country}/onlyfactor", exist_ok=True)
os.makedirs(SAVE_DIR+f"/{country}/rep", exist_ok=True)
os.makedirs(SAVE_DIR+f"/{country}/whole", exist_ok=True)

# Ticker symbol = file name without its ".csv" extension. splitext strips only
# the final extension, so names containing ".csv" elsewhere are not mangled.
ticker_list = [os.path.splitext(fname)[0]
               for fname in os.listdir(DATA_DIR+f"/{country}") if fname.endswith(".csv")]

# The indicator builders are static methods, so one instance serves every ticker.
factor = FactorIndicators()

for tic in ticker_list:
    df = pd.read_csv(f'{DATA_DIR}/{country}/{tic}.csv', index_col="date", parse_dates=["date"])
    # Forward-fill gaps, then back-fill whatever is still missing at the start.
    # (fillna(method=...) is deprecated in recent pandas; ffill/bfill are the
    # direct equivalents.)
    df = df.ffill().bfill()

    # make data: the full indicator set (volatility + momentum) and the
    # representative subset (std + macd only)
    data1 = factor.get_volatility(df)
    data2 = factor.get_momentum(data1)
    rep_data = factor.get_rep_factor(df)

    # Drop the rows that still contain NaN after the indicators were added.
    dataset_nan_deleted = data2.dropna()
    rep_nan_deleted = rep_data.dropna()
    # Both outputs must cover the same rows. Raise explicitly instead of using
    # assert so the check still runs under "python -O".
    if len(dataset_nan_deleted) != len(rep_nan_deleted):
        raise ValueError(
            f"{tic}: len(dataset_nan_deleted)!=len(rep_nan_deleted) "
            f"({len(dataset_nan_deleted)} vs {len(rep_nan_deleted)})"
        )
    # A ticker with no complete rows would crash on index[0] and abort the
    # whole batch; skip it instead.
    if dataset_nan_deleted.empty:
        print(f"Ticker {tic} is skipped: no rows left after dropping NaN.")
        continue
    start_date = dataset_nan_deleted.index[0]
    print("start_date: ", start_date)

    # save files that include ohlcv (NaN rows already removed)
    dataset_nan_deleted.to_csv(f'{SAVE_DIR}/{country}/whole/{tic}.csv')
    rep_nan_deleted.to_csv(f'{SAVE_DIR}/{country}/rep/{tic}.csv')

    # save file that excludes ohlcv (indicator columns only)
    data3 = dataset_nan_deleted.drop(['open', 'high', 'low', 'close', 'volume'], axis=1)
    data3.to_csv(f'{SAVE_DIR}/{country}/onlyfactor/{tic}.csv')
    print(f"Ticker {tic} is done.")

print("All Finished.")


start_date:  2002-10-18 00:00:00
Ticker MU is done.
start_date:  2002-10-18 00:00:00
Ticker MSFT is done.
start_date:  2002-10-18 00:00:00
Ticker CSX is done.
start_date:  2002-10-18 00:00:00
Ticker TXN is done.
start_date:  2002-10-18 00:00:00
Ticker ORLY is done.
start_date:  2002-10-18 00:00:00
Ticker ADI is done.
start_date:  2002-10-18 00:00:00
Ticker AMZN is done.
start_date:  2002-10-18 00:00:00
Ticker NFLX is done.
start_date:  2002-10-18 00:00:00
Ticker CSCO is done.
start_date:  2002-10-18 00:00:00
Ticker HON is done.
start_date:  2002-10-18 00:00:00
Ticker INTU is done.
start_date:  2002-10-18 00:00:00
Ticker ADP is done.
start_date:  2002-10-18 00:00:00
Ticker AMAT is done.
start_date:  2002-10-18 00:00:00
Ticker VRTX is done.
start_date:  2002-10-18 00:00:00
Ticker AMD is done.
start_date:  2002-10-18 00:00:00
Ticker COST is done.
start_date:  2002-10-18 00:00:00
Ticker REGN is done.
start_date:  2002-10-18 00:00:00
Ticker INTC is done.
start_date:  2002-10-18 00:00:00
Tic