In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import json
import time


nasq_ticker_name = ['AAPL', 'ADBE', 'ADI', 'ADP', 'AMAT', 'AMD', 'AMGN', 'AMZN', 'BKNG', 'COST', 'CSCO', 'CSX', 'FISV', 'GILD', 'HON', 'INTC', 'INTU', 'ISRG', 'LRCX', 'MSFT', 'MU', 'NFLX', 'ORLY', 'PEP', 'QCOM', 'REGN', 'SBUX', 'SNPS', 'TXN', 'VRTX'] # nasdaq top 30

djia_ticker_name =  ['UNH', 'GS', 'MSFT', 'HD', 'MCD', 'AMGN', 'CAT', 'BA', 'HON', 
                    'CRM', 'TRV', 'AAPL', 'CVX', 'JNJ', 'AXP', 'PG', 'WMT', 'JPM', 'NKE', 
                    'IBM', 'MRK', 'MMM', 'DIS', 'KO', 'CSCO', 'VZ', 'INTC']

# Code usage guide
1. First, Gather new market data and make same with stockdata/final_data
2. Modify the block below: the market_list and file name list (nasdaq_list, djia_list)
3. and run below 

In [13]:
# Set the directory
BASE_DIR = os.path.abspath('').split('/src')[0]
DATA_DIR = os.path.abspath('').split('/trading')[0]+'/famafrench_data'
market_list = ['NASDAQ', 'DJIA']
market = market_list[1]

nasdaq_list = sorted([i.split('.csv')[0] for i in os.listdir(f'{DATA_DIR}/stockdata/final_data/NASDAQ') if i.endswith(".csv")])
djia_list = sorted([i.split('.csv')[0] for i in os.listdir(f'{DATA_DIR}/stockdata/final_data/DJIA') if i.endswith(".csv")])

# exclude 4 stocks (due to lack of data)
# for i in ["DOW", "V", "WBA", "CRM"]: djia_list.remove(i)

# price_filename = f"{DATA_DIR}/stockdata/label_data/{market}/x/stock_2d_actor.csv"
# label_directory = f"{DATA_DIR}/stockdata/label_data/{market_list}"

os.makedirs(f'{DATA_DIR}/stockdata/label_data/{market}/x/', exist_ok=True)
os.makedirs(f'{DATA_DIR}/stockdata/label_data/{market}/y/', exist_ok=True)

print(nasdaq_list)


['AAPL', 'ADBE', 'ADI', 'ADP', 'AMAT', 'AMD', 'AMGN', 'AMZN', 'BKNG', 'COST', 'CSCO', 'CSX', 'FISV', 'GILD', 'HON', 'INTC', 'INTU', 'ISRG', 'LRCX', 'MSFT', 'MU', 'NFLX', 'ORLY', 'PEP', 'QCOM', 'REGN', 'SBUX', 'SNPS', 'TXN', 'VRTX']


In [4]:
# concat all the close price data
def make_2D_pretrain_data(
                        stocks_subset:list,
                        initial_date: str,
                        final_date: str,
                        save_option = True,
                        country = 'USA',
                        ):
    ''' 
    Args
    data_dir : 자산 데이터가 있는 위치
    stocks_subset : 포트폴리오 자산명이 들어있는 리스트
    Out
    2d numpy size : (time, asset*features)
    '''
    initial_date = datetime.strptime(initial_date, '%Y-%m-%d')
    final_date = datetime.strptime(final_date, '%Y-%m-%d')

    # ticker 정보가 있는 txt파일을 열어서, 편입될 종목의 이름 수
    # define data container
    data_container = {}
    y_container = {}

    for ticker in stocks_subset:
        # for key, value in pretrain_json["pretrain"].items() :
            # if ticker in value:
        data = pd.read_csv(f'{DATA_DIR}/stockdata/final_data/{country}/{ticker}.csv',
                            parse_dates=['date'], index_col=0)
        data.columns = [f'close_{ticker}', f'open_{ticker}', f'high_{ticker}', f'low_{ticker}', 
                        f'volume_{ticker}', f'BTM_{ticker}',f'MarketCap_{ticker}']
        data.sort_index(ascending=True, inplace=True)
        # print(initial_date)
        # print(final_date)
        # print(len(data.loc[initial_date:final_date]))
        data_container[ticker] = data.loc[initial_date:final_date]
        

        # making up and down labels (y_data)
        next_p = data[f'close_{ticker}'].shift(-1).fillna(method='ffill')
        _r = next_p/data[f'close_{ticker}']-1
        y_container[ticker] = pd.DataFrame([1 if i>=0 else 0 for i in _r ], index=data.index) ## 정답지
        
        print(ticker, data[f'close_{ticker}'].index[0], data[f'close_{ticker}'].index[-1],
        len(data[f'close_{ticker}']), len(next_p), len(_r), len(y_container[ticker]))
        print()

    
    # Make data to the dataframe
    final_data = pd.DataFrame()
    y_data = pd.DataFrame()

    for tic in data_container:
        # 옆으로 붙여야 의미가 맞지 않을까?
        final_data = pd.concat([final_data, data_container[tic]], axis=1, join='outer')
        y_data = pd.concat([y_data, y_container[tic]], axis=1, join='outer')
    
    # set the labels
    y_data.columns = stocks_subset
    
    time.sleep(1)
    final_data.sort_index(ascending=True, inplace=True)
    y_data.sort_index(ascending=True, inplace=True)

    final_data.fillna(method='ffill', inplace=True)
    y_data.fillna(method='ffill', inplace=True)
    #---------------------------------------------------------------------------------------------------

    if save_option:
        final_data.to_csv(f'{DATA_DIR}/stockdata/label_data/{country}/x/stock_2d_actor.csv')
        y_data.to_csv(f'{DATA_DIR}/stockdata/label_data/{country}/y/stock_2d_y_actor.csv') # updown
    
    time.sleep(1)

    # return final_data, y_data

In [15]:
make_2D_pretrain_data(nasdaq_list, '2002-09-03','2023-05-01',save_option = True, country = 'NASDAQ') #2002-09-03
# print()
# make_2D_pretrain_data(djia_list, '2004-09-30', '2023-05-01', save_option = True, country = 'DJIA') # 2002-08-07

AAPL 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

ADBE 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

ADI 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

ADP 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

AMAT 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

AMD 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

AMGN 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

AMZN 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

BKNG 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

COST 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

CSCO 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

CSX 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

FISV 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

GILD 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

HON 2002-09-03 00:00:00 2022-12-30 00:00:00 5119 5119 5119 5119

INTC 2002-09-03

In [16]:
def make_price_label(directory):
    # directory = f"/home/ubuntu2010/바탕화면/famafrench_data/stockdata/label_data/{market}"
    filedir = f"{directory}/x/stock_2d_actor.csv"

    data = pd.read_csv(filedir, index_col='date', parse_dates=['date'])

    close_df = data[[i for i in data.columns if 'close' in i]]
    y_data = close_df.shift(-1).fillna(method='ffill').fillna(method='bfill')
    data_logr_whole= np.log(data/data.shift(1).fillna(method='bfill'))
    data_logr_whole = data_logr_whole.fillna(method='ffill')
    data_log_close = np.log(close_df/close_df.shift(1).fillna(method='bfill'))
    data_log_close = data_log_close.fillna(method='ffill')
    data_logr_y = data_log_close.shift(-1).fillna(method='ffill').fillna(method='bfill')

    print("# of Null data: ", close_df.isna().sum().sum()) # no null
    print("# of Null data: ", y_data.isna().sum().sum()) # no null
    print("# of Null data: ", data_logr_whole.isna().sum().sum()) # no null
    print("# of Null data: ", data_log_close.isna().sum().sum()) # no null
    
    print("# of data: ", len(close_df)) # no null
    print("# of data: ", len(y_data)) # no null
    print("# of data: ", len(data_logr_whole)) # no null
    print("# of data: ", len(data_log_close)) # no null

    # file save'{DATA_DIR}/stockdata/label_data/{market}
    # origin price data
    close_df.to_csv(f"{directory}/x/stock_2d_onlyclose.csv")
    y_data.to_csv(f"{directory}/y/stock_2d_origin_price.csv")

    # log returns data 
    data_logr_whole.to_csv(f"{directory}/x/stock_2d_log_all.csv")
    data_log_close.to_csv(f"{directory}/x/stock_2d_log_close.csv")
    data_logr_y.to_csv(f"{directory}/y/stock_2d_logr_y.csv")
    
    return close_df, y_data, data_logr_whole, data_log_close, data_logr_y



def make_pretrain_dataset(directory, market:str):
    directory = directory+f"/{market}"
    filedir = f"{directory}/x/stock_2d_actor.csv"

    pretrain_datadir = directory+f"/pretrain"
    os.makedirs(pretrain_datadir, exist_ok=True)
    close_df, y_data, data_logr_whole, data_log_close, data_logr_y = make_price_label(directory)
    data_dict = {"close":close_df, "log_p":data_log_close, "label":y_data, "log_label":data_logr_y}
    tickers = [i.split("_")[-1] for i in y_data.columns]
    print("tickers: ", tickers)
    # container = {}

    for tic in tickers:
        container = []
        for col_name, df in data_dict.items():
            # print(col_name)
            for col in df.columns:
                if tic == col.split("_")[-1]: container.append(df[col])
        print(tic)
        final_df = pd.DataFrame(container)
        final_df = final_df.transpose()
        final_df.columns = ["close", "log_p", "label", "log_label"]
        final_df.to_csv(pretrain_datadir+f"/{tic}_pretrain_dataset.csv")
    return final_df


In [17]:
# price_filename = f"{DATA_DIR}/stockdata/label_data/{market}/x/stock_2d_actor.csv"
directory = f"{DATA_DIR}/stockdata/label_data"

nasq_df = make_pretrain_dataset(directory=directory, market='NASDAQ')
# djia_df = make_pretrain_dataset(directory=directory, market='DJIA')

  result = func(self.values, **kwargs)


# of Null data:  0
# of Null data:  0
# of Null data:  0
# of Null data:  0
# of data:  5119
# of data:  5119
# of data:  5119
# of data:  5119
tickers:  ['AAPL', 'ADBE', 'ADI', 'ADP', 'AMAT', 'AMD', 'AMGN', 'AMZN', 'BKNG', 'COST', 'CSCO', 'CSX', 'FISV', 'GILD', 'HON', 'INTC', 'INTU', 'ISRG', 'LRCX', 'MSFT', 'MU', 'NFLX', 'ORLY', 'PEP', 'QCOM', 'REGN', 'SBUX', 'SNPS', 'TXN', 'VRTX']
AAPL
ADBE
ADI
ADP
AMAT
AMD
AMGN
AMZN
BKNG
COST
CSCO
CSX
FISV
GILD
HON
INTC
INTU
ISRG
LRCX
MSFT
MU
NFLX
ORLY
PEP
QCOM
REGN
SBUX
SNPS
TXN
VRTX


## 학습시 env 설정을 위한 jsonfile만들기

In [8]:
# JSON File도 만들어야 하는데. 그거는 좀이따
portfolio_info_dir = f'{BASE_DIR}/portfolio_info'
json_name = 'portfolio.json' #'diy_portfolio.json'

os.makedirs(portfolio_info_dir+f"/NASDAQ", exist_ok=True)
os.makedirs(portfolio_info_dir+f"/DJIA", exist_ok=True)

nasq_df = pd.read_csv(f"{DATA_DIR}/stockdata/label_data/NASDAQ/x/stock_2d_actor.csv", index_col='date', parse_dates=['date'])
djia_df = pd.read_csv(f"{DATA_DIR}/stockdata/label_data/DJIA/x/stock_2d_actor.csv", index_col='date', parse_dates=['date'])

In [9]:
    
def make_pfsetting_json(df, market='NASDAQ'):
    container = {}
    init_pf_dict = {"Bank_account": 100000} 
    cap_dict = {}

    # df의 marketcap 기준으로 Small, Mid, Big 계산
    # 우선은 평균으로 계산.. 이것도 바꿀 수 있음 좋지만..
    cap_df = df[[i for i in df.columns if 'Market' in i]]
    cap_df_desc = cap_df.describe().T
    # print(cap_df_desc)
    cap_df_desc = cap_df_desc.sort_values(by="mean", ascending=False)
    sorted_ticker_list = [i.split("_")[-1] for i in cap_df_desc.index] #마켓캡 크기순
    ticker_list = sorted(sorted_ticker_list) #abc순

    for i in range(len(sorted_ticker_list)):
        tic = sorted_ticker_list[i]
        abc_tic = ticker_list[i]
        init_pf_dict[abc_tic] = 0

        if i <= int(len(sorted_ticker_list)/3-1): cap_dict[tic] = 'Big'
        elif i > int(len(sorted_ticker_list)*2/3): cap_dict[tic] = 'Small'
        else : cap_dict[tic] = 'Mid'
    
    container['initial_portfolio'] = init_pf_dict
    container['market_cap'] = dict(sorted(cap_dict.items()))
    # print(container)

    # save
    portfolio_info_dir = f'{BASE_DIR}/portfolio_info'
    json_name = 'snp_portfolio_smb.json' #'diy_portfolio.json'
    with open(f'{portfolio_info_dir}/{market}/{json_name}','w') as f:
        json.dump(container, f, indent=4)
    return container



In [10]:
make_pfsetting_json(nasq_df, market='NASDAQ')
make_pfsetting_json(djia_df, market='DJIA')

{'initial_portfolio': {'Bank_account': 100000,
  'AAPL': 0,
  'AMGN': 0,
  'AXP': 0,
  'BA': 0,
  'CAT': 0,
  'CRM': 0,
  'CSCO': 0,
  'CVX': 0,
  'DIS': 0,
  'DOW': 0,
  'GS': 0,
  'HD': 0,
  'HON': 0,
  'IBM': 0,
  'INTC': 0,
  'JNJ': 0,
  'JPM': 0,
  'KO': 0,
  'MCD': 0,
  'MMM': 0,
  'MRK': 0,
  'MSFT': 0,
  'NKE': 0,
  'PG': 0,
  'TRV': 0,
  'UNH': 0,
  'V': 0,
  'VZ': 0,
  'WBA': 0,
  'WMT': 0},
 'market_cap': {'AAPL': 'Big',
  'AMGN': 'Mid',
  'AXP': 'Small',
  'BA': 'Mid',
  'CAT': 'Small',
  'CRM': 'Small',
  'CSCO': 'Mid',
  'CVX': 'Big',
  'DIS': 'Mid',
  'DOW': 'Small',
  'GS': 'Small',
  'HD': 'Mid',
  'HON': 'Small',
  'IBM': 'Mid',
  'INTC': 'Mid',
  'JNJ': 'Big',
  'JPM': 'Big',
  'KO': 'Big',
  'MCD': 'Mid',
  'MMM': 'Mid',
  'MRK': 'Mid',
  'MSFT': 'Big',
  'NKE': 'Small',
  'PG': 'Big',
  'TRV': 'Small',
  'UNH': 'Mid',
  'V': 'Big',
  'VZ': 'Big',
  'WBA': 'Small',
  'WMT': 'Big'}}

## Make sub datasets : DJIA 5, NASDAQ 5, 10