In [1]:
import os
import pickle
import sys
import warnings
from glob import glob
import re
import datetime
import itertools
from pyti.moving_average_convergence_divergence import moving_average_convergence_divergence as macd
from pyti.simple_moving_average import simple_moving_average as sma
from pyti.stochastic import percent_k as srv_k
from pyti.stochastic import percent_d as srv_d
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.dates import date2num, DayLocator, DateFormatter
#from mpl_finance import candlestick2_ohlc, volume_overlay
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import lightgbm
import talib

In [2]:
dataset_dir="../data_dir/"

In [3]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv",
    "stock_price": f"{dataset_dir}/stock_price.csv",
    "stock_fin": f"{dataset_dir}/stock_fin.csv",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv",
    "stock_fin_growth_rate": f"{dataset_dir}/stock_fin_growth_rate.csv",
    "train_X": f"{dataset_dir}/train_X.csv",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)
    # DataFrameのindexを設定します。
    if k == "stock_price":
        dfs[k].loc[:, "datetime"] = pd.to_datetime(
            dfs[k].loc[:, "EndOfDayQuote Date"]
        )
        dfs[k].set_index("datetime", inplace=True)
    elif k in ["stock_fin", "stock_fin_price", "stock_labels"]:
        dfs[k].loc[:, "datetime"] = pd.to_datetime(
            dfs[k].loc[:, "base_date"]
        )
        dfs[k].set_index("datetime", inplace=True)

stock_list
stock_price
stock_fin
stock_labels
stock_fin_growth_rate
train_X


In [63]:
def cross_X(x):
    return np.prod(x)

def get_features_for_predict(dfs, code, start_dt="2016-01-01"):
    """
    Args:
        dfs (dict)  : dict of pd.DataFrame include stock_fin, stock_price
        code (int)  : A local code for a listed company
        start_dt (str): specify date range
    Returns:
        feature DataFrame (pd.DataFrame)
    """
    # おおまかな手順の1つ目
    # stock_finデータを読み込み
    stock_fin = dfs["stock_fin"]
    periods = [10, 20, 40]
    # 特定の銘柄コードのデータに絞る
    stock_fin = stock_fin[stock_fin["Local Code"] == code]
    fin_data = stock_fin[~stock_fin.duplicated(subset=['Local Code', 'Result_FinancialStatement ReportType',"Result_FinancialStatement FiscalYear"],keep='last')]
    # 特徴量の作成には過去60営業日のデータを使用しているため、
    # 予測対象日からバッファ含めて土日を除く過去90日遡った時点から特徴量を生成します
    n = 90
    # 特徴量の生成対象期間を指定
    
    fin_data = fin_data.loc[pd.Timestamp(start_dt) - pd.offsets.BDay(n) :]
    seasons = stock_fin["Result_FinancialStatement ReportType"].unique()
    columns = fin_data.columns
    columns = columns.to_list()
    columns_list = ["Result_FinancialStatement NetSales","Result_FinancialStatement OrdinaryIncome","Result_FinancialStatement TotalAssets","Result_FinancialStatement NetAssets"]
    for column in columns_list:
        a = "last "+column
        print(a)
        print(type(columns))
        columns.append(a)
    df_result = pd.DataFrame(index=[], columns=columns)
    # columns_list.append("base_date")
    # columns_list.append("Local Code")
    for season in seasons:
        #df["last "+column] = 0
        #print(columns_list)
        df_test = fin_data[fin_data["Result_FinancialStatement ReportType"]==season].copy()
        for column in columns_list:
            #print(columns)
            df_test["last "+column] = df_test[column]
            df_test["last "+column] = df_test[column].shift()
            #df = pd.merge(df,df_test[["last "+column,"base_date","Local Code"]],on = ["base_date","Local Code"],how="left")
            #df_ab, df_ac, on='a', how='left'
        #print(df_result)
        #print(df_test)
        df_result = pd.concat([df_result,df_test])
    df_result["NetSales_growth_rate"] = df_result["Result_FinancialStatement NetSales"] / df_result["last Result_FinancialStatement NetSales"]
    df_result["OrdinaryIncome_growth_rate"] = df_result["Result_FinancialStatement OrdinaryIncome"] / df_result["last Result_FinancialStatement OrdinaryIncome"]
    df_result["TotalAssets_growth_rate"] = df_result["Result_FinancialStatement TotalAssets"] / df_result["last Result_FinancialStatement TotalAssets"]
    df_result["NetAssets_growth_rate"] = df_result["Result_FinancialStatement NetAssets"] / df_result["last Result_FinancialStatement NetAssets"]
    #df_result = df_result.drop(["EndOfDayQuote ExchangeOfficialClose","macd_hist_shift","stocas_hist_shift","stocas_huge_signal"], axis=1)
#     # fin_dataのnp.float64のデータのみを取得
#     fin_data = fin_data.select_dtypes(include=["float64"])
#     # 欠損値処理
#     fin_feats = fin_data.fillna(0)

    # おおまかな手順の2つ目
    # stock_priceデータを読み込む
    price = dfs["stock_price"]
    # 特定の銘柄コードのデータに絞る
    price_data = price[price["Local Code"] == code]
    # 終値のみに絞る
    feats = price_data[["EndOfDayQuote ExchangeOfficialClose"]]
    # 特徴量の生成対象期間を指定
    feats = feats.loc[pd.Timestamp(start_dt) - pd.offsets.BDay(n) :].copy()

    # 終値の20営業日リターン
    feats["return_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(20)
    # 終値の40営業日リターン
    feats["return_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(40)
    # 終値の60営業日リターン
    feats["return_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(60)
    # 終値の20営業日ボラティリティ
    feats["volatility_0.5month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(10).std()
    )
    # 終値の40営業日ボラティリティ
    feats["volatility_1month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(20).std()
    )
    # 終値の60営業日ボラティリティ
    feats["volatility_2month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(40).std()
    )
    
    for period in periods:
        col = "5 windows volatility  {} mean".format(period)
        feats[col] = feats["volatility_0.5month"].rolling(period).mean()
    
    # ヒストリカル・ボラティリティ移動平均
    for period in periods:
        col = "25 windows volatility  {} mean".format(period)
        feats[col] = feats["volatility_1month"].rolling(period).mean()
        
    # ヒストリカル・ボラティリティ移動平均
    for period in periods:
        col = "75 windows volatility  {} mean".format(period)
        feats[col] = feats["volatility_2month"].rolling(period).mean()

    # ヒストリカル・ボラティリティ移動平均微分値
    for period in periods:
        col = "5 windows volatility  {} mean diff".format(period)
        feats[col] = feats["volatility_0.5month"].rolling(10).mean().pct_change(period)

    # ヒストリカル・ボラティリティ移動平均微分値
    for period in periods:
        col = "25 windows volatility  {} mean diff".format(period)
        feats[col] = feats["volatility_1month"].rolling(20).mean().pct_change(period)

    # ヒストリカル・ボラティリティ移動平均微分値
    for period in periods:
        col = "75 windows volatility  {} mean diff".format(period)
        feats[col] = feats["volatility_2month"].rolling(30).mean().pct_change(period)
    
    macd_period = {'long' : 26, 'short' : 12}
    sma_period  = 9
    feats['macd'] = macd(feats['EndOfDayQuote ExchangeOfficialClose'].values.tolist(), 12, 26)
    feats['macd_signal'] = sma(feats['macd'].values.tolist(), sma_period)
    feats["macd_hist"] = feats["macd"] - feats["macd_signal"]
    feats["macd_hist_shift"] = feats["macd_hist"].shift()
    feats.loc[feats["macd_hist"] < 0,"macd_hist_signal"] = -1
    feats.loc[feats["macd_hist"] > 0,"macd_hist_signal"] = 1
    feats.loc[feats["macd_hist"] == 0,"macd_hist_signal"] = 0
    feats["macd_cross_signal"] = feats["macd_hist"]*feats["macd_hist_shift"]
    feats.loc[feats["macd_cross_signal"] <= 0, "macd_cross_signal"] = 0
    feats.loc[feats["macd_cross_signal"] > 0, "macd_cross_signal"] = 1
    feats["macd_cross_sign_20"] = (1-feats["macd_cross_signal"].rolling(20).apply(cross_X))*feats["macd_hist_signal"]
    feats["macd_cross_sign_10"] = (1-feats["macd_cross_signal"].rolling(10).apply(cross_X))*feats["macd_hist_signal"]
    feats["macd_cross_sign_5"] = (1-feats["macd_cross_signal"].rolling(5).apply(cross_X))*feats["macd_hist_signal"]
    #feats.loc[feats["macd_cross_sign"] > 0, "macd_cross_sign"] = 1
    mac_cols = ["macd","macd_signal","macd_hist"]
    mac_cross_cols = ["macd_cross_sign_20","macd_cross_sign_10"]
    feats["slow%k"] = srv_d(feats["EndOfDayQuote ExchangeOfficialClose"].values.tolist(), 14)*100
    feats["slow%d"] = feats["slow%k"].rolling(3).mean()
    feats["stocas_hist"] = feats["slow%k"] - feats["slow%d"]
    feats["stocas_hist_shift"] = feats["stocas_hist"].shift()
    feats["stocas_cross_signal"] = feats["stocas_hist"]*feats["stocas_hist_shift"]
    feats.loc[feats["stocas_cross_signal"] <= 0, "stocas_cross_signal"] = 0
    feats.loc[feats["stocas_cross_signal"] > 0, "stocas_cross_signal"] = 1
    feats.loc[feats["stocas_hist"] < 0,"stocas_hist_signal"] = -1
    feats.loc[feats["stocas_hist"] > 0,"stocas_hist_signal"] = 1
    feats.loc[feats["stocas_hist"] == 0,"stocas_hist_signal"] = 0
    feats["stocas_huge_signal"] = 0
    feats.loc[feats["slow%k"] <= 20,"stocas_huge_signal"] = 1
    feats.loc[feats["slow%k"] >= 80,"stocas_huge_signal"] = 1
    # feats["stocas_cross_sign_20"] = (1-feats["stocas_cross_signal"].rolling(20).apply(cross_X))*feats["stocas_hist_signal"]*feats["stocas_huge_signal"]
    # feats["stocas_cross_sign_10"] = (1-feats["stocas_cross_signal"].rolling(10).apply(cross_X))*feats["stocas_hist_signal"]*feats["stocas_huge_signal"]
    feats["stocas_cross_sign_5"] = (1-feats["stocas_cross_signal"].rolling(5).apply(cross_X))*feats["stocas_hist_signal"]*feats["stocas_huge_signal"]
    
    
    # おおまかな手順の3つ目
    # 欠損値処理
    #feats = feats.fillna(0)
    # 元データのカラムを削除
    feats = feats.drop(["EndOfDayQuote ExchangeOfficialClose","macd_hist_shift","stocas_hist_shift","stocas_huge_signal"], axis=1)

    #財務データの特徴量とマーケットデータの特徴量のインデックスを合わせる
    feats = feats.loc[feats.index.isin(df_result.index)]
    df_result = df_result.loc[df_result.index.isin(feats.index)]
    print(df_result.head())
    print(feats.head())
    #feats = pd.concat([feats, df_result], axis=1).dropna()
    # データを結合
    #print(feats.index)
    feats = pd.merge(df_result,feats,left_index= True,right_index = True ,how = "left")
    

    # 欠損値処理を行います。
    feats = feats.replace([np.inf, -np.inf], 0)

    # 銘柄コードを設定
    feats["code"] = code


    return feats

In [64]:
df = get_features_for_predict(dfs, 9984)
df.T

last Result_FinancialStatement NetSales
<class 'list'>
last Result_FinancialStatement OrdinaryIncome
<class 'list'>
last Result_FinancialStatement TotalAssets
<class 'list'>
last Result_FinancialStatement NetAssets
<class 'list'>
             base_date Local Code  \
2016-04-21  2016/04/21       9984   
2017-02-08  2017/02/08       9984   
2018-02-07  2018/02/07       9984   
2019-02-06  2019/02/06       9984   
2020-04-30  2020/04/30       9984   

           Result_FinancialStatement AccountingStandard  \
2016-04-21                             ConsolidatedIFRS   
2017-02-08                             ConsolidatedIFRS   
2018-02-07                             ConsolidatedIFRS   
2019-02-06                             ConsolidatedIFRS   
2020-04-30                             ConsolidatedIFRS   

           Result_FinancialStatement FiscalPeriodEnd  \
2016-04-21                                   2015/12   
2017-02-08                                   2016/12   
2018-02-07              



Unnamed: 0,2016-04-21 00:00:00,2017-02-08 00:00:00,2018-02-07 00:00:00,2019-02-06 00:00:00,2020-04-30 00:00:00,2016-05-10 00:00:00,2017-05-10 00:00:00,2018-05-09 00:00:00,2019-05-09 00:00:00,2020-05-18 00:00:00,2016-07-28 00:00:00,2017-08-07 00:00:00,2018-08-06 00:00:00,2019-08-07 00:00:00,2020-10-23 00:00:00,2016-11-07 00:00:00,2017-11-06 00:00:00,2018-11-05 00:00:00,2019-11-06 00:00:00,2020-11-09 00:00:00
base_date,2016/04/21,2017/02/08,2018/02/07,2019/02/06,2020/04/30,2016/05/10,2017/05/10,2018/05/09,2019/05/09,2020/05/18,2016/07/28,2017/08/07,2018/08/06,2019/08/07,2020/10/23,2016/11/07,2017/11/06,2018/11/05,2019/11/06,2020/11/09
Local Code,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984,9984
Result_FinancialStatement AccountingStandard,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS,ConsolidatedIFRS
Result_FinancialStatement FiscalPeriodEnd,2015/12,2016/12,2017/12,2018/12,2019/12,2016/03,2017/03,2018/03,2019/03,2020/03,2016/06,2017/06,2018/06,2019/06,2020/06,2016/09,2017/09,2018/09,2019/09,2020/09
Result_FinancialStatement ReportType,Q3,Q3,Q3,Q3,Q3,Annual,Annual,Annual,Annual,Annual,Q1,Q1,Q1,Q1,Q1,Q2,Q2,Q2,Q2,Q2
Result_FinancialStatement FiscalYear,2016,2017,2018,2019,2020,2016,2017,2018,2019,2020,2017,2018,2019,2020,2021,2017,2018,2019,2020,2021
Result_FinancialStatement ModifyDate,2016/02/10,2017/02/08,2018/02/07,2019/02/06,2020/02/12,2016/05/10,2017/05/10,2018/05/09,2019/05/09,2020/05/18,2016/07/28,2017/08/07,2018/08/06,2019/08/07,2020/08/11,2016/11/07,2017/11/06,2018/11/05,2019/11/06,2020/11/09
Result_FinancialStatement CompanyType,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB,GB
Result_FinancialStatement ChangeOfFiscalYearEnd,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Result_FinancialStatement NetSales,6.81021e+06,6.58147e+06,6.81127e+06,7.16845e+06,7.0898e+06,9.15355e+06,8.901e+06,9.15876e+06,9.60224e+06,6.18509e+06,2.12652e+06,2.18606e+06,2.27278e+06,2.3364e+06,1.45006e+06,4.27183e+06,4.41114e+06,4.65385e+06,4.65172e+06,2.63053e+06
