In [1]:
import os
import pickle
import sys
import warnings

import numpy as np
import pandas as pd
import pandas.tseries.offsets as offsets
from pandas.tseries.holiday import *
from pandas.tseries.offsets import CustomBusinessDay

import seaborn as sns
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold

import datetime as dt
from datetime import timedelta

import lightgbm as lgb
#import optuna.integration.lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

# 表示用の設定変更
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [130]:
#祝日カレンダーのクラスJpCalendar定義

class JpCalendar(AbstractHolidayCalendar):
    rules = [
    Holiday('1231', month=12, day=31),
    Holiday('0101', month=1, day=1),
    Holiday('0102', month=1, day=2),
    Holiday('0103', month=1, day=3),        
    Holiday('成人の日', year =2016, month=1, day=11),
    Holiday('建国記念の日', year =2016, month=2, day=11),
    Holiday('春分の日', year =2016, month=3, day=20),
    Holiday('休日', year =2016, month=3, day=21),
    Holiday('昭和の日', year =2016, month=4, day=29),
    Holiday('憲法記念日', year =2016, month=5, day=3),
    Holiday('みどりの日', year =2016, month=5, day=4),
    Holiday('こどもの日', year =2016, month=5, day=5),
    Holiday('海の日', year =2016, month=7, day=18),
    Holiday('山の日', year =2016, month=8, day=11),
    Holiday('敬老の日', year =2016, month=9, day=19),
    Holiday('秋分の日', year =2016, month=9, day=22),
    Holiday('体育の日', year =2016, month=10, day=10),
    Holiday('文化の日', year =2016, month=11, day=3),
    Holiday('勤労感謝の日', year =2016, month=11, day=23),
    Holiday('天皇誕生日', year =2016, month=12, day=23),
    Holiday('休日', year =2017, month=1, day=2),
    Holiday('成人の日', year =2017, month=1, day=9),
    Holiday('建国記念の日', year =2017, month=2, day=11),
    Holiday('春分の日', year =2017, month=3, day=20),
    Holiday('昭和の日', year =2017, month=4, day=29),
    Holiday('憲法記念日', year =2017, month=5, day=3),
    Holiday('みどりの日', year =2017, month=5, day=4),
    Holiday('こどもの日', year =2017, month=5, day=5),
    Holiday('海の日', year =2017, month=7, day=17),
    Holiday('山の日', year =2017, month=8, day=11),
    Holiday('敬老の日', year =2017, month=9, day=18),
    Holiday('秋分の日', year =2017, month=9, day=23),
    Holiday('体育の日', year =2017, month=10, day=9),
    Holiday('文化の日', year =2017, month=11, day=3),
    Holiday('勤労感謝の日', year =2017, month=11, day=23),
    Holiday('天皇誕生日', year =2017, month=12, day=23),
    Holiday('成人の日', year =2018, month=1, day=8),
    Holiday('建国記念の日', year =2018, month=2, day=11),
    Holiday('休日', year =2018, month=2, day=12),
    Holiday('春分の日', year =2018, month=3, day=21),
    Holiday('昭和の日', year =2018, month=4, day=29),
    Holiday('休日', year =2018, month=4, day=30),
    Holiday('憲法記念日', year =2018, month=5, day=3),
    Holiday('みどりの日', year =2018, month=5, day=4),
    Holiday('こどもの日', year =2018, month=5, day=5),
    Holiday('海の日', year =2018, month=7, day=16),
    Holiday('山の日', year =2018, month=8, day=11),
    Holiday('敬老の日', year =2018, month=9, day=17),
    Holiday('秋分の日', year =2018, month=9, day=23),
    Holiday('休日', year =2018, month=9, day=24),
    Holiday('体育の日', year =2018, month=10, day=8),
    Holiday('文化の日', year =2018, month=11, day=3),
    Holiday('勤労感謝の日', year =2018, month=11, day=23),
    Holiday('天皇誕生日', year =2018, month=12, day=23),
    Holiday('休日', year =2018, month=12, day=24),
    Holiday('成人の日', year =2019, month=1, day=14),
    Holiday('建国記念の日', year =2019, month=2, day=11),
    Holiday('春分の日', year =2019, month=3, day=21),
    Holiday('昭和の日', year =2019, month=4, day=29),
    Holiday('休日', year =2019, month=4, day=30),
    Holiday('休日（祝日扱い）', year =2019, month=5, day=1),
    Holiday('休日', year =2019, month=5, day=2),
    Holiday('憲法記念日', year =2019, month=5, day=3),
    Holiday('みどりの日', year =2019, month=5, day=4),
    Holiday('こどもの日', year =2019, month=5, day=5),
    Holiday('休日', year =2019, month=5, day=6),
    Holiday('海の日', year =2019, month=7, day=15),
    Holiday('山の日', year =2019, month=8, day=11),
    Holiday('休日', year =2019, month=8, day=12),
    Holiday('敬老の日', year =2019, month=9, day=16),
    Holiday('秋分の日', year =2019, month=9, day=23),
    Holiday('体育の日（スポーツの日）', year =2019, month=10, day=14),
    Holiday('休日（祝日扱い）', year =2019, month=10, day=22),
    Holiday('文化の日', year =2019, month=11, day=3),
    Holiday('休日', year =2019, month=11, day=4),
    Holiday('勤労感謝の日', year =2019, month=11, day=23),
    Holiday('成人の日', year =2020, month=1, day=13),
    Holiday('建国記念の日', year =2020, month=2, day=11),
    Holiday('天皇誕生日', year =2020, month=2, day=23),
    Holiday('休日', year =2020, month=2, day=24),
    Holiday('春分の日', year =2020, month=3, day=20),
    Holiday('昭和の日', year =2020, month=4, day=29),
    Holiday('憲法記念日', year =2020, month=5, day=3),
    Holiday('みどりの日', year =2020, month=5, day=4),
    Holiday('こどもの日', year =2020, month=5, day=5),
    Holiday('休日', year =2020, month=5, day=6),
    Holiday('海の日', year =2020, month=7, day=23),
    Holiday('スポーツの日', year =2020, month=7, day=24),
    Holiday('山の日', year =2020, month=8, day=10),
    Holiday('敬老の日', year =2020, month=9, day=21),
    Holiday('秋分の日', year =2020, month=9, day=22),
    Holiday('文化の日', year =2020, month=11, day=3),
    Holiday('勤労感謝の日', year =2020, month=11, day=23),
    Holiday('成人の日', year =2021, month=1, day=11),
    Holiday('建国記念の日', year =2021, month=2, day=11),
    Holiday('天皇誕生日', year =2021, month=2, day=23),
    Holiday('春分の日', year =2021, month=3, day=20),
    Holiday('昭和の日', year =2021, month=4, day=29),
    Holiday('憲法記念日', year =2021, month=5, day=3),
    Holiday('みどりの日', year =2021, month=5, day=4),
    Holiday('こどもの日', year =2021, month=5, day=5),
    Holiday('海の日', year =2021, month=7, day=22),
    Holiday('スポーツの日', year =2021, month=7, day=23),
    Holiday('山の日', year =2021, month=8, day=8),
    Holiday('休日', year =2021, month=8, day=9),
    Holiday('敬老の日', year =2021, month=9, day=20),
    Holiday('秋分の日', year =2021, month=9, day=23),
    Holiday('文化の日', year =2021, month=11, day=3),
    Holiday('勤労感謝の日', year =2021, month=11, day=23),
   ]

In [131]:
#変数定義
dataset_dir="./data"
mil = 1000000

#祝日カレンダーのクラス生成
tse = JpCalendar()

In [132]:
#スピアマン順位相関係数の計算関数3つ（numrank, spearman, eval_spearman)定義

def numrank(a):
    u, inv, counts = np.unique(a, return_inverse=True, return_counts=True)
    uniqueRankNd = np.array(np.hstack((0, counts[:-1].cumsum())), dtype='float32')
    uniqueRankNd = (counts == 1) * uniqueRankNd + (counts != 1) * (2 * uniqueRankNd + counts - 1) / counts
    priceRankNd = np.ones_like(inv) * inv.shape[0] - uniqueRankNd[inv]

    return priceRankNd

def spearman(y_true, y_pred):

    y1 = numrank(y_true)
    y2 = numrank(y_pred)
    correlation, pvalue = spearmanr(y1, y2)
    return correlation


def eval_spearman(preds, data):

    y_true = data.get_label()
    return 'sp', spearman(y_true, preds), True
    

前処理

In [133]:
#学習対象期間と評価対象期間の設定
dates_tr = "2017-01-01"
datee_tr = "2019-12-31"
dates_vl = "2020-01-01"
datee_vl = "2020-11-30"


In [134]:
#各種ファイルのインポート
m_conf = pd.read_csv(f"{dataset_dir}/m_forecast_confidence.csv")
m_qs = pd.read_csv(f"{dataset_dir}/m_qsales.csv")
m_soyaku = pd.read_csv(f"{dataset_dir}/m_soyaku.csv")
m_yutai = pd.read_csv(f"{dataset_dir}/m_yutai.csv")

In [135]:
sl_bk = pd.read_csv(f"{dataset_dir}/stock_list.csv.gz")
sp_bk = pd.read_csv(f"{dataset_dir}/stock_price.csv.gz")
sf_bk = pd.read_csv(f"{dataset_dir}/stock_fin.csv.gz")
slb_bk = pd.read_csv(f"{dataset_dir}/stock_labels.csv.gz")

In [136]:
sl = sl_bk.copy()
sp = sp_bk.copy()
sf = sf_bk.copy()
slb = slb_bk.copy()

In [137]:
#項目名のリネーム
sl = sl.rename(columns={
                        "prediction_target": "target", 
                        "Local Code": "code",
                        "Name (English)": "name",
                        "Section/Products": "section",
                        "Size Code (New Index Series)": "sizecode",
                        "17 Sector(Code)": "17sec",
                        "33 Sector(Code)": "33sec",
                        "IssuedShareEquityQuote IssuedShare": "issued",
                       })

sp = sp.rename(columns={
                        "Local Code": "code",
                        "EndOfDayQuote Date": "date",
                        "EndOfDayQuote Open" : "open",
                        "EndOfDayQuote High" : "high",
                        "EndOfDayQuote Low" : "low",
                        "EndOfDayQuote Close" : "close",
                        "EndOfDayQuote ExchangeOfficialClose": "eclose",
                        "EndOfDayQuote Volume": "vol",
                        "EndOfDayQuote CumulativeAdjustmentFactor": "adfac",
                       })

sf = sf.rename(columns={
                        "base_date": "date",
                        "Local Code": "code",
                        "Result_FinancialStatement AccountingStandard": "accstd",
                        "Result_FinancialStatement FiscalPeriodEnd": "period",
                        "Result_FinancialStatement ReportType": "rtype",
                        "Result_FinancialStatement FiscalYear": "fyear",
                        "Result_FinancialStatement ModifyDate": "moddate",
                        "Result_FinancialStatement CompanyType": "ctype",
                        "Result_FinancialStatement ChangeOfFiscalYearEnd": "FYch",
                        "Result_FinancialStatement NetSales": "sales",
                        "Result_FinancialStatement OperatingIncome": "opein",
                        "Result_FinancialStatement OrdinaryIncome": "ordin",
                        "Result_FinancialStatement NetIncome": "netin",
                        "Result_FinancialStatement TotalAssets": "tasset",
                        "Result_FinancialStatement NetAssets": "nasset",
                        "Result_FinancialStatement CashFlowsFromOperatingActivities": "opecf",
                        "Result_FinancialStatement CashFlowsFromFinancingActivities": "fincf",
                        "Result_FinancialStatement CashFlowsFromInvestingActivities": "invcf",
                        "Forecast_FinancialStatement AccountingStandard": "accstd_f",
                        "Forecast_FinancialStatement FiscalPeriodEnd": "period_f",
                        "Forecast_FinancialStatement ReportType": "rtype_f",
                        "Forecast_FinancialStatement FiscalYear": "fyear_f",
                        "Forecast_FinancialStatement ModifyDate": "moddate_f",
                        "Forecast_FinancialStatement CompanyType": "ctype_f",
                        "Forecast_FinancialStatement ChangeOfFiscalYearEnd": "FYch_f",
                        "Forecast_FinancialStatement NetSales": "sales_f",
                        "Forecast_FinancialStatement OperatingIncome": "opein_f",
                        "Forecast_FinancialStatement OrdinaryIncome": "ordin_f",
                        "Forecast_FinancialStatement NetIncome": "netin_f",
                        "Result_Dividend FiscalPeriodEnd": "divperiod",
                        "Result_Dividend ReportType": "div_rtype",
                        "Result_Dividend ModifyDate": "divmoddate",
                        "Result_Dividend RecordDate": "divdt",
                        "Result_Dividend QuarterlyDividendPerShare": "qdiv",
                        "Result_Dividend AnnualDividendPerShare": "adiv",
                        "Forecast_Dividend FiscalPeriodEnd": "divperiod_f",
                        "Forecast_Dividend ReportType": "div_rtype_f",
                        "Forecast_Dividend ModifyDate": "divmoddate_f",
                        "Forecast_Dividend RecordDate": "divdt_f",
                        "Forecast_Dividend QuarterlyDividendPerShare": "qdiv_f",
                        "Forecast_Dividend AnnualDividendPerShare": "adiv_f",
                       })

#日付型に変換
sp.loc[:, "date"] = pd.to_datetime(sp.loc[:, "date"])
sf.loc[:, "date"] = pd.to_datetime(sf.loc[:, "date"])
sf.loc[:, "divdt"] = pd.to_datetime(sf.loc[:, "divdt"])
sf.loc[:, "divdt_f"] = pd.to_datetime(sf.loc[:, "divdt_f"])

#東証システム障害の日除外
sp = sp[sp["date"]!="2020-10-01"]

sl = pd.merge(sl,m_soyaku, on=["code"], how="left")
sl.loc[sl["soyaku"].isnull(), "soyaku"] = 0

sf.loc[sf["divdt_f"].isnull(), "divdt_f"] = pd.to_datetime(sf["divperiod_f"].str[:4] + "-" + sf["divperiod_f"].str[5:7] + "-" + "01") + offsets.MonthEnd()

sf = sf.sort_values(['code', 'date'])
sp = sp.sort_values(['code', 'date'])


In [138]:
#項目名のリネーム
slb = slb.rename(columns={
                        "base_date": "date",
                        "Local Code": "code",
                       })
#日付型に変換
slb.loc[:, "date"] = pd.to_datetime(slb.loc[:, "date"])
slb.loc[:, "label_date_20"] = pd.to_datetime(slb.loc[:, "label_date_20"])


In [139]:
#Predictでは不要
#20日以内修正開示があった場合に、オリジナルを除外
sf = sf[~((sf["code"]==sf.shift(-1)["code"]) & (sf["date"] + dt.timedelta(days = 20) >= sf.shift(-1)["date"]))]


In [140]:
#決算数値を年間数値に換算

sf = pd.merge(sf,m_qs, on=["code"], how="left")

sfan = sf[["code","period","rtype"]].drop_duplicates()
sfan = sfan[sfan["rtype"]=="Annual"]

sfan["period"] = np.where(sfan["period"].isnull(), "0000/00", sfan["period"])
sfan["mondiff"] = np.nan
sfan["mondiff"][1:] = np.where((sfan["period"][1:].isnull()) | (sfan.shift(1)["period"][1:].isnull()) | (sfan["code"][1:]!=sfan.shift(1)["code"][1:]), np.nan, 
                         sfan["period"][1:].str[:4].astype(int)*12 + sfan["period"][1:].str[5:7].astype(int)
                         - (sfan.shift(1)["period"][1:].str[:4].astype(int)*12 + sfan.shift(1)["period"][1:].str[5:7].astype(int)))
sf["period"] = np.where(sf["period"]=="0000/00", np.nan, sf["period"])
sf = pd.merge(sf,sfan, on=["code","period","rtype"], how="left")

sf["qsfac"] = np.where(sf["FYch"] == 1, sf["mondiff"] / 12, 1)
sf["qsfac"] = np.where(sf["rtype"]=="Q1", sf["qs1"], sf["qsfac"])
sf["qsfac"] = np.where(sf["rtype"]=="Q2", sf["qs2"], sf["qsfac"])
sf["qsfac"] = np.where(sf["rtype"]=="Q3", sf["qs3"], sf["qsfac"])

sfan = sf[["code","period_f","rtype_f"]].drop_duplicates()
sfan = sfan[sfan["rtype_f"]=="Annual"]

sfan["period_f"] = np.where(sfan["period_f"].isnull(), "0000/00", sfan["period_f"])
sfan["mondiff_f"] = np.nan
sfan["mondiff_f"][1:] = np.where((sfan["period_f"][1:].isnull()) | (sfan.shift(1)["period_f"][1:].isnull()) | (sfan["code"][1:]!=sfan.shift(1)["code"][1:]), np.nan, 
                         sfan["period_f"][1:].str[:4].astype(int)*12 + sfan["period_f"][1:].str[5:7].astype(int)
                         - (sfan.shift(1)["period_f"][1:].str[:4].astype(int)*12 + sfan.shift(1)["period_f"][1:].str[5:7].astype(int)))
sf["period_f"] = np.where(sf["period_f"]=="0000/00", np.nan, sf["period_f"])
sf = pd.merge(sf,sfan, on=["code","period_f","rtype_f"], how="left")


sf["qsfac_f"] = np.where(sf["FYch_f"] == 1, sf["mondiff_f"] / 12, 1)
sf["qsfac_f"] = np.where(sf["rtype_f"]=="Q1", sf["qs1"], sf["qsfac_f"])
sf["qsfac_f"] = np.where(sf["rtype_f"]=="Q2", sf["qs2"], sf["qsfac_f"])
sf["qsfac_f"] = np.where(sf["rtype_f"]=="Q3", sf["qs3"], sf["qsfac_f"])


sf["noresult"] = np.where(sf["sales"].isnull() & sf["opein"].isnull() & sf["ordin"].isnull() & sf["netin"].isnull(), 1, 0)
sf["nofrct"] = np.where(sf["sales_f"].isnull() & sf["opein_f"].isnull() & sf["ordin_f"].isnull() & sf["netin_f"].isnull(), 1, 0)

sf["nofrct"] = np.where((sf["code"] == sf.shift(1)["code"]) 
         & (sf["accstd_f"] == sf.shift(1)["accstd_f"]) & (sf["period_f"] == sf.shift(1)["period_f"])
         & (sf["sales_f"] == sf.shift(1)["sales_f"]) & (sf["opein_f"] == sf.shift(1)["opein_f"])
         & (sf["ordin_f"] == sf.shift(1)["ordin_f"]) & (sf["netin_f"] == sf.shift(1)["netin_f"]), 
                        1, sf["nofrct"])

sf["sales_aad"] = sf["sales"] * 1 / sf["qsfac"]
sf["sales_aad_f"] = sf["sales_f"] * 1 / sf["qsfac_f"]
sf["opein_aad"] = sf["opein"] * 1 / sf["qsfac"]
sf["opein_aad_f"] = sf["opein_f"] * 1 / sf["qsfac_f"]
sf["ordin_aad"] = sf["ordin"] * 1 / sf["qsfac"]
sf["ordin_aad_f"] = sf["ordin_f"] * 1 / sf["qsfac_f"]
sf["netin_aad"] = sf["netin"] * 1 / sf["qsfac"]
sf["netin_aad_f"] = sf["netin_f"] * 1 / sf["qsfac_f"]

sf["sales_aad_t"] = np.where(sf["sales_aad_f"].isnull()==False, sf["sales_aad_f"], np.where(sf["nofrct"]==0, np.nan, sf["sales_aad"]))
sf["opein_aad_t"] = np.where(sf["opein_aad_f"].isnull()==False, sf["opein_aad_f"], np.where(sf["nofrct"]==0, np.nan, sf["opein_aad"]))
sf["ordin_aad_t"] = np.where(sf["ordin_aad_f"].isnull()==False, sf["ordin_aad_f"], np.where(sf["nofrct"]==0, np.nan, sf["ordin_aad"]))
sf["netin_aad_t"] = np.where(sf["netin_aad_f"].isnull()==False, sf["netin_aad_f"], np.where(sf["nofrct"]==0, np.nan, sf["netin_aad"]))

sf["accstd_t"] = np.where(sf["nofrct"]==0, sf["accstd_f"], sf["accstd"])
sf["fyear_t"] = np.where(sf["nofrct"]==0, sf["fyear_f"], sf["fyear"])

sf["opecf_aad"] = sf["opecf"] * 1 / sf["qsfac"]
sf["fincf_aad"] = sf["fincf"] * 1 / sf["qsfac"]
sf["invcf_aad"] = sf["invcf"] * 1 / sf["qsfac"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [141]:
#四半期配当をみなし年換算する関数div_aad定義
def div_aad(rtype, fig):
    return np.where(rtype == "Q2", fig * 1.5, np.where((rtype == "Q1") | (rtype == "Q3"), fig*2.5, fig))

#四半期配当を年間配当にみなし換算
sf_divdt = sf[["code", "divdt"]].drop_duplicates().copy()
sf_divdt_f = sf[["code", "divdt_f"]].drop_duplicates().copy()
sf_divdt_f = sf_divdt_f.rename(columns={"divdt_f": "divdt"})
sf_divdt = pd.concat([sf_divdt, sf_divdt_f]).drop_duplicates()
sf_divdt = sf_divdt[sf_divdt["divdt"].isnull()==False]

sp_adfac = sp[["code", "date","adfac"]].drop_duplicates().copy()
sp_adfac["adfac"] = sp.groupby("code").shift(3)["adfac"]
sp_adfac = sp_adfac.rename(columns={"date": "divdt", "adfac": "adfac_divdt"})

sp_adfac = pd.concat([sp_adfac, sf_divdt])

sp_adfac = sp_adfac.sort_values(['code', 'divdt'])
sp_adfac["adfac_divdt"] = sp_adfac["adfac_divdt"].fillna(method='ffill')

sp_adfac = sp_adfac.drop_duplicates()
sf = pd.merge(sf, sp_adfac, on=['code', 'divdt'], how="left")

sp_adfac = sp_adfac.rename(columns={"divdt": "divdt_f", "adfac_divdt": "adfac_divdt_f"})
sf = pd.merge(sf, sp_adfac, on=['code', 'divdt_f'], how="left")

sf["qdiv"] = sf["qdiv"] / sf["adfac_divdt"]
sf["adiv"] = sf["adiv"] / sf["adfac_divdt"]

sf["qdiv_f"] = sf["qdiv_f"] / sf["adfac_divdt_f"]
sf["adiv_f"] = sf["adiv_f"] / sf["adfac_divdt_f"]

sf["qdiv_aad"] =  np.where(sf["div_rtype"] == "Annual", sf["adiv"], div_aad(sf["rtype"], sf["qdiv"]))
sf["qdiv_aad_f"] =  np.where(sf["div_rtype_f"] == "Annual", sf["adiv_f"], div_aad(sf["rtype_f"], sf["qdiv_f"]))
sf["qdiv_aad_t"] = np.where(sf["qdiv_aad_f"].isnull(), sf["qdiv_aad"], sf["qdiv_aad_f"])


In [142]:
#銘柄の権利落ち日を特定
sf["divkenridt"] = np.where(sf["divdt"] >= "2019-07-18", sf["divdt"] + offsets.CustomBusinessDay(-2, calendar=tse),sf["divdt"] + offsets.CustomBusinessDay(-3, calendar=tse))
sf["divkenridt_f"] = np.where(sf["divdt_f"] >= "2019-07-18", sf["divdt_f"] + offsets.CustomBusinessDay(-2, calendar=tse),sf["divdt_f"] + offsets.CustomBusinessDay(-3, calendar=tse))

sf["kenri_f"] = np.where(
             (sf["divkenridt_f"] <= sf["date"] + offsets.CustomBusinessDay(30, calendar=tse))
                &(sf["divkenridt_f"] > sf["date"] + offsets.CustomBusinessDay(0, calendar=tse)), 1, 0)

sf["kenriochi_f"] = np.where(
             (sf["divkenridt_f"] + offsets.CustomBusinessDay(1, calendar=tse) <= sf["date"] + offsets.CustomBusinessDay(20, calendar=tse))
                & (sf["divkenridt_f"] + offsets.CustomBusinessDay(1, calendar=tse) > sf["date"] ), 1, 0)

sf["post_kenriochi_f"] = np.where(
            ( (sf["divkenridt_f"] + offsets.CustomBusinessDay(1, calendar=tse) <= sf["date"])
                & (sf["divkenridt_f"] + offsets.CustomBusinessDay(20, calendar=tse) > sf["date"]))
            | ((sf["divkenridt"] + offsets.CustomBusinessDay(1, calendar=tse) <= sf["date"])
                & (sf["divkenridt"] + offsets.CustomBusinessDay(20, calendar=tse) > sf["date"])), 1, 0)




In [143]:
#x営業日日前からの株価変化率を算出
sp = sp.sort_values(['code', 'date'])

sp["eclose"] = sp["eclose"].replace(0, np.nan).interpolate()

sp["eclose_ch_1"] = sp.groupby("code")["eclose"].pct_change(1)
sp["eclose_ch_3"] = sp.groupby("code")["eclose"].pct_change(3)
sp["eclose_ch_5"] = sp.groupby("code")["eclose"].pct_change(5)
sp["eclose_ch_10"] = sp.groupby("code")["eclose"].pct_change(10)
sp["eclose_ch_20"] = sp.groupby("code")["eclose"].pct_change(20)
sp["eclose_ch_40"] = sp.groupby("code")["eclose"].pct_change(40)
sp["eclose_ch_60"] = sp.groupby("code")["eclose"].pct_change(60)
sp["eclose_ch_120"] = sp.groupby("code")["eclose"].pct_change(120)

sp["eclose_ch_mid"] = sp.groupby("code")["eclose"].pct_change(240)

#120営業日前からの出来高変化率の対数を算出
sp["vol_ch"] = np.log(1.01 + np.where((sp.groupby("code").shift(120)["vol"]==0) | (sp.groupby("code").shift(120)["vol"].isnull()), sp["vol"]/100, sp.groupby("code")["vol"].pct_change(120)))

#終値がnull値のレコードを除外
sp = sp[sp["eclose"].isnull()==False]

#日ごと銘柄ごとのボラティリティを算出
sp["vola"] = (sp["high"] - sp["low"])/ sp["eclose"]


In [144]:
#学習期間における銘柄ごとのボラティリティを算出
sp_code_max = sp[(sp["date"]>=dates_tr)&(sp["date"]<=datee_tr)][["code","eclose"]].groupby("code").max().reset_index()
sp_code_min = sp[(sp["date"]>=dates_tr)&(sp["date"]<=datee_tr)][["code","eclose"]].groupby("code").min().reset_index()
sp_code_median = sp[(sp["date"]>=dates_tr)&(sp["date"]<=datee_tr)][["code","eclose"]].groupby("code").median().reset_index()
sp_code_max = sp_code_max.rename(columns={"eclose": "eclose_max"})
sp_code_min = sp_code_min.rename(columns={"eclose": "eclose_min"})
sp_code_median = sp_code_median.rename(columns={"eclose": "eclose_median"})
sp_code = pd.merge(sp_code_max, sp_code_min, on=["code"], how="left")
sp_code = pd.merge(sp_code, sp_code_median, on=["code"], how="left")
sp_code["vola_code"] = (sp_code["eclose_max"] - sp_code["eclose_min"])/sp_code["eclose_median"] 
sp = pd.merge(sp, sp_code, on=["code"], how="left")

In [145]:
#相場平均の株価変化率を算出
sp_macro = sp[["date","eclose_ch_1","eclose_ch_3","eclose_ch_10"]].groupby("date").mean()
sp_macro = sp_macro.rename(columns={"eclose_ch_1": "eclose_ch_1_macro", "eclose_ch_3": "eclose_ch_3_macro", "eclose_ch_10": "eclose_ch_10_macro"})
sp = pd.merge(sp, sp_macro, on=["date"], how="left")


In [146]:
#激しい下落があった個別銘柄・日付の判定
sp["sellmax_code"] = 0
sp.loc[(sp["eclose_ch_10"]>-0.03)&(sp["eclose_ch_1"]<-0.03)&(sp.groupby("code").shift(1)["eclose_ch_1"]<-0.03),"sellmax_code"] = 1
#激しい相場下落があった日付の判定
sp["sellmax_macro"] = 0
sp.loc[(sp["eclose_ch_10_macro"]>-0.08)&(sp["eclose_ch_1_macro"]<-0.025)&(sp["eclose_ch_3_macro"]<-0.03),"sellmax_macro"] = 1


In [147]:
#ストップ高・安の判定
sp["eclose_ori"] = sp["eclose"] * sp["adfac"]

sp["eclose_tbc_pre_1"] = sp.groupby("code").shift(1)["eclose_ori"]/sp.groupby("code").shift(1)["adfac"]*sp["adfac"]
sp["eclose_diff_1"] = np.round(sp["eclose_ori"] - sp["eclose_tbc_pre_1"])

#値幅制限が変更になる場合があるので、>=のほうがよい
sp["stop"] = 0
sp.loc[(sp["eclose_tbc_pre_1"]<100)&(sp["eclose_diff_1"]>=30), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<200)&(sp["eclose_diff_1"]>=50), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<500)&(sp["eclose_diff_1"]>=80), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<700)&(sp["eclose_diff_1"]>=100), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<1000)&(sp["eclose_diff_1"]>=150), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<1500)&(sp["eclose_diff_1"]>=300), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<2000)&(sp["eclose_diff_1"]>=400), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<3000)&(sp["eclose_diff_1"]>=500), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<5000)&(sp["eclose_diff_1"]>=700), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<7000)&(sp["eclose_diff_1"]>=1000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<10000)&(sp["eclose_diff_1"]>=1500), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<15000)&(sp["eclose_diff_1"]>=3000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<20000)&(sp["eclose_diff_1"]>=4000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<30000)&(sp["eclose_diff_1"]>=5000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<50000)&(sp["eclose_diff_1"]>=7000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<70000)&(sp["eclose_diff_1"]>=10000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<100000)&(sp["eclose_diff_1"]>=15000), "stop"] = 1
sp.loc[(sp["eclose_tbc_pre_1"]<100)&(sp["eclose_diff_1"]<=-30), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<200)&(sp["eclose_diff_1"]<=-50), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<500)&(sp["eclose_diff_1"]<=-80), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<700)&(sp["eclose_diff_1"]<=-100), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<1000)&(sp["eclose_diff_1"]<=-150), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<1500)&(sp["eclose_diff_1"]<=-300), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<2000)&(sp["eclose_diff_1"]<=-400), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<3000)&(sp["eclose_diff_1"]<=-500), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<5000)&(sp["eclose_diff_1"]<=-700), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<7000)&(sp["eclose_diff_1"]<=-1000), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<10000)&(sp["eclose_diff_1"]<=-1500), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<15000)&(sp["eclose_diff_1"]<=-3000), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<20000)&(sp["eclose_diff_1"]<=-4000), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<30000)&(sp["eclose_diff_1"]<=-5000), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<50000)&(sp["eclose_diff_1"]<=-7000), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<70000)&(sp["eclose_diff_1"]<=-10000), "stop"] = -1
sp.loc[(sp["eclose_tbc_pre_1"]<100000)&(sp["eclose_diff_1"]<=-15000), "stop"] = -1


In [148]:
#窓開きのチャート判定
sp["window"] = np.where((sp.groupby("code").shift(1)["high"]*1.06<sp["low"])|(sp.groupby("code").shift(1)["low"]>sp["high"]*1.06), 1, 0)

In [149]:
#x日移動平均線との乖離率の算出
sp["eclose_mean5"] = sp.groupby("code")["eclose"].rolling(5,min_periods=1).mean().reset_index()["eclose"]
sp["eclose_ch_mean5"] = (sp["eclose"] - sp["eclose_mean5"]) / sp["eclose_mean5"]
sp["eclose_mean10"] = sp.groupby("code")["eclose"].rolling(10,min_periods=1).mean().reset_index()["eclose"]
sp["eclose_ch_mean10"] = (sp["eclose"] - sp["eclose_mean10"]) / sp["eclose_mean10"]
sp["eclose_mean20"] = sp.groupby("code")["eclose"].rolling(20,min_periods=1).mean().reset_index()["eclose"]
sp["eclose_ch_mean20"] = (sp["eclose"] - sp["eclose_mean20"]) / sp["eclose_mean20"]
sp["eclose_mean40"] = sp.groupby("code")["eclose"].rolling(40,min_periods=1).mean().reset_index()["eclose"]
sp["eclose_ch_mean40"] = (sp["eclose"] - sp["eclose_mean40"]) / sp["eclose_mean40"]
sp["eclose_mean120"] = sp.groupby("code")["eclose"].rolling(120,min_periods=1).mean().reset_index()["eclose"]
sp["eclose_ch_mean120"] = (sp["eclose"] - sp["eclose_mean120"]) / sp["eclose_mean120"]


In [150]:
sa = pd.merge(sl, sf, on=["code"], how="inner")
sa = pd.merge(sa, sp, on=["date", "code"], how="left")
sa = sa.sort_values(['code', 'date'])

In [151]:
sa = pd.merge(sa, slb, on=["date", "code"], how="inner") #predict不要

In [152]:
sa_bk = sa.copy() #predict不要

In [153]:
sa = sa_bk.copy() #predict不要

In [154]:
sa = pd.merge(sa, m_conf, on=["code"], how="left")
sa.loc[sa["fore_conf"].isnull(), "fore_conf"] = 0

In [155]:
#会計年度変更の判定
sa["FYshift"] = np.where((sa["fyear_t"] != sa.groupby("code").shift(1)["fyear_t"]), 1, 0)

In [156]:
#時価総額の対数の算出
sa["jikaso"] = (sa["issued"] * sa["eclose"])/mil
sa["jikaso_log"] = np.log((sa["issued"] * sa["eclose"]+1)/mil)

#時価総額の対数を分母とした業績変化率の算出
sa["salesr_aad_t_ch"] = np.where((sa["code"] == sa.shift(1)["code"]) & (sa["accstd_t"] == sa.shift(1)["accstd_t"]) & (sa["fyear_t"] <= sa.shift(1)["fyear_t"] + 1), sa.groupby(["code", "accstd_t"])["sales_aad_t"].diff(1) / sa["jikaso_log"], np.nan)
sa["opeinr_aad_t_ch"] = np.where((sa["code"] == sa.shift(1)["code"]) & (sa["accstd_t"] == sa.shift(1)["accstd_t"]) & (sa["fyear_t"] <= sa.shift(1)["fyear_t"] + 1), sa.groupby(["code", "accstd_t"])["opein_aad_t"].diff(1) / sa["jikaso_log"], np.nan)
sa["ordinr_aad_t_ch"] = np.where((sa["code"] == sa.shift(1)["code"]) & (sa["accstd_t"] == sa.shift(1)["accstd_t"]) & (sa["fyear_t"] <= sa.shift(1)["fyear_t"] + 1), sa.groupby(["code", "accstd_t"])["ordin_aad_t"].diff(1) / sa["jikaso_log"], np.nan)
sa["netinr_aad_t_ch"] = np.where((sa["code"] == sa.shift(1)["code"]) & (sa["accstd_t"] == sa.shift(1)["accstd_t"]) & (sa["fyear_t"] <= sa.shift(1)["fyear_t"] + 1), sa.groupby(["code", "accstd_t"])["netin_aad_t"].diff(1) / sa["jikaso_log"], np.nan)


In [None]:
#配当金額の差分の算出（記念配当のみなし補正含む）
sa["qdiv_aad_t_ch"] = np.where(sa["qdiv_aad_t"] / sa["eclose"] > 0.06, sa.groupby("code")["qdiv_aad_t"].diff(1)/3, sa.groupby("code")["qdiv_aad_t"].diff(1))

#年利回りの差分の算出
sa["ayld_t_ch"] = sa["qdiv_aad_t_ch"] / sa["eclose"]


In [157]:
#株価売上率の対数の算出
sa["EPS_sal_f"] = sa["sales_aad_f"] * mil / sa["issued"]
sa["PER_sal_f"] = sa["eclose"] / np.where(sa["EPS_sal_f"]<=0, 1, sa["EPS_sal_f"])
sa["PER_sal_log_f"] = np.log(sa["PER_sal_f"])


In [158]:
#33業種ごと240営業日前からの株価変化率の算出
eclose_ch_sec = sa.groupby("33sec")["eclose_ch_mid"].mean().reset_index()
eclose_ch_sec = eclose_ch_sec.rename(columns={"eclose_ch_mid": "eclose_ch_sec",})
sa = pd.merge(sa, eclose_ch_sec, on=["33sec"], how="left")

#各種項目のワンホットベクトル化
sa = pd.get_dummies(sa, columns=['section'])
sa = pd.get_dummies(sa, columns=['sizecode'])

sa["ctype_f"] = np.where(sa["ctype_f"].isnull(), sa["ctype"], sa["ctype_f"])
sa = pd.get_dummies(sa, columns=['ctype'])
sa = pd.get_dummies(sa, columns=['ctype_f'])


In [159]:
sp_fd = sp.groupby("code")[["code","date"]].head(1).copy()
sp_fd = sp_fd.rename(columns={"date": "sp_fdate"})
sa = pd.merge(sa, sp_fd, on=['code'], how="left")

#2016年1月4日時点で上場していたか判定
sa["lc_mature"] = 0
sa["lc_mature"] = np.where(sa["sp_fdate"] <= dt.datetime(2016, 1, 4), 1, 0)

#上場後1440営業日過ぎているか判定
sa["lc_young"] = 0
sa["lc_young"] = np.where((sa["lc_mature"]==0)
                   & (sa["date"] - sa["sp_fdate"] <= dt.timedelta(days=1440)), 1, 0)


In [160]:
#33業種と権利落ち日周辺かどうかで優待受け取りがあるかどうかのみなし判定
sa = pd.merge(sa,m_yutai, on=["33sec"], how="left")
sa.loc[(sa["yutai"].isnull()), "yutai"] = 0

sa.loc[(sa["kenri_f"]==0)&(sa["kenriochi_f"]==0)&(sa["post_kenriochi_f"]==0), "yutai"] = 0
sa.loc[(sa["kenri_f"]==0)&(sa["kenriochi_f"]==0)&(sa["post_kenriochi_f"]==0), "yutai"] = 0


In [161]:
#predictでは不要
#一部の項目がnull値の場合、予測対象フラグをOFF
sa.loc[sa["label_high_20"].isnull(), "target"] = 0
sa.loc[sa["label_low_20"].isnull(), "target"] = 0

sa.loc[sa["eclose_ch_1"].isnull(), "target"] = 0

sa.loc[sa["eclose_ch_mid"].isnull(), "target"] = 0


In [162]:
#予測対象フラグOFFのレコードを予測対象から除外
sa = sa[sa["target"]==1]


In [163]:
#業績数値変化率の一部がnull値の場合、他の業績数値変化率で置き換え
for j in range(2):
    for i in range(10):
        if len(sa["ordinr_aad_t_ch"].isnull()) > 0:
            sa["ordinr_aad_t_ch"] = np.where(sa["ordinr_aad_t_ch"].isnull(), sa["opeinr_aad_t_ch"], sa["ordinr_aad_t_ch"])
        else:
            break

    for i in range(10):
        if len(sa["opeinr_aad_t_ch"].isnull()) > 0:
            sa["opeinr_aad_t_ch"] = np.where(sa["opeinr_aad_t_ch"].isnull(),  
                                        np.where(sa["ordinr_aad_t_ch"].isnull(), sa["salesr_aad_t_ch"], sa["ordinr_aad_t_ch"]), 
                                    sa["opeinr_aad_t_ch"])
        else:
            break

    for i in range(10):
        if len(sa["salesr_aad_t_ch"].isnull()) > 0:
            sa["salesr_aad_t_ch"] = np.where(sa["salesr_aad_t_ch"].isnull(), 
                                    np.where(sa["opeinr_aad_t_ch"].isnull(), sa["ordinr_aad_t_ch"], sa["opeinr_aad_t_ch"]), 
                                    sa["salesr_aad_t_ch"])
        else:
            break

for i in range(10):
    if len(sa["netinr_aad_t_ch"].isnull()) > 0:
        sa["netinr_aad_t_ch"] =  np.where(sa["netinr_aad_t_ch"].isnull(),
                                    np.where(sa["ordinr_aad_t_ch"].isnull(), sa["opeinr_aad_t_ch"], sa["ordinr_aad_t_ch"]), 
                                          sa["netinr_aad_t_ch"])
    else:
        break

#各業績数値が、プラスマイナス転換したか、連続マイナスだったか判定
sa["opeinr_akakuro"] = np.where((sa["opein_aad_t"] * sa.groupby("code").shift(1)["opein_aad_t"]<0), 1, 0)
sa["opeinr_akaaka"] = np.where((sa["opein_aad_t"] < 0) & (sa.groupby("code").shift(1)["opein_aad_t"] < 0), 1, 0)
sa["ordinr_akakuro"] = np.where((sa["ordin_aad_t"] * sa.groupby("code").shift(1)["ordin_aad_t"]<0), 1, 0)
sa["ordinr_akaaka"] = np.where((sa["ordin_aad_t"] < 0) & (sa.groupby("code").shift(1)["ordin_aad_t"] < 0), 1, 0)
sa["netinr_akakuro"] = np.where((sa["netin_aad_t"] * sa.groupby("code").shift(1)["netin_aad_t"]<0), 1, 0)
sa["netinr_akaaka"] = np.where((sa["netin_aad_t"] < 0) & (sa.groupby("code").shift(1)["netin_aad_t"] < 0), 1, 0)


In [164]:
#純資産がプラスマイナス転換したか判定
sa["nasset_akakuro"] = np.where((sa["nasset"] > 0) & (sa.groupby("code").shift(1)["nasset"] <= 0), 1,  np.where((sa["nasset"] <= 0) & (sa.groupby("code").shift(1)["nasset"] > 0), -1, 0))

In [165]:
#日付が、全銘柄の売買単位が100株となった2018年10月1日以降かどうか判定
sa["punit_ch_date"] = np.where(sa["date"]>="2018-10-01", 1, 0)

In [166]:
#売上がマイナスかnull値のレコードを除外
sa = sa[(sa["sales"]>0) | (sa["sales"].isnull())] #Predictでは不要。トレーニングでは必要。3件マイナス売上があるので。

In [167]:
#Predictには不要
#20営業日の間の最高値・最安値を記録した日付を判定

spl = pd.merge(sp,sl, on=["code"], how="inner")
spl["jikaso"] = spl["eclose"] * spl["issued"]

sa["high_20"] = np.where(((sa["code"]==6806)&(sa["date"]<="2018-01-29")) | (sa["eclose"]<10000), np.round(sa["eclose"] * (1 + sa["label_high_20"]),1),np.round(sa["eclose"] * (1 + sa["label_high_20"])))
sa_p = sa[["code","date","high_20"]].copy()

spl["date_high_20"] = spl["date"]
spl["high_20"] = spl["high"]

sa_p = pd.merge(sa_p, spl[["code", "date_high_20","high_20"]], on=["code","high_20"], how="left")
sa_p.loc[(sa_p["code"]==6806)&(sa_p["date"]=="2018-01-30"), "date_high_20"] = pd.to_datetime("2018-01-31")
sa_p = sa_p[(sa_p["date_high_20"] <= sa_p["date"] + dt.timedelta(days=40)) & (sa_p["date_high_20"] >= sa_p["date"])]
sa_p = sa_p.groupby(["code","date"]).head(1).reset_index()

sa = pd.merge(sa, sa_p[["code","date", "date_high_20"]], on=["code","date"], how="left")


sa["low_20"] = np.where(((sa["code"]==6806)&(sa["date"]<="2018-01-29")) | (sa["eclose"]<10000), np.round(sa["eclose"] * (1 + sa["label_low_20"]),1),np.round(sa["eclose"] * (1 + sa["label_low_20"])))

sa_p = sa[["code","date","low_20"]].copy()

spl["date_low_20"] = spl["date"]
spl["low_20"] = spl["low"]

sa_p = pd.merge(sa_p, spl[["code", "date_low_20","low_20"]], on=["code","low_20"], how="left")
sa_p.loc[(sa_p["code"]==6806)&(sa_p["date"]=="2018-01-30"), "date_low_20"] = pd.to_datetime("2018-02-14")
sa_p = sa_p[(sa_p["date_low_20"] <= sa_p["date"] + dt.timedelta(days=40)) & (sa_p["date_low_20"] >= sa_p["date"])]
sa_p = sa_p.groupby(["code","date"]).head(1).reset_index()

sa = pd.merge(sa, sa_p[["code","date", "date_low_20"]], on=["code","date"], how="left")


In [168]:
#Predict対象外
#全銘柄株価の日付ごとの20営業日の間の最高値・最安値を判定
sp_macro2 = sp.groupby("date")[["eclose","high","low"]].mean().reset_index()

sp_macro2 = sp_macro2.rename(columns={
                        "eclose": "eclose_macro",
                        "high": "high_macro",
                        "low": "low_macro",
                       })
sp_macro2["date_high_20"] = sp_macro2["date"]
sp_macro2["date_low_20"] = sp_macro2["date"]

sa = pd.merge(sa,sp_macro2[["date","eclose_macro"]], on=["date"], how="left")
sa = pd.merge(sa,sp_macro2[["high_macro","date_high_20"]], on=["date_high_20"], how="left")
sa["macro_high_ch_20"] = (sa["high_macro"] - sa["eclose_macro"]) / sa["eclose_macro"]
sa = pd.merge(sa,sp_macro2[["low_macro","date_low_20"]], on=["date_low_20"], how="left")
sa["macro_low_ch_20"] = (sa["low_macro"] - sa["eclose_macro"]) / sa["eclose_macro"]


In [169]:
#Predict対象外
#33業種ごと全銘柄株価の日付ごとの20営業日の間の最高値・最安値を判定
sp_sec2 = spl.groupby(["date","33sec"])["eclose","high","low"].mean().reset_index()
sp_sec2 = sp_sec2.rename(columns={
                        "eclose": "eclose_33sec",
                        "high": "high_33sec",
                        "low": "low_33sec",
                       })
sp_sec2["date_high_20"] = sp_sec2["date"]
sp_sec2["date_low_20"] = sp_sec2["date"]

sa = pd.merge(sa,sp_sec2[["date","33sec","eclose_33sec"]], on=["date","33sec"], how="left")
sa = pd.merge(sa,sp_sec2[["33sec","date_high_20","high_33sec"]], on=["date_high_20","33sec"], how="left")
sa["33sec_high_ch_20"] = (sa["high_33sec"] - sa["eclose_33sec"]) / sa["eclose_33sec"]
sa = pd.merge(sa,sp_sec2[["33sec","date_low_20","low_33sec"]], on=["date_low_20","33sec"], how="left")
sa["33sec_low_ch_20"] = (sa["low_33sec"] - sa["eclose_33sec"]) / sa["eclose_33sec"]


  


学習

In [175]:
#説明変数のリストを作成
x_col1_g = [
         "FYshift", "ctype_GB", "ctype_BK","ctype_SE","ctype_IN","ctype_f_GB","ctype_f_BK","ctype_f_SE","ctype_f_IN",
        "salesr_aad_t_ch", "opeinr_aad_t_ch", "ordinr_aad_t_ch", "netinr_aad_t_ch",
        "opeinr_akakuro", "opeinr_akaaka", "ordinr_akakuro", "ordinr_akaaka", "netinr_akakuro", "netinr_akaaka",
        "lc_young","lc_mature",
        "noresult","nofrct","ayld_t_ch",
        "yutai",
        "kenri_f", "kenriochi_f","post_kenriochi_f",
        "fore_conf",
        "eclose",
        "punit_ch_date",
        "eclose_ch_mean5", "eclose_ch_mean10",
        "eclose_ch_1","eclose_ch_5","eclose_ch_10","eclose_ch_20", "eclose_ch_40", "eclose_ch_120",
        "eclose_ch_sec", "vola_code",
        "soyaku",
        "PER_sal_log_f",
          "vola", "vol_ch", "jikaso_log","stop","window",
         "section_First Section (Domestic)", "section_JASDAQ(Growth/Domestic)", "section_JASDAQ(Standard / Domestic)", "section_Mothers (Domestic)",
         "section_Second Section(Domestic)",
        "sizecode_1","sizecode_2","sizecode_4","sizecode_6","sizecode_7","sizecode_-",
        ]
x_col1_h = ["eclose_ch_60"]
x_col1_l = ["eclose_ch_mean20", "eclose_ch_mean40", "eclose_ch_mean120", "eclose_ch_mid","sellmax_code","sellmax_macro","nasset_akakuro"]

In [176]:
#LightGBMのパラメータのリストを作成
params={
        'learning_rate': 0.005, 'objective': 'mae', 
        'metric': 'mae', 
        'boosting_type': 'gbdt', 
         'num_leaves': 62, 'verbose': -1, 'bagging_fraction': 0.523321698496099, 'feature_fraction': 0.884, 
         'bagging_freq': 7, 'feature_pre_filter': False, 
        }


In [185]:
#学習・評価対象期間のデータセットを作成
df_tr = sa[(sa["date"]>=dates_tr)&(sa["date"]<=datee_tr)]
df_vl = sa[(sa["date"]>=dates_vl)&(sa["date"]<=datee_vl)]

#高値予測モデルの学習
hl = "high"
a_macro = 0.5
a_sec = 0
x_col = x_col1_g + x_col1_h
num_boost_round = 6000
verbose_eval = 1000
train_data = lgb.Dataset(df_tr[x_col].values.astype(np.float32), label=(df_tr[f"label_{hl}_20"] - a_macro*df_tr[f"macro_{hl}_ch_20"] - a_sec*df_tr[f"33sec_{hl}_ch_20"]).values.astype(np.float32))
eval_data = lgb.Dataset(df_vl[x_col].values.astype(np.float32), label=(df_vl[f"label_{hl}_20"] - a_macro*df_vl[f"macro_{hl}_ch_20"] - a_sec*df_vl[f"33sec_{hl}_ch_20"]).values.astype(np.float32), reference= train_data)

model = lgb.train(params,train_data,valid_sets=eval_data, num_boost_round=num_boost_round, verbose_eval=verbose_eval, feval=eval_spearman)

#高値予測モデルのエクスポート
modelname = f'model_jpx_{hl}.pkl'

with open(modelname, 'wb') as f:
    pickle.dump(model,f)

#安値予測モデルの学習
hl = "low"
a_macro = 1.0
a_sec = 0.2        
x_col = x_col1_g + x_col1_l
num_boost_round = 1500
verbose_eval = 500
train_data = lgb.Dataset(df_tr[x_col].values.astype(np.float32), label=(df_tr[f"label_{hl}_20"] - a_macro*df_tr[f"macro_{hl}_ch_20"] - a_sec*df_tr[f"33sec_{hl}_ch_20"]).values.astype(np.float32))
eval_data = lgb.Dataset(df_vl[x_col].values.astype(np.float32), label=(df_vl[f"label_{hl}_20"] - a_macro*df_vl[f"macro_{hl}_ch_20"] - a_sec*df_vl[f"33sec_{hl}_ch_20"]).values.astype(np.float32), reference= train_data)

model = lgb.train(params,train_data,valid_sets=eval_data, num_boost_round=num_boost_round, verbose_eval=verbose_eval, feval=eval_spearman)

#安値予測モデルのエクスポート
modelname = f'model_jpx_{hl}.pkl'

with open(modelname, 'wb') as f:
    pickle.dump(model,f)


[1000]	valid_0's l1: 0.085252	valid_0's sp: 0.367355
[2000]	valid_0's l1: 0.0846886	valid_0's sp: 0.375214
[3000]	valid_0's l1: 0.0843893	valid_0's sp: 0.378317
[4000]	valid_0's l1: 0.0843046	valid_0's sp: 0.378021
[5000]	valid_0's l1: 0.0842124	valid_0's sp: 0.379255
[6000]	valid_0's l1: 0.0841859	valid_0's sp: 0.378861
[500]	valid_0's l1: 0.0574764	valid_0's sp: 0.349191
[1000]	valid_0's l1: 0.0574066	valid_0's sp: 0.3553
[1500]	valid_0's l1: 0.0574538	valid_0's sp: 0.3559
