In [15]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn

In [9]:
# Load the raw daily price table
df = pd.read_csv('/content/stock_price.csv')
# Parse the date column; values that do not match YYYY-MM-DD become NaT
df['日付け'] = pd.to_datetime(df['日付け'], format='%Y-%m-%d', errors='coerce')

# Volume: rewrite the M / B suffixes as e6 / e9 exponents, then parse as float
volume_text = df['出来高'].replace({'M': 'e6', 'B': 'e9'}, regex=True)
df['出来高'] = volume_text.astype(float)

# Change rate: strip the percent sign and parse as float
change_text = df['変化率 %'].str.replace('%', '')
df['変化率 %'] = change_text.astype(float)

# Reverse the row order; the date is moved to the index and back again so it
# ends up as the leading column over a fresh 0..n-1 index
df = df.set_index('日付け').iloc[::-1].reset_index(drop=False)

def label_change_rate(rate):
    """Map a change rate to a binary class label.

    Returns 1 when ``rate`` is greater than or equal to zero and 0 otherwise.
    NaN compares false against zero, so it is labelled 0.
    """
    return 1 if rate >= 0 else 0

# Binary target per row: 1 when the change rate is >= 0, otherwise 0
df['ラベル'] = df['変化率 %'].map(label_change_rate)

# RSI calculation
def calculate_rsi(data, period=14, column='終値'):
    """Compute a simple-moving-average RSI for one price column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the price series, ordered oldest row first.
    period : int, default 14
        Number of rows in the rolling window used to average gains and losses.
    column : str, default '終値'
        Name of the price column to use. The default keeps the original
        behaviour of reading the closing price.

    Returns
    -------
    pandas.Series
        RSI values aligned with ``data``'s index. Rows before the window is
        full are NaN. A window without any down move gives 100 (division by a
        zero average loss yields inf); a window with no movement at all gives
        NaN (0 / 0).
    """
    delta = data[column].diff()
    # Up moves keep their size and everything else (including the leading NaN)
    # counts as 0; down moves are handled the same way with the sign flipped.
    gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi


# Add the RSI plus two 0/1 flags for RSI >= 70 and RSI <= 30
# (NaN RSI values compare false, so both flags are 0 on those rows)
df['RSI'] = calculate_rsi(df)
df['rsi_overbought'] = df['RSI'].ge(70).astype(int)
df['rsi_oversold'] = df['RSI'].le(30).astype(int)

# Candle geometry: open minus close, and high minus low
df['O-C'] = df['始値'] - df['終値']
df['H-L'] = df['高値'] - df['安値']

# Simple moving averages of the close, one column per window length
sma_windows = (5, 10, 15, 20, 30, 50, 80, 100, 200)
for window in sma_windows:
    df[f'sma{window}'] = df['終値'].rolling(window=window).mean()

# Close divided by each moving average (added after all SMA columns so the
# column order stays sma5..sma200 followed by sma5_ratio..sma200_ratio)
for window in sma_windows:
    df[f'sma{window}_ratio'] = df['終値'] / df[f'sma{window}']

# ATR: the true range is the largest of (high - low), |high - previous close|
# and |low - previous close|; the ATR is its 14-row rolling mean
prev_close = df['終値'].shift()
df['high_low'] = df['高値'] - df['安値']
df['high_close'] = (df['高値'] - prev_close).abs()
df['low_close'] = (df['安値'] - prev_close).abs()
true_range_parts = df[['high_low', 'high_close', 'low_close']]
df['tr'] = true_range_parts.max(axis=1)
df['atr'] = df['tr'].rolling(window=14).mean()

# Volatility: 20-row rolling standard deviation of the close
close_std20 = df['終値'].rolling(window=20).std()
df['volatility'] = close_std20

# CCI over a 20-row window of the typical price (mean of high, low, close)
# NOTE(review): the deviation term here is the rolling standard deviation; the
# textbook CCI uses the mean absolute deviation - confirm this is intended.
typical_price = (df['高値'] + df['安値'] + df['終値']) / 3
typical_roll = typical_price.rolling(window=20)
df['cci'] = (typical_price - typical_roll.mean()) / (0.015 * typical_roll.std())

# Bollinger bands: 20-row mean of the close +/- two rolling standard deviations
df['bol_mid'] = df['終値'].rolling(window=20).mean()
band_offset = close_std20 * 2
df['bol_upper'] = df['bol_mid'] + band_offset
df['bol_lower'] = df['bol_mid'] - band_offset

# OBV: running total of volume signed by the direction of the close-to-close
# move (the first row has no previous close and contributes 0)
close_direction = np.sign(df['終値'].diff())
signed_volume = (close_direction * df['出来高']).fillna(0)
df['obv'] = signed_volume.cumsum()

# ROC: percentage change of the close over 2, 3, 4 and 5 rows
for lag in (2, 3, 4, 5):
    df[f'roc{lag}'] = df['終値'].pct_change(periods=lag) * 100

# MACD: difference between the 12- and 26-span exponential moving averages of
# the close; the signal line is the 9-span EMA of the MACD itself
ema12, ema26 = (
    df['終値'].ewm(span=span, adjust=False).mean() for span in (12, 26)
)
df['macd'] = ema12 - ema26
df['signal'] = df['macd'].ewm(span=9, adjust=False).mean()
# macd_diff: gap between the MACD and its signal line
df['macd_diff'] = df['macd'].sub(df['signal'])

In [10]:
import yfinance as yf

# Downcast every float64 column to float32
df = df.astype({col: 'float32' for col in df.select_dtypes(include='float64').columns})
# Drop rows with any missing value (e.g. the warm-up rows of the rolling indicators)
df = df.dropna()

# Restrict the date range and index the frame by date.
# Guarded so that re-running this cell after '日付け' has already become the
# index is a no-op instead of a KeyError on df['日付け'].
if '日付け' in df.columns:
    df = df[df['日付け'] >= pd.to_datetime('1950-01-01')].set_index('日付け')

# Add FX data: US dollar to Japanese yen rate (USD/JPY)
ticker = 'JPY=X'
data = yf.download(ticker, start='1987-01-01', end='2024-08-02')

# Newer yfinance releases can return (field, ticker) MultiIndex columns even for
# a single ticker; keep only the field level so 'Close' selects one column and
# the join below sees flat column names.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# Day-over-day percentage change of the FX close
data['Pct_Change'] = data['Close'].pct_change() * 100

# Keep only the FX close and its percentage change, under unambiguous names
exchange_close = data[['Close', 'Pct_Change']].rename(
    columns={'Close': 'Exchange_Close', 'Pct_Change': 'Exchange_Pct_Change'}
)

# Merge stock and FX data on the date index (inner join).
# NOTE(review): the printed head shows non-consecutive dates (1996-10-30, then
# 1996-11-01), presumably because the inner join drops days missing from the FX
# series; windows built from merged_data will span those gaps - confirm this is
# acceptable.
merged_data = df.join(exchange_close, how='inner')

# Check the result
print(merged_data.head())

[*********************100%***********************]  1 of 1 completed

                   終値         始値         高値         安値         出来高  変化率 %  \
1996-10-30  79.800003  80.199997  80.400002  79.699997  29970000.0  -0.50   
1996-11-01  78.699997  78.699997  79.000000  78.099998  38400000.0  -1.01   
1996-11-05  77.699997  78.699997  78.800003  77.599998  37890000.0  -1.27   
1996-11-06  79.800003  78.000000  80.000000  77.800003  37840000.0   2.70   
1996-11-08  80.000000  78.500000  80.000000  78.500000  40910000.0   1.27   

            ラベル        RSI  rsi_overbought  rsi_oversold  ...           obv  \
1996-10-30    0  48.863636               0             0  ...  1.621699e+10   
1996-11-01    0  43.956043               0             0  ...  1.615320e+10   
1996-11-05    0  35.789474               0             0  ...  1.611531e+10   
1996-11-06    1  47.413792               0             0  ...  1.615315e+10   
1996-11-08    1  45.238094               0             0  ...  1.612566e+10   

                roc2      roc3      roc4      roc5      macd  




In [11]:
# Summarise the indicator frame: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9003 entries, 1987-11-27 to 2024-08-01
Data columns (total 48 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   終値              9003 non-null   float32
 1   始値              9003 non-null   float32
 2   高値              9003 non-null   float32
 3   安値              9003 non-null   float32
 4   出来高             9003 non-null   float32
 5   変化率 %           9003 non-null   float32
 6   ラベル             9003 non-null   int64  
 7   RSI             9003 non-null   float32
 8   rsi_overbought  9003 non-null   int64  
 9   rsi_oversold    9003 non-null   int64  
 10  O-C             9003 non-null   float32
 11  H-L             9003 non-null   float32
 12  sma5            9003 non-null   float32
 13  sma10           9003 non-null   float32
 14  sma15           9003 non-null   float32
 15  sma20           9003 non-null   float32
 16  sma30           9003 non-null   float32
 17  sma50          

In [13]:
# Chronological split into train / validation / test.
# Validation covers 2022-08-01 up to (not including) 2023-08-01; the test set
# starts on 2023-08-01; everything earlier is training data.
valid_start, test_start = '2022-08-01', '2023-08-01'
dates = merged_data.index
train_data = merged_data[dates < valid_start]
valid_data = merged_data[(dates >= valid_start) & (dates < test_start)]
test_data = merged_data[dates >= test_start]

# One shape per line: train, validation, test
print(train_data.shape, valid_data.shape, test_data.shape, sep='\n')

(6265, 50)
(246, 50)
(247, 50)


In [16]:
def create_sequences(data_X, X, data_y, seq_length, isTrain,
                     min_abs_change=0.2, change_col=1):
    """Build sliding windows of ``seq_length`` rows and their targets.

    For every row ``i >= seq_length`` the window ``data_X[i - seq_length:i]``
    (the rows strictly before ``i``) is paired with the target ``data_y[i]``.

    Parameters
    ----------
    data_X : array-like, shape (n_rows, n_features)
        Rows the windows are sliced from.
    X : array-like, indexable as ``X[i][change_col]``
        Rows consulted by the training filter, aligned with ``data_X``.
    data_y : array-like, shape (n_rows,)
        Target for each row.
    seq_length : int
        Number of rows per window.
    isTrain : bool
        When true, row ``i`` is kept only if ``abs(X[i][change_col])`` is at
        least ``min_abs_change``. When false, every row is kept.
    min_abs_change : float, default 0.2
        Threshold of the training filter (previously hard-coded).
    change_col : int, default 1
        Column of ``X`` inspected by the training filter (previously
        hard-coded).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows of shape (n_windows, seq_length, n_features) and targets of
        shape (n_windows,). When no window qualifies, the arrays are empty but
        keep those trailing dimensions instead of collapsing to shape (0,).
    """
    windows, targets = [], []
    for i in range(seq_length, len(data_X)):
        # Written as `not (... >= ...)` rather than `<` so that a NaN value is
        # skipped, exactly as it fails both of the original comparisons.
        if isTrain and not (abs(X[i][change_col]) >= min_abs_change):
            continue
        windows.append(data_X[i - seq_length:i])
        targets.append(data_y[i])

    if not windows:
        # np.array([]) would have shape (0,); keep the window dimensions so
        # downstream code that indexes the shape still works.
        feature_shape = tuple(np.shape(data_X)[1:])
        empty_X = np.empty((0, seq_length) + feature_shape,
                           dtype=np.asarray(data_X).dtype)
        empty_y = np.empty((0,), dtype=np.asarray(data_y).dtype)
        return empty_X, empty_y

    return np.array(windows), np.array(targets)

# Feature columns fed to the model
features = ['終値', '変化率 %','Exchange_Close','Exchange_Pct_Change','H-L','O-C','roc2','RSI','bol_upper','bol_lower']

# Raw feature matrices and label vectors for each split
train_data_X, valid_data_X, test_data_X = (
    part[features].values for part in (train_data, valid_data, test_data)
)
train_target, valid_target, test_target = (
    part['ラベル'].values for part in (train_data, valid_data, test_data)
)

# Min-max scaling: fitted on the training split only, then applied to the others
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data_X)
valid_data_scaled, test_data_scaled = (
    scaler.transform(block) for block in (valid_data_X, test_data_X)
)

# Reshape into fixed-length windows for the LSTM. The unscaled matrix is passed
# as X so the training-only filter inside create_sequences sees raw values.
seq_length = 5
train_X_seq, train_y_seq = create_sequences(
    data_X=train_data_scaled, X=train_data_X, data_y=train_target,
    seq_length=seq_length, isTrain=True)
valid_X_seq, valid_y_seq = create_sequences(
    data_X=valid_data_scaled, X=valid_data_X, data_y=valid_target,
    seq_length=seq_length, isTrain=False)
test_X_seq, test_y_seq = create_sequences(
    data_X=test_data_scaled, X=test_data_X, data_y=test_target,
    seq_length=seq_length, isTrain=False)

# Convert to PyTorch tensors: float32 inputs, int64 (long) labels
X_train, X_valid, X_test = (
    torch.tensor(seq, dtype=torch.float32)
    for seq in (train_X_seq, valid_X_seq, test_X_seq)
)
y_train, y_valid, y_test = (
    torch.tensor(labels, dtype=torch.long)
    for labels in (train_y_seq, valid_y_seq, test_y_seq)
)

# One shape per line, in the order X/y train, X/y valid, X/y test
print(
    *(t.shape for t in (X_train, y_train, X_valid, y_valid, X_test, y_test)),
    sep='\n',
)

torch.Size([5559, 5, 10])
torch.Size([5559])
torch.Size([241, 5, 10])
torch.Size([241])
torch.Size([242, 5, 10])
torch.Size([242])
