# Feature enginering

In [1]:
import yfinance as yf
import numpy as np
from tqdm import tqdm

def fetch_stock_price(stock_symbol, start_date, end_date):
    # 使用 yf.Ticker() 建立 Ticker 對象
    stock = yf.Ticker(stock_symbol)

    # 使用 history() 方法取得歷史價格資訊
    stock_data = stock.history(start=start_date, end=end_date)

    return stock_data

stock_symbol = '5871.TW'

# 起始日期和結束日期
end_date = '2024-12-31'

# 擷取股票價格資訊
stock_price_data = fetch_stock_price(stock_symbol=stock_symbol, start_date='2012-01-02',end_date=end_date)


In [2]:
stock_price_data['do'] = stock_price_data['Open'].pct_change()
stock_price_data['dh'] = stock_price_data['High'].pct_change()
stock_price_data['dl'] = stock_price_data['Low'].pct_change()
stock_price_data['dc'] = stock_price_data['Close'].pct_change()
stock_price_data['dv'] = stock_price_data['Volume'].pct_change()
stock_price_data['oc'] = stock_price_data['Open']-stock_price_data['Close']

stock_price_data['curr_bar_state'] = np.sign(stock_price_data['oc'])
stock_price_data = stock_price_data.dropna()

In [3]:
p_forward = 10
for i in range(1, p_forward+1):
    stock_price_data[f'bar_state_{str(i)}'] = stock_price_data['curr_bar_state'].shift(-i-p_forward+1)

In [4]:
# df = stock_price_data.iloc[:,7:]
df = stock_price_data
# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with NaN values
df = df.dropna()

In [5]:
df_test = df.loc['2023':]
df = df.loc[:'2022']

In [6]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
df[['do', 'dh', 'dl', 'dc', 'dv']] = scaler.fit_transform(df[['do', 'dh', 'dl', 'dc', 'dv']])
df_test[['do', 'dh', 'dl', 'dc', 'dv']] = scaler.fit_transform(df_test[['do', 'dh', 'dl', 'dc', 'dv']])

In [7]:
# df[:20]

# Prepare traning data

In [29]:
window_size = 10

x1_list, y1_list = [], []

# Iterate over the DataFrame to create the training and testing sets
for i in tqdm(range(len(df)-window_size+1)):
    window = df.iloc[i:i+window_size]  # Extract the window of data
    # print(window.T.values)
    x1_values = window[['do', 'dh', 'dl', 'dc', 'dv', 'curr_bar_state']].T.values  # Adjust column names as needed
    # print(x1_values)
    # print(window[['bar_state_1', 'bar_state_2', 'bar_state_3', 'bar_state_4', 'bar_state_5']])
    # y1_values = window[['bar_state_1', 'bar_state_2', 'bar_state_3', 'bar_state_4', 'bar_state_5']].iloc[0].T.values # Take the last value of 'bar_state_1' as the output
    y1_values = window[['bar_state_1']].iloc[0].T.values
    x1_list.append(x1_values)
    y1_list.append(y1_values)

# Convert the lists to NumPy arrays
x = np.array(x1_list)
y = np.array(y1_list)

  0%|          | 0/2669 [00:00<?, ?it/s]

100%|██████████| 2669/2669 [00:10<00:00, 252.51it/s]


In [30]:
import random
# valid
percentage = 20
num_numbers = int((percentage / 100) * len(x))

# Generate a list of randomly selected numbers
valid_numbers = random.sample(range(0, len(x)), num_numbers)
training_numbers = [num for num in range(0, len(x)) if num not in valid_numbers]

In [31]:
x_train = x[training_numbers]
x_valid = x[valid_numbers]

y_train = y[training_numbers]
y_valid = y[valid_numbers]

In [32]:
x_train.shape

(2136, 6, 10)

# Define model

In [95]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define the 1D CNN model
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(6, 36, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(36 * 5, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)   
        x = x.view(x.size(0), -1) 
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return torch.tanh(x)


In [166]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Sequential(
                        nn.Conv1d(in_channels, out_channels, kernel_size = 3, stride = stride, padding = 1),
                        nn.BatchNorm1d(out_channels),
                        nn.ReLU())
        self.conv2 = nn.Sequential(
                        nn.Conv1d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1),
                        nn.BatchNorm1d(out_channels))
        self.downsample = downsample
        self.relu = nn.ReLU()
        self.out_channels = out_channels
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

In [None]:
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes = 1):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Sequential(
                        nn.Conv2d(3, 64, kernel_size = 7, stride = 2, padding = 3),
                        nn.BatchNorm2d(64),
                        nn.ReLU())
        self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)
        self.layer0 = self._make_layer(block, 64, layers[0], stride = 1)
        self.layer1 = self._make_layer(block, 128, layers[1], stride = 2)
        self.layer2 = self._make_layer(block, 256, layers[2], stride = 2)
        self.layer3 = self._make_layer(block, 512, layers[3], stride = 2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512, num_classes)
        
    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [162]:
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

x_val_tensor = torch.tensor(x_valid, dtype=torch.float32)
y_val_tensor = torch.tensor(y_valid, dtype=torch.float32)

In [163]:
dataset_train = TensorDataset(x_train_tensor, y_train_tensor)
dataset_valid = TensorDataset(x_val_tensor, y_val_tensor)

dataloader_train = DataLoader(dataset_train , batch_size=12, shuffle=True)
dataloader_valid = DataLoader(dataset_valid , batch_size=12, shuffle=True)

In [164]:
model = ResNet50(num_classes=1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, weight_decay=0.01)

In [165]:
num_epochs = 500
for epoch in range(num_epochs):
    # Training phase
    model.train()
    for batch_x, batch_y in dataloader_train:
        optimizer.zero_grad()
        outputs = model(batch_x)
        # print(batch_y, outputs)

        loss = criterion(outputs, batch_y) 
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = []

    with torch.no_grad():
        for batch_x_val, batch_y_val in dataloader_valid:
            outputs_val = model(batch_x_val)
            loss_val = criterion(outputs_val, batch_y_val)
            val_loss.append(loss_val.item())


    # Print statistics
    if epoch%10==0:
        print(f'Epoch [{epoch+1}/{num_epochs}]', 
            f'Training Loss: {loss.item():.10f}',
            f'Valid Loss: {sum(val_loss)/64:.10f}')

torch.Size([12, 256, 6])


RuntimeError: Given groups=1, weight of size [128, 32, 1], expected input[12, 256, 6] to have 32 channels, but got 256 channels instead

# Prepare test data

In [105]:
window_size = 10

x2_list, y2_list = [], []

# Iterate over the DataFrame to create the training and testing sets
for i in tqdm(range(len(df_test)-window_size+1)):
    window = df_test.iloc[i:i+window_size]  # Extract the window of data
    # print(window.T.values)
    x1_values = window[['do', 'dh', 'dl', 'dc', 'dv', 'curr_bar_state']].T.values  # Adjust column names as needed
    # print(x1_values)
    # print(window[['bar_state_1', 'bar_state_2', 'bar_state_3', 'bar_state_4', 'bar_state_5']])
    # y1_values = window[['bar_state_1', 'bar_state_2', 'bar_state_3' , 'bar_state_4', 'bar_state_5']].iloc[0].T.values # Take the last value of 'bar_state_1' as the output
    y1_values = window[['bar_state_1']].iloc[0].T.values
    x2_list.append(x1_values)
    y2_list.append(y1_values)

# Convert the lists to NumPy arrays
x_test = np.array(x2_list)
y_test = np.array(y2_list)

100%|██████████| 230/230 [00:01<00:00, 229.06it/s]


In [112]:
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [113]:
torch.set_printoptions(precision=6, sci_mode=False)

In [116]:
torch.round(model(x_test_tensor), decimals=3)[:10]

tensor([[ 0.055000],
        [ 0.012000],
        [-0.067000],
        [ 0.014000],
        [ 0.050000],
        [ 0.031000],
        [ 0.009000],
        [-0.055000],
        [-0.021000],
        [-0.049000]], grad_fn=<SliceBackward0>)

In [117]:
y_test_tensor[:10]

tensor([[ 1.],
        [ 1.],
        [ 1.],
        [-1.],
        [ 1.],
        [ 1.],
        [-1.],
        [ 1.],
        [-1.],
        [-1.]])

In [44]:
b = y_test_tensor - torch.round(model(x_test_tensor))

In [45]:
1-len(torch.nonzero(torch.sum(b, dim=1)))/len(b)

0.27391304347826084

In [46]:
b[:100]

tensor([[ 1.],
        [ 1.],
        [ 1.],
        [ 0.],
        [ 1.],
        [ 2.],
        [-1.],
        [ 1.],
        [-1.],
        [ 1.],
        [-2.],
        [ 0.],
        [-1.],
        [-1.],
        [ 2.],
        [ 0.],
        [ 1.],
        [ 0.],
        [-1.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 1.],
        [ 0.],
        [ 1.],
        [-1.],
        [-1.],
        [-1.],
        [ 0.],
        [-1.],
        [ 1.],
        [ 1.],
        [ 0.],
        [-1.],
        [ 1.],
        [ 2.],
        [-1.],
        [-1.],
        [-1.],
        [-1.],
        [ 1.],
        [ 1.],
        [ 2.],
        [ 2.],
        [ 0.],
        [-1.],
        [-1.],
        [ 0.],
        [ 0.],
        [-1.],
        [ 0.],
        [ 1.],
        [-1.],
        [ 1.],
        [ 1.],
        [ 1.],
        [-2.],
        [ 1.],
        [ 0.],
        [ 2.],
        [-1.],
        [-1.],
        [ 3.],
        [ 1.],
        [ 1.],
        [ 0.],
        [ 