In [5]:
# %%
import pandas as pd
import numpy as np
import conn
import logging
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
warnings.filterwarnings('ignore')

def get_gstc_code(code):
    try:
        conn.connect_to_database()
        query = f'''
            SELECT ts.GSTC_CODE
              FROM TB_STOCKCLASSIFY ts
             WHERE ts.KSTC_CODE = '{code}'
        '''
        
        conn.global_cursor.execute(query)
        df = pd.read_sql(query, conn.global_conn)
        
        return df
    except Exception as e:
        logging.error(f'Error occurred while fetching data from database: {e}')
        return None
    finally:
        conn.close_database_connection()

df = get_gstc_code('005930')

def select_data(gstc_code):
    try:
        conn.connect_to_database()
        query = f'''
            SELECT td.STCK_BSOP_DATE, td.STCK_CLPR, td.STCK_OPRC, td.STCK_HGPR, td.STCK_LWPR, td.ACML_VOL, td.ACML_TR_PBMN
              FROM TB_DAILYSTOCK td
             WHERE td.GSTC_CODE = '{gstc_code}'
             ORDER BY td.STCK_BSOP_DATE ASC
        '''
        
        conn.global_cursor.execute(query)
        df = pd.read_sql(query, conn.global_conn)
        
        return df
    except Exception as e:
        logging.error(f'Error occurred while fetching data from database: {e}')
        return None
    finally:
        conn.close_database_connection()

def data_preprocess(code):
    gstc_df = get_gstc_code(code)
    gstc_code = gstc_df.iloc[0, 0]
    data = select_data(gstc_code)
    
    data = data.drop_duplicates(subset='STCK_BSOP_DATE')
    
    data['STCK_BSOP_DATE'] = pd.to_datetime(data['STCK_BSOP_DATE'], format='%Y%m%d')
    data = data.sort_values('STCK_BSOP_DATE').reset_index(drop=True)

    columns_to_convert = ['STCK_CLPR', 'STCK_OPRC', 'STCK_HGPR', 'STCK_LWPR', 'ACML_VOL', 'ACML_TR_PBMN']
    data[columns_to_convert] = data[columns_to_convert].astype(float)

    data['next_day_close'] = data['STCK_CLPR'].shift(-1)
    
    # 3진 분류 라벨링
    # 0: 하락, 1: 변동 없음, 2: 상승
    data['label'] = np.where(
        data['next_day_close'] > data['STCK_CLPR'], 
        2, 
        np.where(data['next_day_close'] < data['STCK_CLPR'], 0, 1)
    )
    data = data.iloc[:-1]
    
    # 이동 평균
    data['MA5'] = data['STCK_CLPR'].rolling(window=5).mean()
    data['MA10'] = data['STCK_CLPR'].rolling(window=10).mean()
    data['MA20'] = data['STCK_CLPR'].rolling(window=20).mean()
    data['MA50'] = data['STCK_CLPR'].rolling(window=50).mean()

    # 지수 이동 평균
    data['EMA5'] = data['STCK_CLPR'].ewm(span=5, adjust=False).mean()
    data['EMA10'] = data['STCK_CLPR'].ewm(span=10, adjust=False).mean()
    data['EMA20'] = data['STCK_CLPR'].ewm(span=20, adjust=False).mean()

    # 상대 강도 지수 (RSI)
    delta = data['STCK_CLPR'].diff()
    up = delta.clip(lower=0)
    down = -1 * delta.clip(upper=0)
    ema_up = up.ewm(com=13, adjust=False).mean()
    ema_down = down.ewm(com=13, adjust=False).mean()
    rs = ema_up / ema_down
    data['RSI'] = 100 - (100 / (1 + rs))

    # 이동 평균 수렴 발산 (MACD)
    exp1 = data['STCK_CLPR'].ewm(span=12, adjust=False).mean()
    exp2 = data['STCK_CLPR'].ewm(span=26, adjust=False).mean()
    data['MACD'] = exp1 - exp2
    data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

    # 볼린저 밴드
    data['20_day_MA'] = data['STCK_CLPR'].rolling(window=20).mean()
    data['20_day_STD'] = data['STCK_CLPR'].rolling(window=20).std()
    data['Bollinger_High'] = data['20_day_MA'] + (data['20_day_STD'] * 2)
    data['Bollinger_Low'] = data['20_day_MA'] - (data['20_day_STD'] * 2)

    # 스토캐스틱 오실레이터
    low14 = data['STCK_LWPR'].rolling(window=14).min()
    high14 = data['STCK_HGPR'].rolling(window=14).max()
    data['%K'] = 100 * ((data['STCK_CLPR'] - low14) / (high14 - low14))

    data['%D'] = data['%K'].rolling(window=3).mean()

    # 기술 지표 계산으로 인해 발생하는 NaN 값 제거
    data = data.dropna().reset_index(drop=True)
    
    return data

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))  # [d_model/2]
        pe[:, 0::2] = torch.sin(position * div_term)  # Even indices
        pe[:, 1::2] = torch.cos(position * div_term)  # Odd indices
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x shape: [batch_size, seq_len, d_model]
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, num_timesteps, num_features, feature_size=128, num_layers=2, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.feature_size = feature_size

        self.pos_encoder = PositionalEncoding(feature_size, dropout, num_timesteps)
        encoder_layers = nn.TransformerEncoderLayer(d_model=feature_size, nhead=8, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        self.embedding = nn.Linear(num_features, feature_size)
        self.decoder = nn.Linear(feature_size, 3)  # 출력 클래스 수를 3으로 변경

    def forward(self, src):
        # src shape: [batch_size, seq_len, num_features]
        src = self.embedding(src) * np.sqrt(self.feature_size)
        src = self.pos_encoder(src)
        # Transformer expects input of shape (seq_len, batch_size, feature_size)
        src = src.permute(1, 0, 2)
        output = self.transformer_encoder(src)
        # 마지막 시점의 출력 사용
        output = output[-1, :, :]
        output = self.decoder(output)
        return output

data = data_preprocess('005930')

# Define features and labels
features = data.drop(['STCK_BSOP_DATE', 'next_day_close', 'label'], axis=1)
labels = data['label']

# Convert to numpy arrays
features = features.values
labels = labels.values

# Define sequence length
sequence_length = 30  # Using 60 timesteps

# Create sequences
X = []
y = []
for i in range(sequence_length, len(features)):
    X.append(features[i-sequence_length:i])
    y.append(labels[i])

X = np.array(X)
y = np.array(y)

# Split into training and testing sets
train_size = int(len(X) * 0.8)
X_train = X[:train_size]
X_test = X[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

# Reshape for scaling
num_samples_train, num_timesteps, num_features = X_train.shape
X_train_reshaped = X_train.reshape(-1, num_features)
X_test_reshaped = X_test.reshape(-1, num_features)

# Scaling
scaler = StandardScaler()
scaler.fit(X_train_reshaped)
X_train_scaled = scaler.transform(X_train_reshaped)
X_test_scaled = scaler.transform(X_test_reshaped)

# 스케일러 저장
with open('./scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Reshape back to original shape
X_train = X_train_scaled.reshape(num_samples_train, num_timesteps, num_features)
X_test = X_test_scaled.reshape(X_test.shape[0], num_timesteps, num_features)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model
feature_size = 64
model = TransformerModel(num_timesteps=num_timesteps, num_features=num_features, feature_size=feature_size, num_layers=2, dropout=0.1)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def evaluate(model, loader):
    model.eval()
    total = 0
    correct = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            output = model(X_batch)
            _, predicted = torch.max(output.data, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    accuracy = correct / total
    return accuracy, all_preds, all_labels

# %%
num_epochs = 20
best_accuracy = 0.0
model_save_path = './model.pth'

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    
    model.eval()
    val_accuracy, _, _ = evaluate(model, test_loader)
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    
    # Save the best model
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), model_save_path)
        print(f"Best model saved to {model_save_path} with accuracy: {best_accuracy:.4f}")

# %%

# Evaluate on the test set
test_accuracy, test_preds, test_labels = evaluate(model, test_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification report
print("Classification Report:")
print(classification_report(test_labels, test_preds, digits=4))

# Confusion matrix
cm = confusion_matrix(test_labels, test_preds)
print("Confusion Matrix:")
print(cm)

# %%


MariaDB 서버에 성공적으로 연결되었습니다. 서버 버전: 10.4.34-MariaDB
현재 사용 중인 데이터베이스: cryptoStockTrading
MariaDB 연결이 종료되었습니다.
MariaDB 서버에 성공적으로 연결되었습니다. 서버 버전: 10.4.34-MariaDB
현재 사용 중인 데이터베이스: cryptoStockTrading
MariaDB 연결이 종료되었습니다.
MariaDB 서버에 성공적으로 연결되었습니다. 서버 버전: 10.4.34-MariaDB
현재 사용 중인 데이터베이스: cryptoStockTrading
MariaDB 연결이 종료되었습니다.
Epoch 1/20, Loss: 0.8460
Validation Accuracy: 0.4693
Best model saved to ./model.pth with accuracy: 0.4693
Epoch 2/20, Loss: 0.8242
Validation Accuracy: 0.4726
Best model saved to ./model.pth with accuracy: 0.4726
Epoch 3/20, Loss: 0.8230
Validation Accuracy: 0.4834
Best model saved to ./model.pth with accuracy: 0.4834
Epoch 4/20, Loss: 0.8240
Validation Accuracy: 0.4693
Epoch 5/20, Loss: 0.8205
Validation Accuracy: 0.4693
Epoch 6/20, Loss: 0.8234
Validation Accuracy: 0.4693
Epoch 7/20, Loss: 0.8222
Validation Accuracy: 0.4834
Epoch 8/20, Loss: 0.8220
Validation Accuracy: 0.4710
Epoch 9/20, Loss: 0.8211
Validation Accuracy: 0.4693
Epoch 10/20, Loss: 0.8198
Validation Acc

In [11]:
len(data.columns)

25