In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Disable pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)

# Read the sample submission file and the training table.
sub = pd.read_csv('sample_submission.csv')
data = pd.read_csv('train.csv')

# The date column is stored as YYYYMMDD; parse it into real datetimes.
data = data.assign(**{'일자': pd.to_datetime(data['일자'], format='%Y%m%d')})
data


Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,2021-06-01,A060310,3S,166690,2890,2970,2885,2920
1,2021-06-01,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,2021-06-01,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,2021-06-01,A054620,APS,462544,14600,14950,13800,14950
4,2021-06-01,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,2023-05-30,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,2023-05-30,A000540,흥국화재,50218,3250,3255,3195,3215
987997,2023-05-30,A003280,흥아해운,130664,1344,1395,1340,1370
987998,2023-05-30,A037440,희림,141932,9170,9260,9170,9200


# NLinear (LTSF-Linear)

In [2]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

def split_df(df, window_size, forcast_size, not_col):
    """Build per-stock sliding-window training samples and scaled test frames.

    For every unique value of ``df['종목코드']`` the rows of that stock are split
    into a training part (everything except the last ``forcast_size`` rows) and
    a test part (the last ``window_size`` rows). A ``StandardScaler`` is fitted
    on the training part only and applied to both parts. Consecutive windows of
    ``window_size`` rows (stride 1) are then cut from the scaled training part.

    Args:
        df: Frame containing a '종목코드' column, a '종가' column and the
            feature columns.
        window_size: Number of rows in each input window.
        forcast_size: Number of '종가' values following each window that are
            used as its target.
        not_col: Column name, or iterable of column names, excluded from
            scaling and from the model inputs. It is never modified.

    Returns:
        A tuple ``(X, test, y)`` where ``X`` is an array of shape
        ``(n_windows, window_size, n_features)``, ``test`` is the concatenation
        of the scaled test frames (all columns kept), and ``y`` is an array of
        shape ``(n_windows, forcast_size)``.
    """
    # Work on a private list: the caller's argument must not grow on every
    # call / iteration, and a bare string is treated as a single column name.
    excluded = [not_col] if isinstance(not_col, str) else list(not_col)

    train_lst = []
    test_lst = []
    y_train_lst = []
    for code in tqdm(df['종목코드'].unique()):
        data = df[df['종목코드'] == code]
        # Copy the slices so the scaled values land in independent frames
        # instead of being assigned into views of ``df``.
        train = data.iloc[:-forcast_size].copy()
        test = data.iloc[-window_size:].copy()

        feature_cols = [col for col in train.columns if col not in excluded]

        # Standardise with statistics of the training rows only.
        std = StandardScaler()
        std.fit(train[feature_cols])
        train[feature_cols] = std.transform(train[feature_cols])
        test[feature_cols] = std.transform(test[feature_cols])
        test_lst.append(test)

        # Sliding window over the scaled features.
        features = train[feature_cols].to_numpy()
        # NOTE(review): targets come from the *unscaled* frame while the inputs
        # are standardised -- confirm this is intended before training on them.
        targets = data['종가'].to_numpy()
        n_windows = len(features) - window_size + 1
        for start in range(n_windows):
            stop = start + window_size
            train_lst.append(features[start:stop])
            y_train_lst.append(targets[stop:stop + forcast_size])

    return np.array(train_lst), pd.concat(test_lst), np.array(y_train_lst)

        
# Each sample is `window_size` consecutive rows of one stock; its target is the
# `forcast_size` closing prices that follow the window.
window_size, forcast_size = 365, 15

# Date, stock code and stock name are identifiers, not model features.
train, test, y_train = split_df(
    data,
    window_size,
    forcast_size,
    not_col=['일자', '종목코드', '종목명'],
)

100%|██████████| 2000/2000 [01:53<00:00, 17.70it/s]


In [3]:
from sklearn.model_selection import train_test_split

# Split the windows into training (70%) and validation (30%) sets.
# NOTE(review): `y_train` is rebound here, so re-running this cell without first
# re-running the cell that builds `train` / `y_train` passes mismatched lengths.
# NOTE(review): the windows overlap (stride 1), so a shuffled split presumably
# puts near-identical samples on both sides -- confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    y_train,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)


In [4]:
# Sanity check of the validation inputs: (n_windows, window_size, n_features).
X_valid.shape

(69000, 365, 5)

In [5]:
class Data(Dataset):
    """Map-style dataset pairing each input window with its target vector.

    Args:
        X: Indexable collection of input windows (first axis = sample).
        Y: Indexable collection of targets, same length as ``X``.

    Raises:
        ValueError: If ``X`` and ``Y`` do not contain the same number of samples.
    """

    def __init__(self, X, Y):
        if len(X) != len(Y):
            raise ValueError(
                "X and Y must have the same number of samples, got {} and {}".format(len(X), len(Y))
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` pair as float32 arrays.

        Only the requested sample is returned (not the whole dataset), and both
        parts are cast to float32 so the collated batches match the dtype of
        torch's default float32 model parameters.
        """
        x = np.asarray(self.X[idx], dtype=np.float32)
        y = np.asarray(self.Y[idx], dtype=np.float32)
        return x, y
    
# Wrap the split arrays in datasets ...
train_ds, valid_ds = Data(X_train, y_train), Data(X_valid, y_valid)

# ... and batch them eight samples at a time, keeping the stored order.
train_dl = DataLoader(dataset=train_ds, batch_size=8, shuffle=False)
valid_dl = DataLoader(dataset=valid_ds, batch_size=8, shuffle=False)


In [6]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU,
# and report whether CUDA was found.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device.type == "cuda")

class LTSF_NLinear(torch.nn.Module):
    """Linear forecaster that normalises each window by its last time step.

    The last row of the input window is subtracted from the whole window, a
    linear map along the time axis turns ``window_size`` steps into
    ``forcast_size`` steps, and the subtracted row is added back.

    Args:
        window_size: Number of time steps in the input.
        forcast_size: Number of time steps in the output.
        individual: If truthy, one linear layer per channel; otherwise a single
            layer shared by all channels.
        feature_size: Number of channels handled when ``individual`` is set.
    """

    def __init__(self, window_size, forcast_size, individual, feature_size):
        super().__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individual
        self.channels = feature_size

        if not self.individual:
            # One projection applied to every channel.
            self.Linear = torch.nn.Linear(self.window_size, self.forcast_size)
        else:
            # A separate projection for each channel.
            self.Linear = torch.nn.ModuleList()
            for _ in range(self.channels):
                self.Linear.append(torch.nn.Linear(self.window_size, self.forcast_size))

    def forward(self, x):
        """Map ``x`` of shape (batch, window, channels) to (batch, forecast, channels)."""
        # The last time step acts as the offset; it is detached from the graph.
        last_step = x[:, -1:, :].detach()
        centred = x - last_step

        if not self.individual:
            # Put time on the last axis for the linear layer, then restore it.
            projected = self.Linear(centred.permute(0, 2, 1)).permute(0, 2, 1)
        else:
            batch, n_channels = centred.size(0), centred.size(2)
            projected = centred.new_zeros([batch, self.forcast_size, n_channels])
            for ch in range(self.channels):
                projected[:, :, ch] = self.Linear[ch](centred[:, :, ch])

        return projected + last_step
    
# Build the NLinear forecaster with one linear layer shared by all channels
# (individual=False) and 5 input channels, then move it to the selected device.
NLinear_model = LTSF_NLinear(window_size, forcast_size, individual=False, feature_size=5)
NLinear_model = NLinear_model.to(device)

True


In [12]:
### Model training ###
# Per-epoch loss histories (plain floats).
train_loss_list = []
valid_loss_list = []
test_loss_list = []
n_epochs = 50
lr = 0.001
# The model emits one forecast per input channel, but the targets hold a single
# series ('종가'). Select that channel instead of letting the loss broadcast the
# target against every channel.
# NOTE(review): assumes '종가' is the last feature column -- confirm against the
# column order produced by split_df.
target_channel = -1

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(NLinear_model.parameters(), lr=lr)
# Best (lowest) validation loss seen so far.
max_loss = float('inf')

for epoch in tqdm(range(1, n_epochs + 1)):
    # ---- training pass ----
    NLinear_model.train()
    loss_list = []
    # Batch names deliberately differ from the notebook-level `data` frame so
    # this cell does not overwrite it.
    for batch_x, batch_y in train_dl:
        # Cast to float32 to match the model parameters.
        batch_x = batch_x.to(device).float()
        batch_y = batch_y.to(device).float()
        optimizer.zero_grad()
        output = NLinear_model(batch_x)
        loss = criterion(output[:, :, target_channel], batch_y)
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
    train_loss = np.mean(loss_list)
    train_loss_list.append(train_loss)

    # ---- validation pass: sample-weighted mean over ALL batches ----
    NLinear_model.eval()
    valid_loss_sum = 0.0
    valid_count = 0
    with torch.no_grad():
        for batch_x, batch_y in valid_dl:
            batch_x = batch_x.to(device).float()
            batch_y = batch_y.to(device).float()
            output = NLinear_model(batch_x)
            batch_loss = criterion(output[:, :, target_channel], batch_y)
            valid_loss_sum += batch_loss.item() * batch_x.size(0)
            valid_count += batch_x.size(0)
    valid_loss = valid_loss_sum / max(valid_count, 1)
    valid_loss_list.append(valid_loss)

    # Keep the checkpoint with the lowest validation loss.
    # torch.save on the module pickles the whole model object.
    if valid_loss < max_loss:
        torch.save(NLinear_model, 'NLinear_model.pth')
        max_loss = valid_loss
        print("valid_loss={:.3f}, Model Save".format(valid_loss))

    print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}".format(epoch, train_loss, valid_loss))


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# TODO: write the prediction code
