<a href="https://colab.research.google.com/github/joosk3R/jskRprac/blob/main/dacon_KRX_trading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [None]:
train = pd.read_csv("/content/drive/MyDrive/open (9)/train.csv")

In [None]:
train['일자'].value_counts()

20210601    2000
20221017    2000
20221013    2000
20221012    2000
20221011    2000
            ... 
20220120    2000
20220119    2000
20220118    2000
20220117    2000
20230530    2000
Name: 일자, Length: 494, dtype: int64

In [None]:
# 추론 결과를 저장하기 위한 dataframe 생성
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

# 각 종목코드에 대해서 모델 학습 및 추론 반복
for code in tqdm(unique_codes):

    # 학습 데이터 생성
    train_close = train[train['종목코드'] == code][['일자', '종가']]
    train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    train_close.set_index('일자', inplace=True)
    tc = train_close['종가']

    # 모델 선언, 학습 및 추론
    model = ARIMA(tc, order=(2, 1, 2))
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=15) # 향후 15개의 거래일에 대해서 예측

    # 최종 수익률 계산
    final_return = (predictions.iloc[-1] - predictions.iloc[0]) / predictions.iloc[0]

    # 결과 저장
    results_df = results_df.append({'종목코드': code, 'final_return': final_return}, ignore_index=True)

100%|██████████| 2000/2000 [11:54<00:00,  2.80it/s]


In [None]:
predictions

494    8347.212039
495    8364.999395
496    8404.132409
497    8413.293572
498    8425.703125
499    8429.641682
500    8433.680945
501    8435.243744
502    8436.586781
503    8437.180743
504    8437.634781
505    8437.854846
506    8438.010281
507    8438.090528
508    8438.144234
Name: predicted_mean, dtype: float64

In [None]:
results_df['순위'] = results_df['final_return'].rank(method='first').astype('int') # 각 순위를 중복없이 생성
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,-0.056272,10
1,A095570,-0.002833,322
2,A006840,0.003040,1647
3,A054620,0.001876,1513
4,A265520,0.002090,1549
...,...,...,...
1995,A189980,0.000630,1271
1996,A000540,0.002514,1601
1997,A003280,0.001430,1448
1998,A037440,0.002921,1638


In [None]:
sample_submission = pd.read_csv('/content/drive/MyDrive/open (9)/sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,순위
0,A000020,1
1,A000040,2
2,A000050,3
3,A000070,4
4,A000080,5
...,...,...
1995,A375500,1996
1996,A378850,1997
1997,A383220,1998
1998,A383310,1999


In [None]:
baseline_submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
baseline_submission
baseline_submission.to_csv('baseline_submission.csv', index=False)

In [None]:
baseline_submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
baseline_submission

In [None]:
pip install -U finance-datareader


Collecting finance-datareader
  Downloading finance_datareader-0.9.50-py3-none-any.whl (19 kB)
Collecting requests-file (from finance-datareader)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, finance-datareader
Successfully installed finance-datareader-0.9.50 requests-file-1.5.1


In [None]:
import warnings
warnings.filterwarnings('ignore')

import os, datetime
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import IPython.display as ipd
from tqdm import tqdm

import FinanceDataReader as fdr


def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 0
seed_everything(seed)

In [None]:
stock_list = train
stock_list['종목코드'] = stock_list['종목코드'].apply(lambda x : str(x).zfill(6))
stock_list = stock_list.sort_values(by=['종목코드'])
stock_list

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
718502,20221114,A000020,동화약품,92076,9160,9390,9140,9270
144502,20210910,A000020,동화약품,197264,14950,15350,14800,15150
950502,20230502,A000020,동화약품,45890,8380,8430,8310,8430
82502,20210728,A000020,동화약품,94912,14300,14600,14300,14450
296502,20220104,A000020,동화약품,235805,15000,15000,14750,14900
...,...,...,...,...,...,...,...,...
140145,20210908,A383800,LX홀딩스,296939,10300,10350,10100,10150
152145,20210916,A383800,LX홀딩스,244940,10000,10100,9960,9980
36145,20210625,A383800,LX홀딩스,953657,11300,11300,11150,11200
824145,20230131,A383800,LX홀딩스,100031,8720,8830,8670,8820


In [None]:
stock_list = stock_list.drop(['일자','거래량','시가',"고가","저가","종가"] ,aixs = 1)

TypeError: ignored

In [None]:
start_date = '2021-06-01'
end_date = '2023-5-30'   # 예측하고 싶은 week의 마지막 날짜
num_val = 2

In [None]:
# 이동평균 추가하는 함수 정의
def add_rolling_mean(df:pd.DataFrame, col_name:str, window_list:list)->pd.DataFrame:
  for window in window_list:
    df[col_name + f'_rolling{window}'] = df[col_name].rolling(window).mean().fillna(method = 'bfill')
  return df

# 컬럼값을 lag하는 함수 정의
def lag_features(df:pd.DataFrame, col_list:list, lag_num:int)->pd.DataFrame:
  for col in col_list:
    df[col] = df[col].shift(lag_num).fillna(method = 'ffill').fillna(method = 'bfill')
  return df

In [None]:
stock_df_dict = {}

stock_list['종목코드'] =train['종목코드'].apply(lambda x: str(x).zfill(6))
stock_codes = np.sort(stock_list['종목코드'].values)


In [None]:
stock_codes

array(['A000020', 'A000020', 'A000020', ..., 'A383800', 'A383800',
       'A383800'], dtype=object)

In [None]:
import pandas as pd
import numpy as np

import FinanceDataReader as fdr
from statsmodels.tsa.seasonal import seasonal_decompose

from xgboost import XGBRegressor

import os
from tqdm import tqdm
import warnings
warnings.filterwarnings(action = 'ignore')
pd.options.display.max_columns = None

In [None]:
train

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,20210601,A060310,3S,166690,2890,2970,2885,2920
1,20210601,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,20210601,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,20210601,A054620,APS,462544,14600,14950,13800,14950
4,20210601,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,20230530,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,20230530,A000540,흥국화재,50218,3250,3255,3195,3215
987997,20230530,A003280,흥아해운,130664,1344,1395,1340,1370
987998,20230530,A037440,희림,141932,9170,9260,9170,9200


In [None]:

# 종목별로 stock_df 만들어서 딕셔너리 stock_df_dict에 저장하기
for stock_code in tqdm(stock_codes):

  # 날짜 설정
  dates = pd.date_range('20210601', '20230530', freq = 'B')
  df = pd.DataFrame(dates, columns = ['Date'])

  # 연, 월, 주, 일, 요일, 월별주수 정보 추가
  df['Year'] = df['Date'].dt.year
  df['Month'] = df['Date'].dt.month
  df['Week'] = df['Date'].dt.week
  df['Day'] = df['Date'].dt.day
  df['Weekday'] = df['Date'].dt.weekday
  df['WeekNum'] = np.ceil(df['Day']/7).astype(int)

  # 종가 데이터 시계열 분해
  df = df.set_index('Date')
  decomposition = seasonal_decompose(df[:-5]['Close'], model='multiplicative')
  df['Close_trend'] = decomposition.trend.fillna(method = 'ffill').fillna(method = 'bfill') # 종가 추세
  df['Close_seosonal'] = decomposition.seasonal.fillna(method = 'ffill').fillna(method = 'bfill') # 종가 계절성

  # 5일, 20일, 60일, 120일, 240일 이동평균 추가
  df = add_rolling_mean(df, 'Close', [5, 20, 60, 120, 240])

  # lag features
  col_list = list(df.iloc[:, 6:].columns)
  col_list.remove('Close')
  df = lag_features(df, col_list, 5)
  df.drop(df.index[:5])

  df = df.reset_index()
  df['Index'] = df.index
  stock_df_dict[stock_code] = df

  0%|          | 0/988000 [00:00<?, ?it/s]


KeyError: ignored

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os, datetime
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import IPython.display as ipd
from tqdm import tqdm

import FinanceDataReader as fdr


def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 0
seed_everything(seed)

In [None]:
stock_list = pd.read_csv('/content/drive/MyDrive/open (9)/train.csv')
stock_list['종목코드'] = stock_list['종목코드'].apply(lambda x : str(x).zfill(6))
stock_list = stock_list.sort_values(by=['종목코드'])
stock_list

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
718502,20221114,A000020,동화약품,92076,9160,9390,9140,9270
144502,20210910,A000020,동화약품,197264,14950,15350,14800,15150
950502,20230502,A000020,동화약품,45890,8380,8430,8310,8430
82502,20210728,A000020,동화약품,94912,14300,14600,14300,14450
296502,20220104,A000020,동화약품,235805,15000,15000,14750,14900
...,...,...,...,...,...,...,...,...
140145,20210908,A383800,LX홀딩스,296939,10300,10350,10100,10150
152145,20210916,A383800,LX홀딩스,244940,10000,10100,9960,9980
36145,20210625,A383800,LX홀딩스,953657,11300,11300,11150,11200
824145,20230131,A383800,LX홀딩스,100031,8720,8830,8670,8820


In [None]:
stock_list = stock_list.drop_duplicates(['종목코드'])

In [None]:
start_date = '2021-06-01'
end_date = '2023-06-23'   # 예측하고 싶은 week의 마지막 날짜
num_val = 2

In [None]:
features = ['Close']    # ['Close', 'Open', 'High', 'Low', 'Volume', 'Change']
norm_factors = {'Close': 1e6}

seq_len = 8   # 8주의 데이터로 다음 주의 종가 예측
dim_f = len(features)
dim_d =  17 # number of business days

In [None]:
type(data_series)

pandas.core.series.Series

In [None]:

data_series = stock_list["종목코드"]
result = [element[1:] for element in data_series]

In [None]:
len(result)

2000

In [None]:
result

In [None]:
stock_data = fdr.DataReader('383800', start=start_date, end=end_date).reset_index()

In [None]:
stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change
0,2021-06-01,11000,11300,10900,11000,1879288,0.004566
1,2021-06-02,10700,10850,10600,10700,2426922,-0.027273
2,2021-06-03,10650,10700,10300,10400,2709800,-0.028037
3,2021-06-04,10450,10650,10350,10450,1737593,0.004808
4,2021-06-07,10550,11150,10500,10800,2714980,0.033493
...,...,...,...,...,...,...,...
504,2023-06-15,8220,8240,8140,8190,157341,-0.002436
505,2023-06-16,8230,8240,8170,8220,58978,0.003663
506,2023-06-19,8220,8250,8210,8220,60518,0.000000
507,2023-06-20,8240,8250,8160,8190,121992,-0.003650


In [None]:
def get_data(code, start_date, end_date):
    stock_data = fdr.DataReader(code, start=start_date, end=end_date).reset_index()

    # 토요일, 일요일 제거
    week_days = pd.DataFrame(pd.date_range(start_date, end_date, freq='B'), columns = ['Date'])
    stock_data = pd.merge(week_days, stock_data.drop(columns=['Change']), how = 'left')

    # 주말 외 휴일의 NaN 값을 이전 날의 데이터로 매꿈.
    # 상장폐지 종목들의 폐지 날짜 이후의 종가가 NaN이 아닌 마지막 종가로 대체가 되어
    # 학습 데이터에 포함되게 되지만, 성능에 큰 영향은 없었음.
    stock_data = stock_data.ffill()

    return stock_data


def preprocess(df):
    df = df[features]
    df.dropna(how='any', axis=0, inplace=True)
    df = df[(len(df)%5):]
    for column in df.columns:
        df[column] /= norm_factors[column]
    return df.values.reshape(-1,dim_d,dim_f)    # shape = [num_weeks, num_business_days, num_features]


def split(data):
    train = data[:-(num_val+1)]
    val = data[-(seq_len+num_val):] if num_val>0 else []
    test = data[-(seq_len+1):]
    return train, val, test


def to_xy(time_series):
    xy = []
    for i in range(seq_len, len(time_series)):
        x = time_series[i-seq_len:i]
        y = time_series[i,:,features.index('Close')]
        xy.append({'x': x, 'y': y})
    return xy

In [None]:
df = get_data(code, start_date, end_date)

In [None]:
 result= pd.Series(result)

In [None]:
result

0       000020
1       000040
2       000050
3       000070
4       000080
         ...  
1995    375500
1996    378850
1997    383220
1998    383310
1999    383800
Length: 2000, dtype: object

In [None]:
train_data, val_data, test_data = [], [], []
for code in tqdm(result):
    df = get_data(code, start_date, end_date)
    train, val, test = split(preprocess(df))
    train_data += to_xy(train)
    val_data += to_xy(val)
    test_data += to_xy(test)

len(train_data), len(val_data)

  0%|          | 0/2000 [00:01<?, ?it/s]


ValueError: ignored

In [None]:
!pip install statsmodels



In [None]:
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m27.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.3


In [None]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima, ndiffs
from tqdm import tqdm

In [None]:
def NMAE(y_pred, y_train):
    return np.mean(abs(y_train-y_pred)/y_train)*100

In [None]:
# 수집 데이터 기간
start_date = '20210601'
end_date = '20230530'

# public 기간
public_start_date = '20230531'
public_end_date = '20230621'

# 검증 기간(11/8~26)
validate_start_date = '20230509'
validate_end_date = '20230530'

# private
private_start_date = '20210731'
private_end_date = '20230821'

In [None]:
start_weekday = pd.to_datetime(start_date).weekday()
max_weeknum = pd.to_datetime(end_date).strftime('%V')
Business_days = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}', )

WEEKDAY of "start_date" : 1
NUM of WEEKS to "end_date" : 22
HOW MANY "Business_days" : (521, 1)


In [None]:
data_series = stock_list["종목코드"]
result = [element[1:] for element in data_series]

In [None]:
result

In [None]:
stock_code = result

In [None]:
len(result)

2000

In [None]:
# ARIMA를 위한 종목별 종가 데이터셋 생성
df_stock_close = Business_days.copy()
for code in tqdm(stock_code):
    df_tmp = fdr.DataReader(code, start=start_date, end=end_date)[['Close']].reset_index()
    df_stock_close = pd.merge(df_stock_close, df_tmp, how='outer', on='Date')

df_stock_close.fillna(method='bfill', inplace=True)
df_stock_close = df_stock_close.set_index('Date')
df_stock_close.columns = stock_code

100%|██████████| 2000/2000 [42:02<00:00,  1.26s/it]


In [None]:
df_stock_close.to_csv("/content/drive/MyDrive/open (9)/df_stock.csv1")

In [None]:
 df_stock_close.head(-5)

Unnamed: 0_level_0,000020,000040,000050,000070,000080,000100,000120,000140,000150,000180,...,363280,365590,368770,369370,373200,375500,378850,383220,383310,383800
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-01,14600.0,1175.0,14950.0,114000.0,39600.0,59691.0,169500.0,18900.0,83900.0,2900.0,...,28450.0,2089.0,2863.0,3516.0,2288.0,67605.0,5980.0,98500.0,37208.0,11000.0
2021-06-02,14500.0,1210.0,15100.0,114500.0,40100.0,58866.0,168500.0,19300.0,90000.0,3005.0,...,27750.0,2089.0,2339.0,3540.0,2314.0,69107.0,5940.0,96600.0,36532.0,10700.0
2021-06-03,14600.0,1200.0,15400.0,114500.0,39900.0,58775.0,170000.0,19200.0,85900.0,2935.0,...,27250.0,2089.0,2433.0,3532.0,2309.0,69608.0,5880.0,99400.0,37658.0,10400.0
2021-06-04,14700.0,1195.0,15000.0,113500.0,39200.0,58226.0,167000.0,18750.0,88100.0,2900.0,...,27200.0,2089.0,2349.0,3459.0,2217.0,69858.0,5890.0,97000.0,38784.0,10450.0
2021-06-07,15150.0,1210.0,14900.0,113000.0,39650.0,58409.0,166500.0,18800.0,103000.0,2940.0,...,28100.0,2089.0,2286.0,3427.0,2207.0,70359.0,5760.0,96600.0,38634.0,10800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-17,8790.0,686.0,10410.0,74900.0,23150.0,58000.0,81600.0,9620.0,93800.0,1985.0,...,10850.0,1930.0,3530.0,1944.0,4720.0,34350.0,4650.0,139900.0,63400.0,8380.0
2023-05-18,9020.0,697.0,10390.0,73900.0,23200.0,57500.0,81200.0,9610.0,95600.0,1999.0,...,10830.0,2010.0,3910.0,1912.0,4890.0,34850.0,4680.0,138800.0,62200.0,8350.0
2023-05-19,9090.0,707.0,10400.0,74000.0,23400.0,57700.0,81300.0,9620.0,95400.0,2015.0,...,10840.0,2140.0,3845.0,1964.0,4880.0,35300.0,4690.0,137600.0,62600.0,8350.0
2023-05-22,9280.0,722.0,10400.0,75100.0,23900.0,58800.0,83100.0,9640.0,97100.0,2025.0,...,10880.0,2080.0,4065.0,1956.0,4900.0,35950.0,4740.0,135200.0,63600.0,8360.0


In [None]:
df= pd.read_csv("/content/drive/MyDrive/open (9)/df_stock.csv1")

In [None]:
df.head(-5)

Unnamed: 0,Date,000020,000040,000050,000070,000080,000100,000120,000140,000150,...,363280,365590,368770,369370,373200,375500,378850,383220,383310,383800
0,2021-06-01,14600.0,1175.0,14950.0,114000.0,39600.0,59691.0,169500.0,18900.0,83900.0,...,28450.0,2089.0,2863.0,3516.0,2288.0,67605.0,5980.0,98500.0,37208.0,11000.0
1,2021-06-02,14500.0,1210.0,15100.0,114500.0,40100.0,58866.0,168500.0,19300.0,90000.0,...,27750.0,2089.0,2339.0,3540.0,2314.0,69107.0,5940.0,96600.0,36532.0,10700.0
2,2021-06-03,14600.0,1200.0,15400.0,114500.0,39900.0,58775.0,170000.0,19200.0,85900.0,...,27250.0,2089.0,2433.0,3532.0,2309.0,69608.0,5880.0,99400.0,37658.0,10400.0
3,2021-06-04,14700.0,1195.0,15000.0,113500.0,39200.0,58226.0,167000.0,18750.0,88100.0,...,27200.0,2089.0,2349.0,3459.0,2217.0,69858.0,5890.0,97000.0,38784.0,10450.0
4,2021-06-07,15150.0,1210.0,14900.0,113000.0,39650.0,58409.0,166500.0,18800.0,103000.0,...,28100.0,2089.0,2286.0,3427.0,2207.0,70359.0,5760.0,96600.0,38634.0,10800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511,2023-05-17,8790.0,686.0,10410.0,74900.0,23150.0,58000.0,81600.0,9620.0,93800.0,...,10850.0,1930.0,3530.0,1944.0,4720.0,34350.0,4650.0,139900.0,63400.0,8380.0
512,2023-05-18,9020.0,697.0,10390.0,73900.0,23200.0,57500.0,81200.0,9610.0,95600.0,...,10830.0,2010.0,3910.0,1912.0,4890.0,34850.0,4680.0,138800.0,62200.0,8350.0
513,2023-05-19,9090.0,707.0,10400.0,74000.0,23400.0,57700.0,81300.0,9620.0,95400.0,...,10840.0,2140.0,3845.0,1964.0,4880.0,35300.0,4690.0,137600.0,62600.0,8350.0
514,2023-05-22,9280.0,722.0,10400.0,75100.0,23900.0,58800.0,83100.0,9640.0,97100.0,...,10880.0,2080.0,4065.0,1956.0,4900.0,35950.0,4740.0,135200.0,63600.0,8360.0


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

p_values = range(0,3)
d_values = range(0,1)   # ndiffs 를 통한 계산
q_values = range(0,3)

In [None]:
def evaluate_arima_model(X, arima_order):
	train_size = int(len(X) * 0.95)
	train, test = X[0:train_size], X[train_size:]
	history = [x for x in train]
	predictions = list()
	for t in range(len(test)):
		model = ARIMA(history, order=arima_order)
		model_fit = model.fit()
		yhat = model_fit.forecast()[0]
		predictions.append(yhat)
		history.append(test[t])
	error = np.sqrt(mean_squared_error(test, predictions))
	return error

In [None]:
# 적합 기간 확인
df_kospi200_2020 = fdr.DataReader('KS200', '2022')
df_kospi200_2021 = fdr.DataReader('KS200', '2023')

In [None]:
df_kospi200_2021

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-02,292.899994,294.019989,289.190002,289.790009,289.790009,92500
2023-01-03,290.640015,290.950012,284.320007,289.579987,289.579987,106600
2023-01-04,288.339996,296.700012,287.940002,295.980011,295.980011,109900
2023-01-05,297.649994,299.690002,296.010010,297.869995,297.869995,137300
2023-01-06,296.690002,303.100006,296.399994,301.529999,301.529999,121000
...,...,...,...,...,...,...
2023-06-30,337.559998,338.980011,335.040009,337.950012,337.950012,104000
2023-07-03,340.260010,343.500000,340.070007,343.079987,343.079987,98800
2023-07-04,343.540009,343.880005,341.420013,341.589996,341.589996,103800
2023-07-05,341.470001,342.820007,338.929993,339.040009,339.040009,130500


In [None]:
import warnings
warnings.filterwarnings("ignore")

target = df_kospi200_2020.Close

target = target.astype('float32')
best_score, best_cfg = float("inf"), None
for p in p_values:
    for d in d_values:
        for q in q_values:
            order = (p,d,q)
            try:
                rmse = evaluate_arima_model(target, order)
                if rmse < best_score:
                    best_score, best_cfg = rmse, order
            except:
                continue

print('df_kospi200_2020 Best ARIMA RMSE=%.3f' % (best_score))

target = df_kospi200_2021.Close

target = target.astype('float32')
best_score, best_cfg = float("inf"), None
for p in p_values:
    for d in d_values:
        for q in q_values:
            order = (p,d,q)
            try:
                rmse = evaluate_arima_model(target, order)
                if rmse < best_score:
                    best_score, best_cfg = rmse, order
            except:
                continue

print('df_kospi200_2021 Best ARIMA RMSE=%.3f' % (best_score))

df_kospi200_2020 Best ARIMA RMSE=inf
df_kospi200_2021 Best ARIMA RMSE=inf


In [None]:
df_train_public = df_stock_close
df_train_private = df_stock_close

In [None]:
import warnings
warnings.filterwarnings("ignore")

# 각 종목별 order 검색

df_search = df_train_public[df_train_public.columns[:5]]
df_arima_order = pd.DataFrame(columns = ['stock_code','order'])
count = 1

for code in df_search.columns :
    target = df_search[code]
    target = target.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(target, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                except:
                    continue
    df_tmp = pd.DataFrame(columns = ['stock_code','order'])
    df_tmp = df_tmp.append(pd.DataFrame([[code, best_cfg]], columns=['stock_code', 'order']))
    df_arima_order = df_arima_order.append(df_tmp)

    print(code + ' : Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))

000020 : Best ARIMANone RMSE=inf
000040 : Best ARIMANone RMSE=inf
000050 : Best ARIMANone RMSE=inf
000070 : Best ARIMANone RMSE=inf
000080 : Best ARIMANone RMSE=inf


실패

하도

In [None]:
import pandas as pd
import os
import FinanceDataReader as fdr
import numpy as np
import math
import sklearn
import sklearn.preprocessing
import datetime
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm

In [None]:
valid_set_size_percentage = 10


In [None]:
stock_list = result

In [None]:
stock_list = pd.Series(stock_list)
stock_list = pd.DataFrame(stock_list)

In [None]:
stock_list = pd.DataFrame(stock_list)

In [None]:
stock_list

Unnamed: 0,0
0,000020
1,000040
2,000050
3,000070
4,000080
...,...
1995,375500
1996,378850
1997,383220
1998,383310


In [None]:
stock_list.rename(columns={0:'종목코드'},inplace = True)

In [None]:
stock_list

Unnamed: 0,종목코드
0,000020
1,000040
2,000050
3,000070
4,000080
...,...
1995,375500
1996,378850
1997,383220
1998,383310


In [None]:
data = pd.DataFrame()
volume = pd.DataFrame()
high = pd.DataFrame()
low = pd.DataFrame()
data2 = pd.DataFrame()
volume2 = pd.DataFrame()
high2 = pd.DataFrame()
low2 = pd.DataFrame()

start_date = '20160104'
end_date = '20200214'
start_date2 = '20200608'
end_date2 = '20230530'

for stock_code in stock_list["종목코드"]:
    stock = fdr.DataReader(stock_code, start = start_date, end = end_date)
    data = pd.concat([data,stock["Close"]], axis=1)
    volume = pd.concat([volume, stock["Volume"]], axis=1)
    high = pd.concat([high,stock["High"]], axis=1)
    low = pd.concat([low,stock["Low"]], axis=1)

for stock_code in stock_list["종목코드"]:
    stock = fdr.DataReader(stock_code, start = start_date2, end = end_date2)
    data2 = pd.concat([data2,stock["Close"]], axis=1)
    volume2 = pd.concat([volume2, stock["Volume"]], axis=1)
    high2 = pd.concat([high2,stock["High"]], axis=1)
    low2 = pd.concat([low2,stock["Low"]], axis=1)

data = pd.concat([data,data2], axis=0)
volume = pd.concat([volume, volume2], axis=0)
high = pd.concat([high,high2], axis=0)
low = pd.concat([low,low2], axis=0)

data.columns = stock_list["종목코드"]
volume.columns = stock_list["종목코드"]
high.columns = stock_list["종목코드"]
low.columns = stock_list["종목코드"]


In [None]:
data.to_csv("/content/drive/MyDrive/open (9)/data.csv")
volume.to_csv("/content/drive/MyDrive/open (9)/volume.csv")
high.to_csv("/content/drive/MyDrive/open (9)/high.csv")
low.to_csv("/content/drive/MyDrive/open (9)/low.csv")

In [None]:
data.head()

종목코드,000020,000040,000050,000070,000080,000100,000120,000140,000150,000180,...,363280,365590,368770,369370,373200,375500,378850,383220,383310,383800
2016-01-04,8140,7972,18000,151500,22800,40071,185000,12900,65067,3245,...,,,,,,,,,,
2016-01-05,8190,8002,18250,148000,23600,40293,200000,13400,65382,3285,...,,,,,,,,,,
2016-01-06,8550,7697,18550,168000,23700,41397,204500,13400,65303,3305,...,,,,,,,,,,
2016-01-07,8380,7514,18450,164500,26550,43164,197500,14400,64516,3240,...,,,,,,,,,,
2016-01-08,8770,7544,18850,178500,27500,44785,201500,14400,63728,3180,...,,,,,,,,,,


In [None]:
from pmdarima.arima import auto_arima
data_diff = data.iloc[1:, :] - data.iloc[:-1, :].values

def dr_outlier(df):
    quartile_1 = df.quantile(0.25)
    quartile_3 = df.quantile(0.75)
    IQR = quartile_3 - quartile_1
    condition = (df < (quartile_1 - 1.5 * IQR)) | (df > (quartile_3 + 1.5 * IQR))
    search_df = df[condition]
    return df.drop(search_df.index, axis=0)

from pmdarima.arima import ndiffs
from sklearn.linear_model import LinearRegression

def arima_pred_model(train):
    if train.isnull().sum() > 0:
        train = train.dropna(axis=0)
    kpss_diffs = ndiffs(train, alpha=0.1, test='kpss', max_d=6)
    adf_diffs = ndiffs(train, alpha=0.1, test='adf', max_d=6)
    n_diffs = max(adf_diffs, kpss_diffs)
    m = auto_arima(train, d=n_diffs, error_action='ignore', start_p=1, start_q=1, max_p=3, max_q=3)
    pred = m.predict(n_periods=5)
    return pred

result = []
for index in tqdm(range(2000)):
    train = data_diff.iloc[:, index]
    train = dr_outlier(train)
    pred = arima_pred_model(train)
    result.append(pred)

sub = pd.read_csv("sample_submission.csv")
result = pd.DataFrame(result).T
for i in range(1,2001):
    for j in range(5):
        result_sum = result.iloc[:j+1, i-1].sum()
        sub.iloc[j + 5, i] = np.pad(result_sum + data.iloc[-1, i - 1], (5, 0), 'constant', constant_values=0)
sub["031390"] = 0
sub.to_csv("arima_end1.csv", index=0)

100%|██████████| 2000/2000 [1:55:34<00:00,  3.47s/it]


IndexError: ignored